R 代码:
# Demo data frame: four genes, their direction of change, and a numeric score.
df1 <- data.frame(
  gene = paste0("gene", seq_len(4)),
  change = c("up", "up", "down", "down"),
  score = c(5, 3, -2, -4)
)
df1
## gene change score
## 1 gene1 up 5
## 2 gene2 up 3
## 3 gene3 down -2
## 4 gene4 down -4
Python 代码:
DataFrame 不是 Python 原生的数据结构,而是由 pandas 提供;构造时可以传入一个字典,每个键值对对应一列(键是列名,值是该列的数据)。
import pandas as pd

# Build the demo table from a dict: each key becomes a column name and
# each value supplies that column's data.
df1 = pd.DataFrame({
    'gene': [f"gene{i}" for i in range(1, 5)],
    'change': ['up', 'up', 'down', 'down'],
    'score': [5, 3, -2, -4],
})
df1
## gene change score
## 0 gene1 up 5
## 1 gene2 up 3
## 2 gene3 down -2
## 3 gene4 down -4
R:
# 按单列排序
# dplyr supplies arrange(), desc(), count() and the %>% pipe used below.
library(dplyr)
# arrange() sorts ascending by default.
arrange(df1, score)
## gene change score
## 1 gene4 down -4
## 2 gene3 down -2
## 3 gene2 up 3
## 4 gene1 up 5
# Wrap a column in desc() to sort it in descending order.
arrange(df1, desc(score))
## gene change score
## 1 gene1 up 5
## 2 gene2 up 3
## 3 gene3 down -2
## 4 gene4 down -4
# Sort by several columns: change ascending, then score descending within each change.
arrange(df1, change, desc(score))
## gene change score
## 1 gene3 down -2
## 2 gene4 down -4
## 3 gene1 up 5
## 4 gene2 up 3
# Count rows per value of one column: base table() prints a named table ...
table(df1$change)
##
## down up
## 2 2
# ... while dplyr::count() returns a data frame with the counts in column n.
df1 %>% count(change)
## change n
## 1 down 2
## 2 up 2
# Count rows per combination of several columns.
df1 %>% count(change, score)
## change score n
## 1 down -4 1
## 2 down -2 1
## 3 up 3 1
## 4 up 5 1
python:
# Sort by one column (ascending by default); the original index labels travel with the rows.
df1.sort_values('score')
## gene change score
## 3 gene4 down -4
## 2 gene3 down -2
## 1 gene2 up 3
## 0 gene1 up 5
# Descending order.
df1.sort_values('score', ascending=False)
## gene change score
## 0 gene1 up 5
## 1 gene2 up 3
## 2 gene3 down -2
## 3 gene4 down -4
# Sort by several columns: one ascending flag per column (change ascending, score descending).
df1.sort_values(['change','score'], ascending=[True,False])
## gene change score
## 2 gene3 down -2
## 3 gene4 down -4
## 0 gene1 up 5
## 1 gene2 up 3
# Count rows per value of one column.
df1['change'].value_counts()
## change
## up 2
## down 2
## Name: count, dtype: int64
# groupby(...).size() gives the same counts, listed by group key.
df1.groupby('change').size()
## change
## down 2
## up 2
## dtype: int64
# Count rows per combination of several columns (result has a two-level index).
df1.groupby(['change','score']).size()
## change score
## down -4 1
## -2 1
## up 3 1
## 5 1
## dtype: int64
R:
# Select one column (returns a plain vector)
df1$change
## [1] "up" "up" "down" "down"
# Selecting a single column with [ , ] also collapses to a vector.
df1[,"change"]
## [1] "up" "up" "down" "down"
# Select one column (returns a data frame): drop=FALSE keeps the data-frame shape
df1[,"change", drop=FALSE]
## change
## 1 up
## 2 up
## 3 down
## 4 down
# Select several columns by name
df1[,c("gene","score")]
## gene score
## 1 gene1 5
## 2 gene2 3
## 3 gene3 -2
## 4 gene4 -4
# First 3 rows (R positions start at 1 and the range end is included)
df1[1:3,]
## gene change score
## 1 gene1 up 5
## 2 gene2 up 3
## 3 gene3 down -2
# Specific rows and columns by position
df1[1:3, 2:3]
## change score
## 1 up 5
## 2 up 3
## 3 down -2
python:
注意:Python 的行列位置索引(也就是行号、列号)从 0 开始,而 R 从 1 开始。
# Select one column (returns a Series): attribute access ...
df1.change
## 0 up
## 1 up
## 2 down
## 3 down
## Name: change, dtype: object
# ... or bracket access with the column name.
df1['change']
## 0 up
## 1 up
## 2 down
## 3 down
## Name: change, dtype: object
# Select one column (returns a DataFrame): pass a list of names, hence the double brackets
df1[['change']]
## change
## 0 up
## 1 up
## 2 down
## 3 down
# Select several columns
df1[['gene','score']]
## gene score
## 0 gene1 5
## 1 gene2 3
## 2 gene3 -2
## 3 gene4 -4
# Same selection through the label-based indexer .loc (all rows, listed columns)
df1.loc[:,['gene','score']]
## gene score
## 0 gene1 5
## 1 gene2 3
## 2 gene3 -2
## 3 gene4 -4
# First 3 rows by position
df1.iloc[0:3,:]
## gene change score
## 0 gene1 up 5
## 1 gene2 up 3
## 2 gene3 down -2
# Specific rows and columns
df1.iloc[0:3, 1:3] # positional slice: start included, stop excluded
## change score
## 0 up 5
## 1 up 3
## 2 down -2
df1.loc[:, 'change':'score'] # label slice: the end label is included (specific to .loc)
## change score
## 0 up 5
## 1 up 3
## 2 down -2
## 3 down -4
条件筛选:
R:
# Keep the rows whose score is positive: a logical vector in the row position
# selects rows, and the empty column position keeps every column.
df1[df1$score > 0,]
## gene change score
## 1 gene1 up 5
## 2 gene2 up 3
python:
# Keep the rows whose score is positive: indexing with a boolean Series filters rows.
df1[df1['score'] > 0]
## gene change score
## 0 gene1 up 5
## 1 gene2 up 3
R:
# Dimensions as c(rows, columns).
dim(df1)
## [1] 4 3
# Row names (here the default "1".."4", as character strings).
rownames(df1)
## [1] "1" "2" "3" "4"
# Column names.
colnames(df1)
## [1] "gene" "change" "score"
python:
# Dimensions as a (rows, columns) tuple; shape is an attribute, not a method call.
df1.shape
## (4, 3)
# Row labels (here the default 0..3 range).
df1.index
## RangeIndex(start=0, stop=4, step=1)
# Column labels.
df1.columns
## Index(['gene', 'change', 'score'], dtype='object')
R:
# Rename the second column ("change") to "diff".
# For a data frame, colnames() and names() address the same column names,
# so the two assignments below are interchangeable; either one alone is enough.
colnames(df1)[2] <- "diff"
names(df1)[2] <- "diff"
python:
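pandas 的列索引(Index)是不可变对象,不能像 R 那样通过下标单独修改某一个列名(例如 df1.columns[1] = 'diff' 会报错);要么给 columns 整体重新赋值,要么用 rename 修改指定的列名。下面两种写法任选其一即可。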
# Rename the 'change' column to 'diff'. The two statements are alternatives.
# Option 1: assign the complete list of column names.
df1.columns = ['gene', 'diff', 'score']
# Option 2: rename only the listed columns, modifying df1 in place.
# Once option 1 has run there is no 'change' label left, so this call then has
# nothing to rename (rename ignores missing keys by default).
df1.rename(columns={'change': 'diff'}, inplace=True)
R:
# Inspect the structure: number of rows/columns plus each column's type and first values.
str(df1)
## 'data.frame': 4 obs. of 3 variables:
## $ gene : chr "gene1" "gene2" "gene3" "gene4"
## $ diff : chr "up" "up" "down" "down"
## $ score: num 5 3 -2 -4
# Type of a single column.
class(df1$score)
## [1] "numeric"
# Convert the type; the result is not assigned, so df1 itself is unchanged.
as.character(df1$score)
## [1] "5" "3" "-2" "-4"
python:
# Inspect the structure: index range, per-column non-null counts and dtypes.
df1.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 4 entries, 0 to 3
## Data columns (total 3 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 gene 4 non-null object
## 1 diff 4 non-null object
## 2 score 4 non-null int64
## dtypes: int64(1), object(2)
## memory usage: 228.0+ bytes
# type() reports the class of the container (a Series), not the element type.
type(df1['score'])
## <class 'pandas.core.series.Series'>
# .dtype reports the element type of the column; this is the counterpart of
# R's class(df1$score).
df1['score'].dtype
## dtype('int64')
# Convert the type; the result is a new Series and is not assigned, so df1 is unchanged.
df1['score'].astype(str)
## 0 5
## 1 3
## 2 -2
## 3 -4
## Name: score, dtype: object
先制作输入数据
R:
# First data frame test1: name and blood type
test1 <- data.frame(name = c('jimmy', 'nicker', 'Damon', 'Sophie'),
blood_type = c("A", "B", "O", "AB"))
# Second data frame test2: name, group and vision
test2 <- data.frame(name = c('Damon', 'jimmy', 'nicker', 'tony'),
group = c("group1", "group1", "group2", "group2"),
vision = c(4.2, 4.3, 4.9, 4.5))
# Third data frame test3: same people as test2, but the key column is called NAME
test3 <- data.frame(NAME = c('Damon', 'jimmy', 'nicker', 'tony'),
weight = c(140, 145, 110, 138))
# Merge test1 and test2 on the shared "name" column.
# Only names present in both inputs are kept (Sophie and tony are absent below).
merge(test1, test2, by = "name")
## name blood_type group vision
## 1 Damon O group1 4.2
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
# Merge test1 and test3 when the key columns have different names:
# by.x names the key in the first input, by.y the key in the second.
# The result keeps a single key column, named after by.x.
merge(test1, test3, by.x = "name", by.y = "NAME")
## name blood_type weight
## 1 Damon O 140
## 2 jimmy A 145
## 3 nicker B 110
Python:
import pandas as pd
# First data frame test1: name and blood type
test1 = pd.DataFrame({
'name': ['jimmy', 'nicker', 'Damon', 'Sophie'],
'blood_type': ['A', 'B', 'O', 'AB']
})
# Second data frame test2: name, group and vision
test2 = pd.DataFrame({
'name': ['Damon', 'jimmy', 'nicker', 'tony'],
'group': ['group1', 'group1', 'group2', 'group2'],
'vision': [4.2, 4.3, 4.9, 4.5]
})
# Third data frame test3: same people as test2, but the key column is called NAME
test3 = pd.DataFrame({
'NAME': ['Damon', 'jimmy', 'nicker', 'tony'],
'weight': [140, 145, 110, 138]
})
# Merge test1 and test2 on the shared 'name' column.
# Only names present in both inputs are kept; rows follow the order of test1.
pd.merge(test1, test2, on='name')
## name blood_type group vision
## 0 jimmy A group1 4.3
## 1 nicker B group2 4.9
## 2 Damon O group1 4.2
# Merge test1 and test3 when the key columns have different names.
# Both key columns (name and NAME) appear in the result.
pd.merge(test1, test3, left_on='name', right_on='NAME')
## name blood_type NAME weight
## 0 jimmy A jimmy 145
## 1 nicker B nicker 110
## 2 Damon O Damon 140
内连接、外连接和左连接
R:
# R code
# Inner join (the default): only names found in both inputs.
merge(test1, test2, by = "name")
## name blood_type group vision
## 1 Damon O group1 4.2
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
# Full outer join: all = TRUE keeps rows from both inputs and fills the gaps with NA.
merge(test1, test2, by = "name", all = TRUE)
## name blood_type group vision
## 1 Damon O group1 4.2
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
## 4 Sophie AB <NA> NA
## 5 tony <NA> group2 4.5
# Left join: all.x = TRUE keeps every row of the first input (test1).
merge(test1, test2, by = "name", all.x = TRUE)
## name blood_type group vision
## 1 Damon O group1 4.2
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
## 4 Sophie AB <NA> NA
python:
# Inner join: only names found in both inputs.
pd.merge(test1, test2, on='name', how='inner')
## name blood_type group vision
## 0 jimmy A group1 4.3
## 1 nicker B group2 4.9
## 2 Damon O group1 4.2
# Full outer join: rows from both inputs, gaps filled with NaN.
pd.merge(test1, test2, on='name', how='outer')
## name blood_type group vision
## 0 Damon O group1 4.2
## 1 Sophie AB NaN NaN
## 2 jimmy A group1 4.3
## 3 nicker B group2 4.9
## 4 tony NaN group2 4.5
# Left join: every row of the left input (test1) is kept, in its original order.
pd.merge(test1, test2, on='name', how='left')
## name blood_type group vision
## 0 jimmy A group1 4.3
## 1 nicker B group2 4.9
## 2 Damon O group1 4.2
## 3 Sophie AB NaN NaN
R:
# R code
# Create the example data frames d1, d2 and d3
d1 <- data.frame(name = c('jimmy', 'nicker'), age = c(25, 30))
d2 <- data.frame(name = c('Damon', 'Sophie'), age = c(28, 22))
d3 <- data.frame(height = c(170, 165), weight = c(70, 60))
# Stack rows: d1 and d2 share the same columns; the result rows are numbered 1..4.
rbind(d1,d2)
## name age
## 1 jimmy 25
## 2 nicker 30
## 3 Damon 28
## 4 Sophie 22
# Put columns side by side: d1 and d3 have the same number of rows.
cbind(d1,d3)
## name age height weight
## 1 jimmy 25 170 70
## 2 nicker 30 165 60
python:
# Create the example data frames d1, d2 and d3
d1 = pd.DataFrame({'name': ['jimmy', 'nicker'], 'age': [25, 30]})
d2 = pd.DataFrame({'name': ['Damon', 'Sophie'], 'age': [28, 22]})
d3 = pd.DataFrame({'height': [170, 165], 'weight': [70, 60]})
# Stack rows. concat keeps each input's own index labels, so 0 and 1 repeat.
pd.concat([d1, d2])
## name age
## 0 jimmy 25
## 1 nicker 30
## 0 Damon 28
## 1 Sophie 22
# Pass ignore_index=True to renumber the rows 0..n-1; this is the closer
# match to R's rbind(), which renumbers the rows as well.
pd.concat([d1, d2], ignore_index=True)
## name age
## 0 jimmy 25
## 1 nicker 30
## 2 Damon 28
## 3 Sophie 22
# Put columns side by side (axis=1); rows are matched on their index labels.
pd.concat([d1, d3], axis=1)
## name age height weight
## 0 jimmy 25 170 70
## 1 nicker 30 165 60