R特有的变量保存格式(xxx.Rdata)
运行以下代码(Day2作业):
load("gands.Rdata")
length(g)#计算向量长度函数
## [1] 100
seq(2,100,2)
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40
## [21] 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80
## [41] 82 84 86 88 90 92 94 96 98 100
g[seq(2,100,2)]#用下标筛选出下标为偶数的基因名。
## [1] "CRAMP1L" "PRSS8" "CRAMP1L" "SLCO1C1" "COMMD1" "CCT4"
## [7] "RAB7A" "ZDHHC16" "MYL12B" "SNRPE" "ZNF586" "GGT7"
## [13] "RAB7A" "AFG3L2" "AC104581.1" "MPP2" "ATP2A2" "SNRPE"
## [19] "PRSS8" "ZNF461" "CECR5" "CLEC17A" "ATG10" "ATG10"
## [25] "SLC25A25" "KRTAP4-3" "SLCO1C1" "GGT7" "GSTP1" "UBAC1"
## [31] "NYNRIN" "MYL12B" "KCND1" "RGPD3" "C10orf128" "SLC30A9"
## [37] "GGT7" "TUBA4A" "KLHDC8A" "HBP1" "MARC2" "LCP1"
## [43] "OR2D3" "LIPE" "LIPE" "CANX" "ATP6V1B2" "MARC2"
## [49] "LCP1" "HOOK2"
g%in%s
## [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
## [14] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [27] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [40] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
## [53] TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
## [66] FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [79] TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [92] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
table(g%in%s)#统计g%in%s结果有多少TRUE,有多少FALSE
##
## FALSE TRUE
## 37 63
g[g%in%s]#逻辑值取子集,筛选出在向量s中存在的g的元素
## [1] "GFM2" "SLCO1C1" "NYNRIN" "COMMD1" "COMMD1"
## [6] "AC017081.1" "RAB7A" "CASKIN2" "GGT7" "SNRPE"
## [11] "RGPD3" "ZNF586" "COMMD1" "GGT7" "URB1"
## [16] "RAB7A" "MPP2" "AFG3L2" "URB1" "AC104581.1"
## [21] "MPP2" "SNRPE" "ARHGAP1" "ZNF461" "OR2D3"
## [26] "CECR5" "SPDL1" "CLEC17A" "ZNF461" "ATG10"
## [31] "ATG10" "ATG10" "SLC25A25" "SLC30A9" "SLCO1C1"
## [36] "GGT7" "CASKIN2" "GSTP1" "MPP2" "NYNRIN"
## [41] "INTS12" "MPP2" "RGPD3" "RGPD3" "SLC30A9"
## [46] "C10orf128" "HBD" "SLC30A9" "GGT7" "HEPH"
## [51] "RP5-1021I20.4" "KLHDC8A" "HBD" "ZNF586" "CECR5"
## [56] "OR2D3" "LIPE" "INTS12" "LIPE" "SPDL1"
## [61] "SLCO1C1" "GGT7" "CECR5"
length(s)
## [1] 50
table(g%in%s)#返回63个TRUE
##
## FALSE TRUE
## 37 63
table(s %in%g)#返回43个TRUE
##
## FALSE TRUE
## 7 43
length(intersect(g,s))#交集返回32,思考为什么不是43或63
## [1] 32
#以上三行代码结果不同,是因为g和s中都有重复元素:%in%对每个元素逐一判断(重复的元素各算一次),而intersect()返回的是去重后的交集
aa <- rnorm(10,mean = 0,sd = 18)
aa[aa < -2]#理解aa[aa < -2]和aa[aa <(-2)]和aa[aa <-2]三个代码区别
## [1] -9.790635 -2.826137 -22.182617
【小洁老师语录】代码不报错,不代表真的没错,要检查目的是否达到
坑:rnorm(10,mean = 0,sd = 18)[rnorm(10,mean = 0,sd = 18)<(-2)]
:[]中和[]外是两次独立生成的随机向量,不是同一个向量,所以取出的元素并不保证小于-2。
根据生成函数判断对象的数据类型;用class()函数判断数据类型
数据框来源:代码建、已有数据转换、读取文件、R语言的内置数据
heatmap(volcano)
# Hand-built example data frame: four genes, a direction label for each
# (two "up" followed by two "down"), and a numeric score per gene.
df1 <- data.frame(
  gene = sprintf("gene%d", 1:4),
  change = c("up", "up", "down", "down"),
  score = c(5, 3, -2, -4)
)
变量名只起提示作用,不起决定作用。
文件需要保存到工作目录---经典报错:找不到文件!!!
df2 <- read.csv("gene.csv")
df2
## gene change score
## 1 gene1 up 5
## 2 gene2 up 3
## 3 gene3 down -2
## 4 gene4 down -4
dim(df1)#维度,返回结果是给出数字,几行几列
## [1] 4 3
nrow(df1)#查看df1的行数
## [1] 4
ncol(df1)#查看df1的列数
## [1] 3
rownames(df1)#查看行名
## [1] "1" "2" "3" "4"
colnames(df1)#查看列名
## [1] "gene" "change" "score"
数据框用$取出的一列是向量,所以向量的运算和取子集方法都适用于它。
df1$score# `$` extracts one column by name; the result is a vector
## [1] 5 3 -2 -4
df1[2,2]# the element at row 2, column 2; inside [ , ] the comma separates the row index from the column index (a plain vector has one dimension, so it takes no comma)
## [1] "up"
df1[2,]# row 2 with every column; a single row is still a data frame
## gene change score
## 2 gene2 up 3
df1[c(1,3),1:2]# rows 1 and 3, columns 1 through 2
## gene change
## 1 gene1 up
## 3 gene3 down
df1[,"gene"]# the column named "gene"; a single column comes back as a vector
## [1] "gene1" "gene2" "gene3" "gene4"
test <- read.csv("exercise.csv")
统计test的Strand列有多少个+,多少个-?
#方法1
test$Strand
## [1] "-" "-" "+" "-" "-" "+" "+" "-" "-" "+" "+" "+" "-" "-" "-" "-" "-" "+" "+"
## [20] "-" "+" "-" "-" "+" "-" "-" "+" "+" "-" "+" "-" "+" "+" "+" "+" "-" "-" "+"
## [39] "-" "-" "+" "-" "+" "+" "+" "+" "+" "-" "-" "-" "-" "+" "+" "-" "-" "+" "-"
## [58] "+" "+" "+" "+" "-" "-" "-" "+" "-" "-" "-" "+" "-" "+" "-" "+" "+" "-" "-"
## [77] "-" "-" "-" "+" "+" "+" "+" "+" "+" "+" "-" "-" "+" "+" "-" "-" "+" "+" "+"
## [96] "-" "+" "-" "-" "-"
x <- c("+")
x
## [1] "+"
test$Strand %in% x
## [1] FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## [14] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [27] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [40] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [53] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [66] FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
## [79] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [92] FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE
table(test$Strand %in% x)#直接统计出51个'-',49个'+'
##
## FALSE TRUE
## 51 49
#方法2
test$Strand#查看test中strand列内容概括,只有'+'或'-'
## [1] "-" "-" "+" "-" "-" "+" "+" "-" "-" "+" "+" "+" "-" "-" "-" "-" "-" "+" "+"
## [20] "-" "+" "-" "-" "+" "-" "-" "+" "+" "-" "+" "-" "+" "+" "+" "+" "-" "-" "+"
## [39] "-" "-" "+" "-" "+" "+" "+" "+" "+" "-" "-" "-" "-" "+" "+" "-" "-" "+" "-"
## [58] "+" "+" "+" "+" "-" "-" "-" "+" "-" "-" "-" "+" "-" "+" "-" "+" "+" "-" "-"
## [77] "-" "-" "-" "+" "+" "+" "+" "+" "+" "+" "-" "-" "+" "+" "-" "-" "+" "+" "+"
## [96] "-" "+" "-" "-" "-"
length(test$Strand)#test中strand列共有100个元素。
## [1] 100
length(test$Strand[test$Strand %in% x])#test中strand列中"+"有49个,那么'-'有51个
## [1] 49
#实际方法
table(test$Strand)
## - +
## 51 49
反思:想得过于复杂了,起初理解的是怎么用逻辑判断去实现,走弯路了。把问题想复杂的根本原因是对table()函数理解不到位。table()可以统计向量中每个取值出现的次数,数值型、字符型和逻辑型向量都适用。
df1[3,3] <- 0#改一个格(一个元素)
df1$score <- c(1,2,3,4)#改一整列
df1$p.value <- c(5,6,7,8)#新建一列
rownames(df1) <- c('r1','r2','r3','r4')#修改行名
rownames(df1)#返回结果是一个向量
## [1] "r1" "r2" "r3" "r4"
colnames(df1)#返回结果是一个向量
## [1] "gene" "change" "score" "p.value"
colnames(df1)[2] <- "chance"#修改第二列列名就是修改【colnames(df1)】这个向量的第二个元素
列名有则改之,无则新增
merge()连接的前提:两个数据框中有相同的列名,同时该列的取值有交集。
# Illustrative inputs so the joins below can actually run. The notes never
# define the data frames they merge, so these values are examples only.
test1 <- data.frame(name = c("jimmy", "nicker", "Damon", "Sophie"),
                    blood_type = c("A", "B", "O", "AB"))
test2 <- data.frame(name = c("Damon", "jimmy", "nicker", "tony"),
                    group = c("group1", "group1", "group2", "group2"))
test3 <- data.frame(NAME = c("Damon", "jimmy", "nicker", "tony"),
                    weight = c(140, 145, 110, 138))

# Join two data frames on their shared key column "name". With the default
# settings only keys present in both inputs are kept.
merge(test1, test2, by = "name")

# When the key columns are named differently, either rename one of them first
# or name each side's key explicitly with by.x / by.y.
merge(test1, test3, by.x = "name", by.y = "NAME")
代码与数据不匹配时:改数据或改代码
m <- matrix(1:9,nrow = 3)
m
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
[ ]取子集适用于matrix,$不适用于matrix取子集。
colnames(m) <- c('a','b','c')#给矩阵m加列名
m
## a b c
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
思考一下数据转化代码和输出结果的区别
t(m)#给矩阵转置,行变为列,列变为行
## [,1] [,2] [,3]
## a 1 2 3
## b 4 5 6
## c 7 8 9
as.data.frame(m)#将矩阵m转换为数据框
## a b c
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
is.data.frame(m)
## [1] FALSE
t(m)#给矩阵转置,行变为列,列变为行
## [,1] [,2] [,3]
## a 1 2 3
## b 4 5 6
## c 7 8 9
aa <- as.data.frame(m)#将矩阵m转换为数据框
aa
## a b c
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
is.data.frame(aa)
## [1] TRUE
R语言的修改,都要赋值,没有赋值就没有发生过!
m
## a b c
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
# Draw m as a heatmap: colour encodes the value. Rows and columns may appear
# reordered because pheatmap clusters both axes by default.
pheatmap::pheatmap(m)
# Turn clustering off on both axes. Spell out FALSE: the shorthand F is an
# ordinary variable that can be reassigned.
pheatmap::pheatmap(m, cluster_cols = FALSE, cluster_rows = FALSE)
# Build a list of two matrices. Elements are named with `=`; writing `<-`
# inside list() would assign global variables and leave the elements unnamed,
# so x$m1 would return NULL.
x <- list(m1 = matrix(1:9, nrow = 3),
          m2 = matrix(2:9, nrow = 2))
x[[1]]#取列表x的第一个元素。
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
x$m1#取列表x的名为m1的元素。
## [,1] [,2] [,3]
##[1,] 1 4 7
##[2,] 2 5 8
##[3,] 3 6 9
scores <- c(1,2,3,4)
names(scores) <- c('x','y','z','a')#向量命名,不属于向量的内容,适合取子集时使用。
scores
## x y z a
## 1 2 3 4
names(scores)#名字组成的向量
## [1] "x" "y" "z" "a"
names(scores)[scores>3]#名字组成向量取子集
## [1] "a"
#统计iris最后一列有哪些取值,每个取值重复几次
#方法1
ncol(iris)# number of columns, i.e. the index of the last column (col() would return a matrix of column indices instead)
## [1] 5
table(iris[, ncol(iris)])# count how often each value occurs in the last column
##
## setosa versicolor virginica
## 50 50 50
#方法2
colnames(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
pheatmap::pheatmap(iris[,1:4])#iris前四列数据画热图,而heatmap函数的数据要求是matrix
# Name the list elements with `=`; `<-` inside list() would create global
# variables m1/m2 and leave the elements unnamed. The list is defined once,
# because both class() calls below inspect the same object.
x <- list(m1 = matrix(1:9, nrow = 3),
          m2 = matrix(2:9, nrow = 2))
# Single brackets keep the container: the result is a one-element list.
class(x[1])
## [1] "list"
# Double brackets pull out the element itself: here, a matrix.
class(x[[1]])
## [1] "matrix" "array"
k <- df1$score>0#逻辑值向量
df1[k,]#取子集-向量k逻辑值为TRUE的行组成的数据框
## gene chance score p.value
## r1 gene1 up 1 5
## r2 gene2 up 2 6
## r3 gene3 down 3 7
## r4 gene4 down 4 8
df1[k,1]#取子集-向量k逻辑值为TRUE的第一列,也就是筛选score>0的第一列
## [1] "gene1" "gene2" "gene3" "gene4"
df1$gene[k]#取子集-向量k逻辑值为TRUE的第一列,也就是筛选score>0的第一列
## [1] "gene1" "gene2" "gene3" "gene4"
df1$gene[df1$score>0]#取子集-向量k逻辑值为TRUE的第一列,也就是筛选score>0的第一列
## [1] "gene1" "gene2" "gene3" "gene4"
rm(x)# remove a single object
rm(df1, m)# remove several objects at once, separated by commas (the data frame is df1; no object named df exists)
rm(list = ls())# remove every object in the current workspace
#清空控制台快捷键control+l
# jimmy is the function name; a, b and m are its formal parameters.
# m = 2 is the default for m, used when the caller omits it; it is not a
# fixed value. The braces hold the code run on each call: it returns (a + b)^m.
jimmy <- function(a, b, m = 2) {
  total <- a + b
  total^m
}
jimmy(1,2)
## [1] 9
plot(iris[,1],col = iris[,5])#根据第五列分配plot的图片
# Plot column i of the built-in iris data frame, colouring each point by the
# fifth column. Wrapping the call in a function lets the same plot be drawn
# for any column by changing only i.
crazy <- function(i) {
  plot(x = iris[, i], col = iris[[5]])
}
crazy(4)#第4列
Tip: 当一个代码需要复制粘贴3次或以上,就应该写成函数或使用循环
# Return a^b - 1. b defaults to 2. The parameter c (default 1) is accepted
# but never used in the body, so it has no effect on the result.
crazy_bio <- function(a, b = 2, c = 1) {
  powered <- a^b
  powered - 1
}
crazy_bio(5)
## [1] 24
生信技能树 > 生信入门马拉松
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。