title: "生信技能树学习笔记"
author: "天空"
date: "2023-01-04"
output: html_document
library(stringr)
# Example sentence reused by the string-handling examples that follow.
x <- "The birch canoe slid on the smooth planks."
x
## [1] "The birch canoe slid on the smooth planks."
### 1. Measure string length
# str_length() counts the characters inside each string ...
str_length(string = x)
## [1] 42
# ... while length() counts the elements of the vector.
length(x)
## [1] 1
# Telling a character vector apart from a single string
y <- c("jimmy 150", "nicker 140", "tony 152")
length(y)
## [1] 3
str_length(string = y)
## [1] 9 10 8
### 2. Split strings
# str_split() returns a list with one element per input string.
str_split(x, " ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
# Pull out the first (and only) list element to get the words as a vector.
x2 <- str_split(x, " ")[[1]]
x2
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
# Split every string of a vector in a single call
y <- c("jimmy 150", "nicker 140", "tony 152")
str_split(y, " ")
## [[1]]
## [1] "jimmy" "150"
##
## [[2]]
## [1] "nicker" "140"
##
## [[3]]
## [1] "tony" "152"
# simplify = TRUE returns a character matrix instead of a list.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
str_split(y, " ", simplify = TRUE)
## [,1] [,2]
## [1,] "jimmy" "150"
## [2,] "nicker" "140"
## [3,] "tony" "152"
### 4. Detect characters
# Each call returns one logical value per element of x2.
str_detect(x2, pattern = "h")
## [1] TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
str_starts(x2, pattern = "T")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
str_ends(x2, pattern = "e")
## [1] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
### 5. Replace within strings
x2
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
# str_replace() changes only the first match in each string ...
str_replace(x2, pattern = "o", replacement = "A")
## [1] "The" "birch" "canAe" "slid" "An" "the" "smAoth" "planks."
# ... str_replace_all() changes every match.
str_replace_all(x2, pattern = "o", replacement = "A")
## [1] "The" "birch" "canAe" "slid" "An" "the" "smAAth" "planks."
### 6. Remove characters
x
## [1] "The birch canoe slid on the smooth planks."
# str_remove() drops the first match, str_remove_all() drops all of them.
str_remove(x, pattern = " ")
## [1] "Thebirch canoe slid on the smooth planks."
str_remove_all(x, pattern = " ")
## [1] "Thebirchcanoeslidonthesmoothplanks."
# Small demo data frame: rows 1-2, 51-52 and 101-102 of iris
# (two rows per species, as the printout shows).
test <- iris[c(1:2, 51:52, 101:102), ]
rownames(test) <- NULL # drop the row names; NULL means "nothing at all"
test
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 7.0 3.2 4.7 1.4 versicolor
## 4 6.4 3.2 4.5 1.5 versicolor
## 5 6.3 3.3 6.0 2.5 virginica
## 6 5.8 2.7 5.1 1.9 virginica
# arrange,数据框按照某一列排序
library(stringr)
library(dplyr)
# arrange(): sort the data frame by one column
arrange(test, Sepal.Length) # ascending
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.9 3.0 1.4 0.2 setosa
## 2 5.1 3.5 1.4 0.2 setosa
## 3 5.8 2.7 5.1 1.9 virginica
## 4 6.3 3.3 6.0 2.5 virginica
## 5 6.4 3.2 4.5 1.5 versicolor
## 6 7.0 3.2 4.7 1.4 versicolor
arrange(test, desc(Sepal.Length)) # descending
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.0 3.2 4.7 1.4 versicolor
## 2 6.4 3.2 4.5 1.5 versicolor
## 3 6.3 3.3 6.0 2.5 virginica
## 4 5.8 2.7 5.1 1.9 virginica
## 5 5.1 3.5 1.4 0.2 setosa
## 6 4.9 3.0 1.4 0.2 setosa
# distinct(): de-duplicate the data frame on one column.
# .keep_all = TRUE keeps the remaining columns as well (first row per value).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
distinct(test, Species, .keep_all = TRUE)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 7.0 3.2 4.7 1.4 versicolor
## 3 6.3 3.3 6.0 2.5 virginica
# mutate(): add a new column to the data frame
mutate(test, new = Sepal.Length * Sepal.Width)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species new
## 1 5.1 3.5 1.4 0.2 setosa 17.85
## 2 4.9 3.0 1.4 0.2 setosa 14.70
## 3 7.0 3.2 4.7 1.4 versicolor 22.40
## 4 6.4 3.2 4.5 1.5 versicolor 20.48
## 5 6.3 3.3 6.0 2.5 virginica 20.79
## 6 5.8 2.7 5.1 1.9 virginica 15.66
# ifelse() + str_detect(): a winning combination
samples <- c("tumor1", "tumor2", "tumor3", "normal1", "normal2", "normal3")
# Flag the samples whose name contains "tumor", then turn the flags into labels.
k1 <- str_detect(samples, pattern = "tumor")
k1
## [1] TRUE TRUE TRUE FALSE FALSE FALSE
ifelse(test = k1, yes = "tumor", no = "normal")
## [1] "tumor" "tumor" "tumor" "normal" "normal" "normal"
# Same labels, this time derived from the samples that contain "normal".
k2 <- str_detect(samples, pattern = "normal")
k2
## [1] FALSE FALSE FALSE TRUE TRUE TRUE
ifelse(test = k2, yes = "normal", no = "tumor")
## [1] "tumor" "tumor" "tumor" "normal" "normal" "normal"
x <- c(5, 6, 0, 3)
# How do we keep the result of every iteration?
# `s` holds the running total; each iteration stores c(value, running total)
# in its own slot of `result`.
s <- 0
# Allocate the list up front instead of growing it inside the loop.
result <- vector("list", length(x))
# seq_along() yields no iterations for an empty x, whereas 1:length(x) would
# produce c(1, 0) and fail on the first subscript.
for (i in seq_along(x)) {
  s <- s + x[[i]]
  result[[i]] <- c(x[[i]], s)
}
result
## [[1]]
## [1] 5 5
##
## [[2]]
## [1] 6 11
##
## [[3]]
## [1] 0 11
##
## [[4]]
## [1] 3 14
# Bind the per-iteration vectors together as columns ...
do.call(cbind, result)
## [,1] [,2] [,3] [,4]
## [1,] 5 6 0 3
## [2,] 5 11 11 14
# ... or as rows.
do.call(rbind, result)
## [,1] [,2]
## [1,] 5 5
## [2,] 6 11
## [3,] 0 11
## [4,] 3 14
# Expression matrix
# Seeded random 3 x 6 matrix, rounded to two decimals.
set.seed(10086)
exp <- round(matrix(rnorm(18), ncol = 6), 2)
rownames(exp) <- paste0("gene", 1:3)
colnames(exp) <- paste0("test", 1:6)
# Add 1 to the first three columns.
exp[, 1:3] <- exp[, 1:3] + 1
exp
## test1 test2 test3 test4 test5 test6
## gene1 1.55 1.49 1.80 -0.37 -1.82 -1.62
## gene2 -1.74 0.37 2.08 2.11 -0.22 1.42
## gene3 1.57 1.25 1.32 2.49 0.58 -0.81
library(tidyr)
library(tibble)
library(dplyr)
# Transpose so each row is a sample, move the row names into a column,
# and label the first three rows "control" and the last three "treat".
dat <- t(exp) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  mutate(group = rep(c("control", "treat"), each = 3))
# Reshape from wide to long: one row per sample/gene pair
pdat <- pivot_longer(
  dat,
  cols = starts_with("gene"),
  names_to = "gene",
  values_to = "count"
)
library(ggplot2)
# Box plot of count per gene, filled by group.
p <- ggplot(pdat, aes(x = gene, y = count)) +
  geom_boxplot(aes(fill = group)) +
  theme_bw()
p
# Same plot split into one panel per gene, each with free scales.
p + facet_wrap(~gene, scales = "free")
## The apply() family of functions
### 1. apply: works on a matrix or data frame
# apply(X, MARGIN, FUN, ...)
#   X      - the data frame / matrix
#   MARGIN - 1 means rows, 2 means columns
#   FUN    - the function to apply
test <- iris[1:6, 1:4]
# Mean of every column
apply(test, MARGIN = 2, FUN = mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 4.9500000 3.3833333 1.4500000 0.2333333
# Sum of every row
apply(test, MARGIN = 1, FUN = sum)
## 1 2 3 4 5 6
## 10.2 9.5 9.4 9.4 10.2 11.4
### 2. lapply(list, FUN, ...)
# Run the same operation on every element (vector) of a list / vector
test <- list(x = 36:33, y = 32:35, z = 30:27)
test
## $x
## [1] 36 35 34 33
##
## $y
## [1] 32 33 34 35
##
## $z
## [1] 30 29 28 27
# The return value is a list: the mean of every element (vector) of the list
# (also try the variance var, or the quantiles quantile)
lapply(test, FUN = mean)
## $x
## [1] 34.5
##
## $y
## [1] 33.5
##
## $z
## [1] 28.5
lapply(test, FUN = fivenum)
## $x
## [1] 33.0 33.5 34.5 35.5 36.0
##
## $y
## [1] 32.0 32.5 33.5 34.5 35.0
##
## $z
## [1] 27.0 27.5 28.5 29.5 30.0
### 3. sapply simplifies the result and returns a matrix or vector
sapply(test, FUN = mean)
## x y z
## 34.5 33.5 28.5
sapply(test, FUN = fivenum)
## x y z
## [1,] 33.0 32.0 27.0
## [2,] 33.5 32.5 27.5
## [3,] 34.5 33.5 28.5
## [4,] 35.5 34.5 29.5
## [5,] 36.0 35.0 30.0
class(sapply(test, FUN = fivenum))
## [1] "matrix" "array"
# Two small tables sharing a `name` column, used by the join examples.
test1 <- data.frame(
  name = c("jimmy", "nicker", "Damon", "Sophie"),
  blood_type = c("A", "B", "O", "AB")
)
test1
## name blood_type
## 1 jimmy A
## 2 nicker B
## 3 Damon O
## 4 Sophie AB
test2 <- data.frame(
  name = c("Damon", "jimmy", "nicker", "tony"),
  group = c("group1", "group1", "group2", "group2"),
  vision = c(4.2, 4.3, 4.9, 4.5)
)
test2
## name group vision
## 1 Damon group1 4.2
## 2 jimmy group1 4.3
## 3 nicker group2 4.9
## 4 tony group2 4.5
library(dplyr)
# inner_join(): only the names present in both tables
inner_join(x = test1, y = test2, by = "name")
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 Damon O group1 4.2
# right_join(): every row of test2, with NA where test1 has no match
right_join(x = test1, y = test2, by = "name")
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 Damon O group1 4.2
## 4 tony <NA> group2 4.5
# full_join(): every row of both tables
full_join(x = test1, y = test2, by = "name")
## name blood_type group vision
## 1 jimmy A group1 4.3
## 2 nicker B group2 4.9
## 3 Damon O group1 4.2
## 4 Sophie AB <NA> NA
## 5 tony <NA> group2 4.5
# semi_join(): rows of test1 that have a match in test2 (test1 columns only)
semi_join(x = test1, y = test2, by = "name")
## name blood_type
## 1 jimmy A
## 2 nicker B
## 3 Damon O
# anti_join(): rows of test1 that have no match in test2
anti_join(x = test1, y = test2, by = "name")
## name blood_type
## 1 Sophie AB
# 1.match-----
# Load the saved objects from matchtest.Rdata into the workspace. Judging by
# the printouts below it supplies at least `x` and `y`, replacing the earlier
# variables of the same names.
load("matchtest.Rdata")
# `x` prints as a lookup table with a file_name column and an ID column.
x
## file_name ID
## 1 708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz TCGA-AA-3531-01A-01R-0821-07
## 2 95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz TCGA-A6-2678-11A-01R-A32Z-07
## 3 90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz TCGA-AA-A02K-01A-03R-A32Y-07
## 4 587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz TCGA-NH-A6GA-01A-11R-A37K-07
## 5 1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz TCGA-AZ-6600-11A-01R-1774-07
## 6 09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz TCGA-CM-6676-01A-11R-1839-07
## 7 44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz TCGA-AA-3971-01A-01R-1022-07
# `y` prints with one column per file name, in a different order than x$file_name.
y
## 90a46dce-5762-47ec-925c-deff853069aa.htseq.counts.gz
## ENSG00000000003.13 6564
## ENSG00000000005.5 29
## ENSG00000000419.11 2659
## ENSG00000000457.12 246
## ENSG00000000460.15 145
## ENSG00000000938.11 37
## ENSG00000000971.14 77
## 587e44e4-87ba-4981-a520-d20612486f53.htseq.counts.gz
## ENSG00000000003.13 3127
## ENSG00000000005.5 0
## ENSG00000000419.11 889
## ENSG00000000457.12 382
## ENSG00000000460.15 188
## ENSG00000000938.11 749
## ENSG00000000971.14 708
## 95e726db-5ccc-4836-a2ae-7feaddaf9f1b.htseq.counts.gz
## ENSG00000000003.13 6330
## ENSG00000000005.5 0
## ENSG00000000419.11 2428
## ENSG00000000457.12 1701
## ENSG00000000460.15 1009
## ENSG00000000938.11 220
## ENSG00000000971.14 530
## 09796233-3f40-4deb-b77d-2267c3afff59.htseq.counts.gz
## ENSG00000000003.13 3583
## ENSG00000000005.5 70
## ENSG00000000419.11 1436
## ENSG00000000457.12 590
## ENSG00000000460.15 440
## ENSG00000000938.11 812
## ENSG00000000971.14 4173
## 708a16a3-7a5e-4e27-b06b-4c3c308b11fe.htseq.counts.gz
## ENSG00000000003.13 643
## ENSG00000000005.5 17
## ENSG00000000419.11 2476
## ENSG00000000457.12 804
## ENSG00000000460.15 496
## ENSG00000000938.11 962
## ENSG00000000971.14 3958
## 44f1dc34-a01e-4a7b-a7a1-a90064039fdd.htseq.counts.gz
## ENSG00000000003.13 1514
## ENSG00000000005.5 13
## ENSG00000000419.11 876
## ENSG00000000457.12 483
## ENSG00000000460.15 250
## ENSG00000000938.11 75
## ENSG00000000971.14 272
## 1b843dbb-5ef0-47ca-9783-dbeb94aa6df3.htseq.counts.gz
## ENSG00000000003.13 11751
## ENSG00000000005.5 26
## ENSG00000000419.11 2494
## ENSG00000000457.12 531
## ENSG00000000460.15 632
## ENSG00000000938.11 85
## ENSG00000000971.14 319
## How do we correctly replace the column names of y with the IDs in x?
## (1) Step-by-step solution
a <- colnames(y)
b <- x$file_name
k <- match(a, b)
k
## [1] 3 4 2 6 1 7 5
# match(a, b) gives, for every element of a, its position in b.
# The positions are indices into b, so they can subset b itself or any other
# vector that lines up with b.
colnames(y) <- x$ID[k]
## (2) One-step solution
load("matchtest.Rdata")
colnames(y) <- x$ID[match(colnames(y), x$file_name)]
## (3) Solution that does without match
load("matchtest.Rdata")
# Use the file names as row names so x can be reordered by y's column names.
rownames(x) <- x$file_name
x <- x[colnames(y), ]
colnames(y) <- x$ID
# 2. Some functions for working with files ----
dir() # list the files in the working directory
## [1] "0_pre_install.R"
## [2] "1_玩转字符串.R"
## [3] "2_玩转数据框.R"
## [4] "3_条件和循环.R"
## [5] "4_表达矩阵画箱线图.R"
## [6] "5_隐式循环.R"
## [7] "6_两个数据框的连接.R"
## [8] "7_一些顶呱呱的函数.R"
## [9] "8_exercise.R"
## [10] "9_anwser.R"
## [11] "deg.Rdata"
## [12] "figure"
## [13] "group.csv"
## [14] "matchtest.Rdata"
## [15] "R_04.Rproj"
## [16] "test1.Rdata"
## [17] "test2.Rdata"
## [18] "生信技能树笔记day7.html"
## [19] "生信技能树笔记day7.Rmd"
# List the files in the working directory whose names end in ".R".
# `pattern` is a regular expression, so the dot is escaped: a bare "." matches
# any character and ".R$" would also accept a name such as "fooR".
dir(pattern = "\\.R$")
## [1] "0_pre_install.R"
## [2] "1_玩转字符串.R"
## [3] "2_玩转数据框.R"
## [4] "3_条件和循环.R"
## [5] "4_表达矩阵画箱线图.R"
## [6] "5_隐式循环.R"
## [7] "6_两个数据框的连接.R"
## [8] "7_一些顶呱呱的函数.R"
## [9] "8_exercise.R"
## [10] "9_anwser.R"
file.create("douhua.txt") # create a file from code
## [1] TRUE
file.exists("douhua.txt") # does this file exist in the working directory?
## [1] TRUE
file.remove("douhua.txt") # delete a file from code
## [1] TRUE
file.exists("douhua.txt") # once deleted, it no longer exists
## [1] FALSE
## Files can also be created and deleted in bulk
f <- paste0("douhua", 1:100, ".txt")
file.create(f)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [19] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [55] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
file.remove(f)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [19] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [55] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。