1.R包和数据准备 import pandas as pd import numpy as np import scanpy as sc import matplotlib.pyplot as plt GSM6736629_10x-PBMC-1_ds0.1974_CountMatrix.tsv.gz文件在GEO直接搜索编号即可下载。 ### step1:读入表达量矩阵,构建单细胞对象 ------ path ='GSM6736629_10x-PBMC-1_ds0.1974_CountMatrix.tsv.gz' adata=sc.read_text(path).T #.T是转置 adata.shape (4557, 33104) adata.var_names = adata.var_names.str.split(':').str[0]#行名不规范,只想留下前半段 adata.var_names_make_unique() adata.var.head()
2.质控 sc.pp.filter_cells(adata,min_genes=200) sc.pp.filter_genes(adata,min_cells=3) adata.var['mt']=adata.var_names.str.startswith('MT-') adata.var.mt.value_counts() mt False 25965 True 30 Name: count, dtype: int64 sc.pp.calculate_qc_metrics(adata,qc_vars=['mt'],log1p=False,percent_top=None,inplace=True) sc.pl.violin(adata,["n_genes_by_counts", "total_counts", "pct_counts_mt"],jitter=0.4, multi_panel=True) #mt是0 #sc.pl.violin(adata,["n_genes_by_counts", "total_counts"],jitter=0.4, multi_panel=True)
adata=adata[adata.obs.n_genes_by_counts>200] adata=adata[adata.obs.n_genes_by_counts<4000] adata=adata[adata.obs.pct_counts_mt<10] adata.shape sc.pp.normalize_total(adata,target_sum=1e4) sc.pp.log1p(adata) adata.lognorm=adata.copy() celltypist包需要的数据是log normlize之后的数据,不需要进行标准流程后续的降维聚类,都被这个包包进去了。 3.注释 celltypist包提供了大量的参考数据,目前是52个,还挺丰富的。https://www.celltypist.org/models 根据自己的数据的物种、组织等背景知识选择合适的。 import celltypist from celltypist import models model = models.Model.load(model='Immune_All_High.pkl') #https://www.celltypist.org/models model predictions = celltypist.annotate(adata.lognorm, model = model, majority_voting = True) predictions.predicted_labels 🔬 Input data has 4321 cells and 25995 genes 🔗 Matching reference genes in the model 🧬 5601 features used for prediction ⚖️ Scaling input data 🖋️ Predicting labels ✅ Prediction done! 👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it ⛓️ Over-clustering input data with resolution set to 5 🗳️ Majority voting the predictions ✅ Majority voting done!
adata2 = predictions.to_adata() adata2.obs
可以看到预测结果中已经有了注释的结果,其中majority_voting是多数投票的结果,就用它。 adata2.obs.majority_voting = adata2.obs.majority_voting.str.replace(' cells', '', regex=False) adata2.obs.majority_voting = adata2.obs.majority_voting.str.replace(' monocytes', ' Mono', regex=False) adata2.obs.majority_voting AAACCCAAGTAGGGTC Monocytes AAACCCACACCATTCC B AAACCCATCTACACTT T AAACGAAAGCACGTCC T AAACGAAGTTCAAACC ILC ... TTTGGTTAGGTCATAA T TTTGTTGAGAGGGTAA ILC TTTGTTGCAGGATGAC T TTTGTTGCATCGCTCT B TTTGTTGTCTGGGATT T Name: majority_voting, Length: 4321, dtype: object sc.tl.umap(adata2) sc.pl.umap(adata2, color = 'majority_voting',legend_loc="on data")
4.搞个漂亮点的图 import pandas as pd umap_coords = adata2.obsm['X_umap'] labels = adata2.obs['majority_voting'] plt.figure(figsize=(6,5)) unique_labels = labels.unique() colors = plt.cm.get_cmap('Accent', len(unique_labels)) for i, label in enumerate(unique_labels): plt.scatter(umap_coords[labels == label, 0], umap_coords[labels == label, 1], color=colors(i),s=1,alpha = 0.4) for label in unique_labels: label_coords = umap_coords[labels == label] center_x = label_coords[:, 0].mean() center_y = label_coords[:, 1].mean() plt.text(center_x, center_y, label, fontsize=10, ha='center', va='center') plt.grid(False) plt.title('majority_voting', fontsize=14) plt.xlabel('UMAP 1', fontsize=12) plt.ylabel('UMAP 2', fontsize=12) plt.show()