from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import snapatac2 as snap
import scvi
import bioquest as bq
import sckit as sk
基因组注释文件
gff_file="~/DataHub/Genomics/GENCODE/hg38.v43.chr_patch_hapl_scaff.basic.annotation.gff3.gz"
adata = snap.read_dataset("PBMC.h5ads")
query = snap.pp.make_gene_matrix(adata, gff_file=gff_file)
query.obs_names = adata.obs_names
从网站下载的已经注释好的单细胞转录组h5ad文件作为reference
reference = sc.read("10x-Multiome-Pbmc10k-RNA.h5ad")
data = reference.concatenate(query, batch_categories=["reference", "query"])
data.layers["counts"] = data.X.copy()
#过滤
sc.pp.filter_genes(data, min_cells=5)
#文库大小
sc.pp.normalize_total(data, target_sum=1e4)
#log转化
sc.pp.log1p(data)
# 前5000高变基因
sc.pp.highly_variable_genes(
data,
n_top_genes = 5000,
flavor="seurat_v3",
layer="counts",
batch_key="batch",
subset=True
)
scvi.model.SCVI.setup_anndata(data, layer="counts", batch_key="batch")
vae = scvi.model.SCVI(
data,
n_layers=2,
n_latent=30,
gene_likelihood="nb",
dispersion="gene-batch",
)
vae.train(max_epochs=1000, early_stopping=True,use_gpu=True)
converged 即1000个epochs前,模型的损失不在大幅度减小
ax = vae.history['elbo_train'][1:].plot()
vae.history['elbo_validation'].plot(ax=ax)
data.obs["celltype_scanvi"] = 'Unknown'
ref_idx = data.obs['batch'] == "reference"
data.obs["celltype_scanvi"][ref_idx] = data.obs['cell_type'][ref_idx]
lvae = scvi.model.SCANVI.from_scvi_model(
vae,
adata=data,
labels_key="celltype_scanvi",
unlabeled_category="Unknown",
)
lvae.train(max_epochs=1000, n_samples_per_label=100,use_gpu=True)
lvae.history['elbo_train'][1:].plot()
data.obs["C_scANVI"] = lvae.predict(data)
data.obsm["X_scANVI"] = lvae.get_latent_representation(data)
sc.pp.neighbors(data, use_rep="X_scANVI")
sc.tl.umap(data)
sc.pl.umap(data, color=['C_scANVI', "batch"], wspace=0.45)
obs = data.obs
obs = obs[obs['batch'] == 'query']
obs.index = list(map(lambda x: x.split("-query")[0], obs.index))
atac=adata.to_adata()
adata.close()
atac.obs['cell_type'] = obs.loc[atac.obs.index]['C_scANVI']
根据leiden cluster手动注释细胞类型
from collections import Counter
cell_type_labels = np.unique(atac.obs['cell_type'])
count_table = {}
for cl, ct in zip(atac.obs['leiden'], atac.obs['cell_type']):
if cl in count_table:
count_table[cl].append(ct)
else:
count_table[cl] = [ct]
mat = []
for cl, counts in count_table.items():
c = Counter(counts)
c = np.array([c[ct] for ct in cell_type_labels])
c = c / c.sum()
mat.append(c)
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
df_cm = pd.DataFrame(
mat,
index = count_table.keys(),
columns = cell_type_labels,
)
sn.clustermap(df_cm, annot=True)
sk.label_helper(len(atac.obs.leiden.unique())-1)
new_cluster_names ='''
0,CD14 Mono
1,MAIT
2,CD4 Naive
3,NK
4,Memory B
5,CD14 Mono
6,CD14 Mono
7,Intermediate B
8,cDC
9,Naive B
10,Naive B
11,Treg
12,cDC
13,Treg
14,CD14 Mono
15,pDC
'''
sk.labeled(atac,cluster_names=new_cluster_names,reference_key='leiden')
sc.pl.umap(atac, color=['leiden', "CellType"], wspace=0.45,legend_loc="on data",legend_fontoutline=3)
atac.write_h5ad("atac_annot.h5ad",compression='lzf')
adata.close()
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有