
前面我们介绍了python版visium HD空转数据的分析,scanpy+spatialdata策略(10X空转visium HD多样本分析策略二【scanpy+spatialdata】),这也是比较正式的一种方式。其中有亮点问题没有涉及到,一个是多样本去批次问题(事实证明,我们演示的数据没有批次效应,这里我们仅仅是演示,么有意义),一个是类似于R中那样进行下采样分析,因为visium HD数据太大了,虽然服务器python处理百万级别的矩阵没有问题,能够轻松完成,但是如果样本增加,那么分析也是需要花费一定时间的,所以这里介绍一下sketch下采样分析。
import scanpy as sc
import numpy as np
import pandas as pd
import squidpy as sq
import matplotlib.pyplot as plt
import spatialdata as spd
import spatialdata_plot as splt
import spatialdata_io as so
import geosketch as sketch
import json
import gc
from PIL import Image加载数据,这个数据是之前两个样本,读入之后合并保存了结果,这里直接读入进行后续演示!
#已经保存的spatial object的读入
sdata_comb = spd.read_zarr('./comb_sdata/')
sdata_comb├── Images
│ ├── 'P5_CRC_cytassist_image': DataArray[cyx] (3, 3000, 3199)
│ ├── 'P5_CRC_hires_image': DataArray[cyx] (3, 5298, 6000)
│ ├── 'P5_CRC_lowres_image': DataArray[cyx] (3, 530, 600)
│ ├── 'P5_NAT_cytassist_image': DataArray[cyx] (3, 3004, 3200)
│ ├── 'P5_NAT_hires_image': DataArray[cyx] (3, 4153, 6000)
│ └── 'P5_NAT_lowres_image': DataArray[cyx] (3, 415, 600)
├── Shapes
│ ├── 'P5_CRC_square_008um': GeoDataFrame shape: (541968, 1) (2D shapes)
│ └── 'P5_NAT_square_008um': GeoDataFrame shape: (435773, 1) (2D shapes)
└── Tables
└── 'square_008um': AnnData (977741, 18085)
with coordinate systems:
▸ 'P5_CRC', with elements:
P5_CRC_cytassist_image (Images), P5_CRC_hires_image (Images), P5_CRC_lowres_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_CRC_downscaled_hires', with elements:
P5_CRC_hires_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_CRC_downscaled_lowres', with elements:
P5_CRC_lowres_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_NAT', with elements:
P5_NAT_cytassist_image (Images), P5_NAT_hires_image (Images), P5_NAT_lowres_image (Images), P5_NAT_square_008um (Shapes)
▸ 'P5_NAT_downscaled_hires', with elements:
P5_NAT_hires_image (Images), P5_NAT_square_008um (Shapes)
▸ 'P5_NAT_downscaled_lowres', with elements:
P5_NAT_lowres_image (Images), P5_NAT_square_008um (Shapes)adata_comb = sdata_comb["square_008um"]
#计算线粒体基因比例
adata_comb.var["mt"] = adata_comb.var_names.str.startswith(("MT-"))
sc.pp.calculate_qc_metrics(adata_comb, qc_vars=['mt'], inplace=True, percent_top=None)#常规指标过滤
adata_comb = adata_comb[adata_comb.obs["pct_counts_mt"] < 40].copy()
sc.pp.filter_genes(adata_comb, min_cells=50)
sc.pp.filter_cells(adata_comb, min_counts=50)
sc.pp.filter_cells(adata_comb, max_counts=3000)#标准化
sc.pp.normalize_total(adata_comb, target_sum = None)
sc.pp.log1p(adata_comb)
sc.tl.pca(adata_comb)
# Elbow plot
sc.pl.pca_variance_ratio(adata_comb, log=True,n_pcs=50)
#spatial object是已经合并的,要对样本下采样,先按照sample分开!
sketched_adatas = {} #将分开的samle object储存在字典
for value in adata_comb.obs["sample"].unique():
subset = adata_comb.obs["sample"] == value
subset_adata = adata_comb[subset, :].to_memory()
N = 0.2 * subset_adata.shape[0]#需要抽取的细胞数,抽取20%的细胞
sketch_index = sketch.gs(X=subset_adata.obsm["X_pca"], N=int(N), seed=1) #Geometric Sketching
sketched_adatas[value] = subset_adata[sketch_index, :]
sketched_adata = sc.concat(sketched_adatas)#合并进行后续降维分析sc.tl.pca(sketched_adata)#这里直接演示使用harmony进行批次矫正,其他方法参照scanpy教程,空间数据不应该使用矫正太过于严重的方法
import scanpy.external as sce
sce.pp.harmony_integrate(sketched_adata, key="sample", basis="X_pca",max_iter_harmony=20)sc.pp.neighbors(sketched_adata, n_neighbors=30, use_rep="X_pca_harmony",metric="correlation")
sc.tl.leiden(sketched_adata, flavor="igraph", key_added="clusters_sketched", resolution=0.5,random_state=0)
sc.tl.umap(sketched_adata,min_dist=0.5, spread=2, random_state=0)# Plot UMAP
#这个结果起来批次矫枉过正了,作为演示流程,不再修改了。
sc.pl.umap(sketched_adata, color=["clusters_sketched"], title="UMAP by Clusters")
sc.pl.umap(sketched_adata, color=["sample"], title="UMAP by Sample")

将下采样降维聚类数据分批映射回所有细胞!这里采取分块(chunk)的方式,也就说说每次执行10万个细胞,最后将返回的结果concat到一起,这样做的目的是为了节省内存!
# Projecting the results onto the complete data set
chunk_size = 100000#每次执行的细胞数
#计算所有细胞分为多少块
num_chunks = adata_comb.n_obs // chunk_size + (1 if adata_comb.n_obs % chunk_size != 0 else 0)
#分块的数据储存在一盒列表
adata_projected_list = []
for i in range(num_chunks):
start = i * chunk_size
end = min((i + 1) * chunk_size, adata_comb.n_obs)
adata_query_chunk = adata_comb[start:end].to_memory()
sc.tl.ingest(adata_query_chunk, sketched_adata, obs="clusters_sketched")
adata_projected_list.append(adata_query_chunk)
# Concatenate the ingested chunks
adata_projected = sc.concat(adata_projected_list)
del adata_query_chunk, start, end, i, num_chunks
gc.collect()
sc.pl.umap(adata_projected, color=["clusters_sketched"],title="clusters_projected")
table = sdata_comb.tables["square_008um"]
table.obs["clusters_sketched"] = pd.NA
table.obsm["X_umap"] = np.full((table.n_obs, 2), np.nan)
table.obsm["X_pca"] = np.full((table.n_obs, adata_projected.obsm["X_pca"].shape[1]), np.nan)
#获取降维数据的index,将降维的bins信息返回原先的数据,那么之前过滤的bins就成为NA组了。
idx = adata_projected.obs.index
table.obs.loc[idx, "clusters_sketched"] = adata_projected.obs["clusters_sketched"].values
table.obsm["X_umap"][table.obs.index.isin(idx)] = adata_projected.obsm["X_umap"]
table.obsm["X_pca"][table.obs.index.isin(idx)] = adata_projected.obsm["X_pca"]sdata_comb├── Images
│ ├── 'P5_CRC_cytassist_image': DataArray[cyx] (3, 3000, 3199)
│ ├── 'P5_CRC_hires_image': DataArray[cyx] (3, 5298, 6000)
│ ├── 'P5_CRC_lowres_image': DataArray[cyx] (3, 530, 600)
│ ├── 'P5_NAT_cytassist_image': DataArray[cyx] (3, 3004, 3200)
│ ├── 'P5_NAT_hires_image': DataArray[cyx] (3, 4153, 6000)
│ └── 'P5_NAT_lowres_image': DataArray[cyx] (3, 415, 600)
├── Shapes
│ ├── 'P5_CRC_square_008um': GeoDataFrame shape: (541968, 1) (2D shapes)
│ └── 'P5_NAT_square_008um': GeoDataFrame shape: (435773, 1) (2D shapes)
└── Tables
└── 'square_008um': AnnData (977741, 18085)
with coordinate systems:
▸ 'P5_CRC', with elements:
P5_CRC_cytassist_image (Images), P5_CRC_hires_image (Images), P5_CRC_lowres_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_CRC_downscaled_hires', with elements:
P5_CRC_hires_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_CRC_downscaled_lowres', with elements:
P5_CRC_lowres_image (Images), P5_CRC_square_008um (Shapes)
▸ 'P5_NAT', with elements:
P5_NAT_cytassist_image (Images), P5_NAT_hires_image (Images), P5_NAT_lowres_image (Images), P5_NAT_square_008um (Shapes)
▸ 'P5_NAT_downscaled_hires', with elements:
P5_NAT_hires_image (Images), P5_NAT_square_008um (Shapes)
▸ 'P5_NAT_downscaled_lowres', with elements:
P5_NAT_lowres_image (Images), P5_NAT_square_008um (Shapes)sdata_comb.pl.render_images("P5_CRC_hires_image")\
.pl.render_shapes("P5_CRC_square_008um", color="clusters_sketched")\
.pl.show(coordinate_systems="P5_CRC_downscaled_lowres", title='P5_CRC')
觉得分享有用的点个赞再走呗!
合作联系
关注不迷路:扫描下面二维码关注公众号! B站视频号链接:https://space.bilibili.com/471040659?spm_id_from=333.1007.0.0