社区首页 >专栏 >内容更新---单细胞空间差异基因的富集分析（python）

内容更新---单细胞空间差异基因的富集分析（python）

原创

追风少年i

发布于 2025-05-22 08:46:44

10600

代码可运行

运行总次数：0

代码可运行

作者，Evil Genius

快51了，大家准备去哪里玩？

目前针对全python流程的搭建，不知道各个公司做的怎么样了，不过针对我们课程内容，需要准备完成。

R版本的GO、KEGG富集分析采用了clusterprofiler，来到我们python，采用的是gseapy模块。

我们更新一下这部分内容

前面的基础分析要做到，最好做一下细胞注释，保存出h5ad文件

import scanpy as sc
import pandas as pd
import gseapy as gp
import numpy as np
import matplotlib.pyplot as plt
import os

# 创建保存结果的目录
os.makedirs('enrichment_results', exist_ok=True)
os.makedirs('enrichment_plots', exist_ok=True)

# 加载单细胞数据
adata = sc.read("test.h5ad")
####基础分析(需要调整)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)

# 聚类分析
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_pcs=30, n_neighbors=20)
sc.tl.leiden(adata, resolution=0.5)
sc.tl.umap(adata)

#最好细胞聚类与注释（细胞注释放在celltype列），这里以cluster为例。
sc.tl.rank_genes_groups(adata, groupby='cluster', method='wilcoxon')
# 获取所有cluster ID
clusters = adata.obs['leiden'].cat.categories.tolist()

# 设置富集分析的参数
gene_sets = ['GO_Biological_Process_2023', 'KEGG_2021_Human']
organism = 'human'  ####物种参数

# 循环处理每个cluster
for cluster in clusters:
    print(f"\nProcessing cluster {cluster}...")
    
    # 1. 获取差异基因
    deg_df = pd.DataFrame({
        'names': adata.uns['rank_genes_groups']['names'][cluster],
        'scores': adata.uns['rank_genes_groups']['scores'][cluster],
        'pvals': adata.uns['rank_genes_groups']['pvals'][cluster],
        'pvals_adj': adata.uns['rank_genes_groups']['pvals_adj'][cluster],
        'logfoldchanges': adata.uns['rank_genes_groups']['logfoldchanges'][cluster]
    })
    
    # 筛选显著差异基因 (调整p值<0.05且logFC>0.5)
    significant_genes = deg_df[(deg_df['pvals_adj'] < 0.05) & (abs(deg_df['logfoldchanges']) > 0.5)]
    
    if len(significant_genes) < 5:
        print(f"Cluster {cluster} has too few significant genes ({len(significant_genes)}), skipping...")
        continue
    
    # 取差异最显著的前100个基因
    gene_list = significant_genes.sort_values('scores', ascending=False)['names'].head(100).tolist()
    
    # 2. 执行富集分析
    try:
        # GO和KEGG富集分析
        enr_results = gp.enrich(
            gene_list=gene_list,
            gene_sets=gene_sets,
            organism=organism,
            outdir=None,
            cutoff=0.05
        )
        
        # 3. 保存结果
        # 保存差异基因
        deg_filename = f'enrichment_results/cluster_{cluster}_differential_genes.csv'
        significant_genes.to_csv(deg_filename, index=False)
        
        # 保存富集结果
        if enr_results is not None:
            enr_filename = f'enrichment_results/cluster_{cluster}_enrichment.csv'
            enr_results.res2d.to_csv(enr_filename)
            
            # 4. 可视化
            # 绘制并保存前10个富集项
            plt.figure(figsize=(10, 6))
            top_terms = enr_results.res2d.head(10)
            top_terms.plot.barh(x='Term', y='Adjusted P-value', legend=False)
            plt.title(f'Cluster {cluster} - Top 10 Enriched Terms')
            plt.xlabel('-log10(Adjusted P-value)')
            plt.tight_layout()
            plot_filename = f'enrichment_plots/cluster_{cluster}_enrichment.png'
            plt.savefig(plot_filename)
            plt.close()
            
            print(f"Cluster {cluster} analysis completed. Results saved.")
        else:
            print(f"No enrichment results for cluster {cluster}")
            
    except Exception as e:
        print(f"Error processing cluster {cluster}: {str(e)}")

# 收集所有cluster的富集结果
all_enrichments = []

for cluster in clusters:
    try:
        filename = f'enrichment_results/cluster_{cluster}_enrichment.csv'
        if os.path.exists(filename):
            df = pd.read_csv(filename)
            df['Cluster'] = cluster
            all_enrichments.append(df)
    except:
        continue

if all_enrichments:
    # 合并所有结果
    combined_enr = pd.concat(all_enrichments)
    
    # 保存合并结果
    combined_enr.to_csv('enrichment_results/combined_enrichment_results.csv', index=False)
    
    # 找出在多个cluster中富集的通路
    common_pathways = combined_enr['Term'].value_counts().head(20)
    print("\nPathways enriched in multiple clusters:")
    print(common_pathways)
    
    # 可视化共同通路
    plt.figure(figsize=(10, 8))
    common_pathways.plot.barh()
    plt.title('Pathways Enriched in Multiple Clusters')
    plt.xlabel('Number of Clusters')
    plt.tight_layout()
    plt.savefig('enrichment_plots/common_pathways_across_clusters.png')
    plt.close()
else:
    print("No enrichment results to combine")