CDR3 的氨基酸序列多样性是免疫系统识别海量抗原(如病原体、肿瘤抗原、自身抗原等)的分子基础,而 CDR3 motif 往往与特定抗原的识别相关。通过分析 CDR3 motif 的序列特征(如长度、关键氨基酸残基),可阐明免疫受体(Ig/TCR)与抗原结合的分子规律,揭示 “抗原 - 受体” 相互作用的特异性机制,为理解免疫应答的精准调控提供依据
CDR3 氨基酸motif比较通常选择特定长度进行。一般选择组间有显著差异的CDR3 长度。如下图所示,左边与右边分别为不同组。CDR3的氨基酸长度分别为16、17与18。然后展示了CDR3 氨基酸motif的特征。通常两端的氨基酸较为保守,而中间的氨基酸多样性大。不同氨基酸按照化学性质标记为不同颜色。

图1.不同长度的CDR3 氨基酸motif比较
这是一种忽略CDR3长度,关注CDR3中间氨基酸多样性motif分析策略。通常以CDR3最中间的氨基酸作为位置0,然后分别计算中间与左右两边各两个氨基酸的motif特征。

图2.CDR3中间氨基酸motif 分析
library(tidyr)
library(tidyverse)
files <-list.files(path = "./samples/HC", pattern = "\\clones.txt$",full.names = TRUE)
# 创建一个空列表用于存储不同长度的数据框
CDR3_data_list <- list()
for (length_val in c(13, 14, 15,16, 17, 18,19,20, 21,26)) {
# 创建对应长度的数据框
CDR3_data_list[[paste0("CDR3of", length_val, "AAA")]] <- data.frame(length = 1:500)
}
# 在循环中动态填充数据
for (file_path in files) {
raw <- file_path
# 提取文件名和目录名称
out <- basename(raw)
nam <- tools::file_path_sans_ext(out)
mycounts <- read.table(raw, sep = "\t", header = TRUE)
new_data <- mycounts[, c("cloneId", "cloneCount", "targetSequences",
"allVHitsWithScore","allDHitsWithScore",
"allJHitsWithScore","allCHitsWithScore","aaSeqCDR3")]
# 假设data是你的数据框,columns是需要处理的列名
columns <- c("allVHitsWithScore", "allDHitsWithScore", "allJHitsWithScore", "allCHitsWithScore")
# 使用lapply遍历每个列并应用相同的操作
new_data[columns] <- lapply(new_data[columns], function(x) sub("\\*(.*)", "", x))
new_data <- new_data[!grepl("[*|_]", new_data[["aaSeqCDR3"]]), ]
##计算CDR3 碱基长度
new_data$CDR3AA_length <- nchar(as.character(new_data$aaSeqCDR3))
for (length_val in c(13,14,15,16,17,18,19,20,21,26)) {
matches <- (length_val == new_data$CDR3AA_length)
matched_rows <- new_data[matches, ]
CDR3_data_list[[paste0("CDR3of", length_val, "AAA")]][[nam]] <- matched_rows$aaSeqCDR3[1:500]
}
}
files_to_process <- c("CDR3of13AAA", "CDR3of14AAA","CDR3of15AAA","CDR3of16AAA", "CDR3of17AAA",
"CDR3of18AAA","CDR3of19AAA", "CDR3of20AAA","CDR3of21AAA","CDR3of26AAA")
for (file_name in files_to_process) {
# 从CDR3_data_list中获取对应的数据框
file_data <- CDR3_data_list[[file_name]]
file_data <-file_data[-1]
merged_df <- list()
# 遍历文件的前500行数据
for (i in 1:500) {
row_data <- file_data[i, ]
merged_df[[i]] <- row_data
}
# 转换并保存数据
merged_df2 <- t(as.data.frame(merged_df))
rownames(merged_df2) <- NULL
merged_df2 <- merged_df2[complete.cases(merged_df2), ]
new_file_name <- paste("./CDR3", file_name, "5K.txt", sep="")
write.table(merged_df2, new_file_name, sep = "\t", quote = FALSE, row.names = FALSE)
}
####特定CDR3 length AA sequence to motif
#加载包
library(ggseqlogo)
files <-list.files(path = "CDR3", pattern = "5K.txt$",full.names = TRUE)
if (!dir.exists("motifigure")) {
dir.create("motifigure")
}
for (file_path in files) {
raw <- file_path
# 提取无后缀的文件名作为输出名称
out <- basename(raw)
nam <- tools::file_path_sans_ext(out)
# 读取数据
mycounts <- read.table(raw, sep = "\t", header = TRUE)
# 生成序列标识图
p <- ggseqlogo(mycounts,method="probability")
# 保存为PDF(自动关闭图形设备)
ggsave(
filename = paste0("motifigure/", nam, ".pdf"), # 动态文件名
plot = p, # 指定要保存的图形对象
device = "pdf", # 设备类型
width = 8, # 宽度(英寸)
height = 3 # 高度(英寸)
)
}df <- read.table("AA-sigclone.txt", sep='\t', header = TRUE)
# 定义样本列名
sample_columns <- c("HC1", "HC2", "HC3", "HC4", "AA1", "AA2", "AA3", "AA4", "AA5")
# 定义过滤条件
filter_conditions <- list(
AAhighVSHC = subset(df, Sig_AAvsHC != "ns" & log2FC_AAvsHC > 0),)
)
# 定义扩展范围(示例为-2到+2,共5个位置)
positions <- c(-2, -1, 0, 1, 2) # 相对CDR3中心的偏移位置,0为中心,-2/-1为左侧,1/2为右侧 # 直接定义法
aa_order <- c("I", "V", "L", "F", "C", "M", "A", "W", "G",
"T", "S", "Y", "P", "H", "N", "D", "Q", "E", "K", "R")
# 定义一个函数来处理和绘制每个样本的HC_mean向量 # 定义函数:处理样本数据并绘制logo图
process_and_plot <- function(filtered_data, sample) {
# 初始化结果矩阵(氨基酸×位置)
result <- matrix(0,
nrow = length(aa_order),
ncol = length(positions),
dimnames = list(aa_order, positions))
# 数据校验
frequencies <- filtered_data[[sample]]
if (any(is.na(frequencies)) || sum(frequencies) == 0) {
next # 跳过无效样本
}
# 权重归一化处理
total_weight <- sum(frequencies)
# 遍历每条CDR3序列
for (i in 1:nrow(filtered_data)) {
cdr3 <- filtered_data[i, 1] # 提取序列
weight <- frequencies[i] / total_weight # 计算归一化权重
# 确定序列中心位置
seq_length <- nchar(cdr3)
center <- ifelse(seq_length %% 2 == 1,
(seq_length + 1) / 2,
seq_length / 2)
# 遍历目标相对位置
for (p in positions) {
actual_pos <- center + p
# 边界检查
if (between(actual_pos, 1, seq_length)) {
aa <- substr(cdr3, actual_pos, actual_pos)
# 有效氨基酸权重累加
if (aa %in% aa_order) {
result[aa, as.character(p)] <-
result[aa, as.character(p)] + weight
}
}
}
}
# 生成序列logo图(两种展示方式)
plot_bits <- ggseqlogo(result, method = 'bits') +
ggtitle(paste0(sample, " - bits"))
plot_prob <- ggseqlogo(result, method = 'prob') +
ggtitle(paste0(sample, " - prob"))
return(list(bits_plot = plot_bits, prob_plot = plot_prob))
}
# 处理不同过滤条件数据集
analysis_pipeline <- function(filter_conditions) {
results <- list()
for (condition_name in names(filter_conditions)) {
filtered_data <- filter_conditions[[condition_name]]
# 可添加样本遍历逻辑
# for (sample in colnames(filtered_data)[-1]) {
# results[[paste(condition_name, sample)]] <-
# process_and_plot(filtered_data, sample)
# }
}
return(results)
}
##循环构建
for (sample in sample_columns) {
p<- process_and_plot(filtered_data,sample)
all_plots[[length(all_plots) + 1]] <- p
}
# 每页20张图(4列5行),总共分几页
num_plots <- length(all_plots)
plots_per_page <- 10
total_pages <- ceiling(num_plots / plots_per_page)
# 创建一个空的 grob 列表来存储每一页的内容
page_grobs <- list()
for (i in seq(1, num_plots, by = plots_per_page)) {
page_plots <- all_plots[i:min(i + plots_per_page - 1, num_plots)]
combined_plot <- wrap_plots(page_plots, ncol = 2, nrow = 5) +
plot_annotation(tag_levels = "A")
page_grobs[[length(page_grobs) + 1]] <- combined_plot
}
# 将所有页面组合成一个 PDF 文件
pdf(paste0(condition_name,".png"), width = 9, height = 14) # A4 size in inches
for (page_grob in page_grobs) {
print(page_grob)
}
dev.off()
}
图片来源:
图1:本作者创作
图2:Mark, Michal et al. “Viral infection reveals hidden sharing of TCR CDR3 sequences between individuals.” Frontiers in immunology vol. 14 1199064. 30 May. 2023, doi:10.3389/fimmu.2023.1199064