TCGA 数据分析实战 —— 突变及拷贝数分析
在介绍完的查询下载和数据分析功能之后,我们简单展示几个示例,来练练手,加深对这个包的理解和使用我们主要从基因组、转录组和表观组3个维度分别举例来进行说明。
TCGA 数据分析实战 —— 突变及拷贝数分析
文章目录
前言
在介绍完 TCGAbiolinks
的查询下载和数据分析功能之后,我们简单展示几个示例,来练练手,加深对这个包的理解和使用
我们主要从基因组、转录组和表观组 3
个维度分别举例来进行说明。
基因组分析
拷贝数变异(CNV
)在癌症的发生和发展研究中扮演重要的角色。由于基因组重排(如染色体缺失、重复、插入和易位),导致染色体片段的扩增或删失。
CNV
是大于 1kb
的基因组区域发生拷贝数的扩增和删失。
TCGA
的数据可以分为三个等级:
level 1
: 原始的测序数据(fasta
、fastq等
)level 2
: 比对好的bam
文件level 3
: 经过标准化处理的数据
我们需要获取到 level 3
的数据来进行分析,来识别癌症基因组变异
数据预处理
我们分析的是 legacy
数据库中 Affymetrix Genome-Wide Human SNP Array 6.0
平台的数据
获取 20
个 GBM
和 LGG
样本的 CNV
数据
library(TCGAbiolinks) # 2.30.4
library(tidyverse)
query.gbm.nocnv <- GDCquery(
project = "TCGA-GBM",
data.category = "Copy Number Variation",
data.type = "Masked Copy Number Segment",
access = "open"
)
# 只挑选 20 个样本
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]
GDCdownload(query.gbm.nocnv)
cnvMatrix <- GDCprepare(query.gbm.nocnv) %>%
filter(abs(Segment_Mean) > 0.3) %>%
mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>%
dplyr::select(-Segment_Mean, -GDC_Aliquot) %>%
rename_with(~ c("Chromosome", "Start", "End", "Num.of.Markers", "Sample.Name", "Aberration")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
dplyr::select(c(5, 1:4, 6)) %>%
# 注意要转换为 data.frame 格式
as.data.frame()
查看 cnv
矩阵
cnvMatrix[1:3,1:3]
# Sample.Name Chromosome Start
# 1 TCGA-16-1056-10A-01D-0517-01 2 212458388
# 2 TCGA-16-1056-10A-01D-0517-01 2 229437826
# 3 TCGA-16-1056-10A-01D-0517-01 2 232263591
识别 recurrent CNV
癌症相关的 CNV
会在许多样本中反复出现,我们使用 GAIA
包来识别这种显著的 recurrent CNV
GAIA
算法基于随机排列的统计检验,同时考虑样本内的显著性和同质性,以重复迭代的方式删除区间内的峰,直至剩下的峰没有显著性为止识别显著的 CNV
处理探针数据,这个需要从 GDC Reference Files 中下载相应版本的探针集合。
将文件读取进来,并进行相应的处理
markersMatrix <- readr::read_tsv(
"~/Downloads/snp6.na35.remap.hg38.subset.txt",
show_col_types = FALSE
) %>%
dplyr::filter(freqcnv == FALSE) %>% # use for GISTIC
dplyr::select(1:3) %>%
rename_with(~c("Probe.Name", "Chromosome", "Start")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
dplyr::filter(!duplicated(Probe.Name)) %>%
as.data.frame %>%
arrange(Chromosome, Start) # 排序是重要的
运行 gaia
,识别 recurrent CNV
library(gaia)
set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_filtered))
nbsamples <- length(unique(cnvMatrix$Sample.Name))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
cancer <- "GBM"
results <- runGAIA(
cnv_obj, markers_obj,
output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"),
# 指定需要分析的变异类型,-1 分析所有的变异
aberrations = -1,
# 指定需要分析的染色体,默认为 -1,表示所有染色体
chromosomes = -1,
# 使用近似的方式可以加快计算速度
approximation = TRUE,
# 设置迭代次数
num_iterations = 5000,
threshold = 0.25
)
})
在运行 load_cnv
或 runGAIA
时经常会报错,可以使用下面修改后的代码。
load_cnv <- function(segmentation_matrix, markers_list, num_of_samples){
message("Loading Copy Number Data");
# Detect the chromosomes
chromosomes <- as.numeric(sort(unique(names(markers_list))));
chromosomes <- chromosomes[which(!is.na(chromosomes))];
# Detect the aberration kinds
aberration_kinds <- 1:length(unique(segmentation_matrix[,6]));
names(aberration_kinds) <- sort(unique(segmentation_matrix[,6]));
# Detect the samples
samples <- 1:num_of_samples;
if(!is.numeric(segmentation_matrix[,1])){
sample_names <- unique(segmentation_matrix[,1]);
for(i in 1:length(sample_names)){
segmentation_matrix[which(segmentation_matrix[,1]==sample_names[i]),1] <- i;
}
}
region_list <- list();
# Create the final structure list of the returned list
for(k in 1:length(aberration_kinds)){
region_list[[ aberration_kinds[k] ]] <- list();
for(i in 1:length(chromosomes) ){
region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]] <- matrix(0, length(samples), ncol(markers_list[[ chromosomes[i] ]]));
rownames(region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]]) <- samples;
colnames(region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]]) <- c(1:ncol(markers_list[[ chromosomes[i] ]]));
}
}
for(k in 1:length(aberration_kinds)){
ab_ids <- which(segmentation_matrix[,6]==names(aberration_kinds[k]));
# In this matrix all regions aberrant as aberration_kinds[k] are stored
tmp_matrix1 <- segmentation_matrix[ab_ids,];
if(class(tmp_matrix1)=="numeric"){
tmp_matrix1 <- t(as.matrix(tmp_matrix1));
}
for(i in 1:length(chromosomes) ){
# In this matrix all regions aberrant as aberration_kinds[k] for the i-th chromsome are stored
tmp_matrix2 <- tmp_matrix1[which(tmp_matrix1[,2]==chromosomes[i]),];
if(class(tmp_matrix2)=="numeric"){
tmp_matrix2 <- t(as.matrix(tmp_matrix2));
}
message(".", appendLF = FALSE);
for(j in 1:length(samples)){
#In this matrix all regions aberrant as aberration_kinds[k] for the i-th chromsome and for the j-th sample are stored
tmp_matrix3 <- tmp_matrix2[which(tmp_matrix2[,1]==samples[j]),];
if(class(tmp_matrix3)=="numeric"){
tmp_matrix3 <- t(as.matrix(tmp_matrix3));
}
if(nrow(tmp_matrix3)>0){
for(t in 1:nrow(tmp_matrix3)){
start_prob <- tmp_matrix3[t,3];
end_prob <- tmp_matrix3[t,4];
start_index <- which(markers_list[[ chromosomes[i] ]][1,] == start_prob);
end_index <- which(markers_list[[ chromosomes[i] ]][2,] == end_prob);
if (is_empty(start_index) | is_empty(end_index))
next
region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]][samples[j], start_index:end_index] <- 1;
}
}
}
}
}
message("\nDone");
names(region_list) <- names(aberration_kinds);
return(region_list);
}
peel_off <- function (pvalues_list, threshold, chromosomes, aberrations,
discontinuity, hom_threshold)
{
regions_list <- list()
for (i in 1:length(aberrations)) {
region_chromosome_list <- list()
cnv_index <- aberrations[i]
for (j in 1:length(chromosomes)) {
property_list <- list()
chromosome_index <- chromosomes[j]
message(".", appendLF = FALSE)
curr_pvalue <- pvalues_list[[cnv_index]][[chromosome_index]]
qvals <- qvalue(curr_pvalue)
curr_qvalue <- qvals
tmp_qvalue <- curr_qvalue
tmp_start <- c()
tmp_end <- c()
start <- c()
end <- c()
qval <- c()
tmp_pvalue <- curr_pvalue
while (min(tmp_qvalue) < threshold) {
tmp_list <- search_peaks_in_regions(tmp_qvalue,
1, length(tmp_qvalue), discontinuity[[chromosome_index]],
hom_threshold, threshold)
# 原来是 tmp_list[[1]] == -1
if (tmp_list[[1]][1] == -1) {
break
}
else {
tmp_start <- tmp_list[[1]]
tmp_end <- tmp_list[[2]]
for (k in 1:length(tmp_start)) {
start <- c(start, tmp_start[k])
end <- c(end, tmp_end[k])
qval <- c(qval, tmp_qvalue[tmp_start[k]])
tmp_pvalue[tmp_start[k]:tmp_end[k]] <- 1
}
}
qvals <- qvalue(tmp_pvalue)
tmp_qvalue <- qvals
}
if (length(start) > 0) {
property_list[[1]] <- start
property_list[[2]] <- end
property_list[[3]] <- qval
}
else {
property_list[[1]] <- -1
property_list[[2]] <- -1
property_list[[3]] <- -1
}
region_chromosome_list[[chromosome_index]] <- property_list
}
regions_list[[cnv_index]] <- region_chromosome_list
}
message("\nDone")
return(regions_list)
}
myrunGAIA <- function (cnv_obj, markers_obj, output_file_name = "", aberrations = -1,
chromosomes = -1, num_iterations = 10, threshold = 0.25,
hom_threshold = 0.12, approximation = FALSE)
{
message("\nPerforming Data Preprocessing")
if (chromosomes == -1) {
chromosomes <- as.numeric(sort(unique(names(markers_obj))))
chromosomes <- chromosomes[which(!is.na(chromosomes))]
}
else {
known_chr <- as.numeric(names(markers_obj))
if ((length(known_chr[chromosomes]) != length(chromosomes)) ||
(sum(is.na(known_chr[chromosomes])) > 0)) {
error_string <- "Error in the list of chromosomes passed as argument.\n"
error_string <- cat(error_string, "The list of the chromosomes that can be analyzed follows:\n",
known_chr, "\n")
stop(error_string, call. = FALSE)
}
}
if (aberrations == -1) {
aberrations <- as.numeric(names(cnv_obj))
names(aberrations) <- names(cnv_obj)
if (length(aberrations) > 0 && aberrations[1] == 0) {
aberrations <- aberrations + 1
}
}
else {
aberrations <- as.numeric(names(cnv_obj))
names(aberrations) <- names(cnv_obj)
if (length(aberrations) > 0 && aberrations[1] == 0) {
aberrations <- aberrations + 1
}
known_aberr <- as.numeric(names(cnv_obj))
if ((length(known_aberr[aberrations]) != length(aberrations)) ||
(sum(is.na(known_aberr[aberrations]) > 0))) {
error_string <- "Error in the list of aberrations passed as argument.\n"
error_string <- cat(error_string, "The aberrations that can be analyzed follow:\n",
known_aberr, "\n")
stop(error_string, call. = FALSE)
}
}
discontinuity <- list()
if (length(aberrations) == 2 && hom_threshold >= 0) {
message("\nDone")
message("\nComputing Discontinuity Matrix")
for (i in 1:length(chromosomes)) {
message(".", appendLF = FALSE)
tmp <- cnv_obj[[2]][[chromosomes[i]]] - cnv_obj[[1]][[chromosomes[i]]]
tmp_vec <- 0 * c(1:(ncol(tmp) - 1))
for (k in 1:(ncol(tmp) - 1)) {
for (z in 1:nrow(tmp)) {
tmp_vec[k] <- tmp_vec[k] + abs(tmp[z, k] -
tmp[z, k + 1])
}
}
discontinuity[[chromosomes[i]]] <- tmp_vec/nrow(cnv_obj[[2]][[chromosomes[i]]])
}
}
else {
if (length(aberrations) != 2 && hom_threshold >= 0) {
message("\nHomogeneous cannot be applied on the data (data must contain exactly two different kinds of aberrations)\n")
}
for (i in 1:length(chromosomes)) {
tmp <- cnv_obj[[1]][[chromosomes[i]]] - cnv_obj[[1]][[chromosomes[i]]]
tmp_vec <- 0 * c(1:(ncol(tmp) - 1))
discontinuity[[chromosomes[i]]] <- tmp_vec
hom_threshold = -1
}
}
message("\nDone")
null_hypothesis_list <- list()
null_hypothesis_chromosome_list <- list()
message("Computing Probability Distribution")
for (k in 1:length(aberrations)) {
null_hypothesis_chromosome_list <- list()
for (i in 1:length(chromosomes)) {
aberrations_index <- (aberrations[k])
chromosome_index <- chromosomes[i]
obs_data <- cnv_obj[[aberrations_index]][[chromosome_index]]
message(".", appendLF = FALSE)
if (approximation == FALSE) {
null_hypothesis_chromosome_list[[chromosome_index]] <- generate_null_hypothesis(obs_data,
num_iterations)
}
if (approximation == TRUE) {
null_hypothesis_chromosome_list[[chromosome_index]] <- generate_approx_null_hypothesis(obs_data,
num_iterations)
}
}
null_hypothesis_list[[aberrations_index]] <- null_hypothesis_chromosome_list
}
pvalue_distribution_list <- list()
for (k in 1:length(aberrations)) {
tmp_pvalue_list <- list()
for (i in 1:length(chromosomes)) {
message(".", appendLF = FALSE)
tmp_pvalue_list[[chromosomes[i]]] <- rev(cumsum(rev(null_hypothesis_list[[aberrations[k]]][[chromosomes[i]]])))
}
pvalue_distribution_list[[aberrations[k]]] <- tmp_pvalue_list
}
message("\nDone")
pvalues_list <- list()
message("Assessing the Significance of Observed Data")
for (k in 1:length(aberrations)) {
pvalue_chromosome_list <- list()
for (i in 1:length(chromosomes)) {
message(".", appendLF = FALSE)
aberrations_index <- (aberrations[k])
chromosome_index <- chromosomes[i]
curr_pvalue <- pvalue_distribution_list[[aberrations_index]][[chromosome_index]]
curr_obs_data <- cnv_obj[[aberrations_index]][[chromosome_index]]
obs_freq <- apply(curr_obs_data, 2, sum)
pvalue_chromosome_list[[chromosome_index]] <- round(curr_pvalue[obs_freq[] +
1], 7)
}
pvalues_list[[aberrations_index]] <- pvalue_chromosome_list
}
message("\nDone")
if (length(aberrations) == 2) {
gistic_label <- c("Del", "Amp")
message("Writing ", paste(output_file_name, ".igv.gistic",
sep = ""), " File for Integrative Genomics Viewer (IGV) Tool")
gistic <- matrix(nrow = 0, ncol = 8)
colnames(gistic) <- c("Type", "Chromosome", "Start",
"End", "q-value", "G-score", "average amplitude",
"frequency")
for (k in 1:length(aberrations)) {
tmp_pvalue_list <- list()
for (i in 1:length(chromosomes)) {
message(".", appendLF = FALSE)
curr_qval <- as.numeric(pvalues_list[[aberrations[k]]][[chromosomes[i]]])
curr_qval <- qvalue(curr_qval)
start <- 1
for (z in 2:(length(curr_qval))) {
if (curr_qval[z - 1] != curr_qval[z]) {
end <- z - 1
print_qval <- curr_qval[z - 1]
gistic <- rbind(gistic, c(gistic_label[k],
chromosomes[i], markers_obj[[chromosomes[i]]][1,
start], markers_obj[[chromosomes[i]]][2,
end], print_qval, 0, 0, 0))
start <- z
}
}
end <- length(curr_qval)
print_qval <- curr_qval[end]
gistic <- rbind(gistic, c(gistic_label[k], chromosomes[i],
markers_obj[[chromosomes[i]]][1, start], markers_obj[[chromosomes[i]]][2,
end], print_qval, 0, 0, 0))
}
}
write.table(gistic, paste(output_file_name, ".igv.gistic",
sep = ""), sep = "\t", col.names = TRUE, row.names = FALSE,
eol = "\n", quote = FALSE)
message("\nDone")
}
if (hom_threshold >= 0) {
message("Running Homogeneous peel-off Algorithm With Significance Threshold of ",
threshold, " and Homogenous Threshold of ", hom_threshold)
}
else {
message("Running Standard peel-off Algorithm With Significance Threshold of ",
threshold)
}
significant_regions_list <- peel_off(pvalues_list, threshold,
chromosomes, aberrations, discontinuity, hom_threshold)
if (output_file_name != "") {
message("\nWriting Output File '", output_file_name,
"' Containing the Significant Regions\n", sep = "")
results <- write_significant_regions(markers_obj, significant_regions_list,
output_file_name, chromosomes, aberrations)
message("File '", output_file_name, "' Saved\n", sep = "")
}
else {
results <- write_significant_regions(markers_obj, significant_regions_list,
output_file_name, chromosomes, aberrations)
}
return(results)
}
最后识别重复出现的 CNV
,并绘制结果图
RecCNV <- as_tibble(results) %>%
mutate_at(vars("q-value"), as.numeric) %>%
mutate_at(vars(-"q-value"), as.integer) %>% {
# 如果 q-value 为 0,则设置其值为数据中的最小非零值
minval = min(.$`q-value`[which(.$`q-value` != 0)])
mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`))
} %>%
mutate(score = -log10(`q-value`)) %>%
as.data.frame()
threshold <- 0.3
gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))
recurrent CNV 基因注释
在使用 GAIA
识别出癌症中重复出现的基因组区域变异之后,我们需要将其注释到对应的基因。
我们使用 biomaRt
来获取所有人类基因的基因组范围信息,然后将显著变异的基因组区域映射到对应的基因上
library(GenomicRanges)
# 注意,这里用的是 hg38 的基因信息,
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg38") %>%
filter(gene_name != "" & seqnames %in% paste0('chr', c(1:22, "X", "Y"))) %>%
mutate(
seqnames = ifelse(
seqnames == "X", 23, ifelse(seqnames == "Y", 24, seqnames)
),
seqnames = as.integer(seqnames)
) %>%
arrange(seqnames, start) %>%
dplyr::select(c("gene_name", "seqnames", "start","end")) %>%
rename_with(~ c("GeneSymbol", "Chr", "Start", "End"))
genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)
save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")
将 CNV
区域注释到基因
# 选择需要的列
sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>%
filter(`q-value` <= threshold) %>%
arrange(1, 3) %>%
rename_with(~c("Chr", "Aberration", "Start", "End", "q-value"))
# 转换为 GenomicRanges 格式
sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE)
# 寻找交叠区间
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),])
sCNV_ann[1:3,]
# Chr Aberration Start End q-value GeneSymbol Chr Start End
# 1 1 0 6520190 6995266 0.03735917 NOL9 1 6521347 6554513
# 1.1 1 0 6520190 6995266 0.03735917 RNU6-731P 1 6540854 6540964
# 1.2 1 0 6520190 6995266 0.03735917 AL591866.1 1 6547905 6548619
格式化输出内容
library(glue)
AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>%
mutate(
AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"),
GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}")
) %>%
dplyr::select(c(6, 2, 5, 10, 11)) %>%
mutate(Aberration = if_else( Aberration == 0, "Del", "Amp"))
save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))
输出结果如下
AmpDel_genes[1:3,]
# # A tibble: 3 × 5
# GeneSymbol Aberration q.value AberrantRegion GeneRegion
# <chr> <chr> <dbl> <glue> <glue>
# 1 NOL9 Del 0.0374 1:6520190-6995266 1:6521347-6554513
# 2 RNU6-731P Del 0.0374 1:6520190-6995266 1:6540854-6540964
# 3 AL591866.1 Del 0.0374 1:6520190-6995266 1:6547905-6548619
基因组变异可视化
OncoPrint
下面我们展示如何绘制基因组突变数据,首先,我们使用的是之前介绍过的 complexHeatmap
包的 OncoPrint
函数。
GDCquery_maf
函数用于下载突变数据
query <- GDCquery(
project = "TCGA-LGG",
data.category = "Simple Nucleotide Variation",
access = "open",
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
LGGmut <- GDCprepare(query)
query <- GDCquery(
project = "TCGA-GBM",
data.category = "Simple Nucleotide Variation",
access = "open",
data.type = "Masked Somatic Mutation",
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
GBMmut <- GDCprepare(query)
mut <- LGGmut %>% add_row(GBMmut)
并提取胶质瘤相关通路中的基因信息
# 提取胶质瘤通路中基因的突变信息
Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>%
filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>%
subset(Pathway == "Glioma Signaling")
# 只取 10 个基因的突变信息
Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10]
mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,]
# 获取对应的临床信息
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
clinical <- gbm_clin %>% add_row(lgg_clin)
绘制突变瀑布图
TCGAvisualize_oncoprint(
mut = mut,
gene = Glioma_signaling_genes,
annotation = clinical[, c("bcr_patient_barcode", "primary_diagnosis", "vital_status", "ethnicity")],
annotation.position = "bottom",
color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"),
label.font.size = 16,
rm.empty.columns = FALSE
)
circos plot
基因组变异信息也可以使用 circos
图形来展示,我们使用前面介绍的 circlize
包来展示 CNV
和突变信息
CNV
的数据是 GAIA
分析的结果,突变数据使用的是从 GDC
数据库中获取的 LGG
样本的 MAF
文件的结果
首先,处理一下突变数据
# 提取需要的突变类型
mut_type <- c(
"Missense_Mutation",
"Nonsense_Mutation",
"Nonstop_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins"
)
mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>%
mutate(
mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"),
.before = Hugo_Symbol
) %>%
# 只考虑至少在两个样本中发生突变的基因
filter(duplicated(mut.id)) %>%
dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>%
distinct_all() %>% {
# 将 typeNames 作为全局变量,后面还要使用
typeNames <<- unique(.$Variant_Classification)
type = structure(1:length(typeNames), names = typeNames)
mutate(., type = type[Variant_Classification])
} %>%
dplyr::select(c(1:3,6,4,5))
CNV
数据处理
load("~/Downloads/GBM_CNV_results.rda")
cnv <- as_tibble(RecCNV) %>%
# 值考虑小于阈值的区域,threshold = 0.3
filter(`q-value` <= threshold) %>%
select(c(1, 3, 4, 2)) %>%
# 将染色体名称转换为 chr 开头的字符串
mutate(
Chromosome = ifelse(
Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome)
),
Chromosome = paste0("chr", Chromosome)
) %>%
mutate(CNV = 1) %>%
`names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))
绘制 circos
图
library(circlize)
par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]] + 1],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# 添加突变结果
colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4])
circos.genomicTrackPlotRegion(
mut, ylim = c(1.2, 4.2),
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.8, pch = 16,
col = colors[value[[2]]], ...)
}
)
circos.clear()
# 添加图例
legend(
-0.1, 0.2, bty = "n", y.intersp = 1,
c("Amp", "Del"), pch = 15,
col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1,
cex = 0.8, title.adj = 0
)
legend(
1, 0.2, bty = "n", y.intersp = 1,
names(colors), pch = 20, col = colors,
title = "Mutations", text.font = 1,
cex = 0.8, title.adj = 0
)
部分区域可视化
我们可以将基因组范围限制在某一条染色体上,来更好地查看该染色体上突变和拷贝数的变异。
可以使用如下图形来展示
par(mar=c(.1, .1, .1, .1),cex=1.5)
circos.par(
"start.degree" = 90, canvas.xlim = c(0, 1),
canvas.ylim = c(0, 1), gap.degree = 270,
cell.padding = c(0, 0, 0, 0),
track.margin = c(0.005, 0.005)
)
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# 添加 CNV 结果
colors <- c("forestgreen","firebrick")
names(colors) <- c(0,1)
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]]],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# 添加单基因突变结果
gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>%
filter(!duplicated(genes.mut)) %>% {
n.mut <- table(.$genes.mut)
mutate(., num = n.mut[.$genes.mut])
} %>%
mutate(
genes.num = as.character(glue("{Hugo_Symbol}({num})")),
type = type / 2
) %>%
dplyr::select(-(6:8))
colors <- c("blue","green","red","gold")
names(colors) <- typeNames[1:4]
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0.3, 2.2), track.height = 0.05,
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.4, pch = 16,
col = colors[value[[2]]], ...
)
}
)
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA
)
i_track = get.cell.meta.data("track.index")
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1),
panel.fun = function(region, value, ...) {
circos.genomicText(
region, value, y = 1, labels.column = 3,
col = colors[value[[2]]], facing = "clockwise",
adj = c(1, 0.5), posTransform = posTransform.text,
cex = 0.4, niceFacing = TRUE
)
},
track.height = 0.1,
bg.border = NA
)
circos.genomicPosTransformLines(
gene.mut,
posTransform = function(region, value)
posTransform.text(
region, y = 1, labels = value[[3]],
cex = 0.4, track.index = i_track + 1
),
direction = "inside",
track.index = i_track
)
circos.clear()
legend(
0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"),
pch = 15, col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0
)
legend(
0, 0.2, bty = "n", y.intersp = 1, names(colors),
pch = 16, col = colors, title = "Mutations",
text.font = 1, cex = 0.7, title.adj = 0
)
完整代码:https://github.com/dxsbiocc/learn/blob/main/R/TCGA/Mutation_CNV_visualize.R
更多推荐
所有评论(0)