GTEx联合TCGA数据库差异分析(更新)—科研工具箱

请关注公众号【叨客学习资料】 在使用网站的过程中有疑问,请来公众号进行反馈哦

GTEx(Genotype-Tissue Expression,基因型-组织表达)数据库收集了来自449名生前健康的人类捐献者的7000多份尸检样本,涵盖44个组织(42种不同的组织类型),包括31个实体器官组织、10个脑分区、全血,以及2个来自捐献者血液和皮肤的细胞系。作者利用这些样本研究基因表达在不同组织和个体中有何差异。

GTEx对几乎所有转录基因的基因表达模式进行了观察,从而能够确定基因组中影响基因表达的特定区域。

此外,合并GTEx与TCGA数据库数据能够有效解决TCGA数据库中正常组织样本量不足的缺陷,从而提高比较的准确性。

1. 数据来源

(数据比较大,如果下载困难可以留言)

2. 注释来自 TCGA 和 GTEx 的样本

library(stringr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(data.table)


#################======= step1: clean GTEx pheno data   =======#################
# Build one sample annotation table (sample, tissue, type, sample_type) that
# covers both TCGA and GTEx samples and write it to "tcga_gtex_sample.txt".

# Tissue pairing table; the code below uses its `TCGA`, `Detail` and `GTEx`
# columns.
gtex <- read.table("samplepair.txt", header = TRUE, sep = "\t")

# TCGA code <-> disease name lookup, reused in step 2 to label TCGA samples.
tcga_ref <- gtex[, c("TCGA", "Detail")]

gtex$type <- paste0(gtex$TCGA, "_normal_GTEx")
gtex$sample_type <- "normal"
gtex <- gtex[, c("TCGA", "GTEx", "type", "sample_type")]
# Rename so the GTEx site column matches the join key of the phenotype file.
names(gtex)[1:2] <- c("tissue", "X_primary_site")

gp <- read.delim(file = "GTEX_phenotype.gz", header = TRUE, as.is = TRUE)
gtex2tcga <- merge(gtex, gp, by = "X_primary_site")
# After the merge: column 1 is the key, columns 2:4 are tissue/type/sample_type
# and column 5 is the first non-key column of the phenotype file.
# NOTE(review): column 5 is presumably the GTEx sample id -- confirm against
# the phenotype file header.
gtex_data <- gtex2tcga[, c(5, 2:4)]
names(gtex_data)[1] <- "sample"
# write.table(gtex_data, "GTEx_pheno.txt", row.names = FALSE, quote = FALSE,
#             sep = "\t")

#################======= step2: clean a TCGA pheno data   =======#################
tcga <- read.delim(
  file = "TCGA_phenotype_denseDataOnlyDownload.tsv.gz",
  header = TRUE,
  as.is = TRUE
)
# all.y = TRUE keeps every TCGA sample, including those whose disease name has
# no entry in the lookup (their `TCGA` code is then NA).
tcga <- merge(
  tcga_ref, tcga,
  by.x = "Detail", by.y = "X_primary_disease",
  all.y = TRUE
)
tcga <- tcga[tcga$sample_type %in% c("Primary Tumor", "Solid Tissue Normal"), ]

is_normal <- tcga$sample_type == "Solid Tissue Normal"
tcga$type <- ifelse(
  is_normal,
  paste0(tcga$TCGA, "_normal_TCGA"),
  paste0(tcga$TCGA, "_tumor_TCGA")
)
tcga$sample_type <- ifelse(is_normal, "normal", "tumor")
# Select by name rather than by position so an extra phenotype column cannot
# silently shift the selection.
tcga <- tcga[, c("sample", "TCGA", "type", "sample_type")]
names(tcga)[2] <- "tissue"
# write.table(tcga, "tcga_pheno.txt", row.names = FALSE, quote = FALSE,
#             sep = "\t")


#################======= step3: remove samples without tpm data =======############
# Only the column names of the expression matrices are needed here, so read
# just the header (nrows = 0) instead of loading the full matrices.
gtex_ids <- colnames(fread("gtex_RSEM_gene_tpm.gz", nrows = 0))[-1]
gtexS <- gtex_data[gtex_data$sample %in% gtex_ids, ]

tcga_ids <- colnames(fread("tcga_RSEM_gene_tpm.gz", nrows = 0))[-1]
tcgaS <- tcga[tcga$sample %in% tcga_ids, ]

tcga_gtex <- rbind(tcgaS, gtexS)
write.table(
  tcga_gtex, "tcga_gtex_sample.txt",
  row.names = FALSE, quote = FALSE, sep = "\t"
)

3. 提取感兴趣的基因

library(stringr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(data.table)
library(tibble)

# Extract the expression values of one gene from the TCGA and GTEx matrices,
# attach the sample annotation and write a long table (tissue, sample_type,
# tpm) to "<out_dir>/<target> expression.txt".
#
# The original `rm(list = ls())` was dropped: every object used below is
# (re)created here, and wiping the caller's workspace is a side effect a
# script should not have.
options(stringsAsFactors = FALSE) # already the default since R 4.0

target <- "YTHDC2"
out_dir <- "Merge gene expression"

idmap <- read.delim("gencode.v23.annotation.gene.probemap", as.is = TRUE)
tcga_exp <- fread("tcga_RSEM_gene_tpm.gz", data.table = FALSE)
gtex_exp <- fread("gtex_RSEM_gene_tpm.gz", data.table = FALSE)
tcga_gtex <- read.table("tcga_gtex_sample.txt", sep = "\t", header = TRUE)

# Map the gene symbol to the row id used in the expression matrices. A symbol
# with zero or several ids would otherwise lead to an empty or recycled
# comparison further down, so fail early with a clear message.
id <- idmap$id[which(idmap$gene == target)]
if (length(id) != 1L) {
  stop(
    "Expected exactly one id for gene '", target, "', found ", length(id),
    call. = FALSE
  )
}

# Return a two-column data frame (sample, tpm) holding the values of
# `gene_id` for every column of `expr` whose name is listed in `samples`.
# Values are pulled out as numeric directly; transposing the mixed data frame
# with t() would coerce them to formatted character strings.
extract_gene <- function(expr, gene_id, samples) {
  hit <- which(expr$sample == gene_id)
  if (length(hit) != 1L) {
    stop(
      "Expected exactly one row for id '", gene_id, "', found ", length(hit),
      call. = FALSE
    )
  }
  keep <- colnames(expr) %in% samples
  values <- expr[hit, keep, drop = FALSE]
  data.frame(
    sample = colnames(values),
    tpm = as.numeric(unlist(values, use.names = FALSE))
  )
}

tcga_data <- extract_gene(tcga_exp, id, tcga_gtex$sample)
gtex_data <- extract_gene(gtex_exp, id, tcga_gtex$sample)

tmp <- rbind(tcga_data, gtex_data)
gene_exp <- merge(tmp, tcga_gtex, by = "sample", all.x = TRUE)
gene_exp <- gene_exp[, c("tissue", "sample_type", "tpm")]
gene_exp <- arrange(gene_exp, tissue)

# Derive the file name from `target` so changing the gene above is enough,
# and make sure the output directory exists before writing.
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
write.table(
  gene_exp,
  file.path(out_dir, paste0(target, " expression.txt")),
  row.names = FALSE, quote = FALSE, sep = "\t"
)

4. 可视化基因表达

library(ggplot2)
library(ggpubr)
library(RColorBrewer)

# Draw a per-tissue tumor-vs-normal box plot for one gene, annotate each box
# with its sample size and each tissue with a t-test significance mark, then
# save the figure as a PDF.
#
# The original `rm(list = ls())` was dropped: every object used below is
# (re)created here, and wiping the caller's workspace is a side effect a
# script should not have.
options(stringsAsFactors = FALSE) # already the default since R 4.0

target <- "YTHDC2"
exp_file <- file.path(
  "Merge gene expression",
  paste0(target, " expression.txt")
)
plot_data <- read.table(exp_file, header = TRUE, sep = "\t")

ylabname <- paste(target, "expression")
colnames(plot_data) <- c("Tissue", "Group", "Gene")

p1 <- ggboxplot(
  plot_data,
  x = "Tissue", y = "Gene", fill = "Group",
  ylab = ylabname,
  color = "Group",
  palette = c("#00AFBB", "#FC4E07"),
  ggtheme = theme_minimal()
)

# Sample size of the normal and tumor group for every tissue.
# Namespace-qualified because dplyr is not attached by this script.
count_N <- dplyr::count(plot_data, Tissue, Group)
count_N$n <- paste("n =", count_N$n)

# Add the "n = ..." labels to the plot; y = -9 is a fixed position chosen for
# the labels.
p1 <- p1 +
  geom_text(
    data = count_N,
    aes(label = n, y = -9, color = Group),
    position = position_dodge2(0.9),
    size = 3, angle = 90, hjust = 0
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1.2))

# t-test significance of tumor vs normal within each tissue.
# NOTE(review): the stars plotted below come from `p.signif`, which appears to
# be derived from the unadjusted p-value rather than the Holm-adjusted one --
# confirm against the ggpubr documentation.
comp <- compare_means(
  Gene ~ Group,
  group.by = "Tissue", data = plot_data,
  method = "t.test",
  symnum.args = list(
    cutpoints = c(0, 0.001, 0.01, 0.05, 1),
    symbols = c("***", "**", "*", "ns")
  ),
  p.adjust.method = "holm"
)

# Add the significance marks at a fixed height.
p2 <- p1 + stat_pvalue_manual(
  comp,
  x = "Tissue", y.position = 7.5,
  label = "p.signif", position = position_dodge(0.8)
)
p2
# dev.off()

## Save the figure
### pdf version
# Pass the plot explicitly instead of relying on the implicit last plot, and
# make sure the output directory exists.
dir.create("figure", showWarnings = FALSE, recursive = TRUE)
ggsave("figure/pancancer_Plot.pdf", plot = p2, width = 14, height = 5)

### png version
# png("figure/pancancer_Plot.png", width = 465, height = 225, units = "mm",
#     res = 300)
图片[1]-GTEx联合TCGA数据库差异分析(更新)—科研工具箱-叨客学习资料网

代码参考GitHub:https://github.com/cmutd/TCGA_GTEx

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享
评论 抢沙发
头像
请输入有效评论哦,肆意灌水或者乱打评论是不会通过的,会影响您评论后获得资源哦~~
提交
头像

昵称

取消
昵称表情

    暂无评论内容