利用DESeq2包及clusterProfiler包进行差异表达分析及GO分析

最新推荐文章于 2024-07-03 10:47:26 发布

Eric's blog

最新推荐文章于 2024-07-03 10:47:26 发布

阅读量2k

点赞数

CC 4.0 BY-SA版权

文章标签： R

本文链接：https://blog.csdn.net/Eric_blog/article/details/119951736

本文介绍了如何使用DESeq2包进行基因表达差异分析，详细步骤包括数据预处理、模型构建、结果筛选，并利用clusterProfiler进行基因富集研究。最终输出了显著差异表达基因并探讨其可能的功能注释。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

DESeq2包进行差异分析

a<-read.table(file ="GEO13067.txt",
              sep="\t",
              header = T)
##读入表达矩阵
a<-a[,-1]
rownames(a)<-a[,1]
a<-as.data.frame(a)
a<-t(a)
a[1:3,1:3]
##整理成数据框，行名为样本名，列名为基因名
order(a$TP53)
a<-a[order(a$TP53),]
head(a$TP53)
##将TP53基因的表达量按照从低到高对样本排序
a<-t(a)
dim(a)
a<-a+1
###得到a为按照TP53表达量排序的表达矩阵
####下面进行差异分析
##BiocManager::install("DESeq2")
library(DESeq2)
##加载包
a<-(2^a)-1
a<-floor(a)
##表达矩阵数据需要count值，这里变回原始count值并且取整数
condition <- factor(c(rep("low",37),
                      rep("high",37)),
                    levels = c("low","high"))
condition
##构建分组，前37组为低表达组，后37组为高表达组
colData <- data.frame(row.names=colnames(a), condition)
colData
##得到样本名对应的分组信息
dds <- DESeqDataSetFromMatrix(a, colData, 
                              design= ~ condition )
head(dds)  #查看一下构建好的矩阵
dds <- DESeq(dds)
dds
res = results(dds, contrast=c("condition", "low", "high"))
res = res[order(res$pvalue),]
head(res)
summary(res)
# out of 20183 with nonzero total read count
# adjusted p-value < 0.1
# LFC > 0 (up)       : 18, 0.089%
# LFC < 0 (down)     : 29, 0.14%
# outliers [1]       : 0, 0%
# low counts [2]     : 3522, 17%
# (mean count < 8)
# [1] see 'cooksCutoff' argument of ?results
# [2] see 'independentFiltering' argument of ?results
##这里有18个基因上调，29个基因下调；
#将分析结果输出
write.csv(res,file="All_results.csv")
##提取差异表达基因 这里我用的方法是倍差法
##获取padj（p值经过多重校验校正后的值）小于0.05
##表达倍数取以2为对数后绝对值大于0.5
diff_gene_deseq2 <-
  subset(res, padj < 0.05 & abs(log2FoldChange) > 0.5)
dim(diff_gene_deseq2)
head(diff_gene_deseq2)
write.csv(diff_gene_deseq2,file= "DEG_treat_vs_control.csv")
save(a,condition,colData,file="input.Rdata")
#####得到差异表达基因
a<-read.csv(file="DEG_treat_vs_control.csv")

clusterprofiler包对差异基因进行GO富集分析

##下面利用clusterprofiler包对差异基因进行GO富集分析
BiocManager::install("clusterProfiler")
library(clusterProfiler)
library(DOSE)
library(org.Hs.eg.db)
library(topGO)
library(clusterProfiler)
library(pathview)##for kegg
BiocManager::install("topGO")
test = bitr(a$X, #数据集
            fromType="SYMBOL", #输入为SYMBOL格式
            toType="ENTREZID",  # 转为ENTERZID格式
            OrgDb="org.Hs.eg.db")
data(geneList, package="DOSE") #富集分析的背景基因集
gene <- names(geneList)[abs(geneList) > 2]
ego_ALL <- enrichGO(gene = test$ENTREZID, 
                    universe = names(geneList), #背景基因集
                    OrgDb = org.Hs.eg.db, #没有organism="human"，改为OrgDb=org.Hs.eg.db
                    #keytype = 'ENSEMBL',
                    ont = "ALL", #也可以是 CC  BP  MF中的一种
                    pAdjustMethod = "BH", #矫正方式 holm”, “hochberg”, “hommel”, “bonferroni”, “BH”, “BY”, “fdr”, “none”中的一种
                    pvalueCutoff = 1, #P值会过滤掉很多，可以全部输出
                    qvalueCutoff = 1,
                    readable = TRUE) #Gene ID 转成gene Symbol ，易读
head(ego_ALL,2)
write.csv(summary(ego_ALL),"ALL-enrich.csv",row.names =FALSE)
dotplot(ego_ALL,title="EnrichmentGO_ALL_dot")#点图，按富集的数从大到小的
barplot(ego_ALL, showCategory=20,title="EnrichmentGO_ALL")
#条状图，按p从小到大排，绘制前20个Term