DESeq2包进行差异分析
a<-read.table(file ="GEO13067.txt",
sep="\t",
header = T)
##读入表达矩阵
a<-a[,-1]
rownames(a)<-a[,1]
a<-as.data.frame(a)
a<-t(a)
a[1:3,1:3]
##整理成数据框,行名为样本名,列名为基因名
order(a$TP53)
a<-a[order(a$TP53),]
head(a$TP53)
##将TP53基因的表达量按照从低到高对样本排序
a<-t(a)
dim(a)
a<-a+1
###得到a为按照TP53表达量排序的表达矩阵
####下面进行差异分析
##BiocManager::install("DESeq2")
library(DESeq2)
##加载包
a<-(2^a)-1
a<-floor(a)
##表达矩阵数据需要count值,这里变回原始count值并且取整数
condition <- factor(c(rep("low",37),
rep("high",37)),
levels = c("low","high"))
condition
##构建分组,前37组为低表达组,后37组为高表达组
colData <- data.frame(row.names=colnames(a), condition)
colData
##得到样本名对应的分组信息
dds <- DESeqDataSetFromMatrix(a, colData,
design= ~ condition )
head(dds) #查看一下构建好的矩阵
dds <- DESeq(dds)
dds
res = results(dds, contrast=c("condition", "low", "high"))
res = res[order(res$pvalue),]
head(res)
summary(res)
# out of 20183 with nonzero total read count
# adjusted p-value < 0.1
# LFC > 0 (up) : 18, 0.089%
# LFC < 0 (down) : 29, 0.14%
# outliers [1] : 0, 0%
# low counts [2] : 3522, 17%
# (mean count < 8)
# [1] see 'cooksCutoff' argument of ?results
# [2] see 'independentFiltering' argument of ?results
##这里有18个基因上调,29个基因下调;
#将分析结果输出
write.csv(res,file="All_results.csv")
##提取差异表达基因 这里我用的方法是倍差法
##获取padj(p值经过多重校验校正后的值)小于0.05
##表达倍数取以2为对数后绝对值大于0.5
diff_gene_deseq2 <-
subset(res, padj < 0.05 & abs(log2FoldChange) > 0.5)
dim(diff_gene_deseq2)
head(diff_gene_deseq2)
write.csv(diff_gene_deseq2,file= "DEG_treat_vs_control.csv")
save(a,condition,colData,file="input.Rdata")
#####得到差异表达基因
a<-read.csv(file="DEG_treat_vs_control.csv")
clusterprofiler包对差异基因进行GO富集分析
##下面利用clusterprofiler包对差异基因进行GO富集分析
BiocManager::install("clusterProfiler")
library(clusterProfiler)
library(DOSE)
library(org.Hs.eg.db)
library(topGO)
library(clusterProfiler)
library(pathview)##for kegg
BiocManager::install("topGO")
test = bitr(a$X, #数据集
fromType="SYMBOL", #输入为SYMBOL格式
toType="ENTREZID", # 转为ENTERZID格式
OrgDb="org.Hs.eg.db")
data(geneList, package="DOSE") #富集分析的背景基因集
gene <- names(geneList)[abs(geneList) > 2]
ego_ALL <- enrichGO(gene = test$ENTREZID,
universe = names(geneList), #背景基因集
OrgDb = org.Hs.eg.db, #没有organism="human",改为OrgDb=org.Hs.eg.db
#keytype = 'ENSEMBL',
ont = "ALL", #也可以是 CC BP MF中的一种
pAdjustMethod = "BH", #矫正方式 holm”, “hochberg”, “hommel”, “bonferroni”, “BH”, “BY”, “fdr”, “none”中的一种
pvalueCutoff = 1, #P值会过滤掉很多,可以全部输出
qvalueCutoff = 1,
readable = TRUE) #Gene ID 转成gene Symbol ,易读
head(ego_ALL,2)
write.csv(summary(ego_ALL),"ALL-enrich.csv",row.names =FALSE)
dotplot(ego_ALL,title="EnrichmentGO_ALL_dot")#点图,按富集的数从大到小的
barplot(ego_ALL, showCategory=20,title="EnrichmentGO_ALL")
#条状图,按p从小到大排,绘制前20个Term