#源代码出处
http://sthda.com/english/articles/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/
input
画出每个基因在不同分组下的表达 及其显著性
codes
library(ggplot2)
library(ggpubr)
ggboxplot(mydata2, x = "group", y = "KCNRG",
color = "group", palette = "jco",
add = "jitter")+
stat_compare_means(comparisons = list(c('control','COP'),
c('control','IPF/UIP'),
c('control','NSIP'),
c('control','DIP'),
c('control','UF'),
c('control','RB-ILD')
))
循环画图代码
for (eachgene in colnames(mydata2)[!(colnames(mydata2)%in% "group")] ) {
#eachgene="KCNRG"
p=ggboxplot(mydata2, x = "group", y = eachgene,
color = "group", palette = "jco",
add = "jitter")+
stat_compare_means(comparisons = list(c('control','COP'),
c('control','IPF/UIP'),
c('control','NSIP'),
c('control','DIP'),
c('control','UF'),
c('control','RB-ILD')
))
pdf(paste0(eachgene, "_gene_expression_gse32537.pdf"),width = 5, height = 5)
print(p, newpage = FALSE)
dev.off()
}
结果显示
准备工作
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)#在调用as.data.frame的时,将stringsAsFactors设置为FALSE可以避免character类型自动转化为factor类型
# 注意查看下载文件的大小,检查数据
getwd()
dir.create('G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF',recursive = T)
setwd("G:/r/duqiang_IPF/GSE70866_BAL_IPF_donors_RNA-seq/")
f='GSE32537_eSet.Rdata'
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE32537
library(GEOquery)
??getGEO()
getwd()
Sys.setenv("VROOM_CONNECTION_SIZE"=999999999)
if(!file.exists(f)){
gset <- getGEO('GSE32537', destdir=".",
AnnotGPL = F, ## 注释文件
getGPL = F) ## 平台文件
save(gset,file=f) ## 保存到本地
}
##
getwd()
load("G:/r/duqiang_IPF/GSE70866_BAL_IPF_donors_RNA-seq/GSE32537_eSet.Rdata") ## 载入数据
class(gset) #查看数据类型
length(gset) #
class(gset[[1]])
gset
names(gset)
gset[[1]]@annotation # 平台信息——提取芯片平台编号
gset[[2]]@annotation
> gset
getgeo得到的表达矩阵是归一化之后的矩阵,是否直接下载就是raw data 了呢
exprs(gset[[1]])[1:4,1:4]
查看第一个样本的基因表达分布情况
boxplot(exprs(gset[[1]])[,1],las=2)
meta.32573=pData(gset[[1]])
head(meta.32573)[.1:ncol(meta.32573)]
expr.32573=exprs(gset[[1]])
head(expr.32573)[1:4,1:5]
dim(expr.32573)
dim(meta.32573)
getwd()#"G:/r/duqiang_IPF/GSE70866_BAL_IPF_donors_RNA-seq"
gp6244 <- getGEO("GPL6244", destdir=".") ##根据GPL号下载的是芯片设计的信息!
head(gp6244) #https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6244
colnames(gp6244)
colnames(gp6244@dataTable@table)
head(gp6244@dataTable@table)
gp6244@dataTable@table[,c("ID","gene_assignment")]
head(gp6244@dataTable@table[,c("ID","gene_assignment")])
dim(gp6244@dataTable@table)
ids=gp6244@dataTable@table[,c("ID","gene_assignment")]
ids[1:3,1:2]
library(stringr)
library(dplyr)
str_split("// 19p13.3 // 81099 /// BC136867 // OR4F17 // olfact",pattern = " // ")[[1]][2]
#自建函数
get_no.2<-function(x){
str_split(x,pattern = " // ")[[1]][2]
}
unlist(lapply(ids$gene_assignment[1:4],get_no.2))
ids6244=ids %>% mutate(gene_assignment=unlist(lapply(ids$gene_assignment,get_no.2))) %>%na.omit()
head(ids6244)
head(ids6244)
colnames(ids6244)=c("ID","gene_assignment")
ids6244=ids6244[,c("ID","gene_assignment")]
head(ids6244)
colnames(ids6244) <- c("PROBE_ID", "SYMBOL_ID")#改名,让他适合下面的自定义函数
#ids6244 =ids6244 %>% mutate(PROBE_ID=transform(as.character(PROBE_ID)))
colnames(ids6244) <- c("PROBE_ID", "SYMBOL_ID")#改名,让他适合下面的自定义函数
head(ids6244)
str(ids6244)
ids6244$PROBE_ID=as.character(ids6244$PROBE_ID)
head(ids6244)
expr.32573
head(expr.32573)[,1:4]
#自建函数
p2g <- function(eset,probe2symbol){
library(dplyr)
library(tibble)
library(tidyr)
# eset=expr.32573
# probe2symbol=ids6244
eset <- as.data.frame(eset)
p2g_eset <- eset %>%
rownames_to_column(var="PROBE_ID") %>% transform(PROBE_ID=as.character(PROBE_ID))%>% #合并探针的信息
inner_join(probe2symbol,by="PROBE_ID") %>% #去掉多余信息
select(-PROBE_ID) %>% #重新排列
dplyr::select(SYMBOL_ID,everything()) %>% #求出平均数(这边的点号代表上一步产出的数据)
mutate(rowMean = rowMeans(.[grep("GSM", names(.))])) %>% #去除symbol中的NA
filter(SYMBOL_ID != "NA") %>% #把表达量的平均值按从大到小排序
arrange(desc(rowMean)) %>% # symbol留下第一个
distinct(SYMBOL_ID,.keep_all = T) %>% #反向选择去除rowMean这一列
dplyr::select(-rowMean) %>% # 列名变成行名
column_to_rownames(var = "SYMBOL_ID")
#save(p2g_eset, file = "p2g_eset.Rdata")
return(p2g_eset)
}
p2g_eset <- p2g(eset = expr.32573, probe2symbol = ids6244)
head(p2g_eset)
dim(p2g_eset)
#探针注释信息下载
if(1==3){
#BiocManager::install('hugene10sttranscriptcluster.db') #下载,注意在包的后面手动加上.db
library('hugene10sttranscriptcluster.db')#载入
ls("package:hugene10sttranscriptcluster.db")#查看,找到后面是SYMBOL的,复制名字
ids=toTable(hugene10sttranscriptclusterSYMBOL)#提取注释信息
table(table(ids$symbol)>1)#统计有多少个基因对应一个以上的探针
table(sort(table(ids$symbol)))#统计有1、2、3等个探针数的基因分别有多少个,大多数基因只有一个探针
head(toTable(hugene10sttranscriptclusterSYMBOL))
exprSet=expr.32573
#更改ID https://www.jianshu.com/p/bf4e5b57b05a
if(1==1){
rownames(exprSet)%in% ids$probe_id#a %in% table a是否位于table中,返回T or F。这里判断exprSet的行名是否位于ids的probe_id这一列中
table(rownames(exprSet)%in% ids$probe_id)#统计T、F的个数,有13483个探针不存在对应的基因名
n_exprSet=exprSet[!(rownames(exprSet)%in% ids$probe_id)==F,]#将包含在探针包里的探针测序数据新建一个数据框,这里很奇怪,因该是满足条件的包括进来,这里却填F的时候会包括进来
n_exprSet=exprSet[(rownames(exprSet)%in% ids$probe_id),]#和上面的语句等价
dim(n_exprSet)
dim(ids)
tmp = by(n_exprSet,ids[ids$probe_id %in% rownames(n_exprSet),]$probe_id,
function(x)
rownames(x)[which.max(rowMeans(x))] )
head(tmp)#by函数,将对象按行或某种方式分为一个个小的子集,
#对每个子集进行操作。这里将对n_exprSet的行按与ids中的symbol的对应关系,将同一个symbol的探针放在一起,
#然后对子集中的每一行取平均值,返回每个子集中平均值大的那一行的行名(探针名)
probes = as.character(tmp)#定义为字符型
View(probes)#这里就是样本中相同基因平均值最大的探针
exprSet=exprSet[rownames(exprSet) %in% probes ,]#对exprSet的行进行操作,若该行的行名在probes中出现了则保留下来
dim(exprSet)#现在为18821个探针在6个样本中的表达量
View(exprSet)
rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]#将exprSet的行名与ids的probe_id相互匹配,若相同,则exprSet的行名为ids的第二列
View(exprSet)
exprSet['GAPDH',]#查看GAPDH在六个样本中的表达量
exprSet['ACTB',]#查看β-actin在六个样本中的表达量
boxplot(exprSet)#画六个样本表达量的箱型图
}
}
head(meta.32573)[,1:5]
colnames(meta.32573)
head(meta.32573$type)
meta.32573=meta.32573[,c('geo_accession',"final diagnosis:ch1","age:ch1",
"gender:ch1","fvc pre-bronchodilator % predicted:ch1"
)]
head(meta.32573)
table(meta.32573$`final diagnosis:ch1`)
colnames(meta.32573)=c('geo_acc','diagnosis','age','gender','fvc')
meta.info=meta.32573
head(meta.info)
dim(expr.32573)
dim(p2g_eset)
getwd()
setwd('G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF')
#save(p2g_eset,expr.32573,meta.info,file = 'G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF/gse32537.RData')
load('G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF/gse32537.RData')
head(meta.info)
colnames(meta.info)
expr.32573=p2g_eset
head(expr.32573)[,1:4]
expr.32573=t(expr.32573) %>%as.data.frame() %>% rownames_to_column(var = "sample")
head(expr.32573)[,1:4]
expr.32573$geo_acc=expr.32573$sample
mydata=inner_join(expr.32573,meta.info,by='geo_acc')
head(mydata)[,1:10]
dim(mydata)
mydata=mydata %>% select(c('geo_acc','diagnosis'),everything())
head(mydata)[,1:5]
table(mydata$diagnosis)
gene_interested=readClipboard()
head(gene_interested)
gene_interested= str_split(gene_interested,pattern = ",")[[1]]
head(gene_interested)
mydata2=mydata[,colnames(mydata)%in% gene_interested]
head(mydata2)
mydata2=cbind(mydata2,mydata$geo_acc,mydata$diagnosis)
head(mydata2)
mydata2=mydata2 %>% rename(geo_acc='mydata$geo_acc',
group='mydata$diagnosis')
mydata2=mydata2 %>% select(group,geo_acc,everything())%>% select(-geo_acc)
data_mean= select(mydata2,1,2) %>%
group_by(group) %>%
dplyr::summarize(
count=n(),
mean = mean(KCNRG),
sd = sd(KCNRG)
)
head(data_mean)
head(mydata2)
table(mydata2$group)
library(ggplot2)
library(ggpubr)
ggboxplot(mydata2, x = "group", y = "KCNRG",
color = "group", palette = "jco",
add = "jitter")+
stat_compare_means(comparisons = list(c('control','COP'),
c('control','IPF/UIP'),
c('control','NSIP'),
c('control','DIP'),
c('control','UF'),
c('control','RB-ILD')
))
for (eachgene in colnames(mydata2)[!(colnames(mydata2)%in% "group")] ) {
#eachgene="KCNRG"
p=ggboxplot(mydata2, x = "group", y = eachgene,
color = "group", palette = "jco",
add = "jitter")+
stat_compare_means(comparisons = list(c('control','COP'),
c('control','IPF/UIP'),
c('control','NSIP'),
c('control','DIP'),
c('control','UF'),
c('control','RB-ILD')
))
pdf(paste0(eachgene, "_gene_expression_gse32537.pdf"),width = 5, height = 5)
print(p, newpage = FALSE)
dev.off()
}
getwd()
#save(mydata2,file = "G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF/data_for_gene_expression_sigficance.rds")
load("G:/r/duqiang_IPF/GSE32537_cilium_Gene_signures_IPF/data_for_gene_expression_sigficance.rds")