【GEO数据库挖掘】三、表达矩阵

2024-12-03 00:00:38 阅读数 2569

代码参考：

https://github.com/jmzeng1314/5years/blob/master/learn-R/tasks/3-r-20-codes.R

1 构建新矩阵

rm(list=ls())

if(T){
  # BiocManager::install('CLL')
  suppressPackageStartupMessages(library(CLL))
  data(sCLLex)
  sCLLex
  exprSet=exprs(sCLLex)   
  ##sCLLex是依赖于CLL这个package的一个对象
  samples=sampleNames(sCLLex)
  pdata=pData(sCLLex)
  group_list=as.character(pdata[,2])
  dim(exprSet)
  exprSet[1:5,1:5]
  
  # BiocManager::install('hgu95av2.db')
  library(hgu95av2.db)
  ls("package:hgu95av2.db") 
  
  ids=toTable(hgu95av2SYMBOL)
  save(ids,exprSet,pdata,file = 'input.Rdata')
  length(unique(ids$symbol))
  tail(sort(table(ids$symbol)))
  table(sort(table(ids$symbol)))
  
  plot(table(sort(table(ids$symbol))))
  
  table(rownames(exprSet) %in% ids$probe_id)
  dim(exprSet)
  exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
  dim(exprSet)
  
  ids=ids[match(rownames(exprSet),ids$probe_id),]
  head(ids)
  rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]##关键一步，把exprSet的列改为ids匹配的基因名列
  exprSet[1:5,1:5]
}

image.png

##检验一些常见基因，查看GAPDH基因表达量
> exprSet['GAPDH',]
CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL 
 11.43349  10.62135  11.64079  11.15077  11.58473  11.69951  11.40295  10.35316 
CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL  CLL2.CEL  CLL3.CEL 
 10.78962  11.20747  11.66021  10.98364  11.57492  10.97744  11.33880  11.46153 
 CLL4.CEL  CLL5.CEL  CLL6.CEL  CLL7.CEL  CLL8.CEL  CLL9.CEL 
 11.84727  11.40218  11.57526  11.98662  10.65648  11.29905 

##查看第一列的表达值
> boxplot(exprSet[,1])

image.png

##看到表达值5左右，GAPDH的表达值10很高
##查看ACTB基因表达量，都是比较高的
> exprSet['ACTB',]
CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL 
 11.61486  12.64229  12.45857  12.28548  11.79786  11.00114  10.91522  11.48071 
CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL CLL23.CEL CLL24.CEL  CLL2.CEL  CLL3.CEL 
 11.48298  10.64709  12.82371  12.20490  12.46759  11.31447  12.40611  11.96456 
 CLL4.CEL  CLL5.CEL  CLL6.CEL  CLL7.CEL  CLL8.CEL  CLL9.CEL 
 11.77779  11.82606  12.30924  11.20237  12.18950  12.12897

library(reshape2)
exprSet_L=melt(exprSet)
colnames(exprSet_L)=c('probe','sample','value')
exprSet_L$group=rep(group_list,each=nrow(exprSet))
head(exprSet_L)

### ggplot2 
library(ggplot2)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
print(p)##图1
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_violin()
print(p)
p=ggplot(exprSet_L,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
print(p)
p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)##查看样本分布趋势是否相同
print(p)##图2
p=ggplot(exprSet_L,aes(value,col=group))+geom_density() 
print(p)
p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
p=p+theme_set(theme_set(theme_bw(base_size=20)))
p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
print(p)

图1 全部数据的箱线图，整体数据表达量在一条线上，表明可以比较

image.png

图2 检查样本分布是否有问题

image.png

##绘画样本间进化关系
plot(hclust(dist(t(exprSet))))

image.png

##把名改为progress（横坐标），再次画图，整体结果符合实验趋势

## hclust 
colnames(exprSet)=paste(group_list,1:22,sep='')

image.png

## PCA图 

# BiocManager::install('ggfortify')
library(ggfortify)
df=as.data.frame(t(exprSet))
df$group=group_list 
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')

image.png

2 回到GSE42872表达矩阵

还有很多需要加工的地方。

dat=exprSet

##ids新建median这一列，列名为median，同时对dat这个矩阵按行操作，取每一行的中位数，将结果给到median这一列的每一行
ids$median=apply(dat,1,median)

##对ids$symbol按照ids$median中位数从大到小排列的顺序排序，将对应的行赋值为一个新的ids
ids=ids[order(ids$symbol,ids$median,decreasing = T),]

##将symbol这一列取取出重复项，'!'为否，即取出不重复的项，去除重复的gene ，保留每个基因最大表达量结果s
ids=ids[!duplicated(ids$symbol),]

##新的ids取出probe_id这一列，将dat按照取出的这一列中的每一行组成一个新的dat
dat=dat[ids$probe_id,] 

##把ids的symbol这一列中的每一行给dat作为dat的行名
rownames(dat)=ids$symbol

##保留每个基因ID第一次出现的信息
dat[1:4,1:4]

表达矩阵的变化：新的表达矩阵叫dat

image.png

看箱线图

group_list=c('control','control','control','case','case','case')

library(reshape2)
exprSet_L=melt(dat)
colnames(exprSet_L)=c('sample','value')

library(ggplot2)
p=ggplot(exprSet_L,aes(x=sample,y=value))+geom_boxplot()
print(p)

image.png

整体分布还是比较均一。

hc图

## hclust 
colnames(dat)=paste(group_list,1:6,sep='')

## Define nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 
                cex = 0.7, col = "blue")
hc=hclust(dist(t(dat)))
par(mar=c(5,5,5,10)) 
plot(as.dendrogram(hc), nodePar = nodePar,  horiz = TRUE)

image.png

可以看出，control和case严格区分，实验结果符合预期。

PCA图

## BiocManager::install('ggfortify')
library(ggfortify)
df=as.data.frame(t(dat))
df$group=group_list 
autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')

image.png

数据区分，药物处理的结果明显。

表达矩阵分析就到这里，下一篇我们继续差异分析。

我们下一篇再见！