geo读取表达矩阵 RNA-seq R语言部分（表达矩阵合并及id转换）

这篇具有很好参考价值的文章主要介绍了geo读取表达矩阵 RNA-seq R语言部分（表达矩阵合并及id转换）。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

geo读取表达矩阵 RNA-seq R语言
方法一：1.从geo页面直接下载表达矩阵，然后通过r读取表达矩阵
2.利用getgeo函数读取表达矩阵
3.利用geo自带的geo2r，调整p值为1，获取探针和基因名的对应关系


#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/

getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE150425_a549_kras_organoid/")

expr.df=read.csv("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE150425_a549_kras_organoid/GSE150425_Cufflinks_Gene_Counts/GSE150425_Cufflinks_Gene_Counts.csv",
                   header=T,
                   sep=",", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                   comment.char="!")
head(expr.df)
head(expr.df)
expr.df[expr.df$Gene_ID %in% 'RETN',]
library(dplyr)



colnames(expr.df)[1]='Gene_Symbol'
expr.df[expr.df$Gene_Symbol=='Retn'|
          expr.df$Gene_Symbol=='Retnlb'|
          expr.df$Gene_Symbol=='Jchain'|
          expr.df$Gene_Symbol=='Igha1'|
          expr.df$Gene_Symbol=='Pdk4'|
          expr.df$Gene_Symbol=='Actb',]

geo rna-seq,笔记,r语言,矩阵,数据挖掘

#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/

getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE133037_a549_transfection 0f runx__tgfb/")

expr.df=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE133037_a549_transfection 0f runx__tgfb/GSE133037_VK-Rx3.TGFb_FPKM/GSE133037_VK.Rx3.TGFb_FPKM.txt",
                          header=T,
                          sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                          comment.char="!")
head(expr.df)

#colnames(expr.df)=expr.df[1,]
input=expr.df
head(input)
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
                 grepl(colnames(input),pattern = 'Start')|
                 grepl(colnames(input),pattern = 'End')|
                 grepl(colnames(input),pattern = 'Strand'))]
head(input)
colnames(input)[1]='Gene.ID'
colnames(input)[2]='Gene.Name'

head(input)
mydata=input %>% filter(Gene.Name=='RETNLB'|Gene.Name=='ACTB'|Gene.ID=='1053'|
                          Gene.Name=='56729'|Gene.ID=='3493'|
                          Gene.ID=='3512'|Gene.ID=='3934')
mydata
dim(input)

多个组别合并 id转化


#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/

getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/")

expr_SRR957678=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/GSM4007696_A549-1.htseq.count.txt",
                             header=FALSE,
                             sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                             comment.char="!")
expr_SRR957677=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007697_A549-2/GSM4007697_A549-2.htseq.count.txt",
                          header=FALSE,
                          sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                          comment.char="!")

expr_SRR957679 =read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007699_TD-2/GSM4007698_TD-1.htseq.count.txt",
                          header=FALSE,
                          sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                          comment.char="!")
expr_SRR957680 =read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/GSE135402_RAW/GSM4007699_TD-2/GSM4007699_TD-2.htseq.count.txt",
                           header=FALSE,
                           sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                           comment.char="!")


head(expr_SRR957677)
head(expr_SRR957678)


#先对照和实验组分别合并到一个data.frame再合并为一个总的数据框
control <- merge(expr_SRR957677,expr_SRR957678,by="V1")#by="gene_id"按照gene_id这列合并
head(control)

treat <- merge(expr_SRR957679,expr_SRR957680,by="V1")
head(treat)
raw_count <- merge(control,treat,by="V1")
head(raw_count)
colnames(raw_count)=c('gene_id','a5491','a5492','td1','td2')
raw_count <-raw_count[-1:-5,]#删掉前五行，由上一步结果可以看出前五行gene_id有问题
head(raw_count)



2#.去除gene_id名字中的小数点
#EBI数据库中无法搜到ENSG00000000003.10这样的基因，只能是整数ENSG00000000003才能进行id转换

library(stringr)
ENSEMBL <- raw_count$gene_id
 head(ENSEMBL)

 ENSEMBL <- str_split_fixed(ENSEMBL,'[.]',2)[,1]#将小数点及后面的数字去掉，ensembl_gene_id是整数
 head(ENSEMBL)

 rownames(raw_count) <- ENSEMBL#将去除小数点后的ensembl_gene_id作为行名后期方便绘图
 raw_count$ensembl_gene_id <-  ENSEMBL#新建ensembl_gene_id列便于合并
 head(raw_count)


 3.#对基因进行注释，获取gene_symbol
 #bioMart包是一个连接bioMart数据库的R语言接口，能通过这个软件包自由连接到bioMart数据库
# BiocManager::install('biomaRt')
 library(biomaRt)
 
 #显示一下能连接的数据库
 listMarts()
  listMarts()

  #用useMart函数选定数据库
  plant<-useMart("ensembl")
  #用listDatasets()函数显示当前数据库所含的基因组注释,我们要获取的基因注释的基因是人类基因，所以选择hsapiens_gene_ensembl
   listDatasets(plant)
grep(listDatasets(plant)[,1],pattern = 'hsa')
   #选定ensembl数据库中的hsapiens_gene_ensembl基因组
   mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))


   
   #选定我们需要获得的注释类型
   #用lsitFilters()函数查看可选择的类型，选定要获取的注释类型，以及已知注释的类型用lsitFilters()函数查看可选择的类型，选定要获取的注释类型，以及已知注释的类型
    listFilters(mart)   
    #这里我们选择这些要获得数值的类型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band我们已知的类型是ensembl_gene_id选择好数据库，基因组，要获得的注释类型，和已知的注释类型，就可以开始获取注释了
    
    #用getBM()函数获取注释
    hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = ENSEMBL, mart = mart)
    
    #attributers()里面的值为我们要获取的注释类型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band
    #filters（）里面的值为我们已知的注释类型ensembl_gene_id
    #values= 这个值就是我们已知的注释类型的数据，把上面我们通过数据处理得到的ensembl基因序号作为ensembl_gene_id 的值
    #mart= 这个值是我们所选定的数据库的基因组
    head(hg_symbols)   
   
 getwd()  
 setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb")
    save(mart, file = "mart.RData")#将mart变量保存为mart.RData
    save(hg_symbols, file = "hg_symbols.RData")#将hg_symbols变量保存为hg_symbols，biomart包不太稳定，有时连不上    
    
    
    4.#将合并后的表达数据框raw_count与注释得到的hg_symbols整合    
    
 read_count <- merge(raw_count,hg_symbols,by="ensembl_gene_id")#将raw_count与注释后得到的hg_symbols合并
     head(read_count)    
    openxlsx::write.xlsx(read_count,file = "readcount_all.xlsx")    

   input =openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE135402_A549_TD_tgfb/readcount_all.xlsx") 
     
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
                 grepl(colnames(input),pattern = 'Start')|
                 grepl(colnames(input),pattern = 'End')|
                 grepl(colnames(input),pattern = 'Strand'))]
head(input)
colnames(input)[1]='Gene.ID'
colnames(input)[7]='Gene.Name'

head(input)
mydata=input %>% filter(Gene.Name=='RETNLB'|Gene.Name=='ACTB'|Gene.ID=='ENSG00000104918'|
                          Gene.Name=='RETN'|Gene.ID=='ENSG00000132465'|
                          Gene.ID=='ENSG00000211895'|Gene.ID=='ENSG00000092067')
mydata
dim(input)

下载表达矩阵和getgeo函数联合使用


#http://zouyawen.top/2020/10/09/%E8%BD%AC%E5%BD%95%E7%BB%844/

getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/")

expr.df=read.table("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/geo2r.tsv",
                   header=T,
                   sep="\t", #Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec
                   comment.char="!")
head(expr.df)
expr.df[1:10,]
#colnames(expr.df)=expr.df[1,]
input=expr.df
head(input)
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Chr')|
                 grepl(colnames(input),pattern = 'Start')|
                 grepl(colnames(input),pattern = 'End')|
                 grepl(colnames(input),pattern = 'Strand'))]
head(input)
grep(input$Gene.title,pattern = 'RETN')
input[grep(input$Gene.symbol,pattern = 'RETN'),]
input[grep(input$Gene.symbol,pattern = 'IGHA1'),]
input[grep(input$Gene.title,pattern = 'immunoglobulin heavy'),]



library(GEOquery)
f='GSE114761_eSet.Rdata'
if(!file.exists(f)){
  gset <- getGEO('GSE114761', destdir=".",
                 AnnotGPL = F,     ## 注释文件
                 getGPL = F)       ## 平台文件
  save(gset,file=f)   ## 保存到本地
}

getwd()
load('GSE114761_eSet.Rdata')  ## 载入数据
class(gset)  #查看数据类型
length(gset)  #
class(gset[[1]])
gset
exprs(gset[[1]])
gset[[1]]


a=exprs(gset[[1]])
head(a)
a['220570_at',]
a[rownames(a)=='220570_at',]

input[grep(input$Gene.symbol,pattern = 'IGHA1'),][,1]
a[rownames(a) %in% input[grep(input$Gene.symbol,pattern = 'IGHA1'),][,1],]
a[rownames(a) %in% input[grep(input$Gene.title,pattern = 'IGCJ'),][,1],]


myexpression=as.data.frame(a)
head(myexpression)
myexpression$probe_id=rownames(myexpression)
head(myexpression)

head(input)
input$probe_id=input$ID
head(input)

mydata=merge(myexpression,input,by='probe_id')
head(mydata)

colnames(a)
boxplot(mydata[,2:42])
getwd()
save(mydata,file = "G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")
load("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE114761_various_A549_TGFB/myexpression_matrix.rds")

colnames(mydata)
a=read.table(file = 'clipboard')
head(a)
a=a[-1,]
a
colnames(mydata)[2:43]=a$V2
head(mydata)


mydata[mydata$Gene.symbol=='RETN'|
         mydata$Gene.symbol=='IGHA1'|
         mydata$Gene.symbol=='RETNLB',]

读取excel表达矩阵文章来源地址https://www.toymoban.com/news/detail-615344.html


getwd()
setwd("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/")

syndecan_fibrosis=openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/GSE164160_RAW/GSM4998575_TGFb_1.clean_trimmed_GE_.xlsx",
                                      colNames = T)
head(syndecan_fibrosis)

input=syndecan_fibrosis
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Region')|
                 grepl(colnames(input),pattern = 'Start')|
                 grepl(colnames(input),pattern = 'End')|
                 grepl(colnames(input),pattern = 'Strand'))]
head(input)
mydata=input %>% filter(Name=='Regnla'|Name=='Actb'|ENSEMBL=='ENSMUSG00000061100'|ENSEMBL=='ENSMUSG00000015839'|
                          Name=='Retnlg'|ENSEMBL=='ENSMUSG00000052435'|
                          ENSEMBL=='ENSMUSG00000095079'|ENSEMBL=='ENSMUSG00000067149')
mydata
dim(input)
#################################33
syndecan_fibrosis=openxlsx::read.xlsx("G:/silicosis/geo/GSE103548_rna-seq_llc_mle-12/GSE164160_CMT64_TGFB_Alveogenic lung carcinoma/GSE164160_RAW/GSM4998576_TNFa_1.clean_trimmed_GE_.xlsx",
                                      colNames = T)
head(syndecan_fibrosis)

input=syndecan_fibrosis
library(dplyr)
input=input[,!(grepl(colnames(input),pattern = 'Region')|
                 grepl(colnames(input),pattern = 'Start')|
                 grepl(colnames(input),pattern = 'End')|
                 grepl(colnames(input),pattern = 'Strand'))]
head(input)
mydata=input %>% filter(Name=='Regnla'|Name=='Actb'|ENSEMBL=='ENSMUSG00000061100'|ENSEMBL=='ENSMUSG00000015839'|
                          Name=='Retnlg'|ENSEMBL=='ENSMUSG00000052435'|
                          ENSEMBL=='ENSMUSG00000095079'|ENSEMBL=='ENSMUSG00000067149')
mydata
dim(input)