Retrieve Entrez IDs for enriched GO terms

0

Entering edit mode

Yuan Hao ▴ 90

@yuan-hao-3303

Last seen 9.7 years ago

Dear list, I spent a long time trying to figure out this problem, but without progress. I would appreciate it very much if you could give me some help. I got a list of differential expressed genes from microarray analysis by using limma. Then I did GO enrichment analysis on these genes by hypeGTest() method available in GOstats package. Now I want to retrieve entrez gene IDs in my gene list that correspond to each enriched GO terms. I found there are two ways to get the entrez gene IDs: using probeSetSummary() from GOstats, or using getBM() from biomaRt. I tried both method, and they all worked, but I got two different lists (lengths 13 vs 24) of entrez gene IDs corresponding to a single GO term, and most of them are not overlapped. I am not very familiar with the annotation and/or genome assembly, so I am not sure whether it is because the two methods using different annotation/ assembly that caused this problem. # get geneIds for hyperGTest > topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) > prbs<-topA[,1] > hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) > prbs<-prbs[hasGO] > prbs<-getEG(prbs,"hgu133plus2") > prbs<-prbs[!duplicated(prbs)] # get universeGeneIds for hyperGTest > univ<-featureNames(eset) > hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) > univ<-univ[hasUnivGO] > univ<-unique(getEG(univ,"hgu133plus2")) # compose params and carry out hyperGTest > p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, ontology="BP", annotation="hgu133plus2", conditional=TRUE) > if(interactive()){ + hyp<-hyperGTest(p) + ps<-probeSetSummary(hyp) } # retrieve entrez IDs for one enriched GO term GO:0007498 > unique(ps$"GO:0007498"$EntrezID) [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" "695" [10] "8013" "8320" "83439" "9314" # using biomaRt package > ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") > summary <- summary(hyp) > goID<-summary$GOBPID > E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), filters="go", values=goID, mart=ensembl) > oneGO<-sapply(E$"go_biological_process_id",function(i) + if (i=="GO:0007498") TRUE else FALSE) > EE<-E[oneGO,] # retrieve entrez IDs for the same GO term, GO:0007498 > unique(EE$entrezgene) [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 6911 10320 [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 6943 6862 Thank you very much in advance! Kind regards, Yuan [[alternative HTML version deleted]]

Microarray Annotation GO limma GOstats biomaRt Microarray Annotation GO limma GOstats • 2.1k views

ADD COMMENT • link updated 14.6 years ago by Heidi Dvinge ★ 2.0k • written 14.6 years ago by Yuan Hao ▴ 90

0

Entering edit mode

Heidi Dvinge ★ 2.0k

@heidi-dvinge-2195

Last seen 9.7 years ago

Hello Yuan, have you tried using the accessor functions for your test object directly? For example: > geneIdsByCategory(hyp, catids="GO:0007498") Does this give you what you want? Cheers \Heidi On 24 Sep 2009, at 11:31, Yuan Hao wrote: > Dear list, > > I spent a long time trying to figure out this problem, but without > progress. I would appreciate it very much if you could give me some > help. > > I got a list of differential expressed genes from microarray analysis > by using limma. Then I did GO enrichment analysis on these genes by > hypeGTest() method available in GOstats package. Now I want to > retrieve entrez gene IDs in my gene list that correspond to each > enriched GO terms. I found there are two ways to get the entrez gene > IDs: using probeSetSummary() from GOstats, or using getBM() from > biomaRt. I tried both method, and they all worked, but I got two > different lists (lengths 13 vs 24) of entrez gene IDs corresponding to > a single GO term, and most of them are not overlapped. I am not very > familiar with the annotation and/or genome assembly, so I am not sure > whether it is because the two methods using different annotation/ > assembly that caused this problem. > > # get geneIds for hyperGTest > >> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) > >> prbs<-topA[,1] > >> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) > > + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) > >> prbs<-prbs[hasGO] > >> prbs<-getEG(prbs,"hgu133plus2") > >> prbs<-prbs[!duplicated(prbs)] > > # get universeGeneIds for hyperGTest > >> univ<-featureNames(eset) > >> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) > > + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) > >> univ<-univ[hasUnivGO] > >> univ<-unique(getEG(univ,"hgu133plus2")) > > # compose params and carry out hyperGTest > >> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, > ontology="BP", annotation="hgu133plus2", conditional=TRUE) > >> if(interactive()){ > > + hyp<-hyperGTest(p) > > + ps<-probeSetSummary(hyp) > > } > > # retrieve entrez IDs for one enriched GO term GO:0007498 > >> unique(ps$"GO:0007498"$EntrezID) > > [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" > "695" > > [10] "8013" "8320" "83439" "9314" > > > > # using biomaRt package > >> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") > >> summary <- summary(hyp) > >> goID<-summary$GOBPID > >> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), > filters="go", values=goID, mart=ensembl) > >> oneGO<-sapply(E$"go_biological_process_id",function(i) > > + if (i=="GO:0007498") TRUE else FALSE) > >> EE<-E[oneGO,] > > # retrieve entrez IDs for the same GO term, GO:0007498 > >> unique(EE$entrezgene) > > [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 > 6911 10320 > > [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 > 6943 6862 > > > Thank you very much in advance! > > Kind regards, > Yuan > > > > > > > > > > > [[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor@stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/ > gmane.science.biology.informatics.conductor [[alternative HTML version deleted]]

ADD COMMENT • link 14.6 years ago Heidi Dvinge ★ 2.0k

0

Entering edit mode

Hi Heidi, Thank you very much for your reply. The method you provided gives the same result as using probeSetSummary, but I don't understand why this result is different from the one got from biomaRt? Do you have some insight about it? Kind regards, Yuan On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: > Hello Yuan, > > have you tried using the accessor functions for your test object > directly? For example: > > > geneIdsByCategory(hyp, catids="GO:0007498") > > Does this give you what you want? > > Cheers > \Heidi > > > On 24 Sep 2009, at 11:31, Yuan Hao wrote: > >> Dear list, >> >> I spent a long time trying to figure out this problem, but without >> progress. I would appreciate it very much if you could give me some >> help. >> >> I got a list of differential expressed genes from microarray analysis >> by using limma. Then I did GO enrichment analysis on these genes by >> hypeGTest() method available in GOstats package. Now I want to >> retrieve entrez gene IDs in my gene list that correspond to each >> enriched GO terms. I found there are two ways to get the entrez gene >> IDs: using probeSetSummary() from GOstats, or using getBM() from >> biomaRt. I tried both method, and they all worked, but I got two >> different lists (lengths 13 vs 24) of entrez gene IDs corresponding >> to >> a single GO term, and most of them are not overlapped. I am not very >> familiar with the annotation and/or genome assembly, so I am not sure >> whether it is because the two methods using different annotation/ >> assembly that caused this problem. >> >> # get geneIds for hyperGTest >> >>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >> >>> prbs<-topA[,1] >> >>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >> >> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >> >>> prbs<-prbs[hasGO] >> >>> prbs<-getEG(prbs,"hgu133plus2") >> >>> prbs<-prbs[!duplicated(prbs)] >> >> # get universeGeneIds for hyperGTest >> >>> univ<-featureNames(eset) >> >>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >> >> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >> >>> univ<-univ[hasUnivGO] >> >>> univ<-unique(getEG(univ,"hgu133plus2")) >> >> # compose params and carry out hyperGTest >> >>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >> >>> if(interactive()){ >> >> + hyp<-hyperGTest(p) >> >> + ps<-probeSetSummary(hyp) >> >> } >> >> # retrieve entrez IDs for one enriched GO term GO:0007498 >> >>> unique(ps$"GO:0007498"$EntrezID) >> >> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >> "695" >> >> [10] "8013" "8320" "83439" "9314" >> >> >> >> # using biomaRt package >> >>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >> >>> summary <- summary(hyp) >> >>> goID<-summary$GOBPID >> >>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >> filters="go", values=goID, mart=ensembl) >> >>> oneGO<-sapply(E$"go_biological_process_id",function(i) >> >> + if (i=="GO:0007498") TRUE else FALSE) >> >>> EE<-E[oneGO,] >> >> # retrieve entrez IDs for the same GO term, GO:0007498 >> >>> unique(EE$entrezgene) >> >> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >> 6911 10320 >> >> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >> 6943 6862 >> >> >> Thank you very much in advance! >> >> Kind regards, >> Yuan >> >> >> >> >> >> >> >> >> >> >> [[alternative HTML version deleted]] >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor@stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > [[alternative HTML version deleted]]

ADD REPLY • link 14.6 years ago Yuan Hao ▴ 90

0

Entering edit mode

these databases are built using different methods so you get different results. A common problem in bioinformatics! Solution. Go to the ncbi ftp site, for the entrez gene database and download the gene2go.gz file. Unzip and query. ________________________________________ From: bioconductor-bounces@stat.math.ethz.ch [bioconductor- bounces@stat.math.ethz.ch] On Behalf Of Yuan Hao [yuan.hao@ucd.ie] Sent: 24 September 2009 11:46 To: Heidi Dvinge Cc: bioconductor at stat.math.ethz.ch Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms Hi Heidi, Thank you very much for your reply. The method you provided gives the same result as using probeSetSummary, but I don't understand why this result is different from the one got from biomaRt? Do you have some insight about it? Kind regards, Yuan On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: > Hello Yuan, > > have you tried using the accessor functions for your test object > directly? For example: > > > geneIdsByCategory(hyp, catids="GO:0007498") > > Does this give you what you want? > > Cheers > \Heidi > > > On 24 Sep 2009, at 11:31, Yuan Hao wrote: > >> Dear list, >> >> I spent a long time trying to figure out this problem, but without >> progress. I would appreciate it very much if you could give me some >> help. >> >> I got a list of differential expressed genes from microarray analysis >> by using limma. Then I did GO enrichment analysis on these genes by >> hypeGTest() method available in GOstats package. Now I want to >> retrieve entrez gene IDs in my gene list that correspond to each >> enriched GO terms. I found there are two ways to get the entrez gene >> IDs: using probeSetSummary() from GOstats, or using getBM() from >> biomaRt. I tried both method, and they all worked, but I got two >> different lists (lengths 13 vs 24) of entrez gene IDs corresponding >> to >> a single GO term, and most of them are not overlapped. I am not very >> familiar with the annotation and/or genome assembly, so I am not sure >> whether it is because the two methods using different annotation/ >> assembly that caused this problem. >> >> # get geneIds for hyperGTest >> >>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >> >>> prbs<-topA[,1] >> >>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >> >> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >> >>> prbs<-prbs[hasGO] >> >>> prbs<-getEG(prbs,"hgu133plus2") >> >>> prbs<-prbs[!duplicated(prbs)] >> >> # get universeGeneIds for hyperGTest >> >>> univ<-featureNames(eset) >> >>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >> >> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >> >>> univ<-univ[hasUnivGO] >> >>> univ<-unique(getEG(univ,"hgu133plus2")) >> >> # compose params and carry out hyperGTest >> >>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >> >>> if(interactive()){ >> >> + hyp<-hyperGTest(p) >> >> + ps<-probeSetSummary(hyp) >> >> } >> >> # retrieve entrez IDs for one enriched GO term GO:0007498 >> >>> unique(ps$"GO:0007498"$EntrezID) >> >> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >> "695" >> >> [10] "8013" "8320" "83439" "9314" >> >> >> >> # using biomaRt package >> >>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >> >>> summary <- summary(hyp) >> >>> goID<-summary$GOBPID >> >>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >> filters="go", values=goID, mart=ensembl) >> >>> oneGO<-sapply(E$"go_biological_process_id",function(i) >> >> + if (i=="GO:0007498") TRUE else FALSE) >> >>> EE<-E[oneGO,] >> >> # retrieve entrez IDs for the same GO term, GO:0007498 >> >>> unique(EE$entrezgene) >> >> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >> 6911 10320 >> >> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >> 6943 6862 >> >> >> Thank you very much in advance! >> >> Kind regards, >> Yuan >> >> >> >> >> >> >> >> >> >> >> [[alternative HTML version deleted]] >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > [[alternative HTML version deleted]] _______________________________________________ Bioconductor mailing list Bioconductor at stat.math.ethz.ch https://stat.ethz.ch/mailman/listinfo/bioconductor Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor

ADD REPLY • link 14.6 years ago michael watson IAH-C ★ 3.4k

0

Entering edit mode

On Thu, Sep 24, 2009 at 6:51 AM, michael watson (IAH-C) <michael.watson at="" bbsrc.ac.uk=""> wrote: > > these databases are built using different methods so you get different results. A common problem in bioinformatics! Solution. Go to the ncbi ftp site, for the entrez gene database and download the gene2go.gz file. Unzip and query. > ________________________________________ As usual, there are many ways to solve the problem and downloading tab-delimited text files is one. However, the GO package in bioconductor is built using the NCBI data (actually, the file noted above), so there really isn't a need to download files. The data are already present in the GO.db package and in all chip annotation packages built by Bioconductor. Sean > From: bioconductor-bounces at stat.math.ethz.ch [bioconductor- bounces at stat.math.ethz.ch] On Behalf Of Yuan Hao [yuan.hao at ucd.ie] > Sent: 24 September 2009 11:46 > To: Heidi Dvinge > Cc: bioconductor at stat.math.ethz.ch > Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms > > Hi Heidi, > > Thank you very much for your reply. The method you provided gives the > same result as using probeSetSummary, but I don't understand why this > result is different from the one got from biomaRt? Do you have some > insight about it? > > Kind regards, > Yuan > > On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: > >> Hello Yuan, >> >> have you tried using the accessor functions for your test object >> directly? For example: >> >> > geneIdsByCategory(hyp, catids="GO:0007498") >> >> Does this give you what you want? >> >> Cheers >> \Heidi >> >> >> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >> >>> Dear list, >>> >>> I spent a long time trying to figure out this problem, but without >>> progress. I would appreciate it very much if you could give me some >>> help. >>> >>> I got a list of differential expressed genes from microarray analysis >>> by using limma. Then I did GO enrichment analysis on these genes by >>> hypeGTest() method available in GOstats package. Now I want to >>> retrieve entrez gene IDs in my gene list that correspond to each >>> enriched GO terms. I found there are two ways to get the entrez gene >>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>> biomaRt. I tried both method, and they all worked, but I got two >>> different lists (lengths 13 vs 24) of entrez gene IDs corresponding >>> to >>> a single GO term, and most of them are not overlapped. I am not very >>> familiar with the annotation and/or genome assembly, so I am not sure >>> whether it is because the two methods using different annotation/ >>> assembly that caused this problem. >>> >>> # get geneIds for hyperGTest >>> >>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>> >>>> prbs<-topA[,1] >>> >>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>> >>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> prbs<-prbs[hasGO] >>> >>>> prbs<-getEG(prbs,"hgu133plus2") >>> >>>> prbs<-prbs[!duplicated(prbs)] >>> >>> # get universeGeneIds for hyperGTest >>> >>>> univ<-featureNames(eset) >>> >>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>> >>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> univ<-univ[hasUnivGO] >>> >>>> univ<-unique(getEG(univ,"hgu133plus2")) >>> >>> # compose params and carry out hyperGTest >>> >>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>> >>>> if(interactive()){ >>> >>> + hyp<-hyperGTest(p) >>> >>> + ps<-probeSetSummary(hyp) >>> >>> } >>> >>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>> >>>> unique(ps$"GO:0007498"$EntrezID) >>> >>> ? [1] "2131" ?"2139" ?"2296" ?"3717" ?"4088" ?"4771" ?"6398" ?"655" >>> "695" >>> >>> [10] "8013" ?"8320" ?"83439" "9314" >>> >>> >>> >>> # using biomaRt package >>> >>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>> >>>> summary <- summary(hyp) >>> >>>> goID<-summary$GOBPID >>> >>>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >>> filters="go", values=goID, mart=ensembl) >>> >>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>> >>> + if (i=="GO:0007498") TRUE else FALSE) >>> >>>> EE<-E[oneGO,] >>> >>> # retrieve entrez IDs for the same GO term, GO:0007498 >>> >>>> unique(EE$entrezgene) >>> >>> ? [1] ?5515 ? ?NA ? ?90 ?6398 ?2131 ?3717 ? 660 ?4145 84667 ?3055 >>> 6911 10320 >>> >>> [13] 10220 22806 ? 695 ?5017 23184 ?9355 ?2303 ?7075 ?4232 ? ?92 >>> 6943 ?6862 >>> >>> >>> Thank you very much in advance! >>> >>> Kind regards, >>> Yuan >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> ? ? ?[[alternative HTML version deleted]] >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> > > > ? ? ? ?[[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor at stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > > _______________________________________________ > Bioconductor mailing list > Bioconductor at stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >

ADD REPLY • link 14.6 years ago Sean Davis 21k

0

Entering edit mode

Hi Sean I realise the GO packages are built using the NCBI data, but when there is a disagreement between derived data packages, it is good practice to go to the source database. Cheers Mick ________________________________________ From: Sean Davis [seandavi@gmail.com] Sent: 24 September 2009 12:08 To: michael watson (IAH-C) Cc: Yuan Hao; Heidi Dvinge; bioconductor at stat.math.ethz.ch Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms On Thu, Sep 24, 2009 at 6:51 AM, michael watson (IAH-C) <michael.watson at="" bbsrc.ac.uk=""> wrote: > > these databases are built using different methods so you get different results. A common problem in bioinformatics! Solution. Go to the ncbi ftp site, for the entrez gene database and download the gene2go.gz file. Unzip and query. > ________________________________________ As usual, there are many ways to solve the problem and downloading tab-delimited text files is one. However, the GO package in bioconductor is built using the NCBI data (actually, the file noted above), so there really isn't a need to download files. The data are already present in the GO.db package and in all chip annotation packages built by Bioconductor. Sean > From: bioconductor-bounces at stat.math.ethz.ch [bioconductor- bounces at stat.math.ethz.ch] On Behalf Of Yuan Hao [yuan.hao at ucd.ie] > Sent: 24 September 2009 11:46 > To: Heidi Dvinge > Cc: bioconductor at stat.math.ethz.ch > Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms > > Hi Heidi, > > Thank you very much for your reply. The method you provided gives the > same result as using probeSetSummary, but I don't understand why this > result is different from the one got from biomaRt? Do you have some > insight about it? > > Kind regards, > Yuan > > On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: > >> Hello Yuan, >> >> have you tried using the accessor functions for your test object >> directly? For example: >> >> > geneIdsByCategory(hyp, catids="GO:0007498") >> >> Does this give you what you want? >> >> Cheers >> \Heidi >> >> >> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >> >>> Dear list, >>> >>> I spent a long time trying to figure out this problem, but without >>> progress. I would appreciate it very much if you could give me some >>> help. >>> >>> I got a list of differential expressed genes from microarray analysis >>> by using limma. Then I did GO enrichment analysis on these genes by >>> hypeGTest() method available in GOstats package. Now I want to >>> retrieve entrez gene IDs in my gene list that correspond to each >>> enriched GO terms. I found there are two ways to get the entrez gene >>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>> biomaRt. I tried both method, and they all worked, but I got two >>> different lists (lengths 13 vs 24) of entrez gene IDs corresponding >>> to >>> a single GO term, and most of them are not overlapped. I am not very >>> familiar with the annotation and/or genome assembly, so I am not sure >>> whether it is because the two methods using different annotation/ >>> assembly that caused this problem. >>> >>> # get geneIds for hyperGTest >>> >>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>> >>>> prbs<-topA[,1] >>> >>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>> >>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> prbs<-prbs[hasGO] >>> >>>> prbs<-getEG(prbs,"hgu133plus2") >>> >>>> prbs<-prbs[!duplicated(prbs)] >>> >>> # get universeGeneIds for hyperGTest >>> >>>> univ<-featureNames(eset) >>> >>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>> >>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> univ<-univ[hasUnivGO] >>> >>>> univ<-unique(getEG(univ,"hgu133plus2")) >>> >>> # compose params and carry out hyperGTest >>> >>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>> >>>> if(interactive()){ >>> >>> + hyp<-hyperGTest(p) >>> >>> + ps<-probeSetSummary(hyp) >>> >>> } >>> >>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>> >>>> unique(ps$"GO:0007498"$EntrezID) >>> >>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>> "695" >>> >>> [10] "8013" "8320" "83439" "9314" >>> >>> >>> >>> # using biomaRt package >>> >>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>> >>>> summary <- summary(hyp) >>> >>>> goID<-summary$GOBPID >>> >>>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >>> filters="go", values=goID, mart=ensembl) >>> >>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>> >>> + if (i=="GO:0007498") TRUE else FALSE) >>> >>>> EE<-E[oneGO,] >>> >>> # retrieve entrez IDs for the same GO term, GO:0007498 >>> >>>> unique(EE$entrezgene) >>> >>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>> 6911 10320 >>> >>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>> 6943 6862 >>> >>> >>> Thank you very much in advance! >>> >>> Kind regards, >>> Yuan >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> [[alternative HTML version deleted]] >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> > > > [[alternative HTML version deleted]] > > _______________________________________________ > Bioconductor mailing list > Bioconductor at stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > > _______________________________________________ > Bioconductor mailing list > Bioconductor at stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >

ADD REPLY • link 14.6 years ago michael watson IAH-C ★ 3.4k

0

Entering edit mode

Hi everyone, Even just downloading the file from NCBI might result in some disagreements. GO changes all the time, and the file from NCBI is downloaded and parsed into the GO annotations for GO.db and the organism packages once every 6 months. So particularly right now when the packages are just about to be updated again, you might find some disagreements with the sources that are presently at NCBI or biomaRt. With the GO.db and organism packages we are attempting to balance keeping things "current" with keeping things "reproducible". So we update things everything every 6 months, but then we "freeze" those annotation packages with a release number so that if in the future you need to reproduce a result, you can come back and grab that particular release again along with the software that goes with it. In a few weeks there will be an entirely new set of annotation packages when we make the new release. Hope this helps. Marc michael watson (IAH-C) wrote: > Hi Sean > > I realise the GO packages are built using the NCBI data, but when there is a disagreement between derived data packages, it is good practice to go to the source database. > > Cheers > Mick > ________________________________________ > From: Sean Davis [seandavi at gmail.com] > Sent: 24 September 2009 12:08 > To: michael watson (IAH-C) > Cc: Yuan Hao; Heidi Dvinge; bioconductor at stat.math.ethz.ch > Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms > > On Thu, Sep 24, 2009 at 6:51 AM, michael watson (IAH-C) > <michael.watson at="" bbsrc.ac.uk=""> wrote: > >> these databases are built using different methods so you get different results. A common problem in bioinformatics! Solution. Go to the ncbi ftp site, for the entrez gene database and download the gene2go.gz file. Unzip and query. >> ________________________________________ >> > > As usual, there are many ways to solve the problem and downloading > tab-delimited text files is one. However, the GO package in > bioconductor is built using the NCBI data (actually, the file noted > above), so there really isn't a need to download files. The data are > already present in the GO.db package and in all chip annotation > packages built by Bioconductor. > > Sean > > > >> From: bioconductor-bounces at stat.math.ethz.ch [bioconductor- bounces at stat.math.ethz.ch] On Behalf Of Yuan Hao [yuan.hao at ucd.ie] >> Sent: 24 September 2009 11:46 >> To: Heidi Dvinge >> Cc: bioconductor at stat.math.ethz.ch >> Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms >> >> Hi Heidi, >> >> Thank you very much for your reply. The method you provided gives the >> same result as using probeSetSummary, but I don't understand why this >> result is different from the one got from biomaRt? Do you have some >> insight about it? >> >> Kind regards, >> Yuan >> >> On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: >> >> >>> Hello Yuan, >>> >>> have you tried using the accessor functions for your test object >>> directly? For example: >>> >>> >>>> geneIdsByCategory(hyp, catids="GO:0007498") >>>> >>> Does this give you what you want? >>> >>> Cheers >>> \Heidi >>> >>> >>> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >>> >>> >>>> Dear list, >>>> >>>> I spent a long time trying to figure out this problem, but without >>>> progress. I would appreciate it very much if you could give me some >>>> help. >>>> >>>> I got a list of differential expressed genes from microarray analysis >>>> by using limma. Then I did GO enrichment analysis on these genes by >>>> hypeGTest() method available in GOstats package. Now I want to >>>> retrieve entrez gene IDs in my gene list that correspond to each >>>> enriched GO terms. I found there are two ways to get the entrez gene >>>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>>> biomaRt. I tried both method, and they all worked, but I got two >>>> different lists (lengths 13 vs 24) of entrez gene IDs corresponding >>>> to >>>> a single GO term, and most of them are not overlapped. I am not very >>>> familiar with the annotation and/or genome assembly, so I am not sure >>>> whether it is because the two methods using different annotation/ >>>> assembly that caused this problem. >>>> >>>> # get geneIds for hyperGTest >>>> >>>> >>>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>>>> >>>>> prbs<-topA[,1] >>>>> >>>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>>>> >>>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>> >>>> >>>>> prbs<-prbs[hasGO] >>>>> >>>>> prbs<-getEG(prbs,"hgu133plus2") >>>>> >>>>> prbs<-prbs[!duplicated(prbs)] >>>>> >>>> # get universeGeneIds for hyperGTest >>>> >>>> >>>>> univ<-featureNames(eset) >>>>> >>>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>>>> >>>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>> >>>> >>>>> univ<-univ[hasUnivGO] >>>>> >>>>> univ<-unique(getEG(univ,"hgu133plus2")) >>>>> >>>> # compose params and carry out hyperGTest >>>> >>>> >>>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>>>> >>>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>>> >>>> >>>>> if(interactive()){ >>>>> >>>> + hyp<-hyperGTest(p) >>>> >>>> + ps<-probeSetSummary(hyp) >>>> >>>> } >>>> >>>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>>> >>>> >>>>> unique(ps$"GO:0007498"$EntrezID) >>>>> >>>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>>> "695" >>>> >>>> [10] "8013" "8320" "83439" "9314" >>>> >>>> >>>> >>>> # using biomaRt package >>>> >>>> >>>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>>>> >>>>> summary <- summary(hyp) >>>>> >>>>> goID<-summary$GOBPID >>>>> >>>>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >>>>> >>>> filters="go", values=goID, mart=ensembl) >>>> >>>> >>>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>>>> >>>> + if (i=="GO:0007498") TRUE else FALSE) >>>> >>>> >>>>> EE<-E[oneGO,] >>>>> >>>> # retrieve entrez IDs for the same GO term, GO:0007498 >>>> >>>> >>>>> unique(EE$entrezgene) >>>>> >>>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>>> 6911 10320 >>>> >>>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>>> 6943 6862 >>>> >>>> >>>> Thank you very much in advance! >>>> >>>> Kind regards, >>>> Yuan >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> [[alternative HTML version deleted]] >>>> >>>> _______________________________________________ >>>> Bioconductor mailing list >>>> Bioconductor at stat.math.ethz.ch >>>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>>> >> [[alternative HTML version deleted]] >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> >> > > _______________________________________________ > Bioconductor mailing list > Bioconductor at stat.math.ethz.ch > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor > >

ADD REPLY • link 14.6 years ago Marc Carlson ★ 7.2k

0

Entering edit mode

Hi, Thank you all very much for your reply on this topic. I got the following findings after a further investigation on my data. I downloaded the gene2go.gz from NCBI, from which the result agrees with the one got by using biomaRt (i.e. query ensembl). However, most of the retrieved entrez gene IDs are not available in my differential expressed gene list. In contrast, I found all the retrieved entrez gene IDs, which were retrieved by using probeSetSummary() or geneIdsByCategory(), in my DE gene list. So I conclude that probeSetSummary()/geneIdsByCategory() takes consideration of the selected genes for the hypergeometric test when retrieving entrez genes, but biomaRt on the other hand will not take this information and simply queries the database. In the context of hypergeometric test here, I would say the former methods might be more proper. Kind regards, Yuan On 24 Sep 2009, at 19:25, Marc Carlson wrote: > Hi everyone, > > Even just downloading the file from NCBI might result in some > disagreements. GO changes all the time, and the file from NCBI is > downloaded and parsed into the GO annotations for GO.db and the > organism > packages once every 6 months. So particularly right now when the > packages are just about to be updated again, you might find some > disagreements with the sources that are presently at NCBI or biomaRt. > With the GO.db and organism packages we are attempting to balance > keeping things "current" with keeping things "reproducible". So we > update things everything every 6 months, but then we "freeze" those > annotation packages with a release number so that if in the future you > need to reproduce a result, you can come back and grab that particular > release again along with the software that goes with it. > > In a few weeks there will be an entirely new set of annotation > packages > when we make the new release. Hope this helps. > > > Marc > > > > michael watson (IAH-C) wrote: >> Hi Sean >> >> I realise the GO packages are built using the NCBI data, but when >> there is a disagreement between derived data packages, it is good >> practice to go to the source database. >> >> Cheers >> Mick >> ________________________________________ >> From: Sean Davis [seandavi at gmail.com] >> Sent: 24 September 2009 12:08 >> To: michael watson (IAH-C) >> Cc: Yuan Hao; Heidi Dvinge; bioconductor at stat.math.ethz.ch >> Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms >> >> On Thu, Sep 24, 2009 at 6:51 AM, michael watson (IAH-C) >> <michael.watson at="" bbsrc.ac.uk=""> wrote: >> >>> these databases are built using different methods so you get >>> different results. A common problem in bioinformatics! Solution. >>> Go to the ncbi ftp site, for the entrez gene database and download >>> the gene2go.gz file. Unzip and query. >>> ________________________________________ >>> >> >> As usual, there are many ways to solve the problem and downloading >> tab-delimited text files is one. However, the GO package in >> bioconductor is built using the NCBI data (actually, the file noted >> above), so there really isn't a need to download files. The data are >> already present in the GO.db package and in all chip annotation >> packages built by Bioconductor. >> >> Sean >> >> >> >>> From: bioconductor-bounces at stat.math.ethz.ch [bioconductor- bounces at stat.math.ethz.ch >>> ] On Behalf Of Yuan Hao [yuan.hao at ucd.ie] >>> Sent: 24 September 2009 11:46 >>> To: Heidi Dvinge >>> Cc: bioconductor at stat.math.ethz.ch >>> Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms >>> >>> Hi Heidi, >>> >>> Thank you very much for your reply. The method you provided gives >>> the >>> same result as using probeSetSummary, but I don't understand why >>> this >>> result is different from the one got from biomaRt? Do you have some >>> insight about it? >>> >>> Kind regards, >>> Yuan >>> >>> On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: >>> >>> >>>> Hello Yuan, >>>> >>>> have you tried using the accessor functions for your test object >>>> directly? For example: >>>> >>>> >>>>> geneIdsByCategory(hyp, catids="GO:0007498") >>>>> >>>> Does this give you what you want? >>>> >>>> Cheers >>>> \Heidi >>>> >>>> >>>> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >>>> >>>> >>>>> Dear list, >>>>> >>>>> I spent a long time trying to figure out this problem, but without >>>>> progress. I would appreciate it very much if you could give me >>>>> some >>>>> help. >>>>> >>>>> I got a list of differential expressed genes from microarray >>>>> analysis >>>>> by using limma. Then I did GO enrichment analysis on these genes >>>>> by >>>>> hypeGTest() method available in GOstats package. Now I want to >>>>> retrieve entrez gene IDs in my gene list that correspond to each >>>>> enriched GO terms. I found there are two ways to get the entrez >>>>> gene >>>>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>>>> biomaRt. I tried both method, and they all worked, but I got two >>>>> different lists (lengths 13 vs 24) of entrez gene IDs >>>>> corresponding >>>>> to >>>>> a single GO term, and most of them are not overlapped. I am not >>>>> very >>>>> familiar with the annotation and/or genome assembly, so I am not >>>>> sure >>>>> whether it is because the two methods using different annotation/ >>>>> assembly that caused this problem. >>>>> >>>>> # get geneIds for hyperGTest >>>>> >>>>> >>>>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>>>>> >>>>>> prbs<-topA[,1] >>>>>> >>>>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>>>>> >>>>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>>> >>>>> >>>>>> prbs<-prbs[hasGO] >>>>>> >>>>>> prbs<-getEG(prbs,"hgu133plus2") >>>>>> >>>>>> prbs<-prbs[!duplicated(prbs)] >>>>>> >>>>> # get universeGeneIds for hyperGTest >>>>> >>>>> >>>>>> univ<-featureNames(eset) >>>>>> >>>>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>>>>> >>>>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>>> >>>>> >>>>>> univ<-univ[hasUnivGO] >>>>>> >>>>>> univ<-unique(getEG(univ,"hgu133plus2")) >>>>>> >>>>> # compose params and carry out hyperGTest >>>>> >>>>> >>>>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>>>>> >>>>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>>>> >>>>> >>>>>> if(interactive()){ >>>>>> >>>>> + hyp<-hyperGTest(p) >>>>> >>>>> + ps<-probeSetSummary(hyp) >>>>> >>>>> } >>>>> >>>>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>>>> >>>>> >>>>>> unique(ps$"GO:0007498"$EntrezID) >>>>>> >>>>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>>>> "695" >>>>> >>>>> [10] "8013" "8320" "83439" "9314" >>>>> >>>>> >>>>> >>>>> # using biomaRt package >>>>> >>>>> >>>>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>>>>> >>>>>> summary <- summary(hyp) >>>>>> >>>>>> goID<-summary$GOBPID >>>>>> >>>>>> E <- getBM(attributes=c("go_biological_process_id", >>>>>> "entrezgene"), >>>>>> >>>>> filters="go", values=goID, mart=ensembl) >>>>> >>>>> >>>>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>>>>> >>>>> + if (i=="GO:0007498") TRUE else FALSE) >>>>> >>>>> >>>>>> EE<-E[oneGO,] >>>>>> >>>>> # retrieve entrez IDs for the same GO term, GO:0007498 >>>>> >>>>> >>>>>> unique(EE$entrezgene) >>>>>> >>>>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>>>> 6911 10320 >>>>> >>>>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>>>> 6943 6862 >>>>> >>>>> >>>>> Thank you very much in advance! >>>>> >>>>> Kind regards, >>>>> Yuan >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> [[alternative HTML version deleted]] >>>>> >>>>> _______________________________________________ >>>>> Bioconductor mailing list >>>>> Bioconductor at stat.math.ethz.ch >>>>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>>>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>>>> >>> [[alternative HTML version deleted]] >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>> >>> >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> >> > -------------------------------- Yuan Hao PhD student Conway Institute University College Dublin Belfield, Dublin 4, Ireland E-mail: yuan.hao at ucd.ie

ADD REPLY • link 14.6 years ago Yuan Hao ▴ 90

0

Entering edit mode

Hi, Thank you all very much for your reply on this topic. I got the following findings after a further investigation on my data. I downloaded the gene2go.gz from NCBI, from which the result agrees with the one got by using biomaRt (i.e. query ensembl). However, most of the retrieved entrez gene IDs are not available in my differential expressed gene list. In contrast, I found all the retrieved entrez gene IDs, which were retrieved by using probeSetSummary() or geneIdsByCategory(), in my DE gene list. So I conclude that probeSetSummary()/geneIdsByCategory() takes consideration of the selected genes for the hypergeometric test when retrieving entrez genes, but biomaRt on the other hand will not take this information and simply queries the database. In the context of hypergeometric test here, I would say the former methods might be more proper. Kind regards, Yuan On 24 Sep 2009, at 19:25, Marc Carlson wrote: > Hi everyone, > > Even just downloading the file from NCBI might result in some > disagreements. GO changes all the time, and the file from NCBI is > downloaded and parsed into the GO annotations for GO.db and the > organism > packages once every 6 months. So particularly right now when the > packages are just about to be updated again, you might find some > disagreements with the sources that are presently at NCBI or biomaRt. > With the GO.db and organism packages we are attempting to balance > keeping things "current" with keeping things "reproducible". So we > update things everything every 6 months, but then we "freeze" those > annotation packages with a release number so that if in the future you > need to reproduce a result, you can come back and grab that particular > release again along with the software that goes with it. > > In a few weeks there will be an entirely new set of annotation > packages > when we make the new release. Hope this helps. > > > Marc > > > > michael watson (IAH-C) wrote: >> Hi Sean >> >> I realise the GO packages are built using the NCBI data, but when >> there is a disagreement between derived data packages, it is good >> practice to go to the source database. >> >> Cheers >> Mick >> ________________________________________ >> From: Sean Davis [seandavi at gmail.com] >> Sent: 24 September 2009 12:08 >> To: michael watson (IAH-C) >> Cc: Yuan Hao; Heidi Dvinge; bioconductor at stat.math.ethz.ch >> Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms >> >> On Thu, Sep 24, 2009 at 6:51 AM, michael watson (IAH-C) >> <michael.watson at="" bbsrc.ac.uk=""> wrote: >> >>> these databases are built using different methods so you get >>> different results. A common problem in bioinformatics! Solution. >>> Go to the ncbi ftp site, for the entrez gene database and download >>> the gene2go.gz file. Unzip and query. >>> ________________________________________ >>> >> >> As usual, there are many ways to solve the problem and downloading >> tab-delimited text files is one. However, the GO package in >> bioconductor is built using the NCBI data (actually, the file noted >> above), so there really isn't a need to download files. The data are >> already present in the GO.db package and in all chip annotation >> packages built by Bioconductor. >> >> Sean >> >> >> >>> From: bioconductor-bounces at stat.math.ethz.ch [bioconductor- bounces at stat.math.ethz.ch >>> ] On Behalf Of Yuan Hao [yuan.hao at ucd.ie] >>> Sent: 24 September 2009 11:46 >>> To: Heidi Dvinge >>> Cc: bioconductor at stat.math.ethz.ch >>> Subject: Re: [BioC] Retrieve Entrez IDs for enriched GO terms >>> >>> Hi Heidi, >>> >>> Thank you very much for your reply. The method you provided gives >>> the >>> same result as using probeSetSummary, but I don't understand why >>> this >>> result is different from the one got from biomaRt? Do you have some >>> insight about it? >>> >>> Kind regards, >>> Yuan >>> >>> On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: >>> >>> >>>> Hello Yuan, >>>> >>>> have you tried using the accessor functions for your test object >>>> directly? For example: >>>> >>>> >>>>> geneIdsByCategory(hyp, catids="GO:0007498") >>>>> >>>> Does this give you what you want? >>>> >>>> Cheers >>>> \Heidi >>>> >>>> >>>> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >>>> >>>> >>>>> Dear list, >>>>> >>>>> I spent a long time trying to figure out this problem, but without >>>>> progress. I would appreciate it very much if you could give me >>>>> some >>>>> help. >>>>> >>>>> I got a list of differential expressed genes from microarray >>>>> analysis >>>>> by using limma. Then I did GO enrichment analysis on these genes >>>>> by >>>>> hypeGTest() method available in GOstats package. Now I want to >>>>> retrieve entrez gene IDs in my gene list that correspond to each >>>>> enriched GO terms. I found there are two ways to get the entrez >>>>> gene >>>>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>>>> biomaRt. I tried both method, and they all worked, but I got two >>>>> different lists (lengths 13 vs 24) of entrez gene IDs >>>>> corresponding >>>>> to >>>>> a single GO term, and most of them are not overlapped. I am not >>>>> very >>>>> familiar with the annotation and/or genome assembly, so I am not >>>>> sure >>>>> whether it is because the two methods using different annotation/ >>>>> assembly that caused this problem. >>>>> >>>>> # get geneIds for hyperGTest >>>>> >>>>> >>>>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>>>>> >>>>>> prbs<-topA[,1] >>>>>> >>>>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>>>>> >>>>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>>> >>>>> >>>>>> prbs<-prbs[hasGO] >>>>>> >>>>>> prbs<-getEG(prbs,"hgu133plus2") >>>>>> >>>>>> prbs<-prbs[!duplicated(prbs)] >>>>>> >>>>> # get universeGeneIds for hyperGTest >>>>> >>>>> >>>>>> univ<-featureNames(eset) >>>>>> >>>>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>>>>> >>>>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>>> >>>>> >>>>>> univ<-univ[hasUnivGO] >>>>>> >>>>>> univ<-unique(getEG(univ,"hgu133plus2")) >>>>>> >>>>> # compose params and carry out hyperGTest >>>>> >>>>> >>>>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>>>>> >>>>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>>>> >>>>> >>>>>> if(interactive()){ >>>>>> >>>>> + hyp<-hyperGTest(p) >>>>> >>>>> + ps<-probeSetSummary(hyp) >>>>> >>>>> } >>>>> >>>>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>>>> >>>>> >>>>>> unique(ps$"GO:0007498"$EntrezID) >>>>>> >>>>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>>>> "695" >>>>> >>>>> [10] "8013" "8320" "83439" "9314" >>>>> >>>>> >>>>> >>>>> # using biomaRt package >>>>> >>>>> >>>>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>>>>> >>>>>> summary <- summary(hyp) >>>>>> >>>>>> goID<-summary$GOBPID >>>>>> >>>>>> E <- getBM(attributes=c("go_biological_process_id", >>>>>> "entrezgene"), >>>>>> >>>>> filters="go", values=goID, mart=ensembl) >>>>> >>>>> >>>>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>>>>> >>>>> + if (i=="GO:0007498") TRUE else FALSE) >>>>> >>>>> >>>>>> EE<-E[oneGO,] >>>>>> >>>>> # retrieve entrez IDs for the same GO term, GO:0007498 >>>>> >>>>> >>>>>> unique(EE$entrezgene) >>>>>> >>>>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>>>> 6911 10320 >>>>> >>>>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>>>> 6943 6862 >>>>> >>>>> >>>>> Thank you very much in advance! >>>>> >>>>> Kind regards, >>>>> Yuan >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> >>>>> [[alternative HTML version deleted]] >>>>> >>>>> _______________________________________________ >>>>> Bioconductor mailing list >>>>> Bioconductor at stat.math.ethz.ch >>>>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>>>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>>>> >>> [[alternative HTML version deleted]] >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor at stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>> >>> >> >> _______________________________________________ >> Bioconductor mailing list >> Bioconductor at stat.math.ethz.ch >> https://stat.ethz.ch/mailman/listinfo/bioconductor >> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >> >> > -------------------------------- Yuan Hao PhD student Conway Institute University College Dublin Belfield, Dublin 4, Ireland E-mail: yuan.hao at ucd.ie

ADD REPLY • link 14.6 years ago Yuan Hao ▴ 90

0

Entering edit mode

Well, BioMart and the organisms packages might use different versions of the GO annotation (or even different sources?) which might account for some differences. Also, are you doing a conditional or unconditional hypergeometric test? If it's conditional that could affect the gene IDs linked to each GO term. \Heidi On 24 Sep 2009, at 11:46, Yuan Hao wrote: > Hi Heidi, > > Thank you very much for your reply. The method you provided gives > the same result as using probeSetSummary, but I don't understand > why this result is different from the one got from biomaRt? Do you > have some insight about it? > > Kind regards, > Yuan > > On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: > >> Hello Yuan, >> >> have you tried using the accessor functions for your test object >> directly? For example: >> >> > geneIdsByCategory(hyp, catids="GO:0007498") >> >> Does this give you what you want? >> >> Cheers >> \Heidi >> >> >> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >> >>> Dear list, >>> >>> I spent a long time trying to figure out this problem, but without >>> progress. I would appreciate it very much if you could give me some >>> help. >>> >>> I got a list of differential expressed genes from microarray >>> analysis >>> by using limma. Then I did GO enrichment analysis on these genes by >>> hypeGTest() method available in GOstats package. Now I want to >>> retrieve entrez gene IDs in my gene list that correspond to each >>> enriched GO terms. I found there are two ways to get the entrez gene >>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>> biomaRt. I tried both method, and they all worked, but I got two >>> different lists (lengths 13 vs 24) of entrez gene IDs >>> corresponding to >>> a single GO term, and most of them are not overlapped. I am not very >>> familiar with the annotation and/or genome assembly, so I am not >>> sure >>> whether it is because the two methods using different annotation/ >>> assembly that caused this problem. >>> >>> # get geneIds for hyperGTest >>> >>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>> >>>> prbs<-topA[,1] >>> >>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>> >>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> prbs<-prbs[hasGO] >>> >>>> prbs<-getEG(prbs,"hgu133plus2") >>> >>>> prbs<-prbs[!duplicated(prbs)] >>> >>> # get universeGeneIds for hyperGTest >>> >>>> univ<-featureNames(eset) >>> >>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>> >>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>> >>>> univ<-univ[hasUnivGO] >>> >>>> univ<-unique(getEG(univ,"hgu133plus2")) >>> >>> # compose params and carry out hyperGTest >>> >>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>> >>>> if(interactive()){ >>> >>> + hyp<-hyperGTest(p) >>> >>> + ps<-probeSetSummary(hyp) >>> >>> } >>> >>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>> >>>> unique(ps$"GO:0007498"$EntrezID) >>> >>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>> "695" >>> >>> [10] "8013" "8320" "83439" "9314" >>> >>> >>> >>> # using biomaRt package >>> >>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>> >>>> summary <- summary(hyp) >>> >>>> goID<-summary$GOBPID >>> >>>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >>> filters="go", values=goID, mart=ensembl) >>> >>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>> >>> + if (i=="GO:0007498") TRUE else FALSE) >>> >>>> EE<-E[oneGO,] >>> >>> # retrieve entrez IDs for the same GO term, GO:0007498 >>> >>>> unique(EE$entrezgene) >>> >>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>> 6911 10320 >>> >>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>> 6943 6862 >>> >>> >>> Thank you very much in advance! >>> >>> Kind regards, >>> Yuan >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> >>> [[alternative HTML version deleted]] >>> >>> _______________________________________________ >>> Bioconductor mailing list >>> Bioconductor@stat.math.ethz.ch >>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>> Search the archives: http://news.gmane.org/ >>> gmane.science.biology.informatics.conductor >> > [[alternative HTML version deleted]]

ADD REPLY • link 14.6 years ago Heidi Dvinge ★ 2.0k

0

Entering edit mode

Hi Heidi, Thanks very much for your explanation. I am using conditional hypergeometric test. So do you think I should more rely on the results from probeSetSummary()/geneIdsByCategory than from BiomaRt in this case? kind regards, Yuan On 24 Sep 2009, at 11:56, Heidi Dvinge wrote: > Well, BioMart and the organisms packages might use different > versions of the GO annotation (or even different sources?) which > might account for some differences. Also, are you doing a > conditional or unconditional hypergeometric test? If it's > conditional that could affect the gene IDs linked to each GO term. > > \Heidi > > On 24 Sep 2009, at 11:46, Yuan Hao wrote: > >> Hi Heidi, >> >> Thank you very much for your reply. The method you provided gives >> the same result as using probeSetSummary, but I don't understand >> why this result is different from the one got from biomaRt? Do you >> have some insight about it? >> >> Kind regards, >> Yuan >> >> On 24 Sep 2009, at 11:38, Heidi Dvinge wrote: >> >>> Hello Yuan, >>> >>> have you tried using the accessor functions for your test object >>> directly? For example: >>> >>> > geneIdsByCategory(hyp, catids="GO:0007498") >>> >>> Does this give you what you want? >>> >>> Cheers >>> \Heidi >>> >>> >>> On 24 Sep 2009, at 11:31, Yuan Hao wrote: >>> >>>> Dear list, >>>> >>>> I spent a long time trying to figure out this problem, but without >>>> progress. I would appreciate it very much if you could give me some >>>> help. >>>> >>>> I got a list of differential expressed genes from microarray >>>> analysis >>>> by using limma. Then I did GO enrichment analysis on these genes by >>>> hypeGTest() method available in GOstats package. Now I want to >>>> retrieve entrez gene IDs in my gene list that correspond to each >>>> enriched GO terms. I found there are two ways to get the entrez >>>> gene >>>> IDs: using probeSetSummary() from GOstats, or using getBM() from >>>> biomaRt. I tried both method, and they all worked, but I got two >>>> different lists (lengths 13 vs 24) of entrez gene IDs >>>> corresponding to >>>> a single GO term, and most of them are not overlapped. I am not >>>> very >>>> familiar with the annotation and/or genome assembly, so I am not >>>> sure >>>> whether it is because the two methods using different annotation/ >>>> assembly that caused this problem. >>>> >>>> # get geneIds for hyperGTest >>>> >>>>> topA<-topTable(fit2,coef=1,p.value=0.01,n=nrow(fit2)) >>>> >>>>> prbs<-topA[,1] >>>> >>>>> hasGO<-sapply(mget(prbs,hgu133plus2GO),function(ids) >>>> >>>> + if(!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>> >>>>> prbs<-prbs[hasGO] >>>> >>>>> prbs<-getEG(prbs,"hgu133plus2") >>>> >>>>> prbs<-prbs[!duplicated(prbs)] >>>> >>>> # get universeGeneIds for hyperGTest >>>> >>>>> univ<-featureNames(eset) >>>> >>>>> hasUnivGO<-sapply(mget(univ,hgu133plus2GO),function(ids) >>>> >>>> + if (!is.na(ids) && length(ids) > 1) TRUE else FALSE) >>>> >>>>> univ<-univ[hasUnivGO] >>>> >>>>> univ<-unique(getEG(univ,"hgu133plus2")) >>>> >>>> # compose params and carry out hyperGTest >>>> >>>>> p<-new("GOHyperGParams", geneIds=prbs, universeGeneIds=univ, >>>> ontology="BP", annotation="hgu133plus2", conditional=TRUE) >>>> >>>>> if(interactive()){ >>>> >>>> + hyp<-hyperGTest(p) >>>> >>>> + ps<-probeSetSummary(hyp) >>>> >>>> } >>>> >>>> # retrieve entrez IDs for one enriched GO term GO:0007498 >>>> >>>>> unique(ps$"GO:0007498"$EntrezID) >>>> >>>> [1] "2131" "2139" "2296" "3717" "4088" "4771" "6398" "655" >>>> "695" >>>> >>>> [10] "8013" "8320" "83439" "9314" >>>> >>>> >>>> >>>> # using biomaRt package >>>> >>>>> ensembl=useMart("ensembl",dataset="hsapiens_gene_ensembl") >>>> >>>>> summary <- summary(hyp) >>>> >>>>> goID<-summary$GOBPID >>>> >>>>> E <- getBM(attributes=c("go_biological_process_id", "entrezgene"), >>>> filters="go", values=goID, mart=ensembl) >>>> >>>>> oneGO<-sapply(E$"go_biological_process_id",function(i) >>>> >>>> + if (i=="GO:0007498") TRUE else FALSE) >>>> >>>>> EE<-E[oneGO,] >>>> >>>> # retrieve entrez IDs for the same GO term, GO:0007498 >>>> >>>>> unique(EE$entrezgene) >>>> >>>> [1] 5515 NA 90 6398 2131 3717 660 4145 84667 3055 >>>> 6911 10320 >>>> >>>> [13] 10220 22806 695 5017 23184 9355 2303 7075 4232 92 >>>> 6943 6862 >>>> >>>> >>>> Thank you very much in advance! >>>> >>>> Kind regards, >>>> Yuan >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> >>>> [[alternative HTML version deleted]] >>>> >>>> _______________________________________________ >>>> Bioconductor mailing list >>>> Bioconductor@stat.math.ethz.ch >>>> https://stat.ethz.ch/mailman/listinfo/bioconductor >>>> Search the archives: http://news.gmane.org/gmane.science.biology.informatics.conductor >>> >> > [[alternative HTML version deleted]]

ADD REPLY • link 14.6 years ago Yuan Hao ▴ 90

Login before adding your answer.