Question

I am trying to recreate results from an Alzheimer's paper but I can't figure out my linear regression model

0

Entering edit mode

ravelarvargas • 0

@ravelarvargas-17116

Last seen 6.6 years ago

I am trying to recreate the differential gene expression results from the following paper between Alzheimer’s and normal brain donors (https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-016-0355-3). Specifically, the authors divided the brains up into 19 different regions, and divided the patients up into normal, low, and high Alzheimer’s based on 6 different categories. The supplementary table 4a from their paper shows the number of differentially expressed probes based on linear regression analysis. I have subset their data into just looking at differential gene expression in the inferior frontal gyrus (BM44-1FG) between high and low Alzheimer’s patients based on their Clinical Dementia Rating (CDR). Specifically, the authors report that after their linear regression analysis, they find 468 significantly differentially expressed genes (supplementary table 4a). I am unable to recreate their results and I can’t seem to find any differentially expressed genes in the Inferior Frontal Gyrus. I have tried contacting the authors but I have not received any replies and I am stuck until I can figure out how they got their results.

My first step was to download the data from GEO and organise the relevant information, which I did using the following code:

library(GEOquery)
library(limma)
library(dplyr)
library(compare)
library(edgeR)

gg = getGEO("GSE84422")

#Load in microarray data for both platforms
gpl96 <- pData(gg$`GSE84422-GPL96_series_matrix.txt.gz`)
gpl97 <- pData(gg$`GSE84422-GPL97_series_matrix.txt.gz`)

#Find the same subject in both platforms GPL96 and GPL97
gpl96_to_split=as.data.frame(as.character(gpl96$title), stringsAsFactors=FALSE)
colnames(gpl96_to_split)=c('title')

unique_patients=c()
accessions=c()
for (i in 1:nrow(gpl96_to_split)){
  y=unlist(strsplit(gpl96_to_split$title[i], '[,]'))
  accessions=rbind(accessions, c(y[1], y[2], gpl96$geo_accession[i], gpl97$geo_accession[i]))
  unique_patients=rbind(unique_patients, c(y[1], gpl96$geo_accession[i]))
}
colnames(accessions)=c('subject', 'region', 'gpl96', 'gpl97')

accessions=as.data.frame(accessions)

subject=c()
for (i in 1:nrow(accessions)){
  y=unlist(strsplit(as.character(accessions$subject[i]), '[ ]'))
  subject=rbind(subject, y[2])
}
accessions=cbind(subject, as.character(accessions$region), as.character(accessions$gpl96), as.character(accessions$gpl97))
accessions=as.data.frame(accessions)
colnames(accessions)=c('subject', 'region', 'gpl96', 'gpl97')

#Find covariates
unique_patients=as.data.frame(unique_patients)
unique_patients=unique_patients[!duplicated(unique_patients$V1),]
rownames(unique_patients)=c()

gpl96_df <- as.data.frame(gpl96)
covariates=gpl96_df%>%dplyr::filter(geo_accession%in%unique_patients$V2)%>%dplyr::select(
  `subject id:ch1`, `postmortem interval minutes:ch1`,`ph:ch1`,`Sex:ch1`, `race:ch1`,`age:ch1`,  
  `clinical dementia rating:ch1`,`braak neurofibrillary tangle score:ch1`, `neuropathological category:ch1`,
  `average neuritic plaque density:ch1`,`sum of cerad rating scores in multiple brain regions:ch1`,
  `sum of neurofibrillary tangles density in multiple brain regions:ch1`)

colnames(covariates)=c('Sampleid', 'PMI', 'pH', 'Sex', 'Race', 'Age', 'CDR', 'Braak', 'CERAD',
                       'PLQ_Mn', 'NPrSum', 'NTrSum')
normal=covariates%>%dplyr::filter(CERAD=='Normal')%>%mutate(CERAD2=1)
definite_AD=covariates%>%dplyr::filter(CERAD=='definite AD')%>%mutate(CERAD2=2)
probable_AD=covariates%>%dplyr::filter(CERAD=='Probable AD')%>%mutate(CERAD2=3)
possible_AD=covariates%>%dplyr::filter(CERAD=='Possible AD')%>%mutate(CERAD2=4)
covariates=rbind(normal, definite_AD, probable_AD, possible_AD)
covariates=covariates%>%dplyr::select(-CERAD)
covariates=covariates%>%dplyr::select(Sampleid, PMI, pH, Sex, Race, Age, CDR, Braak, CERAD2,
                                      PLQ_Mn, NPrSum, NTrSum)
colnames(covariates)=c('Sampleid', 'PMI', 'pH', 'Sex', 'Race', 'Age', 'CDR', 'Braak', 'CERAD',
                       'PLQ_Mn', 'NPrSum', 'NTrSum')

#Load in microarray expression data for both platforms
gpl96_expression <- exprs(gg$`GSE84422-GPL96_series_matrix.txt.gz`)
gpl97_expression <- exprs(gg$`GSE84422-GPL97_series_matrix.txt.gz`)

#Find accessions that are inferior frontal gyrus
gpl96_accession <- gpl96$geo_accession[grepl("Inferior Frontal Gyrus", gpl96$title)]
gpl97_accession <- gpl97$geo_accession[grepl("Inferior Frontal Gyrus", gpl97$title)]

#Get just the expression data for IFG accessions
gpl96_ifg_expression <- gpl96_expression[,gpl96_accession]
gpl97_ifg_expression <- gpl97_expression[,gpl97_accession]

#Find probes that are in both tables
common_probe_ID <- as.data.frame(intersect(rownames(gpl96_ifg_expression), rownames(gpl97_ifg_expression)))
colnames(common_probe_ID)=c('probe')

common_probe=c()
common_probe_rownames=c()
for (i in 1:nrow(gpl96_ifg_expression)){
  if (rownames(gpl96_ifg_expression)[i]%in%common_probe_ID$probe){
    common_probe=rbind(common_probe, gpl96_ifg_expression[i,])
    common_probe_rownames=rbind(common_probe_rownames, rownames(gpl96_ifg_expression)[i])
  }
}

#group probes that are not in both tables
##This takes a while to run
'%!in%' <- function(x,y)!('%in%'(x,y))
combined_probes=c()
rownames_platforms=c()
for (i in 1:nrow(gpl96_ifg_expression)){
  if (rownames(gpl96_ifg_expression)[i]%!in%common_probe_rownames){
    combined_probes=rbind(combined_probes, gpl96_ifg_expression[i,])
    rownames_platforms=rbind(rownames_platforms,  rownames(gpl96_ifg_expression)[i])
  }
}

for (i in 1:nrow(gpl97_ifg_expression)){
  if (rownames(gpl97_ifg_expression)[i]%!in%common_probe_rownames){
    combined_probes=rbind(combined_probes, gpl97_ifg_expression[i,])
    rownames_platforms=rbind(rownames_platforms,  rownames(gpl97_ifg_expression)[i])
  }
}

#Group probes that are unique to each platform with those that are shared in both platforms
combined_probes=rbind(combined_probes, common_probe)
rownames_platforms=rbind(rownames_platforms, common_probe_rownames)

rownames(combined_probes)=rownames_platforms

subject_colnames=c()
for (i in 1:nrow(accessions)){
  if (accessions$gpl96[i]%in%colnames(combined_probes)){
    subject_colnames=rbind(subject_colnames, as.character(accessions$subject[i]))
  }
}
colnames(combined_probes)=subject_colnames

#CDR
#The authors divided up the patients into low and high alzheimers and normal based on a few different variables. I am looking at CDR and have partitioned the data correctly based on supplementary table 2C from the paper:

CDR_normal=covariates%>%filter(CDR==0)%>%mutate(alzheimers='normal')
CDR_low=covariates%>%filter(CDR%in%c(0.5, 1, 2))%>%mutate(alzheimers='low')
CDR_high=covariates%>%filter(CDR%in%c(3, 4, 5))%>%mutate(alzheimers='high')
CDR=rbind(CDR_normal, CDR_low, CDR_high)

CDR_low_high=CDR%>%dplyr::filter(alzheimers!='normal')

I have checked my probes and values against the published values and I am 100% sure that up until here I have not done any mistakes. However, after this is where I feel like things start to go wrong

#filter out alzheimers table results that don't match the patient data
CDR_low_high_combined_probes=c()
CDR_low_high_combined_probes_colnames=c()
for (i in 1:ncol(combined_probes)){
  if (colnames(combined_probes)[i]%in%CDR_low_high$Sampleid){
    CDR_low_high_combined_probes=cbind(CDR_low_high_combined_probes, combined_probes[,i])
    CDR_low_high_combined_probes_colnames=rbind(CDR_low_high_combined_probes_colnames, colnames(combined_probes)[i])

  }
}

colnames(CDR_low_high_combined_probes)=CDR_low_high_combined_probes_colnames
rownames(CDR_low_high_combined_probes)=rownames(combined_probes)

#filter out patient table results that don't match the alzheimers data
CDR_low_high_present=c()
for (i in 1:nrow(CDR_low_high)){
  if (CDR_low_high$Sampleid[i]%in%colnames(CDR_low_high_combined_probes)){
    CDR_low_high_present=rbind(CDR_low_high_present, CDR_low_high[i,])
  }
}
CDR_low_high=CDR_low_high_present

rownames(CDR_low_high)=c()

#Design matrix
#The authors report that they corrected for pH, sex, race, and PMI, which I have tried doing here
CDR_low_high_design <- model.matrix(~0+CDR_low_high$alzheimers+as.numeric(CDR_low_high$pH)+CDR_low_high$Sex+CDR_low_high$Race+as.numeric(CDR_low_high$PMI))
colnames(CDR_low_high_design)=c('alzheimers_high', 'alzheimers_low', 'pH', 'Sex_Male', 'Race_Hispanic', 'Race_White', 'PMI')
rownames(CDR_low_high_design)=CDR_low_high$Sampleid

#Ideally here I would filter out lowely expressed genes. However, I’m sure the authors did not do this based on the paper so I haven’t done it 

CDR_low_high_expression_fit <- lmFit(CDR_low_high_combined_probes, CDR_low_high_design)

CDR_low_high_expression_contr.matrix <- makeContrasts(
  CDR_alzheimers_low_CDR_alzheimers_high = alzheimers_high-alzheimers_low, 
  levels = CDR_low_high_design)

CDR_low_high_expression_fit2 <- contrasts.fit(CDR_low_high_expression_fit, CDR_low_high_expression_contr.matrix)
CDR_low_high_expression_fit2 <- eBayes(CDR_low_high_expression_fit2)

I have included a section of the s4a table filtered for the Inferior Frontal Gyrus and showing CDR low vs high to showcase the different results the authors have found vs the ones I find using my script. I'm not sure what to do differently to replicate their results, even my log2FC results are different from theirs. Their differential gene expression results

limma microarray differential gene expression linear regression edger • 1.3k views

ADD COMMENT • link 6.9 years ago ravelarvargas • 0