edgeR with two different groups (unpaired) at two time points (paired): Need some input

0

Entering edit mode

Sindre ▴ 110

@sindre-6193

Last seen 3.7 years ago

Hi! The edgeR manual is quite nice, but I'm not quite sure if I'm on the right track.. The question to answer is: "Is there any significantly changed genes in diabetics at time point 2, compared to healthy?" Heres my codes: #Making the design matrix status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), levels=c("Healthy","Diabetic")) patients <- factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75,7 2,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62,61 ,55,53,21,04)) timepoints = as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1, 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) design <- model.matrix(~patients + timepoints*status) > colnames(design) [1] "(Intercept)" "patients4" [3] "patients8" "patients13" [5] "patients21" "patients53" [7] "patients55" "patients61" [9] "patients62" "patients67" [11] "patients70" "patients71" [13] "patients72" "patients73" [15] "patients75" "patients76" [17] "patients77" "patients79" [19] "patients81" "patients82" [21] "patients83" "patients86" [23] "patients87" "patients88" [25] "timepoints2" "statusDiabetic" [27] "timepoints2:statusDiabetic" #Reading in HTSeq data description <- c("test"); fileNames <- list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); files <- sort(fileNames,decreasing=TRUE); targets <- data.frame(files=files, group=status, description=description); library(edgeR); d <- readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,commen t.char="!"); colnames(d) <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271","N 270","N213","N208","N201","N188","187","N186","N183","N182","N181","N1 75","N172","N171","N170","N113","N108","N101","D279","D277","D276","D2 73","D267","D262","D261","D255","D253","D221","D204","D179","D177","D1 76","D173","D167","D162","D161","D155","D153","D121","D104")); d$samples; dim(d) #Normalization d <- (calcNormFactors(d,method="RLE")); #Estimating the dispersion: d <- estimateCommonDisp(d,verbose=TRUE) Disp = 0.05823 , BCV = 0.2413 d <- estimateTagwiseDisp(d,trend="none") #diffeksp > fit <- glmFit(d,design) Error in glmFit.default(y = y$counts, design = design, dispersion = dispersion, : Design matrix not of full rank. The following coefficients not estimable: statusDiabetic Can someone tell me why I get this error? Thanks alot!

edgeR edgeR • 1.4k views

ADD COMMENT • link updated 10.5 years ago by Gordon Smyth 50k • written 10.5 years ago by Sindre ▴ 110

0

Entering edit mode

Sindre ▴ 110

@sindre-6193

Last seen 3.7 years ago

On 2013-10-22 20:03, Sindre Lee wrote: > Hi! > The edgeR manual is quite nice, but I'm not quite sure if I'm on the > right track.. > > The question to answer is: "Is there any significantly changed genes > in diabetics at time point 2, compared to healthy?" > > Heres my codes: > > #Making the design matrix > > status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), > levels=c("Healthy","Diabetic")) > patients <- > factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75 ,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62, 61,55,53,21,04)) > timepoints = > as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1, 1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) > design <- model.matrix(~patients + timepoints*status) >> colnames(design) > [1] "(Intercept)" "patients4" > [3] "patients8" "patients13" > [5] "patients21" "patients53" > [7] "patients55" "patients61" > [9] "patients62" "patients67" > [11] "patients70" "patients71" > [13] "patients72" "patients73" > [15] "patients75" "patients76" > [17] "patients77" "patients79" > [19] "patients81" "patients82" > [21] "patients83" "patients86" > [23] "patients87" "patients88" > [25] "timepoints2" "statusDiabetic" > [27] "timepoints2:statusDiabetic" > > #Reading in HTSeq data > > description <- c("test"); > fileNames <- > list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); > files <- sort(fileNames,decreasing=TRUE); > targets <- data.frame(files=files, group=status, > description=description); > library(edgeR); > d <- > readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,comm ent.char="!"); > colnames(d) > <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271", "N270","N213","N208","N201","N188","187","N186","N183","N182","N181"," N175","N172","N171","N170","N113","N108","N101","D279","D277","D276"," D273","D267","D262","D261","D255","D253","D221","D204","D179","D177"," D176","D173","D167","D162","D161","D155","D153","D121","D104")); > d$samples; > dim(d) > > #Normalization > > d <- (calcNormFactors(d,method="RLE")); > > #Estimating the dispersion: > > d <- estimateCommonDisp(d,verbose=TRUE) > Disp = 0.05823 , BCV = 0.2413 > d <- estimateTagwiseDisp(d,trend="none") > > #diffeksp > >> fit <- glmFit(d,design) > Error in glmFit.default(y = y$counts, design = design, dispersion = > dispersion, : > Design matrix not of full rank. The following coefficients not > estimable: > statusDiabetic > > Can someone tell me why I get this error? > > Thanks alot! MARK! In group 1: 13 samples In group 2: 11 samples I guess this is the problem? How to handle this?

ADD COMMENT • link 10.5 years ago Sindre ▴ 110

0

Entering edit mode

The problem is that the columns of your design matrix are linearly dependent: the statusDiabetic column can be written as a sum of all the columns corresponding to diabetic patients. In other words, the patient and status variables are confounded. I think your only option with edgeR is to drop the patients term from your model and just use "~ status * timepoints", but someone else might have another idea. -Ryan On Tue Oct 22 13:56:46 2013, Sindre Lee wrote: > On 2013-10-22 20:03, Sindre Lee wrote: >> Hi! >> The edgeR manual is quite nice, but I'm not quite sure if I'm on the >> right track.. >> >> The question to answer is: "Is there any significantly changed genes >> in diabetics at time point 2, compared to healthy?" >> >> Heres my codes: >> >> #Making the design matrix >> >> status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), >> levels=c("Healthy","Diabetic")) >> patients <- >> factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,7 5,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62 ,61,55,53,21,04)) >> >> timepoints = >> as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1 ,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) >> >> design <- model.matrix(~patients + timepoints*status) >>> colnames(design) >> [1] "(Intercept)" "patients4" >> [3] "patients8" "patients13" >> [5] "patients21" "patients53" >> [7] "patients55" "patients61" >> [9] "patients62" "patients67" >> [11] "patients70" "patients71" >> [13] "patients72" "patients73" >> [15] "patients75" "patients76" >> [17] "patients77" "patients79" >> [19] "patients81" "patients82" >> [21] "patients83" "patients86" >> [23] "patients87" "patients88" >> [25] "timepoints2" "statusDiabetic" >> [27] "timepoints2:statusDiabetic" >> >> #Reading in HTSeq data >> >> description <- c("test"); >> fileNames <- >> list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); >> files <- sort(fileNames,decreasing=TRUE); >> targets <- data.frame(files=files, group=status, >> description=description); >> library(edgeR); >> d <- >> readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,com ment.char="!"); >> >> colnames(d) >> <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271" ,"N270","N213","N208","N201","N188","187","N186","N183","N182","N181", "N175","N172","N171","N170","N113","N108","N101","D279","D277","D276", "D273","D267","D262","D261","D255","D253","D221","D204","D179","D177", "D176","D173","D167","D162","D161","D155","D153","D121","D104")); >> >> d$samples; >> dim(d) >> >> #Normalization >> >> d <- (calcNormFactors(d,method="RLE")); >> >> #Estimating the dispersion: >> >> d <- estimateCommonDisp(d,verbose=TRUE) >> Disp = 0.05823 , BCV = 0.2413 >> d <- estimateTagwiseDisp(d,trend="none") >> >> #diffeksp >> >>> fit <- glmFit(d,design) >> Error in glmFit.default(y = y$counts, design = design, dispersion = >> dispersion, : >> Design matrix not of full rank. The following coefficients not >> estimable: >> statusDiabetic >> >> Can someone tell me why I get this error? >> >> Thanks alot! > > MARK! > In group 1: 13 samples > In group 2: 11 samples > > I guess this is the problem? How to handle this? > > _______________________________________________ > Bioconductor mailing list > Bioconductor at r-project.org > https://stat.ethz.ch/mailman/listinfo/bioconductor > Search the archives: > http://news.gmane.org/gmane.science.biology.informatics.conductor

ADD REPLY • link 10.5 years ago Ryan C. Thompson ★ 7.9k

0

Entering edit mode

Gordon Smyth 50k

@gordon-smyth

Last seen 16 minutes ago

WEHI, Melbourne, Australia

Dear Sindre, I think that you want to find genes for which the change from time1 to time2 is different in the diabetic patients vs the healthy patients. In your experiment, you have two types of factors. The time points are within patients whereas the desease status is between patients. See section 3.5 of the edgeR User's Guide. Setting up the design matrix for such an experiment is a little tricky, not because of the edgeR but because of the limitations of model.matrix() in R. I suggest you do it like this. Set up a variable for timepoint 2 for healthy patients only: time2.healthy <- as.numeric(status=="Healthy" & timepoints==2) Same for diabetics: time2.diabetic <- as.numeric(status=="Diabetic" & timepoints==2) Then design <- model.matrix(~patients+time2.healthy+time2.diabetic) The patients term will pick up the baseline expression level for each patient at time1. time2.healthy will pick up the time2 (treatment) effect for healthy patients and time2.diabetic will do the same for diabetics. Finally, you will want to test the time2.diabetic-time2.healthy contrast. Best wishes Gordon > Date: Tue, 22 Oct 2013 20:03:32 +0200 > From: Sindre Lee <sindre.lee at="" studmed.uio.no=""> > To: <bioconductor at="" r-project.org=""> > Subject: [BioC] edgeR with two different groups (unpaired) at two time > points (paired): Need some input > > Hi! > The edgeR manual is quite nice, but I'm not quite sure if I'm on the > right track.. > > The question to answer is: "Is there any significantly changed genes in > diabetics at time point 2, compared to healthy?" > > Heres my codes: > > #Making the design matrix > > status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), > levels=c("Healthy","Diabetic")) > patients <- > factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81,75 ,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,62, 61,55,53,21,04)) > timepoints = > as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1, 1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) > design <- model.matrix(~patients + timepoints*status) >> colnames(design) > [1] "(Intercept)" "patients4" > [3] "patients8" "patients13" > [5] "patients21" "patients53" > [7] "patients55" "patients61" > [9] "patients62" "patients67" > [11] "patients70" "patients71" > [13] "patients72" "patients73" > [15] "patients75" "patients76" > [17] "patients77" "patients79" > [19] "patients81" "patients82" > [21] "patients83" "patients86" > [23] "patients87" "patients88" > [25] "timepoints2" "statusDiabetic" > [27] "timepoints2:statusDiabetic" > > #Reading in HTSeq data > > description <- c("test"); > fileNames <- > list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); > files <- sort(fileNames,decreasing=TRUE); > targets <- data.frame(files=files, group=status, > description=description); > library(edgeR); > d <- > readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,comm ent.char="!"); > colnames(d) > <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271", "N270","N213","N208","N201","N188","187","N186","N183","N182","N181"," N175","N172","N171","N170","N113","N108","N101","D279","D277","D276"," D273","D267","D262","D261","D255","D253","D221","D204","D179","D177"," D176","D173","D167","D162","D161","D155","D153","D121","D104")); > d$samples; > dim(d) > > #Normalization > > d <- (calcNormFactors(d,method="RLE")); > > #Estimating the dispersion: > > d <- estimateCommonDisp(d,verbose=TRUE) > Disp = 0.05823 , BCV = 0.2413 > d <- estimateTagwiseDisp(d,trend="none") > > #diffeksp > >> fit <- glmFit(d,design) > Error in glmFit.default(y = y$counts, design = design, dispersion = > dispersion, : > Design matrix not of full rank. The following coefficients not > estimable: > statusDiabetic > > Can someone tell me why I get this error? > > Thanks alot! > ______________________________________________________________________ The information in this email is confidential and intend...{{dropped:4}}

ADD COMMENT • link 10.5 years ago Gordon Smyth 50k

0

Entering edit mode

Gordon Smyth 50k

@gordon-smyth

Last seen 16 minutes ago

WEHI, Melbourne, Australia

Please keep the discussion on the Bioconductor mailing list. On Thu, 24 Oct 2013, Sindre Lee wrote: > On 2013-10-24 00:40, Gordon K Smyth wrote: >> Dear Sindre, >> >> I think that you want to find genes for which the change from time1 >> to time2 is different in the diabetic patients vs the healthy >> patients. >> >> In your experiment, you have two types of factors. The time points >> are within patients whereas the desease status is between patients. >> See section 3.5 of the edgeR User's Guide. >> >> Setting up the design matrix for such an experiment is a little >> tricky, not because of the edgeR but because of the limitations of >> model.matrix() in R. >> >> I suggest you do it like this. Set up a variable for timepoint 2 for >> healthy patients only: >> >> time2.healthy <- as.numeric(status=="Healthy" & timepoints==2) >> >> Same for diabetics: >> >> time2.diabetic <- as.numeric(status=="Diabetic" & timepoints==2) >> >> Then >> >> design <- model.matrix(~patients+time2.healthy+time2.diabetic) >> >> The patients term will pick up the baseline expression level for each >> patient at time1. time2.healthy will pick up the time2 (treatment) >> effect for healthy patients and time2.diabetic will do the same for >> diabetics. >> >> Finally, you will want to test the time2.diabetic-time2.healthy contrast. >> >> Best wishes >> Gordon >> >>> Date: Tue, 22 Oct 2013 20:03:32 +0200 >>> From: Sindre Lee <sindre.lee at="" studmed.uio.no=""> >>> To: <bioconductor at="" r-project.org=""> >>> Subject: [BioC] edgeR with two different groups (unpaired) at two time >>> points (paired): Need some input >>> >>> Hi! >>> The edgeR manual is quite nice, but I'm not quite sure if I'm on the >>> right track.. >>> >>> The question to answer is: "Is there any significantly changed genes in >>> diabetics at time point 2, compared to healthy?" >>> >>> Heres my codes: >>> >>> #Making the design matrix >>> >>> status <- factor(c(rep("Healthy",26), rep("Diabetic",22)), >>> levels=c("Healthy","Diabetic")) >>> patients <- >>> factor(c(88,87,86,83,82,81,75,72,71,70,13,08,01,88,87,86,83,82,81, 75,72,71,70,13,08,01,79,77,76,73,67,62,61,55,53,21,04,79,77,76,73,67,6 2,61,55,53,21,04)) >>> timepoints = >>> as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1, 1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2)) >>> design <- model.matrix(~patients + timepoints*status) >>>> colnames(design) >>> [1] "(Intercept)" "patients4" >>> [3] "patients8" "patients13" >>> [5] "patients21" "patients53" >>> [7] "patients55" "patients61" >>> [9] "patients62" "patients67" >>> [11] "patients70" "patients71" >>> [13] "patients72" "patients73" >>> [15] "patients75" "patients76" >>> [17] "patients77" "patients79" >>> [19] "patients81" "patients82" >>> [21] "patients83" "patients86" >>> [23] "patients87" "patients88" >>> [25] "timepoints2" "statusDiabetic" >>> [27] "timepoints2:statusDiabetic" >>> >>> #Reading in HTSeq data >>> >>> description <- c("test"); >>> fileNames <- >>> list.files(path="/Volumes/timemachine/HTseq_DEseq2",pattern="*.txt"); >>> files <- sort(fileNames,decreasing=TRUE); >>> targets <- data.frame(files=files, group=status, >>> description=description); >>> library(edgeR); >>> d <- >>> readDGE(targets,path="/Volumes/timemachine/HTseq_DEseq2",skip=5,co mment.char="!"); >>> colnames(d) >>> <-(c("N288","N287","N286","N283","N282","N281","N275","N272","N271 ","N270","N213","N208","N201","N188","187","N186","N183","N182","N181" ,"N175","N172","N171","N170","N113","N108","N101","D279","D277","D276" ,"D273","D267","D262","D261","D255","D253","D221","D204","D179","D177" ,"D176","D173","D167","D162","D161","D155","D153","D121","D104")); >>> d$samples; >>> dim(d) >>> >>> #Normalization >>> >>> d <- (calcNormFactors(d,method="RLE")); >>> >>> #Estimating the dispersion: >>> >>> d <- estimateCommonDisp(d,verbose=TRUE) >>> Disp = 0.05823 , BCV = 0.2413 >>> d <- estimateTagwiseDisp(d,trend="none") >>> >>> #diffeksp >>> >>>> fit <- glmFit(d,design) >>> Error in glmFit.default(y = y$counts, design = design, dispersion = >>> dispersion, : >>> Design matrix not of full rank. The following coefficients not >>> estimable: >>> statusDiabetic >>> >>> Can someone tell me why I get this error? >>> >>> Thanks alot! >>> >> > > Will this do? > > lrt <- glmLRT(fit, > contrast=c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,1)) > > Coefficient: -1*time2.healthy 1*time2.diabetic > Yes, that is correct. To find genes DE at time2 in healthy patients you can use lrt <- glmLRT(fit, coef=25) To find genes DE at time2 in diabetic patients you can use lrt <- glmLRT(fit, coef=26) To find genes whose response differs in diabetic patients vs healthy, the call that you give is correct. You could construct this by: con <- makeContrasts(time2.diabetic-time2.healthy, levels=design) lrt <- glmLRT(fit, contrast=con) Or alternatively con <- rep(0,26) con[26] <- 1 con[25] <- -1 Best wishes Gordon > design <- model.matrix(~patients+time2.healthy+time2.diabetic) > colnames(design) [1] "(Intercept)" "patients4" "patients8" "patients13" [5] "patients21" "patients53" "patients55" "patients61" [9] "patients62" "patients67" "patients70" "patients71" [13] "patients72" "patients73" "patients75" "patients76" [17] "patients77" "patients79" "patients81" "patients82" [21] "patients83" "patients86" "patients87" "patients88" [25] "time2.healthy" "time2.diabetic" > ______________________________________________________________________ The information in this email is confidential and intend...{{dropped:4}}

ADD COMMENT • link 10.5 years ago Gordon Smyth 50k

Login before adding your answer.