Dear Community,
I’m currently analyzing an RNA-Seq gene expression dataset, downloaded from the recount project. It includes trios of paired samples of primary cancers, their liver metastases and their normal adjucent tissue samples (18 patients-54 samples). The code chunk used for downstream analysis in order to perform paired group comparisons of primary cancers versus normal samples and metastatic vs normal:
colData(rse)$title
[1] "primary colorectal cancer AMC_2-1" "primary colorectal cancer AMC_3-1"
[3] "primary colorectal cancer AMC_6-1" "primary colorectal cancer AMC_5-1"
[5] "primary colorectal cancer AMC_7-1" "primary colorectal cancer AMC_8-1"
[7] "primary colorectal cancer AMC_9-1" "primary colorectal cancer AMC_10-1"
[9] "primary colorectal cancer AMC_12-1" "primary colorectal cancer AMC_13-1"
[11] "primary colorectal cancer AMC_17-1" "primary colorectal cancer AMC_18-1"
[13] "primary colorectal cancer AMC_19-1" "primary colorectal cancer AMC_20-1"
[15] "primary colorectal cancer AMC_21-1" "primary colorectal cancer AMC_22-1"
[17] "primary colorectal cancer AMC_23-1" "primary colorectal cancer AMC_24-1"
[19] "normal colon AMC_2-2" "normal colon AMC_3-2"
[21] "normal colon AMC_5-2" "normal colon AMC_6-2"
[23] "normal colon AMC_7-2" "normal colon AMC_8-2"
[25] "normal colon AMC_9-2" "normal colon AMC_10-2"
[27] "normal colon AMC_12-2" "normal colon AMC_13-2"
[29] "normal colon AMC_17-2" "normal colon AMC_18-2"
[31] "normal colon AMC_19-2" "normal colon AMC_20-2"
[33] "normal colon AMC_21-2" "normal colon AMC_22-2"
[35] "normal colon AMC_23-2" "normal colon AMC_24-2"
[37] "metastasized cancer AMC_2-3" "metastasized cancer AMC_3-3"
[39] "metastasized cancer AMC_5-3" "metastasized cancer AMC_6-3"
[41] "metastasized cancer AMC_7-3" "metastasized cancer AMC_8-3"
[43] "metastasized cancer AMC_9-3" "metastasized cancer AMC_10-3"
[45] "metastasized cancer AMC_12-3" "metastasized cancer AMC_17-3"
[47] "metastasized cancer AMC_13-3" "metastasized cancer AMC_18-3"
[49] "metastasized cancer AMC_19-3" "metastasized cancer AMC_20-3"
[51] "metastasized cancer AMC_21-3" "metastasized cancer AMC_22-3"
[53] "metastasized cancer AMC_23-3" "metastasized cancer AMC_24-3"
sampleID = colData(rse)$title
sampleID <- gsub("primary colorectal cancer AMC_","s",sampleID)
sampleID <- gsub("normal colon AMC_","s",sampleID)
sampleID <- gsub("metastasized cancer AMC_","s",sampleID)
sampleID <-gsub("-1","",sampleID)
sampleID <-gsub("-2","",sampleID)
sampleID <-gsub("-3","",sampleID)
sample_info <- data.frame(
run = colData(rse_gene)$run,
group = dx,
sampleID = as.factor(sampleID),
sex = rse$predicted_sex,
tissue = as.factor(rse$predicted_tissue))
sample_info$sex <- droplevels(sample_info$sex)
sample_info$tissue <- droplevels(sample_info$tissue)
colData(rse)$group <- sample_info$group
colData(rse)$sampleID <- sample_info$sampleID
colData(rse)$Tissue <- sample_info$tissue
colData(rse)$sex <- sample_info$sex
pdata <- SummarizedExperiment::colData(rse)[,c("group","sampleID","Tissue","sex")]
dge <- calcNormFactors(y, method = "TMM")
dge$samples
group lib.size norm.factors sampleID Tissue sex
SRR975551 PC 35844673 1.1616300 s2 Intestine male
SRR975552 PC 35323037 1.1391519 s3 Intestine male
SRR975554 PC 35085885 1.0716169 s6 Intestine male
SRR975553 PC 34522671 1.0263454 s5 Intestine female
SRR975555 PC 35522336 1.0567555 s7 Intestine female
SRR975556 PC 34645824 0.9466883 s8 Intestine male
SRR975557 PC 35819078 1.0069037 s9 Intestine male
SRR975558 PC 35597072 1.1556164 s10 Intestine male
SRR975559 PC 35644146 1.1339967 s12 Intestine male
SRR975560 PC 34354387 0.9064396 s13 Intestine male
SRR975561 PC 34984479 0.8787232 s17 Intestine male
SRR975562 PC 35309596 0.9132710 s18 Intestine male
SRR975563 PC 34192875 1.0469936 s19 Intestine male
SRR975564 PC 36541848 1.0269095 s20 Intestine male
SRR975565 PC 35054486 1.1543370 s21 Intestine male
SRR975566 PC 34810630 1.0930002 s22 Intestine male
SRR975567 PC 35187393 1.1480416 s23 Intestine male
SRR975568 PC 34440602 1.0693811 s24 Intestine female
SRR975569 NC 34331252 1.1120207 s2 Intestine male
SRR975570 NC 34791427 1.0362509 s3 Intestine male
SRR975571 NC 34134685 1.0313145 s5 Intestine female
SRR975572 NC 33389913 1.0868100 s6 Intestine male
SRR975573 NC 34403772 1.0498040 s7 Intestine female
SRR975574 NC 34118206 0.9927762 s8 Intestine male
SRR975575 NC 34240628 1.0132546 s9 Intestine male
SRR975576 NC 33537688 0.9343132 s10 Intestine male
SRR975577 NC 33515873 0.9544540 s12 Intestine male
SRR975578 NC 34219524 0.8533124 s13 Intestine male
SRR975579 NC 36504080 0.8891647 s17 Intestine male
SRR975580 NC 34796910 0.9228019 s18 Intestine male
SRR975581 NC 34051137 1.0257172 s19 Intestine male
SRR975582 NC 34224865 1.0302613 s20 Intestine male
SRR975583 NC 34719272 1.0324915 s21 Intestine male
SRR975584 NC 34440539 1.0310448 s22 Intestine male
SRR975585 NC 34425993 1.0752771 s23 Intestine male
SRR975586 NC 34897034 0.9417058 s24 Intestine female
SRR975587 MC 35280616 1.0106831 s2 Liver male
SRR975588 MC 35477420 1.1044894 s3 Intestine male
SRR975589 MC 36036263 1.0950962 s5 Intestine female
SRR975590 MC 35076617 1.1788985 s6 Liver male
SRR975591 MC 33780190 1.1091615 s7 Liver female
SRR975592 MC 35827952 0.9863091 s8 Liver male
SRR975593 MC 35536763 1.1022580 s9 Liver male
SRR975594 MC 35962611 0.5544939 s10 Liver male
SRR975595 MC 35042076 1.0685238 s12 Liver male
SRR975597 MC 35531306 1.1121061 s17 Liver male
SRR975596 MC 35778359 1.0032715 s13 Intestine male
SRR975598 MC 35524218 1.0070292 s18 Liver male
SRR975599 MC 33710189 0.4600261 s19 Liver male
SRR975600 MC 35206322 1.1116100 s20 Liver male
SRR975601 MC 34967028 0.9487985 s21 Liver male
SRR975602 MC 34348262 0.8462756 s22 Intestine male
SRR975603 MC 35301582 1.0008410 s23 Liver male
SRR975604 MC 35778773 0.9315822 s24 Liver female
condition.group <- as.factor(dge$samples$group)
tissue.group <- as.factor(dge$samples$Tissue)
sex.group <- as.factor(dge$samples$sex)
Subject <- as.factor(dge$samples$sampleID) # the factor indicating the pairs/trios of samples belonging to each patient
condition.group <- relevel(condition.group,ref="NC") # as the first level the normal samples
design <- model.matrix(~Subject +condition.group +tissue.group +sex.group)
head(design)
(Intercept) Subjects12 Subjects13 Subjects17 Subjects18 Subjects19 Subjects2
1 1 0 0 0 0 0 1
2 1 0 0 0 0 0 0
3 1 0 0 0 0 0 0
4 1 0 0 0 0 0 0
5 1 0 0 0 0 0 0
6 1 0 0 0 0 0 0
Subjects20 Subjects21 Subjects22 Subjects23 Subjects24 Subjects3 Subjects5
1 0 0 0 0 0 0 0
2 0 0 0 0 0 1 0
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 1
5 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0
Subjects6 Subjects7 Subjects8 Subjects9 condition.groupMC condition.groupPC
1 0 0 0 0 0 1
2 0 0 0 0 0 1
3 1 0 0 0 0 1
4 0 0 0 0 0 1
5 0 1 0 0 0 1
6 0 0 1 0 0 1
tissue.groupLiver sex.groupmale
1 0 1
2 0 1
3 0 1
4 0 0
5 0 0
6 0 1
y3 <- estimateDisp(dge, design, robust=TRUE)
Error in glmFit.default(sely, design, offset = seloffset, dispersion = 0.05, :
Design matrix not of full rank. The following coefficients not estimable:
sex.groupmale
Thus, my questions are the following:
1) For the above error, i should simply remove the coefficient "sex.groupmale" ? and essentially, the coefficients condition.groupMC and condition.groupPC represent the paired analysis for the comparison of primary cancer versus normal, and metastatic versus normal, adjusted for tissue and sex ?
2) If i would like additionally to perform a direct comparison between primary cancers and their metastatic counterparts, i should follow an "intersept-free comparison" ? without even including the normal samples ?
Thank you in advance,
Efstathios