incomplete imputation of missing values with LEA impute() function
0
0
Entering edit mode
nadia • 0
@4d56f8e3
Last seen 18 days ago
Australia

Hi all,

I would like to run a lfmm to identify any outliers, but before this I have to impute any missing values of which I have roughly 17,000 of in my genotype matrix (prior to imputing). However, when imputing missing values within the LEA package using the impute() function, I am still getting 30 missing values or '9s' within my imputed genotype matrix.

I was thinking it might have something to do with the fact the matrix was initially converted from a .raw output file from PLINK, but the .lfmm file seems to be in the appropriate format, so I am not sure. It is probably something minor that I am missing but I have spent all day on this, and it was running fine yesterday, so any help would be appreciated!

For context as well I am running two analyses, one for K = 2 and one for K = 5.

Here is my code and the output for this is below (note I have already run snmf() so just skipped this) :


# LD pruned using plink and want to run lfmm, so I export the pruned plink files in .raw format
system2(plink_path, 
        args = c("--bfile", "05_results/LD_results/Conservative/poa_ld_c_final",
                 "--recode", "A",
                 "--out", "05_results/LD_results/Conservative/poa_ld_c_final_raw"))


# Genotype matrix (Y):
poa_geno_c_raw <- read.table("05_results/LD_results/Conservative/poa_ld_c_final_raw.raw", header = TRUE)
poa_lfdat_c <- poa_geno_c_raw[, -(1:6)] #remove non-genotype columns
poa_lfdat_c[is.na(poa_lfdat_c)] <- 9
ghead(poa_lfdat_c); dim(poa_lfdat_c)


# The environmental matrix (X)
poa_lfs_c <- as.matrix(poa_lfs_c, ncol = 1)
head(poa_lfs_c); dim(poa_lfs_c)
write.table(poa_lfs_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env",
            sep = "\t", row.names = FALSE, col.names = FALSE)


# The genotype matrix (Y):
ghead(poa_lfdat_c); dim(poa_lfdat_c)
write.table(poa_lfdat_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm",
            sep = "\t", row.names = FALSE, col.names = FALSE)



poa_env <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"
poa_lfmm <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm"

## Find K 
#project <- snmf(poa_lfmm, 
               # K = 1:10, repetitions = 10, 
             #   entropy = TRUE, project = "new") 


# K = 2
best = which.min(cross.entropy(project, K = 2))
impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
       method = 'mode', K = 2, run = best)

file.rename(
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"
)

# K = 5
best = which.min(cross.entropy(project, K = 5))
impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
       method = 'mode', K = 5, run = best) #impute same file as above but rename by K


file.rename(
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k5_imputed.lfmm"
)

## -- Run lfmm analysis

# K = 2
Y_k2 <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"))
X <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"))


sum(Y_k2 == 9)        # still 30 9s
which(Y_k2 == 9, arr.ind = TRUE)
# LD pruned using plink and want to run lfmm, so I export the pruned plink files as .raw
> system2(plink_path, 
+         args = c("--bfile", "05_results/LD_results/Conservative/poa_ld_c_final",
+                  "--recode", "A",
+                  "--out", "05_results/LD_results/Conservative/poa_ld_c_final_raw"))
PLINK v1.9.0-b.7.7 64-bit (22 Oct 2024)            cog-genomics.org/plink/1.9/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 05_results/LD_results/Conservative/poa_ld_c_final_raw.log.
Options in effect:
  --bfile 05_results/LD_results/Conservative/poa_ld_c_final
  --out 05_results/LD_results/Conservative/poa_ld_c_final_raw
  --recode A

15715 MB RAM detected; reserving 7857 MB for main workspace.
10395 variants loaded from .bim file.
172 people (0 males, 0 females, 172 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
05_results/LD_results/Conservative/poa_ld_c_final_raw.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 172 founders and 0 nonfounders present.
Calculating allele frequencies... done.
Total genotyping rate is 0.990325.
10395 variants and 172 people pass filters and QC.
Note: No phenotypes present.
--recode A to 05_results/LD_results/Conservative/poa_ld_c_final_raw.raw ...
done.
> 
> 
> # Genotype matrix:
> poa_geno_c_raw <- read.table("05_results/LD_results/Conservative/poa_ld_c_final_raw.raw", header = TRUE)
> poa_lfdat_c <- poa_geno_c_raw[, -(1:6)] #remove non-genotype columns
> poa_lfdat_c[is.na(poa_lfdat_c)] <- 9
> ghead(poa_lfdat_c); dim(poa_lfdat_c)
  X100580940.8.C.A_A X100581730.11.C.A_A X100581861.18.C.A_A X100903155.15.C.G_G X100903673.8.C.G_G X100903797.14.C.T_T X100903884.21.C.A_A
1                  0                   0                   0                   0                  0                   1                   0
2                  0                   0                   0                   0                  0                   1                   0
3                  0                   0                   0                   1                  0                   1                   0
4                  0                   0                   0                   0                  0                   1                   1
5                  0                   0                   1                   0                  0                   0                   0
6                  0                   0                   1                   0                  1                   0                   0
  X100903891.6.C.T_T X100904783.20.T.C_C X100905068.16.C.T_T
1                  1                   0                   1
2                  0                   1                   0
3                  0                   0                   1
4                  0                   0                   0
5                  1                   0                   0
6                  0                   0                   0
[1]   172 10395
> 
> 
> # The environmental matrix (X)
> poa_lfs_c <- as.matrix(poa_lfs_c, ncol = 1)
> head(poa_lfs_c); dim(poa_lfs_c)
     [,1]
[1,]    1
[2,]    1
[3,]    1
[4,]    1
[5,]    1
[6,]    1
[1] 172   1
> write.table(poa_lfs_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env",
+             sep = "\t", row.names = FALSE, col.names = FALSE)
> 
> 
> # The genotype matrix (Y):
> ghead(poa_lfdat_c); dim(poa_lfdat_c)
  X100580940.8.C.A_A X100581730.11.C.A_A X100581861.18.C.A_A X100903155.15.C.G_G X100903673.8.C.G_G X100903797.14.C.T_T X100903884.21.C.A_A
1                  0                   0                   0                   0                  0                   1                   0
2                  0                   0                   0                   0                  0                   1                   0
3                  0                   0                   0                   1                  0                   1                   0
4                  0                   0                   0                   0                  0                   1                   1
5                  0                   0                   1                   0                  0                   0                   0
6                  0                   0                   1                   0                  1                   0                   0
  X100903891.6.C.T_T X100904783.20.T.C_C X100905068.16.C.T_T
1                  1                   0                   1
2                  0                   1                   0
3                  0                   0                   1
4                  0                   0                   0
5                  1                   0                   0
6                  0                   0                   0
[1]   172 10395
> write.table(poa_lfdat_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm",
+             sep = "\t", row.names = FALSE, col.names = FALSE)
> 
> 
> 
> poa_env <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"
> poa_lfmm <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm"
> 
> ## Find K 
> #project <- snmf(poa_lfmm, 
>                # K = 1:10, repetitions = 10, 
>              #   entropy = TRUE, project = "new") 
> 
> 
> # K = 2
> best = which.min(cross.entropy(project, K = 2))
> impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
+        method = 'mode', K = 2, run = best)
Missing genotype imputation for K = 2 
Missing genotype imputation for run = 1 
Results are written in the file:  ./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm 
> 
> file.rename(
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"
+ )
[1] TRUE
> 
> # K = 5
> best = which.min(cross.entropy(project, K = 5))
> impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
+        method = 'mode', K = 5, run = best) #impute same file as above but rename by K
Missing genotype imputation for K = 5 
Missing genotype imputation for run = 1 
Results are written in the file:  ./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm 
> 
> 
> file.rename(
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k5_imputed.lfmm"
+ )
[1] TRUE
> 
> ## -- Run lfmm analysis
> 
> # K = 2
> Y_k2 <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"))
> X <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"))
> 
> 
> sum(Y_k2 == 9)        # still 30 9s
[1] 30
> which(Y_k2 == 9, arr.ind = TRUE)
      row   col
 [1,]  33 10376
 [2,]  47 10378
 [3,]  58 10378
 [4,] 148 10378
 [5,] 157 10378
 [6,]  42 10380
 [7,] 120 10381
 [8,] 123 10383
 [9,]  40 10385
[10,]  46 10385
[11,]  56 10385
[12,] 106 10385
[13,] 110 10385
[14,] 157 10385
[15,]   2 10387
[16,] 128 10387
[17,]  82 10388
[18,]   1 10390
[19,]   2 10390
[20,]  17 10390
[21,]  25 10390
[22,]  77 10390
[23,]  85 10390
[24,]  94 10390
[25,] 146 10390
[26,]  66 10392
[27,]  57 10394
[28,]  70 10394
[29,] 110 10394
[30,] 147 10394
> 
> sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22631)

Matrix products: default


locale:
[1] LC_COLLATE=English_Australia.utf8  LC_CTYPE=English_Australia.utf8    LC_MONETARY=English_Australia.utf8 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.utf8    

time zone: Australia/Brisbane
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dartR_2.9.9.5    dartR.data_1.0.8 dplyr_1.1.4      ggplot2_4.0.0    adegenet_2.1.11  ade4_1.7-23      qvalue_2.38.0    LEA_3.18.0      
[9] pcadapt_4.4.1   

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.2       R.utils_2.13.0     fields_17.1        S7_0.2.0           gdsfmt_1.42.1     
 [8] combinat_0.0-8     fastmap_1.2.0      GGally_2.4.0       promises_1.3.3     digest_0.6.37      dotCall64_1.2      mime_0.13         
[15] lifecycle_1.0.4    cluster_2.1.8.1    gdata_3.0.1        terra_1.8-60       magrittr_2.0.4     compiler_4.4.1     rlang_1.1.6       
[22] tools_4.4.1        igraph_2.1.4       SNPRelate_1.40.0   calibrate_1.7.7    data.table_1.17.8  knitr_1.50         StAMPP_1.6.3      
[29] sp_2.2-0           plyr_1.8.9         RColorBrewer_1.1-3 gap.datasets_0.0.6 withr_3.0.2        purrr_1.1.0        R.oo_1.27.1       
[36] PopGenReport_3.1.3 grid_4.4.1         xtable_1.8-4       colorspace_2.1-2   iterators_1.0.14   gtools_3.9.5       scales_1.4.0      
[43] pegas_1.3          MASS_7.3-65        RgoogleMaps_1.5.3  mvtnorm_1.3-3      cli_3.6.5          vegan_2.7-1        crayon_1.5.3      
[50] generics_0.1.4     rstudioapi_0.17.1  reshape2_1.4.4     genetics_1.3.8.1.3 ape_5.8-1          stringr_1.5.2      splines_4.4.1     
[57] maps_3.4.3         parallel_4.4.1     vctrs_0.6.5        mmod_1.3.3         Matrix_1.7-4       patchwork_1.3.2    gdistance_1.6.5   
[64] seqinr_4.2-36      foreach_1.5.2      tidyr_1.3.1        proto_1.0.0        gap_1.6            glue_1.8.0         spam_2.11-1       
[71] codetools_0.2-20   dismo_1.3-16       ggstats_0.11.0     stringi_1.8.7      gtable_0.3.6       raster_3.6-32      later_1.4.4       
[78] tibble_3.3.0       pillar_1.11.1      htmltools_0.5.8.1  R6_2.6.1           Rdpack_2.6.4       doParallel_1.0.17  shiny_1.11.1      
[85] evaluate_1.0.5     lattice_0.22-7     gsubfn_0.7         png_0.1-8          rbibutils_2.3      R.methodsS3_1.8.2  httpuv_1.6.16     
[92] Rcpp_1.1.0         gridExtra_2.3      nlme_3.1-168       permute_0.9-8      mgcv_1.9-3         xfun_0.53          pkgconfig_2.0.3   
>
Bioconductor LEA • 163 views
ADD COMMENT
1
Entering edit mode

Nadia, please tag your question with the name of the package ("LEA") that you are asking about. "impute" is the name of a different Bioconductor package.

ADD REPLY

Login before adding your answer.

Traffic: 648 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6