Hi all,
I would like to run a lfmm to identify any outliers, but before this I have to impute any missing values of which I have roughly 17,000 of in my genotype matrix (prior to imputing). However, when imputing missing values within the LEA package using the impute() function, I am still getting 30 missing values or '9s' within my imputed genotype matrix.
I was thinking it might have something to do with the fact the matrix was initially converted from a .raw output file from PLINK, but the .lfmm file seems to be in the appropriate format, so I am not sure. It is probably something minor that I am missing but I have spent all day on this, and it was running fine yesterday, so any help would be appreciated!
For context as well I am running two analyses, one for K = 2 and one for K = 5.
Here is my code and the output for this is below (note I have already run snmf() so just skipped this) :
# LD pruned using plink and want to run lfmm, so I export the pruned plink files in .raw format
system2(plink_path, 
        args = c("--bfile", "05_results/LD_results/Conservative/poa_ld_c_final",
                 "--recode", "A",
                 "--out", "05_results/LD_results/Conservative/poa_ld_c_final_raw"))
# Genotype matrix (Y):
poa_geno_c_raw <- read.table("05_results/LD_results/Conservative/poa_ld_c_final_raw.raw", header = TRUE)
poa_lfdat_c <- poa_geno_c_raw[, -(1:6)] #remove non-genotype columns
poa_lfdat_c[is.na(poa_lfdat_c)] <- 9
ghead(poa_lfdat_c); dim(poa_lfdat_c)
# The environmental matrix (X)
poa_lfs_c <- as.matrix(poa_lfs_c, ncol = 1)
head(poa_lfs_c); dim(poa_lfs_c)
write.table(poa_lfs_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env",
            sep = "\t", row.names = FALSE, col.names = FALSE)
# The genotype matrix (Y):
ghead(poa_lfdat_c); dim(poa_lfdat_c)
write.table(poa_lfdat_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm",
            sep = "\t", row.names = FALSE, col.names = FALSE)
poa_env <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"
poa_lfmm <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm"
## Find K 
#project <- snmf(poa_lfmm, 
               # K = 1:10, repetitions = 10, 
             #   entropy = TRUE, project = "new") 
# K = 2
best = which.min(cross.entropy(project, K = 2))
impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
       method = 'mode', K = 2, run = best)
file.rename(
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"
)
# K = 5
best = which.min(cross.entropy(project, K = 5))
impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
       method = 'mode', K = 5, run = best) #impute same file as above but rename by K
file.rename(
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
  "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k5_imputed.lfmm"
)
## -- Run lfmm analysis
# K = 2
Y_k2 <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"))
X <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"))
sum(Y_k2 == 9)        # still 30 9s
which(Y_k2 == 9, arr.ind = TRUE)
# LD pruned using plink and want to run lfmm, so I export the pruned plink files as .raw
> system2(plink_path, 
+         args = c("--bfile", "05_results/LD_results/Conservative/poa_ld_c_final",
+                  "--recode", "A",
+                  "--out", "05_results/LD_results/Conservative/poa_ld_c_final_raw"))
PLINK v1.9.0-b.7.7 64-bit (22 Oct 2024)            cog-genomics.org/plink/1.9/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to 05_results/LD_results/Conservative/poa_ld_c_final_raw.log.
Options in effect:
  --bfile 05_results/LD_results/Conservative/poa_ld_c_final
  --out 05_results/LD_results/Conservative/poa_ld_c_final_raw
  --recode A
15715 MB RAM detected; reserving 7857 MB for main workspace.
10395 variants loaded from .bim file.
172 people (0 males, 0 females, 172 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
05_results/LD_results/Conservative/poa_ld_c_final_raw.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 172 founders and 0 nonfounders present.
Calculating allele frequencies... done.
Total genotyping rate is 0.990325.
10395 variants and 172 people pass filters and QC.
Note: No phenotypes present.
--recode A to 05_results/LD_results/Conservative/poa_ld_c_final_raw.raw ...
done.
> 
> 
> # Genotype matrix:
> poa_geno_c_raw <- read.table("05_results/LD_results/Conservative/poa_ld_c_final_raw.raw", header = TRUE)
> poa_lfdat_c <- poa_geno_c_raw[, -(1:6)] #remove non-genotype columns
> poa_lfdat_c[is.na(poa_lfdat_c)] <- 9
> ghead(poa_lfdat_c); dim(poa_lfdat_c)
  X100580940.8.C.A_A X100581730.11.C.A_A X100581861.18.C.A_A X100903155.15.C.G_G X100903673.8.C.G_G X100903797.14.C.T_T X100903884.21.C.A_A
1                  0                   0                   0                   0                  0                   1                   0
2                  0                   0                   0                   0                  0                   1                   0
3                  0                   0                   0                   1                  0                   1                   0
4                  0                   0                   0                   0                  0                   1                   1
5                  0                   0                   1                   0                  0                   0                   0
6                  0                   0                   1                   0                  1                   0                   0
  X100903891.6.C.T_T X100904783.20.T.C_C X100905068.16.C.T_T
1                  1                   0                   1
2                  0                   1                   0
3                  0                   0                   1
4                  0                   0                   0
5                  1                   0                   0
6                  0                   0                   0
[1]   172 10395
> 
> 
> # The environmental matrix (X)
> poa_lfs_c <- as.matrix(poa_lfs_c, ncol = 1)
> head(poa_lfs_c); dim(poa_lfs_c)
     [,1]
[1,]    1
[2,]    1
[3,]    1
[4,]    1
[5,]    1
[6,]    1
[1] 172   1
> write.table(poa_lfs_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env",
+             sep = "\t", row.names = FALSE, col.names = FALSE)
> 
> 
> # The genotype matrix (Y):
> ghead(poa_lfdat_c); dim(poa_lfdat_c)
  X100580940.8.C.A_A X100581730.11.C.A_A X100581861.18.C.A_A X100903155.15.C.G_G X100903673.8.C.G_G X100903797.14.C.T_T X100903884.21.C.A_A
1                  0                   0                   0                   0                  0                   1                   0
2                  0                   0                   0                   0                  0                   1                   0
3                  0                   0                   0                   1                  0                   1                   0
4                  0                   0                   0                   0                  0                   1                   1
5                  0                   0                   1                   0                  0                   0                   0
6                  0                   0                   1                   0                  1                   0                   0
  X100903891.6.C.T_T X100904783.20.T.C_C X100905068.16.C.T_T
1                  1                   0                   1
2                  0                   1                   0
3                  0                   0                   1
4                  0                   0                   0
5                  1                   0                   0
6                  0                   0                   0
[1]   172 10395
> write.table(poa_lfdat_c, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm",
+             sep = "\t", row.names = FALSE, col.names = FALSE)
> 
> 
> 
> poa_env <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"
> poa_lfmm <- "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm"
> 
> ## Find K 
> #project <- snmf(poa_lfmm, 
>                # K = 1:10, repetitions = 10, 
>              #   entropy = TRUE, project = "new") 
> 
> 
> # K = 2
> best = which.min(cross.entropy(project, K = 2))
> impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
+        method = 'mode', K = 2, run = best)
Missing genotype imputation for K = 2 
Missing genotype imputation for run = 1 
Results are written in the file:  ./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm 
> 
> file.rename(
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"
+ )
[1] TRUE
> 
> # K = 5
> best = which.min(cross.entropy(project, K = 5))
> impute(project, "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm", 
+        method = 'mode', K = 5, run = best) #impute same file as above but rename by K
Missing genotype imputation for K = 5 
Missing genotype imputation for run = 1 
Results are written in the file:  ./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm 
> 
> 
> file.rename(
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno.lfmm_imputed.lfmm",
+   "./05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k5_imputed.lfmm"
+ )
[1] TRUE
> 
> ## -- Run lfmm analysis
> 
> # K = 2
> Y_k2 <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_geno_k2_imputed.lfmm"))
> X <- as.matrix(read.table("05_results/Loci_under_selection/LFMM/Conservative/poa_env.env"))
> 
> 
> sum(Y_k2 == 9)        # still 30 9s
[1] 30
> which(Y_k2 == 9, arr.ind = TRUE)
      row   col
 [1,]  33 10376
 [2,]  47 10378
 [3,]  58 10378
 [4,] 148 10378
 [5,] 157 10378
 [6,]  42 10380
 [7,] 120 10381
 [8,] 123 10383
 [9,]  40 10385
[10,]  46 10385
[11,]  56 10385
[12,] 106 10385
[13,] 110 10385
[14,] 157 10385
[15,]   2 10387
[16,] 128 10387
[17,]  82 10388
[18,]   1 10390
[19,]   2 10390
[20,]  17 10390
[21,]  25 10390
[22,]  77 10390
[23,]  85 10390
[24,]  94 10390
[25,] 146 10390
[26,]  66 10392
[27,]  57 10394
[28,]  70 10394
[29,] 110 10394
[30,] 147 10394
> 
> sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 22631)
Matrix products: default
locale:
[1] LC_COLLATE=English_Australia.utf8  LC_CTYPE=English_Australia.utf8    LC_MONETARY=English_Australia.utf8 LC_NUMERIC=C                      
[5] LC_TIME=English_Australia.utf8    
time zone: Australia/Brisbane
tzcode source: internal
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
[1] dartR_2.9.9.5    dartR.data_1.0.8 dplyr_1.1.4      ggplot2_4.0.0    adegenet_2.1.11  ade4_1.7-23      qvalue_2.38.0    LEA_3.18.0      
[9] pcadapt_4.4.1   
loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.2       R.utils_2.13.0     fields_17.1        S7_0.2.0           gdsfmt_1.42.1     
 [8] combinat_0.0-8     fastmap_1.2.0      GGally_2.4.0       promises_1.3.3     digest_0.6.37      dotCall64_1.2      mime_0.13         
[15] lifecycle_1.0.4    cluster_2.1.8.1    gdata_3.0.1        terra_1.8-60       magrittr_2.0.4     compiler_4.4.1     rlang_1.1.6       
[22] tools_4.4.1        igraph_2.1.4       SNPRelate_1.40.0   calibrate_1.7.7    data.table_1.17.8  knitr_1.50         StAMPP_1.6.3      
[29] sp_2.2-0           plyr_1.8.9         RColorBrewer_1.1-3 gap.datasets_0.0.6 withr_3.0.2        purrr_1.1.0        R.oo_1.27.1       
[36] PopGenReport_3.1.3 grid_4.4.1         xtable_1.8-4       colorspace_2.1-2   iterators_1.0.14   gtools_3.9.5       scales_1.4.0      
[43] pegas_1.3          MASS_7.3-65        RgoogleMaps_1.5.3  mvtnorm_1.3-3      cli_3.6.5          vegan_2.7-1        crayon_1.5.3      
[50] generics_0.1.4     rstudioapi_0.17.1  reshape2_1.4.4     genetics_1.3.8.1.3 ape_5.8-1          stringr_1.5.2      splines_4.4.1     
[57] maps_3.4.3         parallel_4.4.1     vctrs_0.6.5        mmod_1.3.3         Matrix_1.7-4       patchwork_1.3.2    gdistance_1.6.5   
[64] seqinr_4.2-36      foreach_1.5.2      tidyr_1.3.1        proto_1.0.0        gap_1.6            glue_1.8.0         spam_2.11-1       
[71] codetools_0.2-20   dismo_1.3-16       ggstats_0.11.0     stringi_1.8.7      gtable_0.3.6       raster_3.6-32      later_1.4.4       
[78] tibble_3.3.0       pillar_1.11.1      htmltools_0.5.8.1  R6_2.6.1           Rdpack_2.6.4       doParallel_1.0.17  shiny_1.11.1      
[85] evaluate_1.0.5     lattice_0.22-7     gsubfn_0.7         png_0.1-8          rbibutils_2.3      R.methodsS3_1.8.2  httpuv_1.6.16     
[92] Rcpp_1.1.0         gridExtra_2.3      nlme_3.1-168       permute_0.9-8      mgcv_1.9-3         xfun_0.53          pkgconfig_2.0.3   
>

Nadia, please tag your question with the name of the package ("LEA") that you are asking about. "impute" is the name of a different Bioconductor package.