Question

Syntenet application to bacterial genome and file processing

0

Entering edit mode

kluongni • 0

@0fd28637

Last seen 8 months ago

Canada

Hello,

I've run into an issue with the check_input() function and was not sure if it was a biological issue or a mistake I was making somewhere along the process.

I am attempting to visualize synteny across five completely assembled Klebsiella pneumoniae protein fastas and gffs pulled from RefSeq. I have followed the vignette and FAQ to the best of my knowledge however, since hypothetical genes had no names I opted for their WP accession ID instead. The error that occurs is that the sequence names do in my seq input do not match my gene names in annotation. From what I understand, the protein sequence file supplied for example GCF_000814805.1_ASM81480v1 contains 4891 sequences and it's gff contains 4993 ranges fitting the requirements of check_input().

Examples for syntenet are plant genomes, so I wonder if this issue is due to my application of the tool.

Any help or advice would be appreciated, Thanks!

library(tidyverse)
library(syntenet)

# Running Syntenet on a sample of 5 RefSeq Klebsiella pneumoniae genomes

setwd("/analysis1/R_environments/BARNARDS_project")

## Path to directory containing FASTA files

fasta_dir <- "./0_input/testing_syntenet/RefSeq/test_faa/"

dir(fasta_dir) # see the contents of the directory
#> [1] "GCF_000814805.1_ASM81480v1.faa"  "GCF_001482345.1_ASM148234v1.faa"
#> [3] "GCF_003812105.1_ASM381210v1.faa" "GCF_007955015.2_ASM795501v2.faa"
#> [5] "GCF_009661535.2_ASM966153v2.faa"

# Read all FASTA files in `fasta_dir`
RefSeq_aastringsetlist <- fasta2AAStringSetlist(fasta_dir)

head(names(RefSeq_aastringsetlist$GCF_000814805.1_ASM81480v1))
#> [1] "WP_000004292.1 MULTISPECIES: H-NS histone family protein [Enterobacteriaceae]"                 
#> [2] "WP_000005710.1 MULTISPECIES: His-Xaa-Ser system radical SAM maturase HxsC [Enterobacteriaceae]"
#> [3] "WP_000005768.1 MULTISPECIES: zinc metallochaperone GTPase ZigA [Enterobacterales]"             
#> [4] "WP_000014594.1 MULTISPECIES: RNA chaperone/antiterminator CspA [Bacteria]"                     
#> [5] "WP_000020322.1 MULTISPECIES: GTP cyclohydrolase FolE2 [Enterobacterales]"                      
#> [6] "WP_000039782.1 MULTISPECIES: hypothetical protein [Enterobacterales]"

# Remove whitespace and everything after it in protein fasta list
RefSeq_aastringsetlist <- lapply(RefSeq_aastringsetlist, function(x) {
  names(x) <- gsub(" .*", "", names(x))
  return(x)
})

# Taking a look at the new names
head(names(RefSeq_aastringsetlist$GCF_000814805.1_ASM81480v1))
#> [1] "WP_000004292.1" "WP_000005710.1" "WP_000005768.1" "WP_000014594.1"
#> [5] "WP_000020322.1" "WP_000039782.1"

## Path to directory containing GFF files
gff_dir <- "./0_input/testing_syntenet/RefSeq/test_gff/"

dir(gff_dir) # see the contents of the directory
#> [1] "GCF_000814805.1_ASM81480v1.gff"  "GCF_001482345.1_ASM148234v1.gff"
#> [3] "GCF_003812105.1_ASM381210v1.gff" "GCF_007955015.2_ASM795501v2.gff"
#> [5] "GCF_009661535.2_ASM966153v2.gff"

# Read all GFF files in `fasta_dir`
RefSeq_grangeslist <- gff2GRangesList(gff_dir)

# RefSeq uses IDs for CDS, not for genes
# Check for the gff ranges greater than matching protein fasta
# head(RefSeq_grangeslist$GCF_000814805.1_ASM81480v1[RefSeq_grangeslist$GCF_000814805.1_ASM81480v1$type == "CDS"])

# in this case the gene name is in the Name column and protein IDs need to be collapsed to gene IDs

# Create a list of data frames containing protein-to-gene ID correspondences
protein2gene <- lapply(RefSeq_grangeslist, function(x) {

  # Extract only CDS ranges
  cds_ranges <- x[x$type == "CDS"]

  # Create the ID correspondence data frame
  df <- data.frame(
    protein_id = cds_ranges$protein_id,
    gene_id = cds_ranges$Name
  )

  # Remove duplicate rows
  df <- df[!duplicated(df$protein_id), ]

  return(df)
})

# Taking a look at the list
head(protein2gene$GCF_000814805.1_ASM81480v1)
#>       protein_id        gene_id
#> 1 WP_004145090.1 WP_004145090.1
#> 2 WP_004150293.1 WP_004150293.1
#> 3 WP_004173845.1 WP_004173845.1
#> 4 WP_004151530.1 WP_004151530.1
#> 5 WP_029884213.1 WP_029884213.1
#> 6           <NA>           <NA>

# Collapse protein IDs to gene IDs in list of sequences
RefSeq_aastringsetlist_collapsed <- collapse_protein_ids(RefSeq_aastringsetlist, protein2gene)

# Looking at the new sequences
RefSeq_aastringsetlist_collapsed$GCF_000814805.1_ASM81480v1
#> AAStringSet object of length 4891:
#>        width seq                                            names               
#>    [1]  3206 MDNTSGDFPCNKMDTRKQLPLT...QIAKRALQIAKNTARSHTSLH WP_001518711.1
#>    [2]  3163 MDNLRFSSAPTADSIDASIAQH...ACAQHITRWLCATSTQPENTL WP_039109377.1
#>    [3]  2166 MTIHHAALARMLPAEKKEKLLR...QRDASRSRAQQRLVRRHQRQR WP_012131830.1
#>    [4]  2154 MTYSESDIAIVGMNCRYPGVHS...AARENLALRRKRAQQGEKGDE WP_023316156.1
#>    [5]  2035 MISGAPSQDSLLPDNRHAADYQ...GVATVEKTKRPRPVRRRQRRI WP_039109379.1
#>    ...   ... ...
#> [4887]    19 MIEHELGNWKDFIEGMLRK                            WP_100181659.1
#> [4888]    19 MDSRSPLFKGLFFCPGVRR                            WP_227650304.1
#> [4889]    16 MNRVQFKHHHHHHHPD                               WP_004899375.1
#> [4890]    15 MKLVPFFFAFFFTFP                                WP_100250063.1
#> [4891]    14 MNAAIFRFFFYFST                                 WP_001386830.1

check_input(RefSeq_aastringsetlist_collapsed, RefSeq_grangeslist, gene_field = "Name")
#> Error in check_gene_names(seq, annotation, gene_field = gene_field): Sequence names in 'seq' do not match gene names in 'annotation' for:
#> 1. GCF_000814805.1_ASM81480v1
#> 2. GCF_001482345.1_ASM148234v1
#> 3. GCF_003812105.1_ASM381210v1
#> 4. GCF_007955015.2_ASM795501v2
#> 5. GCF_009661535.2_ASM966153v2

sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
#>  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
#>  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/Vancouver
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] syntenet_1.2.4  lubridate_1.9.3 forcats_1.0.0   stringr_1.5.0  
#>  [5] dplyr_1.1.3     purrr_1.0.2     readr_2.1.4     tidyr_1.3.0    
#>  [9] tibble_3.2.1    ggplot2_3.4.4   tidyverse_2.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0            Biostrings_2.68.1          
#>  [3] bitops_1.0-7                fastmap_1.1.1              
#>  [5] RCurl_1.98-1.12             GenomicAlignments_1.36.0   
#>  [7] reprex_2.0.2                XML_3.99-0.14              
#>  [9] digest_0.6.33               timechange_0.2.0           
#> [11] lifecycle_1.0.3             magrittr_2.0.3             
#> [13] compiler_4.3.1              rlang_1.1.1                
#> [15] tools_4.3.1                 igraph_1.5.1               
#> [17] utf8_1.2.3                  yaml_2.3.7                 
#> [19] rtracklayer_1.60.1          knitr_1.44                 
#> [21] S4Arrays_1.0.6              htmlwidgets_1.6.2          
#> [23] DelayedArray_0.26.7         RColorBrewer_1.1-3         
#> [25] abind_1.4-5                 BiocParallel_1.34.2        
#> [27] withr_2.5.1                 BiocGenerics_0.46.0        
#> [29] grid_4.3.1                  stats4_4.3.1               
#> [31] fansi_1.0.5                 colorspace_2.1-0           
#> [33] scales_1.2.1                SummarizedExperiment_1.30.2
#> [35] cli_3.6.1                   rmarkdown_2.25             
#> [37] crayon_1.5.2                generics_0.1.3             
#> [39] rstudioapi_0.15.0           tzdb_0.4.0                 
#> [41] rjson_0.2.21                zlibbioc_1.46.0            
#> [43] network_1.18.1              parallel_4.3.1             
#> [45] XVector_0.40.0              restfulr_0.0.15            
#> [47] matrixStats_1.0.0           vctrs_0.6.4                
#> [49] Matrix_1.6-0                IRanges_2.34.1             
#> [51] hms_1.1.3                   S4Vectors_0.38.2           
#> [53] intergraph_2.0-3            glue_1.6.2                 
#> [55] statnet.common_4.9.0        codetools_0.2-19           
#> [57] stringi_1.7.12              gtable_0.3.4               
#> [59] GenomeInfoDb_1.36.4         GenomicRanges_1.52.1       
#> [61] BiocIO_1.10.0               munsell_0.5.0              
#> [63] pillar_1.9.0                ggnetwork_0.5.12           
#> [65] htmltools_0.5.6.1           GenomeInfoDbData_1.2.10    
#> [67] R6_2.5.1                    networkD3_0.4              
#> [69] evaluate_0.22               lattice_0.21-8             
#> [71] Biobase_2.60.0              Rsamtools_2.16.0           
#> [73] pheatmap_1.0.12             Rcpp_1.0.11                
#> [75] coda_0.19-4                 xfun_0.40                  
#> [77] fs_1.6.3                    MatrixGenerics_1.12.3      
#> [79] pkgconfig_2.0.3

syntenet Bacteria synteny • 675 views

ADD COMMENT • link updated 7 months ago by Fabricio Almeida-Silva ▴ 40 • written 9 months ago by kluongni • 0

score 0 · Answer 1 · 2023-12-24

0

Entering edit mode

Fabricio Almeida-Silva ▴ 40

@fabricio_almeidasilva-14890

Last seen 4 months ago

Ghent, Belgium

Hi,

could you show the GRanges object and the AAStringSet object for the same species, just to check if the names match?

Best,

Fabricio

ADD COMMENT • link 7 months ago Fabricio Almeida-Silva ▴ 40