gene length file for GOseq
0
0
Entering edit mode
mictadlo ▴ 10
@mictadlo-10885
Last seen 4.3 years ago
#!/usr/bin/env Rscript
# based on https://www.biostars.org/p/84467/#84495
# source("https://bioconductor.org/biocLite.R")
# biocLite("GenomicRanges")
# biocLite("rtracklayer")
biocLite("Rsamtools")
library(GenomicRanges)
library(rtracklayer)
library(Rsamtools)

GFFfile = "Sp.gff"
FASTAfile = "Sp.faa"

#Load the annotation and reduce it
GFF <- import.gff(GFFfile, format="gff", feature.type="exon")
head(GFF)

grl <- reduce(split(GFF, elementMetadata(GFF)$ID))

reducedGFF <- unlist(grl, use.names=T)
elementMetadata(reducedGFF)$gene_name <- rep(names(grl), elementNROWS(grl))

#Open the fasta file
FASTA <- FaFile(FASTAfile)
open(FASTA)

#Add the GC numbers
elementMetadata(reducedGFF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGFF), "GC")[,1]
elementMetadata(reducedGFF)$widths <- width(reducedGFF)

#Create a list of the ensembl_id/GC/length
calc_GC_length <- function(x) {
  nGCs = sum(elementMetadata(x)$nGCs)
  width = sum(elementMetadata(x)$widths)
  c(width, nGCs/width)
}
output <- t(sapply(split(reducedGFF, elementMetadata(reducedGFF)$gene_name), calc_GC_length))
colnames(output) <- c("Length", "GC")

write.table(output, file="GC_lengths.tsv", sep="\t")

The above script has produced the below output:

"Length"        "GC"
"sp0000001-mRNA-1:exon:204"     17      0.529411764705882
"sp0000001-mRNA-1:exon:205"     631     0.393026941362916
"sp0000001-mRNA-1:exon:206"     146     0.417808219178082
"sp0000001-mRNA-1:exon:207"     187     0.379679144385027
"sp0000001-mRNA-1:exon:208"     2046    0.389540566959922
"sp0000001-mRNA-1:exon:209"     20      0.45
"sp0000001-mRNA-1:exon:210"     4       0.5

Should be all sp0000001-mRNA-1:exon:xxx be summed up to produce sp0000001 because in the below gff3 file the id is Parent=sp0000001?

My GFF file looks like this:

lcl|ScwjSwM_3729        maker   gene    9131    15536   .       -       .       ID=sp0000001;Name=sp0000001
lcl|ScwjSwM_3729        maker   mRNA    9131    15536   .       -       .       ID=sp0000001-mRNA-1;Parent=sp0000001;Name=sp0000001-mRNA-1;_AED=0.28;_eAED=0.28;_QI=0|0|0|0.14|1|1|7|0|1016
lcl|ScwjSwM_3729        maker   exon    15533   15536   .       -       .       ID=sp0000001-mRNA-1:exon:210;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    14631   14650   .       -       .       ID=sp0000001-mRNA-1:exon:209;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    12563   14608   .       -       .       ID=sp0000001-mRNA-1:exon:208;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    11160   11346   .       -       .       ID=sp0000001-mRNA-1:exon:207;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    10840   10985   .       -       .       ID=sp0000001-mRNA-1:exon:206;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    10109   10739   .       -       .       ID=sp0000001-mRNA-1:exon:205;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   exon    9131    9147    .       -       .       ID=sp0000001-mRNA-1:exon:204;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     15533   15536   .       -       0       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     14631   14650   .       -       2       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     12563   14608   .       -       0       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     11160   11346   .       -       0       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     10840   10985   .       -       2       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     10109   10739   .       -       0       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729        maker   CDS     9131    9147    .       -       2       ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
> #Load the annotation and reduce it
> GFF <- import.gff(GFFfile, format="gff", feature.type="exon")
> head(GFF)
GRanges object with 6 ranges and 6 metadata columns:
              seqnames         ranges strand |   source     type     score     phase                        ID
                 <Rle>      <IRanges>  <Rle> | <factor> <factor> <numeric> <integer>               <character>
  [1] lcl|ScwjSwM_3729 [15533, 15536]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:210
  [2] lcl|ScwjSwM_3729 [14631, 14650]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:209
  [3] lcl|ScwjSwM_3729 [12563, 14608]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:208
  [4] lcl|ScwjSwM_3729 [11160, 11346]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:207
  [5] lcl|ScwjSwM_3729 [10840, 10985]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:206
  [6] lcl|ScwjSwM_3729 [10109, 10739]      - |    maker     exon      <NA>      <NA> sp0000001-mRNA-1:exon:205
                Parent
       <CharacterList>
  [1] sp0000001-mRNA-1
  [2] sp0000001-mRNA-1
  [3] sp0000001-mRNA-1
  [4] sp0000001-mRNA-1
  [5] sp0000001-mRNA-1
  [6] sp0000001-mRNA-1
  -------
  seqinfo: 4724 sequences from an unspecified genome; no seqlength

If the ids have to be merge then what would be the best way to do it?

Thank you in advance.

goseq • 1.2k views
ADD COMMENT

Login before adding your answer.

Traffic: 554 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6