Entering edit mode
mictadlo
▴
10
@mictadlo-10885
Last seen 4.8 years ago
#!/usr/bin/env Rscript
# based on https://www.biostars.org/p/84467/#84495
# source("https://bioconductor.org/biocLite.R")
# biocLite("GenomicRanges")
# biocLite("rtracklayer")
biocLite("Rsamtools")
library(GenomicRanges)
library(rtracklayer)
library(Rsamtools)
GFFfile = "Sp.gff"
FASTAfile = "Sp.faa"
#Load the annotation and reduce it
GFF <- import.gff(GFFfile, format="gff", feature.type="exon")
head(GFF)
grl <- reduce(split(GFF, elementMetadata(GFF)$ID))
reducedGFF <- unlist(grl, use.names=T)
elementMetadata(reducedGFF)$gene_name <- rep(names(grl), elementNROWS(grl))
#Open the fasta file
FASTA <- FaFile(FASTAfile)
open(FASTA)
#Add the GC numbers
elementMetadata(reducedGFF)$nGCs <- letterFrequency(getSeq(FASTA, reducedGFF), "GC")[,1]
elementMetadata(reducedGFF)$widths <- width(reducedGFF)
#Create a list of the ensembl_id/GC/length
calc_GC_length <- function(x) {
nGCs = sum(elementMetadata(x)$nGCs)
width = sum(elementMetadata(x)$widths)
c(width, nGCs/width)
}
output <- t(sapply(split(reducedGFF, elementMetadata(reducedGFF)$gene_name), calc_GC_length))
colnames(output) <- c("Length", "GC")
write.table(output, file="GC_lengths.tsv", sep="\t")
The above script has produced the below output:
"Length" "GC"
"sp0000001-mRNA-1:exon:204" 17 0.529411764705882
"sp0000001-mRNA-1:exon:205" 631 0.393026941362916
"sp0000001-mRNA-1:exon:206" 146 0.417808219178082
"sp0000001-mRNA-1:exon:207" 187 0.379679144385027
"sp0000001-mRNA-1:exon:208" 2046 0.389540566959922
"sp0000001-mRNA-1:exon:209" 20 0.45
"sp0000001-mRNA-1:exon:210" 4 0.5
Should be all sp0000001-mRNA-1:exon:xxx
be summed up to produce sp0000001
because in the below gff3 file the id is Parent=sp0000001?
My GFF file looks like this:
lcl|ScwjSwM_3729 maker gene 9131 15536 . - . ID=sp0000001;Name=sp0000001
lcl|ScwjSwM_3729 maker mRNA 9131 15536 . - . ID=sp0000001-mRNA-1;Parent=sp0000001;Name=sp0000001-mRNA-1;_AED=0.28;_eAED=0.28;_QI=0|0|0|0.14|1|1|7|0|1016
lcl|ScwjSwM_3729 maker exon 15533 15536 . - . ID=sp0000001-mRNA-1:exon:210;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 14631 14650 . - . ID=sp0000001-mRNA-1:exon:209;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 12563 14608 . - . ID=sp0000001-mRNA-1:exon:208;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 11160 11346 . - . ID=sp0000001-mRNA-1:exon:207;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 10840 10985 . - . ID=sp0000001-mRNA-1:exon:206;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 10109 10739 . - . ID=sp0000001-mRNA-1:exon:205;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker exon 9131 9147 . - . ID=sp0000001-mRNA-1:exon:204;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 15533 15536 . - 0 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 14631 14650 . - 2 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 12563 14608 . - 0 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 11160 11346 . - 0 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 10840 10985 . - 2 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 10109 10739 . - 0 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
lcl|ScwjSwM_3729 maker CDS 9131 9147 . - 2 ID=sp0000001-mRNA-1:cds;Parent=sp0000001-mRNA-1
> #Load the annotation and reduce it
> GFF <- import.gff(GFFfile, format="gff", feature.type="exon")
> head(GFF)
GRanges object with 6 ranges and 6 metadata columns:
seqnames ranges strand | source type score phase ID
<Rle> <IRanges> <Rle> | <factor> <factor> <numeric> <integer> <character>
[1] lcl|ScwjSwM_3729 [15533, 15536] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:210
[2] lcl|ScwjSwM_3729 [14631, 14650] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:209
[3] lcl|ScwjSwM_3729 [12563, 14608] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:208
[4] lcl|ScwjSwM_3729 [11160, 11346] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:207
[5] lcl|ScwjSwM_3729 [10840, 10985] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:206
[6] lcl|ScwjSwM_3729 [10109, 10739] - | maker exon <NA> <NA> sp0000001-mRNA-1:exon:205
Parent
<CharacterList>
[1] sp0000001-mRNA-1
[2] sp0000001-mRNA-1
[3] sp0000001-mRNA-1
[4] sp0000001-mRNA-1
[5] sp0000001-mRNA-1
[6] sp0000001-mRNA-1
-------
seqinfo: 4724 sequences from an unspecified genome; no seqlength
If the ids have to be merge then what would be the best way to do it?
Thank you in advance.