| 1 |
#! /usr/bin/perl -w |
|---|
| 2 |
use strict; |
|---|
| 3 |
|
|---|
| 4 |
#usage statement |
|---|
| 5 |
my $usage = " |
|---|
| 6 |
USAGE: |
|---|
| 7 |
genemark_gtf2gff3 <filename> |
|---|
| 8 |
|
|---|
| 9 |
This converts genemark's GTF output into GFF3 format. |
|---|
| 10 |
The script prints to STDOUT. Use the '>' character to |
|---|
| 11 |
redirect output into a file. |
|---|
| 12 |
|
|---|
| 13 |
"; |
|---|
| 14 |
|
|---|
| 15 |
my $file = shift; |
|---|
| 16 |
|
|---|
| 17 |
#error checking |
|---|
| 18 |
if (! $file ){ |
|---|
| 19 |
print $usage; |
|---|
| 20 |
exit; |
|---|
| 21 |
} |
|---|
| 22 |
|
|---|
| 23 |
if(! -e $file){ |
|---|
| 24 |
warn "ERROR: The file $file does not exist\n"; |
|---|
| 25 |
print $usage; |
|---|
| 26 |
} |
|---|
| 27 |
|
|---|
| 28 |
#parse file |
|---|
| 29 |
open(IN, "< $file"); |
|---|
| 30 |
my %genes; |
|---|
| 31 |
while(my $line = <IN>){ |
|---|
| 32 |
chomp $line; |
|---|
| 33 |
my @F = split(/\t/, $line); |
|---|
| 34 |
next if(@F < 8); |
|---|
| 35 |
next if($F[2] ne 'CDS'); |
|---|
| 36 |
|
|---|
| 37 |
#genemark by default only fills in the ids and not the names |
|---|
| 38 |
my ($g) = $F[8] =~ /gene_id \"([^\"]+)\"/; |
|---|
| 39 |
($g) = $F[8] =~ /gene_name \"([^\"]+)\"/ if(! defined $g); |
|---|
| 40 |
my ($t) = $F[8] =~ /transcript_id \"([^\"]+)\"/; |
|---|
| 41 |
($t) = $F[8] =~ /transcript_name \"([^\"]+)\"/ if(! defined $t); |
|---|
| 42 |
|
|---|
| 43 |
die "ERROR: Cannot understand format\n". |
|---|
| 44 |
"expecting -> gene_id \"xxxx\"\; transcript_id \"xxxx\"\;\n" |
|---|
| 45 |
if(! defined $g || ! defined $t); |
|---|
| 46 |
|
|---|
| 47 |
#get cintig name |
|---|
| 48 |
my $s = $F[0]; |
|---|
| 49 |
|
|---|
| 50 |
#set needed column information |
|---|
| 51 |
$genes{$s}{$g}{seqid} = $F[0] if(! $genes{$s}{$g}{seqid}); |
|---|
| 52 |
$genes{$s}{$g}{source} = $F[1] if(! $genes{$s}{$g}{source}); |
|---|
| 53 |
$genes{$s}{$g}{strand} = $F[6] if(! $genes{$s}{$g}{strand}); |
|---|
| 54 |
|
|---|
| 55 |
$genes{$s}{$g}{mRNA}{$t}{seqid} = $F[0] if(! $genes{$s}{$g}{mRNA}{$t}{seqid}); |
|---|
| 56 |
$genes{$s}{$g}{mRNA}{$t}{source} = $F[1] if(! $genes{$s}{$g}{mRNA}{$t}{source}); |
|---|
| 57 |
$genes{$s}{$g}{mRNA}{$t}{strand} = $F[6] if(! $genes{$s}{$g}{mRNA}{$t}{strand}); |
|---|
| 58 |
$genes{$s}{$g}{mRNA}{$t}{parent} = $g if(! $genes{$s}{$g}{mRNA}{$t}{parent}); |
|---|
| 59 |
|
|---|
| 60 |
#set start/end of gene |
|---|
| 61 |
$genes{$s}{$g}{B} = $F[3] if(! defined $genes{$s}{$g}{B} || $F[3] < $genes{$s}{$g}{B}); |
|---|
| 62 |
$genes{$s}{$g}{E} = $F[4] if(! defined $genes{$s}{$g}{E} || $F[4] > $genes{$s}{$g}{E}); |
|---|
| 63 |
|
|---|
| 64 |
#set start/end of transcript |
|---|
| 65 |
$genes{$s}{$g}{mRNA}{$t}{B} = $F[3] if(! defined $genes{$s}{$g}{mRNA}{$t}{B} || |
|---|
| 66 |
$F[3] < $genes{$s}{$g}{mRNA}{$t}{B} |
|---|
| 67 |
); |
|---|
| 68 |
$genes{$s}{$g}{mRNA}{$t}{E} = $F[4] if(! defined $genes{$s}{$g}{mRNA}{$t}{E} || |
|---|
| 69 |
$F[4] > $genes{$s}{$g}{mRNA}{$t}{E} |
|---|
| 70 |
); |
|---|
| 71 |
|
|---|
| 72 |
#add CDS to transcript |
|---|
| 73 |
my %c = (seqid => $F[0], |
|---|
| 74 |
source => $F[1], |
|---|
| 75 |
B => $F[3], |
|---|
| 76 |
E => $F[4], |
|---|
| 77 |
score => $F[5], |
|---|
| 78 |
strand => $F[6], |
|---|
| 79 |
phase => $F[7], |
|---|
| 80 |
parent => $t |
|---|
| 81 |
); |
|---|
| 82 |
|
|---|
| 83 |
push (@{$genes{$s}{$g}{mRNA}{$t}{CDS}}, \%c); |
|---|
| 84 |
} |
|---|
| 85 |
close(IN); |
|---|
| 86 |
|
|---|
| 87 |
|
|---|
| 88 |
#build GFF3 structure and dump to file |
|---|
| 89 |
print "\#\#gff-version 3\n"; |
|---|
| 90 |
gff3_contig(\%genes); |
|---|
| 91 |
|
|---|
| 92 |
#-------------------------------------------------------------------------- |
|---|
| 93 |
#-------------------------------- SUBS ------------------------------------ |
|---|
| 94 |
#-------------------------------------------------------------------------- |
|---|
| 95 |
sub gff3_contig { |
|---|
| 96 |
my $hash = shift; |
|---|
| 97 |
foreach my $f (keys %$hash){ |
|---|
| 98 |
gff3_gene($hash->{$f}); |
|---|
| 99 |
} |
|---|
| 100 |
} |
|---|
| 101 |
#-------------------------------------------------------------------------- |
|---|
| 102 |
sub gff3_gene { |
|---|
| 103 |
my $hash = shift; |
|---|
| 104 |
|
|---|
| 105 |
foreach my $g (sort {$hash->{$a}{B} <=> $hash->{$b}{B}} keys %$hash){ |
|---|
| 106 |
my $gene = $hash->{$g}; |
|---|
| 107 |
|
|---|
| 108 |
print $gene->{seqid}."\t".$gene->{source}."\tgene\t".$gene->{B}."\t". |
|---|
| 109 |
$gene->{E}."\t.\t".$gene->{strand}."\t.\tID=$g\;Name=$g\n"; |
|---|
| 110 |
|
|---|
| 111 |
gff3_mRNA($gene->{mRNA}); |
|---|
| 112 |
} |
|---|
| 113 |
} |
|---|
| 114 |
#-------------------------------------------------------------------------- |
|---|
| 115 |
sub gff3_mRNA { |
|---|
| 116 |
my $hash = shift; |
|---|
| 117 |
|
|---|
| 118 |
foreach my $t (keys %$hash){ |
|---|
| 119 |
my $mRNA = $hash->{$t}; |
|---|
| 120 |
print $mRNA->{seqid}."\t".$mRNA->{source}."\tmRNA\t".$mRNA->{B}."\t".$mRNA->{E}. |
|---|
| 121 |
"\t.\t".$mRNA->{strand}."\t.\tID=$t\;Name=$t\;Parent=".$mRNA->{parent}."\n"; |
|---|
| 122 |
|
|---|
| 123 |
gff3_CDS($mRNA->{CDS}); |
|---|
| 124 |
} |
|---|
| 125 |
} |
|---|
| 126 |
#-------------------------------------------------------------------------- |
|---|
| 127 |
sub gff3_CDS { |
|---|
| 128 |
my $array = shift; |
|---|
| 129 |
|
|---|
| 130 |
#define the id |
|---|
| 131 |
my $i = 1; |
|---|
| 132 |
|
|---|
| 133 |
my @exons; |
|---|
| 134 |
my @CDSs; |
|---|
| 135 |
foreach my $c (@$array){ |
|---|
| 136 |
#make exon line |
|---|
| 137 |
my $id = $c->{parent} .":exon:". $i; |
|---|
| 138 |
my $exon = $c->{seqid}."\t".$c->{source}."\texon\t".$c->{B}."\t".$c->{E}."\t.\t". |
|---|
| 139 |
$c->{strand}."\t.\tID=$id\;Name=$id\;Parent=".$c->{parent}."\n"; |
|---|
| 140 |
push(@exons, $exon); |
|---|
| 141 |
|
|---|
| 142 |
#make CDS line |
|---|
| 143 |
$id = $c->{parent} .":CDS:". $i++; |
|---|
| 144 |
my $cds = $c->{seqid}."\t".$c->{source}."\tCDS\t".$c->{B}."\t".$c->{E}."\t".$c->{score}. |
|---|
| 145 |
"\t".$c->{strand}."\t".$c->{phase}."\tID=$id\;Name=$id\;Parent=".$c->{parent}."\n"; |
|---|
| 146 |
push(@CDSs, $cds); |
|---|
| 147 |
} |
|---|
| 148 |
|
|---|
| 149 |
#print all exons together then all CDSs together |
|---|
| 150 |
print join('', @exons); |
|---|
| 151 |
print join('', @CDSs); |
|---|
| 152 |
} |
|---|
| 153 |
|
|---|
| 154 |
|
|---|
| 155 |
|
|---|
| 156 |
|
|---|