root/bin/genemark_gtf2gff3

Revision 251, 4.4 kB (checked in by cholt, 3 months ago)

added genemark gtf converter

  • Property svn:executable set to *
Line 
1 #! /usr/bin/perl -w
2 use strict;
3
4 #usage statement
5 my $usage = "
6 USAGE:
7       genemark_gtf2gff3 <filename>
8
9       This converts genemark's GTF output into GFF3 format.
10       The script prints to STDOUT. Use the '>' character to
11       redirect output into a file.
12
13 ";
14
15 my $file = shift;
16
17 #error checking
18 if (! $file ){
19     print $usage;
20     exit;
21 }
22
23 if(! -e $file){
24     warn "ERROR: The file $file does not exist\n";
25     print $usage;
26 }
27
28 #parse file
29 open(IN, "< $file");
30 my %genes;
31 while(my $line = <IN>){
32     chomp $line;
33     my @F = split(/\t/, $line);
34     next if(@F < 8);
35     next if($F[2] ne 'CDS');
36
37     #genemark by default only fills in the ids and not the names
38     my ($g) = $F[8] =~ /gene_id \"([^\"]+)\"/;
39        ($g) = $F[8] =~ /gene_name \"([^\"]+)\"/ if(! defined $g);
40     my ($t) = $F[8] =~ /transcript_id \"([^\"]+)\"/;
41        ($t) = $F[8] =~ /transcript_name \"([^\"]+)\"/ if(! defined $t);
42
43     die "ERROR: Cannot understand format\n".
44         "expecting -> gene_id \"xxxx\"\; transcript_id \"xxxx\"\;\n"
45         if(! defined $g || ! defined $t);
46
47     #get cintig name
48     my $s = $F[0];
49    
50     #set needed column information
51     $genes{$s}{$g}{seqid}  = $F[0] if(! $genes{$s}{$g}{seqid});
52     $genes{$s}{$g}{source} = $F[1] if(! $genes{$s}{$g}{source});
53     $genes{$s}{$g}{strand} = $F[6] if(! $genes{$s}{$g}{strand});
54
55     $genes{$s}{$g}{mRNA}{$t}{seqid}  = $F[0] if(! $genes{$s}{$g}{mRNA}{$t}{seqid});
56     $genes{$s}{$g}{mRNA}{$t}{source} = $F[1] if(! $genes{$s}{$g}{mRNA}{$t}{source});
57     $genes{$s}{$g}{mRNA}{$t}{strand} = $F[6] if(! $genes{$s}{$g}{mRNA}{$t}{strand});
58     $genes{$s}{$g}{mRNA}{$t}{parent} = $g if(! $genes{$s}{$g}{mRNA}{$t}{parent});
59
60     #set start/end of gene
61     $genes{$s}{$g}{B} = $F[3] if(! defined $genes{$s}{$g}{B} || $F[3] < $genes{$s}{$g}{B});
62     $genes{$s}{$g}{E} = $F[4] if(! defined $genes{$s}{$g}{E} || $F[4] > $genes{$s}{$g}{E});
63
64     #set start/end of transcript
65     $genes{$s}{$g}{mRNA}{$t}{B} = $F[3] if(! defined $genes{$s}{$g}{mRNA}{$t}{B} ||
66                                            $F[3] < $genes{$s}{$g}{mRNA}{$t}{B}
67                                            );
68     $genes{$s}{$g}{mRNA}{$t}{E} = $F[4] if(! defined $genes{$s}{$g}{mRNA}{$t}{E} ||
69                                            $F[4] > $genes{$s}{$g}{mRNA}{$t}{E}
70                                            );
71    
72     #add CDS to transcript
73     my %c = (seqid  => $F[0],
74              source => $F[1],
75              B      => $F[3],
76              E      => $F[4],
77              score  => $F[5],
78              strand => $F[6],
79              phase  => $F[7],
80              parent => $t
81              );
82
83     push (@{$genes{$s}{$g}{mRNA}{$t}{CDS}}, \%c);
84 }
85 close(IN);
86
87
88 #build GFF3 structure and dump to file
89 print "\#\#gff-version 3\n";
90 gff3_contig(\%genes);
91
92 #--------------------------------------------------------------------------
93 #-------------------------------- SUBS ------------------------------------
94 #--------------------------------------------------------------------------
95 sub gff3_contig {
96     my $hash = shift;
97     foreach my $f (keys %$hash){
98         gff3_gene($hash->{$f});
99     }
100 }
101 #--------------------------------------------------------------------------
102 sub gff3_gene {
103     my $hash = shift;
104
105     foreach my $g (sort {$hash->{$a}{B} <=> $hash->{$b}{B}} keys %$hash){
106         my $gene = $hash->{$g};
107        
108         print $gene->{seqid}."\t".$gene->{source}."\tgene\t".$gene->{B}."\t".
109         $gene->{E}."\t.\t".$gene->{strand}."\t.\tID=$g\;Name=$g\n";
110        
111         gff3_mRNA($gene->{mRNA});
112     }
113 }
114 #--------------------------------------------------------------------------
115 sub gff3_mRNA {
116     my $hash = shift;
117
118     foreach my $t (keys %$hash){
119         my $mRNA = $hash->{$t};
120         print $mRNA->{seqid}."\t".$mRNA->{source}."\tmRNA\t".$mRNA->{B}."\t".$mRNA->{E}.
121             "\t.\t".$mRNA->{strand}."\t.\tID=$t\;Name=$t\;Parent=".$mRNA->{parent}."\n";
122        
123         gff3_CDS($mRNA->{CDS});
124     }
125 }
126 #--------------------------------------------------------------------------
127 sub gff3_CDS {
128     my $array = shift;
129
130     #define the id
131     my $i = 1;
132
133     my @exons;
134     my @CDSs;
135     foreach my $c (@$array){
136         #make exon line
137         my $id = $c->{parent} .":exon:". $i;
138         my $exon = $c->{seqid}."\t".$c->{source}."\texon\t".$c->{B}."\t".$c->{E}."\t.\t".
139             $c->{strand}."\t.\tID=$id\;Name=$id\;Parent=".$c->{parent}."\n";
140         push(@exons, $exon);
141        
142         #make CDS line
143         $id = $c->{parent} .":CDS:". $i++;
144         my $cds = $c->{seqid}."\t".$c->{source}."\tCDS\t".$c->{B}."\t".$c->{E}."\t".$c->{score}.
145             "\t".$c->{strand}."\t".$c->{phase}."\tID=$id\;Name=$id\;Parent=".$c->{parent}."\n";
146         push(@CDSs, $cds);
147     }
148    
149     #print all exons together then all CDSs together
150     print join('', @exons);
151     print join('', @CDSs);
152 }
153
154
155
156
Note: See TracBrowser for help on using the browser.