root/bin/maker2zff.pl

Revision 235, 4.6 kB (checked in by cholt, 4 months ago)

fixed for abinit to pred_gff

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl -w
2 use strict;
3 use Getopt::Std;
4
5 ##### Initialize Threshhold  ####
6 my @thresh = ();
7 my $thrAED = 0.5;
8 use vars qw($opt_h $opt_c $opt_e $opt_o $opt_a $opt_t $opt_l $opt_x);
9
10 push @thresh, 0.5;
11 push @thresh, 0.5;
12 push @thresh, 0.5;
13 push @thresh, 0;
14 push @thresh, 0;
15 push @thresh, 75;
16
17
18 getopts("hc:e:o:a:t:x");
19 my $usage = "maker2zff.pl directory name [options]
20
21 directory  - the location of the gff files to use for the hmm
22 name       - a name for the outfile, will produce name.dna and name.ann
23
24 OPTIONS
25 For determining which genes are High Confidence for Retraining, there are 6 criteria.
26 -c fraction  The fraction of splice sites confirmed by an EST alignment, default 0.5
27 -e fraction  The fraction of exons that overlap an EST alignment, default 0.5
28 -o fraction  The faction of exons that overlap any evidence (EST or Protein), default 0.5
29 -a fraction  The fraction of splice sites confirmed by an ab-initio SNAP prediction, default 0
30 -t fraction  The fraction of exons the overlap an ab-initio SNAP prediction, default 0
31 -l number    The min length of the protein sequence produced by the mRNA
32 -x number    Max AED to allow 0.5 is default
33 ";
34
35 if ($opt_c) {$thresh[0] = $opt_c}
36 if ($opt_e) {$thresh[1] = $opt_e}
37 if ($opt_o) {$thresh[2] = $opt_o}
38 if ($opt_a) {$thresh[3] = $opt_a}
39 if ($opt_t) {$thresh[4] = $opt_t}
40 if ($opt_l) {$thresh[5] = $opt_l}
41 if ($opt_x) {$thrAED = $opt_x}
42
43 my %id2name = ();
44 my %status = ();
45 my %parent = ();
46 my %exons = ();
47 my %hc = ();
48 my %bad = ();
49 my %seq = ();
50
51 my $dir = shift @ARGV;
52 my $outfile = shift @ARGV;
53
54 if ($opt_h) {die $usage;}
55 if ((not $dir) || (not $outfile)) {
56     die $usage;
57 }
58
59 #### Get all of the GFF file in the directory  ####
60 opendir DIR, "$dir" or die $!;
61 my @files = grep /^.+\.gff$/, readdir DIR;
62 close DIR;
63
64 #### Go through the GFF file and determine which genes are HC  ####
65 #### This step must finish before any output is produced since ####
66 #### featues can be out of order ####
67
68 my %index; #used to replace long maker names that mess up snap
69 my $count = 0;
70
71 foreach my $file (@files) {
72     open GFF, "<$dir/$file" or die $!;
73     while (my $line = <GFF>) {
74         chomp($line);
75         if ($line =~ m/\#\#FASTA/) {
76             my $header;
77             while (my $d = <GFF>) {
78                 if($d =~ /^>(\S+)/){
79                     $header = $1;
80                     $seq{$header} = "";
81                 }
82                 else{
83                     $seq{$header} .= $d;
84                 }
85             }
86         }
87         elsif($line =~ /^\s*\#|^\n$|^\s*$/){
88             next;
89         }
90         else {
91             my ($seqid, $source, $tag, $start, $end, $score, $strand, $phase, $annot) = split(/\t/, $line);
92             my %annotation = split(/[;=]/, $annot);
93             if ($tag eq "mRNA" && $source eq "maker") {
94                 my $id = $annotation{'ID'};
95                 my $lname = $annotation{'Name'};
96                 my $parent = $annotation{'Parent'};
97                 my ($name, $qi) = split(/\s+/, $lname);
98                 if(! $qi){
99                     ($qi) = $line =~ /_QI\=([^\;\n]+)/;
100                 }
101                 my ($AED) = $line =~ /_AED\=([^\;\n]+)/;
102                 my $ishc = is_hc($qi, $AED);
103                 if ($ishc == 1 ) {
104                     push(@{$hc{$seqid}}, $id);
105                 }
106             }elsif ($tag eq "CDS" && $source eq "maker") {
107                 my $parent = $annotation{'Parent'};
108
109                 #set alias
110                 if(! $index{$parent}){
111                     $index{$parent} = "MODEL$count";
112                     $count++;
113                 }
114
115                 if($strand eq '-'){
116                     ($start, $end) = ($end, $start);
117                 }
118
119                 push @{$exons{$seqid}{$parent}}, [$start, $end];
120             }
121         }
122     }
123 }
124
125 #### Now filter out anything thats not high quality ####
126
127 #### Print out the exon locations of the HC genes ####
128
129 open(ZFF, ">$outfile\.ann") or die;
130 open DNA, ">$outfile\.dna" or die $!;
131
132 foreach my $seqid (sort {$a cmp $b} keys %hc) {
133     print ZFF ">",$seqid,"\n";
134     print DNA ">",$seqid,"\n",$seq{$seqid},"\n";
135
136     foreach my $mRNA (@{$hc{$seqid}}){
137         my @exons = @{$exons{$seqid}{$mRNA}};
138         my $pname = $index{$mRNA};
139
140         next if (! @exons);
141
142         my $f = shift @exons;
143         if(! @exons){
144             print ZFF join(" ", "Esngl ", $f->[0], $f->[1], $pname),"\n";
145             next;
146         }
147         elsif($f->[0] < $f->[1]){
148             print ZFF join(" ", "Einit ", $f->[0], $f->[1], $pname),"\n";
149         }
150         else{
151             print ZFF join(" ", "Eterm ", $f->[0], $f->[1], $pname),"\n";
152         }
153
154         my $l = pop @exons;
155         if($l->[0] < $l->[1]){
156             print ZFF join(" ", "Eterm ", $l->[0], $l->[1], $pname),"\n";
157         }
158         else{
159             print ZFF join(" ", "Einit ", $l->[0], $l->[1], $pname),"\n";
160         }
161
162         foreach my $e (@exons) {
163             print ZFF join(" ", "Exon ", $e->[0], $e->[1], $pname),"\n";
164         }
165     }
166 }
167
168 close(ZFF);
169 close(DNA);
170
171 ################## Subroutines ###################
172
173 sub is_hc {
174     my $qi = shift @_;
175     my $AED = shift @_;
176
177     my @q = split(/\|/, $qi);
178     my @qual = (@q[1..5],$q[8]);
179     my $hc = 1;
180     foreach my $i (0..4) {
181         if ($qual[$i] < $thresh[$i]) {
182             $hc = 0;
183         }
184     }
185
186     $hc = 0 if($AED > $thrAED);
187
188     return $hc;
189 }
Note: See TracBrowser for help on using the browser.