| 1 |
|
|---|
| 2 |
use strict; |
|---|
| 3 |
use Getopt::Std; |
|---|
| 4 |
|
|---|
| 5 |
|
|---|
| 6 |
my @thresh = (); |
|---|
| 7 |
my $thrAED = 0.5; |
|---|
| 8 |
use vars qw($opt_h $opt_c $opt_e $opt_o $opt_a $opt_t $opt_l $opt_x); |
|---|
| 9 |
|
|---|
| 10 |
push @thresh, 0.5; |
|---|
| 11 |
push @thresh, 0.5; |
|---|
| 12 |
push @thresh, 0.5; |
|---|
| 13 |
push @thresh, 0; |
|---|
| 14 |
push @thresh, 0; |
|---|
| 15 |
push @thresh, 75; |
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
getopts("hc:e:o:a:t:x"); |
|---|
| 19 |
my $usage = "maker2zff.pl directory name [options] |
|---|
| 20 |
|
|---|
| 21 |
directory - the location of the gff files to use for the hmm |
|---|
| 22 |
name - a name for the outfile, will produce name.dna and name.ann |
|---|
| 23 |
|
|---|
| 24 |
OPTIONS |
|---|
| 25 |
For determining which genes are High Confidence for Retraining, there are 6 criteria. |
|---|
| 26 |
-c fraction The fraction of splice sites confirmed by an EST alignment, default 0.5 |
|---|
| 27 |
-e fraction The fraction of exons that overlap an EST alignment, default 0.5 |
|---|
| 28 |
-o fraction The faction of exons that overlap any evidence (EST or Protein), default 0.5 |
|---|
| 29 |
-a fraction The fraction of splice sites confirmed by an ab-initio SNAP prediction, default 0 |
|---|
| 30 |
-t fraction The fraction of exons the overlap an ab-initio SNAP prediction, default 0 |
|---|
| 31 |
-l number The min length of the protein sequence produced by the mRNA |
|---|
| 32 |
-x number Max AED to allow 0.5 is default |
|---|
| 33 |
"; |
|---|
| 34 |
|
|---|
| 35 |
if ($opt_c) {$thresh[0] = $opt_c} |
|---|
| 36 |
if ($opt_e) {$thresh[1] = $opt_e} |
|---|
| 37 |
if ($opt_o) {$thresh[2] = $opt_o} |
|---|
| 38 |
if ($opt_a) {$thresh[3] = $opt_a} |
|---|
| 39 |
if ($opt_t) {$thresh[4] = $opt_t} |
|---|
| 40 |
if ($opt_l) {$thresh[5] = $opt_l} |
|---|
| 41 |
if ($opt_x) {$thrAED = $opt_x} |
|---|
| 42 |
|
|---|
| 43 |
my %id2name = (); |
|---|
| 44 |
my %status = (); |
|---|
| 45 |
my %parent = (); |
|---|
| 46 |
my %exons = (); |
|---|
| 47 |
my %hc = (); |
|---|
| 48 |
my %bad = (); |
|---|
| 49 |
my %seq = (); |
|---|
| 50 |
|
|---|
| 51 |
my $dir = shift @ARGV; |
|---|
| 52 |
my $outfile = shift @ARGV; |
|---|
| 53 |
|
|---|
| 54 |
if ($opt_h) {die $usage;} |
|---|
| 55 |
if ((not $dir) || (not $outfile)) { |
|---|
| 56 |
die $usage; |
|---|
| 57 |
} |
|---|
| 58 |
|
|---|
| 59 |
|
|---|
| 60 |
opendir DIR, "$dir" or die $!; |
|---|
| 61 |
my @files = grep /^.+\.gff$/, readdir DIR; |
|---|
| 62 |
close DIR; |
|---|
| 63 |
|
|---|
| 64 |
|
|---|
| 65 |
|
|---|
| 66 |
|
|---|
| 67 |
|
|---|
| 68 |
my %index; |
|---|
| 69 |
my $count = 0; |
|---|
| 70 |
|
|---|
| 71 |
foreach my $file (@files) { |
|---|
| 72 |
open GFF, "<$dir/$file" or die $!; |
|---|
| 73 |
while (my $line = <GFF>) { |
|---|
| 74 |
chomp($line); |
|---|
| 75 |
if ($line =~ m/\ |
|---|
| 76 |
my $header; |
|---|
| 77 |
while (my $d = <GFF>) { |
|---|
| 78 |
if($d =~ /^>(\S+)/){ |
|---|
| 79 |
$header = $1; |
|---|
| 80 |
$seq{$header} = ""; |
|---|
| 81 |
} |
|---|
| 82 |
else{ |
|---|
| 83 |
$seq{$header} .= $d; |
|---|
| 84 |
} |
|---|
| 85 |
} |
|---|
| 86 |
} |
|---|
| 87 |
elsif($line =~ /^\s*\ |
|---|
| 88 |
next; |
|---|
| 89 |
} |
|---|
| 90 |
else { |
|---|
| 91 |
my ($seqid, $source, $tag, $start, $end, $score, $strand, $phase, $annot) = split(/\t/, $line); |
|---|
| 92 |
my %annotation = split(/[;=]/, $annot); |
|---|
| 93 |
if ($tag eq "mRNA" && $source eq "maker") { |
|---|
| 94 |
my $id = $annotation{'ID'}; |
|---|
| 95 |
my $lname = $annotation{'Name'}; |
|---|
| 96 |
my $parent = $annotation{'Parent'}; |
|---|
| 97 |
my ($name, $qi) = split(/\s+/, $lname); |
|---|
| 98 |
if(! $qi){ |
|---|
| 99 |
($qi) = $line =~ /_QI\=([^\;\n]+)/; |
|---|
| 100 |
} |
|---|
| 101 |
my ($AED) = $line =~ /_AED\=([^\;\n]+)/; |
|---|
| 102 |
my $ishc = is_hc($qi, $AED); |
|---|
| 103 |
if ($ishc == 1 ) { |
|---|
| 104 |
push(@{$hc{$seqid}}, $id); |
|---|
| 105 |
} |
|---|
| 106 |
}elsif ($tag eq "CDS" && $source eq "maker") { |
|---|
| 107 |
my $parent = $annotation{'Parent'}; |
|---|
| 108 |
|
|---|
| 109 |
|
|---|
| 110 |
if(! $index{$parent}){ |
|---|
| 111 |
$index{$parent} = "MODEL$count"; |
|---|
| 112 |
$count++; |
|---|
| 113 |
} |
|---|
| 114 |
|
|---|
| 115 |
if($strand eq '-'){ |
|---|
| 116 |
($start, $end) = ($end, $start); |
|---|
| 117 |
} |
|---|
| 118 |
|
|---|
| 119 |
push @{$exons{$seqid}{$parent}}, [$start, $end]; |
|---|
| 120 |
} |
|---|
| 121 |
} |
|---|
| 122 |
} |
|---|
| 123 |
} |
|---|
| 124 |
|
|---|
| 125 |
|
|---|
| 126 |
|
|---|
| 127 |
|
|---|
| 128 |
|
|---|
| 129 |
open(ZFF, ">$outfile\.ann") or die; |
|---|
| 130 |
open DNA, ">$outfile\.dna" or die $!; |
|---|
| 131 |
|
|---|
| 132 |
foreach my $seqid (sort {$a cmp $b} keys %hc) { |
|---|
| 133 |
print ZFF ">",$seqid,"\n"; |
|---|
| 134 |
print DNA ">",$seqid,"\n",$seq{$seqid},"\n"; |
|---|
| 135 |
|
|---|
| 136 |
foreach my $mRNA (@{$hc{$seqid}}){ |
|---|
| 137 |
my @exons = @{$exons{$seqid}{$mRNA}}; |
|---|
| 138 |
my $pname = $index{$mRNA}; |
|---|
| 139 |
|
|---|
| 140 |
next if (! @exons); |
|---|
| 141 |
|
|---|
| 142 |
my $f = shift @exons; |
|---|
| 143 |
if(! @exons){ |
|---|
| 144 |
print ZFF join(" ", "Esngl ", $f->[0], $f->[1], $pname),"\n"; |
|---|
| 145 |
next; |
|---|
| 146 |
} |
|---|
| 147 |
elsif($f->[0] < $f->[1]){ |
|---|
| 148 |
print ZFF join(" ", "Einit ", $f->[0], $f->[1], $pname),"\n"; |
|---|
| 149 |
} |
|---|
| 150 |
else{ |
|---|
| 151 |
print ZFF join(" ", "Eterm ", $f->[0], $f->[1], $pname),"\n"; |
|---|
| 152 |
} |
|---|
| 153 |
|
|---|
| 154 |
my $l = pop @exons; |
|---|
| 155 |
if($l->[0] < $l->[1]){ |
|---|
| 156 |
print ZFF join(" ", "Eterm ", $l->[0], $l->[1], $pname),"\n"; |
|---|
| 157 |
} |
|---|
| 158 |
else{ |
|---|
| 159 |
print ZFF join(" ", "Einit ", $l->[0], $l->[1], $pname),"\n"; |
|---|
| 160 |
} |
|---|
| 161 |
|
|---|
| 162 |
foreach my $e (@exons) { |
|---|
| 163 |
print ZFF join(" ", "Exon ", $e->[0], $e->[1], $pname),"\n"; |
|---|
| 164 |
} |
|---|
| 165 |
} |
|---|
| 166 |
} |
|---|
| 167 |
|
|---|
| 168 |
close(ZFF); |
|---|
| 169 |
close(DNA); |
|---|
| 170 |
|
|---|
| 171 |
|
|---|
| 172 |
|
|---|
| 173 |
sub is_hc { |
|---|
| 174 |
my $qi = shift @_; |
|---|
| 175 |
my $AED = shift @_; |
|---|
| 176 |
|
|---|
| 177 |
my @q = split(/\|/, $qi); |
|---|
| 178 |
my @qual = (@q[1..5],$q[8]); |
|---|
| 179 |
my $hc = 1; |
|---|
| 180 |
foreach my $i (0..4) { |
|---|
| 181 |
if ($qual[$i] < $thresh[$i]) { |
|---|
| 182 |
$hc = 0; |
|---|
| 183 |
} |
|---|
| 184 |
} |
|---|
| 185 |
|
|---|
| 186 |
$hc = 0 if($AED > $thrAED); |
|---|
| 187 |
|
|---|
| 188 |
return $hc; |
|---|
| 189 |
} |
|---|