Changeset 151
- Timestamp:
- 02/20/09 12:38:14 (9 months ago)
- Files:
-
- Apollo/gff3.tiers (modified) (1 diff)
- MPI/mpi_maker (modified) (4 diffs)
- bin/gff3_merge (modified) (7 diffs)
- bin/maker (modified) (3 diffs)
- lib/Bio/Search/Hit/PhatHit/Base.pm (modified) (1 diff)
- lib/CGL/TranslationMachine.pm (modified) (1 diff)
- lib/Dumper/GFF/GFFV3.pm (modified) (4 diffs)
- lib/Fasta.pm (modified) (1 diff)
- lib/GFFDB.pm (modified) (10 diffs)
- lib/GI.pm (modified) (10 diffs)
- lib/Process/MpiChunk.pm (modified) (6 diffs)
- lib/Widget/augustus.pm (modified) (3 diffs)
- lib/clean.pm (modified) (2 diffs)
- lib/cluster.pm (modified) (1 diff)
- lib/evaluator/funs.pm (modified) (1 diff)
- lib/evaluator/quality_index.pm (modified) (1 diff)
- lib/maker/auto_annotator.pm (modified) (31 diffs)
- lib/maker/join.pm (modified) (1 diff)
- lib/shadow_AED.pm (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Apollo/gff3.tiers
r89 r151 515 515 516 516 [Type] 517 label : GENMARK 518 tiername : Gene Prediction 519 datatype : genemark:dummy 520 datatype : GeneMark 521 datatype : pred_gff:GeneMark 522 glyph : DrawableResultFeatureSet 523 color : 103,155,104 524 column : SCORE 525 column : GENOMIC_RANGE 526 column : query_frame 527 column : GENOMIC_LENGTH 528 sortbycolumn : GENOMIC_RANGE 529 530 531 [Type] 517 532 label : SNAP 518 533 tiername : Gene Prediction MPI/mpi_maker
r146 r151 242 242 my $root = 0; #define root node (only changed for debugging) 243 243 244 #---Process options on the command line 245 try{ 246 GetOptions("RM_off|R" => \$OPT{R}, 247 "force|f" => \$OPT{force}, 248 "genome|g=s" => \$OPT{genome}, 249 "cpus|c=i" => \$OPT{cpus}, 250 "predictor=s" =>\$OPT{predictor}, 251 "retry=i" =>\$OPT{retry}, 252 "quiet" =>\$main::quiet, 253 "CTL" => sub {GI::generate_control_files() if($rank == $root); MPI_Finalize(); exit(0);}, 254 "help|?" => sub {print $usage if($rank == $root); MPI_Finalize(); exit(0)} 255 ); 256 } 257 catch Error::Simple with{ 258 my $E = shift; 259 260 print STDERR $E->{-text}; 261 die "\n\nMaker failed parsing command line options!!\n\n"; 262 }; 263 244 264 #-------------------------------------- 245 265 #---------PRIMARY MPI PROCCESS--------- … … 248 268 #--check if root node 249 269 if ($rank == $root) { 250 251 try{252 #--Process arguments and the command line253 GetOptions("RM_off|R" => \$OPT{R},254 "force|f" => \$OPT{force},255 "genome|g=s" => \$OPT{genome},256 "cpus|c=i" => \$OPT{cpus},257 "predictor=s" =>\$OPT{predictor},258 "retry=i" =>\$OPT{retry},259 "quiet" =>\$main::quiet,260 "CTL" => sub {GI::generate_control_files(); exit(0);},261 "help|?" => sub {print $usage; exit(0)}262 );263 }264 catch Error::Simple with{265 my $E = shift;266 267 print STDERR $E->{-text};268 die "\n\nMaker failed parsing command line options!!\n\n";269 };270 271 270 #varibles that are persistent outside of try 272 271 my %CTL_OPT; … … 383 382 if($t_need_flag > 0){ 384 383 my $tier; 385 while (my $fasta = $iterator->next Entry() || shift @failed){384 while (my $fasta = $iterator->nextFasta() || shift @failed){ 386 385 $tier = Process::MpiTiers->new({fasta =>$fasta, 387 386 CTL_OPT => \%CTL_OPT, … … 449 448 my $tier; 450 449 451 while (my $fasta = $iterator->next Entry() || shift @failed){450 while (my $fasta = $iterator->nextFasta() || shift @failed){ 452 451 $tier = Process::MpiTiers->new({fasta => $fasta, 453 452 CTL_OPT => \%CTL_OPT, bin/gff3_merge
r127 r151 1 #! /usr/ local/perl -w1 #! /usr/bin/perl -w 2 2 3 3 use strict; … … 11 11 my $outfile; 12 12 13 GetOptions ("datastor|d " => \$datastore,13 GetOptions ("datastor|d=s" => \$datastore, 14 14 "i=s" => \@files, 15 15 "o=s" => \$outfile, … … 17 17 ); 18 18 19 if( ! $datastore && ! @files){19 if((! $datastore && ! @files) || ! $outfile){ 20 20 print $usage; 21 21 exit(); … … 26 26 open(IN, "< $datastore"); 27 27 28 @files = <IN>; 29 @files = grep (/FINISHED/, @files); 28 #uniq the entries 29 my %entries; 30 @entries{@{[<IN>]}} = (); 31 32 foreach my $e (keys %entries){ 33 next unless ($e =~ /FINISHED/); 34 chomp $e; 35 my ($id, $dir, $status) = split("\t", $e); 36 push(@files, $dir); 37 } 30 38 31 39 foreach my $file (@files){ … … 35 43 } 36 44 45 @files = sort @files; 46 47 open(my $GFF, "> $outfile"); 48 print $GFF "\#\#gff-version 3\n"; 49 37 50 my ($ANN, $ann_file) = tempfile(); 38 print $ANN "\#\#gff-version 3\n";39 51 my ($FAS, $fas_file) = tempfile(); 40 52 print $FAS "\#\#FASTA\n"; 41 53 54 my %uniq; 42 55 foreach my $file (@files){ 43 56 die "ERROR: The file \'$file\' does not exist\n" if (! -e $file); … … 48 61 while (defined(my $line = <IN>)){ 49 62 next if ($line =~ /^\#\#gff-version 3/); 63 if($line =~ /^\#\#genome-build/){ 64 next if exists $uniq{$line}; 65 $uniq{$line}++; 66 print $GFF $line; 67 next; 68 } 69 if($line =~ /^\#\#sequence-region/){ 70 die "ERROR: This contig has already been added\: $line\n" if exists $uniq{$line}; 71 $uniq{$line}++; 72 print $GFF $line; 73 next; 74 } 50 75 if ($line =~ /^\#\#FASTA/){ 51 76 $FH = $FAS; … … 56 81 } 57 82 } 58 83 close($GFF); 59 84 close($ANN); 60 85 close($FAS); 61 86 62 system (" mv $ann_file$outfile");87 system ("cat $ann_file >> $outfile"); 63 88 system ("cat $fas_file >> $outfile"); 89 unlink("$ann_file"); 64 90 unlink("$fas_file"); bin/maker
r146 r151 7 7 use lib "$FindBin::Bin/../lib"; 8 8 use vars qw($LOG $RANK); 9 use Error qw(:warndie); 9 10 10 BEGIN{ 11 11 $RANK = 'non_mpi'; #default value of rank … … 24 24 }; 25 25 26 #supress warnings from storable module 27 $SIG{'__WARN__'} = sub { 28 warn $_[0] if ( $_[0] !~ /Not a CODE reference/ && 29 $_[0] !~ /Can\'t store item CODE/ 30 ); 31 }; 32 26 33 #output to log file of seq that caused rank to die 27 34 $SIG{'__DIE__'} = sub { 28 if (defined ($LOG) && defined$_[0]) {35 if (defined ($LOG) && $_[0]) { 29 36 my $die_count = $LOG->get_die_count(); 30 37 $die_count++; … … 514 521 $pred_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, 515 522 $q_seq_ref, 516 ' repeat'523 'pred' 517 524 ); 518 525 } lib/Bio/Search/Hit/PhatHit/Base.pm
r127 r151 462 462 $hstr{ $h }++; 463 463 } 464 my $qstr = join( '/', sort keys %qstr); 465 my $hstr = join( '/', sort keys %hstr); 464 #changed 2/12/2009 to act like current version of Bioperl 465 my ($qstr) = sort {$qstr{$b} <=> $qstr{$a}} keys %qstr; 466 my ($hstr) = sort {$hstr{$b} <=> $hstr{$a}} keys %hstr; 466 467 if ($seqType =~ /list|array/i) { 467 468 return ($qstr, $hstr); lib/CGL/TranslationMachine.pm
r89 r151 179 179 ################################################ subroutine header begin ## 180 180 181 =head2 longest_translation 182 183 Usage : 184 185 =for example 186 $seq = "atgaaaaaauaa"; 187 188 =for example begin 189 190 use CGL::TranslationMachine; 191 my $t = new CGL::TranslationMachine; 192 my ($longest_orf,$offset) = $t->longest_translation($seq); 193 194 =for example end 195 196 =for example_testing 197 is($longest_orf, "MKK", "Did it find the right orf?"); 198 is($offset, 0, "Did it get the offset right?"); 199 200 Purpose : Find the longest open reading frame in any of three 201 frames in the input sequence. 202 Returns : A list of the amino acid sequence of the longest 203 open reading frame and the offset at which it begins in 204 the input sequence. 205 Arguments : A translatable sequence. 206 Throws : 207 Comments : 208 : 209 See Also : Bio::Tools::CodonTable 210 211 =cut 212 213 ################################################## subroutine header end ## 214 215 sub longest_translation_plus_stop { 216 my $self = shift; 217 my $seq = shift; 218 my $seqlength = length($seq); 219 220 my ($longest,$longest_len,$offset,@frames) = ('',0,0); 221 for( my $i = 0; $i < FRAMES; $i++ ) { 222 my $counter = $i; 223 foreach my $orf ( split(/\*/,$self->translate(substr($seq,$i))) ) { 224 my $orflen = length($orf); 225 if( $orflen > $longest_len ) { 226 $longest = $orf; 227 $offset = $counter; 228 $longest_len = $orflen; 229 my $c_start = ($orflen * 3) + $offset; 230 my $codon = ($c_start < $seqlength) ? substr($seq, $c_start, 3) : ''; 231 if(length($codon) == 3 && $self->translate($codon) eq '*'){ 232 $longest .= '*'; 233 } 234 } 235 # multi by 3 for aa->codon 236 # 1 for the stop codon (included 237 # in seq but not in length) 238 $counter += ($orflen + 1)*3; 239 } 240 } 241 242 return ($longest,$offset); 243 } 244 245 ################################################ subroutine header begin ## 246 181 247 =head2 translate_from_offset 182 248 lib/Dumper/GFF/GFFV3.pm
r146 r151 10 10 use PostData; 11 11 use PhatHit_utils; 12 use File::Copy; 12 13 13 14 @ISA = qw( … … 113 114 114 115 system("cat $seq_f >> $ann_f"); 115 unlink( "$seq_f");116 system("mv $ann_f $gff_f");116 unlink($seq_f); 117 move($ann_f, $gff_f); 117 118 118 119 die "ERROR: GFF3 file not created\n" if(! -e $gff_f); … … 440 441 my $type; 441 442 if ($class =~ /^blastx$/i){ 442 $type = $k eq 'hit' ? 'protein_match' : 'match_part'; 443 }elsif ($class =~ /^tblastx$/i){ 444 $type = $k eq 'hit' ? 'translated_nucleotide_match' : 'match_part'; 445 } 446 elsif ($class =~ /protein2genome/i){ 443 $type = $k eq 'hit' ? 'protein_match' : 'match_part'; 444 } 445 elsif ($class =~ /^protein2genome$/i){ 447 446 $type = $k eq 'hit' ? 'protein_match' : 'match_part'; 448 447 } 449 elsif ($class =~ /est2genome/i){ 450 $type = $k eq 'hit' ? 'expressed_sequence_match' : 'match_part'; 448 elsif($class =~ /^protein_gff\:/i){ 449 $type = $k eq 'hit' ? 'protein_match' : 'match_part'; 450 } 451 elsif ($class =~ /^tblastx$/i){ 452 $type = $k eq 'hit' ? 'translated_nucleotide_match' : 'match_part'; 453 } 454 elsif ($class =~ /^altest_gff/i){ 455 $type = $k eq 'hit' ? 'translated_nucleotide_match' : 'match_part'; 456 } 457 elsif ($class =~ /^est2genome$/i){ 458 $type = $k eq 'hit' ? 'expressed_sequence_match' : 'match_part'; 451 459 } 452 460 elsif ($class =~ /^blastn$/i){ 453 $type = $k eq 'hit' ? 'expressed_sequence_match' : 'match_part' ; 454 } 455 elsif ($class =~ /snap/i){ 456 $type = $k eq 'hit' ? 'match' : 'match_part' ; 457 $class= 'snap'; 458 } 459 elsif ($class =~ /augustus/i){ 460 $type = $k eq 'hit' ? 'match' : 'match_part' ; 461 $class= 'augustus'; 462 } 463 elsif ($class =~ /fgenesh/i){ 464 $type = $k eq 'hit' ? 'match' : 'match_part' ; 465 $class= 'fgenesh'; 466 } 467 elsif ($class =~ /twinscan/i){ 468 $type = $k eq 'hit' ? 'match' : 'match_part' ; 469 $class= 'twinscan'; 470 } 471 elsif ($class =~ /repeat/i){ 472 $type = $k eq 'hit' ? 'match' : 'match_part'; 473 $class= 'repeatmasker'; 461 $type = $k eq 'hit' ? 'expressed_sequence_match' : 'match_part' ; 462 } 463 elsif ($class =~ /^est_gff\:/i){ 464 $type = $k eq 'hit' ? 'expressed_sequence_match' : 'match_part'; 465 } 466 elsif ($class =~ /^snap$/i){ 467 $type = $k eq 'hit' ? 'match' : 'match_part' ; 468 } 469 elsif ($class =~ /^augustus$/i){ 470 $type = $k eq 'hit' ? 'match' : 'match_part' ; 471 } 472 elsif ($class =~ /^fgenesh$/i){ 473 $type = $k eq 'hit' ? 'match' : 'match_part' ; 474 } 475 elsif ($class =~ /^twinscan$/i){ 476 $type = $k eq 'hit' ? 'match' : 'match_part' ; 477 } 478 elsif ($class =~ /^jigsaw$/i){ 479 $type = $k eq 'hit' ? 'match' : 'match_part' ; 480 } 481 elsif ($class =~ /^pred_gff\:/i){ 482 $type = $k eq 'hit' ? 'match' : 'match_part' ; 483 } 484 elsif ($class =~ /^repeat_gff\:/i){ 485 $type = $k eq 'hit' ? 'match' : 'match_part'; 486 } 487 elsif ($class =~ /^repeat/i){ 488 $type = $k eq 'hit' ? 'match' : 'match_part'; 474 489 } 475 490 else { 476 die "unknown class in GFFV3::get_class_and_type $class ".ref($h)."\n";491 die "unknown class in GFFV3::get_class_and_type $class ".ref($h)."\n"; 477 492 } 478 493 … … 686 701 push(@data, $seq_id, 'maker', 'mRNA', $t_b, $t_e, '.', $t_s, '.'); 687 702 my $nine = 'ID='.$t_id.';Parent='.$g_id.';Name='.$t_name; 688 $nine .= ' ;aed='.$AED.'eval_score='.$score;703 $nine .= ''; 689 704 $nine .= ';'.$t_hit->{-attrib} if($t_hit->{-attrib}); 690 705 push(@data, $nine); lib/Fasta.pm
r143 r151 149 149 150 150 my $def = Fasta::getDef($fasta_ref); 151 my $seq_id = Def2SeqID($def);151 my $seq_id = def2SeqID($def); 152 152 153 153 return $seq_id; lib/GFFDB.pm
r144 r151 371 371 372 372 $data[1] = "$tag:$data[1]" if($tag && $data[1] !~ /^$tag\:/); 373 $data[6] = '+' if($data[6] =~ /^\.$|^0$/); #fixes some repeat entries 373 374 374 375 my %l = (seqid => $data[0], … … 443 444 } 444 445 elsif($h_type eq 'pred'){ 445 $structs = _get_structs($features, $seq_ref); 446 $structs = _get_genes($features, $seq_ref); 447 my $structs2 = _get_structs($features, $seq_ref); 448 push(@$structs, @$structs2); 446 449 } 447 450 elsif($h_type eq 'other'){ … … 600 603 601 604 my $tran_id = $t->{id}; 602 my $tran_name = $t->{name};; 603 $tran_name =~ s/\s*QI\:([\d\|\.-]+)$//; 605 my $tran_name = $t->{name}; 606 ($tran_name) = $tran_name =~ /(^[^\s]+)/; 607 $tran_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; 604 608 605 609 my $description = "g_name=$gene_name;g_id=$gene_id;t_name=$tran_name;t_id=$tran_id" unless(! $gene_name); … … 678 682 my $hit_strand = 1; 679 683 my $hit_name = $t->{name}; 680 $hit_name =~ s/\s*QI\:([\d\|\.-]+)$//; 681 684 ($hit_name) = $hit_name =~ /(^[^\s]+)/; 685 $hit_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; 686 682 687 foreach my $e (@{$t->{cdss}}){ 683 688 my @args; … … 786 791 my $hit_strand = 1; 787 792 my $hit_name = $t->{name}; 788 $hit_name =~ s/\s*QI\:([\d\|\.-]+)$//;789 793 ($hit_name) = $hit_name =~ /(^[^\s]+)/; 794 $hit_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; 790 795 791 796 foreach my $e (@{$t->{exons}}){ … … 907 912 my $cdss = _grab(['CDS'], $features); 908 913 my $mRNAs = _grab(['mRNA'], $features); 914 my $UTRs = _grab(['five_prime_UTR', 'three_prime_UTR'], $features); 909 915 910 916 foreach my $p_id (keys %{$mRNAs}){ … … 926 932 my $id=_get_annotation($f,'ID'); 927 933 my $name=_get_annotation($f,'Name'); 934 935 #take care of wormbases incorrect parentage 936 if(exists $exons->{$id}){ 937 foreach my $mRNA (@{$mRNAs->{$id}}){ 938 my $t_id = $mRNA->{id}; 939 $mRNA->{exons} = _fix_wormbase($exons->{$id}, $cdss->{$t_id}, $UTRs->{$t_id}); 940 } 941 } 928 942 929 943 push(@genes, {'f' => $f, … … 949 963 } 950 964 #------------------------------------------------------------------------------- 965 #try and discover the exon parantage for wormbase entries 966 sub _fix_wormbase { 967 my $exons = shift || []; 968 my $cdss = shift || []; 969 my $UTRs = shift || []; 970 971 my @keepers; 972 foreach my $exon (@$exons){ 973 my $e = $exon->{f}; 974 my $eB = $e->start; 975 my $eE = $e->end; 976 977 my $ok = 0; 978 my $okB = 0; 979 my $okE = 0; 980 981 #check if exon goes with this CDS 982 foreach my $piece (@$cdss){ 983 my $p = $piece->{f}; 984 my $pB = $p->start; 985 my $pE = $p->end; 986 987 if($pB == $eB && $pE == $eE){ 988 $ok = 1; 989 } 990 elsif($pB == $eB && $pE != $eE){ 991 $okB = 1; 992 } 993 elsif($pB != $eB && $pE == $eE){ 994 $okE = 1; 995 } 996 997 last if($ok); 998 } 999 1000 if($ok){ 1001 push(@keepers, $exon); 1002 next; 1003 } 1004 1005 #check if exon goes with or is completed by this UTR 1006 foreach my $piece (@$UTRs){ 1007 my $p = $piece->{f}; 1008 my $pB = $p->start; 1009 my $pE = $p->end; 1010 1011 if($pB == $eB && $pE == $eE){ 1012 $ok = 1; 1013 } 1014 elsif($pB == $eB && $pE != $eE){ 1015 $okB = 1; 1016 $ok = 1 if($okE); 1017 } 1018 elsif($pB != $eB && $pE == $eE){ 1019 $okE = 1; 1020 $ok = 1 if($okB); 1021 } 1022 1023 last if($ok); 1024 } 1025 1026 if($ok){ 1027 push(@keepers, $exon); 1028 next; 1029 } 1030 } 1031 1032 return \@keepers; 1033 } 1034 #------------------------------------------------------------------------------- 951 1035 #try too build a sructure similar to that producd by _get_genes 952 1036 #but for non gene hits in the gff3 … … 959 1043 foreach my $f (@{$features}){ 960 1044 my $tag_t = $f->primary_tag(); 1045 1046 next if($tag_t =~ /^gene$|^mRNA$|^exon$|^CDS$/); 1047 961 1048 my $id = _get_annotation($f,'ID'); 962 1049 my $name = _get_annotation($f, 'Name'); … … 1266 1353 1267 1354 my @sorted; 1268 if ($t->{f}->strand() == 1){ 1355 1356 if ($t->{f}->strand() == 1){ 1269 1357 @sorted = 1270 1358 sort {$a->{f}->start <=> $b->{f}->start} @{$t->{exons}}; lib/GI.pm
r146 r151 13 13 use URI::Escape; 14 14 use File::Path; 15 use File::Copy; 15 16 use Data::Dumper; 16 17 use Getopt::Long; … … 1046 1047 sub build_fasta_index { 1047 1048 my $db = shift; 1048 print "\n\n".$db."\n\n";1049 1049 my $index = new Bio::DB::Fasta($db); 1050 1050 return $index; … … 1069 1069 1070 1070 1071 if ($command =~ /xdformat $/) {1071 if ($command =~ /xdformat/) { 1072 1072 if (($type eq 'blastn' && ! -e $file.'.xnd') || 1073 1073 ($type eq 'blastx' && ! -e $file.'.xpd') || … … 1083 1083 } 1084 1084 } 1085 elsif ($command =~ /formatdb $/) {1085 elsif ($command =~ /formatdb/) { 1086 1086 if (($type eq 'blastn' && ! -e $file.'.nsq') || 1087 1087 ($type eq 'blastx' && ! -e $file.'.psq') || … … 1098 1098 } 1099 1099 else { 1100 die "ERROR: databases can only be formated by xdformat or formatdb \n";1100 die "ERROR: databases can only be formated by xdformat or formatdb not \'$command\'\n"; 1101 1101 } 1102 1102 } … … 1142 1142 #copy db to local tmp dir and run xdformat or formatdb 1143 1143 if (! @{[<$tmp_db.xn?*>]} && (! -e $blast_finished || $opt_f) ) { 1144 system("cp $db $tmp_db");1144 copy($db, $tmp_db); 1145 1145 dbformat($formater, $tmp_db, 'blastn'); 1146 1146 } … … 1391 1391 #copy db to local tmp dir and run xdformat or format db 1392 1392 if (! @{[<$tmp_db.xp?*>]} && (! -e $blast_finished || $opt_f) ) { 1393 system("cp $db $tmp_db");1394 dbformat($formater, $tmp_db, 'blastx');1393 copy($db, $tmp_db); 1394 dbformat($formater, $tmp_db, 'blastx'); 1395 1395 } 1396 1396 elsif (-e $blast_finished && ! $opt_f) { … … 1652 1652 #copy db to local tmp dir and run xdformat or formatdb 1653 1653 if (! @{[<$tmp_db.xn?*>]} && (! -e $blast_finished || $opt_f) ) { 1654 system("cp $db $tmp_db");1654 copy($db, $tmp_db); 1655 1655 dbformat($formater, $tmp_db, 'tblastx'); 1656 1656 } … … 2495 2495 print OUT "fgenesh:$O{fgenesh} #location of fgenesh executable\n"; 2496 2496 # print OUT "twinscan:$O{twinscan} #location of twinscan executable\n"; 2497 print OUT "fathom:$O{fathom} #location of fathom executable\n";2497 # print OUT "fathom:$O{fathom} #location of fathom executable\n"; 2498 2498 print OUT "\n"; 2499 2499 print OUT "#-----Other Algorithms\n"; … … 2514 2514 print OUT "eva_hspmax:$O{eva_hspmax}\n"; 2515 2515 print OUT "eva_gspmax:$O{eva_gspmax}\n"; 2516 print OUT "enable_fathom:$O{enable_fathom}\n";2516 # print OUT "enable_fathom:$O{enable_fathom}\n"; 2517 2517 close (OUT); 2518 2518 } lib/Process/MpiChunk.pm
r146 r151 5 5 use strict; 6 6 7 use Error qw(:try );7 use Error qw(:try :warndie); 8 8 use Error::Simple; 9 9 use Storable; … … 623 623 q_def 624 624 masked_total_seq 625 fasta 625 626 LOG 626 627 CTL_OPT} … … 633 634 my $q_def = $VARS->{q_def}; 634 635 my $masked_total_seq = $VARS->{masked_total_seq}; 636 my $fasta = $VARS->{fasta}; 635 637 my $the_void = $VARS->{the_void}; 636 638 my $safe_seq_id = $VARS->{safe_seq_id}; … … 640 642 my $masked_fasta = Fasta::toFasta($q_def.' masked', \$masked_total_seq); 641 643 my $masked_file = $the_void."/query.masked.fasta"; 642 FastaFile::writeFile(\$masked_fasta ,$masked_file);644 FastaFile::writeFile(\$masked_fasta, $masked_file); 643 645 646 my $unmasked_file = $the_void."/query.fasta"; 647 FastaFile::writeFile(\$fasta, $unmasked_file); 648 644 649 #==ab initio predictions here 645 my $preds = GI::abinits($masked_file, 646 $the_void, 647 $safe_seq_id, 648 \%CTL_OPT, 649 $LOG 650 ); 651 650 my $masked_preds = GI::abinits($masked_file, 651 $the_void, 652 $safe_seq_id, 653 \%CTL_OPT, 654 $LOG 655 ); 656 657 my $preds = GI::abinits($unmasked_file, 658 $the_void, 659 $safe_seq_id, 660 \%CTL_OPT, 661 $LOG 662 ); 663 664 push(@$preds, @$masked_preds); 665 652 666 #==QRNA noncoding RNA prediction here 653 667 my $qra_preds = []; … … 1156 1170 $pred_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, 1157 1171 $q_seq_ref, 1158 ' repeat'1172 'pred' 1159 1173 ); 1160 1174 } … … 1669 1683 1670 1684 #-- write to DS log the finished files 1671 $DS_CTL->add_entry($seq_id, $out_dir, 'FINISHED');1685 #$DS_CTL->add_entry($seq_id, $out_dir, 'FINISHED'); 1672 1686 1673 1687 #--- clear the log variable lib/Widget/augustus.pm
r127 r151 359 359 $this_gene{score} = $score; 360 360 } 361 if ($fields[1] eq 'AUGUSTUS' && $fields[2] eq 'CDS' ){361 elsif ($fields[1] eq 'AUGUSTUS' && $fields[2] eq 'CDS' ){ 362 362 my $strand = $fields[6] eq '+' ? 1 : -1; 363 363 … … 394 394 }; 395 395 396 }397 elsif ($fields[1] eq 'AUGUSTUS' && $fields[2] eq 'gene'){398 396 } 399 397 elsif ($fields[1] eq 'AUGUSTUS' && $fields[2] eq 'transcript'){ … … 456 454 my @stuff = split(/\n/, $line); 457 455 456 my $t_count = grep {/^[^\#]/ && /^[^\s]+\s+AUGUSTUS\s+transcript\s+/} @stuff; 457 458 #augustus can call multiple transcripts for inconsistent hints 459 if($t_count > 1){ 460 my @multi_stuff; 461 my $count; 462 foreach my $l (@stuff){ 463 if($l =~ /^\#/){ #maybe important someday 464 foreach my $s (@multi_stuff){ 465 push(@$s, $l); 466 } 467 } 468 elsif($l =~ /^[^\s]+\s+AUGUSTUS\s+gene\s+/){ 469 foreach my $s (@multi_stuff){ 470 push(@$s, $l); 471 } 472 } 473 elsif($l =~ /^[^\s]+\s+AUGUSTUS\s+transcript\s+/){ 474 $count++; 475 push(@{$multi_stuff[$count-1]}, $l); 476 } 477 elsif($l =~ /^[^\s]+\s+AUGUSTUS\s+[^\s]+\s+/){ 478 push(@{$multi_stuff[$count-1]}, $l); 479 } 480 } 481 482 foreach my $s (@multi_stuff){ 483 my $gene = parse_gene($s); 484 push(@genes, $gene); 485 } 486 next; 487 } 488 458 489 my $gene = parse_gene(\@stuff); 459 490 push(@genes, $gene); lib/clean.pm
r127 r151 19 19 #------------------------------------------------------------------------ 20 20 #--------------------------- FUNCTIONS ---------------------------------- 21 #------------------------------------------------------------------------ 22 sub purge_short_single_exons { 23 my $phat_hits = shift; 24 my $min = shift; 25 26 my @keepers; 27 foreach my $hit (@{$phat_hits}){ 28 if ($hit->num_hsps == 1){ 29 push(@keepers, $hit) if($hit->{_hsps}->[0]->length('query') >= $min); 30 } 31 else{ 32 push(@keepers, $hit); 33 } 34 } 35 36 return \@keepers 37 } 21 38 #------------------------------------------------------------------------ 22 39 sub throw_out_bad_splicers { … … 58 75 sub alt_sort { 59 76 $b->num_hsps <=> $a->num_hsps || $b->length <=> $a->length; 77 } 78 #------------------------------------------------------------------------ 79 sub is_identical_form { 80 my $hit1 = shift; 81 my $hit2 = shift; 82 83 my @hsps1 = sort {$a->start('query') <=> $b->start('query')} $hit1->hsps; 84 my @hsps2 = sort {$a->start('query') <=> $b->start('query')} $hit2->hsps; 85 86 return if (@hsps1 != @hsps2); 87 88 for(my $i = 0; $i < @hsps1; $i++){ 89 return if($hsps1[$i]->start('query') != $hsps2[$i]->start('query')); 90 return if($hsps1[$i]->end('query') != $hsps2[$i]->end('query')); 91 } 92 93 return 1; 60 94 } 61 95 #------------------------------------------------------------------------ lib/cluster.pm
r127 r151 443 443 my $i = 0; 444 444 foreach my $c (keys %{$map}){ 445 next if(! defined $map->{$c}); 445 446 foreach my $m (@{$map->{$c}}){ 446 447 my $hit = $lookup{$m}; lib/evaluator/funs.pm
r130 r151 493 493 494 494 my $seq = $box->{transcription_seq}; 495 my $length = length($$seq); 496 495 497 my $first_codon = uc(substr($$seq, $box->{translation_offset}, 3)); 496 498 my $last_codon = uc(substr($$seq, $box->{translation_end}-4, 3)); lib/evaluator/quality_index.pm
r130 r151 258 258 259 259 my $l_trans = get_length_protein($t); 260 ; 260 261 261 my ($length_5, $length_3) = get_length_of_utrs($t); 262 262 lib/maker/auto_annotator.pm
r147 r151 24 24 use evaluator::AED; 25 25 use shadow_AED; 26 use Storable qw(dclone); 27 28 $Storable::forgive_me = 1; 26 29 27 30 @ISA = qw( … … 44 47 #only ESTs from splice concious algorithms i.e. no blastn 45 48 my $clean_est = get_selected_types($est_hits,'est2genome', 'est_gff'); 49 my $clean_altest = $alt_est_hits; 46 50 if ($single_exon == 1) { 47 $clean_est = $clean_est;51 #do nothing 48 52 }else { 49 53 # don't use unpliced single exon ESTs-- usually genomic contamination 50 54 $clean_est = clean::purge_single_exon_hits($clean_est); 55 $clean_altest = clean::purge_single_exon_hits($clean_altest); 51 56 } 52 57 53 58 # throw out the exonerate est hits with weird splice sites 54 59 $clean_est = clean::throw_out_bad_splicers($clean_est, $seq); 55 56 60 57 61 # combine puts type in order they are given … … 63 67 $prot_hits, 64 68 $clean_est, 65 $ alt_est_hits69 $clean_altest 66 70 ); 67 71 … … 70 74 my $p_clusters = cluster::shadow_cluster(0, $seq, $p, 10); 71 75 my $m_clusters = cluster::shadow_cluster(0, $seq, $m, 10); 76 77 #purge after clustering so as to still have the effect of evidence joining ESTs 78 $p_clusters = purge_short_ESTs_in_clusters($p_clusters, 250); 79 $m_clusters = purge_short_ESTs_in_clusters($m_clusters, 250); 80 72 81 my $careful_clusters = []; 73 82 push(@{$careful_clusters}, @{$p_clusters}); 74 83 push(@{$careful_clusters}, @{$m_clusters}); 75 84 76 85 # identify the ab-inits that fall within and between clusters 77 86 my ($c_index, $hit_one, $hit_none, $hit_mult) = segment_preds($predictions, … … 83 92 } 84 93 94 #--new clusters built by merging clusters that overlap preds 95 #--used for ab-initio coring 96 #--must be ran before merge as merge alters cluster content 97 my $pred_clusters = join_clusters_on_pred($predictions, $careful_clusters, $c_index); 98 99 #--split preds across clusters that they overlap 85 100 merge_into_cluster($hit_one, $careful_clusters, $c_index); 86 101 merge_into_cluster($hit_mult, $careful_clusters, $c_index); 87 102 103 #--data prep 88 104 my $c_id = 0; 89 105 my @bx_data; … … 100 116 $c_id++; 101 117 } 102 #--- end c_bag103 118 104 119 #--- processing for scoring ab-initio predictions 105 120 my @pr_data; 106 if(@{$predictions}){ 107 my $p_bag = combine($prot_hits, 108 $clean_est, 109 $alt_est_hits, 110 $predictions 111 ); 112 113 ($p, $m, $x, $z) = PhatHit_utils::seperate_by_strand('query', $p_bag); 114 $p_clusters = cluster::shadow_cluster(0, $seq, $p, 10); 115 $m_clusters = cluster::shadow_cluster(0, $seq, $m, 10); 116 my $careful_p_clusters = []; 117 push(@{$careful_p_clusters}, @{$p_clusters}); 118 push(@{$careful_p_clusters}, @{$m_clusters}); 119 120 foreach my $c (@{$careful_p_clusters}){ 121 my $pr = prep_pred_data($c, $c_id, $seq); 122 push(@pr_data, @{$pr}) if defined $pr; 123 124 $c_id++; 125 } 126 } 127 #--- end p_bag 121 foreach my $c (@{$pred_clusters}){ 122 my $pr = prep_pred_data($c, $c_id, $seq); 123 push(@pr_data, @{$pr}) if defined $pr; 124 125 $c_id++; 126 } 128 127 129 128 return (\@bx_data, \@gf_data, \@pr_data); … … 229 228 } 230 229 } 230 } 231 #------------------------------------------------------------------------ 232 sub join_clusters_on_pred { 233 my $hits = shift; 234 my $clusters = shift; 235 my $c_index = shift; 236 237 my @p_clusters; 238 239 foreach my $s (@{$hits}, ){ 240 my @keys = keys %{$c_index->{$s->{temp_id}}}; 241 my @new_c = ($s); 242 foreach my $i (@keys){ 243 push(@new_c, @{$clusters->[$i]}); 244 } 245 push(@p_clusters, \@new_c); 246 } 247 248 return \@p_clusters; 231 249 } 232 250 #------------------------------------------------------------------------ … … 252 270 # } 253 271 #------------------------------------------------------------------------ 272 sub purge_short_ESTs_in_clusters{ 273 my $clusters = shift; 274 my $min = shift; 275 276 my @c_keepers; 277 foreach my $c (@$clusters){ 278 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff'); 279 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 280 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); 281 my $alt_ests_in_cluster = get_selected_types($c,'tblastx', 'altest_gff'); 282 my $models_in_cluster = get_selected_types($c,'model_gff', 'maker'); 283 my $preds_in_cluster = get_selected_types($c,'snap', 'augustus', 'fgenesh', 'twinscan', 'pred_gff'); 284 285 $ests_in_cluster = clean::purge_short_single_exons($ests_in_cluster, $min); 286 $alt_ests_in_cluster = clean::purge_short_single_exons($alt_ests_in_cluster, $min); 287 288 my @new_c; 289 push(@new_c, @$ests_in_cluster); 290 push(@new_c, @$alt_ests_in_cluster); 291 push(@new_c, @$ps_in_cluster); 292 push(@new_c, @$bx_in_cluster); 293 push(@new_c, @$models_in_cluster); 294 295 if(@new_c){ 296 push(@new_c, @$preds_in_cluster); 297 push(@c_keepers, \@new_c); 298 } 299 } 300 301 return \@c_keepers; 302 } 303 #------------------------------------------------------------------------ 254 304 #returns an array of hashes with the following atributes 255 305 #ests => set of all ests … … 265 315 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 266 316 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); 267 my $alt_ests_in_cluster = get_selected_types($c,'tblastx' );317 my $alt_ests_in_cluster = get_selected_types($c,'tblastx', 'altest_gff'); 268 318 my $models_in_cluster = get_selected_types($c,'model_gff', 'maker'); 269 319 my $preds_in_cluster = get_selected_types($c,'snap', 'augustus', 'fgenesh', 'twinscan', 'pred_gff'); 270 320 271 my @single = grep {! $_->{_hit s_multi}} @{$preds_in_cluster};321 my @single = grep {! $_->{_hit_multi}} @{$preds_in_cluster}; 272 322 273 323 # groups of most informative protein hits … … 327 377 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 328 378 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); 329 my $alt_ests_in_cluster = get_selected_types($c,'tblastx' );379 my $alt_ests_in_cluster = get_selected_types($c,'tblastx', 'altest_gff'); 330 380 my $preds_in_cluster = get_selected_types($c,'snap', 'augustus', 'fgenesh', 'twinscan', 'pred_gff'); 331 381 332 my @single = grep {! $_->{_hit s_multi}} @{$preds_in_cluster};382 my @single = grep {! $_->{_hit_multi}} @{$preds_in_cluster}; 333 383 334 384 # groups of most informative protein hits … … 368 418 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 369 419 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); 370 my $alt_ests_in_cluster = get_selected_types($c,'tblastx' );420 my $alt_ests_in_cluster = get_selected_types($c,'tblastx', 'altest_gff'); 371 421 my $preds_in_cluster = get_selected_types($c,'snap', 'augustus', 'fgenesh', 'twinscan', 'pred_gff'); 372 422 … … 514 564 } 515 565 516 #--- gene prediction here566 #---hint based gene prediction here 517 567 foreach my $prdr (@{$CTL_OPTIONS->{_predictor}}){ 518 next if($prdr eq 'gff' );568 next if($prdr eq 'gff' || $prdr eq 'abinit'); 519 569 my $transcripts = run_it($bx_data, 520 570 $the_void, … … 538 588 539 589 $annotations{$prdr} = $annot; 540 push(@bag, @{$annot});541 590 } 542 591 … … 562 611 ); 563 612 564 #identify overlapping and non-overlapping abinits 565 my ($hit_some, $hit_none) = abinit_annotation_overlap($all_ab, \@bag); 566 567 $annotations{'abinit'} = $hit_some; #only overlapping abinits 568 $annotations{'non_overlap'} = $hit_none; #only non overlapping 613 $annotations{'abinit'} = $all_ab; 569 614 570 615 return \%annotations; 616 } 617 #------------------------------------------------------------------------ 618 sub get_n_count{ 619 my $g = shift; 620 621 my ($start, $end) = ($g->{g_start}, $g->{g_end}); 622 my $offset = $start; 623 my $length = $end - $start + 1; 624 my @b_seq = map {0} (1..$length); 625 626 foreach my $s (@{$g->{t_structs}}){ 627 my $hit = $s->{hit}; 628 my @hsps = $hit->hsps() if defined $hit->hsps(); 629 foreach my $hsp (@hsps){ 630 my $s = $hsp->start('query') - $offset; 631 my $e = $hsp->end('query') - $offset; 632 633 #array space coors 634 die "ERROR: Start value not permited!!\n" if($s >= $length || $s < 0); 635 die "ERROR: End value not permited!!\n" if($e < 0 || $e >= $length); 636 637 @b_seq[$s..$e] = map {1} ($s..$e); 638 } 639 } 640 641 my @index = (0, 0); 642 643 foreach my $i (@b_seq){ 644 $index[$i]++; 645 } 646 647 return $index[1]; 571 648 } 572 649 #------------------------------------------------------------------------ … … 579 656 my $CTL_OPTIONS = shift; 580 657 658 return $annotations->{gff} 659 if(@{$CTL_OPTIONS->{_predictor}} == 1 && grep {/^gff$/} @{$CTL_OPTIONS->{_predictor}}); 660 581 661 my %order; #this is important for sorting 582 my @p_list; 583 my @m_list; 662 my $p_list = []; 663 my $m_list = []; 664 my $p_merge = []; 665 my $m_merge = []; 584 666 my $i = @{$CTL_OPTIONS->{_predictor}}; 585 667 foreach my $p (@{$CTL_OPTIONS->{_predictor}}){ … … 588 670 foreach my $g (@{$annotations->{$p}}){ 589 671 if($g->{g_strand} == 1){ 590 push(@p_list, $g); 672 next unless($g->{AED} < 1 || $p eq 'gff'); 673 push(@$p_list, $g); 674 push(@$p_merge, $g) if($g->{t_structs}->[0]->{hit}->{_hit_multi}); 591 675 } 592 676 elsif($g->{g_strand} == -1){ 593 push(@m_list, $g); 677 next unless($g->{AED} < 1 || $p eq 'gff'); 678 push(@$m_list, $g); 679 push(@$m_merge, $g) if($g->{t_structs}->[0]->{hit}->{_hit_multi}); 594 680 } 595 681 else{ … … 600 686 %SCORE = %order; #must be set for sort to work correctly 601 687 602 @p_list = sort {crit1($b) <=> crit1($a) || crit2($b) <=> crit2($a) || crit3($b) cmp crit3($a)} @p_list; 603 @m_list = sort {crit1($b) <=> crit1($a) || crit2($b) <=> crit2($a)|| crit3($b) cmp crit3($a)} @m_list; 604 605 my @p_keepers; 606 foreach my $g (@p_list){ 607 my $g_B = $g->{g_start}; 608 my $g_E = $g->{g_end}; 609 610 my $bad; 611 foreach my $k (@p_keepers){ 612 my $k_B = $k->{g_start}; 613 my $k_E = $k->{g_end}; 614 615 my $class = compare::compare($g_B, $g_E, $k_B, $k_E); 616 617 if($class ne '0'){ 618 $bad = 1; 619 last; 620 } 621 } 622 623 push(@p_keepers, $g) if(! $bad); 624 } 625 626 my @m_keepers; 627 foreach my $g (@m_list){ 688 #remove low scoring overlaping genes 689 @$p_list = sort {crit1($a) <=> crit1($b) || crit2($b) <=> crit2($a)|| crit3($b) cmp crit3($a)} @$p_list; 690 @$m_list = sort {crit1($a) <=> crit1($b) || crit2($b) <=> crit2($a)|| crit3($b) cmp crit3($a)} @$m_list; 691 $p_list = _best($p_list); 692 $m_list = _best($m_list); 693 694 #use abinit that merges clusters if average AED improves 695 my $p_m_list = []; 696 foreach my $g (@$p_merge){ 628 697 my $g_B = $g->{g_start}; 629 698 my $g_E = $g->{g_end}; 630 699 631 my $bad; 632 foreach my $k (@m_keepers){ 633 my $k_B = $k->{g_start}; 634 my $k_E = $k->{g_end}; 635 636 my $class = compare::compare($g_B, $g_E, $k_B, $k_E); 637 638 if($class ne '0'){ 639 $bad = 1; 640 last; 641 } 700 my $tot_length; 701 my $tot_AED; 702 703 foreach my $k (@$p_list){ 704 my $k_B = $k->{g_start}; 705 my $k_E = $k->{g_end}; 706 707 my $class = compare::compare($g_B, $g_E, $k_B, $k_E); 708 709 if($class ne '0'){ 710 my $nc = get_n_count($k); 711 $tot_length += $nc; 712 $tot_AED += $nc * $k->{AED} 713 } 642 714 } 643 715 644 push(@m_keepers, $g) if(! $bad); 645 } 646 647 push(@p_keepers, @m_keepers); 716 next if(!$tot_length); 717 718 my $ave_AED = $tot_AED/$tot_length; 719 720 push(@$p_m_list, $g) if($g->{AED} < $ave_AED); 721 } 722 723 my $m_m_list = []; 724 foreach my $g (@$m_merge){ 725 my $g_B = $g->{g_start}; 726 my $g_E = $g->{g_end}; 727 728 my $tot_length; 729 my $tot_AED; 730 731 foreach my $k (@$m_list){ 732 my $k_B = $k->{g_start}; 733 my $k_E = $k->{g_end}; 734 735 my $class = compare::compare($g_B, $g_E, $k_B, $k_E); 736 737 if($class ne '0'){ 738 my $nc = get_n_count($g); 739 $tot_length += $nc; 740 $tot_AED += $nc * $g->{AED} 741 } 742 } 743 744 next if(!$tot_length); 745 746 my $ave_AED = $tot_AED/$tot_length; 747 748 push(@$m_m_list, $g) if($g->{AED} < $ave_AED); 749 } 750 751 @$p_m_list = sort {crit1($a) <=> crit1($b) || crit2($b) <=> crit2($a)|| crit3($b) cmp crit3($a)} @$p_m_list; 752 @$m_m_list = sort {crit1($a) <=> crit1($b) || crit2($b) <=> crit2($a)|| crit3($b) cmp crit3($a)} @$m_m_list; 753 my $p_keepers = _best($p_m_list); 754 my $m_keepers = _best($m_m_list); 755 756 #now combine mergers with other keepers 757 push(@$p_keepers, @$p_list); 758 push(@$m_keepers, @$m_list); 759 $p_keepers = _best($p_keepers); 760 $m_keepers = _best($m_keepers); 761 push(@$p_keepers, @$m_keepers); 648 762 649 763 #write evaluator reports 650 foreach my $ann (@p_keepers){ 651 foreach my $t (@{$ann->{t_structs}}){ 652 my $dir = "$out_base/evaluator"; 653 mkdir($dir) if(! -e $dir); 654 my $file = "$dir/".Fasta::seqID2SafeID($t->{hit}->name).".eva"; 655 open(my $FH, "> $file"); 656 print $FH $t->{report}; 657 close($FH); 658 } 659 } 660 661 return \@p_keepers; 764 #foreach my $ann (@$p_keepers){ 765 # foreach my $t (@{$ann->{t_structs}}){ 766 # my $dir = "$out_base/evaluator"; 767 # mkdir($dir) if(! -e $dir); 768 # my $file = "$dir/".Fasta::seqID2SafeID($t->{hit}->name).".eva"; 769 # open(my $FH, "> $file"); 770 # print $FH $t->{report}; 771 # close($FH); 772 # } 773 #} 774 775 return $p_keepers; 776 } 777 #------------------------------------------------------------------------ 778 sub _best{ 779 my $list = shift; 780 781 my @keepers; 782 foreach my $g (@$list){ 783 my $g_B = $g->{g_start}; 784 my $g_E = $g->{g_end}; 785 786 my $bad; 787 foreach my $k (@keepers){ 788 my $k_B = $k->{g_start}; 789 my $k_E = $k->{g_end}; 790 791 my $class = compare::compare($g_B, $g_E, $k_B, $k_E); 792 793 if($class ne '0'){ 794 $bad = 1; 795 last; 796 } 797 } 798 799 push(@keepers, $g) if(! $bad); 800 } 801 802 return \@keepers; 662 803 } 663 804 #------------------------------------------------------------------------ … … 709 850 next if(! defined $model); 710 851 my $transcript = $model; 852 853 my $utr_trans; 854 if($predictor eq 'abinit' && defined($ests)){ 855 my $copy = dclone($transcript); 856 $utr_trans = pneu($ests, $copy, $seq); 857 push(@transcripts, [$utr_trans, $set, $transcript]); 858 if (! clean::is_identical_form($utr_trans, $transcript)){ 859 push(@transcripts, [$utr_trans, $set, undef]); 860 } 861 } 711 862 push(@transcripts, [$transcript, $set, undef]); 863 712 864 $i++; 713 865 next; … … 717 869 next if (! defined $mia); 718 870 my $transcript = pneu($ests, $mia, $seq); 719 push(@transcripts, [$transcript, $set, undef]) ;871 push(@transcripts, [$transcript, $set, undef]) if $transcript; 720 872 $i++; 721 873 next; 722 } 874 } 723 875 724 876 my ($pred_shots, $strand) = get_pred_shot($seq, … … 850 1002 851 1003 my $t_name = "$g_name-mRNA-$i"; #affects GFFV3.pm 852 $f->name($t_name);853 854 #my $qi = maker::quality_index::get_transcript_qi($f,$evi,$offset,$len_3_utr,$l_trans);855 1004 856 1005 #----evaluator here … … 858 1007 my $pol_e_hits = get_selected_types($evi->{ests}, 'est2genome', 'est_gff'); 859 1008 my $blastx_hits = get_selected_types($evi->{gomiph},'blastx', 'protein_gff'); 1009 my $tblastx_hits = get_selected_types($evi->{alt_ests},'tblastx', 'altest_gff'); 860 1010 my $abinits = $evi->{all_preds}; 861 1011 862 1012 my @bag = (@$pol_p_hits, 863 1013 @$pol_e_hits, 864 @$blastx_hits 1014 @$blastx_hits, 1015 @$tblastx_hits 865 1016 ); 866 1017 867 1018 #holds evalutor struct 868 my $eva = evaluator::evaluate::power_evaluate($f,869 $seq,870 $pol_p_hits,871 $pol_e_hits,872 $blastx_hits,873 $abinits,874 $so_code,875 $geneAED,876 $alt_spli_sup,877 $t_name,878 $CTL_OPTIONS,879 $the_void,880 );1019 #my $eva = evaluator::evaluate::power_evaluate($f, 1020 # $seq, 1021 # $pol_p_hits, 1022 # $pol_e_hits, 1023 # $blastx_hits, 1024 # $abinits, 1025 # $so_code, 1026 # $geneAED, 1027 # $alt_spli_sup, 1028 # $t_name, 1029 # $CTL_OPTIONS, 1030 # $the_void, 1031 # ); 881 1032 882 my $AED = $eva->{txnAED}; 883 my $score = $eva->{score}; 884 my $qi = $eva->{qi}; 1033 my $AED = shadow_AED::get_AED(\@bag, $f);#$eva->{txnAED}; 1034 1035 my $score = 0;#$eva->{score}; 1036 my $qi = maker::quality_index::get_transcript_qi($f,$evi,$offset,$len_3_utr,$l_trans);#$eva->{qi}; 1037 my $report = '';#$eva->{report}; 885 1038 #---- 886 1039 $t_name .= " AED:"; 887 1040 $t_name .= sprintf '%.2f', $AED; # two decimal places 888 1041 $t_name .= " $qi"; 1042 $f->name($t_name); #give name to hit 889 1043 890 1044 my $t_struct = {'hit' => $f, … … 897 1051 'AED' => $AED, 898 1052 'score' => $score, 899 'report' => $ eva->{report}1053 'report' => $report 900 1054 }; 901 1055 … … 1069 1223 } 1070 1224 1071 my $so_code = evaluator::so_classifier::so_code($c);1072 my $alt_spli_sup = evaluator::funs::alt_spli($c, \@pol_e_hits, $seq);1073 my $geneAED = evaluator::AED::gene_AED($c, \@pol_e_hits, \@pol_p_hits, \@blastx_hits, \@ab_inits, $seq);1225 my $so_code = '';#evaluator::so_classifier::so_code($c); 1226 my $alt_spli_sup ='';# evaluator::funs::alt_spli($c, \@pol_e_hits, $seq); 1227 my $geneAED = '';#evaluator::AED::gene_AED($c, \@pol_e_hits, \@pol_p_hits, \@blastx_hits, \@ab_inits, $seq); 1074 1228 #---- 1075 1229 1076 1230 my $score = 0; 1231 my $AED = 1; 1077 1232 my $i = 1; 1078 1233 foreach my $f (@{$c}) { … … 1082 1237 load_transcript_struct($f, $g_name, $i, $seq, $evidence, $so_code, 1083 1238 $geneAED, $alt_spli_sup, $the_void, $CTL_OPTIONS); 1239 1240 if(!$f || ! $t_struct){ 1241 sleep 1; 1242 } 1243 1084 1244 push(@t_structs, $t_struct); 1085 1245 $score = $t_struct->{score} if($t_struct->{score} > $score); 1246 $AED = $t_struct->{AED} if($t_struct->{AED} < $AED); 1086 1247 $i++; 1087 1248 } … … 1095 1256 'g_strand' => $g_strand, 1096 1257 'score' => $score, 1097 'AED' => $ geneAED,1258 'AED' => $AED,#$geneAED, 1098 1259 'predictor' => $predictor, 1099 1260 'so_code' => $so_code … … 1151 1312 } 1152 1313 } 1314 if(! @exons){ 1315 return; 1316 } 1153 1317 my @sorted_b = sort {$a->start('query') <=> $b->start('query')} @exons; 1154 1318 my @sorted_e = sort {$b->end('query') <=> $a->end('query')} @exons; 1319 1320 my $ref = ref($sorted_b[0]); 1155 1321 1156 1322 my $g_start = $sorted_b[0]->start('query'); … … 1227 1393 my $best = shift(@sorted); 1228 1394 1229 return ($best->[1], $best->[0]); 1230 1395 return ($best->[1], $best->[0]) if($best); 1231 1396 } 1232 1397 #------------------------------------------------------------------------ … … 1259 1424 my $p_seq_2 = $tM->translate_from_offset($seq, $offset); 1260 1425 1261 my ($t_seq) = $p_seq_2 =~ /(^[^\*]+ )\*?/;1426 my ($t_seq) = $p_seq_2 =~ /(^[^\*]+\*?)/; 1262 1427 1263 1428 die "logic error in auto_annotate::get_off_and_str!\n" … … 1272 1437 my $tM = new CGL::TranslationMachine(); 1273 1438 1274 my ($p_seq , $offset) = $tM->longest_translation($seq); 1439 my ($p_seq , $offset) = $tM->longest_translation_plus_stop($seq); 1440 1441 my $end = length($p_seq)*3 + $offset + 1; 1275 1442 1276 1443 $p_seq =~ s/\*$//; # added 11/21/06 1277 1444 1278 my $end = length($p_seq)*3 + $offset + 1 + 3;1279 1280 my ($trim, $leader);1281 1445 if ($p_seq =~ /^M/){ 1282 1446 # easy it begins with an M.... … … 1287 1451 my ($off_new, $p_seq_new) = get_longest_m_seq($seq); 1288 1452 1289 my $n_end = length($p_seq_new)*3 + $off_new + 1 + 31453 my $n_end = length($p_seq_new)*3 + $off_new + 1 1290 1454 if defined($p_seq_new); 1291 1455 1456 $p_seq_new =~ s/\*$// if($p_seq_new); # added 11/21/06 1292 1457 1293 1458 if (!defined($p_seq_new)){ … … 1295 1460 } 1296 1461 elsif ($offset < 3 && length($p_seq) - length($p_seq_new) > 30){ 1297 return ($p_seq , $offset, $end); 1462 return ($p_seq , $offset, $end); 1298 1463 } 1299 1464 else { lib/maker/join.pm
r127 r151 348 348 my $length = 0; 349 349 foreach my $hsp (@anno_hsps){ 350 $new_total_score += $hsp->score(); 351 $length += $hsp->length(); 350 my $score = $hsp->score(); 351 $score = 0 if( ! $score || $score eq '.' || $score eq 'NA'); 352 $new_total_score += $score; 353 $length += $hsp->length(); 352 354 } 353 355 lib/shadow_AED.pm
r146 r151 9 9 my $tran = shift; 10 10 11 return 0.5if(! @{$hits} || ! $tran);11 return 1 if(! @{$hits} || ! $tran); 12 12 13 13 my ($start, $end) = ($tran->start('query'), $tran->end('query')); … … 68 68 my $spec = $index{3}/($index{2} + $index{3}); #specificity 69 69 my $sens = $index{3}/($index{1} + $index{3}); #sensitivity 70 my $AED = ($spec + $sens)/2;70 my $AED = 1 - ($spec + $sens)/2; 71 71 72 72 return $AED;
