Changeset 249
- Timestamp:
- 08/31/09 17:07:51 (3 months ago)
- Files:
-
- bin/maker (modified) (2 diffs)
- lib/FastaDB.pm (added)
- lib/GI.pm (modified) (19 diffs)
- lib/Widget/RepeatMasker.pm (modified) (1 diff)
- lib/Widget/blastn.pm (modified) (1 diff)
- lib/Widget/blastx.pm (modified) (1 diff)
- lib/Widget/tblastx.pm (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
bin/maker
r244 r249 58 58 use Getopt::Long qw(:config no_ignore_case); 59 59 use File::Temp qw(tempfile tempdir); 60 use Bio::DB::Fasta;60 #use Bio::DB::Fasta; 61 61 use GI; 62 62 use Dumper::GFF::GFFV3; … … 253 253 254 254 next if($tier->terminated); 255 $tier->run while(! $tier->terminated && ! $tier->failed);255 $tier->run_all; 256 256 $DS_CTL->add_entry($tier->DS); 257 257 push(@failed, $tier->fasta) if ($tier->failed); lib/GI.pm
r248 r249 34 34 use PhatHit_utils; 35 35 use Shadower; 36 use Bio::DB::Fasta;36 #use Bio::DB::Fasta; 37 37 use polisher::exonerate::protein; 38 38 use polisher::exonerate::est; … … 42 42 use maker::sens_spec; 43 43 use File::NFSLock; 44 use FastaDB; 45 use Digest::MD5; 44 46 use threads; 45 47 … … 213 215 #search db index 214 216 my $fastaObj = $db_index->get_Seq_by_id($hit->name); 217 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 218 219 #still no sequence? try rebuilding the index and try again 215 220 if (not $fastaObj) { 216 #rebuild index and try again 217 my $db_file = $db_index->{dirname}."/".$db_index->fileno2path(0); 218 print STDERR "WARNING: Cannot find ".$hit->name.", trying to re-index $db_file\n"; 219 $db_index = Bio::DB::Fasta->new($db_file, '-reindex' => 1); 220 $fastaObj = $db_index->get_Seq_by_id($hit->name); 221 if (not $fastaObj) { 222 print STDERR "stop here:".$hit->name."\n"; 223 die "ERROR: Fasta index error\n"; 224 } 221 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 222 $db_index->reindex(); 223 $fastaObj = $db_index->get_Seq_by_id($hit->name); 224 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 225 if (not $fastaObj) { 226 print STDERR "stop here:".$hit->name."\n"; 227 die "ERROR: Fasta index error\n"; 228 } 225 229 } 226 230 … … 228 232 my $t_seq = $fastaObj->seq(); 229 233 my $t_def = $db_index->header($hit->name); 230 234 $t_def = $db_index->header_by_alias($hit->description) if(! defined $t_def); 235 231 236 #write fasta file 232 237 my $fasta = Fasta::toFasta('>'.$t_def, \$t_seq); … … 520 525 my $mpi_size = shift @_ || 1; 521 526 527 #always set to at least 10 for faster fasta indexing 528 $mpi_size = 10 if($mpi_size < 10); 529 522 530 my $file = $CTL_OPT->{$key}; 523 531 my $alt = $CTL_OPT->{alt_peptide} if($key =~ /protein/); … … 530 538 my $bins = $mpi_size; 531 539 $bins = $db_size if ($db_size < $bins); 532 540 533 541 my ($f_name) = $file =~ /([^\/]+)$/; 534 542 $f_name =~ s/\.fasta$//; … … 538 546 my $f_dir = "$b_dir/$d_name"; 539 547 my $t_dir = $TMP."/$d_name"; 540 my $t_full = "$t_dir\.fasta";541 my $f_full = "$b_dir/$f_name\.fasta";548 #my $t_full = "$t_dir\.fasta"; 549 #my $f_full = "$b_dir/$f_name\.fasta"; 542 550 543 551 #delete old indexes if full file does not exist 544 if(! -e $f_full && -e "$f_full.index"){545 unlink("$f_full.index");546 }552 #if(! -e $f_full && -e "$f_full.index"){ 553 # unlink("$f_full.index"); 554 #} 547 555 548 556 #check if files exist 549 if($mpi_size == 1 && -e $f_full){ #on one processor check if finished 550 return $f_full, [$f_full]; 551 } 552 elsif(-e "$f_dir"){ #on multi processors check if finished 553 my @t_db = <$f_dir/*$d_name\.*>; 557 #if($mpi_size == 1 && -e $f_full){ #on one processor check if finished 558 # return $f_full, [$f_full]; 559 #} 560 #elsif(-e "$f_dir"){ #on multi processors check if finished 561 if(-e "$f_dir"){ #on multi processors check if finished 562 my @t_db = <$f_dir/*$d_name*\.fasta>; 554 563 555 564 my @existing_db; … … 558 567 } 559 568 560 if(@existing_db == $mpi_size){ #use existing if right count 561 if(! -e $f_full){ #fix full file 562 concatenate_files(\@existing_db, $f_full); 563 } 564 565 return $f_full, \@existing_db; 569 if(@existing_db == $bins){ #use existing if right count 570 #if(! -e $f_full){ #fix full file 571 # concatenate_files(\@existing_db, $f_full); 572 #} 573 574 #return $f_full, \@existing_db; 575 return $f_dir, \@existing_db; 566 576 } 567 577 else{ #remove if there is an error … … 571 581 572 582 #make needed output directories 573 mkdir($t_dir) unless ($mpi_size == 1); 583 #mkdir($t_dir) unless ($mpi_size == 1); 584 mkdir($t_dir); 574 585 mkdir($b_dir) unless (-e $b_dir); 575 586 576 587 #open filehandles for pieces on multi processors 577 588 my @fhs; 578 if($mpi_size != 1){589 #if($mpi_size != 1){ 579 590 for (my $i = 0; $i < $bins; $i++) { 580 591 my $name = "$t_dir/$d_name\.$i\.fasta"; … … 584 595 push (@fhs, $fh); 585 596 } 586 }597 #} 587 598 588 599 #write fastas here 589 600 my %alias; 590 my $FA;591 open($FA, "> $t_full"); #full file601 #my $FA; 602 #open($FA, "> $t_full"); #full file 592 603 while (my $fasta = $fasta_iterator->nextEntry()) { 593 604 my $def = Fasta::getDef(\$fasta); … … 606 617 #fix weird blast trimming error for long seq IDs by replacing them 607 618 if(length($seq_id) > 78){ 608 my $new_id = substr($seq_id, 0, 78);619 my $new_id = uri_escape(Digest::MD5::md5_base64($seq_id), "^A-Za-z0-9\-\_"); 609 620 die "ERROR: The id $seq_id is too long for BLAST, and I can'y uniquely fix it\n" 610 621 if($alias{$new_id}); 611 622 $alias{$new_id}++; 612 $def =~ s/^ >/>$seq_id maker_alias=/;623 $def =~ s/^(>\S+)/$1 MD5_alias=$new_id/; 613 624 } 614 625 … … 617 628 618 629 #build full file 619 print $FA $$fasta_ref;630 #print $FA $$fasta_ref; 620 631 621 632 #build part files only on multi processor 622 if($mpi_size != 1){633 #if($mpi_size != 1){ 623 634 my $fh = shift @fhs; 624 635 print $fh $$fasta_ref; 625 636 push (@fhs, $fh); 626 }627 } 628 close($FA); #close full file637 #} 638 } 639 #close($FA); #close full file 629 640 630 641 #close part file handles … … 634 645 635 646 #move finished files into place 636 system("mv $t_full $f_full"); 637 system("mv $t_dir $f_dir") unless($mpi_size == 1); 647 #system("mv $t_full $f_full"); 648 #system("mv $t_dir $f_dir") unless($mpi_size == 1); 649 system("mv $t_dir $f_dir"); 638 650 639 651 #check if everything is ok 640 if($mpi_size == 1 && -e $f_full){ #single processor 641 return $f_full, [$f_full]; 642 } 643 elsif (-e $f_dir && -e $f_full) { #multi processor 644 my @t_db = <$f_dir/*$d_name\.*>; 652 #if($mpi_size == 1 && -e $f_full){ #single processor 653 # return $f_full, [$f_full]; 654 #} 655 #elsif (-e $f_dir && -e $f_full) { #multi processor 656 if (-e $f_dir) { #multi processor 657 my @t_db = <$f_dir/*$d_name*\.fasta>; 645 658 646 659 my @db_files; … … 649 662 } 650 663 651 die "ERROR: SplitDB not created correctly\n\n" if(@db_files != $mpi_size); #not ok 652 653 return $f_full, \@db_files; 664 die "ERROR: SplitDB not created correctly\n\n" if(@db_files != $bins); #not ok 665 666 #return $f_full, \@db_files; 667 return $f_dir, \@db_files; 654 668 } 655 669 else { … … 1029 1043 1030 1044 my $fastaObj = $db_index->get_Seq_by_id($hit->name); 1045 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1046 1047 #still no sequence? try rebuilding the index and try again 1031 1048 if (not $fastaObj) { 1032 #rebuild index and try again 1033 my $db_file = $db_index->{dirname}."/".$db_index->fileno2path(0); 1034 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index $db_file\n"; 1035 $db_index = Bio::DB::Fasta->new($db_file, '-reindex' => 1); 1049 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 1050 $db_index->reindex(); 1036 1051 $fastaObj = $db_index->get_Seq_by_id($hit->name); 1052 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1037 1053 if (not $fastaObj) { 1038 1054 print STDERR "stop here:".$hit->name."\n"; … … 1040 1056 } 1041 1057 } 1058 1042 1059 my $seq = $fastaObj->seq(); 1043 1060 my $def = $db_index->header($hit->name); 1061 $def = $db_index->header_by_alias($hit->description) if(! defined $def); 1044 1062 1045 1063 my $fasta = Fasta::toFasta('>'.$def, \$seq); … … 1152 1170 foreach my $c (@{$clusters}) { 1153 1171 foreach my $hit (@{$c}) { 1154 my $id = $hit->name(); 1155 my $fastaObj = $index->get_Seq_by_id($id); 1172 my $fastaObj = $index->get_Seq_by_id($hit->name); 1173 $fastaObj = $index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1174 1175 #still no sequence? try rebuilding the index and try again 1176 if (not $fastaObj) { 1177 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 1178 $index->reindex(); 1179 $fastaObj = $index->get_Seq_by_id($hit->name); 1180 $fastaObj = $index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1181 if (not $fastaObj) { 1182 print STDERR "stop here:".$hit->name."\n"; 1183 die "ERROR: Fasta index error\n"; 1184 } 1185 } 1186 1156 1187 my $seq = $fastaObj->seq(); 1157 my $def = $index->header($id); 1188 my $def = $index->header($hit->name); 1189 $def = $index->header_by_alias($hit->description) if(! defined $def); 1158 1190 my $fasta = Fasta::toFasta('>'.$def, \$seq); 1159 1191 $fastas .= $$fasta; … … 1165 1197 sub build_fasta_index { 1166 1198 my $db = shift; 1167 my $index = new Bio::DB::Fasta($db);1199 my $index = new FastaDB($db); 1168 1200 return $index; 1169 1201 } … … 1181 1213 foreach my $db (@dbs){ 1182 1214 next if(! $db); 1183 unlink("$db.index") if($CTL_OPT->{force} && -e "$db.index"); 1184 new Bio::DB::Fasta($db); 1215 if($CTL_OPT->{force}){ 1216 foreach my $f (@{[<$db/*.index>]}){ 1217 unlink("$f") if(-f $f); 1218 } 1219 } 1220 new FastaDB($db); 1185 1221 } 1186 1222 } lib/Widget/RepeatMasker.pm
r207 r249 9 9 use FileHandle; 10 10 use Widget; 11 use Bio::DB::Fasta;11 #use Bio::DB::Fasta; 12 12 use Bio::Search::Hit::PhatHit::repeatmasker; 13 13 use Bio::Search::HSP::PhatHSP::repeatmasker; lib/Widget/blastn.pm
r248 r249 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 if($hit->description() =~ /maker_alias=(\S+)/){102 $hit->name($1);103 }104 101 105 102 my @hsps; lib/Widget/blastx.pm
r248 r249 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 if($hit->description() =~ /maker_alias=(\S+)/){ 102 $hit->name($1); 103 } 101 104 102 my @hsps; 105 103 while(my $hsp = $hit->next_hsp) { lib/Widget/tblastx.pm
r248 r249 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 if($hit->description() =~ /maker_alias=(\S+)/){102 $hit->name($1);103 }104 101 105 102 my @hsps;
