Changeset 260
- Timestamp:
- 09/09/09 13:38:19 (3 months ago)
- Files:
-
- lib/FastaDB.pm (modified) (2 diffs)
- lib/GI.pm (modified) (11 diffs)
- lib/PhatHit_utils.pm (modified) (6 diffs)
- lib/Widget/blastn.pm (modified) (1 diff)
- lib/Widget/blastx.pm (modified) (1 diff)
- lib/Widget/tblastx.pm (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
lib/FastaDB.pm
r249 r260 43 43 foreach my $file (@keep){ 44 44 push(@{$self->{index}}, new Bio::DB::Fasta($file, @args)); 45 46 #build reverse index to get the correct index based on file name 47 my ($title) = $file =~ /([^\/]+)$/; 48 $self->{file2index}{$title} = $self->{index}->[-1]; 45 49 } 46 50 … … 74 78 foreach my $file (@keep){ 75 79 push(@{$self->{index}}, new Bio::DB::Fasta($file, @args)); 80 81 #build reverse index to get the correct index based on file name 82 my ($title) = $file =~ /([^\/]+)$/; 83 $self->{file2index}{$title} = $self->{index}->[-1]; 76 84 } 77 85 78 86 return $self; 87 } 88 #------------------------------------------------------------------------------- 89 #uses hit info to search all indexes faster 90 sub get_Seq_for_hit { 91 my $self = shift; 92 my $hit = shift; 93 94 my $r_ind = $self->{file2index}; #reverse index 95 my $id = $hit->name; 96 97 if($hit->description =~ /MD5_alias=(\S+)/){ 98 $id = $1; 99 } 100 101 my $dbf = $hit->database_name; 102 103 return $self->get_Seq_by_id($id) if(! defined $dbf); 104 105 ($dbf) = $dbf =~ /([^\/]+)$/; 106 107 my $fastaObj; 108 if(exists $r_ind->{$dbf}){ 109 my $db = $r_ind->{$dbf}; 110 $fastaObj = $db->get_Seq_by_id($id); 111 } 112 113 if(! $fastaObj){ 114 my @files = grep {!/^$dbf$/} keys %$r_ind; #check remaining files 115 foreach my $dbf (@files){ 116 my $db = $r_ind->{$dbf}; 117 $fastaObj = $db->get_Seq_by_id($id); 118 last if($fastaObj); 119 } 120 } 121 122 return $fastaObj; 123 } 124 #------------------------------------------------------------------------------- 125 #uses hit info to search all indexes faster 126 sub header_for_hit { 127 my $self = shift; 128 my $hit = shift; 129 130 my $r_ind = $self->{file2index}; #reverse index 131 my $id = $hit->name; 132 133 if($hit->description =~ /MD5_alias=(\S+)/){ 134 $id = $1; 135 } 136 137 my $dbf = $hit->database_name; 138 139 return $self->header($id) if(! defined $dbf); 140 141 ($dbf) = $dbf =~ /([^\/]+)$/; 142 143 my $fastaObj; 144 my $header; 145 if(exists $r_ind->{$dbf}){ 146 my $db = $r_ind->{$dbf}; 147 $fastaObj = $db->get_Seq_by_id($id); 148 $header = $db->header($id) if($fastaObj); 149 } 150 151 if(! $fastaObj){ 152 my @files = grep {!/^$dbf$/} keys %$r_ind; #check remaining files 153 foreach my $dbf (@files){ 154 my $db = $r_ind->{$dbf}; 155 $fastaObj = $db->get_Seq_by_id($id); 156 157 if($fastaObj){ 158 $header = $db->header($id); 159 last 160 } 161 } 162 } 163 164 return $header; 79 165 } 80 166 #------------------------------------------------------------------------------- lib/GI.pm
r249 r260 214 214 ); 215 215 #search db index 216 my $fastaObj = $db_index->get_Seq_by_id($hit->name); 217 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 216 my $fastaObj = $db_index->get_Seq_for_hit($hit); 218 217 219 218 #still no sequence? try rebuilding the index and try again … … 221 220 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 222 221 $db_index->reindex(); 223 $fastaObj = $db_index->get_Seq_by_id($hit->name); 224 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 222 $fastaObj = $db_index->get_Seq_for_hit($hit); 225 223 if (not $fastaObj) { 226 224 print STDERR "stop here:".$hit->name."\n"; … … 231 229 #get fasta def and seq 232 230 my $t_seq = $fastaObj->seq(); 233 my $t_def = $db_index->header($hit->name); 234 $t_def = $db_index->header_by_alias($hit->description) if(! defined $t_def); 231 my $t_def = $db_index->header_for_hit($hit); 235 232 236 233 #write fasta file … … 601 598 #my $FA; 602 599 #open($FA, "> $t_full"); #full file 600 my $wflag = 1; #flag set so warnings gets printed only once 603 601 while (my $fasta = $fasta_iterator->nextEntry()) { 604 602 my $def = Fasta::getDef(\$fasta); … … 617 615 #fix weird blast trimming error for long seq IDs by replacing them 618 616 if(length($seq_id) > 78){ 617 warn "WARNING: The fasta file contains sequences with names longer\n". 618 "than 78 characters. Long names get trimmed by BLAST, making\n". 619 "it harder to identify the source of an alignmnet. You might\n". 620 "want to reformat the fasta file with shorter IDs.\n". 621 "File_name:$file\n\n" if($wflag-- > 0); 622 619 623 my $new_id = uri_escape(Digest::MD5::md5_base64($seq_id), "^A-Za-z0-9\-\_"); 620 624 die "ERROR: The id $seq_id is too long for BLAST, and I can'y uniquely fix it\n" … … 1042 1046 my $id = $hit->name(); 1043 1047 1044 my $fastaObj = $db_index->get_Seq_by_id($hit->name); 1045 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1048 my $fastaObj = $db_index->get_Seq_for_hit($hit); 1046 1049 1047 1050 #still no sequence? try rebuilding the index and try again … … 1049 1052 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 1050 1053 $db_index->reindex(); 1051 $fastaObj = $db_index->get_Seq_ by_id($hit->name);1052 $fastaObj = $db_index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1054 $fastaObj = $db_index->get_Seq_for_hit($hit); 1055 1053 1056 if (not $fastaObj) { 1054 1057 print STDERR "stop here:".$hit->name."\n"; … … 1058 1061 1059 1062 my $seq = $fastaObj->seq(); 1060 my $def = $db_index->header($hit->name); 1061 $def = $db_index->header_by_alias($hit->description) if(! defined $def); 1063 my $def = $db_index->header_for_hit($hit); 1062 1064 1063 1065 my $fasta = Fasta::toFasta('>'.$def, \$seq); … … 1170 1172 foreach my $c (@{$clusters}) { 1171 1173 foreach my $hit (@{$c}) { 1172 my $fastaObj = $index->get_Seq_by_id($hit->name); 1173 $fastaObj = $index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1174 my $fastaObj = $index->get_Seq_for_hit($hit); 1174 1175 1175 1176 #still no sequence? try rebuilding the index and try again … … 1177 1178 print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; 1178 1179 $index->reindex(); 1179 $fastaObj = $index->get_Seq_ by_id($hit->name);1180 $fastaObj = $index->get_Seq_by_alias($hit->description) if(not $fastaObj); 1180 $fastaObj = $index->get_Seq_for_hit($hit); 1181 1181 1182 if (not $fastaObj) { 1182 1183 print STDERR "stop here:".$hit->name."\n"; … … 1186 1187 1187 1188 my $seq = $fastaObj->seq(); 1188 my $def = $index->header($hit->name); 1189 $def = $index->header_by_alias($hit->description) if(! defined $def); 1189 my $def = $index->header_for_hit($hit); 1190 1190 my $fasta = Fasta::toFasta('>'.$def, \$seq); 1191 1191 $fastas .= $$fasta; lib/PhatHit_utils.pm
r244 r260 181 181 182 182 $new_hit->queryLength($hit->queryLength); 183 $new_hit->database_name($hit->database_name); 183 184 $new_hit->{is_split} = 1 if $num > 1; 184 185 … … 239 240 240 241 $new_hit->queryLength($hit->queryLength); 242 $new_hit->database_name($hit->database_name); 241 243 $new_hit->{is_split} = 1; 242 244 … … 270 272 271 273 $new_hit->queryLength($hit->queryLength); 274 $new_hit->database_name($hit->database_name); 272 275 $new_hit->{is_shattered} = 1; 273 276 … … 299 302 300 303 $new_hit->queryLength($hit->queryLength); 304 $new_hit->database_name($hit->database_name); 301 305 $new_hit->{is_shattered} = 1; 302 306 … … 770 774 771 775 $new_hit->queryLength($hit->queryLength); 772 776 $new_hit->database_name($hit->database_name); 773 777 774 778 my @new_hsps; … … 879 883 ); 880 884 881 $new_hit->queryLength($hit->queryLength); 882 $new_hit->{is_normalized} = 1; 883 884 my %crap; 885 foreach my $hsp (@keepers){ 886 $crap{$hsp->nB('hit')}{$hsp->nE('hit')}++; 887 $new_hit->add_hsp($hsp); 888 } 885 $new_hit->queryLength($hit->queryLength); 886 $new_hit->database_name($hit->database_name); 887 $new_hit->{is_normalized} = 1; 888 889 my %crap; 890 foreach my $hsp (@keepers){ 891 $crap{$hsp->nB('hit')}{$hsp->nE('hit')}++; 892 $new_hit->add_hsp($hsp); 893 } 889 894 890 895 my $s_size = @{$sorted}; lib/Widget/blastn.pm
r249 r260 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 $hit->database_name($result->database_name); 101 102 102 103 my @hsps; lib/Widget/blastx.pm
r249 r260 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 $hit->database_name($result->database_name); 101 102 102 103 my @hsps; lib/Widget/tblastx.pm
r249 r260 99 99 $hit->queryLength($result->query_length); 100 100 $hit->queryName($result->query_name); 101 $hit->database_name($result->database_name); 101 102 102 103 my @hsps;
