root/lib/FastaDB.pm

Revision 260, 6.2 kB (checked in by cholt, 3 months ago)

made indexing search less IO intensive

Line 
1 #-------------------------------------------------------------------------------
2 #------                           FastaDB                              ---------
3 #-------------------------------------------------------------------------------
4 package FastaDB;
5 use strict;
6 use vars qw(@ISA @EXPORT $VERSION);
7 use Exporter;
8 use PostData;
9 use FileHandle;
10 use URI::Escape;
11 use Bio::DB::Fasta;
12
13 @ISA = qw(
14           );
15 #-------------------------------------------------------------------------------
16 #------------------------------- Methods ---------------------------------------
17 #-------------------------------------------------------------------------------
18 sub new {
19     my $class = shift;
20     my $dir   = shift;
21
22     my $self = {};
23
24     bless ($self, $class);
25
26     die "ERROR: Directory $dir does not exist or is not a directory.\n" if(! -d $dir);
27
28     $self->{dirname} = $dir;
29     my @args = @_;
30     push (@args, ('-makeid' => \&makeid));
31
32     my @files = <$dir/*>;
33
34     #identify fastas
35     my @keep;
36     foreach my $file (@files){
37         next if (! -f $file);
38         next unless ($file =~ /\.(fa|fasta|fast|FA|FASTA|FAST|dna)$/);
39         push(@keep, $file);
40     }
41
42     #build indexes
43     foreach my $file (@keep){
44         push(@{$self->{index}}, new Bio::DB::Fasta($file, @args));
45
46         #build reverse index to get the correct index based on file name
47         my ($title) = $file =~ /([^\/]+)$/;
48         $self->{file2index}{$title} = $self->{index}->[-1];
49     }
50
51     return $self;
52 }
53 #-------------------------------------------------------------------------------
54 sub reindex {
55     my $self = shift;
56     my $dir = $self->{dirname};
57
58     die "ERROR: Directory $dir does not exist or is not a directory.\n" if(! -d $dir);
59
60     my @args = ('-reindex' => 1,
61                 '-makeid' => \&makeid
62                 );
63
64     my @files = <$dir/*>;
65    
66     #identify fastas
67     my @keep;
68     foreach my $file (@files){
69         next if (! -f $file);
70         next unless ($file =~ /\.(fa|fasta|fast|FA|FASTA|FAST|dna)$/);
71         push(@keep, $file);
72     }
73
74     #clear old index array
75     $self->{index} = [];
76
77     #build indexes
78     foreach my $file (@keep){
79         push(@{$self->{index}}, new Bio::DB::Fasta($file, @args));
80
81         #build reverse index to get the correct index based on file name
82         my ($title) = $file =~ /([^\/]+)$/;
83         $self->{file2index}{$title} = $self->{index}->[-1];
84     }
85
86     return $self;
87 }
88 #-------------------------------------------------------------------------------
89 #uses hit info to search all indexes faster
90 sub get_Seq_for_hit {
91     my $self = shift;
92     my $hit = shift;
93
94     my $r_ind = $self->{file2index}; #reverse index
95     my $id = $hit->name;
96
97     if($hit->description =~ /MD5_alias=(\S+)/){
98         $id = $1;
99     }
100
101     my $dbf = $hit->database_name;
102
103     return $self->get_Seq_by_id($id) if(! defined $dbf);   
104
105     ($dbf) = $dbf =~ /([^\/]+)$/;
106
107     my $fastaObj;
108     if(exists $r_ind->{$dbf}){
109         my $db = $r_ind->{$dbf};
110         $fastaObj = $db->get_Seq_by_id($id);
111     }
112    
113     if(! $fastaObj){
114         my @files = grep {!/^$dbf$/} keys %$r_ind; #check remaining files
115         foreach my $dbf (@files){
116             my $db = $r_ind->{$dbf};
117             $fastaObj = $db->get_Seq_by_id($id);
118             last if($fastaObj);
119         }
120     }
121
122     return $fastaObj;
123 }
124 #-------------------------------------------------------------------------------
125 #uses hit info to search all indexes faster
126 sub header_for_hit {
127     my $self = shift;
128     my $hit = shift;
129
130     my $r_ind = $self->{file2index}; #reverse index
131     my $id = $hit->name;
132
133     if($hit->description =~ /MD5_alias=(\S+)/){
134         $id = $1;
135     }
136
137     my $dbf = $hit->database_name;
138
139     return $self->header($id) if(! defined $dbf);
140
141     ($dbf) = $dbf =~ /([^\/]+)$/;
142
143     my $fastaObj;
144     my $header;
145     if(exists $r_ind->{$dbf}){
146         my $db = $r_ind->{$dbf};
147         $fastaObj = $db->get_Seq_by_id($id);
148         $header = $db->header($id) if($fastaObj);
149     }
150    
151     if(! $fastaObj){
152         my @files = grep {!/^$dbf$/} keys %$r_ind; #check remaining files
153         foreach my $dbf (@files){
154             my $db = $r_ind->{$dbf};
155             $fastaObj = $db->get_Seq_by_id($id);
156
157             if($fastaObj){
158                 $header = $db->header($id);
159                 last
160             }
161         }
162     }
163
164     return $header;
165 }
166 #-------------------------------------------------------------------------------
167 sub get_Seq_by_id {
168     my $self = shift;
169     my $id = shift;
170
171     my $fastaObj;
172     foreach my $db (@{$self->{index}}){
173         $fastaObj = $db->get_Seq_by_id($id);
174         last if($fastaObj);
175     }
176
177     return $fastaObj;
178 }
179 #-------------------------------------------------------------------------------
180 sub get_Seq_by_alias {
181     my $self = shift;
182     my $alias = shift;
183
184     $alias =~ /MD5_alias=(\S+)/;
185     $alias = $1;
186
187     return undef if(! defined $alias);
188
189     my $fastaObj;
190     foreach my $db (@{$self->{index}}){
191         $fastaObj = $db->get_Seq_by_id($alias);
192         last if($fastaObj);
193     }
194
195     return $fastaObj;
196 }
197 #-------------------------------------------------------------------------------
198 sub header {
199     my $self = shift;
200     my $id = shift;
201
202     my $h;
203     foreach my $db (@{$self->{index}}){
204         #do it this way first to avoid warnings
205         my $fastaObj = $db->get_Seq_by_id($id);
206         next if (!$fastaObj);
207         $h = $db->header($id);
208         last;
209     }
210
211     return $h;
212 }
213 #-------------------------------------------------------------------------------
214 sub header_by_alias {
215     my $self = shift;
216     my $alias = shift;
217
218     $alias =~ /MD5_alias=(\S+)/;
219     $alias = $1;
220
221     return undef if(! defined $alias);
222
223     my $h;
224     foreach my $db (@{$self->{index}}){
225         #do it this way first to avoid warnings
226         my $fastaObj = $db->get_Seq_by_id($alias);
227         next if (!$fastaObj);
228         $h = $db->header($alias);
229         last;
230     }
231
232     return $h;
233 }
234 #-------------------------------------------------------------------------------
235 #------------------------------- SUBS ------------------------------------------
236 #-------------------------------------------------------------------------------
237 sub makeid {
238     my $def = shift;
239
240     my @ids;
241    
242     #get the standard id
243     if($def =~ />(\S+)/){
244         push(@ids, $1);
245     }
246
247     #get the MD5 ID if made by GI::split_db
248     #otherwise just trim the standard name to get an alias
249     #this is because the BLAST parser trims names
250     if($def =~ /MD5_alias=(\S+)/){
251         push(@ids, $1);
252     }
253     elsif(defined $ids[0] && length($ids[0]) > 78){
254         push(@ids, substr($ids[0], 0, 78));
255     }
256    
257     return @ids;
258 }
259 #-------------------------------------------------------------------------------
260 1;
Note: See TracBrowser for help on using the browser.