root/lib/GI.pm

Revision 281, 95.1 kB (checked in by cholt, 2 weeks ago)

no longer thread dependant, fixed overhang bug, and evidence clustering bug

Line 
1 #------------------------------------------------------------------------
2 #----                                GI                              ----
3 #------------------------------------------------------------------------
4 package GI;
5
6 use strict;
7 use vars qw(@ISA @EXPORT $VERSION $TMP $LOCK);
8 use FindBin;
9 use Exporter;
10 use FileHandle;
11 use File::Temp qw(tempfile tempdir);
12 use Dumper::GFF::GFFV3;
13 use Dumper::XML::Game;
14 use URI::Escape;
15 use File::Path;
16 use File::Copy;
17 use Data::Dumper;
18 use Getopt::Long;
19 use FileHandle;
20 use PostData;
21 use Cwd qw(cwd abs_path);
22 use Fasta;
23 use Iterator::Fasta;
24 use FastaChunker;
25 use Widget::RepeatMasker;
26 use Widget::blastx;
27 use Widget::tblastx;
28 use Widget::blastn;
29 use Widget::snap;
30 use Widget::genemark;
31 use Widget::fgenesh;
32 use Widget::xdformat;
33 use Widget::formatdb;
34 use PhatHit_utils;
35 use Shadower;
36 use polisher::exonerate::protein;
37 use polisher::exonerate::est;
38 use maker::auto_annotator;
39 use cluster;
40 use repeat_mask_seq;
41 use maker::sens_spec;
42 use File::NFSLock;
43 use FastaDB;
44 use Digest::MD5;
45
46 @ISA = qw(
47         );
48
49 $TMP = tempdir("maker_XXXXXX", CLEANUP => 1, TMPDIR => 1);
50 #------------------------------------------------------------------------
51 #--------------------------- CLASS FUNCTIONS ----------------------------
52 #------------------------------------------------------------------------
53 sub set_global_temp {
54     my $dir = shift;
55
56     return if(! -d $dir);
57
58     #remove old tempdir if user supplied a new one
59     if($TMP ne $dir){
60         File::Path::rmtree($TMP);
61     }
62
63     $TMP = $dir;
64 }
65 #------------------------------------------------------------------------
66 sub get_preds_on_chunk {
67    my $preds = shift;
68    my $chunk = shift;
69
70    my $c_start = $chunk->offset + 1;
71    my $c_end = $chunk->offset + $chunk->length;
72
73    my @keepers;
74    foreach my $pred (@{$preds}) {
75       my $s_start = $pred->start('query');
76       my $s_end = $pred->end('query');
77
78       ($s_start, $s_end) = ($s_end, $s_start) if ($s_end < $s_start);
79
80       if ($c_start <= $s_start && $s_start <= $c_end) {
81          push (@keepers, $pred);
82       }
83    }
84
85    return \@keepers;
86 }
87 #-----------------------------------------------------------------------------
88 sub merge_resolve_hits{
89    my $fasta = shift @_;
90    my $fasta_index = shift @_;
91    my $blast_keepers = shift @_;
92    my $blast_holdovers = shift @_;
93    my $the_void = shift @_;
94    my %CTL_OPT = %{shift @_};
95    my $type = shift @_; #blastn, blastx, or tblastx
96    my $LOG = shift @_;
97
98    PhatHit_utils::merge_hits($blast_keepers,
99                              $blast_holdovers,
100                              $CTL_OPT{split_hit},
101                             );
102    @{$blast_holdovers} = ();
103
104    $blast_keepers = reblast_merged_hits($fasta,
105                                         $blast_keepers,
106                                         $fasta_index,
107                                         $the_void,
108                                         $type,
109                                         \%CTL_OPT,
110                                         $LOG
111                                         );
112
113    return $blast_keepers;
114 }
115 #-----------------------------------------------------------------------------
116 sub reblast_merged_hits {
117    my $g_fasta     = shift @_;
118    my $hits        = shift @_;
119    my $db_index    = shift @_;
120    my $the_void    = shift @_;
121    my $type        = shift @_;
122    my %CTL_OPT = %{shift @_};
123    my $LOG         = shift @_;
124
125    #==get data from parent fasta
126
127    #parent fasta get def and seq
128    my $par_def = Fasta::getDef($$g_fasta);
129    my $par_seq = Fasta::getSeqRef($$g_fasta);
130
131    #get seq id off def line
132    my ($p_id)  = $par_def =~ /^([^\s\t\n]+)/;
133    $p_id =~ s/^\>//g;           #just in case
134    $p_id =~ s/\|/_/g;
135
136    #build a safe name for file names from the sequence identifier
137    my $p_safe_id = uri_escape($p_id,
138                               '\*\?\|\\\/\'\"\{\}\<\>\;\,\^\(\)\$\~\:'
139                              );
140
141    my @blast_keepers;
142
143    #==check whether to re-blast each hit
144    foreach my $hit (@{$hits}) {
145       #if not a merged hit take as is
146       if (not $hit->{'_sequences_was_merged'}) {
147          push (@blast_keepers, $hit);
148          next;
149       }
150
151       #== excise region of query fasta and build new chunk object for blast
152       
153       #find region of hit to get piece
154       my ($nB, $nE) = PhatHit_utils::get_span_of_hit($hit,'query');   
155       my @coors = [$nB, $nE];
156       my $piece = Shadower::getPieces($par_seq, \@coors, $CTL_OPT{split_hit});
157      
158       #get piece fasta def, seq, and offset
159       my $piece_def = $par_def." ".$piece->[0]->{b}." ".$piece->[0]->{e};
160       my $piece_seq = $piece->[0]->{piece};
161       my $offset = $piece->[0]->{b} - 1;
162      
163       #make the piece into a fasta chunk
164       my $chunk = new FastaChunk();
165       $chunk->seq($piece_seq);
166       $chunk->def($piece_def);
167       $chunk->parent_def($par_def);
168       $chunk->size(length($$par_seq));       #the max size of a chunk
169       $chunk->length(length($chunk->seq())); #the actual size of a chunk
170       $chunk->offset($offset);
171       $chunk->number(0);
172       $chunk->is_last(1);
173       $chunk->parent_seq_length(length($$par_seq));
174
175       #==build new fasta and db for blast search from hit name and db index
176       
177       #get name
178       my $t_id  = $hit->name();
179       $t_id =~ s/\|/_/g;
180
181       #build a safe name for file names from the sequence identifier
182       my $t_safe_id = uri_escape($t_id,
183                                  '\*\?\|\\\/\'\"\{\}\<\>\;\,\^\(\)\$\~\:'
184                                 );
185       #search db index
186       my $fastaObj = $db_index->get_Seq_for_hit($hit);
187      
188       #still no sequence? try rebuilding the index and try again
189       if (not $fastaObj) {
190           print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n";
191           $db_index->reindex();
192           $fastaObj = $db_index->get_Seq_for_hit($hit);
193           if (not $fastaObj) {
194               print STDERR "stop here:".$hit->name."\n";
195               die "ERROR: Fasta index error\n";
196           }
197       }
198      
199       #get fasta def and seq
200       my $t_seq      = $fastaObj->seq();
201       my $t_def      = $db_index->header_for_hit($hit);
202      
203       #write fasta file
204       my $fasta = Fasta::toFasta('>'.$t_def, \$t_seq);
205       my $t_file = $the_void."/".$t_safe_id.'.for_'.$type.'.fasta';
206       FastaFile::writeFile($fasta, $t_file);
207      
208       #build db for blast using xdformat or formatdb
209       dbformat($CTL_OPT{_formater}, $t_file, $type);
210      
211       #==run the blast search
212       if ($type eq 'blastx') {
213          
214          print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet;
215          my $keepers = blastx($chunk,
216                               $t_file,
217                               $the_void,
218                               $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e},
219                               \%CTL_OPT,
220                               $LOG
221                              );
222          
223          push(@blast_keepers, @{$keepers});
224          print STDERR "...finished\n" unless $main::quiet;
225       }
226       elsif ($type eq 'blastn') {
227          
228          print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet;
229          my $keepers = blastn($chunk,
230                               $t_file,
231                               $the_void,
232                               $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e},
233                               \%CTL_OPT,
234                               $LOG
235                              );
236          
237          push(@blast_keepers, @{$keepers});
238          print STDERR "...finished\n" unless $main::quiet;
239       }
240       elsif ($type eq 'tblastx') {
241
242          print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet;
243          my $keepers = tblastx($chunk,
244                                $t_file,
245                                $the_void,
246                                $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e},
247                                \%CTL_OPT,
248                                $LOG
249                               );
250
251          push(@blast_keepers, @{$keepers});
252          print STDERR "...finished\n" unless $main::quiet;
253       }
254       else {
255          die "ERROR: Invaliv type \'$type\' in maker::reblast_merged_hit\n";
256       }
257
258       unlink <$t_file.x??>;
259    }
260    
261    #==return hits
262    return (\@blast_keepers);
263 }
264 #-----------------------------------------------------------------------------
265 sub process_the_chunk_divide{
266     my $chunk      = shift @_;
267     my $split_hit  = shift @_;
268     my $s_flag     = shift @_; #indicates whether to treat strands independantly
269     my $groups_cfh = shift @_; #group to cluster and find holdovers
270     
271     my $phat_hits;
272    
273     foreach my $group (@{$groups_cfh}) {
274         push(@{$phat_hits}, @{$group});
275     }
276    
277     my $p_hits = [];
278     my $m_hits = [];
279    
280     #seperate by strand or not (this makes chunk cutoffs strand independant)
281     if($s_flag){
282         ($p_hits, $m_hits) = PhatHit_utils::seperate_by_strand('query', $phat_hits, 1); #exonerate flag set
283     }
284     else{
285         $p_hits = $phat_hits;
286     }
287    
288     my $p_coors  = PhatHit_utils::to_begin_and_end_coors($p_hits, 'query');
289     my $m_coors  = PhatHit_utils::to_begin_and_end_coors($m_hits, 'query');
290    
291     foreach my $p_coor (@{$p_coors}) {
292         $p_coor->[0] -= $chunk->offset();
293         $p_coor->[1] -= $chunk->offset();
294         #fix coordinates for hits outside of chunk end   
295         $p_coor->[0] = $chunk->length if($p_coor->[0] > $chunk->length);
296         $p_coor->[1] = $chunk->length if($p_coor->[1] > $chunk->length);
297         #fix coordinates for hits outside of chunk begin
298         $p_coor->[0] = 0 if($p_coor->[0] < 0);
299         $p_coor->[1] = 0 if($p_coor->[1] < 0);
300     }
301     foreach my $m_coor (@{$m_coors}) {
302         $m_coor->[0] -= $chunk->offset();
303         $m_coor->[1] -= $chunk->offset();
304         #fix coordinates for hits outside of chunk end
305         $m_coor->[0] = $chunk->length if($m_coor->[0] > $chunk->length);
306         $m_coor->[1] = $chunk->length if($m_coor->[1] > $chunk->length);
307         #fix coordinates for hits outside of chunk begin
308         $m_coor->[0] = 0 if($m_coor->[0] < 0);
309         $m_coor->[1] = 0 if($m_coor->[1] < 0);
310     }
311    
312     my $p_pieces = Shadower::getPieces(\$chunk->seq, $p_coors, 10);
313     $p_pieces = [sort {$b->{e} <=> $a->{e}} @{$p_pieces}];
314     my $m_pieces = Shadower::getPieces(\$chunk->seq, $m_coors, 10);
315     $m_pieces = [sort {$b->{e} <=> $a->{e}} @{$m_pieces}];
316    
317     my $cutoff = $chunk->length + $chunk->offset - $split_hit;
318     my $p_cutoff = $chunk->length + $chunk->offset + 1;
319     my $m_cutoff = $chunk->length + $chunk->offset + 1;
320
321     my @keepers;
322     my @holdovers;
323
324     #no internal cutoff if this is the last contig
325     $cutoff = $chunk->length + $chunk->offset + 1 if($chunk->is_last);
326    
327     #adjust cutoff to overlapping hits
328     foreach my $p_piece (@{$p_pieces}) {
329         if ($p_piece->{e} + $chunk->offset >= $cutoff) {
330             $p_cutoff = $p_piece->{b} + $chunk->offset;
331         }
332     }
333     foreach my $m_piece (@{$m_pieces}) {
334         if ($m_piece->{e} + $chunk->offset >= $cutoff) {
335             $m_cutoff = $m_piece->{b} + $chunk->offset;
336         }
337     }
338    
339     #too small, all are heldover for next round
340     if ($p_cutoff <= 1 + $chunk->offset &&
341         $m_cutoff <= 1 + $chunk->offset) {
342         foreach my $g (@{$groups_cfh}){
343             push (@holdovers, $g);
344             push (@keepers, []);
345         }
346         return @keepers, @holdovers;
347     }
348    
349     #seperate holdovers and keepers
350     foreach my $group (@{$groups_cfh}) {
351         my $group_keepers = [];
352         my $group_holdovers = [];
353        
354         foreach my $hit (@{$group}) {
355             my $b = $hit->nB('query');
356             my $e = $hit->nE('query');
357             my $strand = $hit->strand;
358            
359             #exonerate counterpart check (blastn with flipped exonerate)
360             $strand *= -1 if ($hit->{_exonerate_flipped});
361            
362             #if stands are not being treated independantly, treat all as plus strand
363             $strand = 1 if (!$s_flag);
364            
365             ($b, $e) = ($e, $b) if $b > $e;
366            
367             if (($strand eq '1' && $e < $p_cutoff && $p_cutoff > $chunk->offset +1) ||
368                 ($strand eq '-1' && $e < $m_cutoff && $m_cutoff > $chunk->offset +1)
369                 ) {
370                 $hit->{_holdover} = 0;
371                 push(@{$group_keepers}, $hit);
372             }
373             else {
374                 $hit->{_holdover} = 1;
375                 push(@{$group_holdovers}, $hit);
376             }
377         }
378        
379         push(@keepers, $group_keepers);
380         push(@holdovers, $group_holdovers);
381     }
382
383     #keepers and hit holdovers are returned in same order given by user
384     return @keepers, @holdovers;
385 }
386 #-----------------------------------------------------------------------------
387
388 # sub write_quality_data {
389 #    my $quality_indices = shift;
390 #    my $seq_id          = shift;
391
392 #    my $out_file = $seq_id.'.maker.transcripts.qi';
393 #    my $fh = new FileHandle();
394 #    $fh->open(">$out_file");
395
396 #    print $fh "genomic_seq\ttranscript\tquality_index\n";
397
398 #    while (my $d = shift(@{$quality_indices})) {
399 #       my $t_name = $d->[0];
400 #       my $t_qi   = $d->[1];
401         
402 #       print $fh "$seq_id\t$t_name\t$t_qi\n";
403 #    }
404 #    $fh->close();
405 # }
406 #-----------------------------------------------------------------------------
407 sub maker_p_and_t_fastas {
408    my $maker    = shift @_;
409    my $non_over = shift @_;
410    my $abinit   = shift @_;
411    my $p_fastas = shift @_;
412    my $t_fastas = shift @_;
413    
414    foreach my $an (@$maker) {
415       foreach my $a (@{$an->{t_structs}}) {
416          my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a);
417          $p_fastas->{maker} .= $p_fasta;
418          $t_fastas->{maker} .= $t_fasta;
419       }
420    }
421
422    foreach my $an (@$non_over) {
423        foreach my $a (@{$an->{t_structs}}) {
424            my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a);
425            $p_fastas->{non_overlapping_ab_initio} .= $p_fasta;
426            $t_fastas->{non_overlapping_ab_initio} .= $t_fasta;
427        }
428    }
429    
430    foreach my $an (@$abinit) {
431        foreach my $a (@{$an->{t_structs}}) {
432            my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a);
433            my $source = $a->{hit}->algorithm;
434            $source =~ s/pred_gff\://;
435            $p_fastas->{$source} .= $p_fasta;
436            $t_fastas->{$source} .= $t_fasta;
437        }
438    }
439 }
440
441 #-----------------------------------------------------------------------------
442 sub get_p_and_t_fastas {
443    my $t_struct = shift;
444        
445    my $t_seq  = $t_struct->{t_seq};
446    my $p_seq  = $t_struct->{p_seq};
447    my $t_off  = $t_struct->{t_offset};
448    my $t_name = $t_struct->{t_name};
449    my $AED    = $t_struct->{AED};
450    my $QI     = $t_struct->{t_qi};
451        
452    my $p_def = '>'.$t_name.' protein AED:'.$AED.' QI:'.$QI;
453    my $t_def = '>'.$t_name.' transcript offset:'.$t_off.' AED:'.$AED.' QI:'.$QI;
454        
455    my $p_fasta = Fasta::toFasta($p_def, \$p_seq);
456    my $t_fasta = Fasta::toFasta($t_def, \$t_seq);
457        
458    return($p_fasta, $t_fasta);
459 }
460 #----------------------------------------------------------------------------
461 sub write_p_and_t_fastas{
462     my $p_fastas = shift @_;
463     my $t_fastas = shift @_;
464     my $safe_seq_id = shift @_;
465     my $out_dir = shift @_;
466
467     while( my $key = each %$p_fastas){
468         my $name = "$out_dir/$safe_seq_id.maker";
469         $name .= ".$key" unless($key eq 'maker');
470         $name .= "\.proteins.fasta";
471
472         FastaFile::writeFile(\$p_fastas->{$key},
473                              $name,
474                             );
475     }
476
477     while( my $key = each %$t_fastas){
478         my $name = "$out_dir/$safe_seq_id.maker";
479         $name .= ".$key" unless($key eq 'maker');
480         $name .= "\.transcripts.fasta";
481
482         FastaFile::writeFile(\$t_fastas->{$key},
483                              $name,
484                             );
485     }
486 }
487 #----------------------------------------------------------------------------
488 sub create_blastdb {
489    my $CTL_OPT = shift @_;
490    my $mpi_size = shift@_ || 1;
491
492    #rebuild all fastas when specified
493    File::Path::rmtree($CTL_OPT->{out_base}."/mpi_blastdb") if($CTL_OPT->{force});
494
495    ($CTL_OPT->{_protein}, $CTL_OPT->{p_db}) = split_db($CTL_OPT, 'protein', $mpi_size);
496    ($CTL_OPT->{_est}, $CTL_OPT->{e_db}) = split_db($CTL_OPT, 'est', $mpi_size);
497    ($CTL_OPT->{_est_reads},  $CTL_OPT->{d_db}) = split_db($CTL_OPT, 'est_reads', $mpi_size);
498    ($CTL_OPT->{_altest},  $CTL_OPT->{a_db}) = split_db($CTL_OPT, 'altest', $mpi_size);
499    ($CTL_OPT->{_repeat_protein}, $CTL_OPT->{r_db}) = split_db($CTL_OPT, 'repeat_protein', $mpi_size);
500 }
501 #----------------------------------------------------------------------------
502 sub concatenate_files {
503     my $infiles = shift;
504     my $outfile = shift;
505
506     my ($tFH, $t_file) = tempfile(DIR => $TMP);
507     foreach my $file (@$infiles){
508         open(my $IN, "< $file");
509         while(defined(my $line = <$IN>)){
510             print $tFH $line;
511         }
512         close($IN);
513     }
514     close($tFH);
515     File::Copy::move($t_file, $outfile);
516 }
517 #----------------------------------------------------------------------------
518 sub split_db {
519    my $CTL_OPT  = shift @_;
520    my $key      = shift @_;
521    my $mpi_size = shift @_ || 1;
522
523    #always set to at least 10 for faster fasta indexing
524    $mpi_size = 10 if($mpi_size < 10);
525
526    my $file = $CTL_OPT->{$key};
527    my $alt = $CTL_OPT->{alt_peptide} if($key =~ /protein/);
528    
529    return ('', []) if (not $file);
530    
531    #set up names and variables
532    my $fasta_iterator = new Iterator::Fasta($file);
533    my $db_size = $fasta_iterator->number_of_entries();
534    my $bins = $mpi_size;
535    $bins = $db_size if ($db_size < $bins);
536
537    my ($f_name) = $file =~ /([^\/]+)$/;
538    $f_name =~ s/\.fasta$//;
539    
540    my $d_name = "$f_name\.mpi\.$mpi_size";
541    my $b_dir = $CTL_OPT->{out_base}."/mpi_blastdb";
542    my $f_dir = "$b_dir/$d_name";
543    my $t_dir = $TMP."/$d_name";
544    #my $t_full = "$t_dir\.fasta";
545    #my $f_full = "$b_dir/$f_name\.fasta";
546
547    #delete old indexes if full file does not exist
548    #if(! -e $f_full && -e "$f_full.index"){
549    #    unlink("$f_full.index");
550    #}
551
552    #check if files exist
553    #if($mpi_size == 1 && -e $f_full){ #on one processor check if finished
554    #    return $f_full, [$f_full];
555    #}
556    #elsif(-e "$f_dir"){ #on multi processors check if finished
557    if(-e "$f_dir"){ #on multi processors check if finished
558       my @t_db = <$f_dir/*$d_name*\.fasta>;
559
560       my @existing_db;
561       foreach my $f (@t_db) {
562           push (@existing_db, $f) if (! -d $f);
563       }
564
565       if(@existing_db == $bins){ #use existing if right count
566           #if(! -e $f_full){ #fix full file
567           #    concatenate_files(\@existing_db, $f_full);
568           #}
569
570           #return $f_full, \@existing_db;
571           return $f_dir, \@existing_db;
572       }
573       else{ #remove if there is an error
574           File::Path::rmtree($f_dir);
575       }
576    }
577
578    #make needed output directories
579    #mkdir($t_dir) unless ($mpi_size == 1);
580    mkdir($t_dir);
581    mkdir($b_dir) unless (-e $b_dir);
582
583    #open filehandles for  pieces on multi processors
584    my @fhs;
585    #if($mpi_size != 1){
586        for (my $i = 0; $i < $bins; $i++) {
587            my $name = "$t_dir/$d_name\.$i\.fasta";
588            my $fh;
589            open ($fh, "> $name");
590            
591            push (@fhs, $fh);
592        }
593    #}
594   
595    #write fastas here
596    my %alias;
597    #my $FA;
598    #open($FA, "> $t_full"); #full file
599
600    my $wflag = 1; #flag set so warnings gets printed only once
601    while (my $fasta = $fasta_iterator->nextEntry()) {
602       my $def = Fasta::getDef(\$fasta);
603       my $seq_id = Fasta::def2SeqID($def);
604       my $seq_ref = Fasta::getSeqRef(\$fasta);
605
606       #fix non standard peptides
607       if (defined $alt) {
608          $$seq_ref =~ s/[\*\-]//g;
609          $$seq_ref =~ s/[^abcdefghiklmnpqrstvwyzxABCDEFGHIKLMNPQRSTVWYZX\-\n]/$alt/g;
610       }
611       #fix nucleotide sequences
612       elsif($key !~ /protein/){
613           #most programs use N for masking but for some reason the NCBI decided to
614           #use X to mask their sequence, which causes many many programs to fail
615           $$seq_ref =~ s/\-//g;
616           $$seq_ref =~ s/X/N/g;
617           die "ERROR: The nucleotide sequence file \'$file\'\n".
618               "appears to contain protein sequence or unrecognized characters.\n".
619               "Please check/fix the file before continuing.\n".
620               "Invalid Character: $1\n\n"
621               if($$seq_ref =~ /([^acgturykmswbdhvnxACGTURYKMSWBDHVNX\-\n])/);
622       }
623
624       #Skip empty fasta entries
625       next if($$seq_ref eq '');
626
627       #fix weird blast trimming error for long seq IDs by replacing them
628       if(length($seq_id) > 78){
629           warn "WARNING: The fasta file contains sequences with names longer\n".
630                "than 78 characters.  Long names get trimmed by BLAST, making\n".
631                "it harder to identify the source of an alignmnet. You might\n".
632                "want to reformat the fasta file with shorter IDs.\n".
633                "File_name:$file\n\n" if($wflag-- > 0);
634
635           my $new_id = uri_escape(Digest::MD5::md5_base64($seq_id), "^A-Za-z0-9\-\_");
636           die "ERROR: The id $seq_id is too long for BLAST, and I can'y uniquely fix it\n"
637               if($alias{$new_id});
638           $alias{$new_id}++;
639           $def =~ s/^(>\S+)/$1 MD5_alias=$new_id/;
640       }
641
642       #reformat fasta, just incase
643       my $fasta_ref = Fasta::toFastaRef($def, $seq_ref);
644
645       #build full file
646       #print $FA $$fasta_ref;
647
648       #build part files only on multi processor
649       #if($mpi_size != 1){
650           my $fh = shift @fhs;
651           print $fh $$fasta_ref;
652           push (@fhs, $fh);
653       #}
654    }
655    #close($FA); #close full file
656
657    #close part file handles
658    foreach my $fh (@fhs) {
659       close ($fh);
660    }
661
662    #move finished files into place
663    #system("mv $t_full $f_full");
664    #system("mv $t_dir $f_dir") unless($mpi_size == 1);
665    system("mv $t_dir $f_dir");
666
667    #check if everything is ok
668    #if($mpi_size == 1 && -e $f_full){ #single processor
669    #    return $f_full, [$f_full];
670    #}
671    #elsif (-e $f_dir && -e $f_full) { #multi processor
672    if (-e $f_dir) { #multi processor
673       my @t_db = <$f_dir/*$d_name*\.fasta>;
674
675       my @db_files;
676       foreach my $f (@t_db) {
677          push (@db_files, $f) if (! -d $f);
678       }
679
680       die "ERROR: SplitDB not created correctly\n\n" if(@db_files != $bins); #not ok
681
682       #return $f_full, \@db_files;
683       return $f_dir, \@db_files;
684    }
685    else {
686       die "ERROR: Could not split db\n"; #not ok
687    }
688 }
689 #----------------------------------------------------------------------------
690 # sub load_anno_hsps {
691 #    my $annotations = shift;
692 #    my @coors;
693 #    my $i = @{$annotations};
694 #    foreach my $an (@$annotations) {
695 #       foreach my $a (@{$an->[0]}) {
696 #        my $hit = $a->{hit};
697 #        foreach my $hsp ($hit->hsps()) {
698 #           push(@coors, [$hsp->nB('query'),
699 #                         $hsp->nE('query'),
700 #                        ]);
701 #        }
702 #       }
703 #    }
704 #    return` (\@coors, $i);;
705 # }
706 #-----------------------------------------------------------------------------
707 # sub load_clust_hsps {
708 #    my $clusters = shift;
709 #    my @coors;
710 #    my $i = @{$clusters};
711 #    foreach my $c (@$clusters) {
712 #       foreach my $hit (@{$c}) {
713 #        foreach my $hsp ($hit->hsps()) {
714 #           push(@coors, [$hsp->nB('query'),
715 #                         $hsp->nE('query'),
716 #                        ]);
717 #        }
718 #       }
719 #    }
720 #    return (\@coors, $i);
721 # }
722 #-----------------------------------------------------------------------------
723 # sub load_snap_hsps {
724 #    my $snaps = shift;
725 #    my @coors;
726 #    my $i = @{$snaps};
727 #    foreach my $hit (@{$snaps}) {
728 #       foreach my $hsp ($hit->hsps()) {
729 #        push(@coors, [$hsp->nB('query'),
730 #                      $hsp->nE('query'),
731 #                     ]);
732 #       }
733 #    }
734 #    return (\@coors, $i);
735 # }
736 #-----------------------------------------------------------------------------
737 sub flatten {
738    my $clusters = shift;
739    my $type     = shift;
740    my @hits;
741    foreach my $c (@{$clusters}) {
742       foreach my $hit (@{$c}) {
743          $hit->type($type) if defined($type);
744          push(@hits, $hit);
745       }
746    }
747    return \@hits;
748 }
749 #------------------------------------------------------------------------
750 sub combine {
751    my @bag;
752    while (my $hits = shift(@_)) {
753       foreach my $hit (@{$hits}) {
754          push(@bag, $hit);
755       }
756    }
757    return \@bag;
758 }
759 #-----------------------------------------------------------------------------
760 sub abinits {
761    my @preds;
762
763    push(@preds, @{snap(@_)});
764    push(@preds, @{augustus(@_)});
765    push(@preds, @{fgenesh(@_)});
766    push(@preds, @{genemark(@_)});
767    push(@preds, @{twinscan(@_)});
768
769    return \@preds;
770 }
771 #-----------------------------------------------------------------------------
772 sub snap {
773    my $in_file     = shift;
774    my $the_void    = shift;
775    my $seq_id      = shift;
776    my $CTL_OPT = shift;
777    my $LOG         = shift;
778    
779    my $exe    = $CTL_OPT->{snap};
780    my $snaphmm = $CTL_OPT->{snaphmm};
781    
782    return [] if(! grep {/snap/} @{$CTL_OPT->{_run}});
783    
784    my %params;
785    my $out_file = "$the_void/$seq_id\.all\.snap";
786    
787    $LOG->add_entry("STARTED", $out_file, "");   
788
789    my $command  = $exe;
790    $command .= " $snaphmm";
791    $command .= " $in_file";
792    $command .= " > $out_file";
793        
794    my $w = new Widget::snap();
795        
796    if (-e $out_file) {
797       print STDERR "re reading snap report.\n" unless $main::quiet;
798       print STDERR "$out_file\n" unless $main::quiet;
799    }
800    else {
801       print STDERR "running  snap.\n" unless $main::quiet;
802       $w->run($command);
803    }
804        
805    $params{min_exon_score}  = -100000; #-10000;
806    $params{min_gene_score}  = -100000; #0;
807                 
808    my $keepers = Widget::snap::parse($out_file,
809                                      \%params,
810                                      $in_file,
811                                     );
812
813    $LOG->add_entry("FINISHED", $out_file, "");
814
815    return $keepers;
816 }
817 #-----------------------------------------------------------------------------
818 sub genemark {
819    my $in_file     = shift;
820    my $the_void    = shift;
821    my $seq_id      = shift;
822    my $CTL_OPT     = shift;
823    my $LOG         = shift;
824
825    #genemark sometimes fails if called directly so I built a wrapper
826    my $wrap = "$FindBin::Bin/../lib/Widget/genemark/gmhmm_wrap";
827    my $exe  = $CTL_OPT->{organism_type} eq 'eukaryotic' ? $CTL_OPT->{gmhmme3} : $CTL_OPT->{gmhmmp}; #genemark
828    my $pro = $CTL_OPT->{probuild}; #helper exe
829    my $hmm = $CTL_OPT->{gmhmm};
830    
831    return [] if(! grep {/genemark/} @{$CTL_OPT->{_run}});
832    
833    my %params;
834    my $out_file = "$the_void/$seq_id\.all\.genemark";
835    
836    $LOG->add_entry("STARTED", $out_file, "");   
837
838
839    my $command  = $wrap;
840    $command .= " -m $hmm";
841    $command .= " -g $exe";
842    $command .= " -p $pro";
843    $command .= " -o $out_file";
844    #$command .= " -t $TMP";
845    $command .= " $in_file";
846
847    my $w = new Widget::genemark();
848        
849    if (-e $out_file) {
850       print STDERR "re reading genemark report.\n" unless $main::quiet;
851       print STDERR "$out_file\n" unless $main::quiet;
852    }
853    else {
854       print STDERR "running  genemark.\n" unless $main::quiet;
855       $w->run($command);
856    }
857        
858    $params{min_exon_score}  = -100000; #-10000;
859    $params{min_gene_score}  = -100000; #0;
860                 
861    my $keepers = Widget::genemark::parse($out_file,
862                                      \%params,
863                                      $in_file,
864                                     );
865
866    $LOG->add_entry("FINISHED", $out_file, "");
867
868    return $keepers;
869 }
870 #-----------------------------------------------------------------------------
871 sub augustus {
872    my $in_file     = shift;
873    my $the_void    = shift;
874    my $seq_id      = shift;
875    my $CTL_OPT = shift;
876    my $LOG         = shift;
877
878    my $exe = $CTL_OPT->{augustus};
879    my $org = $CTL_OPT->{augustus_species};
880
881    return [] if(! grep {/augustus/} @{$CTL_OPT->{_run}});
882
883    my %params;
884    my $out_file    = "$the_void/$seq_id\.all\.augustus";
885
886    $LOG->add_entry("STARTED", $out_file, "");
887
888    my $command  = $exe;
889    $command .= " --species=$org";
890    $command .= " --UTR=off"; #added 3/19/2009
891    $command .= " $in_file";
892    $command .= " > $out_file";
893
894    my $w = new Widget::augustus();
895
896    if (-e $out_file) {
897       print STDERR "re reading augustus report.\n" unless $main::quiet;
898       print STDERR "$out_file\n" unless $main::quiet;
899    }
900    else {
901       print STDERR "running  augustus.\n" unless $main::quiet;
902       $w->run($command);
903    }
904
905    $params{min_exon_score}  = -100000; #-10000;
906    $params{min_gene_score}  = -100000; #0;
907
908    my $chunk_keepers = Widget::augustus::parse($out_file,
909                                                \%params,
910                                                $in_file
911                                               );
912
913    $LOG->add_entry("FINISHED", $out_file, "");
914
915    return $chunk_keepers;
916 }
917 #-----------------------------------------------------------------------------
918 sub fgenesh {
919    my $in_file     = shift;
920    my $the_void    = shift;
921    my $seq_id      = shift;
922    my $CTL_OPT = shift;
923    my $LOG         = shift;
924
925    my $wrap = "$FindBin::Bin/../lib/Widget/fgenesh/fgenesh_wrap"; #fgenesh wrapper
926    my $exe = $CTL_OPT->{fgenesh};
927    my $org = $CTL_OPT->{fgenesh_par_file};
928
929    return [] if(! grep {/fgenesh/} @{$CTL_OPT->{_run}});
930
931    my %params;
932    my $out_file    = "$the_void/$seq_id\.all\.fgenesh";
933
934    $LOG->add_entry("STARTED", $out_file, "");
935
936    my $command  = "$wrap $exe";
937    #$command .= " -tmp $TMP";
938    $command .= " $org";
939    $command .= " $in_file";
940    $command .= " > $out_file";
941
942    my $w = new Widget::fgenesh();
943
944    if (-e $out_file) {
945       print STDERR "re reading fgenesh report.\n" unless $main::quiet;
946       print STDERR "$out_file\n" unless $main::quiet;
947    }
948    else {
949       print STDERR "running  fgenesh.\n" unless $main::quiet;
950       $w->run($command);
951    }
952
953    $params{min_exon_score}  = -100000; #-10000;
954    $params{min_gene_score}  = -100000; #0;
955
956    my $chunk_keepers = Widget::fgenesh::parse($out_file,
957                                               \%params,
958                                               $in_file
959                                              );
960
961    $LOG->add_entry("FINISHED", $out_file, "");
962
963    return $chunk_keepers;
964 }
965 #-----------------------------------------------------------------------------
966 sub twinscan {
967    my $in_file     = shift;
968    my $the_void    = shift;
969    my $seq_id      = shift;
970    my $CTL_OPT = shift;
971    my $LOG         = shift;
972
973    my $exe = $CTL_OPT->{twinscan};
974    my $org = $CTL_OPT->{twinscan_species};
975
976    return [] if(! grep {/twinscan/} @{$CTL_OPT->{_run}});
977
978    my %params;
979    my $out_file    = "$the_void/$seq_id\.all\.twinscan";
980
981    $LOG->add_entry("STARTED", $out_file, "");
982
983    my $command  = $exe;
984    $command .= ' --species='."$org";
985    $command .= " $in_file";
986    $command .= " > $out_file";
987
988    my $w = new Widget::twinscan();
989
990    if (-e $out_file) {
991       print STDERR "re reading twinscan report.\n" unless $main::quiet;
992       print STDERR "$out_file\n" unless $main::quiet;
993    }
994    else {
995       print STDERR "running  twinscan.\n" unless $main::quiet;
996       $w->run($command);
997    }
998
999    $params{min_exon_score}  = -100000; #-10000;
1000    $params{min_gene_score}  = -100000; #0;
1001
1002    my $chunk_keepers = Widget::twinscan::parse($out_file,
1003                                                \%params,
1004                                                $in_file
1005                                               );
1006
1007    $LOG->add_entry("FINISHED", $out_file, "");
1008
1009    return $chunk_keepers;
1010 }
1011
1012 #-----------------------------------------------------------------------------
1013 sub polish_exonerate {
1014     my $g_fasta     = shift;
1015     my $phat_hits   = shift;
1016     my $db_index    = shift;
1017     my $the_void    = shift;
1018     my $type        = shift;
1019     my $exonerate   = shift;
1020     my $pcov        = shift;
1021     my $pid         = shift;
1022     my $score_limit = shift;
1023     my $matrix      = shift;
1024     my $LOG         = shift;
1025    
1026     my $def = Fasta::getDef($g_fasta);
1027     my $seq = Fasta::getSeqRef($g_fasta);
1028    
1029     my $exe = $exonerate;
1030    
1031     my @exonerate_data;
1032    
1033     foreach my $hit (@{$phat_hits}) {
1034         next if $hit->pAh < $pcov;
1035         next if $hit->hsp('best')->frac_identical < $pid;
1036        
1037         my ($nB, $nE) = PhatHit_utils::get_span_of_hit($hit,'query');
1038        
1039         my @coors = [$nB, $nE];
1040         my $p = Shadower::getPieces($seq, \@coors, 50);
1041         my $p_def = $def." ".$p->[0]->{b}." ".$p->[0]->{e};
1042         my $p_fasta = Fasta::toFasta($p_def, \$p->[0]->{piece});
1043         my $name =  Fasta::def2SeqID($p_def);
1044         my $safe_name = Fasta::seqID2SafeID($name);
1045        
1046         my $d_file = $the_void."/".$safe_name.'.'.$p->[0]->{b}.'.'.$p->[0]->{e}.".fasta";
1047        
1048         FastaFile::writeFile($p_fasta, $d_file);
1049        
1050         my $offset = $p->[0]->{b} - 1;
1051         my $id  = $hit->name();
1052        
1053         my $fastaObj = $db_index->get_Seq_for_hit($hit);
1054        
1055         #still no sequence? try rebuilding the index and try again
1056         if (not $fastaObj) {
1057             print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n";
1058             $db_index->reindex();
1059             $fastaObj = $db_index->get_Seq_for_hit($hit);
1060            
1061             if (not $fastaObj) {
1062                 print STDERR "stop here:".$hit->name."\n";
1063                 die "ERROR: Fasta index error\n";
1064             }
1065         }
1066        
1067         my $seq      = $fastaObj->seq();
1068         my $def      = $db_index->header_for_hit($hit);
1069        
1070         my $fasta    = Fasta::toFasta('>'.$def, \$seq);
1071        
1072         #build a safe name for file names from the sequence identifier
1073         my $safe_id = Fasta::seqID2SafeID($id);
1074        
1075         my $t_file    = $the_void."/".$safe_id.'.fasta';
1076         FastaFile::writeFile($fasta, $t_file);
1077        
1078         my $exonerate_hits = to_polisher($d_file,
1079                                          $t_file,
1080                                          $the_void,
1081                                          $offset,
1082                                          $type,
1083                                          $exe,
1084                                          $score_limit,
1085                                          $matrix,
1086                                          $LOG
1087                                          );
1088        
1089         foreach my $exonerate_hit (@{$exonerate_hits}) {
1090             if (defined($exonerate_hit) && exonerate_okay($exonerate_hit)) {
1091                 #tag the source blastn hit to let you know the counterpart
1092                 #exonerate hit was flipped to the other strand
1093                 $hit->{_exonerate_flipped} = 1 if($exonerate_hit->{_was_flipped});
1094                 $hit->type("exonerate:$type"); #set hit type (exonerate only)
1095                 
1096                 push(@exonerate_data, $exonerate_hit);         
1097             }
1098         }
1099     }
1100    
1101     return \@exonerate_data;
1102 }
1103 #-----------------------------------------------------------------------------
1104 sub exonerate_okay {
1105     my $hit  = shift;
1106    
1107     my $i = 0;
1108     foreach my $hsp ($hit->hsps()) {
1109         return 0 unless defined($hsp->nB('query'));
1110         return 0 unless defined($hsp->nE('query'));
1111         return 0 unless defined($hsp->nB('hit'));
1112         return 0 unless defined($hsp->nE('hit'));
1113         return 0 unless defined($hsp->strand('query'));
1114         return 0 unless defined($hsp->strand('query'));
1115         return 0 unless defined($hsp->strand('hit'));
1116         return 0 unless defined($hsp->strand('hit'));
1117        
1118         my $q_str = $hsp->query_string();
1119         my $h_str = $hsp->hit_string();
1120        
1121         if ($h_str =~ /Target Intron/) {
1122             print STDERR "BADDD EXONERATE!\n";
1123             sleep 4;
1124             return 0;
1125         }
1126         elsif ($q_str =~ /Target Intron/) {
1127             print STDERR "BADDD EXONERATE!\n";
1128             sleep 4;
1129             return 0;
1130         }
1131         $i++;
1132     }
1133    
1134     return 1;
1135 }
1136
1137 #-----------------------------------------------------------------------------
1138 sub to_polisher {
1139    my $d_file   = shift;
1140    my $t_file   = shift;
1141    my $the_void = shift;
1142    my $offset   = shift;
1143    my $type     = shift;
1144    my $exe      = shift;
1145    my $score_limit = shift;
1146    my $matrix = shift;
1147    my $LOG   = shift;
1148
1149    if ($type eq 'p') {
1150       return polisher::exonerate::protein::polish($d_file,
1151                                                   $t_file,
1152                                                   $the_void,
1153                                                   $offset,
1154                                                   $exe,
1155                                                   $score_limit,
1156                                                   $matrix,
1157                                                   $LOG
1158                                                  );
1159    }
1160    elsif ($type eq 'e') {
1161       return polisher::exonerate::est::polish($d_file,
1162                                               $t_file,
1163                                               $the_void,
1164                                               $offset,
1165                                               $exe,
1166                                               $score_limit,
1167                                               $matrix,
1168                                               $LOG
1169                                              );
1170    }
1171    else {
1172       die "unknown type:$type in sub to_polisher.\n";
1173    }
1174 }
1175 #-----------------------------------------------------------------------------
1176 sub make_multi_fasta {
1177    my $index    = shift;
1178    my $clusters = shift;;
1179    my $fastas = '';
1180    foreach my $c (@{$clusters}) {
1181       foreach my $hit (@{$c}) {
1182          my $fastaObj = $index->get_Seq_for_hit($hit);
1183          
1184          #still no sequence? try rebuilding the index and try again
1185          if (not $fastaObj) {
1186              print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n";
1187              $index->reindex();
1188              $fastaObj = $index->get_Seq_for_hit($hit);
1189
1190              if (not $fastaObj) {
1191                  print STDERR "stop here:".$hit->name."\n";
1192                  die "ERROR: Fasta index error\n";
1193              }
1194          }
1195
1196          my $seq      = $fastaObj->seq();
1197          my $def      = $index->header_for_hit($hit);
1198          my $fasta    = Fasta::toFasta('>'.$def, \$seq);
1199          $fastas     .= $$fasta;
1200       }
1201    }
1202    return \$fastas;
1203 }
1204 #-----------------------------------------------------------------------------
1205 sub build_fasta_index {
1206    my $db = shift;
1207    my $index = new FastaDB($db);
1208    return $index;
1209 }
1210 #-----------------------------------------------------------------------------
1211 sub build_all_indexes {
1212    my $CTL_OPT = shift;
1213
1214    my @dbs = ($CTL_OPT->{_est},
1215               $CTL_OPT->{_protein},
1216               $CTL_OPT->{_repeat_protein},
1217               $CTL_OPT->{_est_reads},
1218               $CTL_OPT->{_altest}
1219              );
1220
1221    foreach my $db (@dbs){
1222        next if(! $db);
1223        if($CTL_OPT->{force}){
1224            foreach my $f (@{[<$db/*.index>]}){
1225                unlink("$f") if(-f $f);
1226            }
1227        }
1228        new FastaDB($db);
1229    }
1230 }
1231 #-----------------------------------------------------------------------------
1232 sub dbformat {
1233    my $command = shift;
1234    my $file = shift;
1235    my $type = shift;
1236
1237    die "ERROR: Can not find xdformat or formatdb executable\n" if(! -e $command);
1238    die "ERROR: Can not find the db file $file\n" if(! -e $file);
1239    die "ERROR: You must define a type (blastn|blastx|tblastx)\n" if(! $type);
1240
1241
1242    if ($command =~ /xdformat/) {
1243       if (($type eq 'blastn' && ! -e $file.'.xnd') ||
1244           ($type eq 'blastx' && ! -e $file.'.xpd') ||
1245           ($type eq 'tblastx' && ! -e $file.'.xnd')
1246          ) {
1247          $command .= " -p" if($type eq 'blastx');
1248          $command .= " -n" if($type eq 'blastn' || $type eq 'tblastx');
1249          $command .= " $file";
1250
1251          my $w = new Widget::xdformat();
1252          print STDERR "formating database...\n" unless $main::quiet;
1253          $w->run($command);
1254       }
1255    }
1256    elsif ($command =~ /formatdb/) {
1257       if (($type eq 'blastn' && ! -e $file.'.nsq') ||
1258           ($type eq 'blastx' && ! -e $file.'.psq') ||
1259           ($type eq 'tblastx' && ! -e $file.'.nsq')
1260          ) {
1261          $command .= " -p T" if($type eq 'blastx');
1262          $command .= " -p F" if($type eq 'blastn' || $type eq 'tblastx');
1263          $command .= " -i $file";
1264
1265          my $w = new Widget::formatdb();
1266          print STDERR "formating database...\n" unless $main::quiet;
1267          $w->run($command);
1268       }
1269    }
1270    else {
1271       die "ERROR: databases can only be formated by xdformat or formatdb not \'$command\'\n";
1272    }
1273 }
1274 #-----------------------------------------------------------------------------
1275 sub blastn_as_chunks {
1276    my $chunk      = shift;
1277    my $db         = shift;
1278    my $old_db     = shift;
1279    my $the_void   = shift;
1280    my $seq_id     = shift;
1281    my $CTL_OPT    = shift;
1282    my $rank       = shift;
1283    my $LOG        = shift;
1284    my $LOG_FLAG   = shift;
1285
1286    my $blastn      = $CTL_OPT->{_blastn};
1287    my $bit_blastn  = $CTL_OPT->{bit_blastn};
1288    my $eval_blastn = $CTL_OPT->{eval_blastn};
1289    my $pcov_blastn = $CTL_OPT->{pcov_blastn};
1290    my $pid_blastn  = $CTL_OPT->{pid_blastn};
1291    my $split_hit   = $CTL_OPT->{split_hit};
1292    my $cpus        = $CTL_OPT->{cpus};
1293    my $formater    = $CTL_OPT->{_formater};
1294    my $softmask    = $CTL_OPT->{softmask};
1295    my $org_type    = $CTL_OPT->{organism_type};
1296
1297    #build names for files to use and copy
1298    my ($db_n) = $db =~ /([^\/]+)$/;
1299    $db_n  =~ s/\.fasta$//;
1300        
1301    my $chunk_number = $chunk->number();
1302  
1303    my ($db_old_n) = $old_db =~ /([^\/]+)$/;
1304    $db_old_n  =~ s/\.fasta$//;
1305    my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.blastn";
1306
1307    my $t_dir = $TMP."/rank".$rank;
1308    File::Path::mkpath($t_dir);
1309
1310    my $t_file_name = "$t_dir/$seq_id\.$chunk_number";
1311    my $blast_dir = "$blast_finished\.temp_dir";
1312    my $o_file    = "$blast_dir/$db_n\.blastn";
1313
1314    $db =~ /([^\/]+)$/;
1315    my $tmp_db = "$TMP/$1";
1316
1317    $LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG);
1318
1319    #copy db to local tmp dir and run xdformat or formatdb
1320    if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished)) {
1321        if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){
1322            copy($db, $tmp_db) if(! -e $tmp_db);
1323            dbformat($formater, $tmp_db, 'blastn');
1324            $lock->unlock;
1325        }
1326    }
1327    elsif (-e $blast_finished) {
1328       print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG);
1329       print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG);
1330       return $blast_dir;
1331    }
1332        
1333    #call blast executable
1334    $chunk->write_file($t_file_name); 
1335
1336    runBlastn($t_file_name,
1337              $tmp_db,
1338              $o_file,
1339              $blastn,
1340              $eval_blastn,
1341              $split_hit,
1342              $cpus,
1343              $org_type,
1344              $softmask
1345             );
1346
1347    $chunk->erase_fasta_file();
1348
1349    return $blast_dir;
1350 }
1351 #-----------------------------------------------------------------------------
1352 sub collect_blastn{
1353    my $chunk     = shift;
1354    my $blast_dir = shift;
1355    my $CTL_OPT   = shift;
1356    my $LOG       = shift;
1357
1358    my $eval_blastn = $CTL_OPT->{eval_blastn};
1359    my $bit_blastn  = $CTL_OPT->{bit_blastn},
1360    my $pcov_blastn = $CTL_OPT->{pcov_blastn};
1361    my $pid_blastn  = $CTL_OPT->{pid_blastn};
1362    my $split_hit   = $CTL_OPT->{split_hit};
1363
1364    my $blast_finished = $blast_dir;
1365    $blast_finished =~ s/\.temp_dir$//;
1366
1367    #merge blast reports
1368    if (! -e $blast_finished) {
1369       system ("cat $blast_dir/*blast* > $blast_finished");
1370       File::Path::rmtree ("$blast_dir");
1371    }
1372
1373    my %params;
1374    $params{significance}  = $eval_blastn;
1375    $params{hsp_bit_min}   = $bit_blastn;
1376    $params{percov}        = $pcov_blastn;
1377    $params{percid}        = $pid_blastn;
1378    $params{split_hit}     = $split_hit;
1379
1380    $LOG->add_entry("FINISHED", $blast_finished, "");
1381
1382    my $chunk_keepers = Widget::blastn::parse($blast_finished,
1383                                              \%params,
1384                                             );
1385    
1386    PhatHit_utils::add_offset($chunk_keepers,
1387                              $chunk->offset(),
1388                             );
1389
1390    if ($chunk->p_cutoff || $chunk->m_cutoff) {
1391       my @keepers;
1392      
1393       foreach my $hit (@{$chunk_keepers}) {
1394          if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) {
1395             push (@keepers, $hit)
1396          }
1397          elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) {
1398             push (@keepers, $hit)
1399          }
1400       }
1401      
1402       return \@keepers;
1403    }
1404    else {
1405       return $chunk_keepers
1406    }
1407 }
1408 #-----------------------------------------------------------------------------
1409 sub blastn {
1410    my $chunk      = shift;
1411    my $db         = shift;
1412    my $the_void   = shift;
1413    my $seq_id     = shift;
1414    my $CTL_OPT    = shift;
1415    my $LOG        = shift;
1416
1417    my $blastn      = $CTL_OPT->{_blastn};
1418    my $bit_blastn  = $CTL_OPT->{bit_blastn};
1419    my $eval_blastn = $CTL_OPT->{eval_blastn};
1420    my $pcov_blastn = $CTL_OPT->{pcov_blastn};
1421    my $pid_blastn  = $CTL_OPT->{pid_blastn};
1422    my $split_hit   = $CTL_OPT->{split_hit};
1423    my $cpus        = $CTL_OPT->{cpus};
1424    my $formater    = $CTL_OPT->{_formater};
1425    my $softmask    = $CTL_OPT->{softmask};
1426    my $org_type    = $CTL_OPT->{organism_type};
1427
1428    my ($db_n) = $db =~ /([^\/]+)$/;
1429    $db_n  =~ s/\.fasta$//;
1430        
1431    my $chunk_number = $chunk->number();
1432    my $q_length = $chunk->parent_seq_length();
1433    my $file_name = "$the_void/$seq_id\.$chunk_number";
1434    my $o_file    = "$the_void/$seq_id\.$chunk_number\.$db_n\.blastn";
1435
1436    $LOG->add_entry("STARTED", $o_file, "");
1437
1438    $chunk->write_file($file_name);
1439    runBlastn($file_name,
1440              $db,
1441              $o_file,
1442              $blastn,
1443              $eval_blastn,
1444              $split_hit,
1445              $cpus,
1446              $org_type,
1447              $softmask
1448              );
1449
1450    my %params;
1451    $params{significance}  = $eval_blastn;
1452    $params{hsp_bit_min}   = $bit_blastn;
1453    $params{percov}        = $pcov_blastn;
1454    $params{percid}        = $pid_blastn;
1455    $params{split_hit}     = $split_hit;
1456
1457    my $chunk_keepers = Widget::blastn::parse($o_file,
1458                                              \%params,
1459                                             );
1460
1461    $LOG->add_entry("FINISHED", $o_file, "");
1462
1463    PhatHit_utils::add_offset($chunk_keepers,
1464                              $chunk->offset(),
1465                             );
1466
1467    $chunk->erase_fasta_file();
1468
1469    return $chunk_keepers
1470 }
1471 #-----------------------------------------------------------------------------
1472 sub runBlastn {
1473    my $q_file     = shift;
1474    my $db         = shift;
1475    my $out_file   = shift;
1476    my $blast      = shift;
1477    my $eval_blast = shift;
1478    my $split_hit  = shift;
1479    my $cpus       = shift;
1480    my $org_type   = shift;
1481    my $softmask   = shift;
1482
1483    my $command  = $blast;
1484    if ($command =~ /blastn$/) {
1485       $command .= " $db $q_file B=100000 V=100000 E=$eval_blast";
1486       $command .= ($softmask) ? " wordmask=seg" : " filter=seg";;
1487       $command .= " R=3";
1488       $command .= " W=15";
1489       $command .= " M=1";
1490       $command .= " N=-3";
1491       $command .= " Q=3";
1492       $command .= " Z=1000";
1493       $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000";
1494       $command .= " cpus=$cpus";       
1495       $command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : "";
1496       $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5";
1497       $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5";
1498       $command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : "";
1499       $command .= " lcmask";
1500       $command .= " gi";
1501       $command .= " warnings"; #suppress certain warnings
1502       $command .= " novalidctxok"; #fixes failure related to short and masked sequence
1503       $command .= " shortqueryok"; #fixes failure related to very short sequence
1504       $command .= ($org_type eq 'eukaryotic') ? "" : " kap";
1505       #$command .= " mformat=2"; # remove for full report
1506       $command .= " -o $out_file";
1507    }
1508    elsif ($command =~ /blastall$/) {
1509       $command .= " -p blastn";
1510       $command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast";
1511       $command .= " -E 3";
1512       $command .= " -W 15";
1513       $command .= " -r 1";
1514       $command .= " -q -3";
1515       $command .= " -G 3";
1516       $command .= " -z 1000";
1517       $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000";
1518       $command .= " -a $cpus"
1519       $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5";
1520       $command .= " -U";
1521       $command .= " -F T";
1522       $command .= " -I";
1523       #$command .= " -m 8"; # remove for full report
1524       $command .= " -o $out_file";
1525    }
1526    else{
1527       die "ERROR: Must be a blastn executable"
1528    }
1529
1530    my $w = new Widget::blastn();
1531    if (-e $out_file) {
1532       print STDERR "re reading blast report.\n" unless $main::quiet;
1533       print STDERR "$out_file\n" unless $main::quiet;
1534    }
1535    else {
1536       print STDERR "running  blast search.\n" unless $main::quiet;
1537       my $dir = $out_file;
1538       $dir =~ s/[^\/]+$//;
1539       File::Path::mkpath($dir);
1540       $w->run($command);
1541    }
1542 }
1543 #-----------------------------------------------------------------------------
1544 sub blastx_as_chunks {
1545    my $chunk      = shift;
1546    my $db         = shift;
1547    my $old_db     = shift;
1548    my $the_void   = shift;
1549    my $seq_id     = shift;
1550    my $CTL_OPT    = shift;
1551    my $rank       = shift;
1552    my $LOG        = shift;
1553    my $LOG_FLAG   = shift;
1554
1555    my $rflag = 1 if($old_db && $CTL_OPT->{repeat_protein} eq $old_db); #am I running repeat data?
1556
1557    my $blastx      = $CTL_OPT->{_blastx};
1558    my $bit_blastx  = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx};
1559    my $eval_blastx = ($rflag) ? $CTL_OPT->{eval_rm_blastx} : $CTL_OPT->{eval_blastx};
1560    my $pcov_blastx = ($rflag) ? $CTL_OPT->{pcov_rm_blastx} : $CTL_OPT->{pcov_blastx};
1561    my $pid_blastx  = ($rflag) ? $CTL_OPT->{pid_rm_blastx} : $CTL_OPT->{pid_blastx};
1562    my $split_hit   = $CTL_OPT->{split_hit}; #repeat proteins get shatttered later anyway
1563    my $cpus        = $CTL_OPT->{cpus};
1564    my $formater    = $CTL_OPT->{_formater};
1565    my $softmask    = ($rflag) ? 1 : $CTL_OPT->{softmask}; #always on for repeats
1566    my $org_type    = $CTL_OPT->{organism_type};
1567
1568    #build names for files to use and copy       
1569    my ($db_n) = $db =~ /([^\/]+)$/;
1570    $db_n  =~ s/\.fasta$//;
1571
1572    my $chunk_number = $chunk->number();
1573    
1574    my ($db_old_n) = $old_db =~ /([^\/]+)$/;
1575    $db_old_n  =~ s/\.fasta$//;
1576    my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.blastx";
1577    
1578    my $t_dir = $TMP."/rank".$rank;
1579    File::Path::mkpath($t_dir);
1580
1581    my $t_file_name = "$t_dir/$seq_id\.$chunk_number";
1582    my $blast_dir = "$blast_finished\.temp_dir";
1583    my $o_file    = "$blast_dir/$db_n\.blastx";
1584    
1585    $db =~ /([^\/]+)$/;
1586    my $tmp_db = "$TMP/$1";
1587
1588    $LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG);
1589
1590    #copy db to local tmp dir and run xdformat or format db
1591    if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished) ) {
1592        if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){
1593            copy($db, $tmp_db) if(! -e $tmp_db);
1594            dbformat($formater, $tmp_db, 'blastx');
1595            $lock->unlock;
1596        }
1597    }
1598    elsif (-e $blast_finished) {
1599       print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG);
1600       print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG);
1601       return $blast_dir;
1602    }
1603
1604    #call blast executable           
1605    $chunk->write_file($t_file_name);
1606
1607    runBlastx($t_file_name,
1608              $tmp_db,
1609              $o_file,
1610              $blastx,
1611              $eval_blastx,
1612              $split_hit,
1613              $cpus,
1614              $org_type,
1615              $softmask
1616             );
1617
1618    $chunk->erase_fasta_file();
1619
1620    return $blast_dir;
1621 }
1622 #-----------------------------------------------------------------------------
1623 sub collect_blastx{
1624    my $chunk     = shift;
1625    my $blast_dir = shift;
1626    my $CTL_OPT   = shift;
1627    my $LOG       = shift;
1628
1629    my $eval_blastx = $CTL_OPT->{eval_blastx};
1630    my $bit_blastx  = $CTL_OPT->{bit_blastx},
1631    my $pcov_blastx = $CTL_OPT->{pcov_blastx};
1632    my $pid_blastx  = $CTL_OPT->{pid_blastx};
1633    my $split_hit   = $CTL_OPT->{split_hit};
1634
1635    my $blast_finished = $blast_dir;
1636    $blast_finished =~ s/\.temp_dir$//;
1637
1638    #merge blast reports
1639    if (! -e $blast_finished) {
1640       system ("cat $blast_dir/*blast* > $blast_finished");
1641       rmtree ("$blast_dir");
1642    }
1643
1644    my %params;
1645    $params{significance}  = $eval_blastx;
1646    $params{hsp_bit_min}   = $bit_blastx;
1647    $params{percov}        = $pcov_blastx;
1648    $params{percid}        = $pid_blastx;
1649    $params{split_hit}     = $split_hit;
1650
1651    $LOG->add_entry("FINISHED", $blast_finished, "");
1652    
1653    my $chunk_keepers = Widget::blastx::parse($blast_finished,
1654                                              \%params,
1655                                             );
1656
1657    PhatHit_utils::add_offset($chunk_keepers,
1658                              $chunk->offset()
1659                             );
1660
1661    if ($chunk->p_cutoff || $chunk->m_cutoff) {
1662       my @keepers;
1663      
1664       foreach my $hit (@{$chunk_keepers}) {
1665          if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) {
1666             push (@keepers, $hit)
1667          }
1668          elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) {
1669             push (@keepers, $hit)
1670          }
1671       }
1672      
1673       return \@keepers;
1674    }
1675    else {
1676       return $chunk_keepers;
1677    }
1678 }
1679 #-----------------------------------------------------------------------------
1680 sub blastx {
1681    my $chunk      = shift;
1682    my $db         = shift;
1683    my $the_void   = shift;
1684    my $seq_id     = shift;
1685    my $CTL_OPT    = shift;
1686    my $LOG        = shift;
1687
1688    my $rflag = 1 if($db && $CTL_OPT->{repeat_protein} eq $db); #am I running repeat data?
1689
1690    my $blastx      = $CTL_OPT->{_blastx};
1691    my $bit_blastx  = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx};
1692    my $eval_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{eval_blastx};
1693    my $pcov_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pcov_blastx};
1694    my $pid_blastx  = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pid_blastx};
1695    my $split_hit   = $CTL_OPT->{split_hit}; #repeat proteins get shatttered later anyway
1696    my $cpus        = $CTL_OPT->{cpus};
1697    my $formater    = $CTL_OPT->{_formater};
1698    my $softmask    = ($rflag) ? 1 : $CTL_OPT->{softmask};
1699    my $org_type    = $CTL_OPT->{organism_type};
1700
1701    my ($db_n) = $db =~ /([^\/]+)$/;
1702    $db_n  =~ s/\.fasta$//;
1703
1704    my $q_length = $chunk->parent_seq_length();
1705    my $chunk_number = $chunk->number();
1706                
1707    my $file_name = "$the_void/$seq_id\.$chunk_number";
1708    my $o_file    = "$the_void/$seq_id\.$chunk_number\.$db_n\.blastx";
1709
1710    $LOG->add_entry("STARTED", $o_file, "");
1711
1712    $chunk->write_file($file_name);
1713    runBlastx($file_name,
1714              $db,
1715              $o_file,
1716              $blastx,
1717              $eval_blastx,
1718              $split_hit,
1719              $cpus,
1720              $org_type,
1721              $softmask
1722             );
1723
1724    my %params;
1725    $params{significance}  = $eval_blastx;
1726    $params{hsp_bit_min}   = $bit_blastx;
1727    $params{percov}        = $pcov_blastx;
1728    $params{percid}        = $pid_blastx;
1729    $params{split_hit}     = $split_hit;
1730    
1731    my $chunk_keepers = Widget::blastx::parse($o_file,
1732                                              \%params,
1733                                             );
1734
1735    $LOG->add_entry("FINISHED", $o_file, "");
1736
1737    PhatHit_utils::add_offset($chunk_keepers,
1738                              $chunk->offset(),
1739                             );
1740    
1741    $chunk->erase_fasta_file();
1742    
1743    if ($chunk->p_cutoff || $chunk->m_cutoff) {
1744       my @keepers;
1745      
1746       foreach my $hit (@{$chunk_keepers}) {
1747          if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) {
1748             push (@keepers, $hit)
1749          }
1750          elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) {
1751             push (@keepers, $hit)
1752          }
1753       }
1754      
1755       return \@keepers;
1756    }
1757    else {
1758       return $chunk_keepers
1759    }
1760 }
1761
1762 #-----------------------------------------------------------------------------
1763 sub runBlastx {
1764    my $q_file   = shift;
1765    my $db       = shift;
1766    my $out_file = shift;
1767    my $blast = shift;
1768    my $eval_blast = shift;
1769    my $split_hit = shift;
1770    my $cpus = shift;
1771    my $org_type = shift;
1772    my $softmask = shift;
1773
1774
1775    my $command  = $blast;
1776    if ($command =~ /blastx$/) {
1777       $command .= " $db $q_file B=10000 V=10000 E=$eval_blast";
1778       $command .= ($softmask) ? " wordmask=seg" : " filter=seg";
1779       #$command .= " T=20";
1780       #$command .= " W=5";
1781       #$command .= " wink=5";
1782       $command .= " Z=300";
1783       $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000";
1784       $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5";
1785       $command .= " cpus=$cpus";
1786       $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5";
1787       #$command .= " hspsepqmax=10000";
1788       $command .= " lcmask";
1789       $command .= " kap";
1790       $command .= " gi";
1791       $command .= " warnings"; #suppress certain warnings
1792       $command .= " novalidctxok"; #fixes failure related to short and masked sequence
1793       $command .= " shortqueryok"; #fixes failure related to very short sequence
1794       #$command .= " mformat=2"; # remove for full report
1795       $command .= " -o $out_file";
1796    }
1797    elsif ($command =~ /blastall$/) {
1798       $command .= " -p blastx";
1799       $command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast";
1800       $command .= " -z 300";
1801       $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000";
1802       $command .= " -a $cpus"
1803       $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5";
1804       $command .= " -U";
1805       $command .= " -F T";
1806       $command .= " -I";
1807       #$command .= " -m 8"; # remove for full report
1808       $command .= " -o $out_file";
1809    }
1810    else{
1811       die "ERROR: Must be a blastx executable"
1812    }
1813
1814    my $w = new Widget::blastx();
1815
1816    if (-e $out_file) {
1817       print STDERR "re reading blast report.\n" unless $main::quiet;
1818       print STDERR "$out_file\n" unless $main::quiet;
1819    }
1820    else {
1821       print STDERR "running  blast search.\n" unless $main::quiet;
1822       my $dir = $out_file;
1823       $dir =~ s/[^\/]+$//;
1824       File::Path::mkpath($dir);
1825       $w->run($command);
1826    }
1827 }
1828 #-----------------------------------------------------------------------------
1829 sub tblastx_as_chunks {
1830    my $chunk      = shift;
1831    my $db         = shift;
1832    my $old_db     = shift;
1833    my $the_void   = shift;
1834    my $seq_id     = shift;
1835    my $CTL_OPT    = shift;
1836    my $rank       = shift;
1837    my $LOG        = shift;
1838    my $LOG_FLAG   = shift;
1839
1840    my $tblastx      = $CTL_OPT->{_tblastx};
1841    my $bit_tblastx  = $CTL_OPT->{bit_tblastx};
1842    my $eval_tblastx = $CTL_OPT->{eval_tblastx};
1843    my $pcov_tblastx = $CTL_OPT->{pcov_tblastx};
1844    my $pid_tblastx  = $CTL_OPT->{pid_tblastx};
1845    my $split_hit    = $CTL_OPT->{split_hit};
1846    my $cpus         = $CTL_OPT->{cpus};
1847    my $formater     = $CTL_OPT->{_formater};
1848    my $softmask     = $CTL_OPT->{softmask};
1849    my $org_type    = $CTL_OPT->{organism_type};
1850
1851    #build names for files to use and copy
1852    my ($db_n) = $db =~ /([^\/]+)$/;
1853    $db_n  =~ s/\.fasta$//;
1854        
1855    my $chunk_number = $chunk->number();
1856  
1857    my ($db_old_n) = $old_db =~ /([^\/]+)$/;
1858    $db_old_n  =~ s/\.fasta$//;
1859    my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.tblastx";
1860
1861    my $t_dir = $TMP."/rank".$rank;
1862    File::Path::mkpath($t_dir);
1863
1864    my $t_file_name = "$t_dir/$seq_id\.$chunk_number";
1865    my $blast_dir = "$blast_finished\.temp_dir";
1866    my $o_file    = "$blast_dir/$db_n\.tblastx";
1867
1868    $db =~ /([^\/]+)$/;
1869    my $tmp_db = "$TMP/$1";
1870
1871    $LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG);
1872
1873    #copy db to local tmp dir and run xdformat or formatdb
1874    if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished) ) {
1875        if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){
1876            copy($db, $tmp_db) if(! -e $tmp_db);
1877            dbformat($formater, $tmp_db, 'tblastx');
1878            $lock->unlock;
1879        }
1880    }
1881    elsif (-e $blast_finished) {
1882       print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG);
1883       print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG);
1884       return $blast_dir;
1885    }
1886        
1887    #call blast executable
1888    $chunk->write_file($t_file_name); 
1889
1890    runtBlastx($t_file_name,
1891               $tmp_db,
1892               $o_file,
1893               $tblastx,
1894               $eval_tblastx,
1895               $split_hit,
1896               $cpus,
1897               $org_type,
1898               $softmask
1899              );
1900
1901    $chunk->erase_fasta_file();
1902
1903    return $blast_dir;
1904 }
1905 #-----------------------------------------------------------------------------
1906 sub collect_tblastx{
1907    my $chunk     = shift;
1908    my $blast_dir = shift;
1909    my $CTL_OPT   = shift;
1910    my $LOG       = shift;
1911
1912    my $eval_tblastx = $CTL_OPT->{eval_tblastx};
1913    my $bit_tblastx  = $CTL_OPT->{bit_tblastx},
1914    my $pcov_tblastx = $CTL_OPT->{pcov_tblastx};
1915    my $pid_tblastx  = $CTL_OPT->{pid_tblastx};
1916    my $split_hit   = $CTL_OPT->{split_hit};
1917
1918    my $blast_finished = $blast_dir;
1919    $blast_finished =~ s/\.temp_dir$//;
1920
1921    #merge blast reports
1922    if (! -e $blast_finished) {
1923       system ("cat $blast_dir/*blast* > $blast_finished");
1924       File::Path::rmtree ("$blast_dir");
1925    }
1926
1927    my %params;
1928    $params{significance}  = $eval_tblastx;
1929    $params{hsp_bit_min}   = $bit_tblastx;
1930    $params{percov}        = $pcov_tblastx;
1931    $params{percid}        = $pid_tblastx;
1932    $params{split_hit}     = $split_hit;
1933
1934    $LOG->add_entry("FINISHED", $blast_finished, "");
1935
1936    my $chunk_keepers = Widget::tblastx::parse($blast_finished,
1937                                               \%params,
1938                                              );
1939    
1940    PhatHit_utils::add_offset($chunk_keepers,
1941                              $chunk->offset(),
1942                             );
1943
1944    if ($chunk->p_cutoff || $chunk->m_cutoff) {
1945       my @keepers;
1946      
1947       foreach my $hit (@{$chunk_keepers}) {
1948          if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) {
1949             push (@keepers, $hit)
1950          }
1951          elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) {
1952             push (@keepers, $hit)
1953          }
1954       }
1955      
1956       return \@keepers;
1957    }
1958    else {
1959       return $chunk_keepers
1960    }
1961 }
1962 #-----------------------------------------------------------------------------
1963 sub tblastx {
1964    my $chunk      = shift;
1965    my $db         = shift;
1966    my $the_void   = shift;
1967    my $seq_id     = shift;
1968    my $CTL_OPT    = shift;
1969    my $LOG        = shift;
1970
1971    my $tblastx      = $CTL_OPT->{_tblastx};
1972    my $bit_tblastx  = $CTL_OPT->{bit_tblastx};
1973    my $eval_tblastx = $CTL_OPT->{eval_tblastx};
1974    my $pcov_tblastx = $CTL_OPT->{pcov_tblastx};
1975    my $pid_tblastx  = $CTL_OPT->{pid_tblastx};
1976    my $split_hit    = $CTL_OPT->{split_hit};
1977    my $cpus         = $CTL_OPT->{cpus};
1978    my $formater     = $CTL_OPT->{_formater};
1979    my $softmask     = $CTL_OPT->{softmask};
1980    my $org_type    = $CTL_OPT->{organism_type};
1981
1982    my ($db_n) = $db =~ /([^\/]+)$/;
1983    $db_n  =~ s/\.fasta$//;
1984        
1985    my $chunk_number = $chunk->number();
1986    my $q_length = $chunk->parent_seq_length();
1987    my $file_name = "$the_void/$seq_id\.$chunk_number";
1988    my $o_file    = "$the_void/$seq_id\.$chunk_number\.$db_n\.tblastx";
1989
1990    $LOG->add_entry("STARTED", $o_file, "");
1991
1992    $chunk->write_file($file_name);
1993    runtBlastx($file_name,
1994               $db,
1995               $o_file,
1996               $tblastx,
1997               $eval_tblastx,
1998               $split_hit,
1999               $cpus,
2000               $org_type,
2001               $softmask
2002              );
2003
2004    my %params;
2005    $params{significance}  = $eval_tblastx;
2006    $params{hsp_bit_min}   = $bit_tblastx;
2007    $params{percov}        = $pcov_tblastx;
2008    $params{percid}        = $pid_tblastx;
2009    $params{split_hit}     = $split_hit;
2010
2011    my $chunk_keepers = Widget::tblastx::parse($o_file,
2012                                               \%params,
2013                                              );
2014
2015    $LOG->add_entry("FINISHED", $o_file, "");
2016
2017    PhatHit_utils::add_offset($chunk_keepers,
2018                              $chunk->offset(),
2019                             );
2020
2021    $chunk->erase_fasta_file();
2022
2023    return $chunk_keepers;
2024 }
2025 #-----------------------------------------------------------------------------
2026 sub runtBlastx {
2027    my $q_file   = shift;
2028    my $db       = shift;
2029    my $out_file = shift;
2030    my $blast = shift;
2031    my $eval_blast = shift;
2032    my $split_hit = shift;
2033    my $cpus = shift;
2034    my $org_type = shift;
2035    my $softmask = shift;
2036
2037
2038    my $command  = $blast;
2039    if ($command =~ /tblastx$/) {
2040       $command .= " $db $q_file B=100000 V=100000 E=$eval_blast";
2041       $command .= ($softmask) ? " wordmask=seg" : " filter=seg";
2042       #$command .= " W=15";
2043       $command .= " Z=1000";
2044       $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000";
2045       $command .= " cpus=$cpus";       
2046       $command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : "";
2047       $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5";
2048       $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5";
2049       $command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : "";
2050       $command .= " lcmask";
2051       $command .= " gi";
2052       $command .= " warnings"; #suppress certain warnings
2053       $command .= " novalidctxok"; #fixes failure related to short and masked sequence
2054       $command .= " shortqueryok"; #fixes failure related to very short sequence
2055       $command .= ($org_type eq 'eukaryotic') ? "" : " kap";
2056       #$command .= " mformat=2"; # remove for full report
2057       $command .= " -o $out_file";
2058    }
2059    elsif ($command =~ /blastall$/) {
2060       $command .= " -p tblastx";
2061       $command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast";
2062       #$command .= " -W 15";
2063       $command .= " -z 1000";
2064       $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000";
2065       $command .= " -a $cpus"
2066       $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5";
2067       $command .= " -U";
2068       $command .= " -F T";
2069       $command .= " -I";
2070       #$command .= " -m 8"; # remove for full report
2071       $command .= " -o $out_file";
2072    }
2073    else{
2074       die "ERROR: Must be a tblastx executable"
2075    }
2076
2077    my $w = new Widget::tblastx();
2078    if (-e $out_file) {
2079       print STDERR "re reading blast report.\n" unless $main::quiet;
2080       print STDERR "$out_file\n" unless $main::quiet;
2081    }
2082    else {
2083       print STDERR "running  blast search.\n" unless $main::quiet;
2084       my $dir = $out_file;
2085       $dir =~ s/[^\/]+$//;
2086       File::Path::mkpath($dir);
2087       $w->run($command);
2088    }
2089
2090 }
2091 #-----------------------------------------------------------------------------
2092 sub repeatmask {
2093    my $chunk        = shift;
2094    my $the_void     = shift;
2095    my $seq_id       = shift;
2096    my $model_org    = shift;
2097    my $RepeatMasker = shift;
2098    my $rmlib        = shift;
2099    my $cpus         = shift;
2100    my $LOG = shift;
2101
2102    my $chunk_number = $chunk->number();
2103    my $file_name = "$the_void/$seq_id\.$chunk_number";
2104    $file_name .= ".specific" if($rmlib);
2105    my $o_file    = "$file_name\.out";
2106    my $q_length = $chunk->parent_seq_length();
2107    my $query_def = $chunk->parent_def();
2108    my $query_seq = $chunk->seq();
2109    
2110    $LOG->add_entry("STARTED", $o_file, "");
2111    
2112    $chunk->write_file($file_name);
2113    
2114    runRepeatMasker($file_name,
2115                    $model_org,
2116                    $the_void,
2117                    $o_file,
2118                    $RepeatMasker,
2119                    $rmlib,
2120                    $cpus,
2121                   );            # -no_low
2122   
2123    my $rm_chunk_keepers = Widget::RepeatMasker::parse($o_file,
2124                                                       $seq_id,
2125                                                       $q_length
2126                                                      );
2127    
2128    $LOG->add_entry("FINISHED", $o_file, "");
2129    
2130    PhatHit_utils::add_offset($rm_chunk_keepers,
2131                              $chunk->offset(),
2132                             );
2133    #     PhatHit_utils::merge_hits($rm_keepers, 
2134    #                          $rm_chunk_keepers,
2135    #                          20,
2136    #                         );
2137   
2138    $chunk->erase_fasta_file();
2139        
2140    return ($rm_chunk_keepers);
2141 }
2142 #-----------------------------------------------------------------------------
2143 sub runRepeatMasker {
2144    my $q_file   = shift;
2145    my $species  = shift;
2146    my $dir      = shift;
2147    my $o_file   = shift;
2148    my $RepeatMasker = shift;
2149    my $rmlib = shift;
2150    my $cpus = shift;
2151    my $no_low   = shift;
2152        
2153    my $command  = $RepeatMasker;
2154
2155    if ($rmlib) {
2156       $command .= " $q_file -lib $rmlib -dir $dir -pa $cpus";   
2157    }
2158    else {
2159       $command .= " $q_file -species $species -dir $dir -pa $cpus";
2160    }
2161    $command .= " -nolow" if defined($no_low);
2162        
2163    my $w = new Widget::RepeatMasker();
2164    if (-e $o_file) {
2165       print STDERR "re reading repeat masker report.\n" unless $main::quiet;
2166       print STDERR "$o_file\n" unless $main::quiet;
2167    }
2168    else {
2169       print STDERR "running  repeat masker.\n" unless $main::quiet;
2170       $w->run($command);
2171    }
2172 }
2173
2174 #-----------------------------------------------------------------------------
2175 #returns a directory name to write analysis output to
2176 sub build_the_void {
2177    my $seq_id  = shift;
2178    my $out_dir = shift;
2179
2180    $out_dir =~ s/\/$//;
2181
2182    my $vid = "theVoid\.$seq_id";   
2183    my $the_void = "$out_dir/$vid";
2184    File::Path::mkpath ($the_void);
2185
2186    return $the_void;
2187 }
2188 #-----------------------------------------------------------------------------
2189 #this function sets the defualt values for control options
2190 #the function also determines what control options are valid, so
2191 #to add control options edit this function
2192 sub set_defaults {
2193    my $type = shift || 'all';
2194
2195    if ($type !~ /^all$|^opts$|^bopt$|^exe$/) {
2196       warn "WARNING: Invalid type \'$type\' in S_Func ::set_defaults";
2197       $type = 'all';
2198    }
2199
2200    my %CTL_OPT;
2201
2202    #maker_opts
2203    if ($type eq 'all' || $type eq 'opts') {
2204       $CTL_OPT{'genome'} = '';
2205       $CTL_OPT{'genome_gff'} = '';
2206       $CTL_OPT{'est_pass'} = 0;
2207       $CTL_OPT{'altest_pass'} = 0;
2208       $CTL_OPT{'protein_pass'} = 0;
2209       $CTL_OPT{'rm_pass'} = 0;
2210       $CTL_OPT{'model_pass'} = 1;
2211       $CTL_OPT{'pred_pass'} = 0;
2212       $CTL_OPT{'other_pass'} = 0;
2213       $CTL_OPT{'est'} = '';
2214       $CTL_OPT{'est_reads'} = '';
2215       $CTL_OPT{'altest'} = '';
2216       $CTL_OPT{'est_gff'} = '';
2217       $CTL_OPT{'altest_gff'} = '';
2218       $CTL_OPT{'protein'} = '';
2219       $CTL_OPT{'protein_gff'} = '';
2220       $CTL_OPT{'model_org'} = 'all';
2221       $CTL_OPT{'repeat_protein'} = Cwd::abs_path("$FindBin::Bin/../data/te_proteins.fasta");
2222       $CTL_OPT{'rmlib'} = '';
2223       $CTL_OPT{'rm_gff'} = '';
2224       $CTL_OPT{'organism_type'} = 'eukaryotic';
2225       $CTL_OPT{'predictor'} = 'est2genome';
2226       $CTL_OPT{'predictor'} = 'model_gff' if($main::eva);
2227       $CTL_OPT{'snaphmm'} = '';
2228       $CTL_OPT{'gmhmm'} = '';
2229       $CTL_OPT{'augustus_species'} = '';
2230       $CTL_OPT{'fgenesh_par_file'} = '';
2231       $CTL_OPT{'model_gff'} = '';
2232       $CTL_OPT{'pred_gff'} = '';
2233       $CTL_OPT{'other_gff'} = '';
2234       $CTL_OPT{'alt_peptide'} = 'C';
2235       $CTL_OPT{'cpus'} = 1;
2236       $CTL_OPT{'evaluate'} = 0;
2237       $CTL_OPT{'evaluate'} = 1 if($main::eva);
2238       $CTL_OPT{'max_dna_len'} = 100000;
2239       $CTL_OPT{'min_contig'} = 1;
2240       $CTL_OPT{'split_hit'} = 10000;
2241       $CTL_OPT{'softmask'} = 1;
2242       $CTL_OPT{'pred_flank'} = 200;
2243       $CTL_OPT{'single_exon'} = 0;
2244       $CTL_OPT{'single_length'} = 250;
2245       $CTL_OPT{'min_protein'} = 0;
2246       $CTL_OPT{'AED_threshold'} = 1;
2247       $CTL_OPT{'keep_preds'} = 0;
2248       $CTL_OPT{'map_forward'} = 0;
2249       $CTL_OPT{'retry'} = 1;
2250       $CTL_OPT{'clean_try'} = 0;
2251       $CTL_OPT{'TMP'} = '';
2252       $CTL_OPT{'run'} = '';
2253       $CTL_OPT{'unmask'} = 0;
2254       $CTL_OPT{'clean_up'} = 0;
2255       #evaluator below here
2256       $CTL_OPT{'side_thre'} = 5;
2257       $CTL_OPT{'eva_window_size'} = 70;
2258       $CTL_OPT{'eva_split_hit'} = 1;
2259       $CTL_OPT{'eva_hspmax'} = 100;
2260       $CTL_OPT{'eva_gspmax'} = 100;
2261       $CTL_OPT{'enable_fathom'} = 0;
2262       $CTL_OPT{'enable_fathom'} = 1 if($main::eva);
2263    }
2264
2265    #maker_bopts
2266    if ($type eq 'all' || $type eq 'bopts') {
2267       $CTL_OPT{'blast_type'} = 'wublast';
2268       $CTL_OPT{'pcov_blastn'} = 0.80;
2269       $CTL_OPT{'pid_blastn'} = 0.85;
2270       $CTL_OPT{'eval_blastn'} = 1e-10;
2271       $CTL_OPT{'bit_blastn'} = 40;
2272       $CTL_OPT{'pcov_blastx'} = 0.50;
2273       $CTL_OPT{'pid_blastx'} = 0.40;
2274       $CTL_OPT{'eval_blastx'} = 1e-6;
2275       $CTL_OPT{'bit_blastx'} = 30;
2276       $CTL_OPT{'pcov_rm_blastx'} = 0.50;
2277       $CTL_OPT{'pid_rm_blastx'} = 0.40;
2278       $CTL_OPT{'eval_rm_blastx'} = 1e-6;
2279       $CTL_OPT{'bit_rm_blastx'} = 30;
2280       $CTL_OPT{'pcov_tblastx'} = 0.80;
2281       $CTL_OPT{'pid_tblastx'} = 0.85;
2282       $CTL_OPT{'eval_tblastx'} = 1e-10;
2283       $CTL_OPT{'bit_tblastx'} = 40;
2284       $CTL_OPT{'en_score_limit'} = 20;
2285       $CTL_OPT{'ep_score_limit'} = 20;
2286       #evaluator below here
2287       $CTL_OPT{'eva_pcov_blastn'} = 0.80;
2288       $CTL_OPT{'eva_pid_blastn'} = 0.85;
2289       $CTL_OPT{'eva_eval_blastn'} = 1e-10;
2290       $CTL_OPT{'eva_bit_blastn'} = 40;
2291    }
2292
2293    #maker_exe
2294    if ($type eq 'all' || $type eq 'exe') {
2295       my @exes = ('xdformat',
2296                   'formatdb',
2297                   'blastall',
2298                   'blastn',
2299                   'blastx',
2300                   'tblastx',
2301                   'RepeatMasker',
2302                   'exonerate',
2303                   'snap',
2304                   'gmhmme3',
2305                   'gmhmmp',
2306                   'augustus',
2307                   'fgenesh',
2308                   'twinscan',
2309                   'jigsaw',
2310                   'qrna',
2311                   'fathom',
2312                   'probuild'
2313                  );
2314
2315       foreach my $exe (@exes) {
2316          my $loc = `which $exe 2> /dev/null`;
2317          chomp $loc;
2318          if ($loc =~ /^no $exe/) {
2319             $CTL_OPT{$exe} = '';
2320          }
2321          else {
2322             $CTL_OPT{$exe} = $loc;
2323          }
2324       }
2325    }
2326
2327    return %CTL_OPT;
2328 }
2329 #-----------------------------------------------------------------------------
2330 #this function parses the control files and sets up options for each maker run
2331 #error checking for starup occurs here
2332 sub load_control_files {
2333    my @ctlfiles = @{shift @_};
2334    my %OPT = %{shift @_};
2335    my $mpi_size = shift @_ || 1;
2336
2337    #--set default values and control structure
2338    my %CTL_OPT = set_defaults();
2339
2340    my $error;         #hold all fatal errors from control file parsing
2341
2342    #--load values from control files
2343    foreach my $ctlfile (@ctlfiles) {
2344       open (CTL, "< $ctlfile") or die"ERROR: Could not open control file \"$ctlfile\".\n";
2345        
2346       while (my $line = <CTL>) {
2347          chomp($line);
2348            
2349          if ($line !~ /^[\#\s\t\n]/ && $line =~ /^([^\:]+)\:([^\n\#]*)/) {
2350             my $key = $1;
2351             my $value = $2;
2352
2353             #remove preceding and trailing whitespace
2354             $value =~ s/^[\s\t]+|[\s\t]+$//g;
2355            
2356             if (exists $CTL_OPT{$key}) { #should already exist or is a bad value
2357                #resolve environmental variables
2358                if ($value =~ /\$/) {
2359                   $value = `echo \"$value\"`;
2360                   chomp($value);
2361                }
2362
2363                #require numerical values for certain options
2364                if ($CTL_OPT{$key} =~ /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?$/ &&
2365                    $value !~  /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?$/
2366                   ) {
2367                   $error .= "ERROR: valid number required for option \'$key\'\n\n"
2368                }
2369
2370                #set value
2371                $CTL_OPT{$key} = defined($value) ? $value : '';
2372             }
2373             else {
2374                warn "WARNING: Invalid option \'$key\' in control file $ctlfile\n\n";
2375             }
2376          }
2377       }
2378    }
2379
2380    #--load command line options
2381    $CTL_OPT{genome} = $OPT{genome} if (defined $OPT{genome});
2382    $CTL_OPT{genome_gff} = $OPT{genome_gff} if (defined $OPT{genome_gff});
2383    $CTL_OPT{model_gff} = $OPT{model_gff} if (defined $OPT{model_gff});
2384    $CTL_OPT{force} = $OPT{force} if (defined $OPT{force});
2385    $CTL_OPT{predictor} = $OPT{predictor} if (defined $OPT{predictor});
2386    $CTL_OPT{retry} = $OPT{retry} if (defined $OPT{retry});
2387    $CTL_OPT{cpus} = $OPT{cpus} if (defined $OPT{cpus});
2388    $CTL_OPT{clean_try} = $OPT{clean_try} if (defined $OPT{clean_try});
2389    $CTL_OPT{again} = $OPT{again} if (defined $OPT{again});
2390
2391    #skip repeat masking command line option
2392    if ($OPT{R} || $CTL_OPT{organism_type} eq 'prokaryotic') {
2393       $CTL_OPT{model_org} = '';
2394       $CTL_OPT{repeat_protein} = '';
2395       $CTL_OPT{rmlib} = '';
2396       $CTL_OPT{rm_gff} = '';
2397       $CTL_OPT{rm_pass} = 0;
2398
2399       print STDERR "INFO: ";
2400       print STDERR "No repeats expected in prokaryotic organisms.\n"
2401           if($CTL_OPT{organism_type} eq 'prokaryotic');
2402       print STDERR "Now removing all repeat masking options.\n\n";
2403    }
2404
2405    #if no repeat masking options are set don't run masking dependent methods
2406    #i.e. unmasked predictions
2407    if ($CTL_OPT{model_org} eq '' &&
2408        $CTL_OPT{repeat_protein} eq '' &&
2409        $CTL_OPT{rmlib} eq '' &&
2410        $CTL_OPT{rm_gff} eq '' &&
2411        ($CTL_OPT{rm_pass} == 0 ||
2412         $CTL_OPT{genome_gff} eq '')
2413       ) {
2414        warn "WARNING: There are no masking options set, yet unmask is set to 1.\n".
2415            "This is not valid. The value for unmask will be set to 0.\n\n"
2416            if($CTL_OPT{unmask} == 1);
2417
2418        $CTL_OPT{unmask} = 0;
2419        $CTL_OPT{_no_mask} = 1; #no masking options found
2420    }
2421
2422    #required for evaluator to work
2423    $CTL_OPT{predictor} = 'model_gff' if($main::eva);
2424    $CTL_OPT{model_pass} = 1 if($main::eva);
2425
2426    #parse predictor and error check
2427    $CTL_OPT{predictor} =~ s/\s+//g;
2428    my @predictors = split(',', $CTL_OPT{predictor});
2429
2430    my %uniq;
2431    $CTL_OPT{_predictor} = [];
2432    foreach my $p (@predictors) {
2433        if ($p !~ /^snap$|^augustus$|^est2genome$|^protein2genome$|^fgenesh$/ &&
2434            $p !~ /^genemark$|^jigsaw$|^model_gff$|^pred_gff$/
2435            ) {
2436            $error .= "ERROR: Invalid predictor defined: $p\n".
2437                "Valid entries are: est2genome, model_gff, pred_gff,\n".
2438                "snap, genemark, augustus, and fgenesh\n\n";
2439        }
2440        elsif($CTL_OPT{organism_type} eq 'prokaryotic'){
2441            if($p =~ /^snap$|^augustus$|^fgenesh$|^jigsaw$/){
2442                warn "WARNING: the predictor $p does not support prokaryotic organisms\n".
2443                    "and will be ignored.\n\n";
2444            }
2445            else{
2446                push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p} || $p !~ /genemark/);
2447                push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p});
2448                $uniq{$p}++;
2449            }
2450        }
2451        elsif($CTL_OPT{organism_type} eq 'eukaryotic'){
2452            push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p});
2453            if($p =~ /^snap$|^augustus$|^fgenesh$|^genemark$|^jigsaw$/){
2454                push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p});
2455            }
2456            $uniq{$p}++;
2457        }
2458        else{
2459            die "FATAL: Logic error in GI::load_contol_files\n";
2460        }
2461    }
2462
2463    #reset predictor value based on previous filtering step (for log)
2464    $CTL_OPT{predictor} = join(",", @{$CTL_OPT{_predictor}});
2465
2466    #parse run and error check
2467    $CTL_OPT{run} =~ s/\s+//g;
2468    my @run = split(',', $CTL_OPT{run});
2469
2470    foreach my $p (@run) {
2471       if ($p !~ /^snap$|^augustus$|^fgenesh$|^genemark$|^jigsaw$/) {
2472          $error .= "ERROR: Invalid value defined for run: $p\n".
2473          "Valid entries are: snap, augustus, genemark, or fgenesh\n\n";
2474       }
2475       else {
2476          push(@{$CTL_OPT{_run}}, $p) unless($uniq{$p});
2477          $uniq{$p}++;
2478       }
2479    }
2480
2481    #check blast type validity and related values (NCBI vs. WUBLAST)
2482    if ($CTL_OPT{blast_type} !~ /^wublast$|^ncbi$/) {
2483       warn "WARNING: blast_type must be set to \'wublast\' or \'ncbi\'.\n",
2484       "The value $CTL_OPT{blast_type} is invalid.\n",
2485       "This will now be reset to the default 'wublast'.\n\n";
2486      
2487       $CTL_OPT{blast_type} = 'wublast';
2488    }
2489    
2490    if ($CTL_OPT{blast_type} =~ /^wublast$/ &&
2491        ! -e $CTL_OPT{blastn} &&
2492        ! -e $CTL_OPT{blastx} &&
2493        ! -e $CTL_OPT{tblastx} &&
2494        -e $CTL_OPT{blastall}
2495       ) {
2496       warn "WARNING: blast_type is set to \'wublast\' but wublast executables\n",
2497       "can not be located.  NCBI blast will be used instead.\n\n";
2498
2499       $CTL_OPT{blast_type} = 'ncbi';
2500    }
2501
2502    if ($CTL_OPT{blast_type} =~ /^ncbi$/ &&
2503        ! -e $CTL_OPT{blastall} &&
2504        -e $CTL_OPT{blastn} &&
2505        -e $CTL_OPT{blastx} &&
2506        -e $CTL_OPT{tblastx}
2507       ) {
2508       warn "WARNING: blast_type is set to \'ncbi\' but ncbi executables\n",
2509       "can not be located.  WUBLAST blast will be used instead.\n\n";
2510
2511       $CTL_OPT{blast_type} = 'wublast';
2512    }
2513    
2514    #use standard value to refer to both NCBI and WUBLAST
2515    if ($CTL_OPT{blast_type} =~ /^wublast$/) {
2516       $CTL_OPT{_formater} = $CTL_OPT{xdformat};
2517       $CTL_OPT{_blastn} = $CTL_OPT{blastn};
2518       $CTL_OPT{_blastx} = $CTL_OPT{blastx};
2519       $CTL_OPT{_tblastx} = $CTL_OPT{tblastx};
2520    }
2521    elsif ($CTL_OPT{blast_type} =~ /^ncbi$/) {
2522       $CTL_OPT{_formater} = $CTL_OPT{formatdb};
2523       $CTL_OPT{_blastn} = $CTL_OPT{blastall};
2524       $CTL_OPT{_blastx} = $CTL_OPT{blastall};
2525       $CTL_OPT{_tblastx} = $CTL_OPT{blastall};
2526    }
2527
2528    #check organism type values
2529    if ($CTL_OPT{organism_type} !~ /^eukaryotic$|^prokaryotic$/) {
2530       $error .=  "ERROR: organism_type must be set to \'eukaryotic\' or \'prokaryotic\'.\n".
2531       "The value $CTL_OPT{organism_type} is invalid.\n\n";
2532
2533       #set the default just to assist in populating remaining errors
2534       $CTL_OPT{organism_type} = 'eukaryotic';
2535    }
2536    
2537    #--validate existence of required values from control files
2538
2539    #required files in here
2540    my @infiles;
2541
2542    #decide if required
2543    push (@infiles, '_blastn', '_formater') if($CTL_OPT{est});
2544    push (@infiles, '_blastn', '_formater') if($CTL_OPT{est_reads});
2545    push (@infiles, '_blastx', '_formater') if($CTL_OPT{protein});
2546    push (@infiles, '_blastx', '_formater') if($CTL_OPT{repeat_protein});
2547    push (@infiles, '_tblastx', '_formater') if($CTL_OPT{altest});
2548    push (@infiles, 'genome') if($CTL_OPT{genome});
2549    push (@infiles, 'genome') if(!$CTL_OPT{genome_gff} && !$main::eva);
2550    push (@infiles, 'est') if($CTL_OPT{est});
2551    push (@infiles, 'protein') if($CTL_OPT{protein});
2552    push (@infiles, 'altest') if($CTL_OPT{altest});
2553    push (@infiles, 'est_reads') if($CTL_OPT{est_reads});
2554    push (@infiles, 'probuild') if (grep (/genemark/, @{$CTL_OPT{_run}}));
2555    push (@infiles, 'fathom') if ($CTL_OPT{enable_fathom} && $CTL_OPT{evaluate});
2556    push (@infiles, 'est_gff') if($CTL_OPT{est_gff});
2557    push (@infiles, 'protein_gff') if($CTL_OPT{protein_gff});
2558    push (@infiles, 'genome_gff') if($CTL_OPT{genome_gff});
2559    push (@infiles, 'genome_gff') if($main::eva && ! $CTL_OPT{model_gff});
2560    push (@infiles, 'pred_gff') if($CTL_OPT{pred_gff});
2561    push (@infiles, 'pred_gff') if (grep (/pred_gff/, $CTL_OPT{predictor}));
2562    push (@infiles, 'model_gff') if ($CTL_OPT{model_gff});
2563    push (@infiles, 'model_gff') if ($main::eva && ! $CTL_OPT{genome_gff});
2564    push (@infiles, 'model_gff') if (grep (/model_gff/, $CTL_OPT{predictor}) &&
2565                                     (!$CTL_OPT{genome_gff} ||
2566                                      !$CTL_OPT{model_pass})
2567                                    );
2568    if($CTL_OPT{organism_type} eq 'eukaryotic'){
2569        push (@infiles, 'exonerate') if($CTL_OPT{est});
2570        push (@infiles, 'exonerate') if($CTL_OPT{protein});
2571        push (@infiles, 'repeat_protein') if ($CTL_OPT{repeat_protein});
2572        push (@infiles, 'RepeatMasker') if($CTL_OPT{rmlib});
2573        push (@infiles, 'RepeatMasker') if($CTL_OPT{model_org});
2574        push (@infiles, 'rm_gff') if($CTL_OPT{rm_gff});
2575        push (@infiles, 'rmlib') if ($CTL_OPT{rmlib});
2576        push (@infiles, 'snap') if (grep (/snap/, @{$CTL_OPT{_run}}));
2577        push (@infiles, 'gmhmme3') if (grep (/genemark/, @{$CTL_OPT{_run}}));
2578        push (@infiles, 'augustus') if (grep (/augustus/, @{$CTL_OPT{_run}}));
2579        push (@infiles, 'fgenesh') if (grep (/fgenesh/, @{$CTL_OPT{_run}}));
2580        push (@infiles, 'twinscan') if (grep (/twinscan/, @{$CTL_OPT{_run}}));
2581        push (@infiles, 'jigsaw') if (grep (/jigsaw/, @{$CTL_OPT{_run}}));
2582    }
2583    elsif($CTL_OPT{organism_type} eq 'prokaryotic'){
2584        push (@infiles, 'gmhmmp') if (grep (/genemark/, @{$CTL_OPT{_run}}));
2585    }
2586
2587    #uniq the array
2588    %uniq = ();
2589    foreach my $in (@infiles) {
2590       $uniq{$in}++;
2591    }
2592    @infiles = keys %uniq;
2593
2594    #verify existence of required values
2595    foreach my $in (@infiles) {
2596       if (not $CTL_OPT{$in}) {
2597          $error .= "ERROR: You have failed to provide a value for \'$in\' in the control files.\n\n";
2598          next;
2599       }
2600       elsif (not -e $CTL_OPT{$in}) {
2601          $error .= "ERROR: The \'$in\' file $CTL_OPT{$in} does not exist.\n".
2602          "Please check your control files: maker_opts.ctl, maker_bopts, or maker_exe.ctl.\n\n";
2603          next;
2604       }
2605      
2606       #set the absolute path to the file to reduce ambiguity
2607       #this breaks blast which requires symbolic links
2608       $CTL_OPT{$in} = Cwd::abs_path($CTL_OPT{$in}) unless ($in =~ /^_blastn$|^_blastx$|^_tblastx$/);
2609    }
2610                                
2611    #--error check that values are meaningful
2612    if (grep (/^augustus$/, @infiles) && not $CTL_OPT{augustus_species}) {
2613       warn "WARNING: There is no species specified for Augustus in maker_opts.ctl augustus_species.\n".
2614       "As a result the default (fly) will be used.\n\n";
2615       $CTL_OPT{augustus_species} = "fly";
2616    }
2617    if (grep (/^augustus$/, @infiles) &&
2618        (! $ENV{AUGUSTUS_CONFIG_PATH} || ! -e "$ENV{AUGUSTUS_CONFIG_PATH}/extrinsic/extrinsic.MPE.cfg")
2619       ) {
2620       $error .= "ERROR: The environmental variable AUGUSTUS_CONFIG_PATH has not been set\n".
2621       "or is not set correctly Please set this in your profile per Augustus\n".
2622       "installation instructions then try again.\n\n";
2623    }
2624    if (grep (/^snaps$|^fathom$/, @infiles) && not $CTL_OPT{snaphmm}) {
2625       warn "WARNING: There is no model specified for for Snap/Fathom in maker_opts.ctl snaphmm.\n".
2626       "As a result, the default (fly) will be used.\n\n";
2627       $CTL_OPT{snaphmm} = "fly";
2628    }
2629    if (grep (/^snap$|^fathom$/, @infiles) && ! -e $CTL_OPT{snaphmm} &&
2630        (! exists $ENV{ZOE} || ! -e $ENV{ZOE}."/HMM/".$CTL_OPT{snaphmm})
2631       ) {
2632       $error .= "ERROR: The snaphmm specified for Snap/Fathom in maker_opts.ctl does not exist.\n\n";
2633    }
2634    if (grep (/^gmhmme3$/, @infiles) && not $CTL_OPT{gmhmm}) {
2635       $ error .=  "ERROR: There is no model specified for for GeneMark in maker_opts.ctl gmhmm.\n\n";
2636    }
2637    elsif (grep (/^gmhmme3$/, @infiles) && ! -e $CTL_OPT{gmhmm}) {
2638       $error .= "ERROR: The gmhmm specified for GeneMark in maker_opts.ctl does not exist.\n\n";
2639    }
2640    if (grep (/^fgenesh$/, @infiles)) {
2641       if (! $CTL_OPT{fgenesh_par_file}) {
2642          $error .= "ERROR: There is no parameter file secified for for FgenesH in\n".
2643          "maker_opts.ctl fgenesh_par_file.\n\n";
2644       }
2645       elsif (! -e $CTL_OPT{fgenesh_par_file}) {
2646          $error .= "ERROR: The parameter file specified for fgenesh in maker_opts.ctl does not exist.\n\n";
2647       }
2648    }
2649    if ($CTL_OPT{max_dna_len} < 50000) {
2650       warn "WARNING: \'max_dna_len\' is set too low.  The minimum value permited is 50,000.\n".
2651       "max_dna_len will be reset to 50,000\n\n";
2652       $CTL_OPT{max_dna_len} = 50000;
2653    }
2654    if ($CTL_OPT{split_hit} > 0 && $CTL_OPT{organism_type} eq 'prokaryotic') {
2655       warn "WARNING: \'split_hit\' is meaningless for prokaryotic genomes and will be set to 0.\n\n";
2656       $CTL_OPT{split_hit} = 0;
2657    }
2658    if ($CTL_OPT{single_exon} == 0 && $CTL_OPT{organism_type} eq 'prokaryotic') {
2659       warn "WARNING: \'single_exon\' is required for prokaryotic genomes and will be set to 1.\n\n";
2660       $CTL_OPT{single_exon} = 1;
2661    }
2662    if ($CTL_OPT{min_contig} <= 0) {
2663       warn "WARNING: \'min_contig\' must be set to 1 or higher.\n".
2664       "min_contig will be reset to 1\n\n";
2665       $CTL_OPT{min_contig} = 1;
2666    }
2667    if ($CTL_OPT{min_contig} < 0) {
2668       warn "WARNING: \'min_protein\' must be set to 0 or higher.\n".
2669       "min_protein will be reset to 0\n\n";
2670       $CTL_OPT{min_protein} = 0;
2671    }
2672    if ($CTL_OPT{AED_threshold} < 0 || $CTL_OPT{AED_threshold} > 1) {
2673       warn "WARNING: \'AED_threshold\' must be set to a value betweeb 0 and 1.\n".
2674       "AED_threshold will be reset to 1\n\n";
2675       $CTL_OPT{AED_threshold} = 1;
2676    }
2677    if ($CTL_OPT{retry} < 0) {
2678       warn "WARNING: \'retry\' must be set to 0 or greater.\n".
2679            "It will now be set to 0\n\n";
2680       $CTL_OPT{retry} = 0;
2681    }
2682    if($CTL_OPT{TMP} && ! -d $CTL_OPT{TMP}){
2683        $error .= "The TMP value \'$CTL_OPT{TMP}\' is not a directory or does not exist.\n\n";
2684    }
2685    if($main::eva && $CTL_OPT{genome_gff} && $CTL_OPT{model_gff}){ #only for evaluator
2686        $error .= "You can only specify a GFF3 file for genome_gff or model_gff no both!!\n\n";
2687    }
2688    
2689    die $error if ($error);   
2690
2691    #--check genome fasta file
2692    my $fasta_gff = ($CTL_OPT{genome_gff}) ? $CTL_OPT{genome_gff} : $CTL_OPT{model_gff};
2693    my $iterator = new Iterator::Any( -fasta => $CTL_OPT{genome},
2694                                      -gff => $fasta_gff
2695                                    );
2696
2697    if ($iterator->number_of_entries() == 0) {
2698       my $genome = (! $CTL_OPT{genome}) ? $fasta_gff : $CTL_OPT{genome};
2699       die "ERROR:  The file $genome contains no fasta entries\n\n";
2700    }
2701
2702    #--decide whether to force datastore
2703    if ($iterator->number_of_entries() > 1000 && ! $CTL_OPT{datastore}) {
2704       warn "WARNING:  There are more than 1000 fasta entries in the input file.\n".
2705       "A two depth datastore will be used to avoid overloading the data structure of\n".
2706       "the output directory.\n\n";
2707      
2708       $CTL_OPT{datastore} = 1;
2709    }
2710
2711    #--decide if gff database should be created
2712    my @gffs = grep(/\_gff$/, @{[keys %CTL_OPT]});
2713    foreach my $key (@gffs) {
2714       if ($CTL_OPT{$key}) {
2715          $CTL_OPT{go_gffdb} = 1;
2716          last;
2717       }
2718    }
2719
2720    #--check validity of the alternate peptide
2721    $CTL_OPT{alt_peptide} = uc($CTL_OPT{alt_peptide});
2722    if ($CTL_OPT{alt_peptide} !~ /^[ABCDEFGHIKLMNPQRSTVWXYZ]$/) {
2723       warn "WARNING: Invalid alternate peptide \'$CTL_OPT{alt_peptide}\'.\n",
2724       "This will be set to the default 'C'.\n\n";
2725       $CTL_OPT{alt_peptide} = 'C';
2726    }
2727
2728    #--set values for datastructure
2729    my $genome = $CTL_OPT{genome};
2730    $genome = $CTL_OPT{genome_gff} if (not $genome);
2731    ($CTL_OPT{out_name}) = $genome =~ /([^\/]+)$/;
2732    $CTL_OPT{out_name} =~ s/\.[^\.]+$//;
2733    $CTL_OPT{CWD} = Cwd::cwd();
2734    if(! $CTL_OPT{out_base}){
2735       $CTL_OPT{out_base} = $CTL_OPT{CWD}."/$CTL_OPT{out_name}.maker.output";
2736    }
2737    mkdir($CTL_OPT{out_base}) if(! -d $CTL_OPT{out_base});
2738    die "ERROR: Could not build output directory $CTL_OPT{out_base}\n"
2739         if(! -d $CTL_OPT{out_base});
2740
2741    #--check if another instance of maker is running, if not lock the directory
2742    my $check;
2743    if(-e $CTL_OPT{out_base}."/.maker_lock.NFSLock"){
2744        warn "WARNING: Lock found, maker may already be running.\n".
2745            "Checking for other instance.\n\n";
2746        $check = 1;
2747    }
2748
2749    #lock must be global or it is detroyed outside of block
2750    if($LOCK = new File::NFSLock($CTL_OPT{out_base}."/.maker_lock", 'EX', 40, 40)){
2751        warn "Everything seems ok!!\n\n" if($check);
2752
2753        $LOCK->maintain(30);
2754    }
2755    else{
2756        die "ERROR: Another instance of maker is already running for this dataset.\n\n";
2757    }
2758    
2759    #--set up optional global TMP (If TMP is not accessible from other nodes
2760    #)
2761    if($CTL_OPT{TMP}){
2762        $CTL_OPT{_TMP} = tempdir("maker_XXXXXX", CLEANUP => 1, DIR => $CTL_OPT{TMP});
2763    }
2764    else{
2765        $CTL_OPT{_TMP} = $TMP;
2766    }
2767
2768    set_global_temp($CTL_OPT{_TMP});
2769
2770    #---set up blast databases and indexes for analyisis
2771    create_blastdb(\%CTL_OPT, $mpi_size);
2772    build_all_indexes(\%CTL_OPT);
2773
2774    #--log the control files
2775    generate_control_files($CTL_OPT{out_base}, %CTL_OPT);
2776
2777    return %CTL_OPT;
2778 }
2779 #-----------------------------------------------------------------------------
2780 #this function generates generic control files
2781 sub generate_control_files {
2782    my $dir = shift || Cwd::cwd();
2783    my %O = (@_) ? @_ : set_defaults();
2784    my $ev = 1 if($main::eva);
2785    my $log = 1 if(@_);
2786
2787    #--build opts.ctl file
2788    open (OUT, "> $dir/eval_opts.ctl") if($ev);
2789    open (OUT, "> $dir/maker_opts.log") if(!$ev && $log);
2790    open (OUT, "> $dir/maker_opts.ctl") if(!$ev && !$log);
2791    print OUT "#-----Genome (Required for De-Novo Annotation)\n" if(!$ev);
2792    print OUT "#-----Genome (Required if not internal to GFF3 file)\n" if($ev);
2793    print OUT "genome:$O{genome} #genome sequence file in fasta format\n";
2794    print OUT "\n";
2795    print OUT "#-----Re-annotation Options (Only Maker derived GFF3)\n" if(!$ev);
2796    print OUT "#-----Maker Derived GFF3 Annotations to Evaluate (genome fasta is internal to GFF3)\n" if($ev);
2797    print OUT "genome_gff:$O{genome_gff} #re-annotate genome based on this gff3 file\n" if(!$ev);
2798    print OUT "genome_gff:$O{genome_gff} #Maker derived gff3 file\n" if($ev);
2799    print OUT "est_pass:$O{est_pass} #use ests in genome_gff: 1 = yes, 0 = no\n";
2800    print OUT "altest_pass:$O{altest_pass} #use alternate organism ests in genome_gff: 1 = yes, 0 = no\n";
2801    print OUT "protein_pass:$O{protein_pass} #use proteins in genome_gff: 1 = yes, 0 = no\n";
2802    print OUT "rm_pass:$O{rm_pass} #use repeats in genome_gff: 1 = yes, 0 = no\n";
2803    print OUT "model_pass:$O{model_pass} #use gene models in genome_gff: 1 = yes, 0 = no\n" if(!$ev);
2804    print OUT "pred_pass:$O{pred_pass} #use ab-initio predictions in genome_gff: 1 = yes, 0 = no\n";
2805    print OUT "other_pass:$O{other_pass} #passthrough everything else in genome_gff: 1 = yes, 0 = no\n" if(!$ev);
2806    print OUT "\n";
2807    print OUT "#-----External GFF3 Annotations to Evaluate\n" if($ev);
2808    print OUT "model_gff:$O{model_gff} #gene models from an external gff3 file\n" if($ev);
2809    print OUT "\n"if($ev);
2810    print OUT "#-----EST Evidence (you should provide a value for at least one)\n";
2811    print OUT "est:$O{est} #non-redundant set of assembled ESTs in fasta format (classic EST analysis)\n";
2812    print OUT "est_reads:$O{est_reads} #unassembled nextgen mRNASeq in fasta format (not fully implemented)\n";
2813    print OUT "altest:$O{altest} #EST/cDNA sequence file in fasta format from an alternate organism\n";
2814    print OUT "est_gff:$O{est_gff} #EST evidence from an external gff3 file\n";
2815    print OUT "altest_gff:$O{altest_gff} #Alternate organism EST evidence from a seperate gff3 file\n";
2816    print OUT "\n";
2817    print OUT "#-----Protein Homology Evidence (you should provide a value for at least one)\n";
2818    print OUT "protein:$O{protein}  #protein sequence file in fasta format\n";
2819    print OUT "protein_gff:$O{protein_gff}  #protein homology evidence from an external gff3 file\n";
2820    print OUT "\n";
2821    print OUT "#-----Repeat Masking (leave values blank to skip)\n";
2822    print OUT "model_org:$O{model_org} #model organism for RepBase masking in RepeatMasker\n";
2823    print OUT "repeat_protein:$O{repeat_protein} #a database of transposable element proteins in fasta format\n";
2824    print OUT "rmlib:$O{rmlib} #an organism specific repeat library in fasta format\n";
2825    print OUT "rm_gff:$O{rm_gff} #repeat elements from an external gff3 file\n";
2826    print OUT "\n";
2827    print OUT "#-----Gene Prediction Options\n" if(!$ev);
2828    print OUT "#-----Evaluator Ab-Initio Comparison Options\n" if($ev);
2829    print OUT "organism_type:$O{organism_type} #eukaryotic or prokaryotic. Default is eukaryotic\n";
2830    print OUT "run:$O{run} #ab-initio methods to run (seperate multiple values by ',')\n" if($ev);
2831    print OUT "predictor:$O{predictor} #prediction methods for annotations (seperate multiple values by ',')\n" if(!$ev);
2832    print OUT "unmask:$O{unmask} #Also run ab-initio methods on unmasked sequence, 1 = yes, 0 = no\n";
2833    print OUT "snaphmm:$O{snaphmm} #SNAP HMM model\n";
2834    print OUT "gmhmm:$O{gmhmm} #GeneMark HMM model\n";
2835    print OUT "augustus_species:$O{augustus_species} #Augustus gene prediction model\n";
2836    print OUT "fgenesh_par_file:$O{fgenesh_par_file} #Fgenesh parameter file\n";
2837    print OUT "model_gff:$O{model_gff} #gene models from an external gff3 file (annotation pass-through)\n" if(!$ev);
2838    print OUT "pred_gff:$O{pred_gff} #ab-initio predictions from an external gff3 file\n";
2839    print OUT "\n";
2840    print OUT "#-----Other Annotation Type Options (features maker doesn't recognize)\n" if(!$ev);
2841    print OUT "other_gff:$O{other_gff} #features to pass-through to final output from an extenal gff3 file\n" if(!$ev);
2842    print OUT "\n" if(!$ev);
2843    print OUT "#-----External Application Specific Options\n";
2844    print OUT "alt_peptide:$O{alt_peptide} #amino acid used to replace non standard amino acids in blast databases\n";
2845    print OUT "cpus:$O{cpus} #max number of cpus to use in BLAST and RepeatMasker\n";
2846    print OUT "\n";
2847    print OUT "#-----Maker Specific Options\n";
2848    print OUT "evaluate:$O{evaluate} #run Evaluator on all annotations, 1 = yes, 0 = no\n" if(!$ev);
2849    print OUT "max_dna_len:$O{max_dna_len} #length for dividing up contigs into chunks (larger values increase memory usage)\n";
2850    print OUT "min_contig:$O{min_contig} #all contigs from the input genome file below this size will be skipped\n";
2851    print OUT "min_protein:$O{min_protein} #all gene annotations must produce a protein of at least this many amino acids in length\n" if(!$ev);
2852    print OUT "AED_threshold:$O{AED_threshold} #Maximum Annotation Edit Distance allowed for annotations (bound by 0 and 1)\n" if(!$ev);
2853    print OUT "softmask:$O{softmask} #use soft-masked rather than hard-masked seg filtering for wublast\n";
2854    print OUT "split_hit:$O{split_hit} #length for the splitting of hits (expected max intron size for evidence alignments)\n";
2855    print OUT "pred_flank:$O{pred_flank} #length of sequence surrounding EST and protein evidence used to extend gene predictions\n";
2856    print OUT "single_exon:$O{single_exon} #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no\n";
2857    print OUT "single_length:$O{single_length} #min length required for single exon ESTs if \'single_exon\ is enabled'\n";
2858    print OUT "keep_preds:$O{keep_preds} #Add non-overlapping ab-inito gene prediction to final annotation set, 1 = yes, 0 = no\n" if(!$ev);
2859    print OUT "map_forward:$O{map_forward} #try to map names and attributes forward from gff3 annotations, 1 = yes, 0 = no\n" if(!$ev);
2860    print OUT "retry:$O{retry} #number of times to retry a contig if there is a failure for some reason\n";
2861    print OUT "clean_try:$O{clean_try} #removeall data from previous run before retrying, 1 = yes, 0 = no\n";
2862    print OUT "clean_up:$O{clean_up} #removes theVoid directory with individual analysis files, 1 = yes, 0 = no\n";
2863    print OUT "TMP:$O{TMP} #specify a directory other than the system default temporary directory for temporary files\n";
2864    print OUT "\n";
2865    print OUT "#-----EVALUATOR Control Options\n";
2866    print OUT "side_thre:$O{side_thre}\n";
2867    print OUT "eva_window_size:$O{eva_window_size}\n";
2868    print OUT "eva_split_hit:$O{eva_split_hit}\n";
2869    print OUT "eva_hspmax:$O{eva_hspmax}\n";
2870    print OUT "eva_gspmax:$O{eva_gspmax}\n";
2871    print OUT "enable_fathom:$O{enable_fathom}\n";
2872    close (OUT);
2873    
2874    #--build bopts.ctl file
2875    open (OUT, "> $dir/eval_bopts.ctl") if($ev);
2876    open (OUT, "> $dir/maker_bopts.log") if(!$ev && $log);
2877    open (OUT, "> $dir/maker_bopts.ctl") if(!$ev && !$log);
2878    print OUT "#-----BLAST and Exonerate Statistics Thresholds\n";
2879    print OUT "blast_type:$O{blast_type} #set to 'wublast' or 'ncbi'\n";
2880    print OUT "\n";
2881    print OUT "pcov_blastn:$O{pcov_blastn} #Blastn Percent Coverage Threhold EST-Genome Alignments\n";
2882    print OUT "pid_blastn:$O{pid_blastn} #Blastn Percent Identity Threshold EST-Genome Aligments\n";
2883    print OUT "eval_blastn:$O{eval_blastn} #Blastn eval cutoff\n";
2884    print OUT "bit_blastn:$O{bit_blastn} #Blastn bit cutoff\n";
2885    print OUT "\n";
2886    print OUT "pcov_blastx:$O{pcov_blastx} #Blastx Percent Coverage Threhold Protein-Genome Alignments\n";
2887    print OUT "pid_blastx:$O{pid_blastx} #Blastx Percent Identity Threshold Protein-Genome Aligments\n";
2888    print OUT "eval_blastx:$O{eval_blastx} #Blastx eval cutoff\n";
2889    print OUT "bit_blastx:$O{bit_blastx} #Blastx bit cutoff\n";
2890    print OUT "\n";
2891    print OUT "pcov_rm_blastx:$O{pcov_rm_blastx} #Blastx Percent Coverage Threhold For Transposable Element Masking\n";
2892    print OUT "pid_rm_blastx:$O{pid_rm_blastx} #Blastx Percent Identity Threshold For Transposbale Element Masking\n";
2893    print OUT "eval_rm_blastx:$O{eval_rm_blastx} #Blastx eval cutoff for transposable element masking\n";
2894    print OUT "bit_rm_blastx:$O{bit_rm_blastx} #Blastx bit cutoff for transposable element masking\n";
2895    print OUT "\n";
2896    print OUT "pcov_tblastx:$O{pcov_tblastx} #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments\n";
2897    print OUT "pid_tblastx:$O{pid_tblastx} #tBlastx Percent Identity Threshold alt-EST-Genome Aligments\n";
2898    print OUT "eval_tblastx:$O{eval_tblastx} #tBlastx eval cutoff\n";
2899    print OUT "bit_tblastx:$O{bit_tblastx} #tBlastx bit cutoff\n";
2900    print OUT "\n";
2901    print OUT "eva_pcov_blastn:$O{eva_pcov_blastn} #Evaluator Blastn Percent Coverage Threshold EST-Genome Alignments\n";
2902    print OUT "eva_pid_blastn:$O{eva_pid_blastn} #Evaluator Blastn Percent Identity Threshold EST-Genome Alignments\n";
2903    print OUT "eva_eval_blastn:$O{eva_eval_blastn} #Evaluator Blastn eval cutoff\n";
2904    print OUT "eva_bit_blastn:$O{eva_bit_blastn} #Evaluator Blastn bit cutoff\n";
2905    print OUT "\n";
2906    print OUT "ep_score_limit:$O{ep_score_limit} #exonerate protein percent of maximal score threshold\n";
2907    print OUT "en_score_limit:$O{en_score_limit} #exonerate nucleotide percent of maximal score threshold\n";
2908    close(OUT);
2909    
2910    #--build maker_exe.ctl file
2911    open (OUT, "> $dir/eval_exe.ctl") if($ev);
2912    open (OUT, "> $dir/maker_exe.log") if(!$ev && $log);
2913    open (OUT, "> $dir/maker_exe.ctl") if(!$ev && !$log);
2914    print OUT "#-----Location of Executables Used by Maker/Evaluator\n";
2915    print OUT "formatdb:$O{formatdb} #location of NCBI formatdb executable\n";
2916    print OUT "blastall:$O{blastall} #location of NCBI blastall executable\n";
2917    print OUT "xdformat:$O{xdformat} #location of WUBLAST xdformat executable\n";
2918    print OUT "blastn:$O{blastn} #location of WUBLAST blastn executable\n";
2919    print OUT "blastx:$O{blastx} #location of WUBLAST blastx executable\n";
2920    print OUT "tblastx:$O{tblastx} #location of WUBLAST tblastx executable\n";
2921    print OUT "RepeatMasker:$O{RepeatMasker} #location of RepeatMasker executable\n";
2922    print OUT "exonerate:$O{exonerate} #location of exonerate executable\n";
2923    print OUT "\n";
2924    print OUT "#-----Ab-initio Gene Prediction Algorithms\n";
2925    print OUT "snap:$O{snap} #location of snap executable\n";
2926    print OUT "gmhmme3:$O{gmhmme3} #location of eukaryotic genemark executable\n";
2927    print OUT "gmhmmp:$O{gmhmmp} #location of prokaryotic genemark executable\n";
2928    print OUT "augustus:$O{augustus} #location of augustus executable\n";
2929    print OUT "fgenesh:$O{fgenesh} #location of fgenesh executable\n";
2930    print OUT "twinscan:$O{twinscan} #location of twinscan executable (not yet implemented)\n";
2931    print OUT "\n";
2932    print OUT "#-----Other Algorithms\n";
2933    print OUT "jigsaw:$O{jigsaw} #location of jigsaw executable (not yet implemented)\n";
2934    print OUT "qrna:$O{qrna} #location of qrna executable (not yet implemented)\n";
2935    print OUT "fathom:$O{fathom} #location of fathom executable (not yet implemented)\n";
2936    print OUT "probuild:$O{probuild} #location of probuild executable (required for genemark)\n";
2937    close(OUT);   
2938 }
2939
2940 1;
Note: See TracBrowser for help on using the browser.