| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
package GI; |
|---|
| 5 |
|
|---|
| 6 |
use strict; |
|---|
| 7 |
use vars qw(@ISA @EXPORT $VERSION $TMP $LOCK); |
|---|
| 8 |
use FindBin; |
|---|
| 9 |
use Exporter; |
|---|
| 10 |
use FileHandle; |
|---|
| 11 |
use File::Temp qw(tempfile tempdir); |
|---|
| 12 |
use Dumper::GFF::GFFV3; |
|---|
| 13 |
use Dumper::XML::Game; |
|---|
| 14 |
use URI::Escape; |
|---|
| 15 |
use File::Path; |
|---|
| 16 |
use File::Copy; |
|---|
| 17 |
use Data::Dumper; |
|---|
| 18 |
use Getopt::Long; |
|---|
| 19 |
use FileHandle; |
|---|
| 20 |
use PostData; |
|---|
| 21 |
use Cwd qw(cwd abs_path); |
|---|
| 22 |
use Fasta; |
|---|
| 23 |
use Iterator::Fasta; |
|---|
| 24 |
use FastaChunker; |
|---|
| 25 |
use Widget::RepeatMasker; |
|---|
| 26 |
use Widget::blastx; |
|---|
| 27 |
use Widget::tblastx; |
|---|
| 28 |
use Widget::blastn; |
|---|
| 29 |
use Widget::snap; |
|---|
| 30 |
use Widget::genemark; |
|---|
| 31 |
use Widget::fgenesh; |
|---|
| 32 |
use Widget::xdformat; |
|---|
| 33 |
use Widget::formatdb; |
|---|
| 34 |
use PhatHit_utils; |
|---|
| 35 |
use Shadower; |
|---|
| 36 |
use polisher::exonerate::protein; |
|---|
| 37 |
use polisher::exonerate::est; |
|---|
| 38 |
use maker::auto_annotator; |
|---|
| 39 |
use cluster; |
|---|
| 40 |
use repeat_mask_seq; |
|---|
| 41 |
use maker::sens_spec; |
|---|
| 42 |
use File::NFSLock; |
|---|
| 43 |
use FastaDB; |
|---|
| 44 |
use Digest::MD5; |
|---|
| 45 |
|
|---|
| 46 |
@ISA = qw( |
|---|
| 47 |
); |
|---|
| 48 |
|
|---|
| 49 |
$TMP = tempdir("maker_XXXXXX", CLEANUP => 1, TMPDIR => 1); |
|---|
| 50 |
|
|---|
| 51 |
|
|---|
| 52 |
|
|---|
| 53 |
sub set_global_temp { |
|---|
| 54 |
my $dir = shift; |
|---|
| 55 |
|
|---|
| 56 |
return if(! -d $dir); |
|---|
| 57 |
|
|---|
| 58 |
|
|---|
| 59 |
if($TMP ne $dir){ |
|---|
| 60 |
File::Path::rmtree($TMP); |
|---|
| 61 |
} |
|---|
| 62 |
|
|---|
| 63 |
$TMP = $dir; |
|---|
| 64 |
} |
|---|
| 65 |
|
|---|
| 66 |
sub get_preds_on_chunk { |
|---|
| 67 |
my $preds = shift; |
|---|
| 68 |
my $chunk = shift; |
|---|
| 69 |
|
|---|
| 70 |
my $c_start = $chunk->offset + 1; |
|---|
| 71 |
my $c_end = $chunk->offset + $chunk->length; |
|---|
| 72 |
|
|---|
| 73 |
my @keepers; |
|---|
| 74 |
foreach my $pred (@{$preds}) { |
|---|
| 75 |
my $s_start = $pred->start('query'); |
|---|
| 76 |
my $s_end = $pred->end('query'); |
|---|
| 77 |
|
|---|
| 78 |
($s_start, $s_end) = ($s_end, $s_start) if ($s_end < $s_start); |
|---|
| 79 |
|
|---|
| 80 |
if ($c_start <= $s_start && $s_start <= $c_end) { |
|---|
| 81 |
push (@keepers, $pred); |
|---|
| 82 |
} |
|---|
| 83 |
} |
|---|
| 84 |
|
|---|
| 85 |
return \@keepers; |
|---|
| 86 |
} |
|---|
| 87 |
|
|---|
| 88 |
sub merge_resolve_hits{ |
|---|
| 89 |
my $fasta = shift @_; |
|---|
| 90 |
my $fasta_index = shift @_; |
|---|
| 91 |
my $blast_keepers = shift @_; |
|---|
| 92 |
my $blast_holdovers = shift @_; |
|---|
| 93 |
my $the_void = shift @_; |
|---|
| 94 |
my %CTL_OPT = %{shift @_}; |
|---|
| 95 |
my $type = shift @_; |
|---|
| 96 |
my $LOG = shift @_; |
|---|
| 97 |
|
|---|
| 98 |
PhatHit_utils::merge_hits($blast_keepers, |
|---|
| 99 |
$blast_holdovers, |
|---|
| 100 |
$CTL_OPT{split_hit}, |
|---|
| 101 |
); |
|---|
| 102 |
@{$blast_holdovers} = (); |
|---|
| 103 |
|
|---|
| 104 |
$blast_keepers = reblast_merged_hits($fasta, |
|---|
| 105 |
$blast_keepers, |
|---|
| 106 |
$fasta_index, |
|---|
| 107 |
$the_void, |
|---|
| 108 |
$type, |
|---|
| 109 |
\%CTL_OPT, |
|---|
| 110 |
$LOG |
|---|
| 111 |
); |
|---|
| 112 |
|
|---|
| 113 |
return $blast_keepers; |
|---|
| 114 |
} |
|---|
| 115 |
|
|---|
| 116 |
sub reblast_merged_hits { |
|---|
| 117 |
my $g_fasta = shift @_; |
|---|
| 118 |
my $hits = shift @_; |
|---|
| 119 |
my $db_index = shift @_; |
|---|
| 120 |
my $the_void = shift @_; |
|---|
| 121 |
my $type = shift @_; |
|---|
| 122 |
my %CTL_OPT = %{shift @_}; |
|---|
| 123 |
my $LOG = shift @_; |
|---|
| 124 |
|
|---|
| 125 |
|
|---|
| 126 |
|
|---|
| 127 |
|
|---|
| 128 |
my $par_def = Fasta::getDef($$g_fasta); |
|---|
| 129 |
my $par_seq = Fasta::getSeqRef($$g_fasta); |
|---|
| 130 |
|
|---|
| 131 |
|
|---|
| 132 |
my ($p_id) = $par_def =~ /^([^\s\t\n]+)/; |
|---|
| 133 |
$p_id =~ s/^\>//g; #just in case |
|---|
| 134 |
$p_id =~ s/\|/_/g; |
|---|
| 135 |
|
|---|
| 136 |
|
|---|
| 137 |
my $p_safe_id = uri_escape($p_id, |
|---|
| 138 |
'\*\?\|\\\/\'\"\{\}\<\>\;\,\^\(\)\$\~\:' |
|---|
| 139 |
); |
|---|
| 140 |
|
|---|
| 141 |
my @blast_keepers; |
|---|
| 142 |
|
|---|
| 143 |
|
|---|
| 144 |
foreach my $hit (@{$hits}) { |
|---|
| 145 |
|
|---|
| 146 |
if (not $hit->{'_sequences_was_merged'}) { |
|---|
| 147 |
push (@blast_keepers, $hit); |
|---|
| 148 |
next; |
|---|
| 149 |
} |
|---|
| 150 |
|
|---|
| 151 |
|
|---|
| 152 |
|
|---|
| 153 |
|
|---|
| 154 |
my ($nB, $nE) = PhatHit_utils::get_span_of_hit($hit,'query'); |
|---|
| 155 |
my @coors = [$nB, $nE]; |
|---|
| 156 |
my $piece = Shadower::getPieces($par_seq, \@coors, $CTL_OPT{split_hit}); |
|---|
| 157 |
|
|---|
| 158 |
|
|---|
| 159 |
my $piece_def = $par_def." ".$piece->[0]->{b}." ".$piece->[0]->{e}; |
|---|
| 160 |
my $piece_seq = $piece->[0]->{piece}; |
|---|
| 161 |
my $offset = $piece->[0]->{b} - 1; |
|---|
| 162 |
|
|---|
| 163 |
|
|---|
| 164 |
my $chunk = new FastaChunk(); |
|---|
| 165 |
$chunk->seq($piece_seq); |
|---|
| 166 |
$chunk->def($piece_def); |
|---|
| 167 |
$chunk->parent_def($par_def); |
|---|
| 168 |
$chunk->size(length($$par_seq)); |
|---|
| 169 |
$chunk->length(length($chunk->seq())); |
|---|
| 170 |
$chunk->offset($offset); |
|---|
| 171 |
$chunk->number(0); |
|---|
| 172 |
$chunk->is_last(1); |
|---|
| 173 |
$chunk->parent_seq_length(length($$par_seq)); |
|---|
| 174 |
|
|---|
| 175 |
|
|---|
| 176 |
|
|---|
| 177 |
|
|---|
| 178 |
my $t_id = $hit->name(); |
|---|
| 179 |
$t_id =~ s/\|/_/g; |
|---|
| 180 |
|
|---|
| 181 |
|
|---|
| 182 |
my $t_safe_id = uri_escape($t_id, |
|---|
| 183 |
'\*\?\|\\\/\'\"\{\}\<\>\;\,\^\(\)\$\~\:' |
|---|
| 184 |
); |
|---|
| 185 |
|
|---|
| 186 |
my $fastaObj = $db_index->get_Seq_for_hit($hit); |
|---|
| 187 |
|
|---|
| 188 |
|
|---|
| 189 |
if (not $fastaObj) { |
|---|
| 190 |
print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; |
|---|
| 191 |
$db_index->reindex(); |
|---|
| 192 |
$fastaObj = $db_index->get_Seq_for_hit($hit); |
|---|
| 193 |
if (not $fastaObj) { |
|---|
| 194 |
print STDERR "stop here:".$hit->name."\n"; |
|---|
| 195 |
die "ERROR: Fasta index error\n"; |
|---|
| 196 |
} |
|---|
| 197 |
} |
|---|
| 198 |
|
|---|
| 199 |
|
|---|
| 200 |
my $t_seq = $fastaObj->seq(); |
|---|
| 201 |
my $t_def = $db_index->header_for_hit($hit); |
|---|
| 202 |
|
|---|
| 203 |
|
|---|
| 204 |
my $fasta = Fasta::toFasta('>'.$t_def, \$t_seq); |
|---|
| 205 |
my $t_file = $the_void."/".$t_safe_id.'.for_'.$type.'.fasta'; |
|---|
| 206 |
FastaFile::writeFile($fasta, $t_file); |
|---|
| 207 |
|
|---|
| 208 |
|
|---|
| 209 |
dbformat($CTL_OPT{_formater}, $t_file, $type); |
|---|
| 210 |
|
|---|
| 211 |
|
|---|
| 212 |
if ($type eq 'blastx') { |
|---|
| 213 |
|
|---|
| 214 |
print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet; |
|---|
| 215 |
my $keepers = blastx($chunk, |
|---|
| 216 |
$t_file, |
|---|
| 217 |
$the_void, |
|---|
| 218 |
$p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, |
|---|
| 219 |
\%CTL_OPT, |
|---|
| 220 |
$LOG |
|---|
| 221 |
); |
|---|
| 222 |
|
|---|
| 223 |
push(@blast_keepers, @{$keepers}); |
|---|
| 224 |
print STDERR "...finished\n" unless $main::quiet; |
|---|
| 225 |
} |
|---|
| 226 |
elsif ($type eq 'blastn') { |
|---|
| 227 |
|
|---|
| 228 |
print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet; |
|---|
| 229 |
my $keepers = blastn($chunk, |
|---|
| 230 |
$t_file, |
|---|
| 231 |
$the_void, |
|---|
| 232 |
$p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, |
|---|
| 233 |
\%CTL_OPT, |
|---|
| 234 |
$LOG |
|---|
| 235 |
); |
|---|
| 236 |
|
|---|
| 237 |
push(@blast_keepers, @{$keepers}); |
|---|
| 238 |
print STDERR "...finished\n" unless $main::quiet; |
|---|
| 239 |
} |
|---|
| 240 |
elsif ($type eq 'tblastx') { |
|---|
| 241 |
|
|---|
| 242 |
print STDERR "re-running blast against ".$hit->name."...\n" unless $main::quiet; |
|---|
| 243 |
my $keepers = tblastx($chunk, |
|---|
| 244 |
$t_file, |
|---|
| 245 |
$the_void, |
|---|
| 246 |
$p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, |
|---|
| 247 |
\%CTL_OPT, |
|---|
| 248 |
$LOG |
|---|
| 249 |
); |
|---|
| 250 |
|
|---|
| 251 |
push(@blast_keepers, @{$keepers}); |
|---|
| 252 |
print STDERR "...finished\n" unless $main::quiet; |
|---|
| 253 |
} |
|---|
| 254 |
else { |
|---|
| 255 |
die "ERROR: Invaliv type \'$type\' in maker::reblast_merged_hit\n"; |
|---|
| 256 |
} |
|---|
| 257 |
|
|---|
| 258 |
unlink <$t_file.x??>; |
|---|
| 259 |
} |
|---|
| 260 |
|
|---|
| 261 |
|
|---|
| 262 |
return (\@blast_keepers); |
|---|
| 263 |
} |
|---|
| 264 |
|
|---|
| 265 |
sub process_the_chunk_divide{ |
|---|
| 266 |
my $chunk = shift @_; |
|---|
| 267 |
my $split_hit = shift @_; |
|---|
| 268 |
my $s_flag = shift @_; |
|---|
| 269 |
my $groups_cfh = shift @_; |
|---|
| 270 |
|
|---|
| 271 |
my $phat_hits; |
|---|
| 272 |
|
|---|
| 273 |
foreach my $group (@{$groups_cfh}) { |
|---|
| 274 |
push(@{$phat_hits}, @{$group}); |
|---|
| 275 |
} |
|---|
| 276 |
|
|---|
| 277 |
my $p_hits = []; |
|---|
| 278 |
my $m_hits = []; |
|---|
| 279 |
|
|---|
| 280 |
|
|---|
| 281 |
if($s_flag){ |
|---|
| 282 |
($p_hits, $m_hits) = PhatHit_utils::seperate_by_strand('query', $phat_hits, 1); |
|---|
| 283 |
} |
|---|
| 284 |
else{ |
|---|
| 285 |
$p_hits = $phat_hits; |
|---|
| 286 |
} |
|---|
| 287 |
|
|---|
| 288 |
my $p_coors = PhatHit_utils::to_begin_and_end_coors($p_hits, 'query'); |
|---|
| 289 |
my $m_coors = PhatHit_utils::to_begin_and_end_coors($m_hits, 'query'); |
|---|
| 290 |
|
|---|
| 291 |
foreach my $p_coor (@{$p_coors}) { |
|---|
| 292 |
$p_coor->[0] -= $chunk->offset(); |
|---|
| 293 |
$p_coor->[1] -= $chunk->offset(); |
|---|
| 294 |
|
|---|
| 295 |
$p_coor->[0] = $chunk->length if($p_coor->[0] > $chunk->length); |
|---|
| 296 |
$p_coor->[1] = $chunk->length if($p_coor->[1] > $chunk->length); |
|---|
| 297 |
|
|---|
| 298 |
$p_coor->[0] = 0 if($p_coor->[0] < 0); |
|---|
| 299 |
$p_coor->[1] = 0 if($p_coor->[1] < 0); |
|---|
| 300 |
} |
|---|
| 301 |
foreach my $m_coor (@{$m_coors}) { |
|---|
| 302 |
$m_coor->[0] -= $chunk->offset(); |
|---|
| 303 |
$m_coor->[1] -= $chunk->offset(); |
|---|
| 304 |
|
|---|
| 305 |
$m_coor->[0] = $chunk->length if($m_coor->[0] > $chunk->length); |
|---|
| 306 |
$m_coor->[1] = $chunk->length if($m_coor->[1] > $chunk->length); |
|---|
| 307 |
|
|---|
| 308 |
$m_coor->[0] = 0 if($m_coor->[0] < 0); |
|---|
| 309 |
$m_coor->[1] = 0 if($m_coor->[1] < 0); |
|---|
| 310 |
} |
|---|
| 311 |
|
|---|
| 312 |
my $p_pieces = Shadower::getPieces(\$chunk->seq, $p_coors, 10); |
|---|
| 313 |
$p_pieces = [sort {$b->{e} <=> $a->{e}} @{$p_pieces}]; |
|---|
| 314 |
my $m_pieces = Shadower::getPieces(\$chunk->seq, $m_coors, 10); |
|---|
| 315 |
$m_pieces = [sort {$b->{e} <=> $a->{e}} @{$m_pieces}]; |
|---|
| 316 |
|
|---|
| 317 |
my $cutoff = $chunk->length + $chunk->offset - $split_hit; |
|---|
| 318 |
my $p_cutoff = $chunk->length + $chunk->offset + 1; |
|---|
| 319 |
my $m_cutoff = $chunk->length + $chunk->offset + 1; |
|---|
| 320 |
|
|---|
| 321 |
my @keepers; |
|---|
| 322 |
my @holdovers; |
|---|
| 323 |
|
|---|
| 324 |
|
|---|
| 325 |
$cutoff = $chunk->length + $chunk->offset + 1 if($chunk->is_last); |
|---|
| 326 |
|
|---|
| 327 |
|
|---|
| 328 |
foreach my $p_piece (@{$p_pieces}) { |
|---|
| 329 |
if ($p_piece->{e} + $chunk->offset >= $cutoff) { |
|---|
| 330 |
$p_cutoff = $p_piece->{b} + $chunk->offset; |
|---|
| 331 |
} |
|---|
| 332 |
} |
|---|
| 333 |
foreach my $m_piece (@{$m_pieces}) { |
|---|
| 334 |
if ($m_piece->{e} + $chunk->offset >= $cutoff) { |
|---|
| 335 |
$m_cutoff = $m_piece->{b} + $chunk->offset; |
|---|
| 336 |
} |
|---|
| 337 |
} |
|---|
| 338 |
|
|---|
| 339 |
|
|---|
| 340 |
if ($p_cutoff <= 1 + $chunk->offset && |
|---|
| 341 |
$m_cutoff <= 1 + $chunk->offset) { |
|---|
| 342 |
foreach my $g (@{$groups_cfh}){ |
|---|
| 343 |
push (@holdovers, $g); |
|---|
| 344 |
push (@keepers, []); |
|---|
| 345 |
} |
|---|
| 346 |
return @keepers, @holdovers; |
|---|
| 347 |
} |
|---|
| 348 |
|
|---|
| 349 |
|
|---|
| 350 |
foreach my $group (@{$groups_cfh}) { |
|---|
| 351 |
my $group_keepers = []; |
|---|
| 352 |
my $group_holdovers = []; |
|---|
| 353 |
|
|---|
| 354 |
foreach my $hit (@{$group}) { |
|---|
| 355 |
my $b = $hit->nB('query'); |
|---|
| 356 |
my $e = $hit->nE('query'); |
|---|
| 357 |
my $strand = $hit->strand; |
|---|
| 358 |
|
|---|
| 359 |
|
|---|
| 360 |
$strand *= -1 if ($hit->{_exonerate_flipped}); |
|---|
| 361 |
|
|---|
| 362 |
|
|---|
| 363 |
$strand = 1 if (!$s_flag); |
|---|
| 364 |
|
|---|
| 365 |
($b, $e) = ($e, $b) if $b > $e; |
|---|
| 366 |
|
|---|
| 367 |
if (($strand eq '1' && $e < $p_cutoff && $p_cutoff > $chunk->offset +1) || |
|---|
| 368 |
($strand eq '-1' && $e < $m_cutoff && $m_cutoff > $chunk->offset +1) |
|---|
| 369 |
) { |
|---|
| 370 |
$hit->{_holdover} = 0; |
|---|
| 371 |
push(@{$group_keepers}, $hit); |
|---|
| 372 |
} |
|---|
| 373 |
else { |
|---|
| 374 |
$hit->{_holdover} = 1; |
|---|
| 375 |
push(@{$group_holdovers}, $hit); |
|---|
| 376 |
} |
|---|
| 377 |
} |
|---|
| 378 |
|
|---|
| 379 |
push(@keepers, $group_keepers); |
|---|
| 380 |
push(@holdovers, $group_holdovers); |
|---|
| 381 |
} |
|---|
| 382 |
|
|---|
| 383 |
|
|---|
| 384 |
return @keepers, @holdovers; |
|---|
| 385 |
} |
|---|
| 386 |
|
|---|
| 387 |
|
|---|
| 388 |
|
|---|
| 389 |
|
|---|
| 390 |
|
|---|
| 391 |
|
|---|
| 392 |
|
|---|
| 393 |
|
|---|
| 394 |
|
|---|
| 395 |
|
|---|
| 396 |
|
|---|
| 397 |
|
|---|
| 398 |
|
|---|
| 399 |
|
|---|
| 400 |
|
|---|
| 401 |
|
|---|
| 402 |
|
|---|
| 403 |
|
|---|
| 404 |
|
|---|
| 405 |
|
|---|
| 406 |
|
|---|
| 407 |
sub maker_p_and_t_fastas { |
|---|
| 408 |
my $maker = shift @_; |
|---|
| 409 |
my $non_over = shift @_; |
|---|
| 410 |
my $abinit = shift @_; |
|---|
| 411 |
my $p_fastas = shift @_; |
|---|
| 412 |
my $t_fastas = shift @_; |
|---|
| 413 |
|
|---|
| 414 |
foreach my $an (@$maker) { |
|---|
| 415 |
foreach my $a (@{$an->{t_structs}}) { |
|---|
| 416 |
my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a); |
|---|
| 417 |
$p_fastas->{maker} .= $p_fasta; |
|---|
| 418 |
$t_fastas->{maker} .= $t_fasta; |
|---|
| 419 |
} |
|---|
| 420 |
} |
|---|
| 421 |
|
|---|
| 422 |
foreach my $an (@$non_over) { |
|---|
| 423 |
foreach my $a (@{$an->{t_structs}}) { |
|---|
| 424 |
my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a); |
|---|
| 425 |
$p_fastas->{non_overlapping_ab_initio} .= $p_fasta; |
|---|
| 426 |
$t_fastas->{non_overlapping_ab_initio} .= $t_fasta; |
|---|
| 427 |
} |
|---|
| 428 |
} |
|---|
| 429 |
|
|---|
| 430 |
foreach my $an (@$abinit) { |
|---|
| 431 |
foreach my $a (@{$an->{t_structs}}) { |
|---|
| 432 |
my ($p_fasta, $t_fasta) = get_p_and_t_fastas($a); |
|---|
| 433 |
my $source = $a->{hit}->algorithm; |
|---|
| 434 |
$source =~ s/pred_gff\://; |
|---|
| 435 |
$p_fastas->{$source} .= $p_fasta; |
|---|
| 436 |
$t_fastas->{$source} .= $t_fasta; |
|---|
| 437 |
} |
|---|
| 438 |
} |
|---|
| 439 |
} |
|---|
| 440 |
|
|---|
| 441 |
|
|---|
| 442 |
sub get_p_and_t_fastas { |
|---|
| 443 |
my $t_struct = shift; |
|---|
| 444 |
|
|---|
| 445 |
my $t_seq = $t_struct->{t_seq}; |
|---|
| 446 |
my $p_seq = $t_struct->{p_seq}; |
|---|
| 447 |
my $t_off = $t_struct->{t_offset}; |
|---|
| 448 |
my $t_name = $t_struct->{t_name}; |
|---|
| 449 |
my $AED = $t_struct->{AED}; |
|---|
| 450 |
my $QI = $t_struct->{t_qi}; |
|---|
| 451 |
|
|---|
| 452 |
my $p_def = '>'.$t_name.' protein AED:'.$AED.' QI:'.$QI; |
|---|
| 453 |
my $t_def = '>'.$t_name.' transcript offset:'.$t_off.' AED:'.$AED.' QI:'.$QI; |
|---|
| 454 |
|
|---|
| 455 |
my $p_fasta = Fasta::toFasta($p_def, \$p_seq); |
|---|
| 456 |
my $t_fasta = Fasta::toFasta($t_def, \$t_seq); |
|---|
| 457 |
|
|---|
| 458 |
return($p_fasta, $t_fasta); |
|---|
| 459 |
} |
|---|
| 460 |
|
|---|
| 461 |
sub write_p_and_t_fastas{ |
|---|
| 462 |
my $p_fastas = shift @_; |
|---|
| 463 |
my $t_fastas = shift @_; |
|---|
| 464 |
my $safe_seq_id = shift @_; |
|---|
| 465 |
my $out_dir = shift @_; |
|---|
| 466 |
|
|---|
| 467 |
while( my $key = each %$p_fastas){ |
|---|
| 468 |
my $name = "$out_dir/$safe_seq_id.maker"; |
|---|
| 469 |
$name .= ".$key" unless($key eq 'maker'); |
|---|
| 470 |
$name .= "\.proteins.fasta"; |
|---|
| 471 |
|
|---|
| 472 |
FastaFile::writeFile(\$p_fastas->{$key}, |
|---|
| 473 |
$name, |
|---|
| 474 |
); |
|---|
| 475 |
} |
|---|
| 476 |
|
|---|
| 477 |
while( my $key = each %$t_fastas){ |
|---|
| 478 |
my $name = "$out_dir/$safe_seq_id.maker"; |
|---|
| 479 |
$name .= ".$key" unless($key eq 'maker'); |
|---|
| 480 |
$name .= "\.transcripts.fasta"; |
|---|
| 481 |
|
|---|
| 482 |
FastaFile::writeFile(\$t_fastas->{$key}, |
|---|
| 483 |
$name, |
|---|
| 484 |
); |
|---|
| 485 |
} |
|---|
| 486 |
} |
|---|
| 487 |
|
|---|
| 488 |
sub create_blastdb { |
|---|
| 489 |
my $CTL_OPT = shift @_; |
|---|
| 490 |
my $mpi_size = shift@_ || 1; |
|---|
| 491 |
|
|---|
| 492 |
|
|---|
| 493 |
File::Path::rmtree($CTL_OPT->{out_base}."/mpi_blastdb") if($CTL_OPT->{force}); |
|---|
| 494 |
|
|---|
| 495 |
($CTL_OPT->{_protein}, $CTL_OPT->{p_db}) = split_db($CTL_OPT, 'protein', $mpi_size); |
|---|
| 496 |
($CTL_OPT->{_est}, $CTL_OPT->{e_db}) = split_db($CTL_OPT, 'est', $mpi_size); |
|---|
| 497 |
($CTL_OPT->{_est_reads}, $CTL_OPT->{d_db}) = split_db($CTL_OPT, 'est_reads', $mpi_size); |
|---|
| 498 |
($CTL_OPT->{_altest}, $CTL_OPT->{a_db}) = split_db($CTL_OPT, 'altest', $mpi_size); |
|---|
| 499 |
($CTL_OPT->{_repeat_protein}, $CTL_OPT->{r_db}) = split_db($CTL_OPT, 'repeat_protein', $mpi_size); |
|---|
| 500 |
} |
|---|
| 501 |
|
|---|
| 502 |
sub concatenate_files { |
|---|
| 503 |
my $infiles = shift; |
|---|
| 504 |
my $outfile = shift; |
|---|
| 505 |
|
|---|
| 506 |
my ($tFH, $t_file) = tempfile(DIR => $TMP); |
|---|
| 507 |
foreach my $file (@$infiles){ |
|---|
| 508 |
open(my $IN, "< $file"); |
|---|
| 509 |
while(defined(my $line = <$IN>)){ |
|---|
| 510 |
print $tFH $line; |
|---|
| 511 |
} |
|---|
| 512 |
close($IN); |
|---|
| 513 |
} |
|---|
| 514 |
close($tFH); |
|---|
| 515 |
File::Copy::move($t_file, $outfile); |
|---|
| 516 |
} |
|---|
| 517 |
|
|---|
| 518 |
sub split_db { |
|---|
| 519 |
my $CTL_OPT = shift @_; |
|---|
| 520 |
my $key = shift @_; |
|---|
| 521 |
my $mpi_size = shift @_ || 1; |
|---|
| 522 |
|
|---|
| 523 |
|
|---|
| 524 |
$mpi_size = 10 if($mpi_size < 10); |
|---|
| 525 |
|
|---|
| 526 |
my $file = $CTL_OPT->{$key}; |
|---|
| 527 |
my $alt = $CTL_OPT->{alt_peptide} if($key =~ /protein/); |
|---|
| 528 |
|
|---|
| 529 |
return ('', []) if (not $file); |
|---|
| 530 |
|
|---|
| 531 |
|
|---|
| 532 |
my $fasta_iterator = new Iterator::Fasta($file); |
|---|
| 533 |
my $db_size = $fasta_iterator->number_of_entries(); |
|---|
| 534 |
my $bins = $mpi_size; |
|---|
| 535 |
$bins = $db_size if ($db_size < $bins); |
|---|
| 536 |
|
|---|
| 537 |
my ($f_name) = $file =~ /([^\/]+)$/; |
|---|
| 538 |
$f_name =~ s/\.fasta$//; |
|---|
| 539 |
|
|---|
| 540 |
my $d_name = "$f_name\.mpi\.$mpi_size"; |
|---|
| 541 |
my $b_dir = $CTL_OPT->{out_base}."/mpi_blastdb"; |
|---|
| 542 |
my $f_dir = "$b_dir/$d_name"; |
|---|
| 543 |
my $t_dir = $TMP."/$d_name"; |
|---|
| 544 |
|
|---|
| 545 |
|
|---|
| 546 |
|
|---|
| 547 |
|
|---|
| 548 |
|
|---|
| 549 |
|
|---|
| 550 |
|
|---|
| 551 |
|
|---|
| 552 |
|
|---|
| 553 |
|
|---|
| 554 |
|
|---|
| 555 |
|
|---|
| 556 |
|
|---|
| 557 |
if(-e "$f_dir"){ |
|---|
| 558 |
my @t_db = <$f_dir/*$d_name*\.fasta>; |
|---|
| 559 |
|
|---|
| 560 |
my @existing_db; |
|---|
| 561 |
foreach my $f (@t_db) { |
|---|
| 562 |
push (@existing_db, $f) if (! -d $f); |
|---|
| 563 |
} |
|---|
| 564 |
|
|---|
| 565 |
if(@existing_db == $bins){ |
|---|
| 566 |
|
|---|
| 567 |
|
|---|
| 568 |
|
|---|
| 569 |
|
|---|
| 570 |
|
|---|
| 571 |
return $f_dir, \@existing_db; |
|---|
| 572 |
} |
|---|
| 573 |
else{ |
|---|
| 574 |
File::Path::rmtree($f_dir); |
|---|
| 575 |
} |
|---|
| 576 |
} |
|---|
| 577 |
|
|---|
| 578 |
|
|---|
| 579 |
|
|---|
| 580 |
mkdir($t_dir); |
|---|
| 581 |
mkdir($b_dir) unless (-e $b_dir); |
|---|
| 582 |
|
|---|
| 583 |
|
|---|
| 584 |
my @fhs; |
|---|
| 585 |
|
|---|
| 586 |
for (my $i = 0; $i < $bins; $i++) { |
|---|
| 587 |
my $name = "$t_dir/$d_name\.$i\.fasta"; |
|---|
| 588 |
my $fh; |
|---|
| 589 |
open ($fh, "> $name"); |
|---|
| 590 |
|
|---|
| 591 |
push (@fhs, $fh); |
|---|
| 592 |
} |
|---|
| 593 |
|
|---|
| 594 |
|
|---|
| 595 |
|
|---|
| 596 |
my %alias; |
|---|
| 597 |
|
|---|
| 598 |
|
|---|
| 599 |
|
|---|
| 600 |
my $wflag = 1; |
|---|
| 601 |
while (my $fasta = $fasta_iterator->nextEntry()) { |
|---|
| 602 |
my $def = Fasta::getDef(\$fasta); |
|---|
| 603 |
my $seq_id = Fasta::def2SeqID($def); |
|---|
| 604 |
my $seq_ref = Fasta::getSeqRef(\$fasta); |
|---|
| 605 |
|
|---|
| 606 |
|
|---|
| 607 |
if (defined $alt) { |
|---|
| 608 |
$$seq_ref =~ s/[\*\-]//g; |
|---|
| 609 |
$$seq_ref =~ s/[^abcdefghiklmnpqrstvwyzxABCDEFGHIKLMNPQRSTVWYZX\-\n]/$alt/g; |
|---|
| 610 |
} |
|---|
| 611 |
|
|---|
| 612 |
elsif($key !~ /protein/){ |
|---|
| 613 |
|
|---|
| 614 |
|
|---|
| 615 |
$$seq_ref =~ s/\-//g; |
|---|
| 616 |
$$seq_ref =~ s/X/N/g; |
|---|
| 617 |
die "ERROR: The nucleotide sequence file \'$file\'\n". |
|---|
| 618 |
"appears to contain protein sequence or unrecognized characters.\n". |
|---|
| 619 |
"Please check/fix the file before continuing.\n". |
|---|
| 620 |
"Invalid Character: $1\n\n" |
|---|
| 621 |
if($$seq_ref =~ /([^acgturykmswbdhvnxACGTURYKMSWBDHVNX\-\n])/); |
|---|
| 622 |
} |
|---|
| 623 |
|
|---|
| 624 |
|
|---|
| 625 |
next if($$seq_ref eq ''); |
|---|
| 626 |
|
|---|
| 627 |
|
|---|
| 628 |
if(length($seq_id) > 78){ |
|---|
| 629 |
warn "WARNING: The fasta file contains sequences with names longer\n". |
|---|
| 630 |
"than 78 characters. Long names get trimmed by BLAST, making\n". |
|---|
| 631 |
"it harder to identify the source of an alignmnet. You might\n". |
|---|
| 632 |
"want to reformat the fasta file with shorter IDs.\n". |
|---|
| 633 |
"File_name:$file\n\n" if($wflag-- > 0); |
|---|
| 634 |
|
|---|
| 635 |
my $new_id = uri_escape(Digest::MD5::md5_base64($seq_id), "^A-Za-z0-9\-\_"); |
|---|
| 636 |
die "ERROR: The id $seq_id is too long for BLAST, and I can'y uniquely fix it\n" |
|---|
| 637 |
if($alias{$new_id}); |
|---|
| 638 |
$alias{$new_id}++; |
|---|
| 639 |
$def =~ s/^(>\S+)/$1 MD5_alias=$new_id/; |
|---|
| 640 |
} |
|---|
| 641 |
|
|---|
| 642 |
|
|---|
| 643 |
my $fasta_ref = Fasta::toFastaRef($def, $seq_ref); |
|---|
| 644 |
|
|---|
| 645 |
|
|---|
| 646 |
|
|---|
| 647 |
|
|---|
| 648 |
|
|---|
| 649 |
|
|---|
| 650 |
my $fh = shift @fhs; |
|---|
| 651 |
print $fh $$fasta_ref; |
|---|
| 652 |
push (@fhs, $fh); |
|---|
| 653 |
|
|---|
| 654 |
} |
|---|
| 655 |
|
|---|
| 656 |
|
|---|
| 657 |
|
|---|
| 658 |
foreach my $fh (@fhs) { |
|---|
| 659 |
close ($fh); |
|---|
| 660 |
} |
|---|
| 661 |
|
|---|
| 662 |
|
|---|
| 663 |
|
|---|
| 664 |
|
|---|
| 665 |
system("mv $t_dir $f_dir"); |
|---|
| 666 |
|
|---|
| 667 |
|
|---|
| 668 |
|
|---|
| 669 |
|
|---|
| 670 |
|
|---|
| 671 |
|
|---|
| 672 |
if (-e $f_dir) { |
|---|
| 673 |
my @t_db = <$f_dir/*$d_name*\.fasta>; |
|---|
| 674 |
|
|---|
| 675 |
my @db_files; |
|---|
| 676 |
foreach my $f (@t_db) { |
|---|
| 677 |
push (@db_files, $f) if (! -d $f); |
|---|
| 678 |
} |
|---|
| 679 |
|
|---|
| 680 |
die "ERROR: SplitDB not created correctly\n\n" if(@db_files != $bins); |
|---|
| 681 |
|
|---|
| 682 |
|
|---|
| 683 |
return $f_dir, \@db_files; |
|---|
| 684 |
} |
|---|
| 685 |
else { |
|---|
| 686 |
die "ERROR: Could not split db\n"; |
|---|
| 687 |
} |
|---|
| 688 |
} |
|---|
| 689 |
|
|---|
| 690 |
|
|---|
| 691 |
|
|---|
| 692 |
|
|---|
| 693 |
|
|---|
| 694 |
|
|---|
| 695 |
|
|---|
| 696 |
|
|---|
| 697 |
|
|---|
| 698 |
|
|---|
| 699 |
|
|---|
| 700 |
|
|---|
| 701 |
|
|---|
| 702 |
|
|---|
| 703 |
|
|---|
| 704 |
|
|---|
| 705 |
|
|---|
| 706 |
|
|---|
| 707 |
|
|---|
| 708 |
|
|---|
| 709 |
|
|---|
| 710 |
|
|---|
| 711 |
|
|---|
| 712 |
|
|---|
| 713 |
|
|---|
| 714 |
|
|---|
| 715 |
|
|---|
| 716 |
|
|---|
| 717 |
|
|---|
| 718 |
|
|---|
| 719 |
|
|---|
| 720 |
|
|---|
| 721 |
|
|---|
| 722 |
|
|---|
| 723 |
|
|---|
| 724 |
|
|---|
| 725 |
|
|---|
| 726 |
|
|---|
| 727 |
|
|---|
| 728 |
|
|---|
| 729 |
|
|---|
| 730 |
|
|---|
| 731 |
|
|---|
| 732 |
|
|---|
| 733 |
|
|---|
| 734 |
|
|---|
| 735 |
|
|---|
| 736 |
|
|---|
| 737 |
sub flatten { |
|---|
| 738 |
my $clusters = shift; |
|---|
| 739 |
my $type = shift; |
|---|
| 740 |
my @hits; |
|---|
| 741 |
foreach my $c (@{$clusters}) { |
|---|
| 742 |
foreach my $hit (@{$c}) { |
|---|
| 743 |
$hit->type($type) if defined($type); |
|---|
| 744 |
push(@hits, $hit); |
|---|
| 745 |
} |
|---|
| 746 |
} |
|---|
| 747 |
return \@hits; |
|---|
| 748 |
} |
|---|
| 749 |
|
|---|
| 750 |
sub combine { |
|---|
| 751 |
my @bag; |
|---|
| 752 |
while (my $hits = shift(@_)) { |
|---|
| 753 |
foreach my $hit (@{$hits}) { |
|---|
| 754 |
push(@bag, $hit); |
|---|
| 755 |
} |
|---|
| 756 |
} |
|---|
| 757 |
return \@bag; |
|---|
| 758 |
} |
|---|
| 759 |
|
|---|
| 760 |
sub abinits { |
|---|
| 761 |
my @preds; |
|---|
| 762 |
|
|---|
| 763 |
push(@preds, @{snap(@_)}); |
|---|
| 764 |
push(@preds, @{augustus(@_)}); |
|---|
| 765 |
push(@preds, @{fgenesh(@_)}); |
|---|
| 766 |
push(@preds, @{genemark(@_)}); |
|---|
| 767 |
push(@preds, @{twinscan(@_)}); |
|---|
| 768 |
|
|---|
| 769 |
return \@preds; |
|---|
| 770 |
} |
|---|
| 771 |
|
|---|
| 772 |
sub snap { |
|---|
| 773 |
my $in_file = shift; |
|---|
| 774 |
my $the_void = shift; |
|---|
| 775 |
my $seq_id = shift; |
|---|
| 776 |
my $CTL_OPT = shift; |
|---|
| 777 |
my $LOG = shift; |
|---|
| 778 |
|
|---|
| 779 |
my $exe = $CTL_OPT->{snap}; |
|---|
| 780 |
my $snaphmm = $CTL_OPT->{snaphmm}; |
|---|
| 781 |
|
|---|
| 782 |
return [] if(! grep {/snap/} @{$CTL_OPT->{_run}}); |
|---|
| 783 |
|
|---|
| 784 |
my %params; |
|---|
| 785 |
my $out_file = "$the_void/$seq_id\.all\.snap"; |
|---|
| 786 |
|
|---|
| 787 |
$LOG->add_entry("STARTED", $out_file, ""); |
|---|
| 788 |
|
|---|
| 789 |
my $command = $exe; |
|---|
| 790 |
$command .= " $snaphmm"; |
|---|
| 791 |
$command .= " $in_file"; |
|---|
| 792 |
$command .= " > $out_file"; |
|---|
| 793 |
|
|---|
| 794 |
my $w = new Widget::snap(); |
|---|
| 795 |
|
|---|
| 796 |
if (-e $out_file) { |
|---|
| 797 |
print STDERR "re reading snap report.\n" unless $main::quiet; |
|---|
| 798 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 799 |
} |
|---|
| 800 |
else { |
|---|
| 801 |
print STDERR "running snap.\n" unless $main::quiet; |
|---|
| 802 |
$w->run($command); |
|---|
| 803 |
} |
|---|
| 804 |
|
|---|
| 805 |
$params{min_exon_score} = -100000; |
|---|
| 806 |
$params{min_gene_score} = -100000; |
|---|
| 807 |
|
|---|
| 808 |
my $keepers = Widget::snap::parse($out_file, |
|---|
| 809 |
\%params, |
|---|
| 810 |
$in_file, |
|---|
| 811 |
); |
|---|
| 812 |
|
|---|
| 813 |
$LOG->add_entry("FINISHED", $out_file, ""); |
|---|
| 814 |
|
|---|
| 815 |
return $keepers; |
|---|
| 816 |
} |
|---|
| 817 |
|
|---|
| 818 |
sub genemark { |
|---|
| 819 |
my $in_file = shift; |
|---|
| 820 |
my $the_void = shift; |
|---|
| 821 |
my $seq_id = shift; |
|---|
| 822 |
my $CTL_OPT = shift; |
|---|
| 823 |
my $LOG = shift; |
|---|
| 824 |
|
|---|
| 825 |
|
|---|
| 826 |
my $wrap = "$FindBin::Bin/../lib/Widget/genemark/gmhmm_wrap"; |
|---|
| 827 |
my $exe = $CTL_OPT->{organism_type} eq 'eukaryotic' ? $CTL_OPT->{gmhmme3} : $CTL_OPT->{gmhmmp}; |
|---|
| 828 |
my $pro = $CTL_OPT->{probuild}; |
|---|
| 829 |
my $hmm = $CTL_OPT->{gmhmm}; |
|---|
| 830 |
|
|---|
| 831 |
return [] if(! grep {/genemark/} @{$CTL_OPT->{_run}}); |
|---|
| 832 |
|
|---|
| 833 |
my %params; |
|---|
| 834 |
my $out_file = "$the_void/$seq_id\.all\.genemark"; |
|---|
| 835 |
|
|---|
| 836 |
$LOG->add_entry("STARTED", $out_file, ""); |
|---|
| 837 |
|
|---|
| 838 |
|
|---|
| 839 |
my $command = $wrap; |
|---|
| 840 |
$command .= " -m $hmm"; |
|---|
| 841 |
$command .= " -g $exe"; |
|---|
| 842 |
$command .= " -p $pro"; |
|---|
| 843 |
$command .= " -o $out_file"; |
|---|
| 844 |
|
|---|
| 845 |
$command .= " $in_file"; |
|---|
| 846 |
|
|---|
| 847 |
my $w = new Widget::genemark(); |
|---|
| 848 |
|
|---|
| 849 |
if (-e $out_file) { |
|---|
| 850 |
print STDERR "re reading genemark report.\n" unless $main::quiet; |
|---|
| 851 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 852 |
} |
|---|
| 853 |
else { |
|---|
| 854 |
print STDERR "running genemark.\n" unless $main::quiet; |
|---|
| 855 |
$w->run($command); |
|---|
| 856 |
} |
|---|
| 857 |
|
|---|
| 858 |
$params{min_exon_score} = -100000; |
|---|
| 859 |
$params{min_gene_score} = -100000; |
|---|
| 860 |
|
|---|
| 861 |
my $keepers = Widget::genemark::parse($out_file, |
|---|
| 862 |
\%params, |
|---|
| 863 |
$in_file, |
|---|
| 864 |
); |
|---|
| 865 |
|
|---|
| 866 |
$LOG->add_entry("FINISHED", $out_file, ""); |
|---|
| 867 |
|
|---|
| 868 |
return $keepers; |
|---|
| 869 |
} |
|---|
| 870 |
|
|---|
| 871 |
sub augustus { |
|---|
| 872 |
my $in_file = shift; |
|---|
| 873 |
my $the_void = shift; |
|---|
| 874 |
my $seq_id = shift; |
|---|
| 875 |
my $CTL_OPT = shift; |
|---|
| 876 |
my $LOG = shift; |
|---|
| 877 |
|
|---|
| 878 |
my $exe = $CTL_OPT->{augustus}; |
|---|
| 879 |
my $org = $CTL_OPT->{augustus_species}; |
|---|
| 880 |
|
|---|
| 881 |
return [] if(! grep {/augustus/} @{$CTL_OPT->{_run}}); |
|---|
| 882 |
|
|---|
| 883 |
my %params; |
|---|
| 884 |
my $out_file = "$the_void/$seq_id\.all\.augustus"; |
|---|
| 885 |
|
|---|
| 886 |
$LOG->add_entry("STARTED", $out_file, ""); |
|---|
| 887 |
|
|---|
| 888 |
my $command = $exe; |
|---|
| 889 |
$command .= " --species=$org"; |
|---|
| 890 |
$command .= " --UTR=off"; |
|---|
| 891 |
$command .= " $in_file"; |
|---|
| 892 |
$command .= " > $out_file"; |
|---|
| 893 |
|
|---|
| 894 |
my $w = new Widget::augustus(); |
|---|
| 895 |
|
|---|
| 896 |
if (-e $out_file) { |
|---|
| 897 |
print STDERR "re reading augustus report.\n" unless $main::quiet; |
|---|
| 898 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 899 |
} |
|---|
| 900 |
else { |
|---|
| 901 |
print STDERR "running augustus.\n" unless $main::quiet; |
|---|
| 902 |
$w->run($command); |
|---|
| 903 |
} |
|---|
| 904 |
|
|---|
| 905 |
$params{min_exon_score} = -100000; |
|---|
| 906 |
$params{min_gene_score} = -100000; |
|---|
| 907 |
|
|---|
| 908 |
my $chunk_keepers = Widget::augustus::parse($out_file, |
|---|
| 909 |
\%params, |
|---|
| 910 |
$in_file |
|---|
| 911 |
); |
|---|
| 912 |
|
|---|
| 913 |
$LOG->add_entry("FINISHED", $out_file, ""); |
|---|
| 914 |
|
|---|
| 915 |
return $chunk_keepers; |
|---|
| 916 |
} |
|---|
| 917 |
|
|---|
| 918 |
sub fgenesh { |
|---|
| 919 |
my $in_file = shift; |
|---|
| 920 |
my $the_void = shift; |
|---|
| 921 |
my $seq_id = shift; |
|---|
| 922 |
my $CTL_OPT = shift; |
|---|
| 923 |
my $LOG = shift; |
|---|
| 924 |
|
|---|
| 925 |
my $wrap = "$FindBin::Bin/../lib/Widget/fgenesh/fgenesh_wrap"; |
|---|
| 926 |
my $exe = $CTL_OPT->{fgenesh}; |
|---|
| 927 |
my $org = $CTL_OPT->{fgenesh_par_file}; |
|---|
| 928 |
|
|---|
| 929 |
return [] if(! grep {/fgenesh/} @{$CTL_OPT->{_run}}); |
|---|
| 930 |
|
|---|
| 931 |
my %params; |
|---|
| 932 |
my $out_file = "$the_void/$seq_id\.all\.fgenesh"; |
|---|
| 933 |
|
|---|
| 934 |
$LOG->add_entry("STARTED", $out_file, ""); |
|---|
| 935 |
|
|---|
| 936 |
my $command = "$wrap $exe"; |
|---|
| 937 |
|
|---|
| 938 |
$command .= " $org"; |
|---|
| 939 |
$command .= " $in_file"; |
|---|
| 940 |
$command .= " > $out_file"; |
|---|
| 941 |
|
|---|
| 942 |
my $w = new Widget::fgenesh(); |
|---|
| 943 |
|
|---|
| 944 |
if (-e $out_file) { |
|---|
| 945 |
print STDERR "re reading fgenesh report.\n" unless $main::quiet; |
|---|
| 946 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 947 |
} |
|---|
| 948 |
else { |
|---|
| 949 |
print STDERR "running fgenesh.\n" unless $main::quiet; |
|---|
| 950 |
$w->run($command); |
|---|
| 951 |
} |
|---|
| 952 |
|
|---|
| 953 |
$params{min_exon_score} = -100000; |
|---|
| 954 |
$params{min_gene_score} = -100000; |
|---|
| 955 |
|
|---|
| 956 |
my $chunk_keepers = Widget::fgenesh::parse($out_file, |
|---|
| 957 |
\%params, |
|---|
| 958 |
$in_file |
|---|
| 959 |
); |
|---|
| 960 |
|
|---|
| 961 |
$LOG->add_entry("FINISHED", $out_file, ""); |
|---|
| 962 |
|
|---|
| 963 |
return $chunk_keepers; |
|---|
| 964 |
} |
|---|
| 965 |
|
|---|
| 966 |
sub twinscan { |
|---|
| 967 |
my $in_file = shift; |
|---|
| 968 |
my $the_void = shift; |
|---|
| 969 |
my $seq_id = shift; |
|---|
| 970 |
my $CTL_OPT = shift; |
|---|
| 971 |
my $LOG = shift; |
|---|
| 972 |
|
|---|
| 973 |
my $exe = $CTL_OPT->{twinscan}; |
|---|
| 974 |
my $org = $CTL_OPT->{twinscan_species}; |
|---|
| 975 |
|
|---|
| 976 |
return [] if(! grep {/twinscan/} @{$CTL_OPT->{_run}}); |
|---|
| 977 |
|
|---|
| 978 |
my %params; |
|---|
| 979 |
my $out_file = "$the_void/$seq_id\.all\.twinscan"; |
|---|
| 980 |
|
|---|
| 981 |
$LOG->add_entry("STARTED", $out_file, ""); |
|---|
| 982 |
|
|---|
| 983 |
my $command = $exe; |
|---|
| 984 |
$command .= ' --species='."$org"; |
|---|
| 985 |
$command .= " $in_file"; |
|---|
| 986 |
$command .= " > $out_file"; |
|---|
| 987 |
|
|---|
| 988 |
my $w = new Widget::twinscan(); |
|---|
| 989 |
|
|---|
| 990 |
if (-e $out_file) { |
|---|
| 991 |
print STDERR "re reading twinscan report.\n" unless $main::quiet; |
|---|
| 992 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 993 |
} |
|---|
| 994 |
else { |
|---|
| 995 |
print STDERR "running twinscan.\n" unless $main::quiet; |
|---|
| 996 |
$w->run($command); |
|---|
| 997 |
} |
|---|
| 998 |
|
|---|
| 999 |
$params{min_exon_score} = -100000; |
|---|
| 1000 |
$params{min_gene_score} = -100000; |
|---|
| 1001 |
|
|---|
| 1002 |
my $chunk_keepers = Widget::twinscan::parse($out_file, |
|---|
| 1003 |
\%params, |
|---|
| 1004 |
$in_file |
|---|
| 1005 |
); |
|---|
| 1006 |
|
|---|
| 1007 |
$LOG->add_entry("FINISHED", $out_file, ""); |
|---|
| 1008 |
|
|---|
| 1009 |
return $chunk_keepers; |
|---|
| 1010 |
} |
|---|
| 1011 |
|
|---|
| 1012 |
|
|---|
| 1013 |
sub polish_exonerate { |
|---|
| 1014 |
my $g_fasta = shift; |
|---|
| 1015 |
my $phat_hits = shift; |
|---|
| 1016 |
my $db_index = shift; |
|---|
| 1017 |
my $the_void = shift; |
|---|
| 1018 |
my $type = shift; |
|---|
| 1019 |
my $exonerate = shift; |
|---|
| 1020 |
my $pcov = shift; |
|---|
| 1021 |
my $pid = shift; |
|---|
| 1022 |
my $score_limit = shift; |
|---|
| 1023 |
my $matrix = shift; |
|---|
| 1024 |
my $LOG = shift; |
|---|
| 1025 |
|
|---|
| 1026 |
my $def = Fasta::getDef($g_fasta); |
|---|
| 1027 |
my $seq = Fasta::getSeqRef($g_fasta); |
|---|
| 1028 |
|
|---|
| 1029 |
my $exe = $exonerate; |
|---|
| 1030 |
|
|---|
| 1031 |
my @exonerate_data; |
|---|
| 1032 |
|
|---|
| 1033 |
foreach my $hit (@{$phat_hits}) { |
|---|
| 1034 |
next if $hit->pAh < $pcov; |
|---|
| 1035 |
next if $hit->hsp('best')->frac_identical < $pid; |
|---|
| 1036 |
|
|---|
| 1037 |
my ($nB, $nE) = PhatHit_utils::get_span_of_hit($hit,'query'); |
|---|
| 1038 |
|
|---|
| 1039 |
my @coors = [$nB, $nE]; |
|---|
| 1040 |
my $p = Shadower::getPieces($seq, \@coors, 50); |
|---|
| 1041 |
my $p_def = $def." ".$p->[0]->{b}." ".$p->[0]->{e}; |
|---|
| 1042 |
my $p_fasta = Fasta::toFasta($p_def, \$p->[0]->{piece}); |
|---|
| 1043 |
my $name = Fasta::def2SeqID($p_def); |
|---|
| 1044 |
my $safe_name = Fasta::seqID2SafeID($name); |
|---|
| 1045 |
|
|---|
| 1046 |
my $d_file = $the_void."/".$safe_name.'.'.$p->[0]->{b}.'.'.$p->[0]->{e}.".fasta"; |
|---|
| 1047 |
|
|---|
| 1048 |
FastaFile::writeFile($p_fasta, $d_file); |
|---|
| 1049 |
|
|---|
| 1050 |
my $offset = $p->[0]->{b} - 1; |
|---|
| 1051 |
my $id = $hit->name(); |
|---|
| 1052 |
|
|---|
| 1053 |
my $fastaObj = $db_index->get_Seq_for_hit($hit); |
|---|
| 1054 |
|
|---|
| 1055 |
|
|---|
| 1056 |
if (not $fastaObj) { |
|---|
| 1057 |
print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; |
|---|
| 1058 |
$db_index->reindex(); |
|---|
| 1059 |
$fastaObj = $db_index->get_Seq_for_hit($hit); |
|---|
| 1060 |
|
|---|
| 1061 |
if (not $fastaObj) { |
|---|
| 1062 |
print STDERR "stop here:".$hit->name."\n"; |
|---|
| 1063 |
die "ERROR: Fasta index error\n"; |
|---|
| 1064 |
} |
|---|
| 1065 |
} |
|---|
| 1066 |
|
|---|
| 1067 |
my $seq = $fastaObj->seq(); |
|---|
| 1068 |
my $def = $db_index->header_for_hit($hit); |
|---|
| 1069 |
|
|---|
| 1070 |
my $fasta = Fasta::toFasta('>'.$def, \$seq); |
|---|
| 1071 |
|
|---|
| 1072 |
|
|---|
| 1073 |
my $safe_id = Fasta::seqID2SafeID($id); |
|---|
| 1074 |
|
|---|
| 1075 |
my $t_file = $the_void."/".$safe_id.'.fasta'; |
|---|
| 1076 |
FastaFile::writeFile($fasta, $t_file); |
|---|
| 1077 |
|
|---|
| 1078 |
my $exonerate_hits = to_polisher($d_file, |
|---|
| 1079 |
$t_file, |
|---|
| 1080 |
$the_void, |
|---|
| 1081 |
$offset, |
|---|
| 1082 |
$type, |
|---|
| 1083 |
$exe, |
|---|
| 1084 |
$score_limit, |
|---|
| 1085 |
$matrix, |
|---|
| 1086 |
$LOG |
|---|
| 1087 |
); |
|---|
| 1088 |
|
|---|
| 1089 |
foreach my $exonerate_hit (@{$exonerate_hits}) { |
|---|
| 1090 |
if (defined($exonerate_hit) && exonerate_okay($exonerate_hit)) { |
|---|
| 1091 |
|
|---|
| 1092 |
|
|---|
| 1093 |
$hit->{_exonerate_flipped} = 1 if($exonerate_hit->{_was_flipped}); |
|---|
| 1094 |
$hit->type("exonerate:$type"); |
|---|
| 1095 |
|
|---|
| 1096 |
push(@exonerate_data, $exonerate_hit); |
|---|
| 1097 |
} |
|---|
| 1098 |
} |
|---|
| 1099 |
} |
|---|
| 1100 |
|
|---|
| 1101 |
return \@exonerate_data; |
|---|
| 1102 |
} |
|---|
| 1103 |
|
|---|
| 1104 |
sub exonerate_okay { |
|---|
| 1105 |
my $hit = shift; |
|---|
| 1106 |
|
|---|
| 1107 |
my $i = 0; |
|---|
| 1108 |
foreach my $hsp ($hit->hsps()) { |
|---|
| 1109 |
return 0 unless defined($hsp->nB('query')); |
|---|
| 1110 |
return 0 unless defined($hsp->nE('query')); |
|---|
| 1111 |
return 0 unless defined($hsp->nB('hit')); |
|---|
| 1112 |
return 0 unless defined($hsp->nE('hit')); |
|---|
| 1113 |
return 0 unless defined($hsp->strand('query')); |
|---|
| 1114 |
return 0 unless defined($hsp->strand('query')); |
|---|
| 1115 |
return 0 unless defined($hsp->strand('hit')); |
|---|
| 1116 |
return 0 unless defined($hsp->strand('hit')); |
|---|
| 1117 |
|
|---|
| 1118 |
my $q_str = $hsp->query_string(); |
|---|
| 1119 |
my $h_str = $hsp->hit_string(); |
|---|
| 1120 |
|
|---|
| 1121 |
if ($h_str =~ /Target Intron/) { |
|---|
| 1122 |
print STDERR "BADDD EXONERATE!\n"; |
|---|
| 1123 |
sleep 4; |
|---|
| 1124 |
return 0; |
|---|
| 1125 |
} |
|---|
| 1126 |
elsif ($q_str =~ /Target Intron/) { |
|---|
| 1127 |
print STDERR "BADDD EXONERATE!\n"; |
|---|
| 1128 |
sleep 4; |
|---|
| 1129 |
return 0; |
|---|
| 1130 |
} |
|---|
| 1131 |
$i++; |
|---|
| 1132 |
} |
|---|
| 1133 |
|
|---|
| 1134 |
return 1; |
|---|
| 1135 |
} |
|---|
| 1136 |
|
|---|
| 1137 |
|
|---|
| 1138 |
sub to_polisher { |
|---|
| 1139 |
my $d_file = shift; |
|---|
| 1140 |
my $t_file = shift; |
|---|
| 1141 |
my $the_void = shift; |
|---|
| 1142 |
my $offset = shift; |
|---|
| 1143 |
my $type = shift; |
|---|
| 1144 |
my $exe = shift; |
|---|
| 1145 |
my $score_limit = shift; |
|---|
| 1146 |
my $matrix = shift; |
|---|
| 1147 |
my $LOG = shift; |
|---|
| 1148 |
|
|---|
| 1149 |
if ($type eq 'p') { |
|---|
| 1150 |
return polisher::exonerate::protein::polish($d_file, |
|---|
| 1151 |
$t_file, |
|---|
| 1152 |
$the_void, |
|---|
| 1153 |
$offset, |
|---|
| 1154 |
$exe, |
|---|
| 1155 |
$score_limit, |
|---|
| 1156 |
$matrix, |
|---|
| 1157 |
$LOG |
|---|
| 1158 |
); |
|---|
| 1159 |
} |
|---|
| 1160 |
elsif ($type eq 'e') { |
|---|
| 1161 |
return polisher::exonerate::est::polish($d_file, |
|---|
| 1162 |
$t_file, |
|---|
| 1163 |
$the_void, |
|---|
| 1164 |
$offset, |
|---|
| 1165 |
$exe, |
|---|
| 1166 |
$score_limit, |
|---|
| 1167 |
$matrix, |
|---|
| 1168 |
$LOG |
|---|
| 1169 |
); |
|---|
| 1170 |
} |
|---|
| 1171 |
else { |
|---|
| 1172 |
die "unknown type:$type in sub to_polisher.\n"; |
|---|
| 1173 |
} |
|---|
| 1174 |
} |
|---|
| 1175 |
|
|---|
| 1176 |
sub make_multi_fasta { |
|---|
| 1177 |
my $index = shift; |
|---|
| 1178 |
my $clusters = shift;; |
|---|
| 1179 |
my $fastas = ''; |
|---|
| 1180 |
foreach my $c (@{$clusters}) { |
|---|
| 1181 |
foreach my $hit (@{$c}) { |
|---|
| 1182 |
my $fastaObj = $index->get_Seq_for_hit($hit); |
|---|
| 1183 |
|
|---|
| 1184 |
|
|---|
| 1185 |
if (not $fastaObj) { |
|---|
| 1186 |
print STDERR "WARNING: Cannot find> ".$hit->name.", trying to re-index the fasta.\n"; |
|---|
| 1187 |
$index->reindex(); |
|---|
| 1188 |
$fastaObj = $index->get_Seq_for_hit($hit); |
|---|
| 1189 |
|
|---|
| 1190 |
if (not $fastaObj) { |
|---|
| 1191 |
print STDERR "stop here:".$hit->name."\n"; |
|---|
| 1192 |
die "ERROR: Fasta index error\n"; |
|---|
| 1193 |
} |
|---|
| 1194 |
} |
|---|
| 1195 |
|
|---|
| 1196 |
my $seq = $fastaObj->seq(); |
|---|
| 1197 |
my $def = $index->header_for_hit($hit); |
|---|
| 1198 |
my $fasta = Fasta::toFasta('>'.$def, \$seq); |
|---|
| 1199 |
$fastas .= $$fasta; |
|---|
| 1200 |
} |
|---|
| 1201 |
} |
|---|
| 1202 |
return \$fastas; |
|---|
| 1203 |
} |
|---|
| 1204 |
|
|---|
| 1205 |
sub build_fasta_index { |
|---|
| 1206 |
my $db = shift; |
|---|
| 1207 |
my $index = new FastaDB($db); |
|---|
| 1208 |
return $index; |
|---|
| 1209 |
} |
|---|
| 1210 |
|
|---|
| 1211 |
sub build_all_indexes { |
|---|
| 1212 |
my $CTL_OPT = shift; |
|---|
| 1213 |
|
|---|
| 1214 |
my @dbs = ($CTL_OPT->{_est}, |
|---|
| 1215 |
$CTL_OPT->{_protein}, |
|---|
| 1216 |
$CTL_OPT->{_repeat_protein}, |
|---|
| 1217 |
$CTL_OPT->{_est_reads}, |
|---|
| 1218 |
$CTL_OPT->{_altest} |
|---|
| 1219 |
); |
|---|
| 1220 |
|
|---|
| 1221 |
foreach my $db (@dbs){ |
|---|
| 1222 |
next if(! $db); |
|---|
| 1223 |
if($CTL_OPT->{force}){ |
|---|
| 1224 |
foreach my $f (@{[<$db/*.index>]}){ |
|---|
| 1225 |
unlink("$f") if(-f $f); |
|---|
| 1226 |
} |
|---|
| 1227 |
} |
|---|
| 1228 |
new FastaDB($db); |
|---|
| 1229 |
} |
|---|
| 1230 |
} |
|---|
| 1231 |
|
|---|
| 1232 |
sub dbformat { |
|---|
| 1233 |
my $command = shift; |
|---|
| 1234 |
my $file = shift; |
|---|
| 1235 |
my $type = shift; |
|---|
| 1236 |
|
|---|
| 1237 |
die "ERROR: Can not find xdformat or formatdb executable\n" if(! -e $command); |
|---|
| 1238 |
die "ERROR: Can not find the db file $file\n" if(! -e $file); |
|---|
| 1239 |
die "ERROR: You must define a type (blastn|blastx|tblastx)\n" if(! $type); |
|---|
| 1240 |
|
|---|
| 1241 |
|
|---|
| 1242 |
if ($command =~ /xdformat/) { |
|---|
| 1243 |
if (($type eq 'blastn' && ! -e $file.'.xnd') || |
|---|
| 1244 |
($type eq 'blastx' && ! -e $file.'.xpd') || |
|---|
| 1245 |
($type eq 'tblastx' && ! -e $file.'.xnd') |
|---|
| 1246 |
) { |
|---|
| 1247 |
$command .= " -p" if($type eq 'blastx'); |
|---|
| 1248 |
$command .= " -n" if($type eq 'blastn' || $type eq 'tblastx'); |
|---|
| 1249 |
$command .= " $file"; |
|---|
| 1250 |
|
|---|
| 1251 |
my $w = new Widget::xdformat(); |
|---|
| 1252 |
print STDERR "formating database...\n" unless $main::quiet; |
|---|
| 1253 |
$w->run($command); |
|---|
| 1254 |
} |
|---|
| 1255 |
} |
|---|
| 1256 |
elsif ($command =~ /formatdb/) { |
|---|
| 1257 |
if (($type eq 'blastn' && ! -e $file.'.nsq') || |
|---|
| 1258 |
($type eq 'blastx' && ! -e $file.'.psq') || |
|---|
| 1259 |
($type eq 'tblastx' && ! -e $file.'.nsq') |
|---|
| 1260 |
) { |
|---|
| 1261 |
$command .= " -p T" if($type eq 'blastx'); |
|---|
| 1262 |
$command .= " -p F" if($type eq 'blastn' || $type eq 'tblastx'); |
|---|
| 1263 |
$command .= " -i $file"; |
|---|
| 1264 |
|
|---|
| 1265 |
my $w = new Widget::formatdb(); |
|---|
| 1266 |
print STDERR "formating database...\n" unless $main::quiet; |
|---|
| 1267 |
$w->run($command); |
|---|
| 1268 |
} |
|---|
| 1269 |
} |
|---|
| 1270 |
else { |
|---|
| 1271 |
die "ERROR: databases can only be formated by xdformat or formatdb not \'$command\'\n"; |
|---|
| 1272 |
} |
|---|
| 1273 |
} |
|---|
| 1274 |
|
|---|
| 1275 |
sub blastn_as_chunks { |
|---|
| 1276 |
my $chunk = shift; |
|---|
| 1277 |
my $db = shift; |
|---|
| 1278 |
my $old_db = shift; |
|---|
| 1279 |
my $the_void = shift; |
|---|
| 1280 |
my $seq_id = shift; |
|---|
| 1281 |
my $CTL_OPT = shift; |
|---|
| 1282 |
my $rank = shift; |
|---|
| 1283 |
my $LOG = shift; |
|---|
| 1284 |
my $LOG_FLAG = shift; |
|---|
| 1285 |
|
|---|
| 1286 |
my $blastn = $CTL_OPT->{_blastn}; |
|---|
| 1287 |
my $bit_blastn = $CTL_OPT->{bit_blastn}; |
|---|
| 1288 |
my $eval_blastn = $CTL_OPT->{eval_blastn}; |
|---|
| 1289 |
my $pcov_blastn = $CTL_OPT->{pcov_blastn}; |
|---|
| 1290 |
my $pid_blastn = $CTL_OPT->{pid_blastn}; |
|---|
| 1291 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1292 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1293 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1294 |
my $softmask = $CTL_OPT->{softmask}; |
|---|
| 1295 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1296 |
|
|---|
| 1297 |
|
|---|
| 1298 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1299 |
$db_n =~ s/\.fasta$//; |
|---|
| 1300 |
|
|---|
| 1301 |
my $chunk_number = $chunk->number(); |
|---|
| 1302 |
|
|---|
| 1303 |
my ($db_old_n) = $old_db =~ /([^\/]+)$/; |
|---|
| 1304 |
$db_old_n =~ s/\.fasta$//; |
|---|
| 1305 |
my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.blastn"; |
|---|
| 1306 |
|
|---|
| 1307 |
my $t_dir = $TMP."/rank".$rank; |
|---|
| 1308 |
File::Path::mkpath($t_dir); |
|---|
| 1309 |
|
|---|
| 1310 |
my $t_file_name = "$t_dir/$seq_id\.$chunk_number"; |
|---|
| 1311 |
my $blast_dir = "$blast_finished\.temp_dir"; |
|---|
| 1312 |
my $o_file = "$blast_dir/$db_n\.blastn"; |
|---|
| 1313 |
|
|---|
| 1314 |
$db =~ /([^\/]+)$/; |
|---|
| 1315 |
my $tmp_db = "$TMP/$1"; |
|---|
| 1316 |
|
|---|
| 1317 |
$LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG); |
|---|
| 1318 |
|
|---|
| 1319 |
|
|---|
| 1320 |
if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished)) { |
|---|
| 1321 |
if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){ |
|---|
| 1322 |
copy($db, $tmp_db) if(! -e $tmp_db); |
|---|
| 1323 |
dbformat($formater, $tmp_db, 'blastn'); |
|---|
| 1324 |
$lock->unlock; |
|---|
| 1325 |
} |
|---|
| 1326 |
} |
|---|
| 1327 |
elsif (-e $blast_finished) { |
|---|
| 1328 |
print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1329 |
print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1330 |
return $blast_dir; |
|---|
| 1331 |
} |
|---|
| 1332 |
|
|---|
| 1333 |
|
|---|
| 1334 |
$chunk->write_file($t_file_name); |
|---|
| 1335 |
|
|---|
| 1336 |
runBlastn($t_file_name, |
|---|
| 1337 |
$tmp_db, |
|---|
| 1338 |
$o_file, |
|---|
| 1339 |
$blastn, |
|---|
| 1340 |
$eval_blastn, |
|---|
| 1341 |
$split_hit, |
|---|
| 1342 |
$cpus, |
|---|
| 1343 |
$org_type, |
|---|
| 1344 |
$softmask |
|---|
| 1345 |
); |
|---|
| 1346 |
|
|---|
| 1347 |
$chunk->erase_fasta_file(); |
|---|
| 1348 |
|
|---|
| 1349 |
return $blast_dir; |
|---|
| 1350 |
} |
|---|
| 1351 |
|
|---|
| 1352 |
sub collect_blastn{ |
|---|
| 1353 |
my $chunk = shift; |
|---|
| 1354 |
my $blast_dir = shift; |
|---|
| 1355 |
my $CTL_OPT = shift; |
|---|
| 1356 |
my $LOG = shift; |
|---|
| 1357 |
|
|---|
| 1358 |
my $eval_blastn = $CTL_OPT->{eval_blastn}; |
|---|
| 1359 |
my $bit_blastn = $CTL_OPT->{bit_blastn}, |
|---|
| 1360 |
my $pcov_blastn = $CTL_OPT->{pcov_blastn}; |
|---|
| 1361 |
my $pid_blastn = $CTL_OPT->{pid_blastn}; |
|---|
| 1362 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1363 |
|
|---|
| 1364 |
my $blast_finished = $blast_dir; |
|---|
| 1365 |
$blast_finished =~ s/\.temp_dir$//; |
|---|
| 1366 |
|
|---|
| 1367 |
|
|---|
| 1368 |
if (! -e $blast_finished) { |
|---|
| 1369 |
system ("cat $blast_dir/*blast* > $blast_finished"); |
|---|
| 1370 |
File::Path::rmtree ("$blast_dir"); |
|---|
| 1371 |
} |
|---|
| 1372 |
|
|---|
| 1373 |
my %params; |
|---|
| 1374 |
$params{significance} = $eval_blastn; |
|---|
| 1375 |
$params{hsp_bit_min} = $bit_blastn; |
|---|
| 1376 |
$params{percov} = $pcov_blastn; |
|---|
| 1377 |
$params{percid} = $pid_blastn; |
|---|
| 1378 |
$params{split_hit} = $split_hit; |
|---|
| 1379 |
|
|---|
| 1380 |
$LOG->add_entry("FINISHED", $blast_finished, ""); |
|---|
| 1381 |
|
|---|
| 1382 |
my $chunk_keepers = Widget::blastn::parse($blast_finished, |
|---|
| 1383 |
\%params, |
|---|
| 1384 |
); |
|---|
| 1385 |
|
|---|
| 1386 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 1387 |
$chunk->offset(), |
|---|
| 1388 |
); |
|---|
| 1389 |
|
|---|
| 1390 |
if ($chunk->p_cutoff || $chunk->m_cutoff) { |
|---|
| 1391 |
my @keepers; |
|---|
| 1392 |
|
|---|
| 1393 |
foreach my $hit (@{$chunk_keepers}) { |
|---|
| 1394 |
if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) { |
|---|
| 1395 |
push (@keepers, $hit) |
|---|
| 1396 |
} |
|---|
| 1397 |
elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) { |
|---|
| 1398 |
push (@keepers, $hit) |
|---|
| 1399 |
} |
|---|
| 1400 |
} |
|---|
| 1401 |
|
|---|
| 1402 |
return \@keepers; |
|---|
| 1403 |
} |
|---|
| 1404 |
else { |
|---|
| 1405 |
return $chunk_keepers |
|---|
| 1406 |
} |
|---|
| 1407 |
} |
|---|
| 1408 |
|
|---|
| 1409 |
sub blastn { |
|---|
| 1410 |
my $chunk = shift; |
|---|
| 1411 |
my $db = shift; |
|---|
| 1412 |
my $the_void = shift; |
|---|
| 1413 |
my $seq_id = shift; |
|---|
| 1414 |
my $CTL_OPT = shift; |
|---|
| 1415 |
my $LOG = shift; |
|---|
| 1416 |
|
|---|
| 1417 |
my $blastn = $CTL_OPT->{_blastn}; |
|---|
| 1418 |
my $bit_blastn = $CTL_OPT->{bit_blastn}; |
|---|
| 1419 |
my $eval_blastn = $CTL_OPT->{eval_blastn}; |
|---|
| 1420 |
my $pcov_blastn = $CTL_OPT->{pcov_blastn}; |
|---|
| 1421 |
my $pid_blastn = $CTL_OPT->{pid_blastn}; |
|---|
| 1422 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1423 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1424 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1425 |
my $softmask = $CTL_OPT->{softmask}; |
|---|
| 1426 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1427 |
|
|---|
| 1428 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1429 |
$db_n =~ s/\.fasta$//; |
|---|
| 1430 |
|
|---|
| 1431 |
my $chunk_number = $chunk->number(); |
|---|
| 1432 |
my $q_length = $chunk->parent_seq_length(); |
|---|
| 1433 |
my $file_name = "$the_void/$seq_id\.$chunk_number"; |
|---|
| 1434 |
my $o_file = "$the_void/$seq_id\.$chunk_number\.$db_n\.blastn"; |
|---|
| 1435 |
|
|---|
| 1436 |
$LOG->add_entry("STARTED", $o_file, ""); |
|---|
| 1437 |
|
|---|
| 1438 |
$chunk->write_file($file_name); |
|---|
| 1439 |
runBlastn($file_name, |
|---|
| 1440 |
$db, |
|---|
| 1441 |
$o_file, |
|---|
| 1442 |
$blastn, |
|---|
| 1443 |
$eval_blastn, |
|---|
| 1444 |
$split_hit, |
|---|
| 1445 |
$cpus, |
|---|
| 1446 |
$org_type, |
|---|
| 1447 |
$softmask |
|---|
| 1448 |
); |
|---|
| 1449 |
|
|---|
| 1450 |
my %params; |
|---|
| 1451 |
$params{significance} = $eval_blastn; |
|---|
| 1452 |
$params{hsp_bit_min} = $bit_blastn; |
|---|
| 1453 |
$params{percov} = $pcov_blastn; |
|---|
| 1454 |
$params{percid} = $pid_blastn; |
|---|
| 1455 |
$params{split_hit} = $split_hit; |
|---|
| 1456 |
|
|---|
| 1457 |
my $chunk_keepers = Widget::blastn::parse($o_file, |
|---|
| 1458 |
\%params, |
|---|
| 1459 |
); |
|---|
| 1460 |
|
|---|
| 1461 |
$LOG->add_entry("FINISHED", $o_file, ""); |
|---|
| 1462 |
|
|---|
| 1463 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 1464 |
$chunk->offset(), |
|---|
| 1465 |
); |
|---|
| 1466 |
|
|---|
| 1467 |
$chunk->erase_fasta_file(); |
|---|
| 1468 |
|
|---|
| 1469 |
return $chunk_keepers |
|---|
| 1470 |
} |
|---|
| 1471 |
|
|---|
| 1472 |
sub runBlastn { |
|---|
| 1473 |
my $q_file = shift; |
|---|
| 1474 |
my $db = shift; |
|---|
| 1475 |
my $out_file = shift; |
|---|
| 1476 |
my $blast = shift; |
|---|
| 1477 |
my $eval_blast = shift; |
|---|
| 1478 |
my $split_hit = shift; |
|---|
| 1479 |
my $cpus = shift; |
|---|
| 1480 |
my $org_type = shift; |
|---|
| 1481 |
my $softmask = shift; |
|---|
| 1482 |
|
|---|
| 1483 |
my $command = $blast; |
|---|
| 1484 |
if ($command =~ /blastn$/) { |
|---|
| 1485 |
$command .= " $db $q_file B=100000 V=100000 E=$eval_blast"; |
|---|
| 1486 |
$command .= ($softmask) ? " wordmask=seg" : " filter=seg";; |
|---|
| 1487 |
$command .= " R=3"; |
|---|
| 1488 |
$command .= " W=15"; |
|---|
| 1489 |
$command .= " M=1"; |
|---|
| 1490 |
$command .= " N=-3"; |
|---|
| 1491 |
$command .= " Q=3"; |
|---|
| 1492 |
$command .= " Z=1000"; |
|---|
| 1493 |
$command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; |
|---|
| 1494 |
$command .= " cpus=$cpus"; |
|---|
| 1495 |
$command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : ""; |
|---|
| 1496 |
$command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; |
|---|
| 1497 |
$command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; |
|---|
| 1498 |
$command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : ""; |
|---|
| 1499 |
$command .= " lcmask"; |
|---|
| 1500 |
$command .= " gi"; |
|---|
| 1501 |
$command .= " warnings"; |
|---|
| 1502 |
$command .= " novalidctxok"; |
|---|
| 1503 |
$command .= " shortqueryok"; |
|---|
| 1504 |
$command .= ($org_type eq 'eukaryotic') ? "" : " kap"; |
|---|
| 1505 |
|
|---|
| 1506 |
$command .= " -o $out_file"; |
|---|
| 1507 |
} |
|---|
| 1508 |
elsif ($command =~ /blastall$/) { |
|---|
| 1509 |
$command .= " -p blastn"; |
|---|
| 1510 |
$command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast"; |
|---|
| 1511 |
$command .= " -E 3"; |
|---|
| 1512 |
$command .= " -W 15"; |
|---|
| 1513 |
$command .= " -r 1"; |
|---|
| 1514 |
$command .= " -q -3"; |
|---|
| 1515 |
$command .= " -G 3"; |
|---|
| 1516 |
$command .= " -z 1000"; |
|---|
| 1517 |
$command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; |
|---|
| 1518 |
$command .= " -a $cpus"; |
|---|
| 1519 |
$command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; |
|---|
| 1520 |
$command .= " -U"; |
|---|
| 1521 |
$command .= " -F T"; |
|---|
| 1522 |
$command .= " -I"; |
|---|
| 1523 |
|
|---|
| 1524 |
$command .= " -o $out_file"; |
|---|
| 1525 |
} |
|---|
| 1526 |
else{ |
|---|
| 1527 |
die "ERROR: Must be a blastn executable"; |
|---|
| 1528 |
} |
|---|
| 1529 |
|
|---|
| 1530 |
my $w = new Widget::blastn(); |
|---|
| 1531 |
if (-e $out_file) { |
|---|
| 1532 |
print STDERR "re reading blast report.\n" unless $main::quiet; |
|---|
| 1533 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 1534 |
} |
|---|
| 1535 |
else { |
|---|
| 1536 |
print STDERR "running blast search.\n" unless $main::quiet; |
|---|
| 1537 |
my $dir = $out_file; |
|---|
| 1538 |
$dir =~ s/[^\/]+$//; |
|---|
| 1539 |
File::Path::mkpath($dir); |
|---|
| 1540 |
$w->run($command); |
|---|
| 1541 |
} |
|---|
| 1542 |
} |
|---|
| 1543 |
|
|---|
| 1544 |
sub blastx_as_chunks { |
|---|
| 1545 |
my $chunk = shift; |
|---|
| 1546 |
my $db = shift; |
|---|
| 1547 |
my $old_db = shift; |
|---|
| 1548 |
my $the_void = shift; |
|---|
| 1549 |
my $seq_id = shift; |
|---|
| 1550 |
my $CTL_OPT = shift; |
|---|
| 1551 |
my $rank = shift; |
|---|
| 1552 |
my $LOG = shift; |
|---|
| 1553 |
my $LOG_FLAG = shift; |
|---|
| 1554 |
|
|---|
| 1555 |
my $rflag = 1 if($old_db && $CTL_OPT->{repeat_protein} eq $old_db); |
|---|
| 1556 |
|
|---|
| 1557 |
my $blastx = $CTL_OPT->{_blastx}; |
|---|
| 1558 |
my $bit_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx}; |
|---|
| 1559 |
my $eval_blastx = ($rflag) ? $CTL_OPT->{eval_rm_blastx} : $CTL_OPT->{eval_blastx}; |
|---|
| 1560 |
my $pcov_blastx = ($rflag) ? $CTL_OPT->{pcov_rm_blastx} : $CTL_OPT->{pcov_blastx}; |
|---|
| 1561 |
my $pid_blastx = ($rflag) ? $CTL_OPT->{pid_rm_blastx} : $CTL_OPT->{pid_blastx}; |
|---|
| 1562 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1563 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1564 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1565 |
my $softmask = ($rflag) ? 1 : $CTL_OPT->{softmask}; |
|---|
| 1566 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1567 |
|
|---|
| 1568 |
|
|---|
| 1569 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1570 |
$db_n =~ s/\.fasta$//; |
|---|
| 1571 |
|
|---|
| 1572 |
my $chunk_number = $chunk->number(); |
|---|
| 1573 |
|
|---|
| 1574 |
my ($db_old_n) = $old_db =~ /([^\/]+)$/; |
|---|
| 1575 |
$db_old_n =~ s/\.fasta$//; |
|---|
| 1576 |
my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.blastx"; |
|---|
| 1577 |
|
|---|
| 1578 |
my $t_dir = $TMP."/rank".$rank; |
|---|
| 1579 |
File::Path::mkpath($t_dir); |
|---|
| 1580 |
|
|---|
| 1581 |
my $t_file_name = "$t_dir/$seq_id\.$chunk_number"; |
|---|
| 1582 |
my $blast_dir = "$blast_finished\.temp_dir"; |
|---|
| 1583 |
my $o_file = "$blast_dir/$db_n\.blastx"; |
|---|
| 1584 |
|
|---|
| 1585 |
$db =~ /([^\/]+)$/; |
|---|
| 1586 |
my $tmp_db = "$TMP/$1"; |
|---|
| 1587 |
|
|---|
| 1588 |
$LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG); |
|---|
| 1589 |
|
|---|
| 1590 |
|
|---|
| 1591 |
if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished) ) { |
|---|
| 1592 |
if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){ |
|---|
| 1593 |
copy($db, $tmp_db) if(! -e $tmp_db); |
|---|
| 1594 |
dbformat($formater, $tmp_db, 'blastx'); |
|---|
| 1595 |
$lock->unlock; |
|---|
| 1596 |
} |
|---|
| 1597 |
} |
|---|
| 1598 |
elsif (-e $blast_finished) { |
|---|
| 1599 |
print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1600 |
print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1601 |
return $blast_dir; |
|---|
| 1602 |
} |
|---|
| 1603 |
|
|---|
| 1604 |
|
|---|
| 1605 |
$chunk->write_file($t_file_name); |
|---|
| 1606 |
|
|---|
| 1607 |
runBlastx($t_file_name, |
|---|
| 1608 |
$tmp_db, |
|---|
| 1609 |
$o_file, |
|---|
| 1610 |
$blastx, |
|---|
| 1611 |
$eval_blastx, |
|---|
| 1612 |
$split_hit, |
|---|
| 1613 |
$cpus, |
|---|
| 1614 |
$org_type, |
|---|
| 1615 |
$softmask |
|---|
| 1616 |
); |
|---|
| 1617 |
|
|---|
| 1618 |
$chunk->erase_fasta_file(); |
|---|
| 1619 |
|
|---|
| 1620 |
return $blast_dir; |
|---|
| 1621 |
} |
|---|
| 1622 |
|
|---|
| 1623 |
sub collect_blastx{ |
|---|
| 1624 |
my $chunk = shift; |
|---|
| 1625 |
my $blast_dir = shift; |
|---|
| 1626 |
my $CTL_OPT = shift; |
|---|
| 1627 |
my $LOG = shift; |
|---|
| 1628 |
|
|---|
| 1629 |
my $eval_blastx = $CTL_OPT->{eval_blastx}; |
|---|
| 1630 |
my $bit_blastx = $CTL_OPT->{bit_blastx}, |
|---|
| 1631 |
my $pcov_blastx = $CTL_OPT->{pcov_blastx}; |
|---|
| 1632 |
my $pid_blastx = $CTL_OPT->{pid_blastx}; |
|---|
| 1633 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1634 |
|
|---|
| 1635 |
my $blast_finished = $blast_dir; |
|---|
| 1636 |
$blast_finished =~ s/\.temp_dir$//; |
|---|
| 1637 |
|
|---|
| 1638 |
|
|---|
| 1639 |
if (! -e $blast_finished) { |
|---|
| 1640 |
system ("cat $blast_dir/*blast* > $blast_finished"); |
|---|
| 1641 |
rmtree ("$blast_dir"); |
|---|
| 1642 |
} |
|---|
| 1643 |
|
|---|
| 1644 |
my %params; |
|---|
| 1645 |
$params{significance} = $eval_blastx; |
|---|
| 1646 |
$params{hsp_bit_min} = $bit_blastx; |
|---|
| 1647 |
$params{percov} = $pcov_blastx; |
|---|
| 1648 |
$params{percid} = $pid_blastx; |
|---|
| 1649 |
$params{split_hit} = $split_hit; |
|---|
| 1650 |
|
|---|
| 1651 |
$LOG->add_entry("FINISHED", $blast_finished, ""); |
|---|
| 1652 |
|
|---|
| 1653 |
my $chunk_keepers = Widget::blastx::parse($blast_finished, |
|---|
| 1654 |
\%params, |
|---|
| 1655 |
); |
|---|
| 1656 |
|
|---|
| 1657 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 1658 |
$chunk->offset() |
|---|
| 1659 |
); |
|---|
| 1660 |
|
|---|
| 1661 |
if ($chunk->p_cutoff || $chunk->m_cutoff) { |
|---|
| 1662 |
my @keepers; |
|---|
| 1663 |
|
|---|
| 1664 |
foreach my $hit (@{$chunk_keepers}) { |
|---|
| 1665 |
if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) { |
|---|
| 1666 |
push (@keepers, $hit) |
|---|
| 1667 |
} |
|---|
| 1668 |
elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) { |
|---|
| 1669 |
push (@keepers, $hit) |
|---|
| 1670 |
} |
|---|
| 1671 |
} |
|---|
| 1672 |
|
|---|
| 1673 |
return \@keepers; |
|---|
| 1674 |
} |
|---|
| 1675 |
else { |
|---|
| 1676 |
return $chunk_keepers; |
|---|
| 1677 |
} |
|---|
| 1678 |
} |
|---|
| 1679 |
|
|---|
| 1680 |
sub blastx { |
|---|
| 1681 |
my $chunk = shift; |
|---|
| 1682 |
my $db = shift; |
|---|
| 1683 |
my $the_void = shift; |
|---|
| 1684 |
my $seq_id = shift; |
|---|
| 1685 |
my $CTL_OPT = shift; |
|---|
| 1686 |
my $LOG = shift; |
|---|
| 1687 |
|
|---|
| 1688 |
my $rflag = 1 if($db && $CTL_OPT->{repeat_protein} eq $db); |
|---|
| 1689 |
|
|---|
| 1690 |
my $blastx = $CTL_OPT->{_blastx}; |
|---|
| 1691 |
my $bit_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx}; |
|---|
| 1692 |
my $eval_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{eval_blastx}; |
|---|
| 1693 |
my $pcov_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pcov_blastx}; |
|---|
| 1694 |
my $pid_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pid_blastx}; |
|---|
| 1695 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1696 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1697 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1698 |
my $softmask = ($rflag) ? 1 : $CTL_OPT->{softmask}; |
|---|
| 1699 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1700 |
|
|---|
| 1701 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1702 |
$db_n =~ s/\.fasta$//; |
|---|
| 1703 |
|
|---|
| 1704 |
my $q_length = $chunk->parent_seq_length(); |
|---|
| 1705 |
my $chunk_number = $chunk->number(); |
|---|
| 1706 |
|
|---|
| 1707 |
my $file_name = "$the_void/$seq_id\.$chunk_number"; |
|---|
| 1708 |
my $o_file = "$the_void/$seq_id\.$chunk_number\.$db_n\.blastx"; |
|---|
| 1709 |
|
|---|
| 1710 |
$LOG->add_entry("STARTED", $o_file, ""); |
|---|
| 1711 |
|
|---|
| 1712 |
$chunk->write_file($file_name); |
|---|
| 1713 |
runBlastx($file_name, |
|---|
| 1714 |
$db, |
|---|
| 1715 |
$o_file, |
|---|
| 1716 |
$blastx, |
|---|
| 1717 |
$eval_blastx, |
|---|
| 1718 |
$split_hit, |
|---|
| 1719 |
$cpus, |
|---|
| 1720 |
$org_type, |
|---|
| 1721 |
$softmask |
|---|
| 1722 |
); |
|---|
| 1723 |
|
|---|
| 1724 |
my %params; |
|---|
| 1725 |
$params{significance} = $eval_blastx; |
|---|
| 1726 |
$params{hsp_bit_min} = $bit_blastx; |
|---|
| 1727 |
$params{percov} = $pcov_blastx; |
|---|
| 1728 |
$params{percid} = $pid_blastx; |
|---|
| 1729 |
$params{split_hit} = $split_hit; |
|---|
| 1730 |
|
|---|
| 1731 |
my $chunk_keepers = Widget::blastx::parse($o_file, |
|---|
| 1732 |
\%params, |
|---|
| 1733 |
); |
|---|
| 1734 |
|
|---|
| 1735 |
$LOG->add_entry("FINISHED", $o_file, ""); |
|---|
| 1736 |
|
|---|
| 1737 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 1738 |
$chunk->offset(), |
|---|
| 1739 |
); |
|---|
| 1740 |
|
|---|
| 1741 |
$chunk->erase_fasta_file(); |
|---|
| 1742 |
|
|---|
| 1743 |
if ($chunk->p_cutoff || $chunk->m_cutoff) { |
|---|
| 1744 |
my @keepers; |
|---|
| 1745 |
|
|---|
| 1746 |
foreach my $hit (@{$chunk_keepers}) { |
|---|
| 1747 |
if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) { |
|---|
| 1748 |
push (@keepers, $hit) |
|---|
| 1749 |
} |
|---|
| 1750 |
elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) { |
|---|
| 1751 |
push (@keepers, $hit) |
|---|
| 1752 |
} |
|---|
| 1753 |
} |
|---|
| 1754 |
|
|---|
| 1755 |
return \@keepers; |
|---|
| 1756 |
} |
|---|
| 1757 |
else { |
|---|
| 1758 |
return $chunk_keepers |
|---|
| 1759 |
} |
|---|
| 1760 |
} |
|---|
| 1761 |
|
|---|
| 1762 |
|
|---|
| 1763 |
sub runBlastx { |
|---|
| 1764 |
my $q_file = shift; |
|---|
| 1765 |
my $db = shift; |
|---|
| 1766 |
my $out_file = shift; |
|---|
| 1767 |
my $blast = shift; |
|---|
| 1768 |
my $eval_blast = shift; |
|---|
| 1769 |
my $split_hit = shift; |
|---|
| 1770 |
my $cpus = shift; |
|---|
| 1771 |
my $org_type = shift; |
|---|
| 1772 |
my $softmask = shift; |
|---|
| 1773 |
|
|---|
| 1774 |
|
|---|
| 1775 |
my $command = $blast; |
|---|
| 1776 |
if ($command =~ /blastx$/) { |
|---|
| 1777 |
$command .= " $db $q_file B=10000 V=10000 E=$eval_blast"; |
|---|
| 1778 |
$command .= ($softmask) ? " wordmask=seg" : " filter=seg"; |
|---|
| 1779 |
|
|---|
| 1780 |
|
|---|
| 1781 |
|
|---|
| 1782 |
$command .= " Z=300"; |
|---|
| 1783 |
$command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; |
|---|
| 1784 |
$command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; |
|---|
| 1785 |
$command .= " cpus=$cpus"; |
|---|
| 1786 |
$command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; |
|---|
| 1787 |
|
|---|
| 1788 |
$command .= " lcmask"; |
|---|
| 1789 |
$command .= " kap"; |
|---|
| 1790 |
$command .= " gi"; |
|---|
| 1791 |
$command .= " warnings"; |
|---|
| 1792 |
$command .= " novalidctxok"; |
|---|
| 1793 |
$command .= " shortqueryok"; |
|---|
| 1794 |
|
|---|
| 1795 |
$command .= " -o $out_file"; |
|---|
| 1796 |
} |
|---|
| 1797 |
elsif ($command =~ /blastall$/) { |
|---|
| 1798 |
$command .= " -p blastx"; |
|---|
| 1799 |
$command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast"; |
|---|
| 1800 |
$command .= " -z 300"; |
|---|
| 1801 |
$command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; |
|---|
| 1802 |
$command .= " -a $cpus"; |
|---|
| 1803 |
$command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; |
|---|
| 1804 |
$command .= " -U"; |
|---|
| 1805 |
$command .= " -F T"; |
|---|
| 1806 |
$command .= " -I"; |
|---|
| 1807 |
|
|---|
| 1808 |
$command .= " -o $out_file"; |
|---|
| 1809 |
} |
|---|
| 1810 |
else{ |
|---|
| 1811 |
die "ERROR: Must be a blastx executable"; |
|---|
| 1812 |
} |
|---|
| 1813 |
|
|---|
| 1814 |
my $w = new Widget::blastx(); |
|---|
| 1815 |
|
|---|
| 1816 |
if (-e $out_file) { |
|---|
| 1817 |
print STDERR "re reading blast report.\n" unless $main::quiet; |
|---|
| 1818 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 1819 |
} |
|---|
| 1820 |
else { |
|---|
| 1821 |
print STDERR "running blast search.\n" unless $main::quiet; |
|---|
| 1822 |
my $dir = $out_file; |
|---|
| 1823 |
$dir =~ s/[^\/]+$//; |
|---|
| 1824 |
File::Path::mkpath($dir); |
|---|
| 1825 |
$w->run($command); |
|---|
| 1826 |
} |
|---|
| 1827 |
} |
|---|
| 1828 |
|
|---|
| 1829 |
sub tblastx_as_chunks { |
|---|
| 1830 |
my $chunk = shift; |
|---|
| 1831 |
my $db = shift; |
|---|
| 1832 |
my $old_db = shift; |
|---|
| 1833 |
my $the_void = shift; |
|---|
| 1834 |
my $seq_id = shift; |
|---|
| 1835 |
my $CTL_OPT = shift; |
|---|
| 1836 |
my $rank = shift; |
|---|
| 1837 |
my $LOG = shift; |
|---|
| 1838 |
my $LOG_FLAG = shift; |
|---|
| 1839 |
|
|---|
| 1840 |
my $tblastx = $CTL_OPT->{_tblastx}; |
|---|
| 1841 |
my $bit_tblastx = $CTL_OPT->{bit_tblastx}; |
|---|
| 1842 |
my $eval_tblastx = $CTL_OPT->{eval_tblastx}; |
|---|
| 1843 |
my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; |
|---|
| 1844 |
my $pid_tblastx = $CTL_OPT->{pid_tblastx}; |
|---|
| 1845 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1846 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1847 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1848 |
my $softmask = $CTL_OPT->{softmask}; |
|---|
| 1849 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1850 |
|
|---|
| 1851 |
|
|---|
| 1852 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1853 |
$db_n =~ s/\.fasta$//; |
|---|
| 1854 |
|
|---|
| 1855 |
my $chunk_number = $chunk->number(); |
|---|
| 1856 |
|
|---|
| 1857 |
my ($db_old_n) = $old_db =~ /([^\/]+)$/; |
|---|
| 1858 |
$db_old_n =~ s/\.fasta$//; |
|---|
| 1859 |
my $blast_finished = "$the_void/$seq_id\.$chunk_number\.$db_old_n\.tblastx"; |
|---|
| 1860 |
|
|---|
| 1861 |
my $t_dir = $TMP."/rank".$rank; |
|---|
| 1862 |
File::Path::mkpath($t_dir); |
|---|
| 1863 |
|
|---|
| 1864 |
my $t_file_name = "$t_dir/$seq_id\.$chunk_number"; |
|---|
| 1865 |
my $blast_dir = "$blast_finished\.temp_dir"; |
|---|
| 1866 |
my $o_file = "$blast_dir/$db_n\.tblastx"; |
|---|
| 1867 |
|
|---|
| 1868 |
$db =~ /([^\/]+)$/; |
|---|
| 1869 |
my $tmp_db = "$TMP/$1"; |
|---|
| 1870 |
|
|---|
| 1871 |
$LOG->add_entry("STARTED", $blast_finished, "") if($LOG_FLAG); |
|---|
| 1872 |
|
|---|
| 1873 |
|
|---|
| 1874 |
if ((! @{[<$tmp_db.x?d*>]} || ! @{[<$tmp_db.?sq*>]}) && (! -e $blast_finished) ) { |
|---|
| 1875 |
if(my $lock = new File::NFSLock("$tmp_db.lock", 'EX', undef, 300)){ |
|---|
| 1876 |
copy($db, $tmp_db) if(! -e $tmp_db); |
|---|
| 1877 |
dbformat($formater, $tmp_db, 'tblastx'); |
|---|
| 1878 |
$lock->unlock; |
|---|
| 1879 |
} |
|---|
| 1880 |
} |
|---|
| 1881 |
elsif (-e $blast_finished) { |
|---|
| 1882 |
print STDERR "re reading blast report.\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1883 |
print STDERR "$blast_finished\n" unless ($main::quiet || !$LOG_FLAG); |
|---|
| 1884 |
return $blast_dir; |
|---|
| 1885 |
} |
|---|
| 1886 |
|
|---|
| 1887 |
|
|---|
| 1888 |
$chunk->write_file($t_file_name); |
|---|
| 1889 |
|
|---|
| 1890 |
runtBlastx($t_file_name, |
|---|
| 1891 |
$tmp_db, |
|---|
| 1892 |
$o_file, |
|---|
| 1893 |
$tblastx, |
|---|
| 1894 |
$eval_tblastx, |
|---|
| 1895 |
$split_hit, |
|---|
| 1896 |
$cpus, |
|---|
| 1897 |
$org_type, |
|---|
| 1898 |
$softmask |
|---|
| 1899 |
); |
|---|
| 1900 |
|
|---|
| 1901 |
$chunk->erase_fasta_file(); |
|---|
| 1902 |
|
|---|
| 1903 |
return $blast_dir; |
|---|
| 1904 |
} |
|---|
| 1905 |
|
|---|
| 1906 |
sub collect_tblastx{ |
|---|
| 1907 |
my $chunk = shift; |
|---|
| 1908 |
my $blast_dir = shift; |
|---|
| 1909 |
my $CTL_OPT = shift; |
|---|
| 1910 |
my $LOG = shift; |
|---|
| 1911 |
|
|---|
| 1912 |
my $eval_tblastx = $CTL_OPT->{eval_tblastx}; |
|---|
| 1913 |
my $bit_tblastx = $CTL_OPT->{bit_tblastx}, |
|---|
| 1914 |
my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; |
|---|
| 1915 |
my $pid_tblastx = $CTL_OPT->{pid_tblastx}; |
|---|
| 1916 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1917 |
|
|---|
| 1918 |
my $blast_finished = $blast_dir; |
|---|
| 1919 |
$blast_finished =~ s/\.temp_dir$//; |
|---|
| 1920 |
|
|---|
| 1921 |
|
|---|
| 1922 |
if (! -e $blast_finished) { |
|---|
| 1923 |
system ("cat $blast_dir/*blast* > $blast_finished"); |
|---|
| 1924 |
File::Path::rmtree ("$blast_dir"); |
|---|
| 1925 |
} |
|---|
| 1926 |
|
|---|
| 1927 |
my %params; |
|---|
| 1928 |
$params{significance} = $eval_tblastx; |
|---|
| 1929 |
$params{hsp_bit_min} = $bit_tblastx; |
|---|
| 1930 |
$params{percov} = $pcov_tblastx; |
|---|
| 1931 |
$params{percid} = $pid_tblastx; |
|---|
| 1932 |
$params{split_hit} = $split_hit; |
|---|
| 1933 |
|
|---|
| 1934 |
$LOG->add_entry("FINISHED", $blast_finished, ""); |
|---|
| 1935 |
|
|---|
| 1936 |
my $chunk_keepers = Widget::tblastx::parse($blast_finished, |
|---|
| 1937 |
\%params, |
|---|
| 1938 |
); |
|---|
| 1939 |
|
|---|
| 1940 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 1941 |
$chunk->offset(), |
|---|
| 1942 |
); |
|---|
| 1943 |
|
|---|
| 1944 |
if ($chunk->p_cutoff || $chunk->m_cutoff) { |
|---|
| 1945 |
my @keepers; |
|---|
| 1946 |
|
|---|
| 1947 |
foreach my $hit (@{$chunk_keepers}) { |
|---|
| 1948 |
if ($hit->strand('query') eq '1' && $hit->start('query') >= $chunk->p_cutoff) { |
|---|
| 1949 |
push (@keepers, $hit) |
|---|
| 1950 |
} |
|---|
| 1951 |
elsif ($hit->strand('query') eq '-1' && $hit->start('query') >= $chunk->m_cutoff) { |
|---|
| 1952 |
push (@keepers, $hit) |
|---|
| 1953 |
} |
|---|
| 1954 |
} |
|---|
| 1955 |
|
|---|
| 1956 |
return \@keepers; |
|---|
| 1957 |
} |
|---|
| 1958 |
else { |
|---|
| 1959 |
return $chunk_keepers |
|---|
| 1960 |
} |
|---|
| 1961 |
} |
|---|
| 1962 |
|
|---|
| 1963 |
sub tblastx { |
|---|
| 1964 |
my $chunk = shift; |
|---|
| 1965 |
my $db = shift; |
|---|
| 1966 |
my $the_void = shift; |
|---|
| 1967 |
my $seq_id = shift; |
|---|
| 1968 |
my $CTL_OPT = shift; |
|---|
| 1969 |
my $LOG = shift; |
|---|
| 1970 |
|
|---|
| 1971 |
my $tblastx = $CTL_OPT->{_tblastx}; |
|---|
| 1972 |
my $bit_tblastx = $CTL_OPT->{bit_tblastx}; |
|---|
| 1973 |
my $eval_tblastx = $CTL_OPT->{eval_tblastx}; |
|---|
| 1974 |
my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; |
|---|
| 1975 |
my $pid_tblastx = $CTL_OPT->{pid_tblastx}; |
|---|
| 1976 |
my $split_hit = $CTL_OPT->{split_hit}; |
|---|
| 1977 |
my $cpus = $CTL_OPT->{cpus}; |
|---|
| 1978 |
my $formater = $CTL_OPT->{_formater}; |
|---|
| 1979 |
my $softmask = $CTL_OPT->{softmask}; |
|---|
| 1980 |
my $org_type = $CTL_OPT->{organism_type}; |
|---|
| 1981 |
|
|---|
| 1982 |
my ($db_n) = $db =~ /([^\/]+)$/; |
|---|
| 1983 |
$db_n =~ s/\.fasta$//; |
|---|
| 1984 |
|
|---|
| 1985 |
my $chunk_number = $chunk->number(); |
|---|
| 1986 |
my $q_length = $chunk->parent_seq_length(); |
|---|
| 1987 |
my $file_name = "$the_void/$seq_id\.$chunk_number"; |
|---|
| 1988 |
my $o_file = "$the_void/$seq_id\.$chunk_number\.$db_n\.tblastx"; |
|---|
| 1989 |
|
|---|
| 1990 |
$LOG->add_entry("STARTED", $o_file, ""); |
|---|
| 1991 |
|
|---|
| 1992 |
$chunk->write_file($file_name); |
|---|
| 1993 |
runtBlastx($file_name, |
|---|
| 1994 |
$db, |
|---|
| 1995 |
$o_file, |
|---|
| 1996 |
$tblastx, |
|---|
| 1997 |
$eval_tblastx, |
|---|
| 1998 |
$split_hit, |
|---|
| 1999 |
$cpus, |
|---|
| 2000 |
$org_type, |
|---|
| 2001 |
$softmask |
|---|
| 2002 |
); |
|---|
| 2003 |
|
|---|
| 2004 |
my %params; |
|---|
| 2005 |
$params{significance} = $eval_tblastx; |
|---|
| 2006 |
$params{hsp_bit_min} = $bit_tblastx; |
|---|
| 2007 |
$params{percov} = $pcov_tblastx; |
|---|
| 2008 |
$params{percid} = $pid_tblastx; |
|---|
| 2009 |
$params{split_hit} = $split_hit; |
|---|
| 2010 |
|
|---|
| 2011 |
my $chunk_keepers = Widget::tblastx::parse($o_file, |
|---|
| 2012 |
\%params, |
|---|
| 2013 |
); |
|---|
| 2014 |
|
|---|
| 2015 |
$LOG->add_entry("FINISHED", $o_file, ""); |
|---|
| 2016 |
|
|---|
| 2017 |
PhatHit_utils::add_offset($chunk_keepers, |
|---|
| 2018 |
$chunk->offset(), |
|---|
| 2019 |
); |
|---|
| 2020 |
|
|---|
| 2021 |
$chunk->erase_fasta_file(); |
|---|
| 2022 |
|
|---|
| 2023 |
return $chunk_keepers; |
|---|
| 2024 |
} |
|---|
| 2025 |
|
|---|
| 2026 |
sub runtBlastx { |
|---|
| 2027 |
my $q_file = shift; |
|---|
| 2028 |
my $db = shift; |
|---|
| 2029 |
my $out_file = shift; |
|---|
| 2030 |
my $blast = shift; |
|---|
| 2031 |
my $eval_blast = shift; |
|---|
| 2032 |
my $split_hit = shift; |
|---|
| 2033 |
my $cpus = shift; |
|---|
| 2034 |
my $org_type = shift; |
|---|
| 2035 |
my $softmask = shift; |
|---|
| 2036 |
|
|---|
| 2037 |
|
|---|
| 2038 |
my $command = $blast; |
|---|
| 2039 |
if ($command =~ /tblastx$/) { |
|---|
| 2040 |
$command .= " $db $q_file B=100000 V=100000 E=$eval_blast"; |
|---|
| 2041 |
$command .= ($softmask) ? " wordmask=seg" : " filter=seg"; |
|---|
| 2042 |
|
|---|
| 2043 |
$command .= " Z=1000"; |
|---|
| 2044 |
$command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; |
|---|
| 2045 |
$command .= " cpus=$cpus"; |
|---|
| 2046 |
$command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : ""; |
|---|
| 2047 |
$command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; |
|---|
| 2048 |
$command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; |
|---|
| 2049 |
$command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : ""; |
|---|
| 2050 |
$command .= " lcmask"; |
|---|
| 2051 |
$command .= " gi"; |
|---|
| 2052 |
$command .= " warnings"; |
|---|
| 2053 |
$command .= " novalidctxok"; |
|---|
| 2054 |
$command .= " shortqueryok"; |
|---|
| 2055 |
$command .= ($org_type eq 'eukaryotic') ? "" : " kap"; |
|---|
| 2056 |
|
|---|
| 2057 |
$command .= " -o $out_file"; |
|---|
| 2058 |
} |
|---|
| 2059 |
elsif ($command =~ /blastall$/) { |
|---|
| 2060 |
$command .= " -p tblastx"; |
|---|
| 2061 |
$command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast"; |
|---|
| 2062 |
|
|---|
| 2063 |
$command .= " -z 1000"; |
|---|
| 2064 |
$command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; |
|---|
| 2065 |
$command .= " -a $cpus"; |
|---|
| 2066 |
$command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; |
|---|
| 2067 |
$command .= " -U"; |
|---|
| 2068 |
$command .= " -F T"; |
|---|
| 2069 |
$command .= " -I"; |
|---|
| 2070 |
|
|---|
| 2071 |
$command .= " -o $out_file"; |
|---|
| 2072 |
} |
|---|
| 2073 |
else{ |
|---|
| 2074 |
die "ERROR: Must be a tblastx executable"; |
|---|
| 2075 |
} |
|---|
| 2076 |
|
|---|
| 2077 |
my $w = new Widget::tblastx(); |
|---|
| 2078 |
if (-e $out_file) { |
|---|
| 2079 |
print STDERR "re reading blast report.\n" unless $main::quiet; |
|---|
| 2080 |
print STDERR "$out_file\n" unless $main::quiet; |
|---|
| 2081 |
} |
|---|
| 2082 |
else { |
|---|
| 2083 |
print STDERR "running blast search.\n" unless $main::quiet; |
|---|
| 2084 |
my $dir = $out_file; |
|---|
| 2085 |
$dir =~ s/[^\/]+$//; |
|---|
| 2086 |
File::Path::mkpath($dir); |
|---|
| 2087 |
$w->run($command); |
|---|
| 2088 |
} |
|---|
| 2089 |
|
|---|
| 2090 |
} |
|---|
| 2091 |
|
|---|
| 2092 |
sub repeatmask { |
|---|
| 2093 |
my $chunk = shift; |
|---|
| 2094 |
my $the_void = shift; |
|---|
| 2095 |
my $seq_id = shift; |
|---|
| 2096 |
my $model_org = shift; |
|---|
| 2097 |
my $RepeatMasker = shift; |
|---|
| 2098 |
my $rmlib = shift; |
|---|
| 2099 |
my $cpus = shift; |
|---|
| 2100 |
my $LOG = shift; |
|---|
| 2101 |
|
|---|
| 2102 |
my $chunk_number = $chunk->number(); |
|---|
| 2103 |
my $file_name = "$the_void/$seq_id\.$chunk_number"; |
|---|
| 2104 |
$file_name .= ".specific" if($rmlib); |
|---|
| 2105 |
my $o_file = "$file_name\.out"; |
|---|
| 2106 |
my $q_length = $chunk->parent_seq_length(); |
|---|
| 2107 |
my $query_def = $chunk->parent_def(); |
|---|
| 2108 |
my $query_seq = $chunk->seq(); |
|---|
| 2109 |
|
|---|
| 2110 |
$LOG->add_entry("STARTED", $o_file, ""); |
|---|
| 2111 |
|
|---|
| 2112 |
$chunk->write_file($file_name); |
|---|
| 2113 |
|
|---|
| 2114 |
runRepeatMasker($file_name, |
|---|
| 2115 |
$model_org, |
|---|
| 2116 |
$the_void, |
|---|
| 2117 |
$o_file, |
|---|
| 2118 |
$RepeatMasker, |
|---|
| 2119 |
$rmlib, |
|---|
| 2120 |
$cpus, |
|---|
| 2121 |
); |
|---|
| 2122 |
|
|---|
| 2123 |
my $rm_chunk_keepers = Widget::RepeatMasker::parse($o_file, |
|---|
| 2124 |
$seq_id, |
|---|
| 2125 |
$q_length |
|---|
| 2126 |
); |
|---|
| 2127 |
|
|---|
| 2128 |
$LOG->add_entry("FINISHED", $o_file, ""); |
|---|
| 2129 |
|
|---|
| 2130 |
PhatHit_utils::add_offset($rm_chunk_keepers, |
|---|
| 2131 |
$chunk->offset(), |
|---|
| 2132 |
); |
|---|
| 2133 |
|
|---|
| 2134 |
|
|---|
| 2135 |
|
|---|
| 2136 |
|
|---|
| 2137 |
|
|---|
| 2138 |
$chunk->erase_fasta_file(); |
|---|
| 2139 |
|
|---|
| 2140 |
return ($rm_chunk_keepers); |
|---|
| 2141 |
} |
|---|
| 2142 |
|
|---|
| 2143 |
sub runRepeatMasker { |
|---|
| 2144 |
my $q_file = shift; |
|---|
| 2145 |
my $species = shift; |
|---|
| 2146 |
my $dir = shift; |
|---|
| 2147 |
my $o_file = shift; |
|---|
| 2148 |
my $RepeatMasker = shift; |
|---|
| 2149 |
my $rmlib = shift; |
|---|
| 2150 |
my $cpus = shift; |
|---|
| 2151 |
my $no_low = shift; |
|---|
| 2152 |
|
|---|
| 2153 |
my $command = $RepeatMasker; |
|---|
| 2154 |
|
|---|
| 2155 |
if ($rmlib) { |
|---|
| 2156 |
$command .= " $q_file -lib $rmlib -dir $dir -pa $cpus"; |
|---|
| 2157 |
} |
|---|
| 2158 |
else { |
|---|
| 2159 |
$command .= " $q_file -species $species -dir $dir -pa $cpus"; |
|---|
| 2160 |
} |
|---|
| 2161 |
$command .= " -nolow" if defined($no_low); |
|---|
| 2162 |
|
|---|
| 2163 |
my $w = new Widget::RepeatMasker(); |
|---|
| 2164 |
if (-e $o_file) { |
|---|
| 2165 |
print STDERR "re reading repeat masker report.\n" unless $main::quiet; |
|---|
| 2166 |
print STDERR "$o_file\n" unless $main::quiet; |
|---|
| 2167 |
} |
|---|
| 2168 |
else { |
|---|
| 2169 |
print STDERR "running repeat masker.\n" unless $main::quiet; |
|---|
| 2170 |
$w->run($command); |
|---|
| 2171 |
} |
|---|
| 2172 |
} |
|---|
| 2173 |
|
|---|
| 2174 |
|
|---|
| 2175 |
|
|---|
| 2176 |
sub build_the_void { |
|---|
| 2177 |
my $seq_id = shift; |
|---|
| 2178 |
my $out_dir = shift; |
|---|
| 2179 |
|
|---|
| 2180 |
$out_dir =~ s/\/$//; |
|---|
| 2181 |
|
|---|
| 2182 |
my $vid = "theVoid\.$seq_id"; |
|---|
| 2183 |
my $the_void = "$out_dir/$vid"; |
|---|
| 2184 |
File::Path::mkpath ($the_void); |
|---|
| 2185 |
|
|---|
| 2186 |
return $the_void; |
|---|
| 2187 |
} |
|---|
| 2188 |
|
|---|
| 2189 |
|
|---|
| 2190 |
|
|---|
| 2191 |
|
|---|
| 2192 |
sub set_defaults { |
|---|
| 2193 |
my $type = shift || 'all'; |
|---|
| 2194 |
|
|---|
| 2195 |
if ($type !~ /^all$|^opts$|^bopt$|^exe$/) { |
|---|
| 2196 |
warn "WARNING: Invalid type \'$type\' in S_Func ::set_defaults"; |
|---|
| 2197 |
$type = 'all'; |
|---|
| 2198 |
} |
|---|
| 2199 |
|
|---|
| 2200 |
my %CTL_OPT; |
|---|
| 2201 |
|
|---|
| 2202 |
|
|---|
| 2203 |
if ($type eq 'all' || $type eq 'opts') { |
|---|
| 2204 |
$CTL_OPT{'genome'} = ''; |
|---|
| 2205 |
$CTL_OPT{'genome_gff'} = ''; |
|---|
| 2206 |
$CTL_OPT{'est_pass'} = 0; |
|---|
| 2207 |
$CTL_OPT{'altest_pass'} = 0; |
|---|
| 2208 |
$CTL_OPT{'protein_pass'} = 0; |
|---|
| 2209 |
$CTL_OPT{'rm_pass'} = 0; |
|---|
| 2210 |
$CTL_OPT{'model_pass'} = 1; |
|---|
| 2211 |
$CTL_OPT{'pred_pass'} = 0; |
|---|
| 2212 |
$CTL_OPT{'other_pass'} = 0; |
|---|
| 2213 |
$CTL_OPT{'est'} = ''; |
|---|
| 2214 |
$CTL_OPT{'est_reads'} = ''; |
|---|
| 2215 |
$CTL_OPT{'altest'} = ''; |
|---|
| 2216 |
$CTL_OPT{'est_gff'} = ''; |
|---|
| 2217 |
$CTL_OPT{'altest_gff'} = ''; |
|---|
| 2218 |
$CTL_OPT{'protein'} = ''; |
|---|
| 2219 |
$CTL_OPT{'protein_gff'} = ''; |
|---|
| 2220 |
$CTL_OPT{'model_org'} = 'all'; |
|---|
| 2221 |
$CTL_OPT{'repeat_protein'} = Cwd::abs_path("$FindBin::Bin/../data/te_proteins.fasta"); |
|---|
| 2222 |
$CTL_OPT{'rmlib'} = ''; |
|---|
| 2223 |
$CTL_OPT{'rm_gff'} = ''; |
|---|
| 2224 |
$CTL_OPT{'organism_type'} = 'eukaryotic'; |
|---|
| 2225 |
$CTL_OPT{'predictor'} = 'est2genome'; |
|---|
| 2226 |
$CTL_OPT{'predictor'} = 'model_gff' if($main::eva); |
|---|
| 2227 |
$CTL_OPT{'snaphmm'} = ''; |
|---|
| 2228 |
$CTL_OPT{'gmhmm'} = ''; |
|---|
| 2229 |
$CTL_OPT{'augustus_species'} = ''; |
|---|
| 2230 |
$CTL_OPT{'fgenesh_par_file'} = ''; |
|---|
| 2231 |
$CTL_OPT{'model_gff'} = ''; |
|---|
| 2232 |
$CTL_OPT{'pred_gff'} = ''; |
|---|
| 2233 |
$CTL_OPT{'other_gff'} = ''; |
|---|
| 2234 |
$CTL_OPT{'alt_peptide'} = 'C'; |
|---|
| 2235 |
$CTL_OPT{'cpus'} = 1; |
|---|
| 2236 |
$CTL_OPT{'evaluate'} = 0; |
|---|
| 2237 |
$CTL_OPT{'evaluate'} = 1 if($main::eva); |
|---|
| 2238 |
$CTL_OPT{'max_dna_len'} = 100000; |
|---|
| 2239 |
$CTL_OPT{'min_contig'} = 1; |
|---|
| 2240 |
$CTL_OPT{'split_hit'} = 10000; |
|---|
| 2241 |
$CTL_OPT{'softmask'} = 1; |
|---|
| 2242 |
$CTL_OPT{'pred_flank'} = 200; |
|---|
| 2243 |
$CTL_OPT{'single_exon'} = 0; |
|---|
| 2244 |
$CTL_OPT{'single_length'} = 250; |
|---|
| 2245 |
$CTL_OPT{'min_protein'} = 0; |
|---|
| 2246 |
$CTL_OPT{'AED_threshold'} = 1; |
|---|
| 2247 |
$CTL_OPT{'keep_preds'} = 0; |
|---|
| 2248 |
$CTL_OPT{'map_forward'} = 0; |
|---|
| 2249 |
$CTL_OPT{'retry'} = 1; |
|---|
| 2250 |
$CTL_OPT{'clean_try'} = 0; |
|---|
| 2251 |
$CTL_OPT{'TMP'} = ''; |
|---|
| 2252 |
$CTL_OPT{'run'} = ''; |
|---|
| 2253 |
$CTL_OPT{'unmask'} = 0; |
|---|
| 2254 |
$CTL_OPT{'clean_up'} = 0; |
|---|
| 2255 |
|
|---|
| 2256 |
$CTL_OPT{'side_thre'} = 5; |
|---|
| 2257 |
$CTL_OPT{'eva_window_size'} = 70; |
|---|
| 2258 |
$CTL_OPT{'eva_split_hit'} = 1; |
|---|
| 2259 |
$CTL_OPT{'eva_hspmax'} = 100; |
|---|
| 2260 |
$CTL_OPT{'eva_gspmax'} = 100; |
|---|
| 2261 |
$CTL_OPT{'enable_fathom'} = 0; |
|---|
| 2262 |
$CTL_OPT{'enable_fathom'} = 1 if($main::eva); |
|---|
| 2263 |
} |
|---|
| 2264 |
|
|---|
| 2265 |
|
|---|
| 2266 |
if ($type eq 'all' || $type eq 'bopts') { |
|---|
| 2267 |
$CTL_OPT{'blast_type'} = 'wublast'; |
|---|
| 2268 |
$CTL_OPT{'pcov_blastn'} = 0.80; |
|---|
| 2269 |
$CTL_OPT{'pid_blastn'} = 0.85; |
|---|
| 2270 |
$CTL_OPT{'eval_blastn'} = 1e-10; |
|---|
| 2271 |
$CTL_OPT{'bit_blastn'} = 40; |
|---|
| 2272 |
$CTL_OPT{'pcov_blastx'} = 0.50; |
|---|
| 2273 |
$CTL_OPT{'pid_blastx'} = 0.40; |
|---|
| 2274 |
$CTL_OPT{'eval_blastx'} = 1e-6; |
|---|
| 2275 |
$CTL_OPT{'bit_blastx'} = 30; |
|---|
| 2276 |
$CTL_OPT{'pcov_rm_blastx'} = 0.50; |
|---|
| 2277 |
$CTL_OPT{'pid_rm_blastx'} = 0.40; |
|---|
| 2278 |
$CTL_OPT{'eval_rm_blastx'} = 1e-6; |
|---|
| 2279 |
$CTL_OPT{'bit_rm_blastx'} = 30; |
|---|
| 2280 |
$CTL_OPT{'pcov_tblastx'} = 0.80; |
|---|
| 2281 |
$CTL_OPT{'pid_tblastx'} = 0.85; |
|---|
| 2282 |
$CTL_OPT{'eval_tblastx'} = 1e-10; |
|---|
| 2283 |
$CTL_OPT{'bit_tblastx'} = 40; |
|---|
| 2284 |
$CTL_OPT{'en_score_limit'} = 20; |
|---|
| 2285 |
$CTL_OPT{'ep_score_limit'} = 20; |
|---|
| 2286 |
|
|---|
| 2287 |
$CTL_OPT{'eva_pcov_blastn'} = 0.80; |
|---|
| 2288 |
$CTL_OPT{'eva_pid_blastn'} = 0.85; |
|---|
| 2289 |
$CTL_OPT{'eva_eval_blastn'} = 1e-10; |
|---|
| 2290 |
$CTL_OPT{'eva_bit_blastn'} = 40; |
|---|
| 2291 |
} |
|---|
| 2292 |
|
|---|
| 2293 |
|
|---|
| 2294 |
if ($type eq 'all' || $type eq 'exe') { |
|---|
| 2295 |
my @exes = ('xdformat', |
|---|
| 2296 |
'formatdb', |
|---|
| 2297 |
'blastall', |
|---|
| 2298 |
'blastn', |
|---|
| 2299 |
'blastx', |
|---|
| 2300 |
'tblastx', |
|---|
| 2301 |
'RepeatMasker', |
|---|
| 2302 |
'exonerate', |
|---|
| 2303 |
'snap', |
|---|
| 2304 |
'gmhmme3', |
|---|
| 2305 |
'gmhmmp', |
|---|
| 2306 |
'augustus', |
|---|
| 2307 |
'fgenesh', |
|---|
| 2308 |
'twinscan', |
|---|
| 2309 |
'jigsaw', |
|---|
| 2310 |
'qrna', |
|---|
| 2311 |
'fathom', |
|---|
| 2312 |
'probuild' |
|---|
| 2313 |
); |
|---|
| 2314 |
|
|---|
| 2315 |
foreach my $exe (@exes) { |
|---|
| 2316 |
my $loc = `which $exe 2> /dev/null`; |
|---|
| 2317 |
chomp $loc; |
|---|
| 2318 |
if ($loc =~ /^no $exe/) { |
|---|
| 2319 |
$CTL_OPT{$exe} = ''; |
|---|
| 2320 |
} |
|---|
| 2321 |
else { |
|---|
| 2322 |
$CTL_OPT{$exe} = $loc; |
|---|
| 2323 |
} |
|---|
| 2324 |
} |
|---|
| 2325 |
} |
|---|
| 2326 |
|
|---|
| 2327 |
return %CTL_OPT; |
|---|
| 2328 |
} |
|---|
| 2329 |
|
|---|
| 2330 |
|
|---|
| 2331 |
|
|---|
| 2332 |
sub load_control_files { |
|---|
| 2333 |
my @ctlfiles = @{shift @_}; |
|---|
| 2334 |
my %OPT = %{shift @_}; |
|---|
| 2335 |
my $mpi_size = shift @_ || 1; |
|---|
| 2336 |
|
|---|
| 2337 |
|
|---|
| 2338 |
my %CTL_OPT = set_defaults(); |
|---|
| 2339 |
|
|---|
| 2340 |
my $error; |
|---|
| 2341 |
|
|---|
| 2342 |
|
|---|
| 2343 |
foreach my $ctlfile (@ctlfiles) { |
|---|
| 2344 |
open (CTL, "< $ctlfile") or die"ERROR: Could not open control file \"$ctlfile\".\n"; |
|---|
| 2345 |
|
|---|
| 2346 |
while (my $line = <CTL>) { |
|---|
| 2347 |
chomp($line); |
|---|
| 2348 |
|
|---|
| 2349 |
if ($line !~ /^[\ |
|---|
| 2350 |
my $key = $1; |
|---|
| 2351 |
my $value = $2; |
|---|
| 2352 |
|
|---|
| 2353 |
|
|---|
| 2354 |
$value =~ s/^[\s\t]+|[\s\t]+$//g; |
|---|
| 2355 |
|
|---|
| 2356 |
if (exists $CTL_OPT{$key}) { |
|---|
| 2357 |
|
|---|
| 2358 |
if ($value =~ /\$/) { |
|---|
| 2359 |
$value = `echo \"$value\"`; |
|---|
| 2360 |
chomp($value); |
|---|
| 2361 |
} |
|---|
| 2362 |
|
|---|
| 2363 |
|
|---|
| 2364 |
if ($CTL_OPT{$key} =~ /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?$/ && |
|---|
| 2365 |
$value !~ /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?$/ |
|---|
| 2366 |
) { |
|---|
| 2367 |
$error .= "ERROR: valid number required for option \'$key\'\n\n" |
|---|
| 2368 |
} |
|---|
| 2369 |
|
|---|
| 2370 |
|
|---|
| 2371 |
$CTL_OPT{$key} = defined($value) ? $value : ''; |
|---|
| 2372 |
} |
|---|
| 2373 |
else { |
|---|
| 2374 |
warn "WARNING: Invalid option \'$key\' in control file $ctlfile\n\n"; |
|---|
| 2375 |
} |
|---|
| 2376 |
} |
|---|
| 2377 |
} |
|---|
| 2378 |
} |
|---|
| 2379 |
|
|---|
| 2380 |
|
|---|
| 2381 |
$CTL_OPT{genome} = $OPT{genome} if (defined $OPT{genome}); |
|---|
| 2382 |
$CTL_OPT{genome_gff} = $OPT{genome_gff} if (defined $OPT{genome_gff}); |
|---|
| 2383 |
$CTL_OPT{model_gff} = $OPT{model_gff} if (defined $OPT{model_gff}); |
|---|
| 2384 |
$CTL_OPT{force} = $OPT{force} if (defined $OPT{force}); |
|---|
| 2385 |
$CTL_OPT{predictor} = $OPT{predictor} if (defined $OPT{predictor}); |
|---|
| 2386 |
$CTL_OPT{retry} = $OPT{retry} if (defined $OPT{retry}); |
|---|
| 2387 |
$CTL_OPT{cpus} = $OPT{cpus} if (defined $OPT{cpus}); |
|---|
| 2388 |
$CTL_OPT{clean_try} = $OPT{clean_try} if (defined $OPT{clean_try}); |
|---|
| 2389 |
$CTL_OPT{again} = $OPT{again} if (defined $OPT{again}); |
|---|
| 2390 |
|
|---|
| 2391 |
|
|---|
| 2392 |
if ($OPT{R} || $CTL_OPT{organism_type} eq 'prokaryotic') { |
|---|
| 2393 |
$CTL_OPT{model_org} = ''; |
|---|
| 2394 |
$CTL_OPT{repeat_protein} = ''; |
|---|
| 2395 |
$CTL_OPT{rmlib} = ''; |
|---|
| 2396 |
$CTL_OPT{rm_gff} = ''; |
|---|
| 2397 |
$CTL_OPT{rm_pass} = 0; |
|---|
| 2398 |
|
|---|
| 2399 |
print STDERR "INFO: "; |
|---|
| 2400 |
print STDERR "No repeats expected in prokaryotic organisms.\n" |
|---|
| 2401 |
if($CTL_OPT{organism_type} eq 'prokaryotic'); |
|---|
| 2402 |
print STDERR "Now removing all repeat masking options.\n\n"; |
|---|
| 2403 |
} |
|---|
| 2404 |
|
|---|
| 2405 |
|
|---|
| 2406 |
|
|---|
| 2407 |
if ($CTL_OPT{model_org} eq '' && |
|---|
| 2408 |
$CTL_OPT{repeat_protein} eq '' && |
|---|
| 2409 |
$CTL_OPT{rmlib} eq '' && |
|---|
| 2410 |
$CTL_OPT{rm_gff} eq '' && |
|---|
| 2411 |
($CTL_OPT{rm_pass} == 0 || |
|---|
| 2412 |
$CTL_OPT{genome_gff} eq '') |
|---|
| 2413 |
) { |
|---|
| 2414 |
warn "WARNING: There are no masking options set, yet unmask is set to 1.\n". |
|---|
| 2415 |
"This is not valid. The value for unmask will be set to 0.\n\n" |
|---|
| 2416 |
if($CTL_OPT{unmask} == 1); |
|---|
| 2417 |
|
|---|
| 2418 |
$CTL_OPT{unmask} = 0; |
|---|
| 2419 |
$CTL_OPT{_no_mask} = 1; |
|---|
| 2420 |
} |
|---|
| 2421 |
|
|---|
| 2422 |
|
|---|
| 2423 |
$CTL_OPT{predictor} = 'model_gff' if($main::eva); |
|---|
| 2424 |
$CTL_OPT{model_pass} = 1 if($main::eva); |
|---|
| 2425 |
|
|---|
| 2426 |
|
|---|
| 2427 |
$CTL_OPT{predictor} =~ s/\s+//g; |
|---|
| 2428 |
my @predictors = split(',', $CTL_OPT{predictor}); |
|---|
| 2429 |
|
|---|
| 2430 |
my %uniq; |
|---|
| 2431 |
$CTL_OPT{_predictor} = []; |
|---|
| 2432 |
foreach my $p (@predictors) { |
|---|
| 2433 |
if ($p !~ /^snap$|^augustus$|^est2genome$|^protein2genome$|^fgenesh$/ && |
|---|
| 2434 |
$p !~ /^genemark$|^jigsaw$|^model_gff$|^pred_gff$/ |
|---|
| 2435 |
) { |
|---|
| 2436 |
$error .= "ERROR: Invalid predictor defined: $p\n". |
|---|
| 2437 |
"Valid entries are: est2genome, model_gff, pred_gff,\n". |
|---|
| 2438 |
"snap, genemark, augustus, and fgenesh\n\n"; |
|---|
| 2439 |
} |
|---|
| 2440 |
elsif($CTL_OPT{organism_type} eq 'prokaryotic'){ |
|---|
| 2441 |
if($p =~ /^snap$|^augustus$|^fgenesh$|^jigsaw$/){ |
|---|
| 2442 |
warn "WARNING: the predictor $p does not support prokaryotic organisms\n". |
|---|
| 2443 |
"and will be ignored.\n\n"; |
|---|
| 2444 |
} |
|---|
| 2445 |
else{ |
|---|
| 2446 |
push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p} || $p !~ /genemark/); |
|---|
| 2447 |
push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p}); |
|---|
| 2448 |
$uniq{$p}++; |
|---|
| 2449 |
} |
|---|
| 2450 |
} |
|---|
| 2451 |
elsif($CTL_OPT{organism_type} eq 'eukaryotic'){ |
|---|
| 2452 |
push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p}); |
|---|
| 2453 |
if($p =~ /^snap$|^augustus$|^fgenesh$|^genemark$|^jigsaw$/){ |
|---|
| 2454 |
push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p}); |
|---|
| 2455 |
} |
|---|
| 2456 |
$uniq{$p}++; |
|---|
| 2457 |
} |
|---|
| 2458 |
else{ |
|---|
| 2459 |
die "FATAL: Logic error in GI::load_contol_files\n"; |
|---|
| 2460 |
} |
|---|
| 2461 |
} |
|---|
| 2462 |
|
|---|
| 2463 |
|
|---|
| 2464 |
$CTL_OPT{predictor} = join(",", @{$CTL_OPT{_predictor}}); |
|---|
| 2465 |
|
|---|
| 2466 |
|
|---|
| 2467 |
$CTL_OPT{run} =~ s/\s+//g; |
|---|
| 2468 |
my @run = split(',', $CTL_OPT{run}); |
|---|
| 2469 |
|
|---|
| 2470 |
foreach my $p (@run) { |
|---|
| 2471 |
if ($p !~ /^snap$|^augustus$|^fgenesh$|^genemark$|^jigsaw$/) { |
|---|
| 2472 |
$error .= "ERROR: Invalid value defined for run: $p\n". |
|---|
| 2473 |
"Valid entries are: snap, augustus, genemark, or fgenesh\n\n"; |
|---|
| 2474 |
} |
|---|
| 2475 |
else { |
|---|
| 2476 |
push(@{$CTL_OPT{_run}}, $p) unless($uniq{$p}); |
|---|
| 2477 |
$uniq{$p}++; |
|---|
| 2478 |
} |
|---|
| 2479 |
} |
|---|
| 2480 |
|
|---|
| 2481 |
|
|---|
| 2482 |
if ($CTL_OPT{blast_type} !~ /^wublast$|^ncbi$/) { |
|---|
| 2483 |
warn "WARNING: blast_type must be set to \'wublast\' or \'ncbi\'.\n", |
|---|
| 2484 |
"The value $CTL_OPT{blast_type} is invalid.\n", |
|---|
| 2485 |
"This will now be reset to the default 'wublast'.\n\n"; |
|---|
| 2486 |
|
|---|
| 2487 |
$CTL_OPT{blast_type} = 'wublast'; |
|---|
| 2488 |
} |
|---|
| 2489 |
|
|---|
| 2490 |
if ($CTL_OPT{blast_type} =~ /^wublast$/ && |
|---|
| 2491 |
! -e $CTL_OPT{blastn} && |
|---|
| 2492 |
! -e $CTL_OPT{blastx} && |
|---|
| 2493 |
! -e $CTL_OPT{tblastx} && |
|---|
| 2494 |
-e $CTL_OPT{blastall} |
|---|
| 2495 |
) { |
|---|
| 2496 |
warn "WARNING: blast_type is set to \'wublast\' but wublast executables\n", |
|---|
| 2497 |
"can not be located. NCBI blast will be used instead.\n\n"; |
|---|
| 2498 |
|
|---|
| 2499 |
$CTL_OPT{blast_type} = 'ncbi'; |
|---|
| 2500 |
} |
|---|
| 2501 |
|
|---|
| 2502 |
if ($CTL_OPT{blast_type} =~ /^ncbi$/ && |
|---|
| 2503 |
! -e $CTL_OPT{blastall} && |
|---|
| 2504 |
-e $CTL_OPT{blastn} && |
|---|
| 2505 |
-e $CTL_OPT{blastx} && |
|---|
| 2506 |
-e $CTL_OPT{tblastx} |
|---|
| 2507 |
) { |
|---|
| 2508 |
warn "WARNING: blast_type is set to \'ncbi\' but ncbi executables\n", |
|---|
| 2509 |
"can not be located. WUBLAST blast will be used instead.\n\n"; |
|---|
| 2510 |
|
|---|
| 2511 |
$CTL_OPT{blast_type} = 'wublast'; |
|---|
| 2512 |
} |
|---|
| 2513 |
|
|---|
| 2514 |
|
|---|
| 2515 |
if ($CTL_OPT{blast_type} =~ /^wublast$/) { |
|---|
| 2516 |
$CTL_OPT{_formater} = $CTL_OPT{xdformat}; |
|---|
| 2517 |
$CTL_OPT{_blastn} = $CTL_OPT{blastn}; |
|---|
| 2518 |
$CTL_OPT{_blastx} = $CTL_OPT{blastx}; |
|---|
| 2519 |
$CTL_OPT{_tblastx} = $CTL_OPT{tblastx}; |
|---|
| 2520 |
} |
|---|
| 2521 |
elsif ($CTL_OPT{blast_type} =~ /^ncbi$/) { |
|---|
| 2522 |
$CTL_OPT{_formater} = $CTL_OPT{formatdb}; |
|---|
| 2523 |
$CTL_OPT{_blastn} = $CTL_OPT{blastall}; |
|---|
| 2524 |
$CTL_OPT{_blastx} = $CTL_OPT{blastall}; |
|---|
| 2525 |
$CTL_OPT{_tblastx} = $CTL_OPT{blastall}; |
|---|
| 2526 |
} |
|---|
| 2527 |
|
|---|
| 2528 |
|
|---|
| 2529 |
if ($CTL_OPT{organism_type} !~ /^eukaryotic$|^prokaryotic$/) { |
|---|
| 2530 |
$error .= "ERROR: organism_type must be set to \'eukaryotic\' or \'prokaryotic\'.\n". |
|---|
| 2531 |
"The value $CTL_OPT{organism_type} is invalid.\n\n"; |
|---|
| 2532 |
|
|---|
| 2533 |
|
|---|
| 2534 |
$CTL_OPT{organism_type} = 'eukaryotic'; |
|---|
| 2535 |
} |
|---|
| 2536 |
|
|---|
| 2537 |
|
|---|
| 2538 |
|
|---|
| 2539 |
|
|---|
| 2540 |
my @infiles; |
|---|
| 2541 |
|
|---|
| 2542 |
|
|---|
| 2543 |
push (@infiles, '_blastn', '_formater') if($CTL_OPT{est}); |
|---|
| 2544 |
push (@infiles, '_blastn', '_formater') if($CTL_OPT{est_reads}); |
|---|
| 2545 |
push (@infiles, '_blastx', '_formater') if($CTL_OPT{protein}); |
|---|
| 2546 |
push (@infiles, '_blastx', '_formater') if($CTL_OPT{repeat_protein}); |
|---|
| 2547 |
push (@infiles, '_tblastx', '_formater') if($CTL_OPT{altest}); |
|---|
| 2548 |
push (@infiles, 'genome') if($CTL_OPT{genome}); |
|---|
| 2549 |
push (@infiles, 'genome') if(!$CTL_OPT{genome_gff} && !$main::eva); |
|---|
| 2550 |
push (@infiles, 'est') if($CTL_OPT{est}); |
|---|
| 2551 |
push (@infiles, 'protein') if($CTL_OPT{protein}); |
|---|
| 2552 |
push (@infiles, 'altest') if($CTL_OPT{altest}); |
|---|
| 2553 |
push (@infiles, 'est_reads') if($CTL_OPT{est_reads}); |
|---|
| 2554 |
push (@infiles, 'probuild') if (grep (/genemark/, @{$CTL_OPT{_run}})); |
|---|
| 2555 |
push (@infiles, 'fathom') if ($CTL_OPT{enable_fathom} && $CTL_OPT{evaluate}); |
|---|
| 2556 |
push (@infiles, 'est_gff') if($CTL_OPT{est_gff}); |
|---|
| 2557 |
push (@infiles, 'protein_gff') if($CTL_OPT{protein_gff}); |
|---|
| 2558 |
push (@infiles, 'genome_gff') if($CTL_OPT{genome_gff}); |
|---|
| 2559 |
push (@infiles, 'genome_gff') if($main::eva && ! $CTL_OPT{model_gff}); |
|---|
| 2560 |
push (@infiles, 'pred_gff') if($CTL_OPT{pred_gff}); |
|---|
| 2561 |
push (@infiles, 'pred_gff') if (grep (/pred_gff/, $CTL_OPT{predictor})); |
|---|
| 2562 |
push (@infiles, 'model_gff') if ($CTL_OPT{model_gff}); |
|---|
| 2563 |
push (@infiles, 'model_gff') if ($main::eva && ! $CTL_OPT{genome_gff}); |
|---|
| 2564 |
push (@infiles, 'model_gff') if (grep (/model_gff/, $CTL_OPT{predictor}) && |
|---|
| 2565 |
(!$CTL_OPT{genome_gff} || |
|---|
| 2566 |
!$CTL_OPT{model_pass}) |
|---|
| 2567 |
); |
|---|
| 2568 |
if($CTL_OPT{organism_type} eq 'eukaryotic'){ |
|---|
| 2569 |
push (@infiles, 'exonerate') if($CTL_OPT{est}); |
|---|
| 2570 |
push (@infiles, 'exonerate') if($CTL_OPT{protein}); |
|---|
| 2571 |
push (@infiles, 'repeat_protein') if ($CTL_OPT{repeat_protein}); |
|---|
| 2572 |
push (@infiles, 'RepeatMasker') if($CTL_OPT{rmlib}); |
|---|
| 2573 |
push (@infiles, 'RepeatMasker') if($CTL_OPT{model_org}); |
|---|
| 2574 |
push (@infiles, 'rm_gff') if($CTL_OPT{rm_gff}); |
|---|
| 2575 |
push (@infiles, 'rmlib') if ($CTL_OPT{rmlib}); |
|---|
| 2576 |
push (@infiles, 'snap') if (grep (/snap/, @{$CTL_OPT{_run}})); |
|---|
| 2577 |
push (@infiles, 'gmhmme3') if (grep (/genemark/, @{$CTL_OPT{_run}})); |
|---|
| 2578 |
push (@infiles, 'augustus') if (grep (/augustus/, @{$CTL_OPT{_run}})); |
|---|
| 2579 |
push (@infiles, 'fgenesh') if (grep (/fgenesh/, @{$CTL_OPT{_run}})); |
|---|
| 2580 |
push (@infiles, 'twinscan') if (grep (/twinscan/, @{$CTL_OPT{_run}})); |
|---|
| 2581 |
push (@infiles, 'jigsaw') if (grep (/jigsaw/, @{$CTL_OPT{_run}})); |
|---|
| 2582 |
} |
|---|
| 2583 |
elsif($CTL_OPT{organism_type} eq 'prokaryotic'){ |
|---|
| 2584 |
push (@infiles, 'gmhmmp') if (grep (/genemark/, @{$CTL_OPT{_run}})); |
|---|
| 2585 |
} |
|---|
| 2586 |
|
|---|
| 2587 |
|
|---|
| 2588 |
%uniq = (); |
|---|
| 2589 |
foreach my $in (@infiles) { |
|---|
| 2590 |
$uniq{$in}++; |
|---|
| 2591 |
} |
|---|
| 2592 |
@infiles = keys %uniq; |
|---|
| 2593 |
|
|---|
| 2594 |
|
|---|
| 2595 |
foreach my $in (@infiles) { |
|---|
| 2596 |
if (not $CTL_OPT{$in}) { |
|---|
| 2597 |
$error .= "ERROR: You have failed to provide a value for \'$in\' in the control files.\n\n"; |
|---|
| 2598 |
next; |
|---|
| 2599 |
} |
|---|
| 2600 |
elsif (not -e $CTL_OPT{$in}) { |
|---|
| 2601 |
$error .= "ERROR: The \'$in\' file $CTL_OPT{$in} does not exist.\n". |
|---|
| 2602 |
"Please check your control files: maker_opts.ctl, maker_bopts, or maker_exe.ctl.\n\n"; |
|---|
| 2603 |
next; |
|---|
| 2604 |
} |
|---|
| 2605 |
|
|---|
| 2606 |
|
|---|
| 2607 |
|
|---|
| 2608 |
$CTL_OPT{$in} = Cwd::abs_path($CTL_OPT{$in}) unless ($in =~ /^_blastn$|^_blastx$|^_tblastx$/); |
|---|
| 2609 |
} |
|---|
| 2610 |
|
|---|
| 2611 |
|
|---|
| 2612 |
if (grep (/^augustus$/, @infiles) && not $CTL_OPT{augustus_species}) { |
|---|
| 2613 |
warn "WARNING: There is no species specified for Augustus in maker_opts.ctl augustus_species.\n". |
|---|
| 2614 |
"As a result the default (fly) will be used.\n\n"; |
|---|
| 2615 |
$CTL_OPT{augustus_species} = "fly"; |
|---|
| 2616 |
} |
|---|
| 2617 |
if (grep (/^augustus$/, @infiles) && |
|---|
| 2618 |
(! $ENV{AUGUSTUS_CONFIG_PATH} || ! -e "$ENV{AUGUSTUS_CONFIG_PATH}/extrinsic/extrinsic.MPE.cfg") |
|---|
| 2619 |
) { |
|---|
| 2620 |
$error .= "ERROR: The environmental variable AUGUSTUS_CONFIG_PATH has not been set\n". |
|---|
| 2621 |
"or is not set correctly Please set this in your profile per Augustus\n". |
|---|
| 2622 |
"installation instructions then try again.\n\n"; |
|---|
| 2623 |
} |
|---|
| 2624 |
if (grep (/^snaps$|^fathom$/, @infiles) && not $CTL_OPT{snaphmm}) { |
|---|
| 2625 |
warn "WARNING: There is no model specified for for Snap/Fathom in maker_opts.ctl snaphmm.\n". |
|---|
| 2626 |
"As a result, the default (fly) will be used.\n\n"; |
|---|
| 2627 |
$CTL_OPT{snaphmm} = "fly"; |
|---|
| 2628 |
} |
|---|
| 2629 |
if (grep (/^snap$|^fathom$/, @infiles) && ! -e $CTL_OPT{snaphmm} && |
|---|
| 2630 |
(! exists $ENV{ZOE} || ! -e $ENV{ZOE}."/HMM/".$CTL_OPT{snaphmm}) |
|---|
| 2631 |
) { |
|---|
| 2632 |
$error .= "ERROR: The snaphmm specified for Snap/Fathom in maker_opts.ctl does not exist.\n\n"; |
|---|
| 2633 |
} |
|---|
| 2634 |
if (grep (/^gmhmme3$/, @infiles) && not $CTL_OPT{gmhmm}) { |
|---|
| 2635 |
$ error .= "ERROR: There is no model specified for for GeneMark in maker_opts.ctl gmhmm.\n\n"; |
|---|
| 2636 |
} |
|---|
| 2637 |
elsif (grep (/^gmhmme3$/, @infiles) && ! -e $CTL_OPT{gmhmm}) { |
|---|
| 2638 |
$error .= "ERROR: The gmhmm specified for GeneMark in maker_opts.ctl does not exist.\n\n"; |
|---|
| 2639 |
} |
|---|
| 2640 |
if (grep (/^fgenesh$/, @infiles)) { |
|---|
| 2641 |
if (! $CTL_OPT{fgenesh_par_file}) { |
|---|
| 2642 |
$error .= "ERROR: There is no parameter file secified for for FgenesH in\n". |
|---|
| 2643 |
"maker_opts.ctl fgenesh_par_file.\n\n"; |
|---|
| 2644 |
} |
|---|
| 2645 |
elsif (! -e $CTL_OPT{fgenesh_par_file}) { |
|---|
| 2646 |
$error .= "ERROR: The parameter file specified for fgenesh in maker_opts.ctl does not exist.\n\n"; |
|---|
| 2647 |
} |
|---|
| 2648 |
} |
|---|
| 2649 |
if ($CTL_OPT{max_dna_len} < 50000) { |
|---|
| 2650 |
warn "WARNING: \'max_dna_len\' is set too low. The minimum value permited is 50,000.\n". |
|---|
| 2651 |
"max_dna_len will be reset to 50,000\n\n"; |
|---|
| 2652 |
$CTL_OPT{max_dna_len} = 50000; |
|---|
| 2653 |
} |
|---|
| 2654 |
if ($CTL_OPT{split_hit} > 0 && $CTL_OPT{organism_type} eq 'prokaryotic') { |
|---|
| 2655 |
warn "WARNING: \'split_hit\' is meaningless for prokaryotic genomes and will be set to 0.\n\n"; |
|---|
| 2656 |
$CTL_OPT{split_hit} = 0; |
|---|
| 2657 |
} |
|---|
| 2658 |
if ($CTL_OPT{single_exon} == 0 && $CTL_OPT{organism_type} eq 'prokaryotic') { |
|---|
| 2659 |
warn "WARNING: \'single_exon\' is required for prokaryotic genomes and will be set to 1.\n\n"; |
|---|
| 2660 |
$CTL_OPT{single_exon} = 1; |
|---|
| 2661 |
} |
|---|
| 2662 |
if ($CTL_OPT{min_contig} <= 0) { |
|---|
| 2663 |
warn "WARNING: \'min_contig\' must be set to 1 or higher.\n". |
|---|
| 2664 |
"min_contig will be reset to 1\n\n"; |
|---|
| 2665 |
$CTL_OPT{min_contig} = 1; |
|---|
| 2666 |
} |
|---|
| 2667 |
if ($CTL_OPT{min_contig} < 0) { |
|---|
| 2668 |
warn "WARNING: \'min_protein\' must be set to 0 or higher.\n". |
|---|
| 2669 |
"min_protein will be reset to 0\n\n"; |
|---|
| 2670 |
$CTL_OPT{min_protein} = 0; |
|---|
| 2671 |
} |
|---|
| 2672 |
if ($CTL_OPT{AED_threshold} < 0 || $CTL_OPT{AED_threshold} > 1) { |
|---|
| 2673 |
warn "WARNING: \'AED_threshold\' must be set to a value betweeb 0 and 1.\n". |
|---|
| 2674 |
"AED_threshold will be reset to 1\n\n"; |
|---|
| 2675 |
$CTL_OPT{AED_threshold} = 1; |
|---|
| 2676 |
} |
|---|
| 2677 |
if ($CTL_OPT{retry} < 0) { |
|---|
| 2678 |
warn "WARNING: \'retry\' must be set to 0 or greater.\n". |
|---|
| 2679 |
"It will now be set to 0\n\n"; |
|---|
| 2680 |
$CTL_OPT{retry} = 0; |
|---|
| 2681 |
} |
|---|
| 2682 |
if($CTL_OPT{TMP} && ! -d $CTL_OPT{TMP}){ |
|---|
| 2683 |
$error .= "The TMP value \'$CTL_OPT{TMP}\' is not a directory or does not exist.\n\n"; |
|---|
| 2684 |
} |
|---|
| 2685 |
if($main::eva && $CTL_OPT{genome_gff} && $CTL_OPT{model_gff}){ |
|---|
| 2686 |
$error .= "You can only specify a GFF3 file for genome_gff or model_gff no both!!\n\n"; |
|---|
| 2687 |
} |
|---|
| 2688 |
|
|---|
| 2689 |
die $error if ($error); |
|---|
| 2690 |
|
|---|
| 2691 |
|
|---|
| 2692 |
my $fasta_gff = ($CTL_OPT{genome_gff}) ? $CTL_OPT{genome_gff} : $CTL_OPT{model_gff}; |
|---|
| 2693 |
my $iterator = new Iterator::Any( -fasta => $CTL_OPT{genome}, |
|---|
| 2694 |
-gff => $fasta_gff |
|---|
| 2695 |
); |
|---|
| 2696 |
|
|---|
| 2697 |
if ($iterator->number_of_entries() == 0) { |
|---|
| 2698 |
my $genome = (! $CTL_OPT{genome}) ? $fasta_gff : $CTL_OPT{genome}; |
|---|
| 2699 |
die "ERROR: The file $genome contains no fasta entries\n\n"; |
|---|
| 2700 |
} |
|---|
| 2701 |
|
|---|
| 2702 |
|
|---|
| 2703 |
if ($iterator->number_of_entries() > 1000 && ! $CTL_OPT{datastore}) { |
|---|
| 2704 |
warn "WARNING: There are more than 1000 fasta entries in the input file.\n". |
|---|
| 2705 |
"A two depth datastore will be used to avoid overloading the data structure of\n". |
|---|
| 2706 |
"the output directory.\n\n"; |
|---|
| 2707 |
|
|---|
| 2708 |
$CTL_OPT{datastore} = 1; |
|---|
| 2709 |
} |
|---|
| 2710 |
|
|---|
| 2711 |
|
|---|
| 2712 |
my @gffs = grep(/\_gff$/, @{[keys %CTL_OPT]}); |
|---|
| 2713 |
foreach my $key (@gffs) { |
|---|
| 2714 |
if ($CTL_OPT{$key}) { |
|---|
| 2715 |
$CTL_OPT{go_gffdb} = 1; |
|---|
| 2716 |
last; |
|---|
| 2717 |
} |
|---|
| 2718 |
} |
|---|
| 2719 |
|
|---|
| 2720 |
|
|---|
| 2721 |
$CTL_OPT{alt_peptide} = uc($CTL_OPT{alt_peptide}); |
|---|
| 2722 |
if ($CTL_OPT{alt_peptide} !~ /^[ABCDEFGHIKLMNPQRSTVWXYZ]$/) { |
|---|
| 2723 |
warn "WARNING: Invalid alternate peptide \'$CTL_OPT{alt_peptide}\'.\n", |
|---|
| 2724 |
"This will be set to the default 'C'.\n\n"; |
|---|
| 2725 |
$CTL_OPT{alt_peptide} = 'C'; |
|---|
| 2726 |
} |
|---|
| 2727 |
|
|---|
| 2728 |
|
|---|
| 2729 |
my $genome = $CTL_OPT{genome}; |
|---|
| 2730 |
$genome = $CTL_OPT{genome_gff} if (not $genome); |
|---|
| 2731 |
($CTL_OPT{out_name}) = $genome =~ /([^\/]+)$/; |
|---|
| 2732 |
$CTL_OPT{out_name} =~ s/\.[^\.]+$//; |
|---|
| 2733 |
$CTL_OPT{CWD} = Cwd::cwd(); |
|---|
| 2734 |
if(! $CTL_OPT{out_base}){ |
|---|
| 2735 |
$CTL_OPT{out_base} = $CTL_OPT{CWD}."/$CTL_OPT{out_name}.maker.output"; |
|---|
| 2736 |
} |
|---|
| 2737 |
mkdir($CTL_OPT{out_base}) if(! -d $CTL_OPT{out_base}); |
|---|
| 2738 |
die "ERROR: Could not build output directory $CTL_OPT{out_base}\n" |
|---|
| 2739 |
if(! -d $CTL_OPT{out_base}); |
|---|
| 2740 |
|
|---|
| 2741 |
|
|---|
| 2742 |
my $check; |
|---|
| 2743 |
if(-e $CTL_OPT{out_base}."/.maker_lock.NFSLock"){ |
|---|
| 2744 |
warn "WARNING: Lock found, maker may already be running.\n". |
|---|
| 2745 |
"Checking for other instance.\n\n"; |
|---|
| 2746 |
$check = 1; |
|---|
| 2747 |
} |
|---|
| 2748 |
|
|---|
| 2749 |
|
|---|
| 2750 |
if($LOCK = new File::NFSLock($CTL_OPT{out_base}."/.maker_lock", 'EX', 40, 40)){ |
|---|
| 2751 |
warn "Everything seems ok!!\n\n" if($check); |
|---|
| 2752 |
|
|---|
| 2753 |
$LOCK->maintain(30); |
|---|
| 2754 |
} |
|---|
| 2755 |
else{ |
|---|
| 2756 |
die "ERROR: Another instance of maker is already running for this dataset.\n\n"; |
|---|
| 2757 |
} |
|---|
| 2758 |
|
|---|
| 2759 |
|
|---|
| 2760 |
|
|---|
| 2761 |
if($CTL_OPT{TMP}){ |
|---|
| 2762 |
$CTL_OPT{_TMP} = tempdir("maker_XXXXXX", CLEANUP => 1, DIR => $CTL_OPT{TMP}); |
|---|
| 2763 |
} |
|---|
| 2764 |
else{ |
|---|
| 2765 |
$CTL_OPT{_TMP} = $TMP; |
|---|
| 2766 |
} |
|---|
| 2767 |
|
|---|
| 2768 |
set_global_temp($CTL_OPT{_TMP}); |
|---|
| 2769 |
|
|---|
| 2770 |
|
|---|
| 2771 |
create_blastdb(\%CTL_OPT, $mpi_size); |
|---|
| 2772 |
build_all_indexes(\%CTL_OPT); |
|---|
| 2773 |
|
|---|
| 2774 |
|
|---|
| 2775 |
generate_control_files($CTL_OPT{out_base}, %CTL_OPT); |
|---|
| 2776 |
|
|---|
| 2777 |
return %CTL_OPT; |
|---|
| 2778 |
} |
|---|
| 2779 |
|
|---|
| 2780 |
|
|---|
| 2781 |
sub generate_control_files { |
|---|
| 2782 |
my $dir = shift || Cwd::cwd(); |
|---|
| 2783 |
my %O = (@_) ? @_ : set_defaults(); |
|---|
| 2784 |
my $ev = 1 if($main::eva); |
|---|
| 2785 |
my $log = 1 if(@_); |
|---|
| 2786 |
|
|---|
| 2787 |
|
|---|
| 2788 |
open (OUT, "> $dir/eval_opts.ctl") if($ev); |
|---|
| 2789 |
open (OUT, "> $dir/maker_opts.log") if(!$ev && $log); |
|---|
| 2790 |
open (OUT, "> $dir/maker_opts.ctl") if(!$ev && !$log); |
|---|
| 2791 |
print OUT "#-----Genome (Required for De-Novo Annotation)\n" if(!$ev); |
|---|
| 2792 |
print OUT "#-----Genome (Required if not internal to GFF3 file)\n" if($ev); |
|---|
| 2793 |
print OUT "genome:$O{genome} #genome sequence file in fasta format\n"; |
|---|
| 2794 |
print OUT "\n"; |
|---|
| 2795 |
print OUT "#-----Re-annotation Options (Only Maker derived GFF3)\n" if(!$ev); |
|---|
| 2796 |
print OUT "#-----Maker Derived GFF3 Annotations to Evaluate (genome fasta is internal to GFF3)\n" if($ev); |
|---|
| 2797 |
print OUT "genome_gff:$O{genome_gff} #re-annotate genome based on this gff3 file\n" if(!$ev); |
|---|
| 2798 |
print OUT "genome_gff:$O{genome_gff} #Maker derived gff3 file\n" if($ev); |
|---|
| 2799 |
print OUT "est_pass:$O{est_pass} #use ests in genome_gff: 1 = yes, 0 = no\n"; |
|---|
| 2800 |
print OUT "altest_pass:$O{altest_pass} #use alternate organism ests in genome_gff: 1 = yes, 0 = no\n"; |
|---|
| 2801 |
print OUT "protein_pass:$O{protein_pass} #use proteins in genome_gff: 1 = yes, 0 = no\n"; |
|---|
| 2802 |
print OUT "rm_pass:$O{rm_pass} #use repeats in genome_gff: 1 = yes, 0 = no\n"; |
|---|
| 2803 |
print OUT "model_pass:$O{model_pass} #use gene models in genome_gff: 1 = yes, 0 = no\n" if(!$ev); |
|---|
| 2804 |
print OUT "pred_pass:$O{pred_pass} #use ab-initio predictions in genome_gff: 1 = yes, 0 = no\n"; |
|---|
| 2805 |
print OUT "other_pass:$O{other_pass} #passthrough everything else in genome_gff: 1 = yes, 0 = no\n" if(!$ev); |
|---|
| 2806 |
print OUT "\n"; |
|---|
| 2807 |
print OUT "#-----External GFF3 Annotations to Evaluate\n" if($ev); |
|---|
| 2808 |
print OUT "model_gff:$O{model_gff} #gene models from an external gff3 file\n" if($ev); |
|---|
| 2809 |
print OUT "\n"if($ev); |
|---|
| 2810 |
print OUT "#-----EST Evidence (you should provide a value for at least one)\n"; |
|---|
| 2811 |
print OUT "est:$O{est} #non-redundant set of assembled ESTs in fasta format (classic EST analysis)\n"; |
|---|
| 2812 |
print OUT "est_reads:$O{est_reads} #unassembled nextgen mRNASeq in fasta format (not fully implemented)\n"; |
|---|
| 2813 |
print OUT "altest:$O{altest} #EST/cDNA sequence file in fasta format from an alternate organism\n"; |
|---|
| 2814 |
print OUT "est_gff:$O{est_gff} #EST evidence from an external gff3 file\n"; |
|---|
| 2815 |
print OUT "altest_gff:$O{altest_gff} #Alternate organism EST evidence from a seperate gff3 file\n"; |
|---|
| 2816 |
print OUT "\n"; |
|---|
| 2817 |
print OUT "#-----Protein Homology Evidence (you should provide a value for at least one)\n"; |
|---|
| 2818 |
print OUT "protein:$O{protein} #protein sequence file in fasta format\n"; |
|---|
| 2819 |
print OUT "protein_gff:$O{protein_gff} #protein homology evidence from an external gff3 file\n"; |
|---|
| 2820 |
print OUT "\n"; |
|---|
| 2821 |
print OUT "#-----Repeat Masking (leave values blank to skip)\n"; |
|---|
| 2822 |
print OUT "model_org:$O{model_org} #model organism for RepBase masking in RepeatMasker\n"; |
|---|
| 2823 |
print OUT "repeat_protein:$O{repeat_protein} #a database of transposable element proteins in fasta format\n"; |
|---|
| 2824 |
print OUT "rmlib:$O{rmlib} #an organism specific repeat library in fasta format\n"; |
|---|
| 2825 |
print OUT "rm_gff:$O{rm_gff} #repeat elements from an external gff3 file\n"; |
|---|
| 2826 |
print OUT "\n"; |
|---|
| 2827 |
print OUT "#-----Gene Prediction Options\n" if(!$ev); |
|---|
| 2828 |
print OUT "#-----Evaluator Ab-Initio Comparison Options\n" if($ev); |
|---|
| 2829 |
print OUT "organism_type:$O{organism_type} #eukaryotic or prokaryotic. Default is eukaryotic\n"; |
|---|
| 2830 |
print OUT "run:$O{run} #ab-initio methods to run (seperate multiple values by ',')\n" if($ev); |
|---|
| 2831 |
print OUT "predictor:$O{predictor} #prediction methods for annotations (seperate multiple values by ',')\n" if(!$ev); |
|---|
| 2832 |
print OUT "unmask:$O{unmask} #Also run ab-initio methods on unmasked sequence, 1 = yes, 0 = no\n"; |
|---|
| 2833 |
print OUT "snaphmm:$O{snaphmm} #SNAP HMM model\n"; |
|---|
| 2834 |
print OUT "gmhmm:$O{gmhmm} #GeneMark HMM model\n"; |
|---|
| 2835 |
print OUT "augustus_species:$O{augustus_species} #Augustus gene prediction model\n"; |
|---|
| 2836 |
print OUT "fgenesh_par_file:$O{fgenesh_par_file} #Fgenesh parameter file\n"; |
|---|
| 2837 |
print OUT "model_gff:$O{model_gff} #gene models from an external gff3 file (annotation pass-through)\n" if(!$ev); |
|---|
| 2838 |
print OUT "pred_gff:$O{pred_gff} #ab-initio predictions from an external gff3 file\n"; |
|---|
| 2839 |
print OUT "\n"; |
|---|
| 2840 |
print OUT "#-----Other Annotation Type Options (features maker doesn't recognize)\n" if(!$ev); |
|---|
| 2841 |
print OUT "other_gff:$O{other_gff} #features to pass-through to final output from an extenal gff3 file\n" if(!$ev); |
|---|
| 2842 |
print OUT "\n" if(!$ev); |
|---|
| 2843 |
print OUT "#-----External Application Specific Options\n"; |
|---|
| 2844 |
print OUT "alt_peptide:$O{alt_peptide} #amino acid used to replace non standard amino acids in blast databases\n"; |
|---|
| 2845 |
print OUT "cpus:$O{cpus} #max number of cpus to use in BLAST and RepeatMasker\n"; |
|---|
| 2846 |
print OUT "\n"; |
|---|
| 2847 |
print OUT "#-----Maker Specific Options\n"; |
|---|
| 2848 |
print OUT "evaluate:$O{evaluate} #run Evaluator on all annotations, 1 = yes, 0 = no\n" if(!$ev); |
|---|
| 2849 |
print OUT "max_dna_len:$O{max_dna_len} #length for dividing up contigs into chunks (larger values increase memory usage)\n"; |
|---|
| 2850 |
print OUT "min_contig:$O{min_contig} #all contigs from the input genome file below this size will be skipped\n"; |
|---|
| 2851 |
print OUT "min_protein:$O{min_protein} #all gene annotations must produce a protein of at least this many amino acids in length\n" if(!$ev); |
|---|
| 2852 |
print OUT "AED_threshold:$O{AED_threshold} #Maximum Annotation Edit Distance allowed for annotations (bound by 0 and 1)\n" if(!$ev); |
|---|
| 2853 |
print OUT "softmask:$O{softmask} #use soft-masked rather than hard-masked seg filtering for wublast\n"; |
|---|
| 2854 |
print OUT "split_hit:$O{split_hit} #length for the splitting of hits (expected max intron size for evidence alignments)\n"; |
|---|
| 2855 |
print OUT "pred_flank:$O{pred_flank} #length of sequence surrounding EST and protein evidence used to extend gene predictions\n"; |
|---|
| 2856 |
print OUT "single_exon:$O{single_exon} #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no\n"; |
|---|
| 2857 |
print OUT "single_length:$O{single_length} #min length required for single exon ESTs if \'single_exon\ is enabled'\n"; |
|---|
| 2858 |
print OUT "keep_preds:$O{keep_preds} #Add non-overlapping ab-inito gene prediction to final annotation set, 1 = yes, 0 = no\n" if(!$ev); |
|---|
| 2859 |
print OUT "map_forward:$O{map_forward} #try to map names and attributes forward from gff3 annotations, 1 = yes, 0 = no\n" if(!$ev); |
|---|
| 2860 |
print OUT "retry:$O{retry} #number of times to retry a contig if there is a failure for some reason\n"; |
|---|
| 2861 |
print OUT "clean_try:$O{clean_try} #removeall data from previous run before retrying, 1 = yes, 0 = no\n"; |
|---|
| 2862 |
print OUT "clean_up:$O{clean_up} #removes theVoid directory with individual analysis files, 1 = yes, 0 = no\n"; |
|---|
| 2863 |
print OUT "TMP:$O{TMP} #specify a directory other than the system default temporary directory for temporary files\n"; |
|---|
| 2864 |
print OUT "\n"; |
|---|
| 2865 |
print OUT "#-----EVALUATOR Control Options\n"; |
|---|
| 2866 |
print OUT "side_thre:$O{side_thre}\n"; |
|---|
| 2867 |
print OUT "eva_window_size:$O{eva_window_size}\n"; |
|---|
| 2868 |
print OUT "eva_split_hit:$O{eva_split_hit}\n"; |
|---|
| 2869 |
print OUT "eva_hspmax:$O{eva_hspmax}\n"; |
|---|
| 2870 |
print OUT "eva_gspmax:$O{eva_gspmax}\n"; |
|---|
| 2871 |
print OUT "enable_fathom:$O{enable_fathom}\n"; |
|---|
| 2872 |
close (OUT); |
|---|
| 2873 |
|
|---|
| 2874 |
|
|---|
| 2875 |
open (OUT, "> $dir/eval_bopts.ctl") if($ev); |
|---|
| 2876 |
open (OUT, "> $dir/maker_bopts.log") if(!$ev && $log); |
|---|
| 2877 |
open (OUT, "> $dir/maker_bopts.ctl") if(!$ev && !$log); |
|---|
| 2878 |
print OUT "#-----BLAST and Exonerate Statistics Thresholds\n"; |
|---|
| 2879 |
print OUT "blast_type:$O{blast_type} #set to 'wublast' or 'ncbi'\n"; |
|---|
| 2880 |
print OUT "\n"; |
|---|
| 2881 |
print OUT "pcov_blastn:$O{pcov_blastn} #Blastn Percent Coverage Threhold EST-Genome Alignments\n"; |
|---|
| 2882 |
print OUT "pid_blastn:$O{pid_blastn} #Blastn Percent Identity Threshold EST-Genome Aligments\n"; |
|---|
| 2883 |
print OUT "eval_blastn:$O{eval_blastn} #Blastn eval cutoff\n"; |
|---|
| 2884 |
print OUT "bit_blastn:$O{bit_blastn} #Blastn bit cutoff\n"; |
|---|
| 2885 |
print OUT "\n"; |
|---|
| 2886 |
print OUT "pcov_blastx:$O{pcov_blastx} #Blastx Percent Coverage Threhold Protein-Genome Alignments\n"; |
|---|
| 2887 |
print OUT "pid_blastx:$O{pid_blastx} #Blastx Percent Identity Threshold Protein-Genome Aligments\n"; |
|---|
| 2888 |
print OUT "eval_blastx:$O{eval_blastx} #Blastx eval cutoff\n"; |
|---|
| 2889 |
print OUT "bit_blastx:$O{bit_blastx} #Blastx bit cutoff\n"; |
|---|
| 2890 |
print OUT "\n"; |
|---|
| 2891 |
print OUT "pcov_rm_blastx:$O{pcov_rm_blastx} #Blastx Percent Coverage Threhold For Transposable Element Masking\n"; |
|---|
| 2892 |
print OUT "pid_rm_blastx:$O{pid_rm_blastx} #Blastx Percent Identity Threshold For Transposbale Element Masking\n"; |
|---|
| 2893 |
print OUT "eval_rm_blastx:$O{eval_rm_blastx} #Blastx eval cutoff for transposable element masking\n"; |
|---|
| 2894 |
print OUT "bit_rm_blastx:$O{bit_rm_blastx} #Blastx bit cutoff for transposable element masking\n"; |
|---|
| 2895 |
print OUT "\n"; |
|---|
| 2896 |
print OUT "pcov_tblastx:$O{pcov_tblastx} #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments\n"; |
|---|
| 2897 |
print OUT "pid_tblastx:$O{pid_tblastx} #tBlastx Percent Identity Threshold alt-EST-Genome Aligments\n"; |
|---|
| 2898 |
print OUT "eval_tblastx:$O{eval_tblastx} #tBlastx eval cutoff\n"; |
|---|
| 2899 |
print OUT "bit_tblastx:$O{bit_tblastx} #tBlastx bit cutoff\n"; |
|---|
| 2900 |
print OUT "\n"; |
|---|
| 2901 |
print OUT "eva_pcov_blastn:$O{eva_pcov_blastn} #Evaluator Blastn Percent Coverage Threshold EST-Genome Alignments\n"; |
|---|
| 2902 |
print OUT "eva_pid_blastn:$O{eva_pid_blastn} #Evaluator Blastn Percent Identity Threshold EST-Genome Alignments\n"; |
|---|
| 2903 |
print OUT "eva_eval_blastn:$O{eva_eval_blastn} #Evaluator Blastn eval cutoff\n"; |
|---|
| 2904 |
print OUT "eva_bit_blastn:$O{eva_bit_blastn} #Evaluator Blastn bit cutoff\n"; |
|---|
| 2905 |
print OUT "\n"; |
|---|
| 2906 |
print OUT "ep_score_limit:$O{ep_score_limit} #exonerate protein percent of maximal score threshold\n"; |
|---|
| 2907 |
print OUT "en_score_limit:$O{en_score_limit} #exonerate nucleotide percent of maximal score threshold\n"; |
|---|
| 2908 |
close(OUT); |
|---|
| 2909 |
|
|---|
| 2910 |
|
|---|
| 2911 |
open (OUT, "> $dir/eval_exe.ctl") if($ev); |
|---|
| 2912 |
open (OUT, "> $dir/maker_exe.log") if(!$ev && $log); |
|---|
| 2913 |
open (OUT, "> $dir/maker_exe.ctl") if(!$ev && !$log); |
|---|
| 2914 |
print OUT "#-----Location of Executables Used by Maker/Evaluator\n"; |
|---|
| 2915 |
print OUT "formatdb:$O{formatdb} #location of NCBI formatdb executable\n"; |
|---|
| 2916 |
print OUT "blastall:$O{blastall} #location of NCBI blastall executable\n"; |
|---|
| 2917 |
print OUT "xdformat:$O{xdformat} #location of WUBLAST xdformat executable\n"; |
|---|
| 2918 |
print OUT "blastn:$O{blastn} #location of WUBLAST blastn executable\n"; |
|---|
| 2919 |
print OUT "blastx:$O{blastx} #location of WUBLAST blastx executable\n"; |
|---|
| 2920 |
print OUT "tblastx:$O{tblastx} #location of WUBLAST tblastx executable\n"; |
|---|
| 2921 |
print OUT "RepeatMasker:$O{RepeatMasker} #location of RepeatMasker executable\n"; |
|---|
| 2922 |
print OUT "exonerate:$O{exonerate} #location of exonerate executable\n"; |
|---|
| 2923 |
print OUT "\n"; |
|---|
| 2924 |
print OUT "#-----Ab-initio Gene Prediction Algorithms\n"; |
|---|
| 2925 |
print OUT "snap:$O{snap} #location of snap executable\n"; |
|---|
| 2926 |
print OUT "gmhmme3:$O{gmhmme3} #location of eukaryotic genemark executable\n"; |
|---|
| 2927 |
print OUT "gmhmmp:$O{gmhmmp} #location of prokaryotic genemark executable\n"; |
|---|
| 2928 |
print OUT "augustus:$O{augustus} #location of augustus executable\n"; |
|---|
| 2929 |
print OUT "fgenesh:$O{fgenesh} #location of fgenesh executable\n"; |
|---|
| 2930 |
print OUT "twinscan:$O{twinscan} #location of twinscan executable (not yet implemented)\n"; |
|---|
| 2931 |
print OUT "\n"; |
|---|
| 2932 |
print OUT "#-----Other Algorithms\n"; |
|---|
| 2933 |
print OUT "jigsaw:$O{jigsaw} #location of jigsaw executable (not yet implemented)\n"; |
|---|
| 2934 |
print OUT "qrna:$O{qrna} #location of qrna executable (not yet implemented)\n"; |
|---|
| 2935 |
print OUT "fathom:$O{fathom} #location of fathom executable (not yet implemented)\n"; |
|---|
| 2936 |
print OUT "probuild:$O{probuild} #location of probuild executable (required for genemark)\n"; |
|---|
| 2937 |
close(OUT); |
|---|
| 2938 |
} |
|---|
| 2939 |
|
|---|
| 2940 |
1; |
|---|