| 137 | | |
|---|
| 138 | | #--Process arguments and the command line |
|---|
| 139 | | GetOptions("RM_off|R" => \$OPT{R}, |
|---|
| 140 | | "force|f" => \$OPT{force}, |
|---|
| 141 | | "genome|g=s" => \$OPT{genome}, |
|---|
| 142 | | "cpus|c=i" => \$OPT{cpus}, |
|---|
| 143 | | "predictor=s" =>\$OPT{predictor}, |
|---|
| 144 | | "retry=i" =>\$OPT{retry}, |
|---|
| 145 | | "clean_try" =>\$OPT{clean_try}, |
|---|
| 146 | | "evaluate" =>\$OPT{evaluate}, |
|---|
| 147 | | "quiet" => \$main::quiet, |
|---|
| 148 | | "CTL" => sub {GI::generate_control_files(); exit(0);}, |
|---|
| 149 | | "help|?" => sub {print $usage; exit(0)} |
|---|
| 150 | | ); |
|---|
| 151 | | |
|---|
| 152 | | #------------------------------------------------------------------------------# |
|---|
| 153 | | #------------------------------------ MAIN ------------------------------------# |
|---|
| 154 | | #------------------------------------------------------------------------------# |
|---|
| 155 | | |
|---|
| 156 | | #---get arguments off the command line |
|---|
| 157 | | my @ctlfiles = @ARGV; |
|---|
| 158 | | |
|---|
| 159 | | if (not @ctlfiles) { |
|---|
| 160 | | if (-e "maker_opts.ctl" && |
|---|
| 161 | | -e "maker_bopts.ctl" && |
|---|
| 162 | | -e "maker_exe.ctl" && |
|---|
| 163 | | -e "evaluator.ctl" |
|---|
| 164 | | ) { |
|---|
| 165 | | |
|---|
| 166 | | @ctlfiles = ("maker_opts.ctl", |
|---|
| 167 | | "maker_bopts.ctl", |
|---|
| 168 | | "maker_exe.ctl", |
|---|
| 169 | | "evaluator.ctl" |
|---|
| 170 | | ); |
|---|
| 171 | | } |
|---|
| 172 | | else { |
|---|
| 173 | | print STDERR "ERROR: Control files not found\n"; |
|---|
| 174 | | print $usage; |
|---|
| 175 | | exit(0); |
|---|
| 176 | | } |
|---|
| 177 | | } |
|---|
| 178 | | |
|---|
| 179 | | #---set up control options from control files |
|---|
| 180 | | my %CTL_OPT = GI::load_control_files(\@ctlfiles, \%OPT); |
|---|
| 181 | | |
|---|
| 182 | | #--open datastructure controller |
|---|
| 183 | | my $DS_CTL = ds_utility->new(\%CTL_OPT); |
|---|
| 184 | | |
|---|
| 185 | | #--set up gff database |
|---|
| 186 | | my $GFF_DB = new GFFDB(\%CTL_OPT); |
|---|
| 187 | | my $build = $GFF_DB->next_build; |
|---|
| 188 | | |
|---|
| 189 | | #---load genome multifasta/GFF3 file |
|---|
| 190 | | my $iterator = new Iterator::Any( -fasta => $CTL_OPT{'genome'}, |
|---|
| 191 | | -gff => $CTL_OPT{'genome_gff'}, |
|---|
| 192 | | ); |
|---|
| 193 | | |
|---|
| 194 | | #---iterate over each sequence in the fasta |
|---|
| 195 | | while (my $fasta = $iterator->nextFasta()) { |
|---|
| 196 | | $LOG = undef; #reset global variable LOG with each contig |
|---|
| 197 | | |
|---|
| 198 | | my $fasta = Fasta::ucFasta(\$fasta); |
|---|
| 199 | | |
|---|
| 200 | | #get fasta parts |
|---|
| 201 | | my $q_def = Fasta::getDef(\$fasta); #Get fasta header |
|---|
| 202 | | my $q_seq_ref = Fasta::getSeqRef(\$fasta); #Get reference to fasta sequence |
|---|
| 203 | | my $seq_id = Fasta::def2SeqID($q_def); #Get sequence identifier |
|---|
| 204 | | my $safe_seq_id = Fasta::seqID2SafeID($seq_id); #Get safe named identifier |
|---|
| 205 | | |
|---|
| 206 | | #set up base and void directories for output |
|---|
| 207 | | my ($out_dir, $the_void) = $DS_CTL->seq_dirs($seq_id); |
|---|
| 208 | | |
|---|
| 209 | | #-build and proccess the run log |
|---|
| 210 | | $LOG = runlog->new( \%CTL_OPT, |
|---|
| 211 | | { 'seq_id' => $seq_id, |
|---|
| 212 | | 'seq_length' => length($$q_seq_ref), |
|---|
| 213 | | 'out_dir' => $out_dir, |
|---|
| 214 | | 'the_void' => $the_void, |
|---|
| 215 | | 'fasta_ref' => \$fasta |
|---|
| 216 | | }, |
|---|
| 217 | | "$the_void/run.log" |
|---|
| 218 | | ); |
|---|
| 219 | | |
|---|
| 220 | | my ($c_flag, $message) = $LOG->get_continue_flag(); |
|---|
| 221 | | $DS_CTL->add_entry($seq_id, $out_dir, $message); |
|---|
| 222 | | |
|---|
| 223 | | next if(! $c_flag || $c_flag <= 0); |
|---|
| 224 | | |
|---|
| 225 | | #==from here on fastas are proccessed as chunks |
|---|
| 226 | | |
|---|
| 227 | | #-set up variables that are heldover from last chunk |
|---|
| 228 | | my $holdover_blastn; |
|---|
| 229 | | my $holdover_blastx; |
|---|
| 230 | | my $holdover_tblastx; |
|---|
| 231 | | my $holdover_pred; |
|---|
| 232 | | my $holdover_est_gff; |
|---|
| 233 | | my $holdover_altest_gff; |
|---|
| 234 | | my $holdover_prot_gff; |
|---|
| 235 | | my $holdover_pred_gff; |
|---|
| 236 | | my $holdover_model_gff; |
|---|
| 237 | | |
|---|
| 238 | | #-set up variables that are the result of chunk accumulation |
|---|
| 239 | | my $masked_total_seq; |
|---|
| 240 | | my %p_fastas; |
|---|
| 241 | | my %t_fastas; |
|---|
| 242 | | |
|---|
| 243 | | my $GFF3 = Dumper::GFF::GFFV3->new("$out_dir/$safe_seq_id.gff", |
|---|
| 244 | | $build, |
|---|
| 245 | | $the_void |
|---|
| 246 | | ); |
|---|
| 247 | | $GFF3->set_current_contig($seq_id, $q_seq_ref); |
|---|
| 248 | | |
|---|
| 249 | | #==REPEAT MASKING HERE |
|---|
| 250 | | my $fasta_chunker = new FastaChunker(); |
|---|
| 251 | | $fasta_chunker->parent_fasta($fasta); |
|---|
| 252 | | $fasta_chunker->chunk_size($CTL_OPT{max_dna_len}); |
|---|
| 253 | | $fasta_chunker->min_size($CTL_OPT{split_hit}); |
|---|
| 254 | | $fasta_chunker->load_chunks(); |
|---|
| 255 | | my $chunk_count = 0; |
|---|
| 256 | | |
|---|
| 257 | | while (my $chunk = $fasta_chunker->get_chunk($chunk_count++)) { |
|---|
| 258 | | #-- repeatmask with gff3 input |
|---|
| 259 | | my $rm_gff_keepers = []; |
|---|
| 260 | | if ($CTL_OPT{go_gffdb}) { |
|---|
| 261 | | $rm_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 262 | | $q_seq_ref, |
|---|
| 263 | | 'repeat' |
|---|
| 264 | | ); |
|---|
| 265 | | #mask the chunk |
|---|
| 266 | | $chunk = repeat_mask_seq::mask_chunk($chunk, $rm_gff_keepers); |
|---|
| 267 | | } |
|---|
| 268 | | |
|---|
| 269 | | #-- repeatmask with RepeatMasker |
|---|
| 270 | | my $rm_rb_keepers = []; #repeat masker RepBase |
|---|
| 271 | | if ($CTL_OPT{model_org}) { #model organism repeats |
|---|
| 272 | | $rm_rb_keepers = GI::repeatmask($chunk, |
|---|
| 273 | | $the_void, |
|---|
| 274 | | $safe_seq_id, |
|---|
| 275 | | $CTL_OPT{model_org}, |
|---|
| 276 | | $CTL_OPT{RepeatMasker}, |
|---|
| 277 | | '', |
|---|
| 278 | | $CTL_OPT{cpus}, |
|---|
| 279 | | $CTL_OPT{force}, |
|---|
| 280 | | $LOG |
|---|
| 281 | | ); |
|---|
| 282 | | |
|---|
| 283 | | #mask the chunk |
|---|
| 284 | | $chunk = repeat_mask_seq::mask_chunk($chunk, $rm_rb_keepers); |
|---|
| 285 | | } |
|---|
| 286 | | my $rm_sp_keepers = []; #repeat masker species |
|---|
| 287 | | if ($CTL_OPT{rmlib}) { #species specific repeats; |
|---|
| 288 | | $rm_sp_keepers = GI::repeatmask($chunk, |
|---|
| 289 | | $the_void, |
|---|
| 290 | | $safe_seq_id, |
|---|
| 291 | | $CTL_OPT{model_org}, |
|---|
| 292 | | $CTL_OPT{RepeatMasker}, |
|---|
| 293 | | $CTL_OPT{rmlib}, |
|---|
| 294 | | $CTL_OPT{cpus}, |
|---|
| 295 | | $CTL_OPT{force}, |
|---|
| 296 | | $LOG |
|---|
| 297 | | ); |
|---|
| 298 | | |
|---|
| 299 | | #mask the chunk |
|---|
| 300 | | $chunk = repeat_mask_seq::mask_chunk($chunk, $rm_sp_keepers); |
|---|
| 301 | | } |
|---|
| 302 | | |
|---|
| 303 | | #-- blastx against a repeat library (for better masking) |
|---|
| 304 | | my $rm_blastx_keepers = []; |
|---|
| 305 | | if ($CTL_OPT{_repeat_protein}) { |
|---|
| 306 | | my $res_dir; |
|---|
| 307 | | foreach my $db (@{$CTL_OPT{r_db}}) { |
|---|
| 308 | | $res_dir = |
|---|
| 309 | | GI::blastx_as_chunks($chunk, |
|---|
| 310 | | $db, |
|---|
| 311 | | $the_void, |
|---|
| 312 | | $safe_seq_id, |
|---|
| 313 | | $CTL_OPT{_blastx}, |
|---|
| 314 | | $CTL_OPT{eval_rm_blastx}, |
|---|
| 315 | | $CTL_OPT{split_hit}, |
|---|
| 316 | | $CTL_OPT{cpus}, |
|---|
| 317 | | $CTL_OPT{repeat_protein}, |
|---|
| 318 | | $CTL_OPT{_formater}, |
|---|
| 319 | | 0, |
|---|
| 320 | | $CTL_OPT{force}, |
|---|
| 321 | | $LOG, |
|---|
| 322 | | 1 |
|---|
| 323 | | ); |
|---|
| 324 | | } |
|---|
| 325 | | $rm_blastx_keepers = GI::collect_blastx($chunk, |
|---|
| 326 | | $res_dir, |
|---|
| 327 | | $CTL_OPT{eval_rm_blastx}, |
|---|
| 328 | | $CTL_OPT{bit_rm_blastx}, |
|---|
| 329 | | $CTL_OPT{pcov_rm_blastx}, |
|---|
| 330 | | $CTL_OPT{pid_rm_blastx}, |
|---|
| 331 | | $CTL_OPT{split_hit}, |
|---|
| 332 | | $CTL_OPT{force}, |
|---|
| 333 | | $LOG |
|---|
| 334 | | ); |
|---|
| 335 | | |
|---|
| 336 | | #mask the chunk |
|---|
| 337 | | $chunk = repeat_mask_seq::mask_chunk($chunk, $rm_blastx_keepers); |
|---|
| 338 | | } |
|---|
| 339 | | |
|---|
| 340 | | #-combine and cluster repeat hits for consensus |
|---|
| 341 | | my $rm_keepers = repeat_mask_seq::process($rm_gff_keepers, |
|---|
| 342 | | $rm_rb_keepers, |
|---|
| 343 | | $rm_sp_keepers, |
|---|
| 344 | | $rm_blastx_keepers, |
|---|
| 345 | | $q_seq_ref |
|---|
| 346 | | ); |
|---|
| 347 | | |
|---|
| 348 | | #-add repeats to GFF3 |
|---|
| 349 | | $GFF3->add_repeat_hits($rm_keepers); |
|---|
| 350 | | |
|---|
| 351 | | #-build big masked sequence |
|---|
| 352 | | $masked_total_seq .= $chunk->seq(); |
|---|
| 353 | | } |
|---|
| 354 | | |
|---|
| 355 | | my $masked_fasta = Fasta::toFasta($q_def.' masked', \$masked_total_seq); |
|---|
| 356 | | my $masked_file = $the_void."/query.masked.fasta"; |
|---|
| 357 | | FastaFile::writeFile(\$masked_fasta ,$masked_file); |
|---|
| 358 | | |
|---|
| 359 | | #==ab initio predictions here |
|---|
| 360 | | my $preds = GI::abinits($masked_file, |
|---|
| 361 | | $the_void, |
|---|
| 362 | | $safe_seq_id, |
|---|
| 363 | | \%CTL_OPT, |
|---|
| 364 | | $LOG |
|---|
| 365 | | ); |
|---|
| 366 | | |
|---|
| 367 | | #==QRNA noncoding RNA prediction here |
|---|
| 368 | | my $qra_preds = []; |
|---|
| 369 | | |
|---|
| 370 | | #--build an index of the databases |
|---|
| 371 | | my $proteins = $CTL_OPT{_protein}; |
|---|
| 372 | | my $trans = $CTL_OPT{_est}; |
|---|
| 373 | | my $altests = $CTL_OPT{_altest}; |
|---|
| 374 | | my $fasta_t_index = GI::build_fasta_index($trans) if($trans); |
|---|
| 375 | | my $fasta_p_index = GI::build_fasta_index($proteins) if($proteins); |
|---|
| 376 | | my $fasta_a_index = GI::build_fasta_index($altests) if($altests); |
|---|
| 377 | | |
|---|
| 378 | | #--reset fastachunker for masked chunks |
|---|
| 379 | | $fasta_chunker = new FastaChunker(); |
|---|
| 380 | | $fasta_chunker = new FastaChunker(); |
|---|
| 381 | | $fasta_chunker->parent_fasta($masked_fasta); |
|---|
| 382 | | $fasta_chunker->chunk_size($CTL_OPT{max_dna_len}); |
|---|
| 383 | | $fasta_chunker->min_size($CTL_OPT{split_hit}); |
|---|
| 384 | | $fasta_chunker->load_chunks(); |
|---|
| 385 | | |
|---|
| 386 | | $chunk_count = 0; |
|---|
| 387 | | |
|---|
| 388 | | while (my $chunk = $fasta_chunker->get_chunk($chunk_count++)) { |
|---|
| 389 | | #==BLAST ANALYSIS HERE |
|---|
| 390 | | #-blastn search the file against ESTs |
|---|
| 391 | | my $blastn_keepers = []; |
|---|
| 392 | | if ($CTL_OPT{_est}) { |
|---|
| 393 | | my $res_dir; |
|---|
| 394 | | foreach my $db (@{$CTL_OPT{e_db}}) { |
|---|
| 395 | | $res_dir = GI::blastn_as_chunks($chunk, |
|---|
| 396 | | $db, |
|---|
| 397 | | $the_void, |
|---|
| 398 | | $safe_seq_id, |
|---|
| 399 | | $CTL_OPT{_blastn}, |
|---|
| 400 | | $CTL_OPT{eval_blastn}, |
|---|
| 401 | | $CTL_OPT{split_hit}, |
|---|
| 402 | | $CTL_OPT{cpus}, |
|---|
| 403 | | $CTL_OPT{est}, |
|---|
| 404 | | $CTL_OPT{_formater}, |
|---|
| 405 | | 0, |
|---|
| 406 | | $CTL_OPT{force}, |
|---|
| 407 | | $LOG, |
|---|
| 408 | | 1 |
|---|
| 409 | | ); |
|---|
| 410 | | } |
|---|
| 411 | | $blastn_keepers = GI::collect_blastn($chunk, |
|---|
| 412 | | $res_dir, |
|---|
| 413 | | $CTL_OPT{eval_blastn}, |
|---|
| 414 | | $CTL_OPT{bit_blastn}, |
|---|
| 415 | | $CTL_OPT{pcov_blastn}, |
|---|
| 416 | | $CTL_OPT{pid_blastn}, |
|---|
| 417 | | $CTL_OPT{split_hit}, |
|---|
| 418 | | $CTL_OPT{force}, |
|---|
| 419 | | $LOG |
|---|
| 420 | | ); |
|---|
| 421 | | } |
|---|
| 422 | | |
|---|
| 423 | | #-blastx search the masked input file |
|---|
| 424 | | my $blastx_keepers = []; |
|---|
| 425 | | if ($CTL_OPT{_protein}) { |
|---|
| 426 | | my $res_dir; |
|---|
| 427 | | foreach my $db (@{$CTL_OPT{p_db}}) { |
|---|
| 428 | | $res_dir = GI::blastx_as_chunks($chunk, |
|---|
| 429 | | $db, |
|---|
| 430 | | $the_void, |
|---|
| 431 | | $safe_seq_id, |
|---|
| 432 | | $CTL_OPT{_blastx}, |
|---|
| 433 | | $CTL_OPT{eval_blastx}, |
|---|
| 434 | | $CTL_OPT{split_hit}, |
|---|
| 435 | | $CTL_OPT{cpus}, |
|---|
| 436 | | $CTL_OPT{protein}, |
|---|
| 437 | | $CTL_OPT{_formater}, |
|---|
| 438 | | 0, |
|---|
| 439 | | $CTL_OPT{force}, |
|---|
| 440 | | $LOG, |
|---|
| 441 | | 1 |
|---|
| 442 | | ); |
|---|
| 443 | | } |
|---|
| 444 | | $blastx_keepers = GI::collect_blastx($chunk, |
|---|
| 445 | | $res_dir, |
|---|
| 446 | | $CTL_OPT{eval_blastx}, |
|---|
| 447 | | $CTL_OPT{bit_blastx}, |
|---|
| 448 | | $CTL_OPT{pcov_blastx}, |
|---|
| 449 | | $CTL_OPT{pid_blastx}, |
|---|
| 450 | | $CTL_OPT{split_hit}, |
|---|
| 451 | | $CTL_OPT{force}, |
|---|
| 452 | | $LOG |
|---|
| 453 | | ); |
|---|
| 454 | | } |
|---|
| 455 | | |
|---|
| 456 | | #-tblastx search the masked input file |
|---|
| 457 | | my $tblastx_keepers = []; |
|---|
| 458 | | if ($CTL_OPT{_altest}) { |
|---|
| 459 | | my $res_dir; |
|---|
| 460 | | foreach my $db (@{$CTL_OPT{a_db}}) { |
|---|
| 461 | | $res_dir = GI::tblastx_as_chunks($chunk, |
|---|
| 462 | | $db, |
|---|
| 463 | | $the_void, |
|---|
| 464 | | $safe_seq_id, |
|---|
| 465 | | $CTL_OPT{_tblastx}, |
|---|
| 466 | | $CTL_OPT{eval_tblastx}, |
|---|
| 467 | | $CTL_OPT{split_hit}, |
|---|
| 468 | | $CTL_OPT{cpus}, |
|---|
| 469 | | $CTL_OPT{altest}, |
|---|
| 470 | | $CTL_OPT{_formater}, |
|---|
| 471 | | 0, |
|---|
| 472 | | $CTL_OPT{force}, |
|---|
| 473 | | $LOG, |
|---|
| 474 | | 1 |
|---|
| 475 | | ); |
|---|
| 476 | | } |
|---|
| 477 | | $tblastx_keepers = GI::collect_tblastx($chunk, |
|---|
| 478 | | $res_dir, |
|---|
| 479 | | $CTL_OPT{eval_tblastx}, |
|---|
| 480 | | $CTL_OPT{bit_tblastx}, |
|---|
| 481 | | $CTL_OPT{pcov_tblastx}, |
|---|
| 482 | | $CTL_OPT{pid_tblastx}, |
|---|
| 483 | | $CTL_OPT{split_hit}, |
|---|
| 484 | | $CTL_OPT{force}, |
|---|
| 485 | | $LOG |
|---|
| 486 | | ); |
|---|
| 487 | | } |
|---|
| 488 | | |
|---|
| 489 | | #-get only those predictions on the chunk |
|---|
| 490 | | my $preds_on_chunk = GI::get_preds_on_chunk($preds, |
|---|
| 491 | | $chunk |
|---|
| 492 | | ); |
|---|
| 493 | | |
|---|
| 494 | | #==GFF3 passthrough of evidence |
|---|
| 495 | | my $prot_gff_keepers = []; |
|---|
| 496 | | my $est_gff_keepers = []; |
|---|
| 497 | | my $altest_gff_keepers = []; |
|---|
| 498 | | my $model_gff_keepers = []; |
|---|
| 499 | | my $pred_gff_keepers = []; |
|---|
| 500 | | if ($CTL_OPT{go_gffdb}) { |
|---|
| 501 | | #-protein evidence passthraough |
|---|
| 502 | | $prot_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 503 | | $q_seq_ref, |
|---|
| 504 | | 'protein' |
|---|
| 505 | | ); |
|---|
| 506 | | #-est evidence passthrough |
|---|
| 507 | | $est_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 508 | | $q_seq_ref, |
|---|
| 509 | | 'est' |
|---|
| 510 | | ); |
|---|
| 511 | | #-altest evidence passthrough |
|---|
| 512 | | $altest_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 513 | | $q_seq_ref, |
|---|
| 514 | | 'altest' |
|---|
| 515 | | ); |
|---|
| 516 | | #-gff gene annotation passthrough here |
|---|
| 517 | | $model_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 518 | | $q_seq_ref, |
|---|
| 519 | | 'model' |
|---|
| 520 | | ); |
|---|
| 521 | | #-pred passthrough |
|---|
| 522 | | $pred_gff_keepers = $GFF_DB->phathits_on_chunk($chunk, |
|---|
| 523 | | $q_seq_ref, |
|---|
| 524 | | 'pred' |
|---|
| 525 | | ); |
|---|
| 526 | | } |
|---|
| 527 | | |
|---|
| 528 | | #==merge heldover Phathits from last round |
|---|
| 529 | | if ($chunk->number != 0) { #if not first chunk |
|---|
| 530 | | #reviews heldover blast hits, |
|---|
| 531 | | #then merges and reblasts them if they cross the divide |
|---|
| 532 | | ($blastn_keepers, |
|---|
| 533 | | $blastx_keepers, |
|---|
| 534 | | $tblastx_keepers) = GI::merge_resolve_hits(\$masked_fasta, |
|---|
| 535 | | $fasta_t_index, |
|---|
| 536 | | $fasta_p_index, |
|---|
| 537 | | $fasta_a_index, |
|---|
| 538 | | $blastn_keepers, |
|---|
| 539 | | $blastx_keepers, |
|---|
| 540 | | $tblastx_keepers, |
|---|
| 541 | | $holdover_blastn, |
|---|
| 542 | | $holdover_blastx, |
|---|
| 543 | | $holdover_tblastx, |
|---|
| 544 | | $the_void, |
|---|
| 545 | | \%CTL_OPT, |
|---|
| 546 | | $LOG |
|---|
| 547 | | ); |
|---|
| 548 | | #combine remaining holdover types |
|---|
| 549 | | push(@{$preds_on_chunk}, @{$holdover_pred}); |
|---|
| 550 | | push(@{$pred_gff_keepers}, @{$holdover_pred_gff}); |
|---|
| 551 | | push(@{$est_gff_keepers}, @{$holdover_est_gff}); |
|---|
| 552 | | push(@{$altest_gff_keepers}, @{$holdover_altest_gff}); |
|---|
| 553 | | push(@{$prot_gff_keepers}, @{$holdover_prot_gff}); |
|---|
| 554 | | push(@{$model_gff_keepers}, @{$holdover_model_gff}); |
|---|
| 555 | | |
|---|
| 556 | | #clear holdovers |
|---|
| 557 | | @{$holdover_pred} = (); |
|---|
| 558 | | @{$holdover_est_gff} = (); |
|---|
| 559 | | @{$holdover_altest_gff} = (); |
|---|
| 560 | | @{$holdover_prot_gff} = (); |
|---|
| 561 | | @{$holdover_pred_gff} = (); |
|---|
| 562 | | @{$holdover_model_gff} = (); |
|---|
| 563 | | } |
|---|
| 564 | | |
|---|
| 565 | | #==PROCESS HITS CLOSE TO CHUNK DIVISIONS |
|---|
| 566 | | #holdover hits that are too close to the divide for review with next chunk |
|---|
| 567 | | if (not $chunk->is_last) { #if not last chunk |
|---|
| 568 | | ($holdover_blastn, |
|---|
| 569 | | $holdover_blastx, |
|---|
| 570 | | $holdover_tblastx, |
|---|
| 571 | | $holdover_pred, |
|---|
| 572 | | $holdover_est_gff, |
|---|
| 573 | | $holdover_altest_gff, |
|---|
| 574 | | $holdover_prot_gff, |
|---|
| 575 | | $holdover_pred_gff, |
|---|
| 576 | | $holdover_model_gff, |
|---|
| 577 | | $blastn_keepers, |
|---|
| 578 | | $blastx_keepers, |
|---|
| 579 | | $tblastx_keepers, |
|---|
| 580 | | $preds_on_chunk, |
|---|
| 581 | | $est_gff_keepers, |
|---|
| 582 | | $altest_gff_keepers, |
|---|
| 583 | | $prot_gff_keepers, |
|---|
| 584 | | $pred_gff_keepers, |
|---|
| 585 | | $model_gff_keepers |
|---|
| 586 | | ) = GI::process_the_chunk_divide($chunk, |
|---|
| 587 | | $CTL_OPT{'split_hit'}, |
|---|
| 588 | | $blastn_keepers, |
|---|
| 589 | | $blastx_keepers, |
|---|
| 590 | | $tblastx_keepers, |
|---|
| 591 | | $preds_on_chunk, |
|---|
| 592 | | $est_gff_keepers, |
|---|
| 593 | | $altest_gff_keepers, |
|---|
| 594 | | $prot_gff_keepers, |
|---|
| 595 | | $pred_gff_keepers, |
|---|
| 596 | | $model_gff_keepers |
|---|
| 597 | | ); |
|---|
| 598 | | } |
|---|
| 599 | | |
|---|
| 600 | | #==EXONERATE HERE |
|---|
| 601 | | |
|---|
| 602 | | #variables that are persistent outside of try block |
|---|
| 603 | | my $blastx_data; |
|---|
| 604 | | my $exonerate_p_data; |
|---|
| 605 | | |
|---|
| 606 | | #-cluster the blastx hits |
|---|
| 607 | | print STDERR "cleaning blastx...\n" unless $main::quiet; |
|---|
| 608 | | |
|---|
| 609 | | my $blastx_clusters = cluster::clean_and_cluster($blastx_keepers, |
|---|
| 610 | | $q_seq_ref, |
|---|
| 611 | | 10 |
|---|
| 612 | | ); |
|---|
| 613 | | undef $blastx_keepers; #free up memory |
|---|
| 614 | | |
|---|
| 615 | | #-make a multi-fasta of the seqs in the blastx_clusters |
|---|
| 616 | | #-polish the blastx hits with exonerate |
|---|
| 617 | | |
|---|
| 618 | | my $exoner_p_clust = GI::polish_exonerate($fasta, |
|---|
| 619 | | $blastx_clusters, |
|---|
| 620 | | $fasta_p_index, |
|---|
| 621 | | $the_void, |
|---|
| 622 | | 5, |
|---|
| 623 | | 'p', |
|---|
| 624 | | $CTL_OPT{exonerate}, |
|---|
| 625 | | $CTL_OPT{pcov_blastx}, |
|---|
| 626 | | $CTL_OPT{pid_blastx}, |
|---|
| 627 | | $CTL_OPT{ep_score_limit}, |
|---|
| 628 | | $CTL_OPT{ep_matrix}, |
|---|
| 629 | | $CTL_OPT{force}, |
|---|
| 630 | | $LOG |
|---|
| 631 | | ); |
|---|
| 632 | | |
|---|
| 633 | | #flatten clusters |
|---|
| 634 | | $blastx_data = GI::flatten($blastx_clusters); |
|---|
| 635 | | $exonerate_p_data = GI::flatten($exoner_p_clust, 'exonerate:p'); |
|---|
| 636 | | |
|---|
| 637 | | #-cluster the tblastx hits |
|---|
| 638 | | print STDERR "cleaning tblastx...\n" unless $main::quiet; |
|---|
| 639 | | my $tblastx_clusters = cluster::clean_and_cluster($tblastx_keepers, |
|---|
| 640 | | $q_seq_ref, |
|---|
| 641 | | 10 |
|---|
| 642 | | ); |
|---|
| 643 | | undef $tblastx_keepers; #free up memory |
|---|
| 644 | | |
|---|
| 645 | | #flatten the clusters |
|---|
| 646 | | my $tblastx_data = GI::flatten($tblastx_clusters); |
|---|
| 647 | | |
|---|
| 648 | | |
|---|
| 649 | | #-cluster the blastn hits |
|---|
| 650 | | print STDERR "cleaning blastn...\n" unless $main::quiet; |
|---|
| 651 | | my $blastn_clusters = cluster::clean_and_cluster($blastn_keepers, |
|---|
| 652 | | $q_seq_ref, |
|---|
| 653 | | 10 |
|---|
| 654 | | ); |
|---|
| 655 | | undef $blastn_keepers; #free up memory |
|---|
| 656 | | |
|---|
| 657 | | #-polish blastn hits with exonerate |
|---|
| 658 | | my $exoner_e_clust = GI::polish_exonerate($fasta, |
|---|
| 659 | | $blastn_clusters, |
|---|
| 660 | | $fasta_t_index, |
|---|
| 661 | | $the_void, |
|---|
| 662 | | 5, |
|---|
| 663 | | 'e', |
|---|
| 664 | | $CTL_OPT{exonerate}, |
|---|
| 665 | | $CTL_OPT{pcov_blastn}, |
|---|
| 666 | | $CTL_OPT{pid_blastn}, |
|---|
| 667 | | $CTL_OPT{en_score_limit}, |
|---|
| 668 | | $CTL_OPT{en_matrix}, |
|---|
| 669 | | $CTL_OPT{force}, |
|---|
| 670 | | $LOG |
|---|
| 671 | | ); |
|---|
| 672 | | |
|---|
| 673 | | #flatten clusters |
|---|
| 674 | | my $blastn_data = GI::flatten($blastn_clusters); |
|---|
| 675 | | my $exonerate_e_data = GI::flatten($exoner_e_clust, |
|---|
| 676 | | 'exonerate:e' |
|---|
| 677 | | ); |
|---|
| 678 | | |
|---|
| 679 | | #combine final data sets |
|---|
| 680 | | my $final_est = GI::combine($exonerate_e_data, |
|---|
| 681 | | $est_gff_keepers |
|---|
| 682 | | ); |
|---|
| 683 | | my $final_altest = GI::combine($tblastx_data, |
|---|
| 684 | | $altest_gff_keepers |
|---|
| 685 | | ); |
|---|
| 686 | | my $final_prot = GI::combine($blastx_data, |
|---|
| 687 | | $exonerate_p_data, |
|---|
| 688 | | $prot_gff_keepers |
|---|
| 689 | | ); |
|---|
| 690 | | my $final_pred = GI::combine($preds_on_chunk, |
|---|
| 691 | | $pred_gff_keepers |
|---|
| 692 | | ); |
|---|
| 693 | | |
|---|
| 694 | | #####working here########### |
|---|
| 695 | | #==MAKER annotations built here |
|---|
| 696 | | |
|---|
| 697 | | #variables that are persistent outside of try block |
|---|
| 698 | | my $annotations = []; |
|---|
| 699 | | |
|---|
| 700 | | #-auto-annotate the input file |
|---|
| 701 | | GI::combine($blastx_data, ); |
|---|
| 702 | | $annotations = maker::auto_annotator::annotate($fasta, |
|---|
| 703 | | $masked_fasta, |
|---|
| 704 | | $chunk->number(), |
|---|
| 705 | | $final_prot, |
|---|
| 706 | | $final_est, |
|---|
| 707 | | $final_altest, |
|---|
| 708 | | $final_pred, |
|---|
| 709 | | $model_gff_keepers, |
|---|
| 710 | | $the_void, |
|---|
| 711 | | $build, |
|---|
| 712 | | \%CTL_OPT, |
|---|
| 713 | | $LOG |
|---|
| 714 | | ); |
|---|
| 715 | | |
|---|
| 716 | | my $maker_anno = maker::auto_annotator::best_annotations($annotations, |
|---|
| 717 | | $out_dir, |
|---|
| 718 | | \%CTL_OPT |
|---|
| 719 | | ); |
|---|
| 720 | | |
|---|
| 721 | | my $non_over = maker::auto_annotator::get_non_overlaping_abinits($maker_anno, |
|---|
| 722 | | $annotations->{abinit} |
|---|
| 723 | | ); |
|---|
| 724 | | |
|---|
| 725 | | |
|---|
| 726 | | |
|---|
| 727 | | |
|---|
| 728 | | #==OUTPUT DATA HERE |
|---|
| 729 | | |
|---|
| 730 | | #--- GFF3 |
|---|
| 731 | | $GFF3->add_genes($maker_anno); |
|---|
| 732 | | $GFF3->add_phathits($blastx_data); |
|---|
| 733 | | $GFF3->add_phathits($blastn_data); |
|---|
| 734 | | $GFF3->add_phathits($tblastx_data); |
|---|
| 735 | | $GFF3->add_phathits($exonerate_p_data); |
|---|
| 736 | | $GFF3->add_phathits($exonerate_e_data); |
|---|
| 737 | | $GFF3->add_phathits($est_gff_keepers); |
|---|
| 738 | | $GFF3->add_phathits($altest_gff_keepers); |
|---|
| 739 | | $GFF3->add_phathits($prot_gff_keepers); |
|---|
| 740 | | $GFF3->add_phathits($preds_on_chunk); |
|---|
| 741 | | $GFF3->add_phathits($pred_gff_keepers); |
|---|
| 742 | | $GFF3->resolved_flag if (not $chunk->is_last); #adds ### between contigs |
|---|
| 743 | | |
|---|
| 744 | | #--- building fastas for annotations (grows with iteration) |
|---|
| 745 | | GI::maker_p_and_t_fastas($maker_anno, |
|---|
| 746 | | $non_over, |
|---|
| 747 | | $annotations->{abinit}, |
|---|
| 748 | | \%p_fastas, |
|---|
| 749 | | \%t_fastas, |
|---|
| 750 | | ); |
|---|
| 751 | | } |
|---|
| 752 | | #END CONTIG |
|---|
| 753 | | |
|---|
| 754 | | #--- write fastas for ab-initio predictions |
|---|
| 755 | | |
|---|
| 756 | | #--Write annotation fasta files now that all chunks are finished |
|---|
| 757 | | GI::write_p_and_t_fastas(\%p_fastas, \%t_fastas, $safe_seq_id, $out_dir); |
|---|
| 758 | | |
|---|
| 759 | | #--- write GFF3 file |
|---|
| 760 | | $GFF3->finalize(); |
|---|
| 761 | | |
|---|
| 762 | | #--cleanup maker files created with each fasta sequence |
|---|
| 763 | | File::Path::rmtree ($the_void) if $CTL_OPT{clean_up}; #rm temp directory |
|---|
| 764 | | |
|---|
| 765 | | #-- write to DS log the finished files |
|---|
| 766 | | $DS_CTL->add_entry($seq_id, $out_dir, 'FINISHED'); |
|---|
| 767 | | |
|---|
| 768 | | #--- clear the log variable |
|---|
| 769 | | $LOG = undef; |
|---|
| 770 | | } |
|---|
| 771 | | |
|---|
| 772 | | #lets me know when maker is finished |
|---|
| | 152 | my $rank = 0; |
|---|
| | 153 | my $size = 1; |
|---|
| | 154 | $RANK = $rank; |
|---|
| | 155 | |
|---|
| | 156 | #---Process options on the command line |
|---|
| | 157 | try{ |
|---|
| | 158 | GetOptions("RM_off|R" => \$OPT{R}, |
|---|
| | 159 | "force|f" => \$OPT{force}, |
|---|
| | 160 | "genome|g=s" => \$OPT{genome}, |
|---|
| | 161 | "cpus|c=i" => \$OPT{cpus}, |
|---|
| | 162 | "predictor=s" =>\$OPT{predictor}, |
|---|
| | 163 | "retry=i" =>\$OPT{retry}, |
|---|
| | 164 | "evaluate" =>\$OPT{evaluate}, |
|---|
| | 165 | "quiet" =>\$main::quiet, |
|---|
| | 166 | "CTL" => sub {GI::generate_control_files(); exit(0);}, |
|---|
| | 167 | "help|?" => sub {print $usage; exit(0)} |
|---|
| | 168 | ); |
|---|
| | 169 | } |
|---|
| | 170 | catch Error::Simple with{ |
|---|
| | 171 | my $E = shift; |
|---|
| | 172 | |
|---|
| | 173 | print STDERR $E->{-text}; |
|---|
| | 174 | die "\n\nMaker failed parsing command line options!!\n\n"; |
|---|
| | 175 | }; |
|---|
| | 176 | |
|---|
| | 177 | #varibles that are persistent outside of try |
|---|
| | 178 | my %CTL_OPT; |
|---|
| | 179 | my $iterator; |
|---|
| | 180 | my $DS_CTL; |
|---|
| | 181 | my $GFF_DB; |
|---|
| | 182 | my $build; |
|---|
| | 183 | my @failed; |
|---|
| | 184 | |
|---|
| | 185 | try{ |
|---|
| | 186 | #get arguments off the command line |
|---|
| | 187 | my @ctlfiles = @ARGV; |
|---|
| | 188 | |
|---|
| | 189 | if (not @ctlfiles) { |
|---|
| | 190 | if (-e "maker_opts.ctl" && |
|---|
| | 191 | -e "maker_bopts.ctl" && |
|---|
| | 192 | -e "maker_exe.ctl" && |
|---|
| | 193 | -e "evaluator.ctl" |
|---|
| | 194 | ) { |
|---|
| | 195 | |
|---|
| | 196 | @ctlfiles = ("maker_opts.ctl", |
|---|
| | 197 | "maker_bopts.ctl", |
|---|
| | 198 | "maker_exe.ctl", |
|---|
| | 199 | "evaluator.ctl" |
|---|
| | 200 | ); |
|---|
| | 201 | } |
|---|
| | 202 | else { |
|---|
| | 203 | print STDERR "ERROR: Control files not found\n"; |
|---|
| | 204 | print $usage; |
|---|
| | 205 | exit(0); |
|---|
| | 206 | } |
|---|
| | 207 | } |
|---|
| | 208 | |
|---|
| | 209 | #--Control file processing |
|---|
| | 210 | |
|---|
| | 211 | #set up control options from control files |
|---|
| | 212 | %CTL_OPT = GI::load_control_files(\@ctlfiles, \%OPT, $size); |
|---|
| | 213 | |
|---|
| | 214 | #--open datastructure controller |
|---|
| | 215 | $DS_CTL = ds_utility->new(\%CTL_OPT); |
|---|
| | 216 | |
|---|
| | 217 | #--set up gff database |
|---|
| | 218 | $GFF_DB = new GFFDB(\%CTL_OPT); |
|---|
| | 219 | $build = $GFF_DB->next_build; |
|---|
| | 220 | |
|---|
| | 221 | #---load genome multifasta/GFF3 file |
|---|
| | 222 | $iterator = new Iterator::Any( -fasta => $CTL_OPT{'genome'}, |
|---|
| | 223 | -gff => $CTL_OPT{'genome_gff'}, |
|---|
| | 224 | ); |
|---|
| | 225 | } |
|---|
| | 226 | catch Error::Simple with{ |
|---|
| | 227 | my $E = shift; |
|---|
| | 228 | print STDERR $E->{-text}; |
|---|
| | 229 | print STDERR "\n\nMaker failed while examining startup data\n", |
|---|
| | 230 | "(control files and input fasta files)!!\n\n"; |
|---|
| | 231 | my $code = 2; |
|---|
| | 232 | $code = $E->{-value} if (defined($E->{-value})); |
|---|
| | 233 | |
|---|
| | 234 | exit($code); |
|---|
| | 235 | }; |
|---|
| | 236 | |
|---|
| | 237 | my $tier; |
|---|
| | 238 | while (my $fasta = $iterator->nextFasta() || shift @failed){ |
|---|
| | 239 | $tier = Process::MpiTiers->new({fasta =>$fasta, |
|---|
| | 240 | CTL_OPT => \%CTL_OPT, |
|---|
| | 241 | DS_CTL => $DS_CTL, |
|---|
| | 242 | GFF_DB => $GFF_DB, |
|---|
| | 243 | build => $build}, |
|---|
| | 244 | '0' |
|---|
| | 245 | ); |
|---|
| | 246 | |
|---|
| | 247 | $tier->run while(! $tier->terminated || $tier->failed); |
|---|
| | 248 | $DS_CTL->add_entry($tier->DS); |
|---|
| | 249 | push(@failed, $tier->fasta) if ($tier->failed); |
|---|
| | 250 | } |
|---|
| | 251 | |
|---|