| 1 |
#! /usr/bin/perl -w |
|---|
| 2 |
use strict; |
|---|
| 3 |
use FindBin; |
|---|
| 4 |
use lib "$FindBin::Bin/../lib"; |
|---|
| 5 |
use Iterator::Fasta; |
|---|
| 6 |
use Fasta; |
|---|
| 7 |
use Datastore::MD5; |
|---|
| 8 |
use Cwd; |
|---|
| 9 |
use threads; |
|---|
| 10 |
use threads::shared; |
|---|
| 11 |
use Thread::Semaphore; |
|---|
| 12 |
|
|---|
| 13 |
# Help text printed when the script is called without an input file or with
# an unusable <cpus> value.
my $usage = "
Usage:

iprscan_batch <file_name> <cpus> <log_file>

Runs iprscan on the given input file. Output goes into a datastore.

The cpus option is optional for multi-threading.

The log file is optional. The optional log file is not for creating that log
file but rather to parse an existing log file. iprscan_batch will then only
re-run jobs that the log file says are not yet finished.


";

# Positional arguments: input FASTA file, number of worker threads
# (defaults to 1), and an optional log file from an earlier run.
my $file = shift;
my $cpus = shift || 1;
my $log  = shift;

if (!$file) {
    print $usage;
    exit;
}

# A non-numeric thread count would start no workers at all, so the script
# would queue every sequence and then exit without running anything.
if ($cpus !~ /\A[1-9]\d*\z/) {
    print STDERR "ERROR: <cpus> must be a positive integer, got '$cpus'\n";
    print STDERR $usage;
    exit 1;
}
|---|
| 37 |
|
|---|
| 38 |
# Queues and flags shared between the main thread and the worker threads.
my @files :shared;       # FASTA files waiting for their first iprscan run
my @failed :shared;      # files whose run failed and that await a retry
my @finished :shared;    # files that completed, waiting to be written to the log
my $go :shared;          # true while the main thread is still queueing files
$go = 1;

# Paths recorded as FINISHED in the log of a previous run (if one was given).
my %log_f;
if ($log) {
    # Fail loudly: with an unreadable log every path would look unfinished,
    # and the main loop would then wipe the existing datastore directories.
    open(my $log_in, '<', $log)
        or die "ERROR: could not open log file $log: $!\n";

    while (defined(my $line = <$log_in>)) {
        chomp $line;

        # Log lines have the form "FINISHED <path>". Everything after the
        # keyword is taken as the path so that paths containing blanks
        # survive; blank and unrelated lines are skipped.
        if ($line =~ /\AFINISHED\s+(.+?)\s*\z/) {
            $log_f{$1}++;
        }
    }

    close($log_in);
}
|---|
| 55 |
|
|---|
| 56 |
|
|---|
| 57 |
# Semaphore taken around removals from the @failed queue.
my $s = Thread::Semaphore->new;
my $cwd = Cwd::cwd();

# Master index that records every finished file. It is truncated here and
# appended to while the workers run.
my $log_file = "$cwd/$file\_master_datastore.index";
open(my $log_init, '>', $log_file)
    or die "ERROR: could not create log file $log_file: $!\n";
close($log_init);

# Start the workers before the input is read; they poll the shared queues
# until the main thread clears $go and the queues are empty.
my @threads;
for (my $i = 0; $i < $cpus; $i++) {
    my $thr = threads->create(\&launch);
    push(@threads, $thr);
}
|---|
| 68 |
|
|---|
| 69 |
# Split the input into one FASTA file per sequence inside the datastore and
# queue each of those files for the worker threads.
my $iterator = Iterator::Fasta->new($file);
my $DS = Datastore::MD5->new('root'  => "$cwd/$file\_datastore",
                             'depth' => 2
                            );

while (my $fasta = $iterator->nextEntry) {
    my $seq_id  = Fasta::getSeqID(\$fasta);
    my $safe_id = Fasta::seqID2SafeID($seq_id);
    my $seq     = Fasta::getSeq(\$fasta);

    my $dir = $DS->id_to_dir($safe_id);
    $DS->mkdir($safe_id) || die "ERROR: could not make datastore directory\n";

    my $fasta_file = "$dir/$safe_id.fasta";

    # A log was supplied and it does not list this entry as finished:
    # throw away whatever a previous run left in the directory.
    if ($log && !$log_f{$fasta_file} && -e $fasta_file) {
        remove_datastore_files($dir, '');
    }

    # The stored sequence differs from the current input: remove the old
    # FASTA file together with every file derived from it (same name prefix).
    if (-e $fasta_file) {
        my $it      = Iterator::Fasta->new($fasta_file);
        my $fa      = $it->nextEntry;
        my $seq_old = Fasta::getSeq(\$fa);

        if (!defined $seq_old || $seq ne $seq_old) {
            remove_datastore_files($dir, "$safe_id.fasta");
        }
    }

    open(my $out, '>', $fasta_file)
        or die "ERROR: could not write $fasta_file: $!\n";
    print {$out} $fasta;
    close($out)
        or die "ERROR: could not finish writing $fasta_file: $!\n";

    push(@files, $fasta_file);
}

# Removes the non-hidden, non-directory entries of $dir whose names start
# with $prefix (an empty prefix matches every entry). Replaces the former
# "rm <glob>" shell calls so that paths containing blanks or shell
# metacharacters are handled correctly.
sub remove_datastore_files {
    my ($dir, $prefix) = @_;

    opendir(my $dh, $dir)
        or die "ERROR: could not read datastore directory $dir: $!\n";
    my @entries = readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if ($entry =~ /\A\./);               # a shell glob skips dot files
        next if (index($entry, $prefix) != 0);    # name must start with the prefix

        my $path = "$dir/$entry";
        next if (-d $path && !-l $path);          # plain rm does not remove directories

        unlink($path) or warn "WARNING: could not remove $path: $!\n";
    }

    return;
}
|---|
| 104 |
|
|---|
| 105 |
# All input has been queued; workers may stop once the queues are empty.
$go = 0;

my $count = @files;    # bookkeeping of queued files not yet finished or dropped
my %seen;              # failed files already observed once in @failed

# Poll the workers until every one of them has been joined. On each pass the
# finished queue is flushed to the log and the failed queue is pruned.
while (my $thr = shift @threads) {
    if ($thr->is_running) {
        push(@threads, $thr);
        sleep 1;
    }
    else {
        $thr->join();
    }

    # Record completed files in the master index.
    if (@finished) {
        if (open(my $log_out, '>>', $log_file)) {
            while (my $f = shift @finished) {
                print {$log_out} "FINISHED $f\n";
                $count--;
            }
            close($log_out)
                or warn "WARNING: could not finish writing $log_file: $!\n";
        }
        else {
            # Leave the entries queued; the next pass tries again.
            warn "WARNING: could not append to $log_file: $!\n";
        }
    }

    # A failed file that shows up in @failed on a second pass is blanked out
    # so that the workers skip it; this is what ends the retrying.
    $s->down;
    for (my $i = 0; $i < @failed; $i++) {
        if (defined $failed[$i]) {
            if (exists $seen{$failed[$i]}) {
                warn "WARNING: iprscan failed for $failed[$i]; dropping it from the retry queue\n";
                $failed[$i] = undef;
                $count--;
            }
            else {
                $seen{$failed[$i]}++;
            }
        }
    }
    $s->up;
}
|---|
| 139 |
|
|---|
| 140 |
#-------------SUBS-------- |
|---|
| 141 |
|
|---|
| 142 |
# Worker thread entry point. Takes no arguments and returns nothing useful.
# Loops while the main thread is still queueing ($go) or while work remains
# in @files or @failed: fresh files are processed first, then failed ones
# are retried; with nothing to do the worker sleeps for a second.
sub launch {
    while ($go || @files || @failed) {
        if (my $f = shift @files) {
            # An earlier run already left a clean result for this file.
            if (_iprscan_succeeded($f)) {
                push(@finished, $f);
                next;
            }

            _run_iprscan($f);
        }
        elsif (@failed) {
            $s->down;
            my $f = shift @failed;
            $s->up;

            # Entries blanked out by the main thread are skipped.
            next if (!defined $f);

            _run_iprscan($f);
        }
        else {
            sleep 1;
        }
    }
}

# Runs iprscan on $f, writing stdout to "$f.out" and stderr to "$f.error",
# then files $f under @finished or @failed depending on the outcome.
sub _run_iprscan {
    my ($f) = @_;

    my $in  = _shell_quote($f);
    my $out = _shell_quote("$f.out");
    my $err = _shell_quote("$f.error");

    system("iprscan -cli -i $in -iprlookup -goterms -format raw -nocrc 1> $out 2> $err");

    if (_iprscan_succeeded($f)) {
        push(@finished, $f);
    }
    else {
        push(@failed, $f);
    }

    return;
}

# True when "$f.error" exists and is exactly 36 bytes long, which this script
# treats as the mark of a successful run.
# NOTE(review): presumably iprscan prints a fixed 36-byte message to stderr
# on success - confirm against the installed iprscan version.
sub _iprscan_succeeded {
    my ($f) = @_;

    # -s yields a false value for a missing or empty file; ask the file
    # system directly instead of parsing the columns of "ls -al".
    my $size = (-s "$f.error") || 0;

    return $size == 36 ? 1 : 0;
}

# Wraps a string in single quotes for use as one word in a shell command.
sub _shell_quote {
    my ($text) = @_;

    (my $quoted = $text) =~ s/'/'\\''/g;

    return "'$quoted'";
}
|---|