#! /usr/bin/perl -w

use strict;
use warnings;
use Getopt::Long;
use File::Temp qw(tempfile);

# Help text, printed for -help/-? and when the command line is unusable.
my $usage = "

Synopsis:

fasta_merge -d maker_datastore_index.log
fasta_merge -o genome.all -i <fasta1> <fasta2> ...

Descriptions:

This script will take a MAKER datastore index log file, extract all
the relevant fasta files and create fasta files with relevant
categories of sequence (i.e. transcript, protein, GeneMark protein,
etc.). For this to work properly you need to be in the same directory
as the datastore index.

Options:

-d The location of the MAKER datastore index log.
-o Alternate base name for the output files.
-i An optional list of files to process along with or instead of the
datastore.

";

my $datastore;    # -d: path of the MAKER datastore index log
my @files;        # -i: fasta files named on the command line
my $outfile;      # -o: base name for the generated files

# Notes on the option specs:
#  * "datastore" is the intended long name; the historical misspelling
#    "datastor" and the short form "d" are kept so old invocations still work.
#  * "i=s{1,}" lets a single -i consume every following non-option argument,
#    which is what the synopsis (-i <fasta1> <fasta2> ...) promises; repeating
#    -i also still works.
# A command line that cannot be parsed prints the usage on STDERR and exits
# with a non-zero status instead of silently continuing.
GetOptions(
    "datastore|datastor|d=s" => \$datastore,
    "i=s{1,}"                => \@files,
    "o=s"                    => \$outfile,
    "help|?"                 => sub { print $usage; exit(); },
) or do { print STDERR $usage; exit(1); };

# At least one input source is required: the datastore index (-d) and/or an
# explicit list of fasta files (-i).
if (!$datastore && !@files) {
    print $usage;
    exit();
}

# Without -o, derive the output base name from the datastore file name
# (its last path component minus "_master_datastore_index.log"); fall back
# to "genome" when nothing usable can be derived, e.g. when only -i was given.
if (!$outfile) {
    if ($datastore) {
        ($outfile) = $datastore =~ /([^\/]+)$/;
        $outfile =~ s/_master_datastore_index\.log// if (defined $outfile);
    }
    $outfile = "genome" if (!defined $outfile || $outfile eq '');
}

die "ERROR: The file \'$datastore\' does not exist or is not readable\n"
    if ($datastore && !-r $datastore);

if ($datastore) {
    open(my $in, '<', $datastore)
        or die "Can't open $datastore for reading: $!\n";

    #uniq the entries (chomp first so a final line without a newline is
    #still recognised as a duplicate)
    my %entries;
    while (my $line = <$in>) {
        chomp $line;
        $entries{$line} = 1;
    }
    close($in);

    # Collect each directory that has a FINISHED line, once.
    my @dirs;
    my %seen_dir;
    foreach my $e (sort keys %entries) {
        next unless ($e =~ /FINISHED/);
        my (undef, $dir) = split("\t", $e);
        next if (!defined $dir || $dir eq '');    # malformed line: no directory field
        $dir =~ s/\/$// if (length($dir) > 1);
        push(@dirs, $dir) unless ($seen_dir{$dir}++);
    }

    # Pick up the per-directory transcript and protein fasta files. The
    # directory is read directly instead of going through glob(), so names
    # containing whitespace or glob metacharacters are handled correctly.
    foreach my $dir (@dirs) {
        my $dh;
        if (!opendir($dh, $dir)) {
            warn "WARNING: Can't read directory \'$dir\': $!\n";
            next;
        }
        my @names = sort grep { !/^\./ } readdir($dh);
        closedir($dh);

        foreach my $type (qw(transcripts proteins)) {
            push(@files, map { "$dir/$_" } grep { /\.\Q$type\E\.fasta\z/ } @names);
        }
    }
}

# Bucket the collected fasta files:
#   <...>.<source>.transcripts.fasta -> $groups{<source>}{transcripts}
#   <...>.<source>.proteins.fasta    -> $groups{<source>}{proteins}
#   anything else                    -> $groups{all}
# <source> is the dot-free name component directly in front of the suffix.
my %groups;

foreach my $path (@files) {
    if (my ($source) = $path =~ /([^\.]+)\.transcripts\.fasta$/) {
        push @{ $groups{$source}{transcripts} }, $path;
        next;
    }

    if (my ($source) = $path =~ /([^\.]+)\.proteins\.fasta$/) {
        push @{ $groups{$source}{proteins} }, $path;
        next;
    }

    # not a recognised transcript/protein file: merge it into the catch-all
    push @{ $groups{all} }, $path;
}

# Write the merged output, one file per group and category. Groups are
# visited in sorted order so repeated runs behave identically.
foreach my $key (sort keys %groups) {
    # files that matched neither naming pattern all go into <outfile>.fasta
    if ($key eq 'all') {
        my $all = [sort @{$groups{$key}}];
        dump_it($all, "$outfile.fasta");
        next;
    }

    #maker standard naming convention
    my $source = 'maker';
    $source .= ".$key" if ($key ne 'maker');

    #protein and transcript files
    foreach my $type (qw(transcripts proteins)) {
        # A source may have only one of the two categories; skip the missing
        # one rather than dereferencing undef (fatal under strict).
        my $list = $groups{$key}{$type};
        next unless ($list && @{$list});

        dump_it([sort @{$list}], "$outfile.all.$source.$type.fasta");
    }
}

# dump_it(\@files, $name)
#
# Concatenates the listed files, in the given order, into the file $name.
# $name is created (or truncated) first, so it exists even for an empty list.
#
#   $files - array reference of input file paths
#   $name  - path of the output file
#
# Dies if an input file does not exist, or if opening, reading, writing or
# closing a file fails. Output already written before a failure is left in
# place.
sub dump_it {
    my $files = shift;
    my $name  = shift;

    # Three-argument open on a lexical handle: the file name is taken
    # literally and is never handed to a shell.
    open(my $out, '>', $name)
        or die "ERROR: Can't open \'$name\' for writing: $!\n";
    binmode($out);

    foreach my $file (@{$files}) {
        die "ERROR: The file \'$file\' does not exist\n" if (!-e $file);

        open(my $in, '<', $file)
            or die "ERROR: Can't open \'$file\' for reading: $!\n";
        binmode($in);

        # Copy in fixed-size chunks so large fasta files are never held in
        # memory as a whole.
        my $buffer;
        while (1) {
            my $got = read($in, $buffer, 65536);
            die "ERROR: Can't read from \'$file\': $!\n" if (!defined $got);
            last if ($got == 0);
            print {$out} $buffer
                or die "ERROR: Can't write to \'$name\': $!\n";
        }
        close($in);
    }

    # buffered write errors only surface at close
    close($out) or die "ERROR: Can't close \'$name\': $!\n";
    return;
}