root/bin/fasta_merge

Revision 279, 2.7 kB (checked in by bmoore, 2 weeks ago)

Carsons lib/maker/auto_annotator.pm update

  • Property svn:executable set to *
Line 
#! /usr/bin/perl -w

use strict;
use Getopt::Long;
use File::Temp qw(tempfile);

# fasta_merge: collect per-contig MAKER fasta files (found through a
# datastore index log and/or listed on the command line) and concatenate
# them into one output file per category.

my $usage = "

Synopsis:

fasta_merge -d maker_datastore_index.log
fasta_merge -o genome.all -i <fasta1> <fasta2> ...

Descriptions:

This script will take a MAKER datastore index log file, extract all
the relevant fasta files and create fasta files with relevant
categories of sequence (i.e. transcript, protein, GeneMark protien,
etc.).  For this to work properly you need to be in the same directory
as the datastore index.

Options:

  -d The location of the MAKER datastore index log.
  -o Alternate base name for the output files.
  -i A optional list of files to process along with or instead of the
     datastore.

";

my $datastore;
my @files;
my $outfile;

# 'datastor' is the historical (misspelled) option name; it is kept as an
# alias so existing command lines continue to work.
GetOptions ("datastore|datastor|d=s" => \$datastore,
            "i=s" => \@files,
            "o=s" => \$outfile,
            "help|?" => sub{print $usage; exit();}
            );

# "-i" consumes a single value, so the remaining files of a call such as
# "-i <fasta1> <fasta2> ..." are left in @ARGV by GetOptions.
push(@files, @ARGV);

# Either a datastore or an explicit file list is enough to do some work.
if(! $datastore && ! @files){
    print $usage;
    exit();
}

# Derive the output base name from the datastore file name when -o was not
# given; fall back to "genome" when nothing usable can be derived.
if (! $outfile){
    if ($datastore){
        ($outfile) = $datastore =~ /([^\/]+)$/;
        $outfile =~ s/_master_datastore_index\.log// if (defined $outfile);
    }
    $outfile = "genome" if(! defined $outfile || $outfile eq '');
}

die "ERROR: The file \'$datastore\' does not exist\n" if ($datastore && ! -r $datastore);
if ($datastore){
    open(my $in, '<', $datastore) or die "Can't open $datastore for reading\n";

    # Collect each finished directory once, however many times it is logged.
    my %dirs;
    while (defined(my $line = <$in>)){
        next unless ($line =~ /FINISHED/);
        chomp $line;
        my ($id, $dir, $status) = split("\t", $line);
        next if (! defined $dir || $dir eq '');   # malformed line
        $dir =~ s/\/$//;
        $dirs{$dir} = 1;
    }
    close($in);

    # readdir is used instead of glob() because glob() splits its pattern on
    # whitespace and expands metacharacters contained in the directory name.
    foreach my $dir (sort keys %dirs){
        my $dh;
        if (! opendir($dh, $dir)){
            warn "WARNING: Can't read directory \'$dir\': $!\n";
            next;
        }
        # Hidden entries are skipped, as a "*" glob would do.
        my @names = sort grep { ! /^\./ } readdir($dh);
        closedir($dh);

        push(@files, map { "$dir/$_" } grep { /\.transcripts\.fasta\z/ } @names);
        push(@files, map { "$dir/$_" } grep { /\.proteins\.fasta\z/ } @names);
    }
}

# $groups{<source>}{transcripts|proteins} = [files]; <source> is the file
# name component directly in front of ".transcripts.fasta" or
# ".proteins.fasta".  Files matching neither suffix go to @ungrouped, which
# is kept apart from %groups so a source literally named "all" cannot
# collide with it.
my %groups;
my @ungrouped;

foreach my $file (@files){
    # "/" is excluded from the key so it never spans a directory separator.
    if($file =~ /([^\.\/]+)\.(transcripts|proteins)\.fasta$/){
        push(@{$groups{$1}{$2}}, $file);
    }
    else{
        push(@ungrouped, $file);
    }
}

if (@ungrouped){
    dump_it([sort @ungrouped], "$outfile.fasta");
}

foreach my $key (sort keys %groups){
    #maker standard naming convention
    my $source = 'maker';
    $source .= ".$key" if($key ne 'maker');

    #protein and transcript files; a source may have only one of the two
    foreach my $type (qw(transcripts proteins)){
        my $list = $groups{$key}{$type} or next;
        dump_it([sort @{$list}], "$outfile.all.$source.$type.fasta");
    }
}
112
# dump_it(\@files, $name)
#
# Concatenates the listed files, in the given order, into the file $name.
# $name is created (or truncated) first, so an empty list yields an empty
# output file.
#
#   $files - array reference of input file paths
#   $name  - path of the output file
#
# Dies if the output cannot be opened or written, or if an input file does
# not exist or cannot be read.  The copy is done in Perl rather than through
# a shell "cat", so file names containing spaces or shell metacharacters are
# handled literally.
sub dump_it {
    my $files = shift;
    my $name = shift;

    open(my $out, '>', $name)
        or die "ERROR: Can't open \'$name\' for writing: $!\n";
    binmode($out);   # byte-for-byte copy

    foreach my $file (@{$files}){
        die "ERROR: The file \'$file\' does not exist\n" if (! -e $file);

        open(my $in, '<', $file)
            or die "ERROR: Can't open \'$file\' for reading: $!\n";
        binmode($in);

        my $buffer;
        while (1){
            my $count = read($in, $buffer, 65536);
            die "ERROR: Can't read from \'$file\': $!\n" if (! defined $count);
            last if ($count == 0);
            print {$out} $buffer
                or die "ERROR: Can't write to \'$name\': $!\n";
        }
        close($in);
    }

    # Buffered write errors only surface when the handle is closed.
    close($out) or die "ERROR: Can't close \'$name\': $!\n";

    return;
}
Note: See TracBrowser for help on using the browser.