#! /usr/bin/perl -w

use strict;
use Getopt::Long;
use File::Temp qw(tempfile);

# Help text, printed for -help/-? and when neither a datastore log nor any
# input files are given.
my $usage = "

Synopsis:

gff3_merge -d maker_datastore_index.log
gff3_merge -o genome.all.gff <gff3_file1> <gff3_file2> ...

Descriptions:

This script will take a MAKER datastore index log file, extract all
the relevant GFF3 files and combined GFF3 file. The script can also
combine other correctly formated GFF3 files. For this to work
properly you need to be in the same directory as the datastore index.

Options:

-d The location of the MAKER datastore index log file.
-o Alternate base name for the output files.

";

my $datastore;    # path to the MAKER datastore index log (-d)
my @files;        # GFF3 files that will be merged
my $outfile;      # path of the merged output file (-o)

# The option was originally declared under the misspelled name "datastor".
# Getopt::Long only auto-abbreviates to *shorter* prefixes, so the full word
# --datastore was rejected.  The correct spelling is now the primary name and
# the old spelling is kept as an alias so existing invocations keep working.
GetOptions(
    "datastore|datastor|d=s" => \$datastore,
    "o=s"                    => \$outfile,
    "help|?"                 => sub { print $usage; exit(); },
);

# Without a datastore log, every remaining argument is a GFF3 file to merge.
if (! $datastore) {
    @files = @ARGV;
}

# Nothing to do: show the help text instead of writing an empty output file.
if (! $datastore && ! @files) {
    print $usage;
    exit();
}

# Choose a default output name when -o was not supplied.
if (! defined $outfile || $outfile eq '') {
    my $base;

    # Derive the name from the datastore log's file name, if there is one.
    if ($datastore) {
        ($base) = $datastore =~ /([^\/]+)$/;
    }
    $base =~ s/_master_datastore_index\.log// if defined $base;

    # Fall back to the generic name when no usable base name exists (plain
    # file-list mode, or a datastore path with no file-name component).
    # Previously ".gff" was appended before the fallback was tested, so the
    # fallback could never trigger and the output was written to ".gff".
    if (defined $base && $base ne '') {
        $outfile = "$base.gff";
    }
    else {
        $outfile = "genome.all.gff";
    }
}

# When a datastore index log is given, collect one GFF3 path per distinct
# FINISHED entry.  Each entry's second tab-separated field is a directory, and
# the GFF3 file is looked up inside it under the directory's own name with a
# ".gff" suffix.
die "ERROR: The file \'$datastore\' does not exist\n" if ($datastore && ! -e $datastore);
if ($datastore) {
    open(my $LOG, '<', $datastore)
        or die "ERROR: Could not open file \'$datastore\': $!\n";

    # uniq the entries: identical FINISHED lines are only used once
    my %seen;
    while (my $entry = <$LOG>) {
        next unless ($entry =~ /FINISHED/);
        next if $seen{$entry}++;
        chomp $entry;

        # Only the directory column is needed from the entry.
        my (undef, $dir) = split("\t", $entry);

        # Strip every trailing slash so the last path component can be read.
        $dir =~ s/\/+$// if defined $dir;

        # Capture the directory's own name explicitly rather than relying on
        # $1, which keeps its value from an earlier entry when a match fails.
        my $name;
        ($name) = $dir =~ /([^\/]+)$/ if defined $dir;
        if (! defined $name) {
            warn "WARNING: Skipping malformed datastore entry: $entry\n";
            next;
        }

        push(@files, "$dir/$name.gff");
    }
    close($LOG);
}

# Merge the input files in sorted order into $outfile.
@files = sort @files;

open(my $GFF, '>', $outfile)
    or die "ERROR: Could not open file \'$outfile\' for writing: $!\n";
print $GFF "\#\#gff-version 3\n";

# Annotation lines and FASTA lines are staged in separate temporary files so
# that the merged output holds the header pragmas first, then all annotation
# lines, then a single FASTA section.  UNLINK => 1 has File::Temp remove the
# files at program exit, including when the script dies part-way through.
my ($ANN, $ann_file) = tempfile(UNLINK => 1);
my ($FAS, $fas_file) = tempfile(UNLINK => 1);
print $FAS "\#\#FASTA\n";

my %uniq;    # genome-build / sequence-region pragmas already written
foreach my $file (@files) {
    die "ERROR: The file \'$file\' does not exist\n" if (! -e $file);
    open(my $IN, '<', $file) or die "ERROR: Could not open file \'$file\'\n";

    # Every input file starts in its annotation section.
    my $FH = $ANN;

    while (defined(my $line = <$IN>)) {
        # The version pragma is written exactly once, at the top of the output.
        next if ($line =~ /^\#\#gff-version 3/);

        # Identical genome-build pragmas are emitted only once.
        if ($line =~ /^\#\#genome-build/) {
            next if exists $uniq{$line};
            $uniq{$line}++;
            print $GFF $line;
            next;
        }

        # A repeated sequence-region pragma is treated as a fatal error.
        if ($line =~ /^\#\#sequence-region/) {
            die "ERROR: This contig has already been added\: $line\n" if exists $uniq{$line};
            $uniq{$line}++;
            print $GFF $line;
            next;
        }

        # From the FASTA pragma (or the first sequence header) onwards, the
        # rest of this file goes to the FASTA staging file.
        if ($line =~ /^\#\#FASTA/) {
            $FH = $FAS;
            next;
        }
        if ($line =~ /^>/) {
            $FH = $FAS;
        }

        # Drop empty lines only; test the length so a line whose entire
        # content is "0" is kept rather than being treated as false.
        chomp $line;
        print $FH $line . "\n" if (length $line);
    }
    close($IN);
}

# Append the staged annotation and FASTA data to the output by reading the
# temporary files back in-process.  This avoids passing $outfile through a
# shell ("cat ... >> $outfile"), which broke on names containing spaces or
# shell metacharacters and went unnoticed when it failed.
foreach my $stage ([$ANN, $ann_file], [$FAS, $fas_file]) {
    my ($STAGE, $stage_file) = @{$stage};
    seek($STAGE, 0, 0)
        or die "ERROR: Could not rewind temporary file \'$stage_file\': $!\n";
    while (defined(my $line = <$STAGE>)) {
        print $GFF $line;
    }
    close($STAGE);
}

# Buffered write errors only surface when the handle is closed.
close($GFF) or die "ERROR: Could not finish writing \'$outfile\': $!\n";