| 1 |
|
|---|
| 2 |
use strict; |
|---|
| 3 |
use warnings; |
|---|
| 4 |
use Getopt::Long; |
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
my $usage = " |
|---|
| 11 |
|
|---|
| 12 |
Synopsis: |
|---|
| 13 |
|
|---|
| 14 |
maker_functional_fasta uniprot-sprot.fasta uniprot-sprot.blast.out \ |
|---|
| 15 |
maker.transcripts.fasta maker.proteins.fasta .... |
|---|
| 16 |
|
|---|
| 17 |
Description: |
|---|
| 18 |
|
|---|
| 19 |
This script will take a uniprot-sprot fasta file, a wu-blast -mformat |
|---|
| 20 |
2 blast output file from a blast of maker proteins against |
|---|
| 21 |
uniprot-sprot proteins and add functional annotations to the maker |
|---|
| 22 |
fasta header changing it from this: |
|---|
| 23 |
|
|---|
| 24 |
>PYU1_T015316 transcript offset:0 AED:0.0476397966594044 \ |
|---|
| 25 |
QI:0|-1|0|1|-1|1|1|0|255 |
|---|
| 26 |
|
|---|
| 27 |
to this: |
|---|
| 28 |
|
|---|
| 29 |
>PYU1_T015316 transcript name:\"Serine-aspartate repeat-containing protein I \ |
|---|
| 30 |
(Staphylococcus saprophyticus)\" offset:0 AED:0.0476397966594044 \ |
|---|
| 31 |
QI:0|-1|0|1|-1|1|1|0|255 |
|---|
| 32 |
|
|---|
| 33 |
"; |
|---|
| 34 |
|
|---|
| 35 |
|
|---|
| 36 |
my ($help); |
|---|
| 37 |
my $opt_success = GetOptions('help' => \$help, |
|---|
| 38 |
); |
|---|
| 39 |
|
|---|
| 40 |
die $usage if $help || ! $opt_success; |
|---|
| 41 |
|
|---|
| 42 |
my ($uniprot_file, $blast_file, @fasta_files) = @ARGV; |
|---|
| 43 |
die $usage unless $uniprot_file && $blast_file && @fasta_files; |
|---|
| 44 |
|
|---|
| 45 |
my $blast = parse_blast($uniprot_file, $blast_file); |
|---|
| 46 |
|
|---|
| 47 |
for my $file (@fasta_files) { |
|---|
| 48 |
|
|---|
| 49 |
open (my $IN, '<', $file) or die "Can't open $file for reading\n$!\n"; |
|---|
| 50 |
|
|---|
| 51 |
LINE: |
|---|
| 52 |
while (<$IN>) { |
|---|
| 53 |
|
|---|
| 54 |
if ($_ !~ /^>/) { |
|---|
| 55 |
print $_; |
|---|
| 56 |
next LINE; |
|---|
| 57 |
} |
|---|
| 58 |
|
|---|
| 59 |
my ($id, $mol_type, @attrbs) = split /\s+/, $_; |
|---|
| 60 |
$id =~ s/^>//; |
|---|
| 61 |
|
|---|
| 62 |
my $func = $blast->{$id} || 'Name:"Protein of unknown function"'; |
|---|
| 63 |
|
|---|
| 64 |
my $header = ">$id $mol_type "; |
|---|
| 65 |
$header .= "$func " if $func; |
|---|
| 66 |
$header .= join " ", @attrbs if @attrbs; |
|---|
| 67 |
|
|---|
| 68 |
print "$header\n"; |
|---|
| 69 |
|
|---|
| 70 |
} |
|---|
| 71 |
} |
|---|
| 72 |
|
|---|
| 73 |
|
|---|
| 74 |
|
|---|
| 75 |
|
|---|
| 76 |
sub parse_blast{ |
|---|
| 77 |
|
|---|
| 78 |
my ($fasta_file, $blast_file) = @_; |
|---|
| 79 |
|
|---|
| 80 |
open(my $IN, '<', $fasta_file) or |
|---|
| 81 |
die "Can't open $fasta_file for reading\n$!\n"; |
|---|
| 82 |
|
|---|
| 83 |
my %fasta; |
|---|
| 84 |
while (<$IN>) { |
|---|
| 85 |
|
|---|
| 86 |
next unless /^>/; |
|---|
| 87 |
|
|---|
| 88 |
my ($id, $desc, $org, $name); |
|---|
| 89 |
|
|---|
| 90 |
if (/>(\S+)\s+(.*?)\s+OS=(.*?)\s+(GN=(.*?)\s+)?PE=/) { |
|---|
| 91 |
$id = $1; |
|---|
| 92 |
$desc = $2; |
|---|
| 93 |
$org = $3; |
|---|
| 94 |
$name = $5 || ''; |
|---|
| 95 |
} |
|---|
| 96 |
else { |
|---|
| 97 |
warn "Can't parse details from FASTA header: $_\n"; |
|---|
| 98 |
} |
|---|
| 99 |
|
|---|
| 100 |
my $func = ''; |
|---|
| 101 |
$func .= "Similar to $name: " if $name; |
|---|
| 102 |
if ($func =~ /Similar to/) { |
|---|
| 103 |
$func .= "$desc " if $desc; |
|---|
| 104 |
} |
|---|
| 105 |
else { |
|---|
| 106 |
$func .= "Similar to $desc " if $desc; |
|---|
| 107 |
} |
|---|
| 108 |
$func .= "($org)" if $org; |
|---|
| 109 |
$func = ('Name:"' . $func . '"') if $func; |
|---|
| 110 |
|
|---|
| 111 |
$fasta{$id} = $func; |
|---|
| 112 |
|
|---|
| 113 |
print ''; |
|---|
| 114 |
} |
|---|
| 115 |
close $IN; |
|---|
| 116 |
|
|---|
| 117 |
open($IN, '<', $blast_file) or |
|---|
| 118 |
die "Can't open $blast_file for reading\n$!\n"; |
|---|
| 119 |
|
|---|
| 120 |
my (%blast, %seen); |
|---|
| 121 |
while (<$IN>) { |
|---|
| 122 |
|
|---|
| 123 |
my ($qid, $sid, $e_val) = split; |
|---|
| 124 |
|
|---|
| 125 |
next if $seen{$qid}; |
|---|
| 126 |
|
|---|
| 127 |
$seen{$qid}++; |
|---|
| 128 |
|
|---|
| 129 |
$blast{$qid} = $fasta{$sid} || ''; |
|---|
| 130 |
} |
|---|
| 131 |
return \%blast; |
|---|
| 132 |
} |
|---|