| 1 |
|
|---|
| 2 |
use strict; |
|---|
| 3 |
use warnings; |
|---|
| 4 |
use Getopt::Long; |
|---|
| 5 |
use URI::Escape; |
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
my $usage = " |
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
|
|---|
| 24 |
|
|---|
| 25 |
; |
|---|
| 26 |
|
|---|
| 27 |
|
|---|
| 28 |
my ($uniprot_file, $blast_file, $gff_file) = @ARGV; |
|---|
| 29 |
die $usage unless $uniprot_file && $blast_file && $gff_file; |
|---|
| 30 |
|
|---|
| 31 |
my $gene_map = parse_genes($gff_file); |
|---|
| 32 |
my $blast = parse_blast($uniprot_file, $blast_file, $gene_map); |
|---|
| 33 |
|
|---|
| 34 |
open (my $IN, '<', $gff_file) or die "Can't open $gff_file for reading\n$!\n"; |
|---|
| 35 |
|
|---|
| 36 |
LINE: |
|---|
| 37 |
while (my $line = <$IN>) { |
|---|
| 38 |
|
|---|
| 39 |
chomp $line; |
|---|
| 40 |
my ($seq, $source, $type, $start, $end, $score, $strand, $phase, |
|---|
| 41 |
$attrb) = split /\t/, $line; |
|---|
| 42 |
|
|---|
| 43 |
if ($line =~ /^\s*\ |
|---|
| 44 |
! $attrb || |
|---|
| 45 |
($type ne 'mRNA' && $type ne 'gene') |
|---|
| 46 |
) { |
|---|
| 47 |
print "$line\n"; |
|---|
| 48 |
next LINE; |
|---|
| 49 |
} |
|---|
| 50 |
|
|---|
| 51 |
my ($id) = $attrb =~ /ID\s*=\s*(.*?);/; |
|---|
| 52 |
|
|---|
| 53 |
my $note; |
|---|
| 54 |
if ($blast->{$id}{note}) { |
|---|
| 55 |
$note .= $blast->{$id}{note}; |
|---|
| 56 |
} |
|---|
| 57 |
else { |
|---|
| 58 |
$note .= 'Protein of unknown function'; |
|---|
| 59 |
} |
|---|
| 60 |
|
|---|
| 61 |
$note = uri_escape($note, ';=%&,'); |
|---|
| 62 |
|
|---|
| 63 |
$note = "Note=" . $note; |
|---|
| 64 |
|
|---|
| 65 |
$line =~ s/;$//; |
|---|
| 66 |
$line .= ';'; |
|---|
| 67 |
$line .= "$note;"; |
|---|
| 68 |
|
|---|
| 69 |
|
|---|
| 70 |
|
|---|
| 71 |
|
|---|
| 72 |
|
|---|
| 73 |
|
|---|
| 74 |
print "$line\n"; |
|---|
| 75 |
} |
|---|
| 76 |
|
|---|
| 77 |
|
|---|
| 78 |
|
|---|
| 79 |
sub parse_genes { |
|---|
| 80 |
|
|---|
| 81 |
my $file = shift; |
|---|
| 82 |
|
|---|
| 83 |
open (my $IN, '<', $file) or die "Can't open $file for reading\n$!\n"; |
|---|
| 84 |
|
|---|
| 85 |
my %gene_map; |
|---|
| 86 |
LINE: |
|---|
| 87 |
while (my $line = <$IN>) { |
|---|
| 88 |
|
|---|
| 89 |
chomp $line; |
|---|
| 90 |
my ($seq, $source, $type, $start, $end, $score, $strand, $phase, |
|---|
| 91 |
$attrb) = split /\t/, $line; |
|---|
| 92 |
|
|---|
| 93 |
if ($line =~ /^\s*\ |
|---|
| 94 |
! $attrb || |
|---|
| 95 |
($type ne 'mRNA') |
|---|
| 96 |
) { |
|---|
| 97 |
next LINE; |
|---|
| 98 |
} |
|---|
| 99 |
|
|---|
| 100 |
my ($id) = $attrb =~ /ID\s*=\s*(.*?);/; |
|---|
| 101 |
my ($parent) = $attrb =~ /Parent\s*=\s*(.*?);/; |
|---|
| 102 |
|
|---|
| 103 |
$gene_map{$id} = $parent; |
|---|
| 104 |
} |
|---|
| 105 |
return \%gene_map; |
|---|
| 106 |
} |
|---|
| 107 |
|
|---|
| 108 |
sub parse_blast{ |
|---|
| 109 |
|
|---|
| 110 |
my ($fasta_file, $blast_file, $gene_map) = @_; |
|---|
| 111 |
|
|---|
| 112 |
open(my $IN, '<', $fasta_file) or |
|---|
| 113 |
die "Can't open $fasta_file for reading\n$!\n"; |
|---|
| 114 |
|
|---|
| 115 |
my %fasta; |
|---|
| 116 |
while (<$IN>) { |
|---|
| 117 |
|
|---|
| 118 |
next unless /^>/; |
|---|
| 119 |
|
|---|
| 120 |
my ($id, $desc, $org, $name); |
|---|
| 121 |
|
|---|
| 122 |
if (/>(\S+)\s+(.*?)\s+OS=(.*?)\s+(GN=(.*?)\s+)?PE=/) { |
|---|
| 123 |
$id = $1; |
|---|
| 124 |
$desc = $2; |
|---|
| 125 |
$org = $3; |
|---|
| 126 |
$name = $5 || ''; |
|---|
| 127 |
} |
|---|
| 128 |
else { |
|---|
| 129 |
warn "Can't parse details from FASTA header: $_\n"; |
|---|
| 130 |
} |
|---|
| 131 |
|
|---|
| 132 |
my $note = ''; |
|---|
| 133 |
$note .= "Similar to " if $name || $desc; |
|---|
| 134 |
$note .= "$name: " if $name; |
|---|
| 135 |
$note .= "$desc " if $desc; |
|---|
| 136 |
$note .= "($org)" if $org; |
|---|
| 137 |
|
|---|
| 138 |
$fasta{$id}{note} = $note; |
|---|
| 139 |
|
|---|
| 140 |
$fasta{$id}{name} = $name; |
|---|
| 141 |
|
|---|
| 142 |
print ''; |
|---|
| 143 |
} |
|---|
| 144 |
close $IN; |
|---|
| 145 |
|
|---|
| 146 |
open($IN, '<', $blast_file) or |
|---|
| 147 |
die "Can't open $blast_file for reading\n$!\n"; |
|---|
| 148 |
|
|---|
| 149 |
my (%blast, %seen); |
|---|
| 150 |
while (<$IN>) { |
|---|
| 151 |
|
|---|
| 152 |
my ($qid, $sid, $e_val) = split; |
|---|
| 153 |
|
|---|
| 154 |
if (! $seen{$qid}) { |
|---|
| 155 |
$blast{$qid} = $fasta{$sid} || ''; |
|---|
| 156 |
$seen{$qid}++; |
|---|
| 157 |
} |
|---|
| 158 |
|
|---|
| 159 |
my $gene_id = $gene_map->{$qid}; |
|---|
| 160 |
if ($gene_id && ! $seen{$gene_id}) { |
|---|
| 161 |
$blast{$gene_id} = $fasta{$sid} || ''; |
|---|
| 162 |
$seen{$gene_id}++; |
|---|
| 163 |
} |
|---|
| 164 |
} |
|---|
| 165 |
return \%blast; |
|---|
| 166 |
} |
|---|