| 1 |
|
|---|
| 2 |
use strict; |
|---|
| 3 |
use warnings; |
|---|
| 4 |
use Getopt::Long; |
|---|
| 5 |
|
|---|
| 6 |
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 |
my $usage = " |
|---|
| 10 |
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
; |
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
my ($help); |
|---|
| 24 |
my $opt_success = GetOptions('help' => \$help, |
|---|
| 25 |
); |
|---|
| 26 |
|
|---|
| 27 |
die $usage if $help || ! $opt_success; |
|---|
| 28 |
|
|---|
| 29 |
my ($gff_file, $ipr_file) = @ARGV; |
|---|
| 30 |
die $usage unless $gff_file && $ipr_file; |
|---|
| 31 |
|
|---|
| 32 |
my $gene_map = parse_genes($gff_file); |
|---|
| 33 |
my $ipr = parse_ipr_scan($ipr_file, $gene_map); |
|---|
| 34 |
|
|---|
| 35 |
open (my $IN, '<', $gff_file) or die "Can't open $gff_file for reading\n$!\n"; |
|---|
| 36 |
unlink($gff_file); |
|---|
| 37 |
open (my $OUT, '>', $gff_file) or die "Can't open $gff_file for writing\n$!\n"; |
|---|
| 38 |
|
|---|
| 39 |
LINE: |
|---|
| 40 |
while (my $line = <$IN>) { |
|---|
| 41 |
|
|---|
| 42 |
chomp $line; |
|---|
| 43 |
my ($seq, $source, $type, $start, $end, $score, $strand, $phase, |
|---|
| 44 |
$attrb) = split /\t/, $line; |
|---|
| 45 |
|
|---|
| 46 |
if ($line =~ /^\s*\ |
|---|
| 47 |
! $attrb || |
|---|
| 48 |
($type ne 'mRNA' && $type ne 'gene') |
|---|
| 49 |
) { |
|---|
| 50 |
print $OUT "$line\n"; |
|---|
| 51 |
next LINE; |
|---|
| 52 |
} |
|---|
| 53 |
|
|---|
| 54 |
my ($id) = $attrb =~ /ID\s*=\s*(.*?);/; |
|---|
| 55 |
|
|---|
| 56 |
my %db_xrefs; |
|---|
| 57 |
my $db_xref = $ipr->{$id}{dbxref}; |
|---|
| 58 |
for my $db_name (keys %{$db_xref}) { |
|---|
| 59 |
for my $db_data (@{$db_xref->{$db_name}}) { |
|---|
| 60 |
next if $db_data->{id} eq 'NULL'; |
|---|
| 61 |
$db_xrefs{"$db_name:" . $db_data->{id}}++; |
|---|
| 62 |
} |
|---|
| 63 |
} |
|---|
| 64 |
my $db_xref_text = ''; |
|---|
| 65 |
my @db_xrefs = sort keys %db_xrefs; |
|---|
| 66 |
if (@db_xrefs) { |
|---|
| 67 |
$db_xref_text = 'Dbxref='; |
|---|
| 68 |
$db_xref_text .= join ',', @db_xrefs; |
|---|
| 69 |
$db_xref_text .= ';'; |
|---|
| 70 |
} |
|---|
| 71 |
|
|---|
| 72 |
my $ontology_text = ''; |
|---|
| 73 |
if ($ipr->{$id}{go}) { |
|---|
| 74 |
$ontology_text = 'Ontology_term='; |
|---|
| 75 |
my @go_terms = sort keys %{$ipr->{$id}{go}}; |
|---|
| 76 |
$ontology_text .= join ',', @go_terms; |
|---|
| 77 |
$ontology_text .= ';'; |
|---|
| 78 |
}; |
|---|
| 79 |
|
|---|
| 80 |
$line =~ s/;$//; |
|---|
| 81 |
$line .= ';'; |
|---|
| 82 |
$line .= "$db_xref_text$ontology_text"; |
|---|
| 83 |
print $OUT "$line\n"; |
|---|
| 84 |
} |
|---|
| 85 |
|
|---|
| 86 |
|
|---|
| 87 |
|
|---|
| 88 |
|
|---|
| 89 |
sub parse_genes { |
|---|
| 90 |
|
|---|
| 91 |
my $file = shift; |
|---|
| 92 |
|
|---|
| 93 |
open (my $IN, '<', $file) or die "Can't open $file for reading\n$!\n"; |
|---|
| 94 |
|
|---|
| 95 |
my %gene_map; |
|---|
| 96 |
LINE: |
|---|
| 97 |
while (my $line = <$IN>) { |
|---|
| 98 |
|
|---|
| 99 |
chomp $line; |
|---|
| 100 |
my ($seq, $source, $type, $start, $end, $score, $strand, $phase, |
|---|
| 101 |
$attrb) = split /\t/, $line; |
|---|
| 102 |
|
|---|
| 103 |
if ($line =~ /^\s*\ |
|---|
| 104 |
! $attrb || |
|---|
| 105 |
($type ne 'mRNA') |
|---|
| 106 |
) { |
|---|
| 107 |
next LINE; |
|---|
| 108 |
} |
|---|
| 109 |
|
|---|
| 110 |
my ($id) = $attrb =~ /ID\s*=\s*(.*?);/; |
|---|
| 111 |
my ($parent) = $attrb =~ /Parent\s*=\s*(.*?);/; |
|---|
| 112 |
|
|---|
| 113 |
$gene_map{$id} = $parent; |
|---|
| 114 |
} |
|---|
| 115 |
return \%gene_map; |
|---|
| 116 |
} |
|---|
| 117 |
|
|---|
| 118 |
sub parse_ipr_scan { |
|---|
| 119 |
|
|---|
| 120 |
my ($file, $gene_map) = @_; |
|---|
| 121 |
|
|---|
| 122 |
open($IN, '<', $file) or die "Can't open $file for reading\n$!\n"; |
|---|
| 123 |
|
|---|
| 124 |
my %db_map = (BlastProDom => 'ProDom', |
|---|
| 125 |
FPrintScan => 'PRINTS', |
|---|
| 126 |
Gene3D => 'Gene3D', |
|---|
| 127 |
HMMPanther => 'PANTHER', |
|---|
| 128 |
HMMPfam => 'Pfam', |
|---|
| 129 |
HMMPIR => 'PIR', |
|---|
| 130 |
HMMSmart => 'SMART', |
|---|
| 131 |
HMMTigr => 'JCVI_TIGRFAMS', |
|---|
| 132 |
PatternScan => 'Prosite', |
|---|
| 133 |
ProfileScan => 'Prosite', |
|---|
| 134 |
); |
|---|
| 135 |
|
|---|
| 136 |
|
|---|
| 137 |
my %ipr; |
|---|
| 138 |
while (<$IN>) { |
|---|
| 139 |
|
|---|
| 140 |
my ($seq_id, $crc_chksm, $aa_len, $method, $db_id, $db_desc, |
|---|
| 141 |
$start, $end, $e_val, $status, $date, $ipr_id, |
|---|
| 142 |
$ipr_desc, $go_text) = split /\t/, $_; |
|---|
| 143 |
|
|---|
| 144 |
next if grep {$method eq $_} qw(Seg Coil SignalPHMM); |
|---|
| 145 |
|
|---|
| 146 |
$method = $db_map{$method} || $method; |
|---|
| 147 |
|
|---|
| 148 |
push @{$ipr{$seq_id}{dbxref}{$method}}, {id => $db_id, |
|---|
| 149 |
desc => $db_desc}; |
|---|
| 150 |
push @{$ipr{$seq_id}{dbxref}{InterPro}}, {id => $ipr_id, |
|---|
| 151 |
desc => $ipr_desc}; |
|---|
| 152 |
my @go_ids = $go_text =~ /\((GO:\d+)\)/g; |
|---|
| 153 |
map {$ipr{$seq_id}{go}{$_}++} @go_ids; |
|---|
| 154 |
|
|---|
| 155 |
|
|---|
| 156 |
my $gene_id = $gene_map->{$seq_id}; |
|---|
| 157 |
push @{$ipr{$gene_id}{dbxref}{$method}}, {id => $db_id, |
|---|
| 158 |
desc => $db_desc}; |
|---|
| 159 |
push @{$ipr{$gene_id}{dbxref}{InterPro}}, {id => $ipr_id, |
|---|
| 160 |
desc => $ipr_desc}; |
|---|
| 161 |
map {$ipr{$gene_id}{go}{$_}++} @go_ids; |
|---|
| 162 |
} |
|---|
| 163 |
return \%ipr; |
|---|
| 164 |
} |
|---|