| 1 |
#! /usr/bin/perl -w |
|---|
| 2 |
|
|---|
| 3 |
use strict; |
|---|
| 4 |
use URI::Escape; |
|---|
| 5 |
|
|---|
| 6 |
my $file = shift; |
|---|
| 7 |
my $gff3 = shift; |
|---|
| 8 |
|
|---|
| 9 |
my %index; |
|---|
| 10 |
my %list; |
|---|
| 11 |
my %GO; |
|---|
| 12 |
|
|---|
| 13 |
open(IN, "< $file"); |
|---|
| 14 |
while(defined(my $line = <IN>)){ |
|---|
| 15 |
chomp $line; |
|---|
| 16 |
my @F = split("\t", $line); |
|---|
| 17 |
|
|---|
| 18 |
if($index{$F[0]}{$F[11]}){ |
|---|
| 19 |
my $ok = 0; |
|---|
| 20 |
F1: foreach my $grp (@{$index{$F[0]}{$F[11]}}){ |
|---|
| 21 |
F2: foreach my $m (@$grp){ |
|---|
| 22 |
if(overlap($m->[0], $m->[1], $F[6], $F[7]) > 0.5){ |
|---|
| 23 |
#add new member |
|---|
| 24 |
$ok = 1; |
|---|
| 25 |
push(@$grp, [$F[6], $F[7]]); |
|---|
| 26 |
last F1; |
|---|
| 27 |
} |
|---|
| 28 |
} |
|---|
| 29 |
} |
|---|
| 30 |
|
|---|
| 31 |
#make new group |
|---|
| 32 |
push(@{$index{$F[0]}{$F[11]}}, [[$F[6], $F[7]]]) if(! $ok); |
|---|
| 33 |
} |
|---|
| 34 |
else{ |
|---|
| 35 |
#make new group |
|---|
| 36 |
push(@{$index{$F[0]}{$F[11]}}, [[$F[6], $F[7]]]); |
|---|
| 37 |
} |
|---|
| 38 |
|
|---|
| 39 |
my @go = $line =~ /(GO\:\d+)/; |
|---|
| 40 |
|
|---|
| 41 |
$F[12] =~ s/\t/ /g; |
|---|
| 42 |
$F[12] = uri_escape($F[12], '\,\;\=\%\&'); |
|---|
| 43 |
$list{$F[11]} = $F[12]; |
|---|
| 44 |
$GO{$F[11]} = join(",", @go); |
|---|
| 45 |
} |
|---|
| 46 |
close(IN); |
|---|
| 47 |
|
|---|
| 48 |
#colapse groups |
|---|
| 49 |
while(my $t = each %index){ |
|---|
| 50 |
while(my $d = each %{$index{$t}}){ |
|---|
| 51 |
foreach my $grp (@{$index{$t}{$d}}){ |
|---|
| 52 |
my $B; |
|---|
| 53 |
my $E; |
|---|
| 54 |
foreach my $m (@$grp){ |
|---|
| 55 |
$B = $m->[0] if(!$B || $m->[0] < $B); |
|---|
| 56 |
$E = $m->[1] if(!$E || $m->[1] > $E); |
|---|
| 57 |
} |
|---|
| 58 |
$grp = [$B, $E]; |
|---|
| 59 |
} |
|---|
| 60 |
} |
|---|
| 61 |
} |
|---|
| 62 |
|
|---|
| 63 |
while(my $t = each %index){ |
|---|
| 64 |
while(my $d = each %{$index{$t}}){ |
|---|
| 65 |
my $array = $index{$t}{$d}; |
|---|
| 66 |
my @keepers; |
|---|
| 67 |
for(my $i = 0; $i < @$array; $i++){ |
|---|
| 68 |
my $grp1 = $array->[$i]; |
|---|
| 69 |
next if(! $grp1); |
|---|
| 70 |
my $B = $grp1->[0]; |
|---|
| 71 |
my $E = $grp1->[1]; |
|---|
| 72 |
for(my $j = $i+1; $j < @$array; $j++){ |
|---|
| 73 |
my $grp2 = $array->[$j]; |
|---|
| 74 |
next if(! $grp2); |
|---|
| 75 |
if(overlap($grp1->[0], $grp1->[1], $grp2->[0], $grp2->[1]) > 0.5){ |
|---|
| 76 |
$B = $grp2->[0] if($grp2->[0] < $B); |
|---|
| 77 |
$E = $grp2->[1] if($grp2->[1] > $E); |
|---|
| 78 |
|
|---|
| 79 |
$array->[$j] = undef; |
|---|
| 80 |
} |
|---|
| 81 |
} |
|---|
| 82 |
push(@keepers, [$B, $E]) |
|---|
| 83 |
} |
|---|
| 84 |
$index{$t}{$d} = \@keepers; |
|---|
| 85 |
} |
|---|
| 86 |
} |
|---|
| 87 |
|
|---|
| 88 |
#parse gff3 |
|---|
| 89 |
my %details; |
|---|
| 90 |
my %mRNAs; |
|---|
| 91 |
open(IN, "< $gff3"); |
|---|
| 92 |
while(defined(my $line = <IN>)){ |
|---|
| 93 |
last if($line =~ /\#\#FASTA/); |
|---|
| 94 |
next if($line =~ /^\#/); |
|---|
| 95 |
next unless($line =~ /\tCDS\t/); |
|---|
| 96 |
|
|---|
| 97 |
chomp $line; |
|---|
| 98 |
my @F = split("\t", $line); |
|---|
| 99 |
if($F[2] eq 'CDS'){ |
|---|
| 100 |
my ($parent) = $F[8] =~ /Parent\=([^\;\n]+)/; |
|---|
| 101 |
push(@{$mRNAs{$parent}}, [$F[3], $F[4]]); |
|---|
| 102 |
$details{$parent}{seqid} = $F[0]; |
|---|
| 103 |
$details{$parent}{strand} = $F[6]; |
|---|
| 104 |
} |
|---|
| 105 |
} |
|---|
| 106 |
close(IN); |
|---|
| 107 |
|
|---|
| 108 |
#sort CDSs |
|---|
| 109 |
while(my $key = each %mRNAs){ |
|---|
| 110 |
@{$mRNAs{$key}} = sort {$a->[0] <=> $b->[0]} @{$mRNAs{$key}}; |
|---|
| 111 |
} |
|---|
| 112 |
|
|---|
| 113 |
#make gff3 lines |
|---|
| 114 |
my $id = 0; |
|---|
| 115 |
while(my $t = each %index){ |
|---|
| 116 |
my $CDSs = $mRNAs{$t}; |
|---|
| 117 |
while(my $d = each %{$index{$t}}){ |
|---|
| 118 |
my $set = $index{$t}{$d}; |
|---|
| 119 |
|
|---|
| 120 |
foreach my $hit (@$set){ |
|---|
| 121 |
my @match; |
|---|
| 122 |
|
|---|
| 123 |
my $leader = 0; |
|---|
| 124 |
my $started = 0; |
|---|
| 125 |
foreach my $CDS (@$CDSs){ |
|---|
| 126 |
my $offset = $CDS->[0] - 1; |
|---|
| 127 |
|
|---|
| 128 |
my $B = $hit->[0] + $offset - $leader; |
|---|
| 129 |
my $E = $hit->[1] + $offset - $leader; |
|---|
| 130 |
|
|---|
| 131 |
my $cB = $CDS->[0]; |
|---|
| 132 |
my $cE = $CDS->[1]; |
|---|
| 133 |
|
|---|
| 134 |
if($cB <= $B && $B <= $cE && $cB <= $E && $E <= $cE){ |
|---|
| 135 |
push(@match, [$B, $E, $offset - $leader]); |
|---|
| 136 |
last; |
|---|
| 137 |
} |
|---|
| 138 |
elsif($cB <= $B && $B <= $cE){ |
|---|
| 139 |
push(@match, [$B, $cE, $offset - $leader]); |
|---|
| 140 |
$started = 1; |
|---|
| 141 |
} |
|---|
| 142 |
elsif($cB <= $E && $E <= $cE){ |
|---|
| 143 |
push(@match, [$cB, $E, $offset - $leader]); |
|---|
| 144 |
last; |
|---|
| 145 |
} |
|---|
| 146 |
elsif($started){ |
|---|
| 147 |
push(@match, [$cB, $cE, $offset - $leader]); |
|---|
| 148 |
} |
|---|
| 149 |
|
|---|
| 150 |
$leader += abs($cE - $cB) + 1; |
|---|
| 151 |
} |
|---|
| 152 |
|
|---|
| 153 |
@match = sort {$a->[0] <=> $b->[0]} @match; |
|---|
| 154 |
|
|---|
| 155 |
#match begin and end in gff3 |
|---|
| 156 |
my $mB = $match[0]->[0]; |
|---|
| 157 |
my $mE = $match[-1]->[1]; |
|---|
| 158 |
|
|---|
| 159 |
#target begin and end in attributes |
|---|
| 160 |
my $tB = $mB - $match[0]->[2]; |
|---|
| 161 |
my $tE = $mE - $match[-1]->[2]; |
|---|
| 162 |
|
|---|
| 163 |
#gff3 colums |
|---|
| 164 |
my $seqid = $details{$t}{seqid}; |
|---|
| 165 |
my $source = 'iprscan'; |
|---|
| 166 |
my $type = 'match'; |
|---|
| 167 |
my $start = $mB; |
|---|
| 168 |
my $end = $mE; |
|---|
| 169 |
my $score = '.'; |
|---|
| 170 |
my $strand = $details{$t}{strand}; |
|---|
| 171 |
my $phase = '.'; |
|---|
| 172 |
my $p_id = "match:".$id++; |
|---|
| 173 |
my $attributes = "Name=$d $list{$d}\;ID=$p_id\;Target=$d $tB $tE +\;"; |
|---|
| 174 |
$attributes .= "Ontology_term=$GO{$d}\;" if($GO{$d}); |
|---|
| 175 |
|
|---|
| 176 |
print "$seqid\t$source\t$type\t$start\t$end\t$score\t$strand\t$phase\t$attributes\n"; |
|---|
| 177 |
|
|---|
| 178 |
foreach my $p (@match){ |
|---|
| 179 |
my $pB = $p->[0]; |
|---|
| 180 |
my $pE = $p->[1]; |
|---|
| 181 |
my $off = $p->[2]; |
|---|
| 182 |
|
|---|
| 183 |
#set nesessary values |
|---|
| 184 |
$type = 'match_part'; |
|---|
| 185 |
$start = $pB; |
|---|
| 186 |
$end = $pE; |
|---|
| 187 |
$attributes = "Name=$d:match_part:$id\;ID=match_part:".$id++."\;"; |
|---|
| 188 |
$attributes .= "Parent=$p_id\;Target=$d ".($pB - $off)." ".($pE - $off)." +\;"; |
|---|
| 189 |
|
|---|
| 190 |
print "$seqid\t$source\t$type\t$start\t$end\t$score\t$strand\t$phase\t$attributes\n"; |
|---|
| 191 |
} |
|---|
| 192 |
} |
|---|
| 193 |
} |
|---|
| 194 |
} |
|---|
| 195 |
|
|---|
| 196 |
#----------------SUBS-------------------------- |
|---|
| 197 |
sub overlap { |
|---|
| 198 |
my $aB = shift; |
|---|
| 199 |
my $aE = shift; |
|---|
| 200 |
my $bB = shift; |
|---|
| 201 |
my $bE = shift; |
|---|
| 202 |
|
|---|
| 203 |
my @seq; |
|---|
| 204 |
|
|---|
| 205 |
for(my $i = $aB; $i <= $aE; $i++){ |
|---|
| 206 |
$seq[$i]++; |
|---|
| 207 |
} |
|---|
| 208 |
|
|---|
| 209 |
for(my $i = $bB; $i <= $bE; $i++){ |
|---|
| 210 |
$seq[$i]++; |
|---|
| 211 |
} |
|---|
| 212 |
|
|---|
| 213 |
my $count = 0; |
|---|
| 214 |
foreach my $c (@seq[$aB..$aE]){ |
|---|
| 215 |
$count++ if($c == 2); |
|---|
| 216 |
} |
|---|
| 217 |
my $over1 = $count / ($aE - $aB + 1); |
|---|
| 218 |
|
|---|
| 219 |
$count = 0; |
|---|
| 220 |
foreach my $c (@seq[$bB..$bE]){ |
|---|
| 221 |
$count++ if($c == 2); |
|---|
| 222 |
} |
|---|
| 223 |
my $over2 = $count / ($bE - $bB + 1); |
|---|
| 224 |
|
|---|
| 225 |
return ($over1 > $over2) ? $over1: $over2; |
|---|
| 226 |
} |
|---|