| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
package clean; |
|---|
| 5 |
use strict; |
|---|
| 6 |
use vars qw(@ISA @EXPORT $VERSION); |
|---|
| 7 |
use Exporter; |
|---|
| 8 |
use PostData; |
|---|
| 9 |
use FileHandle; |
|---|
| 10 |
use PostData; |
|---|
| 11 |
use Exporter; |
|---|
| 12 |
use PhatHit_utils; |
|---|
| 13 |
use compare; |
|---|
| 14 |
use cluster; |
|---|
| 15 |
use exonerate::splice_info; |
|---|
| 16 |
|
|---|
| 17 |
@ISA = qw( |
|---|
| 18 |
); |
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
sub purge_short_single_exons { |
|---|
| 23 |
my $phat_hits = shift; |
|---|
| 24 |
my $min = shift; |
|---|
| 25 |
|
|---|
| 26 |
my @keepers; |
|---|
| 27 |
foreach my $hit (@{$phat_hits}){ |
|---|
| 28 |
if ($hit->num_hsps == 1){ |
|---|
| 29 |
push(@keepers, $hit) if($hit->{_hsps}->[0]->length('query') >= $min); |
|---|
| 30 |
} |
|---|
| 31 |
else{ |
|---|
| 32 |
push(@keepers, $hit); |
|---|
| 33 |
} |
|---|
| 34 |
} |
|---|
| 35 |
|
|---|
| 36 |
return \@keepers |
|---|
| 37 |
} |
|---|
| 38 |
|
|---|
| 39 |
sub throw_out_bad_splicers { |
|---|
| 40 |
my $phat_hits = shift; |
|---|
| 41 |
my $seq = shift; |
|---|
| 42 |
|
|---|
| 43 |
my @keepers; |
|---|
| 44 |
foreach my $hit (@{$phat_hits}){ |
|---|
| 45 |
if ($hit->num_hsps == 1){ |
|---|
| 46 |
push(@keepers, $hit); |
|---|
| 47 |
next; |
|---|
| 48 |
} |
|---|
| 49 |
|
|---|
| 50 |
exonerate::splice_info::set_donors_acceptors($hit, $seq); |
|---|
| 51 |
my $str = exonerate::splice_info::get_splice_str($hit); |
|---|
| 52 |
|
|---|
| 53 |
my $num = exonerate::splice_info::get_numbers($str); |
|---|
| 54 |
|
|---|
| 55 |
my $s_ratio = $num->{p}/$num->{l}; |
|---|
| 56 |
next if $s_ratio < 0.75; |
|---|
| 57 |
|
|---|
| 58 |
push(@keepers, $hit); |
|---|
| 59 |
} |
|---|
| 60 |
return \@keepers; |
|---|
| 61 |
} |
|---|
| 62 |
|
|---|
| 63 |
sub purge_single_exon_hits { |
|---|
| 64 |
my $phat_hits = shift; |
|---|
| 65 |
|
|---|
| 66 |
my @keepers; |
|---|
| 67 |
foreach my $hit (@{$phat_hits}){ |
|---|
| 68 |
my $num_hsps = $hit->num_hsps(); |
|---|
| 69 |
next unless $num_hsps > 1; |
|---|
| 70 |
push(@keepers, $hit); |
|---|
| 71 |
} |
|---|
| 72 |
return \@keepers; |
|---|
| 73 |
} |
|---|
| 74 |
|
|---|
| 75 |
sub alt_sort { |
|---|
| 76 |
$b->num_hsps <=> $a->num_hsps || $b->length <=> $a->length; |
|---|
| 77 |
} |
|---|
| 78 |
|
|---|
| 79 |
sub is_identical_form { |
|---|
| 80 |
my $hit1 = shift; |
|---|
| 81 |
my $hit2 = shift; |
|---|
| 82 |
|
|---|
| 83 |
my @hsps1 = sort {$a->start('query') <=> $b->start('query')} $hit1->hsps; |
|---|
| 84 |
my @hsps2 = sort {$a->start('query') <=> $b->start('query')} $hit2->hsps; |
|---|
| 85 |
|
|---|
| 86 |
return if (@hsps1 != @hsps2); |
|---|
| 87 |
|
|---|
| 88 |
for(my $i = 0; $i < @hsps1; $i++){ |
|---|
| 89 |
return if($hsps1[$i]->start('query') != $hsps2[$i]->start('query')); |
|---|
| 90 |
return if($hsps1[$i]->end('query') != $hsps2[$i]->end('query')); |
|---|
| 91 |
} |
|---|
| 92 |
|
|---|
| 93 |
return 1; |
|---|
| 94 |
} |
|---|
| 95 |
|
|---|
| 96 |
sub remove_redundant_alt_splices { |
|---|
| 97 |
my $candidates = shift; |
|---|
| 98 |
my $seq = shift; |
|---|
| 99 |
my $flank = shift; |
|---|
| 100 |
|
|---|
| 101 |
return [] unless defined($candidates); |
|---|
| 102 |
|
|---|
| 103 |
my @sorted = sort alt_sort @{$candidates}; |
|---|
| 104 |
|
|---|
| 105 |
my $num = @sorted; |
|---|
| 106 |
my %dead; |
|---|
| 107 |
for (my $i = 0; $i < @sorted -1 ;$i++){ |
|---|
| 108 |
print STDERR " ...processing $i of $num\n" unless($main::quiet); |
|---|
| 109 |
my $hit_i = $sorted[$i]; |
|---|
| 110 |
if (defined($dead{$i})){ |
|---|
| 111 |
next; |
|---|
| 112 |
} |
|---|
| 113 |
for (my $j = $i+1; $j < @sorted; $j++){ |
|---|
| 114 |
my $hit_j = $sorted[$j]; |
|---|
| 115 |
if (defined($dead{$j})){ |
|---|
| 116 |
next; |
|---|
| 117 |
} |
|---|
| 118 |
if (compare::is_redundant_alt_form($hit_i, $hit_j, $seq, $flank)){ |
|---|
| 119 |
$dead{$j}++; |
|---|
| 120 |
|
|---|
| 121 |
} |
|---|
| 122 |
} |
|---|
| 123 |
} |
|---|
| 124 |
|
|---|
| 125 |
my @transcripts; |
|---|
| 126 |
for (my $i = 0; $i < @sorted; $i++){ |
|---|
| 127 |
push(@transcripts, $sorted[$i]) |
|---|
| 128 |
unless defined($dead{$i}); |
|---|
| 129 |
} |
|---|
| 130 |
return \@transcripts; |
|---|
| 131 |
} |
|---|
| 132 |
|
|---|
| 133 |
sub get_best_alt_splices { |
|---|
| 134 |
my $candidates = shift; |
|---|
| 135 |
my $seq = shift; |
|---|
| 136 |
my $flank = shift; |
|---|
| 137 |
|
|---|
| 138 |
return [] unless defined($candidates); |
|---|
| 139 |
|
|---|
| 140 |
my @sorted = sort alt_sort @{$candidates}; |
|---|
| 141 |
|
|---|
| 142 |
my $num = @sorted; |
|---|
| 143 |
my %dead; |
|---|
| 144 |
for (my $i = 0; $i < @sorted -1 ;$i++){ |
|---|
| 145 |
print STDERR " ...processing $i of $num\n" unless($main::quiet); |
|---|
| 146 |
my $hit_i = $sorted[$i]; |
|---|
| 147 |
if (defined($dead{$i})){ |
|---|
| 148 |
next; |
|---|
| 149 |
} |
|---|
| 150 |
for (my $j = $i+1; $j < @sorted; $j++){ |
|---|
| 151 |
my $hit_j = $sorted[$j]; |
|---|
| 152 |
if (defined($dead{$j})){ |
|---|
| 153 |
next; |
|---|
| 154 |
} |
|---|
| 155 |
if (compare::is_same_alt_form($hit_i, $hit_j, $seq, $flank)){ |
|---|
| 156 |
$dead{$j}++; |
|---|
| 157 |
|
|---|
| 158 |
} |
|---|
| 159 |
} |
|---|
| 160 |
} |
|---|
| 161 |
|
|---|
| 162 |
my @transcripts; |
|---|
| 163 |
for (my $i = 0; $i < @sorted; $i++){ |
|---|
| 164 |
push(@transcripts, $sorted[$i]) |
|---|
| 165 |
unless defined($dead{$i}); |
|---|
| 166 |
} |
|---|
| 167 |
return \@transcripts; |
|---|
| 168 |
} |
|---|
| 169 |
|
|---|
| 170 |
|
|---|
| 171 |
|
|---|
| 172 |
sub complexity_filter { |
|---|
| 173 |
my $candidates = shift; |
|---|
| 174 |
my $seq = shift; |
|---|
| 175 |
|
|---|
| 176 |
my @keepers; |
|---|
| 177 |
|
|---|
| 178 |
foreach my $f (@$candidates){ |
|---|
| 179 |
my $qOff = $f->start('query'); |
|---|
| 180 |
my $hOff = $f->start('hit'); |
|---|
| 181 |
|
|---|
| 182 |
my @q_cov; |
|---|
| 183 |
my @h_cov; |
|---|
| 184 |
foreach my $hsp ($f->hsps){ |
|---|
| 185 |
my $qS = $hsp->start('query') - $qOff; |
|---|
| 186 |
my $qE = $hsp->end('query') - $qOff; |
|---|
| 187 |
|
|---|
| 188 |
my $hS = $hsp->start('hit') - $hOff; |
|---|
| 189 |
my $hE = $hsp->end('hit') - $hOff; |
|---|
| 190 |
|
|---|
| 191 |
@q_cov[$qS..$qE] = map {(defined $_) ? $_ + 1 : 1} (@q_cov[$qS..$qE]); |
|---|
| 192 |
@h_cov[$hS..$hE] = map {(defined $_) ? $_ + 1 : 1} (@h_cov[$hS..$hE]); |
|---|
| 193 |
} |
|---|
| 194 |
|
|---|
| 195 |
|
|---|
| 196 |
my $count = 0; |
|---|
| 197 |
my $total = 0; |
|---|
| 198 |
foreach my $c (@q_cov){ |
|---|
| 199 |
next if(! defined $c); |
|---|
| 200 |
$count++; |
|---|
| 201 |
$total += $c; |
|---|
| 202 |
} |
|---|
| 203 |
|
|---|
| 204 |
my $q_ave = ($count) ? $total/$count : 0; |
|---|
| 205 |
|
|---|
| 206 |
|
|---|
| 207 |
$count = 0; |
|---|
| 208 |
$total = 0; |
|---|
| 209 |
foreach my $c (@h_cov){ |
|---|
| 210 |
next if(! defined $c); |
|---|
| 211 |
$count++; |
|---|
| 212 |
$total += $c; |
|---|
| 213 |
} |
|---|
| 214 |
|
|---|
| 215 |
my $h_ave = ($count) ? $total/$count : 0; |
|---|
| 216 |
|
|---|
| 217 |
|
|---|
| 218 |
push(@keepers, $f) unless(($q_ave + $h_ave)/2 > 1.5); |
|---|
| 219 |
} |
|---|
| 220 |
|
|---|
| 221 |
return \@keepers; |
|---|
| 222 |
} |
|---|
| 223 |
|
|---|
| 224 |
1; |
|---|
| 225 |
|
|---|
| 226 |
|
|---|