| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
package repeat_mask_seq; |
|---|
| 5 |
use strict; |
|---|
| 6 |
use vars qw(@ISA @EXPORT $VERSION); |
|---|
| 7 |
use Exporter; |
|---|
| 8 |
use PostData; |
|---|
| 9 |
use FileHandle; |
|---|
| 10 |
use PostData; |
|---|
| 11 |
use Exporter; |
|---|
| 12 |
use PhatHit_utils; |
|---|
| 13 |
use compare; |
|---|
| 14 |
use cluster; |
|---|
| 15 |
use clean; |
|---|
| 16 |
use Bio::Search::HSP::PhatHSP::blastx; |
|---|
| 17 |
use Bio::Search::HSP::PhatHSP::blastn; |
|---|
| 18 |
use Bio::Search::HSP::PhatHSP::tblastx; |
|---|
| 19 |
|
|---|
| 20 |
@ISA = qw( |
|---|
| 21 |
); |
|---|
| 22 |
|
|---|
| 23 |
|
|---|
| 24 |
|
|---|
| 25 |
sub process { |
|---|
| 26 |
my $query_seq = pop; |
|---|
| 27 |
|
|---|
| 28 |
my @features; |
|---|
| 29 |
|
|---|
| 30 |
while (my $f = shift){ |
|---|
| 31 |
push(@features, @{$f}); |
|---|
| 32 |
} |
|---|
| 33 |
|
|---|
| 34 |
my ($tes, $lcs) = seperate_types(\@features); |
|---|
| 35 |
|
|---|
| 36 |
my $shattered_lcs = shatter_hits($lcs); |
|---|
| 37 |
my $tes_keepers = clean_tes($query_seq, $tes); |
|---|
| 38 |
|
|---|
| 39 |
my @best_keepers = (@{$shattered_lcs}, @{$tes_keepers}); |
|---|
| 40 |
|
|---|
| 41 |
return (\@best_keepers); |
|---|
| 42 |
} |
|---|
| 43 |
|
|---|
| 44 |
sub shatter_hits { |
|---|
| 45 |
my $hits = shift; |
|---|
| 46 |
my @shattered_hits; |
|---|
| 47 |
foreach my $hit (@{$hits}){ |
|---|
| 48 |
my $shatter_hits = PhatHit_utils::shatter_hit($hit); |
|---|
| 49 |
push(@shattered_hits, @{$shatter_hits}); |
|---|
| 50 |
} |
|---|
| 51 |
|
|---|
| 52 |
return \@shattered_hits; |
|---|
| 53 |
} |
|---|
| 54 |
|
|---|
| 55 |
sub span_length { |
|---|
| 56 |
my $hit = shift; |
|---|
| 57 |
|
|---|
| 58 |
my ($b, $e) = PhatHit_utils::get_span_of_hit($hit, 'query'); |
|---|
| 59 |
|
|---|
| 60 |
return abs($b - $e); |
|---|
| 61 |
|
|---|
| 62 |
} |
|---|
| 63 |
|
|---|
| 64 |
sub l_sort { |
|---|
| 65 |
span_length($b) <=> span_length($a) |
|---|
| 66 |
} |
|---|
| 67 |
|
|---|
| 68 |
sub clean_tes { |
|---|
| 69 |
my $seq = shift; |
|---|
| 70 |
my $tes = shift; |
|---|
| 71 |
|
|---|
| 72 |
my $shattered_hits = shatter_hits($tes); |
|---|
| 73 |
|
|---|
| 74 |
my $clusters = cluster::shadow_cluster(10, $seq, $shattered_hits, 20); |
|---|
| 75 |
|
|---|
| 76 |
my @keepers; |
|---|
| 77 |
foreach my $c (@{$clusters}){ |
|---|
| 78 |
my @sorted = sort l_sort @{$c}; |
|---|
| 79 |
push(@keepers, $sorted[0]); |
|---|
| 80 |
} |
|---|
| 81 |
return \@keepers; |
|---|
| 82 |
} |
|---|
| 83 |
|
|---|
| 84 |
sub mask_chunk { |
|---|
| 85 |
my $chunk = shift; |
|---|
| 86 |
my $features = shift; |
|---|
| 87 |
|
|---|
| 88 |
my ($tes, $lcs) = seperate_types($features); |
|---|
| 89 |
|
|---|
| 90 |
my $chunk_offset = $chunk->offset(); |
|---|
| 91 |
|
|---|
| 92 |
my $tes_coors = get_coors($tes, $chunk_offset); |
|---|
| 93 |
my $lcs_coors = get_coors($lcs, $chunk_offset); |
|---|
| 94 |
|
|---|
| 95 |
my $seq = $chunk->seq(); |
|---|
| 96 |
|
|---|
| 97 |
_hard_mask_seq (\$seq, $tes_coors, 50, 'N'); |
|---|
| 98 |
_soft_mask_seq(\$seq, $lcs_coors, 0); |
|---|
| 99 |
|
|---|
| 100 |
$chunk->seq($seq); |
|---|
| 101 |
|
|---|
| 102 |
return $chunk; |
|---|
| 103 |
} |
|---|
| 104 |
|
|---|
| 105 |
sub _hard_mask_seq { |
|---|
| 106 |
my $seq = shift; |
|---|
| 107 |
my $features = shift; |
|---|
| 108 |
my $flank = shift || 0; |
|---|
| 109 |
my $replace = shift || 'N'; |
|---|
| 110 |
|
|---|
| 111 |
foreach my $p (@{$features}){ |
|---|
| 112 |
my $b = $p->[0]; |
|---|
| 113 |
my $e = $p->[1]; |
|---|
| 114 |
|
|---|
| 115 |
($b, $e) = ($e, $b) if $e < $b; |
|---|
| 116 |
|
|---|
| 117 |
my $first = $b - $flank; |
|---|
| 118 |
my $last = $e + $flank; |
|---|
| 119 |
$b = ($first > 0) ? $b - $flank : 1; |
|---|
| 120 |
$e = ($last <= length($$seq)) ? $e + $flank : length($$seq); |
|---|
| 121 |
|
|---|
| 122 |
my $l = $e - $b + 1; |
|---|
| 123 |
|
|---|
| 124 |
|
|---|
| 125 |
|
|---|
| 126 |
|
|---|
| 127 |
|
|---|
| 128 |
substr($$seq, $b -1 , $l, "$replace"x$l); |
|---|
| 129 |
} |
|---|
| 130 |
} |
|---|
| 131 |
|
|---|
| 132 |
sub _soft_mask_seq { |
|---|
| 133 |
my $seq = shift; |
|---|
| 134 |
my $features = shift; |
|---|
| 135 |
my $flank = shift || 0; |
|---|
| 136 |
|
|---|
| 137 |
foreach my $p (@{$features}){ |
|---|
| 138 |
my $b = $p->[0]; |
|---|
| 139 |
my $e = $p->[1]; |
|---|
| 140 |
|
|---|
| 141 |
($b, $e) = ($e, $b) if $e < $b; |
|---|
| 142 |
|
|---|
| 143 |
my $first = $b - $flank; |
|---|
| 144 |
my $last = $e + $flank; |
|---|
| 145 |
$b = ($first > 0) ? $b - $flank : 1; |
|---|
| 146 |
$e = ($last <= length($$seq)) ? $e + $flank : length($$seq); |
|---|
| 147 |
|
|---|
| 148 |
my $l = $e - $b + 1; |
|---|
| 149 |
|
|---|
| 150 |
substr($$seq, $b -1 , $l, lc(substr($$seq, $b -1 , $l))); |
|---|
| 151 |
} |
|---|
| 152 |
} |
|---|
| 153 |
|
|---|
| 154 |
sub get_coors { |
|---|
| 155 |
my $hits = shift; |
|---|
| 156 |
my $offset = shift || 0; |
|---|
| 157 |
|
|---|
| 158 |
my @coors; |
|---|
| 159 |
foreach my $hit (@{$hits}){ |
|---|
| 160 |
foreach my $hsp ($hit->hsps()){ |
|---|
| 161 |
push(@coors, [$hsp->nB('query') - $offset, $hsp->nE('query') - $offset]); |
|---|
| 162 |
} |
|---|
| 163 |
} |
|---|
| 164 |
|
|---|
| 165 |
return \@coors; |
|---|
| 166 |
} |
|---|
| 167 |
|
|---|
| 168 |
sub seperate_types { |
|---|
| 169 |
my $features = shift; |
|---|
| 170 |
|
|---|
| 171 |
my @tes; |
|---|
| 172 |
my @lcs; |
|---|
| 173 |
foreach my $f (@{$features}){ |
|---|
| 174 |
if (ref($f) =~ /repeatmasker/){ |
|---|
| 175 |
my $n = $f->hsp('best')->hit->seq_id(); |
|---|
| 176 |
if ($n =~ /Simple_repeat/ || $n =~ /Low_complexity/){ |
|---|
| 177 |
push(@lcs, $f); |
|---|
| 178 |
} |
|---|
| 179 |
else { |
|---|
| 180 |
push(@tes, $f); |
|---|
| 181 |
} |
|---|
| 182 |
} |
|---|
| 183 |
elsif (ref($f) =~ /blastx/) { |
|---|
| 184 |
push(@tes, $f); |
|---|
| 185 |
} |
|---|
| 186 |
elsif (ref($f) =~ /gff/){ |
|---|
| 187 |
my $n = $f->hsp('best')->hit->seq_id(); |
|---|
| 188 |
my $s = $f->algorithm; |
|---|
| 189 |
if ($n =~ /Simple_repeat/ || $n =~ /Low_complexity/){ |
|---|
| 190 |
push(@lcs, $f); |
|---|
| 191 |
} |
|---|
| 192 |
elsif($s =~ /blastx/i){ |
|---|
| 193 |
push(@tes, $f); |
|---|
| 194 |
} |
|---|
| 195 |
else { |
|---|
| 196 |
push(@tes, $f); |
|---|
| 197 |
} |
|---|
| 198 |
} |
|---|
| 199 |
} |
|---|
| 200 |
return (\@tes, \@lcs); |
|---|
| 201 |
} |
|---|
| 202 |
|
|---|
| 203 |
|
|---|
| 204 |
|
|---|
| 205 |
|
|---|
| 206 |
|
|---|
| 207 |
|
|---|
| 208 |
|
|---|
| 209 |
|
|---|
| 210 |
|
|---|
| 211 |
|
|---|
| 212 |
|
|---|
| 213 |
|
|---|
| 214 |
|
|---|
| 215 |
|
|---|
| 216 |
|
|---|
| 217 |
|
|---|
| 218 |
|
|---|
| 219 |
|
|---|
| 220 |
|
|---|
| 221 |
|
|---|
| 222 |
|
|---|
| 223 |
|
|---|
| 224 |
|
|---|
| 225 |
|
|---|
| 226 |
|
|---|
| 227 |
|
|---|
| 228 |
|
|---|
| 229 |
1; |
|---|
| 230 |
|
|---|
| 231 |
|
|---|