| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
package GFFDB; |
|---|
| 5 |
use strict; |
|---|
| 6 |
use vars qw(@ISA @EXPORT $VERSION); |
|---|
| 7 |
use Exporter; |
|---|
| 8 |
use DBI; |
|---|
| 9 |
use DBD::SQLite; |
|---|
| 10 |
use File::Temp; |
|---|
| 11 |
use URI::Escape; |
|---|
| 12 |
use Bio::Search::Hit::PhatHit::gff3; |
|---|
| 13 |
use Bio::Search::HSP::PhatHSP::gff3; |
|---|
| 14 |
use Cwd; |
|---|
| 15 |
use File::NFSLock; |
|---|
| 16 |
|
|---|
| 17 |
@ISA = qw( |
|---|
| 18 |
); |
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
sub new { |
|---|
| 24 |
my ($class, @args) = @_; |
|---|
| 25 |
|
|---|
| 26 |
my $self = {}; |
|---|
| 27 |
bless ($self, $class); |
|---|
| 28 |
|
|---|
| 29 |
my $dbfile = 'dbfile.db'; |
|---|
| 30 |
|
|---|
| 31 |
my $in = shift @args; |
|---|
| 32 |
my %CTL_OPTIONS; |
|---|
| 33 |
if(ref $in eq 'HASH'){ |
|---|
| 34 |
%CTL_OPTIONS = %{$in}; |
|---|
| 35 |
$dbfile = "$CTL_OPTIONS{out_base}/$CTL_OPTIONS{out_name}.db"; |
|---|
| 36 |
|
|---|
| 37 |
|
|---|
| 38 |
unlink($dbfile) if($CTL_OPTIONS{force}); |
|---|
| 39 |
|
|---|
| 40 |
$self->{dbfile} = $dbfile; |
|---|
| 41 |
$self->{last_build} = undef; |
|---|
| 42 |
$self->{next_build} = $CTL_OPTIONS{out_name}."::1.00"; |
|---|
| 43 |
$self->{go_gffdb} = $CTL_OPTIONS{go_gffdb}; |
|---|
| 44 |
|
|---|
| 45 |
$self->initiate(); |
|---|
| 46 |
|
|---|
| 47 |
return $self unless($self->{go_gffdb}); |
|---|
| 48 |
|
|---|
| 49 |
$self->add_maker($CTL_OPTIONS{genome_gff},\%CTL_OPTIONS); |
|---|
| 50 |
$self->add_repeat($CTL_OPTIONS{rm_gff}); |
|---|
| 51 |
$self->add_est($CTL_OPTIONS{est_gff}); |
|---|
| 52 |
$self->add_altest($CTL_OPTIONS{altest_gff}); |
|---|
| 53 |
$self->add_protein($CTL_OPTIONS{protein_gff}); |
|---|
| 54 |
$self->add_pred($CTL_OPTIONS{pred_gff}); |
|---|
| 55 |
$self->add_model($CTL_OPTIONS{model_gff}); |
|---|
| 56 |
$self->add_other($CTL_OPTIONS{other_gff}); |
|---|
| 57 |
$self->do_indexing(); |
|---|
| 58 |
} |
|---|
| 59 |
elsif(defined $in){ |
|---|
| 60 |
$dbfile = $in; |
|---|
| 61 |
$self->{dbfile} = $dbfile; |
|---|
| 62 |
$self->{last_build} = undef; |
|---|
| 63 |
$self->{next_build} = "Build::1.00"; |
|---|
| 64 |
$self->{go_gffdb} = 1; |
|---|
| 65 |
$self->initiate(); |
|---|
| 66 |
} |
|---|
| 67 |
|
|---|
| 68 |
return $self; |
|---|
| 69 |
} |
|---|
| 70 |
|
|---|
| 71 |
sub initiate { |
|---|
| 72 |
my $self = shift; |
|---|
| 73 |
|
|---|
| 74 |
my $dbfile = $self->{dbfile}; |
|---|
| 75 |
|
|---|
| 76 |
my $val; |
|---|
| 77 |
if($self->{go_gffdb}){ |
|---|
| 78 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 79 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","","",{AutoCommit => 0}); |
|---|
| 80 |
$dbh->do(qq{PRAGMA default_synchronous = OFF}); |
|---|
| 81 |
$dbh->do(qq{PRAGMA default_cache_size = 10000}); |
|---|
| 82 |
|
|---|
| 83 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 84 |
if (! grep( /^sources$/, @{$tables})){ |
|---|
| 85 |
$dbh->do(qq{CREATE TABLE sources (name TEXT, source TEXT)}); |
|---|
| 86 |
} |
|---|
| 87 |
|
|---|
| 88 |
$dbh->commit; |
|---|
| 89 |
$dbh->disconnect; |
|---|
| 90 |
$lock->unlock; |
|---|
| 91 |
} |
|---|
| 92 |
} |
|---|
| 93 |
elsif($dbfile && -e $dbfile){ |
|---|
| 94 |
unlink($dbfile); |
|---|
| 95 |
} |
|---|
| 96 |
} |
|---|
| 97 |
|
|---|
| 98 |
sub do_indexing { |
|---|
| 99 |
my $self = shift; |
|---|
| 100 |
|
|---|
| 101 |
return unless($self->{go_gffdb}); |
|---|
| 102 |
my $dbfile = $self->{dbfile}; |
|---|
| 103 |
|
|---|
| 104 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 105 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","","",{AutoCommit => 0}); |
|---|
| 106 |
$dbh->do(qq{PRAGMA default_synchronous = OFF}); |
|---|
| 107 |
$dbh->do(qq{PRAGMA default_cache_size = 10000}); |
|---|
| 108 |
|
|---|
| 109 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 110 |
my $indices = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'index'}); |
|---|
| 111 |
|
|---|
| 112 |
foreach my $table (@{$tables}){ |
|---|
| 113 |
next if($table eq 'sources'); |
|---|
| 114 |
unless (grep(/^$table\_inx$/, @{$indices})){ |
|---|
| 115 |
$dbh->do("CREATE INDEX $table\_inx ON $table(seqid)"); |
|---|
| 116 |
} |
|---|
| 117 |
} |
|---|
| 118 |
|
|---|
| 119 |
$dbh->commit; |
|---|
| 120 |
$dbh->disconnect; |
|---|
| 121 |
$lock->unlock; |
|---|
| 122 |
} |
|---|
| 123 |
} |
|---|
| 124 |
|
|---|
| 125 |
sub add_maker { |
|---|
| 126 |
my $self = shift @_; |
|---|
| 127 |
my $gff_file = shift @_; |
|---|
| 128 |
my %codes = %{shift @_}; |
|---|
| 129 |
|
|---|
| 130 |
return unless($self->{go_gffdb}); |
|---|
| 131 |
|
|---|
| 132 |
my @types; |
|---|
| 133 |
push(@types, 'repeat_maker') if($codes{rm_pass}); |
|---|
| 134 |
push(@types, 'est_maker') if($codes{est_pass}); |
|---|
| 135 |
push(@types, 'altest_maker') if($codes{altest_pass}); |
|---|
| 136 |
push(@types, 'protein_maker') if($codes{protein_pass}); |
|---|
| 137 |
push(@types, 'pred_maker') if($codes{pred_pass}); |
|---|
| 138 |
push(@types, 'model_maker') if($codes{model_pass}); |
|---|
| 139 |
push(@types, 'other_maker') if($codes{other_pass}); |
|---|
| 140 |
|
|---|
| 141 |
my $dbfile = $self->{dbfile}; |
|---|
| 142 |
|
|---|
| 143 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 144 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","","",{AutoCommit => 0}); |
|---|
| 145 |
$dbh->do(qq{PRAGMA default_synchronous = OFF}); |
|---|
| 146 |
$dbh->do(qq{PRAGMA default_cache_size = 10000}); |
|---|
| 147 |
|
|---|
| 148 |
|
|---|
| 149 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 150 |
@$tables = grep {/\_maker$/} @$tables; |
|---|
| 151 |
|
|---|
| 152 |
my %all; |
|---|
| 153 |
foreach my $t (@types, @$tables){ |
|---|
| 154 |
$all{$t}++; |
|---|
| 155 |
} |
|---|
| 156 |
|
|---|
| 157 |
my %skip; |
|---|
| 158 |
foreach my $table (keys %all){ |
|---|
| 159 |
my $source = (grep {/^$table$/} @types) ? $gff_file : 'empty'; |
|---|
| 160 |
|
|---|
| 161 |
if (grep {/^$table$/} @{$tables}){ |
|---|
| 162 |
my ($o_source) = $dbh->selectrow_array(qq{SELECT source FROM sources WHERE name = '$table'}); |
|---|
| 163 |
|
|---|
| 164 |
if($source ne $o_source){ |
|---|
| 165 |
$dbh->do(qq{DROP TABLE $table}); |
|---|
| 166 |
$dbh->do(qq{CREATE TABLE $table (seqid TEXT, source TEXT, start INT, end INT, line TEXT)}); |
|---|
| 167 |
$dbh->do(qq{UPDATE sources SET source = '$source' WHERE name = '$table'}); |
|---|
| 168 |
} |
|---|
| 169 |
else{ |
|---|
| 170 |
$skip{$table}++; |
|---|
| 171 |
} |
|---|
| 172 |
} |
|---|
| 173 |
else{ |
|---|
| 174 |
$dbh->do(qq{CREATE TABLE $table (seqid TEXT, source TEXT, start INT, end INT, line TEXT)}); |
|---|
| 175 |
$dbh->do(qq{INSERT INTO sources (name, source) VALUES ('$table', '$source')}); |
|---|
| 176 |
} |
|---|
| 177 |
} |
|---|
| 178 |
|
|---|
| 179 |
|
|---|
| 180 |
if(scalar(keys %skip) < @types && $gff_file){ |
|---|
| 181 |
open (my $IN, "< $gff_file") or die "ERROR: Could not open file: $gff_file\n"; |
|---|
| 182 |
my $count = 0; |
|---|
| 183 |
my $line; |
|---|
| 184 |
while(defined($line = <$IN>)){ |
|---|
| 185 |
chomp($line); |
|---|
| 186 |
if($line =~ /^\ |
|---|
| 187 |
my $build = $1; |
|---|
| 188 |
my ($id, $count) = split("::", $build); |
|---|
| 189 |
next unless($count =~ /^\d+\.\d\d$/); |
|---|
| 190 |
|
|---|
| 191 |
$self->{last_build} = $build; |
|---|
| 192 |
$self->{next_build} = sprintf $id.'::%.2f', $count; |
|---|
| 193 |
} |
|---|
| 194 |
last if ($line =~ /^\ |
|---|
| 195 |
next if ($line =~ /^\s*$/); |
|---|
| 196 |
next if ($line =~ /^\ |
|---|
| 197 |
|
|---|
| 198 |
my $l = $self->_parse_line(\$line); |
|---|
| 199 |
|
|---|
| 200 |
my $table; |
|---|
| 201 |
if($l->{source} =~ /^repeatmasker$|^blastx\:repeat|^repeat_gff\:/i){ |
|---|
| 202 |
next if (! $codes{rm_pass}); |
|---|
| 203 |
next if ($skip{repeat_maker}); |
|---|
| 204 |
$table = 'repeat_maker'; |
|---|
| 205 |
} |
|---|
| 206 |
elsif($l->{source} =~ /^blastn$|^est2genome$|^est_gff\:/i){ |
|---|
| 207 |
next if (! $codes{est_pass}); |
|---|
| 208 |
next if ($skip{est_maker}); |
|---|
| 209 |
$table = 'est_maker'; |
|---|
| 210 |
} |
|---|
| 211 |
elsif($l->{source} =~ /^tblastx$|^altest_gff\:/i){ |
|---|
| 212 |
next if (! $codes{altest_pass}); |
|---|
| 213 |
next if ($skip{altest_maker}); |
|---|
| 214 |
$table = 'altest_maker'; |
|---|
| 215 |
} |
|---|
| 216 |
elsif($l->{source} =~ /^blastx$|^protein2genome$|^protein_gff\:/i){ |
|---|
| 217 |
next if (! $codes{protein_pass}); |
|---|
| 218 |
next if ($skip{protein_maker}); |
|---|
| 219 |
$table = 'protein_maker'; |
|---|
| 220 |
} |
|---|
| 221 |
elsif($l->{source} =~ /^snap\_*|^augustus\_*|^twinscan\_*|^fgenesh\_*|^genemark\_*|^pred_gff\:/i){ |
|---|
| 222 |
next if (! $codes{pred_pass}); |
|---|
| 223 |
next if ($skip{pred_maker}); |
|---|
| 224 |
$table = 'pred_maker'; |
|---|
| 225 |
} |
|---|
| 226 |
elsif($l->{source} =~ /^maker$|^model_gff\:/i){ |
|---|
| 227 |
next if (! $codes{model_pass}); |
|---|
| 228 |
next if ($skip{model_maker}); |
|---|
| 229 |
$table = 'model_maker'; |
|---|
| 230 |
} |
|---|
| 231 |
elsif($l->{source} =~/^\.$/){ |
|---|
| 232 |
next; |
|---|
| 233 |
} |
|---|
| 234 |
else{ |
|---|
| 235 |
next if (! $codes{other_pass}); |
|---|
| 236 |
next if ($skip{other_maker}); |
|---|
| 237 |
$table = 'other_maker'; |
|---|
| 238 |
} |
|---|
| 239 |
|
|---|
| 240 |
$self->_add_to_db($dbh, $table, $l); |
|---|
| 241 |
if($count == 10000){ |
|---|
| 242 |
$dbh->commit; |
|---|
| 243 |
$count = 0; |
|---|
| 244 |
} |
|---|
| 245 |
else{ |
|---|
| 246 |
$count++; |
|---|
| 247 |
} |
|---|
| 248 |
} |
|---|
| 249 |
close($IN); |
|---|
| 250 |
} |
|---|
| 251 |
|
|---|
| 252 |
|
|---|
| 253 |
$dbh->commit; |
|---|
| 254 |
$dbh->disconnect; |
|---|
| 255 |
$lock->unlock; |
|---|
| 256 |
} |
|---|
| 257 |
} |
|---|
| 258 |
|
|---|
| 259 |
sub add_repeat { |
|---|
| 260 |
my $self = shift; |
|---|
| 261 |
my $gff_file = shift; |
|---|
| 262 |
my $table = 'repeat_gff'; |
|---|
| 263 |
|
|---|
| 264 |
$self->_add_type($gff_file, $table); |
|---|
| 265 |
} |
|---|
| 266 |
|
|---|
| 267 |
sub add_est { |
|---|
| 268 |
my $self = shift; |
|---|
| 269 |
my $gff_file = shift; |
|---|
| 270 |
my $table = 'est_gff'; |
|---|
| 271 |
|
|---|
| 272 |
$self->_add_type($gff_file, $table); |
|---|
| 273 |
} |
|---|
| 274 |
|
|---|
| 275 |
sub add_altest { |
|---|
| 276 |
my $self = shift; |
|---|
| 277 |
my $gff_file = shift; |
|---|
| 278 |
my $table = 'altest_gff'; |
|---|
| 279 |
|
|---|
| 280 |
$self->_add_type($gff_file, $table); |
|---|
| 281 |
} |
|---|
| 282 |
|
|---|
| 283 |
sub add_protein { |
|---|
| 284 |
my $self = shift; |
|---|
| 285 |
my $gff_file = shift; |
|---|
| 286 |
my $table = 'protein_gff'; |
|---|
| 287 |
|
|---|
| 288 |
$self->_add_type($gff_file, $table); |
|---|
| 289 |
} |
|---|
| 290 |
|
|---|
| 291 |
sub add_pred { |
|---|
| 292 |
my $self = shift; |
|---|
| 293 |
my $gff_file = shift; |
|---|
| 294 |
my $table = 'pred_gff'; |
|---|
| 295 |
|
|---|
| 296 |
$self->_add_type($gff_file, $table); |
|---|
| 297 |
} |
|---|
| 298 |
|
|---|
| 299 |
|
|---|
| 300 |
sub add_model { |
|---|
| 301 |
my $self = shift; |
|---|
| 302 |
my $gff_file = shift; |
|---|
| 303 |
my $table = 'model_gff'; |
|---|
| 304 |
|
|---|
| 305 |
$self->_add_type($gff_file, $table); |
|---|
| 306 |
} |
|---|
| 307 |
|
|---|
| 308 |
sub add_other { |
|---|
| 309 |
my $self = shift; |
|---|
| 310 |
my $gff_file = shift; |
|---|
| 311 |
my $table = 'other_gff'; |
|---|
| 312 |
|
|---|
| 313 |
$self->_add_type($gff_file, $table); |
|---|
| 314 |
} |
|---|
| 315 |
|
|---|
| 316 |
sub _add_type { |
|---|
| 317 |
my $self = shift; |
|---|
| 318 |
my $gff_file = shift; |
|---|
| 319 |
my $table = shift; |
|---|
| 320 |
|
|---|
| 321 |
return unless($self->{go_gffdb}); |
|---|
| 322 |
|
|---|
| 323 |
my $dbfile = $self->{dbfile}; |
|---|
| 324 |
|
|---|
| 325 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 326 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","","",{AutoCommit => 0}); |
|---|
| 327 |
$dbh->do(qq{PRAGMA default_synchronous = OFF}); |
|---|
| 328 |
$dbh->do(qq{PRAGMA default_cache_size = 10000}); |
|---|
| 329 |
|
|---|
| 330 |
|
|---|
| 331 |
my $source = ($gff_file) ? $gff_file : 'empty'; |
|---|
| 332 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 333 |
my $skip = 0; |
|---|
| 334 |
if (grep(/^$table$/, @{$tables})){ |
|---|
| 335 |
my ($o_source) = $dbh->selectrow_array(qq{SELECT source FROM sources WHERE name = '$table'}); |
|---|
| 336 |
|
|---|
| 337 |
if($source ne $o_source){ |
|---|
| 338 |
my ($index) = $dbh->selectrow_array(qq{SELECT name FROM sqlite_master WHERE name = '$table\_inx'}); |
|---|
| 339 |
$dbh->do(qq{DROP TABLE $table}); |
|---|
| 340 |
$dbh->do(qq{CREATE TABLE $table (seqid TEXT, source TEXT, start INT, end INT, line TEXT)}); |
|---|
| 341 |
$dbh->do(qq{UPDATE sources SET source = '$source' WHERE name = '$table'}); |
|---|
| 342 |
} |
|---|
| 343 |
else{ |
|---|
| 344 |
$skip = 1; |
|---|
| 345 |
} |
|---|
| 346 |
} |
|---|
| 347 |
else{ |
|---|
| 348 |
$dbh->do(qq{CREATE TABLE $table (seqid TEXT, source TEXT, start INT, end INT, line TEXT)}); |
|---|
| 349 |
$dbh->do(qq{INSERT INTO sources (name, source) VALUES ('$table', '$source')}); |
|---|
| 350 |
} |
|---|
| 351 |
|
|---|
| 352 |
|
|---|
| 353 |
if(! $skip && $gff_file){ |
|---|
| 354 |
open (my $IN, "< $gff_file") or die "ERROR: Could not open file: $gff_file\n"; |
|---|
| 355 |
my $count = 0; |
|---|
| 356 |
while(defined(my $line = <$IN>)){ |
|---|
| 357 |
chomp($line); |
|---|
| 358 |
last if ($line =~ /^\ |
|---|
| 359 |
next if ($line =~ /^\s*$/); |
|---|
| 360 |
next if ($line =~ /^\ |
|---|
| 361 |
|
|---|
| 362 |
my $l = $self->_parse_line(\$line, $table); |
|---|
| 363 |
$self->_add_to_db($dbh, $table, $l) unless($l->{type} eq 'contig'); |
|---|
| 364 |
|
|---|
| 365 |
if($count == 10000){ |
|---|
| 366 |
$dbh->commit; |
|---|
| 367 |
$count = 0; |
|---|
| 368 |
} |
|---|
| 369 |
else{ |
|---|
| 370 |
$count++; |
|---|
| 371 |
} |
|---|
| 372 |
} |
|---|
| 373 |
close($IN); |
|---|
| 374 |
} |
|---|
| 375 |
|
|---|
| 376 |
|
|---|
| 377 |
$dbh->commit(); |
|---|
| 378 |
$dbh->disconnect(); |
|---|
| 379 |
$lock->unlock; |
|---|
| 380 |
} |
|---|
| 381 |
} |
|---|
| 382 |
|
|---|
| 383 |
sub _parse_line{ |
|---|
| 384 |
my $self = shift; |
|---|
| 385 |
my $line = shift; |
|---|
| 386 |
my $tag = shift; |
|---|
| 387 |
|
|---|
| 388 |
chomp $$line; |
|---|
| 389 |
my @data = split(/\t/, $$line); |
|---|
| 390 |
|
|---|
| 391 |
foreach my $d (@data){ |
|---|
| 392 |
$d = uri_escape($d,'\'\"\%'); |
|---|
| 393 |
} |
|---|
| 394 |
|
|---|
| 395 |
$data[1] = "$tag:$data[1]" if($tag && $data[1] !~ /^$tag\:/); |
|---|
| 396 |
$data[6] = '+' if($data[6] =~ /^\.$|^0$/); |
|---|
| 397 |
|
|---|
| 398 |
my %l = (seqid => $data[0], |
|---|
| 399 |
source => $data[1], |
|---|
| 400 |
type => $data[2], |
|---|
| 401 |
start => $data[3], |
|---|
| 402 |
end => $data[4], |
|---|
| 403 |
line => join("\t", @data) |
|---|
| 404 |
); |
|---|
| 405 |
|
|---|
| 406 |
return (\%l); |
|---|
| 407 |
} |
|---|
| 408 |
|
|---|
| 409 |
sub _add_to_db { |
|---|
| 410 |
my $self = shift; |
|---|
| 411 |
my $dbh = shift; |
|---|
| 412 |
my $table = shift; |
|---|
| 413 |
my $l = shift; |
|---|
| 414 |
|
|---|
| 415 |
$dbh->do("INSERT INTO $table (seqid, source, start, end, line) ". |
|---|
| 416 |
"VALUES (\'".$l->{seqid}."\', \'".$l->{source}."\', ". |
|---|
| 417 |
$l->{start}.", ".$l->{end}.", \'".$l->{line}."\')" |
|---|
| 418 |
); |
|---|
| 419 |
} |
|---|
| 420 |
|
|---|
| 421 |
sub phathits_on_chunk { |
|---|
| 422 |
my $self = shift; |
|---|
| 423 |
my $chunk = shift; |
|---|
| 424 |
my $seq_ref = shift; |
|---|
| 425 |
my $h_type = shift; |
|---|
| 426 |
|
|---|
| 427 |
return [] unless($self->{go_gffdb}); |
|---|
| 428 |
|
|---|
| 429 |
my $dbfile = $self->{dbfile}; |
|---|
| 430 |
|
|---|
| 431 |
my $c_start = $chunk->offset + 1; |
|---|
| 432 |
my $c_end = $chunk->offset + $chunk->length; |
|---|
| 433 |
my $seqid = $chunk->seqid; |
|---|
| 434 |
|
|---|
| 435 |
my $ref1 = []; |
|---|
| 436 |
my $ref2 = []; |
|---|
| 437 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 438 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","",""); |
|---|
| 439 |
|
|---|
| 440 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 441 |
|
|---|
| 442 |
|
|---|
| 443 |
if (grep(/^$h_type\_gff$/, @{$tables})){ |
|---|
| 444 |
$ref1 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_gff WHERE seqid = '$seqid'}); |
|---|
| 445 |
} |
|---|
| 446 |
|
|---|
| 447 |
|
|---|
| 448 |
if (grep(/^$h_type\_maker$/, @{$tables})){ |
|---|
| 449 |
$ref2 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_maker WHERE seqid = '$seqid'}); |
|---|
| 450 |
} |
|---|
| 451 |
|
|---|
| 452 |
$dbh->disconnect; |
|---|
| 453 |
$lock->unlock; |
|---|
| 454 |
} |
|---|
| 455 |
|
|---|
| 456 |
my $features = _ary_to_features($ref1, $ref2); |
|---|
| 457 |
|
|---|
| 458 |
my $structs; |
|---|
| 459 |
if($h_type eq 'model'){ |
|---|
| 460 |
$structs = _get_genes($features, $seq_ref); |
|---|
| 461 |
} |
|---|
| 462 |
elsif($h_type eq 'repeat'){ |
|---|
| 463 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 464 |
} |
|---|
| 465 |
elsif($h_type eq 'est'){ |
|---|
| 466 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 467 |
} |
|---|
| 468 |
elsif($h_type eq 'altest'){ |
|---|
| 469 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 470 |
} |
|---|
| 471 |
elsif($h_type eq 'protein'){ |
|---|
| 472 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 473 |
} |
|---|
| 474 |
elsif($h_type eq 'pred'){ |
|---|
| 475 |
$structs = _get_genes($features, $seq_ref); |
|---|
| 476 |
my $structs2 = _get_structs($features, $seq_ref); |
|---|
| 477 |
push(@$structs, @$structs2); |
|---|
| 478 |
} |
|---|
| 479 |
elsif($h_type eq 'other'){ |
|---|
| 480 |
die "ERROR: Can not build phathits for type: \'other\'\n"; |
|---|
| 481 |
} |
|---|
| 482 |
else{ |
|---|
| 483 |
die "ERROR: no recognized type in GFFDB::phathits_on_chunk\n"; |
|---|
| 484 |
} |
|---|
| 485 |
|
|---|
| 486 |
my @phat_hits; |
|---|
| 487 |
foreach my $s (@{$structs}){ |
|---|
| 488 |
next unless ($c_start <= $s->{start} && $s->{start} <= $c_end); |
|---|
| 489 |
push(@phat_hits, @{_load_hits($s, $seq_ref)}); |
|---|
| 490 |
} |
|---|
| 491 |
|
|---|
| 492 |
return \@phat_hits; |
|---|
| 493 |
} |
|---|
| 494 |
|
|---|
| 495 |
sub phathits_on_contig { |
|---|
| 496 |
my $self = shift; |
|---|
| 497 |
my $seqid = shift; |
|---|
| 498 |
my $seq_ref = shift; |
|---|
| 499 |
my $h_type = shift; |
|---|
| 500 |
|
|---|
| 501 |
return [] unless($self->{go_gffdb}); |
|---|
| 502 |
|
|---|
| 503 |
my $ref1 = []; |
|---|
| 504 |
my $ref2 = []; |
|---|
| 505 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 506 |
my $dbfile = $self->{dbfile}; |
|---|
| 507 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","",""); |
|---|
| 508 |
|
|---|
| 509 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 510 |
|
|---|
| 511 |
|
|---|
| 512 |
if (grep(/^$h_type\_gff$/, @{$tables})){ |
|---|
| 513 |
$ref1 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_gff WHERE seqid = '$seqid'}); |
|---|
| 514 |
} |
|---|
| 515 |
|
|---|
| 516 |
|
|---|
| 517 |
if (grep(/^$h_type\_maker$/, @{$tables})){ |
|---|
| 518 |
$ref2 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_maker WHERE seqid = '$seqid'}); |
|---|
| 519 |
} |
|---|
| 520 |
|
|---|
| 521 |
$dbh->disconnect; |
|---|
| 522 |
$lock->unlock; |
|---|
| 523 |
} |
|---|
| 524 |
|
|---|
| 525 |
my $features = _ary_to_features($ref1, $ref2); |
|---|
| 526 |
|
|---|
| 527 |
my $structs; |
|---|
| 528 |
if($h_type eq 'model'){ |
|---|
| 529 |
$structs = _get_genes($features, $seq_ref); |
|---|
| 530 |
} |
|---|
| 531 |
elsif($h_type eq 'repeat'){ |
|---|
| 532 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 533 |
} |
|---|
| 534 |
elsif($h_type eq 'est'){ |
|---|
| 535 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 536 |
} |
|---|
| 537 |
elsif($h_type eq 'altest'){ |
|---|
| 538 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 539 |
} |
|---|
| 540 |
elsif($h_type eq 'protein'){ |
|---|
| 541 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 542 |
} |
|---|
| 543 |
elsif($h_type eq 'pred'){ |
|---|
| 544 |
$structs = _get_structs($features, $seq_ref); |
|---|
| 545 |
my $structs2 = _get_genes($features, $seq_ref); |
|---|
| 546 |
push(@{$structs}, @{$structs2}); |
|---|
| 547 |
} |
|---|
| 548 |
elsif($h_type eq 'other'){ |
|---|
| 549 |
die "ERROR: Can not build phathits for type: \'other\'\n"; |
|---|
| 550 |
} |
|---|
| 551 |
else{ |
|---|
| 552 |
die "ERROR: no recognized type in GFFDB::phathits_on_chunk\n"; |
|---|
| 553 |
} |
|---|
| 554 |
|
|---|
| 555 |
my @phat_hits; |
|---|
| 556 |
foreach my $s (@{$structs}){ |
|---|
| 557 |
push(@phat_hits, @{_load_hits($s, $seq_ref)}); |
|---|
| 558 |
} |
|---|
| 559 |
|
|---|
| 560 |
return \@phat_hits; |
|---|
| 561 |
} |
|---|
| 562 |
|
|---|
| 563 |
sub get_existing_gene_names { |
|---|
| 564 |
my $self = shift; |
|---|
| 565 |
my $seqid = shift; |
|---|
| 566 |
|
|---|
| 567 |
return {} unless($self->{go_gffdb}); |
|---|
| 568 |
|
|---|
| 569 |
my %names; |
|---|
| 570 |
if(my $lock = new File::NFSLock($self->{dbfile}, 'EX', , 300)){ |
|---|
| 571 |
my $dbfile = $self->{dbfile}; |
|---|
| 572 |
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbfile","",""); |
|---|
| 573 |
|
|---|
| 574 |
my $tables = $dbh->selectcol_arrayref(qq{SELECT name FROM sqlite_master WHERE type = 'table'}); |
|---|
| 575 |
|
|---|
| 576 |
|
|---|
| 577 |
my $h_type = 'model'; |
|---|
| 578 |
|
|---|
| 579 |
|
|---|
| 580 |
my $ref1 = []; |
|---|
| 581 |
if (grep(/^$h_type\_gff$/, @{$tables})){ |
|---|
| 582 |
$ref1 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_gff WHERE seqid = '$seqid'}); |
|---|
| 583 |
} |
|---|
| 584 |
|
|---|
| 585 |
|
|---|
| 586 |
my $ref2 = []; |
|---|
| 587 |
if (grep(/^$h_type\_maker$/, @{$tables})){ |
|---|
| 588 |
$ref2 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_maker WHERE seqid = '$seqid'}); |
|---|
| 589 |
} |
|---|
| 590 |
|
|---|
| 591 |
my $features = _ary_to_features($ref1, $ref2); |
|---|
| 592 |
foreach my $f (@$features){ |
|---|
| 593 |
my $tag = $f->primary_tag(); |
|---|
| 594 |
|
|---|
| 595 |
if ($tag eq 'gene') { |
|---|
| 596 |
my $id = _get_annotation($f,'ID'); |
|---|
| 597 |
my $name = _get_annotation($f,'Name'); |
|---|
| 598 |
|
|---|
| 599 |
$name = $id if($name eq ''); |
|---|
| 600 |
|
|---|
| 601 |
|
|---|
| 602 |
($name) = $name =~ /^([^\s\t\n]+)/; |
|---|
| 603 |
$names{$name}++; |
|---|
| 604 |
|
|---|
| 605 |
|
|---|
| 606 |
my ($c_id) = $name =~ /$seqid\-[\-]+\-gene\-(\d+\.*\d*)/; |
|---|
| 607 |
$names{$c_id}++ if(defined $c_id); |
|---|
| 608 |
} |
|---|
| 609 |
} |
|---|
| 610 |
|
|---|
| 611 |
|
|---|
| 612 |
$h_type = 'pred'; |
|---|
| 613 |
|
|---|
| 614 |
|
|---|
| 615 |
$ref1 = []; |
|---|
| 616 |
if (grep(/^$h_type\_gff$/, @{$tables})){ |
|---|
| 617 |
$ref1 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_gff WHERE seqid = '$seqid'}); |
|---|
| 618 |
} |
|---|
| 619 |
|
|---|
| 620 |
|
|---|
| 621 |
$ref2 = []; |
|---|
| 622 |
if (grep(/^$h_type\_maker$/, @{$tables})){ |
|---|
| 623 |
$ref2 = $dbh->selectall_arrayref(qq{SELECT line FROM $h_type\_maker WHERE seqid = '$seqid'}); |
|---|
| 624 |
} |
|---|
| 625 |
|
|---|
| 626 |
$features = _ary_to_features($ref1, $ref2); |
|---|
| 627 |
foreach my $f (@$features){ |
|---|
| 628 |
my $tag = $f->primary_tag(); |
|---|
| 629 |
|
|---|
| 630 |
if ($tag eq 'match' || $tag eq 'gene') { |
|---|
| 631 |
my $id = _get_annotation($f,'ID'); |
|---|
| 632 |
my $name = _get_annotation($f,'Name'); |
|---|
| 633 |
|
|---|
| 634 |
$name = $id if($name eq ''); |
|---|
| 635 |
|
|---|
| 636 |
|
|---|
| 637 |
($name) = $name =~ /^([^\s\t\n]+)/; |
|---|
| 638 |
$name =~ s/\-mRNA\-\d+$//; |
|---|
| 639 |
$names{$name}++; |
|---|
| 640 |
|
|---|
| 641 |
|
|---|
| 642 |
my ($c_id) = $name =~ /$seqid\-[\-]+\-gene\-(\d+\.*\d*)/; |
|---|
| 643 |
$names{$c_id}++ if(defined $c_id); |
|---|
| 644 |
} |
|---|
| 645 |
} |
|---|
| 646 |
|
|---|
| 647 |
$dbh->disconnect; |
|---|
| 648 |
$lock->unlock; |
|---|
| 649 |
} |
|---|
| 650 |
|
|---|
| 651 |
return \%names; |
|---|
| 652 |
} |
|---|
| 653 |
|
|---|
| 654 |
sub last_build { |
|---|
| 655 |
my $self = shift; |
|---|
| 656 |
|
|---|
| 657 |
return $self->{last_build} || undef; |
|---|
| 658 |
} |
|---|
| 659 |
|
|---|
| 660 |
sub next_build { |
|---|
| 661 |
my $self = shift; |
|---|
| 662 |
|
|---|
| 663 |
return $self->{next_build} || undef; |
|---|
| 664 |
} |
|---|
| 665 |
|
|---|
| 666 |
|
|---|
| 667 |
|
|---|
| 668 |
|
|---|
| 669 |
sub _ary_to_features{ |
|---|
| 670 |
my @features; |
|---|
| 671 |
|
|---|
| 672 |
while(my $ary_ref = shift){ |
|---|
| 673 |
foreach my $row (@{$ary_ref}){ |
|---|
| 674 |
my $string = $row->[0]; |
|---|
| 675 |
my $feat = Bio::SeqFeature::Generic->new(); |
|---|
| 676 |
|
|---|
| 677 |
my ($seqname, $source, $primary, $start, $end, |
|---|
| 678 |
$score, $strand, $frame, $groups) = split(/\t/, $string); |
|---|
| 679 |
|
|---|
| 680 |
if ( ! defined $frame ) { |
|---|
| 681 |
die "ERROR: [$string] does not look like GFF3\n"; |
|---|
| 682 |
} |
|---|
| 683 |
|
|---|
| 684 |
$feat->seq_id($seqname); |
|---|
| 685 |
$feat->source_tag($source); |
|---|
| 686 |
$feat->primary_tag($primary); |
|---|
| 687 |
$feat->start($start); |
|---|
| 688 |
$feat->end($end); |
|---|
| 689 |
$feat->frame($frame); |
|---|
| 690 |
$feat->score($score) unless ( $score eq '.' ); |
|---|
| 691 |
if ( $strand eq '-' ) { $feat->strand(-1); } |
|---|
| 692 |
if ( $strand eq '+' ) { $feat->strand(1); } |
|---|
| 693 |
if ( $strand eq '.' ) { $feat->strand(0); } |
|---|
| 694 |
my @groups = split(/\s*;\s*/, $groups); |
|---|
| 695 |
|
|---|
| 696 |
for my $group (@groups) { |
|---|
| 697 |
my ($tag,$value) = split /=/,$group; |
|---|
| 698 |
$tag = uri_unescape($tag); |
|---|
| 699 |
my @values = map {uri_unescape($_)} split /,/,$value; |
|---|
| 700 |
for my $v ( @values ) { $feat->add_tag_value($tag,$v); } |
|---|
| 701 |
} |
|---|
| 702 |
push (@features, $feat); |
|---|
| 703 |
} |
|---|
| 704 |
} |
|---|
| 705 |
|
|---|
| 706 |
return \@features; |
|---|
| 707 |
} |
|---|
| 708 |
|
|---|
| 709 |
sub _load_hits { |
|---|
| 710 |
my $g = shift; |
|---|
| 711 |
my $seq_ref = shift; |
|---|
| 712 |
|
|---|
| 713 |
my $gene_id = $g->{id}; |
|---|
| 714 |
my $gene_name = $g->{name}; |
|---|
| 715 |
|
|---|
| 716 |
|
|---|
| 717 |
my @anns; |
|---|
| 718 |
if(@anns = $g->{f}->annotation->get_all_annotation_keys()){ |
|---|
| 719 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 720 |
foreach my $ann (@anns){ |
|---|
| 721 |
my @list = $g->{f}->annotation->get_Annotations(); |
|---|
| 722 |
@list = map {$_->value()} @list; |
|---|
| 723 |
$ann = $ann.'='.join(',', @list); |
|---|
| 724 |
} |
|---|
| 725 |
} |
|---|
| 726 |
elsif(@anns = $g->{f}->get_all_tags){ |
|---|
| 727 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 728 |
foreach my $ann (@anns){ |
|---|
| 729 |
my @list = $g->{f}->get_tag_values($ann); |
|---|
| 730 |
$ann = $ann.'='.join(',', @list); |
|---|
| 731 |
} |
|---|
| 732 |
} |
|---|
| 733 |
my $gene_attrib = join(';', @anns); |
|---|
| 734 |
|
|---|
| 735 |
my @phat_hits; |
|---|
| 736 |
foreach my $t (@{$g->{mRNAs}}){ |
|---|
| 737 |
|
|---|
| 738 |
my $tran_id = $t->{id}; |
|---|
| 739 |
my $tran_name = $t->{name}; |
|---|
| 740 |
($tran_name) = $tran_name =~ /(^[^\s]+)/; |
|---|
| 741 |
$tran_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; |
|---|
| 742 |
|
|---|
| 743 |
my $description = "g_name=$gene_name;g_id=$gene_id;t_name=$tran_name;t_id=$tran_id" unless(! $gene_name); |
|---|
| 744 |
|
|---|
| 745 |
my $f = new Bio::Search::Hit::PhatHit::gff3('-name' => $tran_name, |
|---|
| 746 |
'-description' => $description, |
|---|
| 747 |
'-algorithm' => $t->{f}->source_tag, |
|---|
| 748 |
'-length' => length($t->{seq}), |
|---|
| 749 |
'-score' => $t->{f}->score || '.', |
|---|
| 750 |
); |
|---|
| 751 |
|
|---|
| 752 |
$f->{gene_id} = $gene_id unless(! $gene_id); |
|---|
| 753 |
$f->{gene_name} = $gene_name unless(! $gene_name); |
|---|
| 754 |
$f->{_tran_name} = $tran_name unless(! $gene_name); |
|---|
| 755 |
$f->{_tran_id} = $tran_id unless(! $tran_id); |
|---|
| 756 |
$f->{maker_qi} = $t->{maker_qi} unless(! $t->{maker_qi}); |
|---|
| 757 |
$f->{gene_attrib} = $gene_attrib unless(! $gene_attrib); |
|---|
| 758 |
|
|---|
| 759 |
|
|---|
| 760 |
my @anns; |
|---|
| 761 |
if(@anns = $t->{f}->annotation->get_all_annotation_keys()){ |
|---|
| 762 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 763 |
foreach my $ann (@anns){ |
|---|
| 764 |
my @list = $t->{f}->annotation->get_Annotations(); |
|---|
| 765 |
@list = map {$_->value()} @list; |
|---|
| 766 |
$ann = $ann.'='.join(',', @list); |
|---|
| 767 |
} |
|---|
| 768 |
} |
|---|
| 769 |
elsif(@anns = $t->{f}->get_all_tags){ |
|---|
| 770 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 771 |
foreach my $ann (@anns){ |
|---|
| 772 |
my @list = $t->{f}->get_tag_values($ann); |
|---|
| 773 |
$ann = $ann.'='.join(',', @list); |
|---|
| 774 |
} |
|---|
| 775 |
} |
|---|
| 776 |
$f->{-attrib} = join(';', @anns) if(@anns); |
|---|
| 777 |
|
|---|
| 778 |
|
|---|
| 779 |
my $type = $t->{f}->primary_tag; |
|---|
| 780 |
$f->{transcript_type}=$type; |
|---|
| 781 |
|
|---|
| 782 |
$f->queryLength(length($t->{seq})); |
|---|
| 783 |
|
|---|
| 784 |
if(defined $t->{cdss} && @{$t->{cdss}}){ |
|---|
| 785 |
my ($t_offset, $t_end) = _get_t_offset_and_end($t); |
|---|
| 786 |
|
|---|
| 787 |
if($t_offset != -1){ |
|---|
| 788 |
$f->{translation_offset} = $t_offset; |
|---|
| 789 |
$f->{translation_end} = $t_end; |
|---|
| 790 |
my $cdss = _load_cdss($t, $seq_ref); |
|---|
| 791 |
$f->{cdss} = $cdss; |
|---|
| 792 |
} |
|---|
| 793 |
else{ |
|---|
| 794 |
warn "WARNING: Problem cause by bad CDS entries in GFF3 file for ". |
|---|
| 795 |
$t->{name}."\n". |
|---|
| 796 |
"Maker will just figure out a new CDS entry internally\n\n"; |
|---|
| 797 |
} |
|---|
| 798 |
} |
|---|
| 799 |
|
|---|
| 800 |
$f->{seq} = $t->{seq}; |
|---|
| 801 |
|
|---|
| 802 |
my $hsps = _load_hsps($t, $seq_ref); |
|---|
| 803 |
|
|---|
| 804 |
foreach my $hsp (@{$hsps}){ |
|---|
| 805 |
$f->add_hsp($hsp); |
|---|
| 806 |
} |
|---|
| 807 |
|
|---|
| 808 |
push(@phat_hits, $f); |
|---|
| 809 |
} |
|---|
| 810 |
|
|---|
| 811 |
return \@phat_hits; |
|---|
| 812 |
} |
|---|
| 813 |
|
|---|
| 814 |
sub _get_t_offset_and_end { |
|---|
| 815 |
my $t = shift; |
|---|
| 816 |
|
|---|
| 817 |
my $t_seq = $t->{seq}; |
|---|
| 818 |
my $c_seq = $t->{cds_seq}; |
|---|
| 819 |
|
|---|
| 820 |
my $t_offset = index($t_seq, $c_seq); |
|---|
| 821 |
|
|---|
| 822 |
my $t_end = $t_offset + length($c_seq) -1; |
|---|
| 823 |
|
|---|
| 824 |
warn "WARNING: Problem in GFFDB::_get_t_offset_and_end\n" if $t_offset == -1; |
|---|
| 825 |
|
|---|
| 826 |
return ($t_offset, $t_end); |
|---|
| 827 |
} |
|---|
| 828 |
|
|---|
| 829 |
sub _load_cdss { |
|---|
| 830 |
my $t = shift; |
|---|
| 831 |
my $seq_ref = shift; |
|---|
| 832 |
|
|---|
| 833 |
my @hsps; |
|---|
| 834 |
my $hit_start = 1; |
|---|
| 835 |
my $hit_strand = 1; |
|---|
| 836 |
my $hit_name = $t->{name}; |
|---|
| 837 |
($hit_name) = $hit_name =~ /(^[^\s]+)/; |
|---|
| 838 |
$hit_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; |
|---|
| 839 |
|
|---|
| 840 |
foreach my $e (@{$t->{cdss}}){ |
|---|
| 841 |
my @args; |
|---|
| 842 |
my $hit_end = $e->{f}->end - $e->{f}->start + $hit_start; |
|---|
| 843 |
|
|---|
| 844 |
my $value = _get_annotation($e->{f}, 'Target'); |
|---|
| 845 |
if($value ne ''){ |
|---|
| 846 |
my @dats = split(/\s/, $value); |
|---|
| 847 |
|
|---|
| 848 |
$hit_name = $dats[0]; |
|---|
| 849 |
$hit_start = $dats[1]; |
|---|
| 850 |
$hit_end = $dats[2]; |
|---|
| 851 |
$hit_strand = (defined ($dats[3]) && $dats[3] eq '-') ? -1 : 1; |
|---|
| 852 |
} |
|---|
| 853 |
|
|---|
| 854 |
push(@args, '-query_start'); |
|---|
| 855 |
push(@args, $e->{f}->start); |
|---|
| 856 |
|
|---|
| 857 |
push(@args, '-query_seq'); |
|---|
| 858 |
push(@args, $e->{seq}); |
|---|
| 859 |
|
|---|
| 860 |
push(@args, '-score'); |
|---|
| 861 |
push(@args, $e->{f}->score); |
|---|
| 862 |
|
|---|
| 863 |
push(@args, '-homology_seq'); |
|---|
| 864 |
push(@args, $e->{seq}); |
|---|
| 865 |
|
|---|
| 866 |
push(@args, '-hit_start'); |
|---|
| 867 |
push(@args, $hit_start); |
|---|
| 868 |
|
|---|
| 869 |
push(@args, '-hit_seq'); |
|---|
| 870 |
push(@args, $e->{seq}); |
|---|
| 871 |
|
|---|
| 872 |
push(@args, '-hsp_length'); |
|---|
| 873 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 874 |
|
|---|
| 875 |
push(@args, '-identical'); |
|---|
| 876 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 877 |
|
|---|
| 878 |
push(@args, '-hit_length'); |
|---|
| 879 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 880 |
|
|---|
| 881 |
push(@args, '-query_name'); |
|---|
| 882 |
push(@args, $e->{f}->seq_id); |
|---|
| 883 |
|
|---|
| 884 |
push(@args, '-algorithm'); |
|---|
| 885 |
push(@args, $e->{f}->source_tag); |
|---|
| 886 |
|
|---|
| 887 |
push(@args, '-bits'); |
|---|
| 888 |
push(@args, 2*($e->{f}->end - $e->{f}->start + 1)); |
|---|
| 889 |
|
|---|
| 890 |
push(@args, '-evalue'); |
|---|
| 891 |
push(@args, 0.0); |
|---|
| 892 |
|
|---|
| 893 |
push(@args, '-pvalue'); |
|---|
| 894 |
push(@args, 0.0); |
|---|
| 895 |
|
|---|
| 896 |
push(@args, '-query_length'); |
|---|
| 897 |
push(@args, length($$seq_ref)); |
|---|
| 898 |
|
|---|
| 899 |
push(@args, '-query_end'); |
|---|
| 900 |
push(@args, $e->{f}->end); |
|---|
| 901 |
|
|---|
| 902 |
push(@args, '-conserved'); |
|---|
| 903 |
push(@args, length($e->{seq})); |
|---|
| 904 |
|
|---|
| 905 |
push(@args, '-hit_name'); |
|---|
| 906 |
push(@args, $hit_name); |
|---|
| 907 |
|
|---|
| 908 |
push(@args, '-hit_end'); |
|---|
| 909 |
push(@args, $hit_end); |
|---|
| 910 |
|
|---|
| 911 |
push(@args, '-query_gaps'); |
|---|
| 912 |
push(@args, 0); |
|---|
| 913 |
|
|---|
| 914 |
push(@args, '-hit_gaps'); |
|---|
| 915 |
push(@args, 0); |
|---|
| 916 |
|
|---|
| 917 |
my $hsp = new Bio::Search::HSP::PhatHSP::gff3(@args); |
|---|
| 918 |
$hsp->queryName($e->{f}->seq_id); |
|---|
| 919 |
|
|---|
| 920 |
|
|---|
| 921 |
|
|---|
| 922 |
if ($e->{f}->strand == 1 ){ |
|---|
| 923 |
$hsp->{_strand_hack}->{query} = 1; |
|---|
| 924 |
$hsp->{_strand_hack}->{hit} = $hit_strand; |
|---|
| 925 |
} |
|---|
| 926 |
else { |
|---|
| 927 |
$hsp->{_strand_hack}->{query} = -1; |
|---|
| 928 |
$hsp->{_strand_hack}->{hit} = $hit_strand; |
|---|
| 929 |
} |
|---|
| 930 |
|
|---|
| 931 |
$hit_start += $e->{f}->end - $e->{f}->start + 1; |
|---|
| 932 |
|
|---|
| 933 |
push(@hsps, $hsp); |
|---|
| 934 |
} |
|---|
| 935 |
|
|---|
| 936 |
return \@hsps; |
|---|
| 937 |
} |
|---|
| 938 |
|
|---|
| 939 |
sub _load_hsps { |
|---|
| 940 |
my $t = shift; |
|---|
| 941 |
my $seq_ref = shift; |
|---|
| 942 |
|
|---|
| 943 |
my @hsps; |
|---|
| 944 |
my $hit_start = 1; |
|---|
| 945 |
my $hit_strand = 1; |
|---|
| 946 |
my $hit_name = $t->{name}; |
|---|
| 947 |
($hit_name) = $hit_name =~ /(^[^\s]+)/; |
|---|
| 948 |
$hit_name =~ s/(mRNA\-\d+)\-AED\:.*/$1/; |
|---|
| 949 |
|
|---|
| 950 |
|
|---|
| 951 |
|
|---|
| 952 |
if($t->{exons}->[0]->{f}->source_tag =~ /^snap|^augustus|^fgenesh|^genemark/){ |
|---|
| 953 |
my $features = $t->{exons}; |
|---|
| 954 |
@{$features} = sort {$a->{f}->start <=> $b->{f}->start} @{$features}; |
|---|
| 955 |
my $length = 0; |
|---|
| 956 |
foreach my $e (@{$features}){ |
|---|
| 957 |
$length += abs($e->{f}->end - $e->{f}->start) + 1; |
|---|
| 958 |
} |
|---|
| 959 |
|
|---|
| 960 |
my $overhang = $length % 3; |
|---|
| 961 |
if($overhang != 0){ |
|---|
| 962 |
if($features->[0]->{f}->strand == 1){ |
|---|
| 963 |
my $last = $features->[-1]; |
|---|
| 964 |
my $l_length = abs($last->{f}->end - $last->{f}->start) + 1; |
|---|
| 965 |
|
|---|
| 966 |
while($l_length <= $overhang){ |
|---|
| 967 |
pop(@{$features}); |
|---|
| 968 |
$overhang -= $l_length; |
|---|
| 969 |
$last = $features->[-1]; |
|---|
| 970 |
$l_length = abs($last->{f}->end - $last->{f}->start) + 1; |
|---|
| 971 |
} |
|---|
| 972 |
|
|---|
| 973 |
$last->{f}->end($last->{f}->end - $overhang); |
|---|
| 974 |
} |
|---|
| 975 |
elsif($features->[0]->{f}->strand == -1){ |
|---|
| 976 |
my $last = $features->[0]; |
|---|
| 977 |
my $l_length = abs($last->{f}->end - $last->{f}->start) + 1; |
|---|
| 978 |
|
|---|
| 979 |
while($l_length <= $overhang){ |
|---|
| 980 |
shift(@{$features}); |
|---|
| 981 |
$overhang -= $l_length; |
|---|
| 982 |
$last = $features->[0]; |
|---|
| 983 |
$l_length = abs($last->{f}->end - $last->{f}->start) + 1; |
|---|
| 984 |
} |
|---|
| 985 |
|
|---|
| 986 |
$last->{f}->start($last->{f}->start + $overhang); |
|---|
| 987 |
} |
|---|
| 988 |
else{ |
|---|
| 989 |
die "FATAL: No exon strand in Widget::snap\n"; |
|---|
| 990 |
} |
|---|
| 991 |
} |
|---|
| 992 |
} |
|---|
| 993 |
|
|---|
| 994 |
|
|---|
| 995 |
foreach my $e (@{$t->{exons}}){ |
|---|
| 996 |
my @args; |
|---|
| 997 |
my $hit_end = $e->{f}->end - $e->{f}->start + $hit_start; |
|---|
| 998 |
|
|---|
| 999 |
my $value = _get_annotation($e->{f}, 'Target'); |
|---|
| 1000 |
if($value ne ''){ |
|---|
| 1001 |
my @dats = split(/\s/, $value); |
|---|
| 1002 |
|
|---|
| 1003 |
$hit_name = $dats[0]; |
|---|
| 1004 |
$hit_start = $dats[1]; |
|---|
| 1005 |
$hit_end = $dats[2]; |
|---|
| 1006 |
$hit_strand = (defined ($dats[3]) && $dats[3] eq '-') ? -1 : 1; |
|---|
| 1007 |
} |
|---|
| 1008 |
|
|---|
| 1009 |
push(@args, '-query_start'); |
|---|
| 1010 |
push(@args, $e->{f}->start); |
|---|
| 1011 |
|
|---|
| 1012 |
push(@args, '-query_seq'); |
|---|
| 1013 |
push(@args, $e->{seq}); |
|---|
| 1014 |
|
|---|
| 1015 |
push(@args, '-score'); |
|---|
| 1016 |
push(@args, $e->{f}->score); |
|---|
| 1017 |
|
|---|
| 1018 |
push(@args, '-homology_seq'); |
|---|
| 1019 |
push(@args, $e->{seq}); |
|---|
| 1020 |
|
|---|
| 1021 |
push(@args, '-hit_start'); |
|---|
| 1022 |
push(@args, $hit_start); |
|---|
| 1023 |
|
|---|
| 1024 |
push(@args, '-hit_seq'); |
|---|
| 1025 |
push(@args, $e->{seq}); |
|---|
| 1026 |
|
|---|
| 1027 |
push(@args, '-hsp_length'); |
|---|
| 1028 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 1029 |
|
|---|
| 1030 |
push(@args, '-identical'); |
|---|
| 1031 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 1032 |
|
|---|
| 1033 |
push(@args, '-hit_length'); |
|---|
| 1034 |
push(@args, $e->{f}->end - $e->{f}->start + 1); |
|---|
| 1035 |
|
|---|
| 1036 |
push(@args, '-query_name'); |
|---|
| 1037 |
push(@args, $e->{f}->seq_id); |
|---|
| 1038 |
|
|---|
| 1039 |
push(@args, '-algorithm'); |
|---|
| 1040 |
push(@args, $e->{f}->source_tag); |
|---|
| 1041 |
|
|---|
| 1042 |
push(@args, '-bits'); |
|---|
| 1043 |
push(@args, 2*($e->{f}->end - $e->{f}->start + 1)); |
|---|
| 1044 |
|
|---|
| 1045 |
push(@args, '-evalue'); |
|---|
| 1046 |
push(@args, 0.0); |
|---|
| 1047 |
|
|---|
| 1048 |
push(@args, '-pvalue'); |
|---|
| 1049 |
push(@args, 0.0); |
|---|
| 1050 |
|
|---|
| 1051 |
push(@args, '-query_length'); |
|---|
| 1052 |
push(@args, length($$seq_ref)); |
|---|
| 1053 |
|
|---|
| 1054 |
push(@args, '-query_end'); |
|---|
| 1055 |
push(@args, $e->{f}->end); |
|---|
| 1056 |
|
|---|
| 1057 |
push(@args, '-conserved'); |
|---|
| 1058 |
push(@args, length($e->{seq})); |
|---|
| 1059 |
|
|---|
| 1060 |
push(@args, '-hit_name'); |
|---|
| 1061 |
push(@args, $hit_name); |
|---|
| 1062 |
|
|---|
| 1063 |
push(@args, '-hit_end'); |
|---|
| 1064 |
push(@args, $hit_end); |
|---|
| 1065 |
|
|---|
| 1066 |
push(@args, '-query_gaps'); |
|---|
| 1067 |
push(@args, 0); |
|---|
| 1068 |
|
|---|
| 1069 |
push(@args, '-hit_gaps'); |
|---|
| 1070 |
push(@args, 0); |
|---|
| 1071 |
|
|---|
| 1072 |
|
|---|
| 1073 |
my @anns; |
|---|
| 1074 |
if(@anns = $e->{f}->annotation->get_all_annotation_keys()){ |
|---|
| 1075 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 1076 |
foreach my $ann (@anns){ |
|---|
| 1077 |
my @list = $e->{f}->annotation->get_Annotations(); |
|---|
| 1078 |
@list = map {$_->value()} @list; |
|---|
| 1079 |
$ann = $ann.'='.join(',', @list); |
|---|
| 1080 |
} |
|---|
| 1081 |
} |
|---|
| 1082 |
elsif(@anns = $e->{f}->get_all_tags){ |
|---|
| 1083 |
@anns = grep {!/^ID$|^Name$|^Target$|^Parent$|^_AED$|^_QI$/} @anns; |
|---|
| 1084 |
foreach my $ann (@anns){ |
|---|
| 1085 |
my @list = $e->{f}->get_tag_values($ann); |
|---|
| 1086 |
$ann = $ann.'='.join(',', @list); |
|---|
| 1087 |
} |
|---|
| 1088 |
} |
|---|
| 1089 |
my $attrib = join(';', @anns) if(@anns); |
|---|
| 1090 |
push(@args, '-attrib'); |
|---|
| 1091 |
push(@args, $attrib); |
|---|
| 1092 |
|
|---|
| 1093 |
my $hsp = new Bio::Search::HSP::PhatHSP::gff3(@args); |
|---|
| 1094 |
$hsp->queryName($e->{f}->seq_id); |
|---|
| 1095 |
|
|---|
| 1096 |
|
|---|
| 1097 |
|
|---|
| 1098 |
if ($e->{f}->strand == 1 ){ |
|---|
| 1099 |
$hsp->{_strand_hack}->{query} = 1; |
|---|
| 1100 |
$hsp->{_strand_hack}->{hit} = $hit_strand; |
|---|
| 1101 |
} |
|---|
| 1102 |
else { |
|---|
| 1103 |
$hsp->{_strand_hack}->{query} = -1; |
|---|
| 1104 |
$hsp->{_strand_hack}->{hit} = $hit_strand; |
|---|
| 1105 |
} |
|---|
| 1106 |
|
|---|
| 1107 |
$hit_start += $e->{f}->end - $e->{f}->start + 1; |
|---|
| 1108 |
|
|---|
| 1109 |
push(@hsps, $hsp); |
|---|
| 1110 |
} |
|---|
| 1111 |
|
|---|
| 1112 |
return \@hsps; |
|---|
| 1113 |
} |
|---|
| 1114 |
|
|---|
| 1115 |
|
|---|
| 1116 |
sub _get_genes { |
|---|
| 1117 |
my $features = shift; |
|---|
| 1118 |
my $seq_ref = shift; |
|---|
| 1119 |
|
|---|
| 1120 |
my $exons = _grab(['exon'], $features); |
|---|
| 1121 |
my $cdss = _grab(['CDS'], $features); |
|---|
| 1122 |
my $mRNAs = _grab(['mRNA'], $features); |
|---|
| 1123 |
my $UTRs = _grab(['five_prime_UTR', 'three_prime_UTR'], $features); |
|---|
| 1124 |
|
|---|
| 1125 |
foreach my $p_id (keys %{$mRNAs}){ |
|---|
| 1126 |
for (my $i = 0; $ i < @{$mRNAs->{$p_id}}; $i++) { |
|---|
| 1127 |
my $f = $mRNAs->{$p_id}->[$i]->{f}; |
|---|
| 1128 |
my $id = $mRNAs->{$p_id}->[$i]->{id}; |
|---|
| 1129 |
|
|---|
| 1130 |
$mRNAs->{$p_id}->[$i]->{exons} = $exons->{$id}; |
|---|
| 1131 |
$mRNAs->{$p_id}->[$i]->{cdss} = $cdss->{$id}; |
|---|
| 1132 |
$mRNAs->{$p_id}->[$i]->{maker_qi} = _get_maker_qi($mRNAs->{$p_id}->[$i]); |
|---|
| 1133 |
} |
|---|
| 1134 |
} |
|---|
| 1135 |
|
|---|
| 1136 |
my @genes; |
|---|
| 1137 |
foreach my $f (@{$features}){ |
|---|
| 1138 |
my $tag = $f->primary_tag(); |
|---|
| 1139 |
|
|---|
| 1140 |
if ($tag eq 'gene') { |
|---|
| 1141 |
my $id=_get_annotation($f,'ID'); |
|---|
| 1142 |
my $name=_get_annotation($f,'Name'); |
|---|
| 1143 |
|
|---|
| 1144 |
$name = $id if ($name eq ''); |
|---|
| 1145 |
|
|---|
| 1146 |
|
|---|
| 1147 |
if(exists $exons->{$id}){ |
|---|
| 1148 |
foreach my $mRNA (@{$mRNAs->{$id}}){ |
|---|
| 1149 |
my $t_id = $mRNA->{id}; |
|---|
| 1150 |
$mRNA->{exons} = _fix_wormbase($exons->{$id}, $cdss->{$t_id}, $UTRs->{$t_id}); |
|---|
| 1151 |
} |
|---|
| 1152 |
} |
|---|
| 1153 |
|
|---|
| 1154 |
push(@genes, {'f' => $f, |
|---|
| 1155 |
'mRNAs' => $mRNAs->{$id}, |
|---|
| 1156 |
'part_of' => [], |
|---|
| 1157 |
'id' => $id, |
|---|
| 1158 |
'name' => $name, |
|---|
| 1159 |
'start' => $f->start, |
|---|
| 1160 |
'end' => $f->end, |
|---|
| 1161 |
} |
|---|
| 1162 |
); |
|---|
| 1163 |
} |
|---|
| 1164 |
} |
|---|
| 1165 |
|
|---|
| 1166 |
my @valid_genes; |
|---|
| 1167 |
for my $gene (@genes) { |
|---|
| 1168 |
push @valid_genes, $gene if _validate_gene($gene); |
|---|
| 1169 |
} |
|---|
| 1170 |
|
|---|
| 1171 |
_load_seqs(\@valid_genes, $seq_ref); |
|---|
| 1172 |
|
|---|
| 1173 |
return (\@valid_genes); |
|---|
| 1174 |
} |
|---|
| 1175 |
|
|---|
| 1176 |
|
|---|
| 1177 |
sub _fix_wormbase { |
|---|
| 1178 |
my $exons = shift || []; |
|---|
| 1179 |
my $cdss = shift || []; |
|---|
| 1180 |
my $UTRs = shift || []; |
|---|
| 1181 |
|
|---|
| 1182 |
my @keepers; |
|---|
| 1183 |
foreach my $exon (@$exons){ |
|---|
| 1184 |
my $e = $exon->{f}; |
|---|
| 1185 |
my $eB = $e->start; |
|---|
| 1186 |
my $eE = $e->end; |
|---|
| 1187 |
|
|---|
| 1188 |
my $ok = 0; |
|---|
| 1189 |
my $okB = 0; |
|---|
| 1190 |
my $okE = 0; |
|---|
| 1191 |
|
|---|
| 1192 |
|
|---|
| 1193 |
foreach my $piece (@$cdss){ |
|---|
| 1194 |
my $p = $piece->{f}; |
|---|
| 1195 |
my $pB = $p->start; |
|---|
| 1196 |
my $pE = $p->end; |
|---|
| 1197 |
|
|---|
| 1198 |
if($pB == $eB && $pE == $eE){ |
|---|
| 1199 |
$ok = 1; |
|---|
| 1200 |
} |
|---|
| 1201 |
elsif($pB == $eB && $pE != $eE){ |
|---|
| 1202 |
$okB = 1; |
|---|
| 1203 |
} |
|---|
| 1204 |
elsif($pB != $eB && $pE == $eE){ |
|---|
| 1205 |
$okE = 1; |
|---|
| 1206 |
} |
|---|
| 1207 |
|
|---|
| 1208 |
last if($ok); |
|---|
| 1209 |
} |
|---|
| 1210 |
|
|---|
| 1211 |
if($ok){ |
|---|
| 1212 |
push(@keepers, $exon); |
|---|
| 1213 |
next; |
|---|
| 1214 |
} |
|---|
| 1215 |
|
|---|
| 1216 |
|
|---|
| 1217 |
foreach my $piece (@$UTRs){ |
|---|
| 1218 |
my $p = $piece->{f}; |
|---|
| 1219 |
my $pB = $p->start; |
|---|
| 1220 |
my $pE = $p->end; |
|---|
| 1221 |
|
|---|
| 1222 |
if($pB == $eB && $pE == $eE){ |
|---|
| 1223 |
$ok = 1; |
|---|
| 1224 |
} |
|---|
| 1225 |
elsif($pB == $eB && $pE != $eE){ |
|---|
| 1226 |
$okB = 1; |
|---|
| 1227 |
$ok = 1 if($okE); |
|---|
| 1228 |
} |
|---|
| 1229 |
elsif($pB != $eB && $pE == $eE){ |
|---|
| 1230 |
$okE = 1; |
|---|
| 1231 |
$ok = 1 if($okB); |
|---|
| 1232 |
} |
|---|
| 1233 |
|
|---|
| 1234 |
last if($ok); |
|---|
| 1235 |
} |
|---|
| 1236 |
|
|---|
| 1237 |
if($ok){ |
|---|
| 1238 |
push(@keepers, $exon); |
|---|
| 1239 |
next; |
|---|
| 1240 |
} |
|---|
| 1241 |
} |
|---|
| 1242 |
|
|---|
| 1243 |
return \@keepers; |
|---|
| 1244 |
} |
|---|
| 1245 |
|
|---|
| 1246 |
|
|---|
| 1247 |
|
|---|
| 1248 |
sub _get_structs { |
|---|
| 1249 |
my $features = shift; |
|---|
| 1250 |
my $seq_ref = shift; |
|---|
| 1251 |
|
|---|
| 1252 |
my @bases; |
|---|
| 1253 |
my %index; |
|---|
| 1254 |
foreach my $f (@{$features}){ |
|---|
| 1255 |
my $tag_t = $f->primary_tag(); |
|---|
| 1256 |
|
|---|
| 1257 |
next if($tag_t =~ /^gene$|^mRNA$|^exon$|^CDS$/); |
|---|
| 1258 |
|
|---|
| 1259 |
my $id = _get_annotation($f,'ID'); |
|---|
| 1260 |
my $name = _get_annotation($f, 'Name'); |
|---|
| 1261 |
my $p_ids = _get_p_ids($f); |
|---|
| 1262 |
|
|---|
| 1263 |
$name = $id if($name eq ''); |
|---|
| 1264 |
|
|---|
| 1265 |
my $struct = { f => $f, |
|---|
| 1266 |
id => $id, |
|---|
| 1267 |
name => $name, |
|---|
| 1268 |
part_of => $p_ids |
|---|
| 1269 |
}; |
|---|
| 1270 |
|
|---|
| 1271 |
if (! @{$p_ids}){ |
|---|
| 1272 |
push(@bases, $struct); |
|---|
| 1273 |
} |
|---|
| 1274 |
else{ |
|---|
| 1275 |
foreach my $p_id (@{$p_ids}){ |
|---|
| 1276 |
push(@{$index{$p_id}}, $struct); |
|---|
| 1277 |
} |
|---|
| 1278 |
} |
|---|
| 1279 |
} |
|---|
| 1280 |
|
|---|
| 1281 |
foreach my $b (@bases){ |
|---|
| 1282 |
if (exists $index{$b->{id}}){ |
|---|
| 1283 |
$b->{exons} = $index{$b->{id}}; |
|---|
| 1284 |
} |
|---|
| 1285 |
else{ |
|---|
| 1286 |
push(@{$b->{exons}}, { f => $b->{f}, |
|---|
| 1287 |
id => $b->{id}, |
|---|
| 1288 |
name => $b->{name}, |
|---|
| 1289 |
part_of => [$b->{id}] |
|---|
| 1290 |
} |
|---|
| 1291 |
); |
|---|
| 1292 |
} |
|---|
| 1293 |
} |
|---|
| 1294 |
|
|---|
| 1295 |
|
|---|
| 1296 |
my @genes; |
|---|
| 1297 |
foreach my $b (@bases){ |
|---|
| 1298 |
push(@genes, { f => $b->{f}, |
|---|
| 1299 |
mRNAs => [$b], |
|---|
| 1300 |
part_of => [], |
|---|
| 1301 |
id => '', |
|---|
| 1302 |
name => '', |
|---|
| 1303 |
start => $b->{f}->start, |
|---|
| 1304 |
end => $b->{f}->end |
|---|
| 1305 |
} |
|---|
| 1306 |
); |
|---|
| 1307 |
} |
|---|
| 1308 |
|
|---|
| 1309 |
_load_seqs(\@genes, $seq_ref); |
|---|
| 1310 |
|
|---|
| 1311 |
return (\@genes); |
|---|
| 1312 |
} |
|---|
| 1313 |
|
|---|
| 1314 |
sub _grab { |
|---|
| 1315 |
my $types = shift; |
|---|
| 1316 |
my $features = shift; |
|---|
| 1317 |
|
|---|
| 1318 |
my %booty; |
|---|
| 1319 |
|
|---|
| 1320 |
for my $type (@{$types}) { |
|---|
| 1321 |
foreach my $f (@{$features}){ |
|---|
| 1322 |
my $tag_t = $f->primary_tag(); |
|---|
| 1323 |
|
|---|
| 1324 |
if ($tag_t eq $type) { |
|---|
| 1325 |
my $id = _get_annotation($f,'ID'); |
|---|
| 1326 |
my $name = _get_annotation($f, 'Name'); |
|---|
| 1327 |
my $p_ids = _get_p_ids($f); |
|---|
| 1328 |
|
|---|
| 1329 |
$name = $id if($name eq ''); |
|---|
| 1330 |
|
|---|
| 1331 |
foreach my $p_id (@{$p_ids}){ |
|---|
| 1332 |
push(@{$booty{$p_id}}, {f => $f, |
|---|
| 1333 |
id => $id, |
|---|
| 1334 |
name => $name,, |
|---|
| 1335 |
part_of => $p_ids, |
|---|
| 1336 |
}); |
|---|
| 1337 |
} |
|---|
| 1338 |
} |
|---|
| 1339 |
} |
|---|
| 1340 |
} |
|---|
| 1341 |
return \%booty; |
|---|
| 1342 |
} |
|---|
| 1343 |
|
|---|
| 1344 |
sub _get_annotation { |
|---|
| 1345 |
my ($f,$type)=@_;; |
|---|
| 1346 |
my $annotation_collection = $f->annotation; |
|---|
| 1347 |
my ($annotation) = $annotation_collection->get_Annotations($type); |
|---|
| 1348 |
|
|---|
| 1349 |
my $value; |
|---|
| 1350 |
if (defined $annotation){ |
|---|
| 1351 |
$value = $annotation->value(); |
|---|
| 1352 |
} |
|---|
| 1353 |
elsif( grep {/^$type$/} $f->get_all_tags ){ |
|---|
| 1354 |
($value) = $f->get_tag_values($type); |
|---|
| 1355 |
} |
|---|
| 1356 |
else{ |
|---|
| 1357 |
$value = ''; |
|---|
| 1358 |
} |
|---|
| 1359 |
|
|---|
| 1360 |
return $value; |
|---|
| 1361 |
} |
|---|
| 1362 |
|
|---|
| 1363 |
sub _get_p_ids { |
|---|
| 1364 |
my $f = shift; |
|---|
| 1365 |
|
|---|
| 1366 |
my @parents = $f->annotation->get_Annotations('Parent'); |
|---|
| 1367 |
my @p_ids; |
|---|
| 1368 |
|
|---|
| 1369 |
foreach my $p (@parents){ |
|---|
| 1370 |
push(@p_ids, $p->{value}); |
|---|
| 1371 |
} |
|---|
| 1372 |
|
|---|
| 1373 |
|
|---|
| 1374 |
if (! @p_ids && grep {/^Parent$/} $f->get_all_tags ){ |
|---|
| 1375 |
@p_ids = $f->get_tag_values('Parent'); |
|---|
| 1376 |
} |
|---|
| 1377 |
|
|---|
| 1378 |
|
|---|
| 1379 |
return \@p_ids; |
|---|
| 1380 |
} |
|---|
| 1381 |
|
|---|
| 1382 |
sub _validate_gene { |
|---|
| 1383 |
my $gene = shift; |
|---|
| 1384 |
|
|---|
| 1385 |
my @strands; |
|---|
| 1386 |
|
|---|
| 1387 |
|
|---|
| 1388 |
my $g_strand = $gene->{f}{_location}{_strand}; |
|---|
| 1389 |
if ($g_strand != 1 && $g_strand != -1) { |
|---|
| 1390 |
warn "Invalid strand in gene caught at " . |
|---|
| 1391 |
"FlyBase::validate_gene\n"; |
|---|
| 1392 |
return undef; |
|---|
| 1393 |
} |
|---|
| 1394 |
|
|---|
| 1395 |
|
|---|
| 1396 |
|
|---|
| 1397 |
push @strands, $g_strand; |
|---|
| 1398 |
|
|---|
| 1399 |
for my $transcript (@{$gene->{transcripts}}) { |
|---|
| 1400 |
|
|---|
| 1401 |
my $t_strand = $transcript->{f}{_location}{_strand}; |
|---|
| 1402 |
if ($t_strand != 1 && $t_strand != -1) { |
|---|
| 1403 |
warn "Invalid strand in transcript caught " . |
|---|
| 1404 |
"at FlyBase::validate_gene\n"; |
|---|
| 1405 |
return undef; |
|---|
| 1406 |
} |
|---|
| 1407 |
|
|---|
| 1408 |
|
|---|
| 1409 |
push @strands, $t_strand; |
|---|
| 1410 |
|
|---|
| 1411 |
for my $cds (@{$transcript->{cdss}}) { |
|---|
| 1412 |
|
|---|
| 1413 |
my $c_strand = $transcript->{f}{_location}{_strand}; |
|---|
| 1414 |
if ($c_strand != 1 && $c_strand != -1) { |
|---|
| 1415 |
warn "Invalid strand in cds caught " . |
|---|
| 1416 |
"at FlyBase::validate_gene\n"; |
|---|
| 1417 |
return undef; |
|---|
| 1418 |
} |
|---|
| 1419 |
|
|---|
| 1420 |
|
|---|
| 1421 |
push @strands, $c_strand; |
|---|
| 1422 |
} |
|---|
| 1423 |
|
|---|
| 1424 |
for my $exon (@{$transcript->{exons}}) { |
|---|
| 1425 |
|
|---|
| 1426 |
my $e_strand = $transcript->{f}{_location}{_strand}; |
|---|
| 1427 |
if ($e_strand != 1 && $e_strand != -1) { |
|---|
| 1428 |
warn "Invalid strand in exon caught " . |
|---|
| 1429 |
"at FlyBase::validate_gene\n"; |
|---|
| 1430 |
return undef; |
|---|
| 1431 |
} |
|---|
| 1432 |
|
|---|
| 1433 |
|
|---|
| 1434 |
push @strands, $e_strand; |
|---|
| 1435 |
} |
|---|
| 1436 |
} |
|---|
| 1437 |
|
|---|
| 1438 |
my $first = shift @strands; |
|---|
| 1439 |
return undef if grep {$first != $_} @strands; |
|---|
| 1440 |
|
|---|
| 1441 |
|
|---|
| 1442 |
return 1; |
|---|
| 1443 |
} |
|---|
| 1444 |
|
|---|
| 1445 |
sub _get_gene_seq { |
|---|
| 1446 |
my $g = shift; |
|---|
| 1447 |
my $seq_ref = shift; |
|---|
| 1448 |
|
|---|
| 1449 |
my $g_b = $g->{f}->start(); |
|---|
| 1450 |
my $g_e = $g->{f}->end(); |
|---|
| 1451 |
|
|---|
| 1452 |
my $start; |
|---|
| 1453 |
my $length; |
|---|
| 1454 |
my ($src_s, $src_e); |
|---|
| 1455 |
if (($g_b - 1) < 0){ |
|---|
| 1456 |
$start = $g_b - 1; |
|---|
| 1457 |
$length = abs($g_e - $g_b) + 1; |
|---|
| 1458 |
$src_s = 1; |
|---|
| 1459 |
$src_e = $g_e > length($$seq_ref) ? length($$seq_ref) : $g_e; |
|---|
| 1460 |
} |
|---|
| 1461 |
else { |
|---|
| 1462 |
$start = $g_b - 1; |
|---|
| 1463 |
$length = abs($g_e - $g_b) + 1; |
|---|
| 1464 |
$src_s = $g_b; |
|---|
| 1465 |
$src_e = $g_e > length($$seq_ref) ? length($$seq_ref) : $g_e; |
|---|
| 1466 |
} |
|---|
| 1467 |
|
|---|
| 1468 |
my $g_seq = substr($$seq_ref, $start, $length); |
|---|
| 1469 |
|
|---|
| 1470 |
return ($g_seq, $src_s, $src_e); |
|---|
| 1471 |
} |
|---|
| 1472 |
|
|---|
| 1473 |
sub _load_seqs { |
|---|
| 1474 |
my $genes = shift; |
|---|
| 1475 |
my $seq_ref = shift; |
|---|
| 1476 |
|
|---|
| 1477 |
foreach my $g (@{$genes}){ |
|---|
| 1478 |
my ($g_seg_seq, $src_start, $src_end) = _get_gene_seq($g, $seq_ref); |
|---|
| 1479 |
|
|---|
| 1480 |
$g->{seq} = $g_seg_seq; |
|---|
| 1481 |
$g->{src_s} = $src_start; |
|---|
| 1482 |
$g->{src_e} = $src_end; |
|---|
| 1483 |
$g->{i_start} = 1; |
|---|
| 1484 |
$g->{i_end} = length($g_seg_seq); |
|---|
| 1485 |
|
|---|
| 1486 |
foreach my $t (@{$g->{mRNAs}}){ |
|---|
| 1487 |
foreach my $e (@{$t->{exons}}){ |
|---|
| 1488 |
my $e_seq = _get_exon_seq($e, $seq_ref); |
|---|
| 1489 |
|
|---|
| 1490 |
$e->{seq} = $e_seq; |
|---|
| 1491 |
$e->{i_start} = $e->{f}->start() - $g->{src_s} + 1; |
|---|
| 1492 |
|
|---|
| 1493 |
$e->{i_end} = $e->{f}->end() - $g->{src_s} + 1; |
|---|
| 1494 |
} |
|---|
| 1495 |
foreach my $c (@{$t->{cdss}}){ |
|---|
| 1496 |
my $c_seq = _get_exon_seq($c, $seq_ref); |
|---|
| 1497 |
|
|---|
| 1498 |
$c->{seq} = $c_seq; |
|---|
| 1499 |
$c->{i_start} = $c->{f}->start() - $g->{src_s} + 1; |
|---|
| 1500 |
|
|---|
| 1501 |
$c->{i_end} = $c->{f}->end() - $g->{src_s} + 1; |
|---|
| 1502 |
} |
|---|
| 1503 |
|
|---|
| 1504 |
my $t_seq = _get_mRNA_seq($t, $seq_ref); |
|---|
| 1505 |
my $cds_seq = _get_cds_seq($t, $seq_ref); |
|---|
| 1506 |
|
|---|
| 1507 |
|
|---|
| 1508 |
my $index = index($t_seq, $cds_seq) if($cds_seq); |
|---|
| 1509 |
|
|---|
| 1510 |
$t->{seq} = $t_seq; |
|---|
| 1511 |
$t->{cds_seq} = $cds_seq; |
|---|
| 1512 |
|
|---|
| 1513 |
$t->{i_start} = $t->{f}->start() - $g->{src_s} + 1; |
|---|
| 1514 |
$t->{i_end} = $t->{f}->end() - $g->{src_s} + 1; |
|---|
| 1515 |
|
|---|
| 1516 |
} |
|---|
| 1517 |
} |
|---|
| 1518 |
} |
|---|
| 1519 |
|
|---|
| 1520 |
sub _get_cds_seq { |
|---|
| 1521 |
my $t = shift; |
|---|
| 1522 |
my $seq_ref = shift; |
|---|
| 1523 |
|
|---|
| 1524 |
my $sorted = _sort_cdss($t); |
|---|
| 1525 |
|
|---|
| 1526 |
my $cds_seq; |
|---|
| 1527 |
foreach my $c (@{$sorted}){ |
|---|
| 1528 |
my $c_seq = $c->{seq} || _get_exon_seq($c, $seq_ref); |
|---|
| 1529 |
$cds_seq .= $c_seq; |
|---|
| 1530 |
|
|---|
| 1531 |
} |
|---|
| 1532 |
return $cds_seq; |
|---|
| 1533 |
} |
|---|
| 1534 |
|
|---|
| 1535 |
sub _get_mRNA_seq { |
|---|
| 1536 |
my $t = shift; |
|---|
| 1537 |
my $seq_ref = shift; |
|---|
| 1538 |
|
|---|
| 1539 |
my $sorted = _sort_exons($t); |
|---|
| 1540 |
|
|---|
| 1541 |
my $transcript; |
|---|
| 1542 |
foreach my $e (@{$sorted}){ |
|---|
| 1543 |
my $exon_seq = $e->{seq} || _get_exon_seq($e, $seq_ref); |
|---|
| 1544 |
$transcript .= $exon_seq; |
|---|
| 1545 |
|
|---|
| 1546 |
} |
|---|
| 1547 |
return $transcript; |
|---|
| 1548 |
} |
|---|
| 1549 |
|
|---|
| 1550 |
sub _get_exon_seq { |
|---|
| 1551 |
my $e = shift; |
|---|
| 1552 |
my $seq_ref = shift; |
|---|
| 1553 |
|
|---|
| 1554 |
my $e_b = $e->{f}->start(); |
|---|
| 1555 |
my $e_e = $e->{f}->end(); |
|---|
| 1556 |
|
|---|
| 1557 |
my $length = abs($e_e - $e_b) + 1; |
|---|
| 1558 |
|
|---|
| 1559 |
my $exon_seq = substr($$seq_ref, $e_b - 1, $length); |
|---|
| 1560 |
|
|---|
| 1561 |
$exon_seq = Fasta::revComp($exon_seq) if $e->{f}->strand() == -1; |
|---|
| 1562 |
|
|---|
| 1563 |
return $exon_seq; |
|---|
| 1564 |
} |
|---|
| 1565 |
|
|---|
| 1566 |
sub _sort_exons { |
|---|
| 1567 |
my $t = shift; |
|---|
| 1568 |
|
|---|
| 1569 |
my @sorted; |
|---|
| 1570 |
|
|---|
| 1571 |
if ($t->{f}->strand() == 1){ |
|---|
| 1572 |
@sorted = |
|---|
| 1573 |
sort {$a->{f}->start <=> $b->{f}->start} @{$t->{exons}}; |
|---|
| 1574 |
} |
|---|
| 1575 |
elsif ($t->{f}->strand() == -1){ |
|---|
| 1576 |
@sorted = |
|---|
| 1577 |
sort {$b->{f}->start <=> $a->{f}->start} @{$t->{exons}}; |
|---|
| 1578 |
} |
|---|
| 1579 |
else { |
|---|
| 1580 |
die "ERROR: Unknown strand in GFFDB::_sort_exons!\n"; |
|---|
| 1581 |
} |
|---|
| 1582 |
return \@sorted; |
|---|
| 1583 |
} |
|---|
| 1584 |
|
|---|
| 1585 |
sub _sort_cdss { |
|---|
| 1586 |
my $t = shift; |
|---|
| 1587 |
|
|---|
| 1588 |
my @sorted; |
|---|
| 1589 |
if ($t->{f}->strand() == 1){ |
|---|
| 1590 |
@sorted = |
|---|
| 1591 |
sort {$a->{f}->start <=> $b->{f}->start} @{$t->{cdss}}; |
|---|
| 1592 |
} |
|---|
| 1593 |
elsif ($t->{f}->strand() == -1){ |
|---|
| 1594 |
@sorted = |
|---|
| 1595 |
sort {$b->{f}->start <=> $a->{f}->start} @{$t->{cdss}}; |
|---|
| 1596 |
} |
|---|
| 1597 |
else { |
|---|
| 1598 |
die "Error: unknown strand in GFFDB::_sort_cdss!\n"; |
|---|
| 1599 |
} |
|---|
| 1600 |
return \@sorted; |
|---|
| 1601 |
} |
|---|
| 1602 |
|
|---|
| 1603 |
sub _get_maker_qi { |
|---|
| 1604 |
my $mRNA = shift; |
|---|
| 1605 |
|
|---|
| 1606 |
my $name = $mRNA->{name}; |
|---|
| 1607 |
|
|---|
| 1608 |
return 'NA' unless defined $name; |
|---|
| 1609 |
|
|---|
| 1610 |
my ($maker_qi) = $name =~ /QI:([\d\|\.-]+)/; |
|---|
| 1611 |
return $maker_qi; |
|---|
| 1612 |
} |
|---|
| 1613 |
|
|---|
| 1614 |
|
|---|
| 1615 |
1; |
|---|
| 1616 |
|
|---|
| 1617 |
|
|---|