Changeset 242
- Timestamp:
- 08/18/09 14:25:27 (3 months ago)
- Files:
-
- bin/maker2chado (modified) (5 diffs)
- lib/GI.pm (modified) (54 diffs)
- lib/PhatHit_utils.pm (modified) (3 diffs)
- lib/Process/MpiChunk.pm (modified) (15 diffs)
- lib/Widget/genemark.pm (modified) (2 diffs)
- lib/Widget/genemark/gmhmm_wrap (added)
- lib/Widget/genemark/gmhmme3_wrap (deleted)
- lib/maker/auto_annotator.pm (modified) (19 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
bin/maker2chado
r241 r242 152 152 $DBH->do(qq{ INSERT INTO organism (abbreviation, genus, species, common_name) VALUES ('$jid', 'JOB', '$jid', '$jid') }); 153 153 $DBH->commit; 154 $o_list = $DBH->selectcol_arrayref(qq{SELECT abbreviation FROM organism}); 155 $c_list = $DBH->selectcol_arrayref(qq{SELECT common_name FROM organism}); 154 156 } 155 157 } … … 159 161 die "FATAL: This script must be run interactively if no valid organism is supplied\n\n" if(! -t); 160 162 161 for(my $i = 0; $i <= abs(@$ o_list/10); $i ++){163 for(my $i = 0; $i <= abs(@$c_list/10); $i ++){ 162 164 my @menu; 163 165 for(my $j = $i*10; $j < $i*10+10; $j++){ 164 push(@menu, $ o_list->[$j]) if(exists $o_list->[$j]);166 push(@menu, $c_list->[$j]) if(exists $c_list->[$j]); 165 167 } 166 168 167 push(@menu, "Show more organisms -->") if($i + 1 < abs(@$ o_list/10));169 push(@menu, "Show more organisms -->") if($i + 1 < abs(@$c_list/10)); 168 170 push(@menu, "<-- Previous list") if($i > 0); 169 171 push(@menu, "<Add a new organism to the database>"); … … 222 224 $DBH->commit; 223 225 $o_list = $DBH->selectcol_arrayref(qq{SELECT abbreviation FROM organism}); 226 $c_list = $DBH->selectcol_arrayref(qq{SELECT common_name FROM organism}); 224 227 $i = -1; 225 228 next; … … 318 321 #-------subs-------- 319 322 sub remove_db_menu{ 320 for(my $i = 0; $i < abs(@$ o_list/10); $i ++){323 for(my $i = 0; $i < abs(@$c_list/10); $i ++){ 321 324 my @menu; 322 325 for(my $j = $i*10; $j < $i*10+10; $j++){ 323 push(@menu, $ o_list->[$j]) if(exists $o_list->[$j]);326 push(@menu, $c_list->[$j]) if(exists $c_list->[$j]); 324 327 } 325 328 326 push(@menu, "Show more organisms -->") if($i + 1 < abs(@$ o_list/10));329 push(@menu, "Show more organisms -->") if($i + 1 < abs(@$c_list/10)); 327 330 push(@menu, "<-- Previous list") if($i > 0); 328 331 push(@menu, "<Go back to main menu>"); … … 347 350 $DBH->commit; 348 351 $o_list = $DBH->selectcol_arrayref(qq{SELECT abbreviation FROM organism}); 349 352 $c_list = $DBH->selectcol_arrayref(qq{SELECT common_name FROM organism}); 353 350 354 return; 351 355 } lib/GI.pm
r239 r242 242 242 my $keepers = blastx($chunk, 243 243 $t_file, 244 undef, 244 245 $the_void, 245 246 $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, 246 $CTL_OPT{_blastx}, 247 $CTL_OPT{eval_blastx}, 248 $CTL_OPT{bit_blastx}, 249 $CTL_OPT{pcov_blastx}, 250 $CTL_OPT{pid_blastx}, 251 $CTL_OPT{split_hit}, 252 $CTL_OPT{cpus}, 247 \%CTL_OPT, 253 248 $LOG 254 249 ); … … 262 257 my $keepers = blastn($chunk, 263 258 $t_file, 259 undef, 264 260 $the_void, 265 261 $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, 266 $CTL_OPT{_blastn}, 267 $CTL_OPT{eval_blastn}, 268 $CTL_OPT{bit_blastn}, 269 $CTL_OPT{pcov_blastn}, 270 $CTL_OPT{pid_blastn}, 271 $CTL_OPT{split_hit}, 272 $CTL_OPT{cpus}, 262 \%CTL_OPT, 273 263 $LOG 274 264 ); … … 282 272 my $keepers = tblastx($chunk, 283 273 $t_file, 274 undef, 284 275 $the_void, 285 276 $p_safe_id.".".$piece->[0]->{b}.".".$piece->[0]->{e}, 286 $CTL_OPT{_tblastx}, 287 $CTL_OPT{eval_tblastx}, 288 $CTL_OPT{bit_tblastx}, 289 $CTL_OPT{pcov_tblastx}, 290 $CTL_OPT{pid_tblastx}, 291 $CTL_OPT{split_hit}, 292 $CTL_OPT{cpus}, 277 \%CTL_OPT, 293 278 $LOG 294 279 ); … … 800 785 801 786 #genemark sometimes fails if called directly so I built a wrapper 802 my $exe = "$FindBin::Bin/../lib/Widget/genemark/gmhmm e3_wrap";803 my $gm = $CTL_OPT->{ gmhmme3}; #genemark787 my $exe = "$FindBin::Bin/../lib/Widget/genemark/gmhmm_wrap"; 788 my $gm = $CTL_OPT->{organism_type} eq 'eukaryotic' ? $CTL_OPT->{gmhmme3} : $CTL_OPT->{gmhmmp}; #genemark 804 789 my $pro = $CTL_OPT->{probuild}; #helper exe 805 790 my $hmm = $CTL_OPT->{gmhmm}; … … 1234 1219 my $chunk = shift; 1235 1220 my $db = shift; 1236 my $the_void = shift; 1221 my $old_db = shift; 1222 my $the_void = shift; 1237 1223 my $seq_id = shift; 1238 my $blastn = shift; 1239 my $eval_blastn = shift; 1240 my $split_hit = shift; 1241 my $cpus = shift; 1242 my $old_db = shift; 1243 my $formater = shift; 1244 my $rank = shift; 1245 my $LOG = shift; 1246 my $LOG_FLAG = shift; 1224 my $CTL_OPT = shift; 1225 my $rank = shift; 1226 my $LOG = shift; 1227 my $LOG_FLAG = shift; 1228 1229 my $blastn = $CTL_OPT->{_blastn}; 1230 my $bit_blastn = $CTL_OPT->{bit_blastn}; 1231 my $eval_blastn = $CTL_OPT->{eval_blastn}; 1232 my $pcov_blastn = $CTL_OPT->{pcov_blastn}; 1233 my $pid_blastn = $CTL_OPT->{pid_blastn}; 1234 my $split_hit = $CTL_OPT->{split_hit}; 1235 my $cpus = $CTL_OPT->{cpus}; 1236 my $formater = $CTL_OPT->{_formater}; 1247 1237 1248 1238 #build names for files to use and copy … … 1292 1282 $split_hit, 1293 1283 $cpus, 1284 $CTL_OPT->{organism_type} 1294 1285 ); 1295 1286 … … 1300 1291 #----------------------------------------------------------------------------- 1301 1292 sub collect_blastn{ 1302 my $chunk = shift; 1303 my $blast_dir = shift; 1304 my $eval_blastn = shift; 1305 my $bit_blastn = shift, 1306 my $pcov_blastn = shift; 1307 my $pid_blastn = shift; 1308 my $split_hit = shift; 1309 my $LOG = shift; 1293 my $chunk = shift; 1294 my $blast_dir = shift; 1295 my $CTL_OPT = shift; 1296 my $LOG = shift; 1297 1298 my $eval_blastn = $CTL_OPT->{eval_blastn}; 1299 my $bit_blastn = $CTL_OPT->{bit_blastn}, 1300 my $pcov_blastn = $CTL_OPT->{pcov_blastn}; 1301 my $pid_blastn = $CTL_OPT->{pid_blastn}; 1302 my $split_hit = $CTL_OPT->{split_hit}; 1310 1303 1311 1304 my $blast_finished = $blast_dir; … … 1357 1350 my $chunk = shift; 1358 1351 my $db = shift; 1359 my $the_void = shift; 1352 my $old_db = shift; 1353 my $the_void = shift; 1360 1354 my $seq_id = shift; 1361 my $blastn = shift; 1362 my $eval_blastn = shift; 1363 my $bit_blastn = shift, 1364 my $pcov_blastn = shift; 1365 my $pid_blastn = shift; 1366 my $split_hit = shift; 1367 my $cpus = shift; 1368 my $LOG = shift; 1355 my $CTL_OPT = shift; 1356 my $rank = shift; 1357 my $LOG = shift; 1358 1359 my $blastn = $CTL_OPT->{_blastn}; 1360 my $bit_blastn = $CTL_OPT->{bit_blastn}; 1361 my $eval_blastn = $CTL_OPT->{eval_blastn}; 1362 my $pcov_blastn = $CTL_OPT->{pcov_blastn}; 1363 my $pid_blastn = $CTL_OPT->{pid_blastn}; 1364 my $split_hit = $CTL_OPT->{split_hit}; 1365 my $cpus = $CTL_OPT->{cpus}; 1366 my $formater = $CTL_OPT->{_formater}; 1369 1367 1370 1368 my ($db_n) = $db =~ /([^\/]+)$/; … … 1386 1384 $split_hit, 1387 1385 $cpus, 1388 ); 1386 $CTL_OPT->{organism_type} 1387 ); 1389 1388 1390 1389 my %params; … … 1418 1417 my $split_hit = shift; 1419 1418 my $cpus = shift; 1419 my $org_type = shift; 1420 1420 1421 1421 my $command = $blast; … … 1429 1429 $command .= " Q=3"; 1430 1430 $command .= " Z=1000"; 1431 $command .= " Y=500000000";1431 $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; 1432 1432 $command .= " cpus=$cpus"; 1433 $command .= " topcomboN=1";1434 $command .= " hspmax=100";1435 $command .= " gspmax=100";1436 $command .= " hspsepqmax=$split_hit";1433 $command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : ""; 1434 $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; 1435 $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; 1436 $command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : ""; 1437 1437 $command .= " lcmask"; 1438 1438 $command .= " gi"; 1439 $command .= ($org_type eq 'eukaryotic') ? "" : " kap"; 1439 1440 #$command .= " mformat=2"; # remove for full report 1440 1441 $command .= " -o $out_file"; … … 1449 1450 $command .= " -G 3"; 1450 1451 $command .= " -z 1000"; 1451 $command .= " -Y 500000000";1452 $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; 1452 1453 $command .= " -a $cpus"; 1453 $command .= " -K 100";1454 $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; 1454 1455 $command .= " -U"; 1455 1456 $command .= " -F T"; … … 1477 1478 #----------------------------------------------------------------------------- 1478 1479 sub blastx_as_chunks { 1479 my $chunk = shift; 1480 my $db = shift; 1481 my $the_void = shift; 1482 my $seq_id = shift; 1483 my $blastx = shift; 1484 my $eval_blastx = shift; 1485 my $split_hit = shift; 1486 my $cpus = shift; 1487 my $old_db = shift; 1488 my $formater = shift; 1489 my $rank = shift; 1490 my $LOG = shift; 1491 my $LOG_FLAG = shift; 1492 my $softmask = shift; 1480 my $chunk = shift; 1481 my $db = shift; 1482 my $old_db = shift; 1483 my $the_void = shift; 1484 my $seq_id = shift; 1485 my $CTL_OPT = shift; 1486 my $rank = shift; 1487 my $LOG = shift; 1488 my $LOG_FLAG = shift; 1489 1490 my $rflag = 1 if($old_db && $CTL_OPT->{repeat_protein} eq $old_db); #am I running repeat data? 1491 1492 my $blastx = $CTL_OPT->{_blastx}; 1493 my $bit_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx}; 1494 my $eval_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{eval_blastx}; 1495 my $pcov_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pcov_blastx}; 1496 my $pid_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pid_blastx}; 1497 my $split_hit = $CTL_OPT->{split_hit}; #repeat proteins get shatttered later anyway 1498 my $cpus = $CTL_OPT->{cpus}; 1499 my $formater = $CTL_OPT->{_formater}; 1500 my $softmask = ($rflag) ? 1 : $CTL_OPT->{softmask}; #always on for repeats 1493 1501 1494 1502 #build names for files to use and copy … … 1538 1546 $split_hit, 1539 1547 $cpus, 1540 $softmask 1548 $softmask, 1549 $CTL_OPT->{organism_type} 1541 1550 ); 1542 1551 … … 1547 1556 #----------------------------------------------------------------------------- 1548 1557 sub collect_blastx{ 1549 my $chunk = shift; 1550 my $blast_dir = shift; 1551 my $eval_blastx = shift; 1552 my $bit_blastx = shift, 1553 my $pcov_blastx = shift; 1554 my $pid_blastx = shift; 1555 my $split_hit = shift; 1556 my $LOG = shift; 1558 my $chunk = shift; 1559 my $blast_dir = shift; 1560 my $CTL_OPT = shift; 1561 my $LOG = shift; 1562 1563 my $eval_blastx = $CTL_OPT->{eval_blastx}; 1564 my $bit_blastx = $CTL_OPT->{bit_blastx}, 1565 my $pcov_blastx = $CTL_OPT->{pcov_blastx}; 1566 my $pid_blastx = $CTL_OPT->{pid_blastx}; 1567 my $split_hit = $CTL_OPT->{split_hit}; 1557 1568 1558 1569 my $blast_finished = $blast_dir; … … 1604 1615 my $chunk = shift; 1605 1616 my $db = shift; 1606 my $the_void = shift; 1617 my $old_db = shift; 1618 my $the_void = shift; 1607 1619 my $seq_id = shift; 1608 my $blastx = shift; 1609 my $eval_blastx = shift; 1610 my $bit_blastx = shift; 1611 my $pcov_blastx = shift; 1612 my $pid_blastx = shift; 1613 my $split_hit = shift; 1614 my $cpus = shift; 1615 my $LOG = shift; 1616 my $softmask = shift; 1620 my $CTL_OPT = shift; 1621 my $rank = shift; 1622 my $LOG = shift; 1623 1624 my $rflag = 1 if($old_db && $CTL_OPT->{repeat_protein} eq $old_db); #am I running repeat data? 1625 1626 my $blastx = $CTL_OPT->{_blastx}; 1627 my $bit_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{bit_blastx}; 1628 my $eval_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{eval_blastx}; 1629 my $pcov_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pcov_blastx}; 1630 my $pid_blastx = ($rflag) ? $CTL_OPT->{bit_rm_blastx} : $CTL_OPT->{pid_blastx}; 1631 my $split_hit = $CTL_OPT->{split_hit}; #repeat proteins get shatttered later anyway 1632 my $cpus = $CTL_OPT->{cpus}; 1633 my $formater = $CTL_OPT->{_formater}; 1634 my $softmask = ($rflag) ? 1 : $CTL_OPT->{softmask}; 1617 1635 1618 1636 my ($db_n) = $db =~ /([^\/]+)$/; … … 1635 1653 $split_hit, 1636 1654 $cpus, 1655 $softmask, 1656 $CTL_OPT->{organism_type} 1637 1657 ); 1638 1658 … … 1685 1705 my $cpus = shift; 1686 1706 my $softmask = shift; 1707 my $org_type = shift; 1687 1708 1688 1709 my $command = $blast; … … 1694 1715 #$command .= " wink=5"; 1695 1716 $command .= " Z=300"; 1696 $command .= " Y=500000000";1697 $command .= " hspmax=100";1717 $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; 1718 $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; 1698 1719 $command .= " cpus=$cpus"; 1699 $command .= " gspmax=100";1720 $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; 1700 1721 #$command .= " hspsepqmax=10000"; 1701 1722 $command .= " lcmask"; … … 1709 1730 $command .= " -d $db -i $q_file -b 100000 -v 100000 -e $eval_blast"; 1710 1731 $command .= " -z 300"; 1711 $command .= " -Y 500000000";1732 $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; 1712 1733 $command .= " -a $cpus"; 1713 $command .= " -K 100";1734 $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; 1714 1735 $command .= " -U"; 1715 1736 $command .= " -F T"; … … 1740 1761 my $chunk = shift; 1741 1762 my $db = shift; 1742 my $the_void = shift; 1763 my $old_db = shift; 1764 my $the_void = shift; 1743 1765 my $seq_id = shift; 1744 my $tblastx = shift; 1745 my $eval_tblastx = shift; 1746 my $split_hit = shift; 1747 my $cpus = shift; 1748 my $old_db = shift; 1749 my $formater = shift; 1750 my $rank = shift; 1751 my $LOG = shift; 1752 my $LOG_FLAG = shift; 1753 my $softmask = shift; 1766 my $CTL_OPT = shift; 1767 my $rank = shift; 1768 my $LOG = shift; 1769 my $LOG_FLAG = shift; 1770 1771 my $tblastx = $CTL_OPT->{_tblastx}; 1772 my $bit_tblastx = $CTL_OPT->{bit_tblastx}; 1773 my $eval_tblastx = $CTL_OPT->{eval_tblastx}; 1774 my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; 1775 my $pid_tblastx = $CTL_OPT->{pid_tblastx}; 1776 my $split_hit = $CTL_OPT->{split_hit}; 1777 my $cpus = $CTL_OPT->{cpus}; 1778 my $formater = $CTL_OPT->{_formater}; 1779 my $softmask = $CTL_OPT->{softmask}; 1754 1780 1755 1781 #build names for files to use and copy … … 1799 1825 $split_hit, 1800 1826 $cpus, 1801 $softmask 1827 $softmask, 1828 $CTL_OPT->{organism_type} 1802 1829 ); 1803 1830 … … 1808 1835 #----------------------------------------------------------------------------- 1809 1836 sub collect_tblastx{ 1810 my $chunk = shift; 1811 my $blast_dir = shift; 1812 my $eval_tblastx = shift; 1813 my $bit_tblastx = shift, 1814 my $pcov_tblastx = shift; 1815 my $pid_tblastx = shift; 1816 my $split_hit = shift; 1817 my $LOG = shift; 1837 my $chunk = shift; 1838 my $blast_dir = shift; 1839 my $CTL_OPT = shift; 1840 my $LOG = shift; 1841 1842 my $eval_tblastx = $CTL_OPT->{eval_tblastx}; 1843 my $bit_tblastx = $CTL_OPT->{bit_tblastx}, 1844 my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; 1845 my $pid_tblastx = $CTL_OPT->{pid_tblastx}; 1846 my $split_hit = $CTL_OPT->{split_hit}; 1818 1847 1819 1848 my $blast_finished = $blast_dir; … … 1865 1894 my $chunk = shift; 1866 1895 my $db = shift; 1867 my $the_void = shift; 1896 my $old_db = shift; 1897 my $the_void = shift; 1868 1898 my $seq_id = shift; 1869 my $tblastx = shift; 1870 my $eval_tblastx = shift; 1871 my $bit_tblastx = shift, 1872 my $pcov_tblastx = shift; 1873 my $pid_tblastx = shift; 1874 my $split_hit = shift; 1875 my $cpus = shift; 1876 my $LOG = shift; 1877 my $softmask = shift; 1899 my $CTL_OPT = shift; 1900 my $rank = shift; 1901 my $LOG = shift; 1902 1903 my $tblastx = $CTL_OPT->{_tblastx}; 1904 my $bit_tblastx = $CTL_OPT->{bit_tblastx}; 1905 my $eval_tblastx = $CTL_OPT->{eval_tblastx}; 1906 my $pcov_tblastx = $CTL_OPT->{pcov_tblastx}; 1907 my $pid_tblastx = $CTL_OPT->{pid_tblastx}; 1908 my $split_hit = $CTL_OPT->{split_hit}; 1909 my $cpus = $CTL_OPT->{cpus}; 1910 my $formater = $CTL_OPT->{_formater}; 1911 my $softmask = $CTL_OPT->{softmask}; 1878 1912 1879 1913 my ($db_n) = $db =~ /([^\/]+)$/; … … 1895 1929 $split_hit, 1896 1930 $cpus, 1931 $softmask, 1932 $CTL_OPT->{organism_type} 1897 1933 ); 1898 1934 … … 1928 1964 my $cpus = shift; 1929 1965 my $softmask = shift; 1966 my $org_type = shift; 1930 1967 1931 1968 my $command = $blast; … … 1935 1972 #$command .= " W=15"; 1936 1973 $command .= " Z=1000"; 1937 $command .= " Y=500000000";1974 $command .= ($org_type eq 'eukaryotic') ? " Y=500000000" : " Y=20000000"; 1938 1975 $command .= " cpus=$cpus"; 1939 $command .= " topcomboN=1";1940 $command .= " hspmax=100";1941 $command .= " gspmax=100";1942 $command .= " hspsepqmax=$split_hit";1976 $command .= ($org_type eq 'eukaryotic') ? " topcomboN=1" : ""; 1977 $command .= ($org_type eq 'eukaryotic') ? " hspmax=100" : " hspmax=5"; 1978 $command .= ($org_type eq 'eukaryotic') ? " gspmax=100" : " gspmax=5"; 1979 $command .= ($org_type eq 'eukaryotic') ? " hspsepqmax=$split_hit" : ""; 1943 1980 $command .= " lcmask"; 1944 1981 $command .= " gi"; 1982 $command .= ($org_type eq 'eukaryotic') ? "" : " kap"; 1945 1983 #$command .= " mformat=2"; # remove for full report 1946 1984 $command .= " -o $out_file"; … … 1951 1989 #$command .= " -W 15"; 1952 1990 $command .= " -z 1000"; 1953 $command .= " -Y 500000000";1991 $command .= ($org_type eq 'eukaryotic') ? " -Y 500000000" : " -Y 20000000"; 1954 1992 $command .= " -a $cpus"; 1955 $command .= " -K 100";1993 $command .= ($org_type eq 'eukaryotic') ? " -K 100" : " -K 5"; 1956 1994 $command .= " -U"; 1957 1995 $command .= " -F T"; … … 2121 2159 $CTL_OPT{'rmlib'} = ''; 2122 2160 $CTL_OPT{'rm_gff'} = ''; 2161 $CTL_OPT{'organism_type'} = 'eukaryotic'; 2123 2162 $CTL_OPT{'predictor'} = 'est2genome'; 2124 2163 $CTL_OPT{'predictor'} = 'model_gff' if($main::eva); … … 2137 2176 $CTL_OPT{'min_contig'} = 1; 2138 2177 $CTL_OPT{'split_hit'} = 10000; 2178 $CTL_OPT{'softmask'} = 1; 2139 2179 $CTL_OPT{'pred_flank'} = 200; 2140 2180 $CTL_OPT{'single_exon'} = 0; … … 2162 2202 if ($type eq 'all' || $type eq 'bopts') { 2163 2203 $CTL_OPT{'blast_type'} = 'wublast'; 2164 $CTL_OPT{'softmask'} = 1;2165 2204 $CTL_OPT{'pcov_blastn'} = 0.80; 2166 2205 $CTL_OPT{'pid_blastn'} = 0.85; … … 2200 2239 'snap', 2201 2240 'gmhmme3', 2241 'gmhmmp', 2202 2242 'augustus', 2203 2243 'fgenesh', … … 2286 2326 2287 2327 #skip repeat masking command line option 2288 if ($OPT{R} ) {2328 if ($OPT{R} || $CTL_OPT{organism_type} eq 'prokaryotic') { 2289 2329 $CTL_OPT{model_org} = ''; 2290 2330 $CTL_OPT{repeat_protein} = ''; … … 2292 2332 $CTL_OPT{rm_gff} = ''; 2293 2333 $CTL_OPT{rm_pass} = 0; 2334 2335 print STDERR "INFO: "; 2336 print STDERR "No repeats expected in prokaryotic organisms.\n" 2337 if($CTL_OPT{organism_type} eq 'prokaryotic'); 2338 print STDERR "Now removing all repeat masking options.\n\n"; 2294 2339 } 2295 2340 2296 2341 #if no repeat masking options are set don't run masking dependent methods 2342 #i.e. unmasked predictions 2297 2343 if ($CTL_OPT{model_org} eq '' && 2298 2344 $CTL_OPT{repeat_protein} eq '' && … … 2302 2348 $CTL_OPT{genome_gff} eq '') 2303 2349 ) { 2350 warn "WARNING: There are no masking options set, yet unmask is set to 1.\n". 2351 "This is not valid. The value for unmask will be set to 0.\n\n" 2352 if($CTL_OPT{unmask} == 1); 2353 2354 $CTL_OPT{unmask} = 0; 2304 2355 $CTL_OPT{_no_mask} = 1; #no masking options found 2305 2356 } … … 2314 2365 2315 2366 my %uniq; 2367 $CTL_OPT{_predictor} = []; 2316 2368 foreach my $p (@predictors) { 2317 if ($p !~ /^snap$|^augustus$|^est2genome$|^ fgenesh$/ &&2369 if ($p !~ /^snap$|^augustus$|^est2genome$|^protein2genome$|^fgenesh$/ && 2318 2370 $p !~ /^genemark$|^jigsaw$|^model_gff$|^pred_gff$/ 2319 2371 ) { … … 2322 2374 "snap, genemark, augustus, and fgenesh\n\n"; 2323 2375 } 2324 else { 2325 push(@{$CTL_OPT{_predictor}}, $p) unless($uniq{$p}); 2376 elsif($CTL_OPT{organism_type} eq 'prokaryotic'){ 2377 if($p =~ /^snap$|^augustus$|^fgenesh$|^jigsaw$/){ 2378 warn "WARNING: the predictor $p does not support prokaryotic organisms\n". 2379 "and will be ignored.\n\n"; 2380 } 2381 else{ 2382 push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p} || $p !~ /genemark/); 2383 push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p}); 2384 $uniq{$p}++; 2385 } 2386 } 2387 elsif($CTL_OPT{organism_type} eq 'eukaryotic'){ 2388 push(@{$CTL_OPT{_predictor}}, $p) unless(exists $uniq{$p}); 2326 2389 if($p =~ /^snap$|^augustus$|^fgenesh$|^genemark$|^jigsaw$/){ 2327 push(@{$CTL_OPT{_run}}, $p) unless( $uniq{$p});2390 push(@{$CTL_OPT{_run}}, $p) unless(exists $uniq{$p}); 2328 2391 } 2329 2392 $uniq{$p}++; 2330 2393 } 2331 } 2332 2394 else{ 2395 die "FATAL: Logic error in GI::load_contol_files\n"; 2396 } 2397 } 2398 2399 #reset predictor value based on previous filtering step (for log) 2400 $CTL_OPT{predictor} = join(",", @{$CTL_OPT{_predictor}}); 2401 2333 2402 #parse run and error check 2334 2403 $CTL_OPT{run} =~ s/\s+//g; … … 2350 2419 warn "WARNING: blast_type must be set to \'wublast\' or \'ncbi\'.\n", 2351 2420 "The value $CTL_OPT{blast_type} is invalid.\n", 2352 "This will now be reset to the default 'wublast' ";2421 "This will now be reset to the default 'wublast'.\n\n"; 2353 2422 2354 2423 $CTL_OPT{blast_type} = 'wublast'; … … 2362 2431 ) { 2363 2432 warn "WARNING: blast_type is set to \'wublast\' but wublast executables\n", 2364 "can not be located. NCBI blast will be used instead.\n ";2433 "can not be located. NCBI blast will be used instead.\n\n"; 2365 2434 2366 2435 $CTL_OPT{blast_type} = 'ncbi'; … … 2374 2443 ) { 2375 2444 warn "WARNING: blast_type is set to \'ncbi\' but ncbi executables\n", 2376 "can not be located. WUBLAST blast will be used instead.\n ";2445 "can not be located. WUBLAST blast will be used instead.\n\n"; 2377 2446 2378 2447 $CTL_OPT{blast_type} = 'wublast'; 2379 2448 } 2380 2449 2450 #use standard value to refer to both NCBI and WUBLAST 2381 2451 if ($CTL_OPT{blast_type} =~ /^wublast$/) { 2382 2452 $CTL_OPT{_formater} = $CTL_OPT{xdformat}; … … 2390 2460 $CTL_OPT{_blastx} = $CTL_OPT{blastall}; 2391 2461 $CTL_OPT{_tblastx} = $CTL_OPT{blastall}; 2462 } 2463 2464 #check organism type values 2465 if ($CTL_OPT{organism_type} !~ /^eukaryotic$|^prokaryotic$/) { 2466 $error .= "ERROR: organism_type must be set to \'eukaryotic\' or \'prokaryotic\'.\n". 2467 "The value $CTL_OPT{organism_type} is invalid.\n\n"; 2468 2469 #set the default just to assist in populating remaining errors 2470 $CTL_OPT{organism_type} = 'eukaryotic'; 2392 2471 } 2393 2472 … … 2405 2484 push (@infiles, 'genome') if($CTL_OPT{genome}); 2406 2485 push (@infiles, 'genome') if(!$CTL_OPT{genome_gff} && !$main::eva); 2407 push (@infiles, 'exonerate') if($CTL_OPT{est});2408 push (@infiles, 'exonerate') if($CTL_OPT{protein});2409 push (@infiles, 'repeat_protein') if ($CTL_OPT{repeat_protein});2410 2486 push (@infiles, 'est') if($CTL_OPT{est}); 2411 2487 push (@infiles, 'protein') if($CTL_OPT{protein}); 2412 2488 push (@infiles, 'altest') if($CTL_OPT{altest}); 2413 2489 push (@infiles, 'est_reads') if($CTL_OPT{est_reads}); 2414 push (@infiles, 'RepeatMasker') if($CTL_OPT{rmlib});2415 push (@infiles, 'RepeatMasker') if($CTL_OPT{model_org});2416 push (@infiles, 'rmlib') if ($CTL_OPT{rmlib});2417 push (@infiles, 'snap') if (grep (/snap/, @{$CTL_OPT{_run}}));2418 push (@infiles, 'gmhmme3') if (grep (/genemark/, @{$CTL_OPT{_run}}));2419 2490 push (@infiles, 'probuild') if (grep (/genemark/, @{$CTL_OPT{_run}})); 2420 push (@infiles, 'augustus') if (grep (/augustus/, @{$CTL_OPT{_run}}));2421 push (@infiles, 'fgenesh') if (grep (/fgenesh/, @{$CTL_OPT{_run}}));2422 push (@infiles, 'twinscan') if (grep (/twinscan/, @{$CTL_OPT{_run}}));2423 push (@infiles, 'jigsaw') if (grep (/jigsaw/, @{$CTL_OPT{_run}}));2424 2491 push (@infiles, 'fathom') if ($CTL_OPT{enable_fathom} && $CTL_OPT{evaluate}); 2425 push (@infiles, 'rm_gff') if($CTL_OPT{rm_gff});2426 2492 push (@infiles, 'est_gff') if($CTL_OPT{est_gff}); 2427 2493 push (@infiles, 'protein_gff') if($CTL_OPT{protein_gff}); … … 2436 2502 !$CTL_OPT{model_pass}) 2437 2503 ); 2504 if($CTL_OPT{organism_type} eq 'eukaryotic'){ 2505 push (@infiles, 'exonerate') if($CTL_OPT{est}); 2506 push (@infiles, 'exonerate') if($CTL_OPT{protein}); 2507 push (@infiles, 'repeat_protein') if ($CTL_OPT{repeat_protein}); 2508 push (@infiles, 'RepeatMasker') if($CTL_OPT{rmlib}); 2509 push (@infiles, 'RepeatMasker') if($CTL_OPT{model_org}); 2510 push (@infiles, 'rm_gff') if($CTL_OPT{rm_gff}); 2511 push (@infiles, 'rmlib') if ($CTL_OPT{rmlib}); 2512 push (@infiles, 'snap') if (grep (/snap/, @{$CTL_OPT{_run}})); 2513 push (@infiles, 'gmhmme3') if (grep (/genemark/, @{$CTL_OPT{_run}})); 2514 push (@infiles, 'augustus') if (grep (/augustus/, @{$CTL_OPT{_run}})); 2515 push (@infiles, 'fgenesh') if (grep (/fgenesh/, @{$CTL_OPT{_run}})); 2516 push (@infiles, 'twinscan') if (grep (/twinscan/, @{$CTL_OPT{_run}})); 2517 push (@infiles, 'jigsaw') if (grep (/jigsaw/, @{$CTL_OPT{_run}})); 2518 } 2519 elsif($CTL_OPT{organism_type} eq 'prokaryotic'){ 2520 push (@infiles, 'gmhmmp') if (grep (/genemark/, @{$CTL_OPT{_run}})); 2521 } 2438 2522 2439 2523 #uniq the array … … 2447 2531 foreach my $in (@infiles) { 2448 2532 if (not $CTL_OPT{$in}) { 2449 $error .= "ERROR: You have failed to provide a value for \'$in\' in the control files \n\n";2533 $error .= "ERROR: You have failed to provide a value for \'$in\' in the control files.\n\n"; 2450 2534 next; 2451 2535 } 2452 2536 elsif (not -e $CTL_OPT{$in}) { 2453 2537 $error .= "ERROR: The \'$in\' file $CTL_OPT{$in} does not exist.\n". 2454 "Please check your control files: maker_opts.ctl, maker_bopts, or maker_exe.ctl \n\n";2538 "Please check your control files: maker_opts.ctl, maker_bopts, or maker_exe.ctl.\n\n"; 2455 2539 next; 2456 2540 } … … 2493 2577 if (! $CTL_OPT{fgenesh_par_file}) { 2494 2578 $error .= "ERROR: There is no parameter file secified for for FgenesH in\n". 2495 "maker_opts.ctl fgenesh_par_file \n\n";2579 "maker_opts.ctl fgenesh_par_file.\n\n"; 2496 2580 } 2497 2581 elsif (! -e $CTL_OPT{fgenesh_par_file}) { … … 2500 2584 } 2501 2585 if ($CTL_OPT{max_dna_len} < 50000) { 2502 warn "WARNING: \'max_dna_len\' is set too low. The minimum value permited is 50,000 \n".2586 warn "WARNING: \'max_dna_len\' is set too low. The minimum value permited is 50,000.\n". 2503 2587 "max_dna_len will be reset to 50,000\n\n"; 2504 2588 $CTL_OPT{max_dna_len} = 50000; 2589 } 2590 if ($CTL_OPT{split_hit} > 0 && $CTL_OPT{organism_type} eq 'prokaryotic') { 2591 warn "WARNING: \'split_hit\' is meaningless for prokaryotic genomes and will be set to 0.\n\n"; 2592 $CTL_OPT{split_hit} = 0; 2593 } 2594 if ($CTL_OPT{single_exon} == 0 && $CTL_OPT{organism_type} eq 'prokaryotic') { 2595 warn "WARNING: \'single_exon\' is required for prokaryotic genomes and will be set to 1.\n\n"; 2596 $CTL_OPT{single_exon} = 1; 2505 2597 } 2506 2598 if ($CTL_OPT{min_contig} <= 0) { … … 2520 2612 } 2521 2613 if($CTL_OPT{TMP} && ! -d $CTL_OPT{TMP}){ 2522 $error .= "The TMP value \'$CTL_OPT{TMP}\' is not a directory or does not exist \n";2614 $error .= "The TMP value \'$CTL_OPT{TMP}\' is not a directory or does not exist.\n\n"; 2523 2615 } 2524 2616 if($main::eva && $CTL_OPT{genome_gff} && $CTL_OPT{model_gff}){ #only for evaluator 2525 $error .= "You can only specify a GFF3 file for genome_gff or model_gff no both!!\n ";2617 $error .= "You can only specify a GFF3 file for genome_gff or model_gff no both!!\n\n"; 2526 2618 } 2527 2619 … … 2536 2628 if ($iterator->number_of_entries() == 0) { 2537 2629 my $genome = (! $CTL_OPT{genome}) ? $fasta_gff : $CTL_OPT{genome}; 2538 die "ERROR: The file $genome contains no fasta entries\n ";2630 die "ERROR: The file $genome contains no fasta entries\n\n"; 2539 2631 } 2540 2632 … … 2560 2652 $CTL_OPT{alt_peptide} = uc($CTL_OPT{alt_peptide}); 2561 2653 if ($CTL_OPT{alt_peptide} !~ /^[ABCDEFGHIKLMNPQRSTVWXYZ]$/) { 2562 warn "WARNING: Invalid alternate peptide \'$CTL_OPT{alt_peptide}\' \n",2563 "This will be set to the default 'C' \n";2654 warn "WARNING: Invalid alternate peptide \'$CTL_OPT{alt_peptide}\'.\n", 2655 "This will be set to the default 'C'.\n\n"; 2564 2656 $CTL_OPT{alt_peptide} = 'C'; 2565 2657 } … … 2593 2685 } 2594 2686 else{ 2595 die "ERROR: Another instance of maker is already running for this dataset\n"; 2596 } 2597 2598 2687 die "ERROR: Another instance of maker is already running for this dataset.\n\n"; 2688 } 2689 2599 2690 #--set up optional global TMP 2600 2691 if($CTL_OPT{TMP}){ … … 2661 2752 print OUT "#-----Gene Prediction Options\n" if(!$ev); 2662 2753 print OUT "#-----Evaluator Ab-Initio Comparison Options\n" if($ev); 2663 #print OUT "run:$O{run} #ab-initio methods to run (seperate multiple values by ',')\n"; 2754 print OUT "organism_type:$O{organism_type} #eukaryotic or prokaryotic. Default is eukaryotic\n"; 2755 print OUT "run:$O{run} #ab-initio methods to run (seperate multiple values by ',')\n" if($ev); 2664 2756 print OUT "predictor:$O{predictor} #prediction methods for annotations (seperate multiple values by ',')\n" if(!$ev); 2665 2757 print OUT "unmask:$O{unmask} #Also run ab-initio methods on unmasked sequence, 1 = yes, 0 = no\n"; … … 2683 2775 print OUT "min_contig:$O{min_contig} #all contigs from the input genome file below this size will be skipped\n"; 2684 2776 print OUT "min_protein:$O{min_protein} #all gene annotations must produce a protein of at least this many amino acids in length\n" if(!$ev); 2777 print OUT "softmask:$O{softmask} #use soft-masked rather than hard-masked seg filtering for wublast\n"; 2685 2778 print OUT "split_hit:$O{split_hit} #length for the splitting of hits (expected max intron size for evidence alignments)\n"; 2686 2779 print OUT "pred_flank:$O{pred_flank} #length of sequence surrounding EST and protein evidence used to extend gene predictions\n"; … … 2709 2802 print OUT "#-----BLAST and Exonerate Statistics Thresholds\n"; 2710 2803 print OUT "blast_type:$O{blast_type} #set to 'wublast' or 'ncbi'\n"; 2711 print OUT "softmask:$O{softmask} #use soft-masked rather than hard-masked seg filtering for wublast\n";2712 2804 print OUT "\n"; 2713 2805 print OUT "pcov_blastn:$O{pcov_blastn} #Blastn Percent Coverage Threhold EST-Genome Alignments\n"; … … 2757 2849 print OUT "snap:$O{snap} #location of snap executable\n"; 2758 2850 print OUT "gmhmme3:$O{gmhmme3} #location of eukaryotic genemark executable\n"; 2851 print OUT "gmhmmp:$O{gmhmmp} #location of prokaryotic genemark executable\n"; 2759 2852 print OUT "augustus:$O{augustus} #location of augustus executable\n"; 2760 2853 print OUT "fgenesh:$O{fgenesh} #location of fgenesh executable\n"; lib/PhatHit_utils.pm
r127 r242 121 121 } 122 122 #------------------------------------------------------------------------ 123 sub get_hit_coors { 124 my $hits = shift; 125 my $what = shift; 126 my @coors; 127 foreach my $hit (@{$hits}){ 128 push(@coors, [$hit->nB($what), $hit->nE($what)]); 129 } 130 return \@coors; 131 } 132 #------------------------------------------------------------------------ 123 133 sub split_hit_on_intron { 124 134 my $hits = shift; … … 245 255 my $hit = shift; 246 256 247 my $ref = ref($hit );257 my $ref = ref($hit->[0]); 248 258 249 259 my @new_hits; 250 my $i = 0;251 260 foreach my $hsp ($hit->hsps){ 252 261 … … 266 275 267 276 push(@new_hits, $new_hit); 268 $i++; 277 } 278 279 return \@new_hits; 280 } 281 #------------------------------------------------------------------------ 282 sub shatter_all_hits { 283 my $hits = shift; 284 285 my $ref = ref($hits->[0]); 286 287 my @new_hits; 288 foreach my $hit (@$hits){ 289 foreach my $hsp ($hit->hsps){ 290 291 my $new_hit = new $ref('-name' => $hit->name, 292 '-description' => $hit->description, 293 '-algorithm' => $hit->algorithm, 294 '-length' => $hit->length, 295 '-score' => $hsp->score, 296 '-bits' => $hsp->bits, 297 '-significance' => $hsp->significance 298 ); 299 300 $new_hit->queryLength($hit->queryLength); 301 $new_hit->{is_shattered} = 1; 302 303 $new_hit->add_hsp($hsp); 304 305 push(@new_hits, $new_hit); 306 } 307 } 308 309 return \@new_hits; 310 } 311 #------------------------------------------------------------------------ 312 #this method assumes that the features are all on the same strand 313 sub make_flat_hits { 314 my $hits = shift; 315 my $seq = shift; 316 317 return [] if(! @$hits); 318 319 my $ref = ref($hits->[0]); 320 my $hsp_ref = ref($hits->[0]->{_hsps}->[0]); 321 my $strand = $hits->[0]->strand; 322 323 my $coors = get_hit_coors($hits, 'query'); 324 my $pieces = Shadower::getPieces($seq, $coors, 0); 325 326 my @new_hits; 327 328 foreach my $p (@$pieces){ 329 my $nB = $p->{b}; 330 my $nE = $p->{e}; 331 my $length = abs($nE -$nB) + 1; 332 333 #natural end and beginning are non-normalized values 334 ($nB, $nE) = ($nE, $nB) if($nB < $nE && $strand == -1); 335 336 my $pSeq = $p->{piece}; 337 $pSeq = Fasta::revComp(\$pSeq) if($strand == -1); 338 339 my $new_hit = new $ref('-name' => "flat_hit_$nB\_$nE", 340 '-description' => '', 341 '-algorithm' => $hits->[0]->algorithm, 342 '-length' => $length, 343 '-score' => $length, 344 '-bits' => $length*2, 345 '-significance' => 0 346 ); 347 348 $new_hit->queryLength($length); 349 350 my @args; 351 352 push(@args, '-query_start'); 353 push(@args, $nB); 354 355 push(@args, '-query_seq'); 356 push(@args, $pSeq); 357 358 push(@args, '-score'); 359 push(@args, $length); 360 push(@args, '-homology_seq'); 361 push(@args, $pSeq); 362 363 push(@args, '-hit_start'); 364 push(@args, 1); 365 366 push(@args, '-hit_seq'); 367 push(@args, $pSeq); 368 369 push(@args, '-hsp_length'); 370 push(@args, $length); 371 372 push(@args, '-identical'); 373 push(@args, $length); 374 375 push(@args, '-hit_length'); 376 push(@args, $length); 377 378 push(@args, '-query_name'); 379 push(@args, $hits->[0]->{_hsps}->[0]->query_name); 380 381 push(@args, '-algorithm'); 382 push(@args, $hits->[0]->algorithm); 383 384 push(@args, '-bits'); 385 push(@args, $length*2); 386 387 push(@args, '-evalue'); 388 push(@args, 0); 389 390 push(@args, '-pvalue'); 391 push(@args, 0); 392 393 push(@args, '-query_length'); 394 push(@args, $length); 395 396 push(@args, '-query_end'); 397 push(@args, $nE); 398 399 push(@args, '-conserved'); 400 push(@args, $length); 401 402 push(@args, '-hit_name'); 403 push(@args, "flat_hit_$nB\_$nE"); 404 405 push(@args, '-hit_end'); 406 push(@args, $length); 407 408 push(@args, '-query_gaps'); 409 push(@args, 0); 410 411 push(@args, '-hit_gaps'); 412 push(@args, 0); 413 414 my $hsp = new $hsp_ref(@args); 415 $hsp->queryName($hits->[0]->{_hsps}->[0]->query_name); 416 #------------------------------------------------- 417 # setting strand because bioperl is all messed up! 418 #------------------------------------------------ 419 if ($strand == 1 ){ 420 $hsp->{_strand_hack}->{query} = 1; 421 $hsp->{_strand_hack}->{hit} = 1; 422 } 423 else { 424 $hsp->{_strand_hack}->{query} = -1; 425 $hsp->{_strand_hack}->{hit} = 1; 426 } 427 $new_hit->add_hsp($hsp); 428 429 push(@new_hits, $new_hit); 269 430 } 270 431 lib/Process/MpiChunk.pm
r229 r242 459 459 $res_dir = GI::blastx_as_chunks($chunk, 460 460 $db, 461 $CTL_OPT{repeat_protein}, 461 462 $the_void, 462 463 $safe_seq_id, 463 $CTL_OPT{_blastx}, 464 $CTL_OPT{eval_rm_blastx}, 465 $CTL_OPT{split_hit}, 466 $CTL_OPT{cpus}, 467 $CTL_OPT{repeat_protein}, 468 $CTL_OPT{_formater}, 464 \%CTL_OPT, 469 465 $self->{RANK}, 470 466 $LOG, 471 $LOG_FLAG, 472 1 467 $LOG_FLAG 473 468 ); 474 469 } … … 512 507 $rm_blastx_keepers = GI::collect_blastx($chunk, 513 508 $res_dir, 514 $CTL_OPT{eval_rm_blastx}, 515 $CTL_OPT{bit_rm_blastx}, 516 $CTL_OPT{pcov_rm_blastx}, 517 $CTL_OPT{pid_rm_blastx}, 518 $CTL_OPT{split_hit}, 509 \%CTL_OPT, 519 510 $LOG 520 511 ); … … 774 765 $res_dir = GI::blastn_as_chunks($chunk, 775 766 $db, 767 $CTL_OPT{est}, 776 768 $the_void, 777 769 $safe_seq_id, 778 $CTL_OPT{_blastn}, 779 $CTL_OPT{eval_blastn}, 780 $CTL_OPT{split_hit}, 781 $CTL_OPT{cpus}, 782 $CTL_OPT{est}, 783 $CTL_OPT{_formater}, 770 \%CTL_OPT, 784 771 $self->{RANK}, 785 772 $LOG, … … 830 817 $blastn_keepers = GI::collect_blastn($chunk, 831 818 $res_dir, 832 $CTL_OPT{eval_blastn}, 833 $CTL_OPT{bit_blastn}, 834 $CTL_OPT{pcov_blastn}, 835 $CTL_OPT{pid_blastn}, 836 $CTL_OPT{split_hit}, 819 \%CTL_OPT, 837 820 $LOG 838 821 ); … … 890 873 $res_dir = GI::blastx_as_chunks($chunk, 891 874 $db, 875 $CTL_OPT{protein}, 892 876 $the_void, 893 877 $safe_seq_id, 894 $CTL_OPT{_blastx}, 895 $CTL_OPT{eval_blastx}, 896 $CTL_OPT{split_hit}, 897 $CTL_OPT{cpus}, 898 $CTL_OPT{protein}, 899 $CTL_OPT{_formater}, 878 \%CTL_OPT, 900 879 $self->{RANK}, 901 880 $LOG, 902 $LOG_FLAG, 903 $CTL_OPT{softmask} 881 $LOG_FLAG 904 882 ); 905 883 … … 945 923 $blastx_keepers = GI::collect_blastx($chunk, 946 924 $res_dir, 947 $CTL_OPT{eval_blastx}, 948 $CTL_OPT{bit_blastx}, 949 $CTL_OPT{pcov_blastx}, 950 $CTL_OPT{pid_blastx}, 951 $CTL_OPT{split_hit}, 925 \%CTL_OPT, 952 926 $LOG 953 927 ); … … 1005 979 $res_dir = GI::tblastx_as_chunks($chunk, 1006 980 $db, 981 $CTL_OPT{altest}, 1007 982 $the_void, 1008 983 $safe_seq_id, 1009 $CTL_OPT{_tblastx}, 1010 $CTL_OPT{eval_tblastx}, 1011 $CTL_OPT{split_hit}, 1012 $CTL_OPT{cpus}, 1013 $CTL_OPT{altest}, 1014 $CTL_OPT{_formater}, 984 \%CTL_OPT, 1015 985 $self->{RANK}, 1016 986 $LOG, 1017 $LOG_FLAG, 1018 $CTL_OPT{softmask} 987 $LOG_FLAG 1019 988 ); 1020 989 } … … 1059 1028 $tblastx_keepers = GI::collect_tblastx($chunk, 1060 1029 $res_dir, 1061 $CTL_OPT{eval_tblastx}, 1062 $CTL_OPT{bit_tblastx}, 1063 $CTL_OPT{pcov_tblastx}, 1064 $CTL_OPT{pid_tblastx}, 1065 $CTL_OPT{split_hit}, 1030 \%CTL_OPT, 1066 1031 $LOG 1067 1032 ); … … 1254 1219 ); 1255 1220 } 1221 1222 #shatter hits and flip strand of blastn where appropriate. 1223 #I shatter after processing the chunk divide to avoid weird 1224 #complications from flipping on only one side of a split HSP 1225 if($CTL_OPT{organism_type} eq 'prokaryotic'){ 1226 $blastn_keepers = PhatHit_utils::shatter_all_hits($blastn_keepers); 1227 $blastx_keepers = PhatHit_utils::shatter_all_hits($blastx_keepers); 1228 $tblastx_keepers = PhatHit_utils::shatter_all_hits($tblastx_keepers); 1229 #this checks the open reading frame and can flip the hit 1230 foreach my $phat_hit (@$blastn_keepers){ 1231 $phat_hit = PhatHit_utils::copy($phat_hit, 'both') 1232 if exonerate::splice_info::needs_to_be_revcomped($phat_hit); 1233 } 1234 } 1235 1256 1236 #-------------------------CODE 1257 1237 … … 1328 1308 #-make a multi-fasta of the seqs in the blastx_clusters 1329 1309 #-polish the blastx hits with exonerate 1330 1331 my $exoner_p_clust = GI::polish_exonerate($fasta, 1332 $blastx_clusters, 1333 $fasta_p_index, 1334 $the_void, 1335 5, 1336 'p', 1337 $CTL_OPT{exonerate}, 1338 $CTL_OPT{pcov_blastx}, 1339 $CTL_OPT{pid_blastx}, 1340 $CTL_OPT{ep_score_limit}, 1341 $CTL_OPT{ep_matrix}, 1342 $LOG 1343 ); 1310 my $exoner_p_clust =[]; 1311 if($CTL_OPT{organism_type} eq 'eukaryotic'){ 1312 $exoner_p_clust = GI::polish_exonerate($fasta, 1313 $blastx_clusters, 1314 $fasta_p_index, 1315 $the_void, 1316 5, 1317 'p', 1318 $CTL_OPT{exonerate}, 1319 $CTL_OPT{pcov_blastx}, 1320 $CTL_OPT{pid_blastx}, 1321 $CTL_OPT{ep_score_limit}, 1322 $CTL_OPT{ep_matrix}, 1323 $LOG 1324 ); 1325 } 1344 1326 1345 1327 #flatten clusters … … 1403 1385 my $tblastx_data = GI::flatten($tblastx_clusters); 1404 1386 1405 1406 1387 #-cluster the blastn hits 1407 1388 print STDERR "cleaning blastn...\n" unless $main::quiet; … … 1413 1394 1414 1395 #-polish blastn hits with exonerate 1415 my $exoner_e_clust = GI::polish_exonerate($fasta, 1416 $blastn_clusters, 1417 $fasta_t_index, 1418 $the_void, 1419 5, 1420 'e', 1421 $CTL_OPT{exonerate}, 1422 $CTL_OPT{pcov_blastn}, 1423 $CTL_OPT{pid_blastn}, 1424 $CTL_OPT{en_score_limit}, 1425 $CTL_OPT{en_matrix}, 1426 $LOG 1427 ); 1396 my $exoner_e_clust = []; 1397 if($CTL_OPT{organism_type} eq 'eukaryotic'){ 1398 $exoner_e_clust = GI::polish_exonerate($fasta, 1399 $blastn_clusters, 1400 $fasta_t_index, 1401 $the_void, 1402 5, 1403 'e', 1404 $CTL_OPT{exonerate}, 1405 $CTL_OPT{pcov_blastn}, 1406 $CTL_OPT{pid_blastn}, 1407 $CTL_OPT{en_score_limit}, 1408 $CTL_OPT{en_matrix}, 1409 $LOG 1410 ); 1411 } 1428 1412 1429 1413 #flatten clusters … … 1465 1449 tblastx_data 1466 1450 blastx_data 1451 blastn_data 1467 1452 exonerate_e_data 1468 1453 exonerate_p_data … … 1490 1475 my $tblastx_data = $VARS->{tblastx_data}; 1491 1476 my $blastx_data = $VARS->{blastx_data}; 1477 my $blastn_data = $VARS->{blastn_data}; 1492 1478 my $exonerate_e_data = $VARS->{exonerate_e_data}; 1493 1479 my $exonerate_p_data = $VARS->{exonerate_p_data}; … … 1503 1489 my $final_est = GI::combine($exonerate_e_data, 1504 1490 $est_gff_keepers 1505 ); 1491 ); 1492 1493 if($CTL_OPT{organism_type} eq 'prokaryotic'){ 1494 $final_est = GI::combine($blastn_data, 1495 $final_est 1496 ); 1497 } 1498 1506 1499 my $final_altest = GI::combine($tblastx_data, 1507 1500 $altest_gff_keepers lib/Widget/genemark.pm
r207 r242 58 58 #------------------------------------------------------------------------------- 59 59 sub parse { 60 my $report = shift; 61 my $params = shift; 62 my $q_file = shift; 63 64 open(my $IN, "< $report"); 65 my $line = <$IN>; 66 close($IN); 67 68 if($line =~ /eukariotyc|eukaryotic/i){ #spelled wrong in version 3.9 69 return parse_eukaryotic($report, $params, $q_file); 70 } 71 elsif($line =~ /prokaryotic/i){ 72 return parse_prokaryotic($report, $params, $q_file); 73 } 74 else{ 75 die "FATAL: Can not identify the version of GeneMark used for the report\n\n"; 76 } 77 } 78 79 sub parse_eukaryotic{ 60 80 my $report = shift; 61 81 my $params = shift; … … 89 109 $g{$id}[$i]{score} = ''; 90 110 $i++; 111 } 112 $fh->close(); 113 my $phat_hits = load_phat_hits($q_name, $q_seq, \%g, $params); 114 return $phat_hits; 115 } 116 117 sub parse_prokaryotic{ 118 my $report = shift; 119 my $params = shift; 120 my $q_file = shift; 121 122 my $iterator = new Iterator::Fasta($q_file); 123 my $fasta = $iterator->nextEntry(); 124 125 my $def = Fasta::getDef(\$fasta); 126 my $q_seq = Fasta::getSeqRef(\$fasta); 127 my $q_name = Fasta::def2SeqID($def); 128 129 my $fh = new FileHandle(); 130 $fh->open($report); 131 132 my %g; 133 while (my $line = <$fh>){ 134 chomp($line); 135 next if(! $line || $line !~ /^\s*\d+\s+[-+]\s+\d+/); 136 $line =~ s/^\s+//; 137 my @stuff = split(/\s+/, $line); 138 139 my ($id) = $stuff[0]; 140 die "FATAL: Logic error in Widget::genemark::parse_prokaryotic" 141 if(defined($g{$id})); 142 143 $g{$id}[0]{strand} = $stuff[1] eq '+' ? 1 : -1; 144 $g{$id}[0]{b} = $stuff[2]; 145 $g{$id}[0]{e} = $stuff[3]; 146 $g{$id}[0]{type} = 'Single'; 147 $g{$id}[0]{score} = ''; 91 148 } 92 149 $fh->close(); lib/maker/auto_annotator.pm
r235 r242 44 44 my $single_exon = shift; 45 45 my $single_length = shift; 46 47 #only ESTs from splice concious algorithms i.e. no blastn 48 my $clean_est = get_selected_types($est_hits,'est2genome', 'est_gff'); 49 my $clean_altest = $alt_est_hits; 50 if ($single_exon == 1) { 51 #do nothing 52 }else { 53 # don't use unpliced single exon ESTs-- may be genomic contamination 54 $clean_est = clean::purge_single_exon_hits($clean_est); 55 $clean_altest = clean::purge_single_exon_hits($clean_altest); 56 } 57 58 # throw out the exonerate est hits with weird splice sites 59 $clean_est = clean::throw_out_bad_splicers($clean_est, $seq); 46 my $organism_type = shift; 47 48 my $clean_est = []; 49 my $clean_altest = []; 50 #only ESTs from splice concious algorithms for euaryotes i.e. no blastn 51 if($organism_type eq 'eukaryotic'){ 52 $clean_est = get_selected_types($est_hits,'est2genome', 'est_gff'); 53 $clean_altest = $alt_est_hits; 54 if ($single_exon == 1) { 55 #do nothing 56 }else { 57 # don't use unpliced single exon ESTs-- may be genomic contamination 58 $clean_est = clean::purge_single_exon_hits($clean_est); 59 $clean_altest = clean::purge_single_exon_hits($clean_altest); 60 } 61 62 # throw out the exonerate est hits with weird splice sites 63 $clean_est = clean::throw_out_bad_splicers($clean_est, $seq); 64 } 65 #include blastn and don't filter for splicing 66 else{ 67 $clean_est = get_selected_types($est_hits,'blastn', 'est_gff', 'blastn'); 68 $clean_altest = $alt_est_hits; 69 } 60 70 61 71 … … 145 155 my @bx_data; 146 156 foreach my $c (@{$hint_clusters}){ 147 my $bx = prep_blastx_data($c, $c_id, $seq );157 my $bx = prep_blastx_data($c, $c_id, $seq, $organism_type); 148 158 push(@bx_data, @{$bx}) if defined $bx; 149 159 … … 342 352 my @c_keepers; 343 353 foreach my $c (@$clusters){ 344 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff' );354 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff', 'blastn'); 345 355 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 346 356 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); … … 379 389 my $c_id = shift; 380 390 my $seq = shift; 381 382 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff'); 391 my $org_type = shift || 'eukaryotic'; 392 393 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff', 'blastn'); 383 394 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 384 395 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); … … 393 404 394 405 # group of most informative alt splices 395 my $gomias = clean::purge_single_exon_hits($ests_in_cluster); 396 $gomias = clean::get_best_alt_splices($gomias, $seq, 10); 406 my $gomias = []; 407 408 if($org_type eq 'eukaryotic'){ 409 $gomias = clean::purge_single_exon_hits($ests_in_cluster); 410 $gomias = clean::get_best_alt_splices($gomias, $seq, 10); 411 } 412 else{ 413 $gomias = PhatHit_utils::make_flat_hits($ests_in_cluster, $seq); 414 } 397 415 398 416 my @data; … … 440 458 441 459 my $models_in_cluster = get_selected_types($c,'model_gff', 'maker'); 442 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff' );460 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff', 'blastn'); 443 461 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 444 462 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); … … 480 498 my $seq = shift; 481 499 482 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff' );500 my $ests_in_cluster = get_selected_types($c,'est2genome', 'est_gff', 'blastn'); 483 501 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 484 502 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); … … 519 537 my $seq = shift; 520 538 521 my $ests_in_cluster = get_selected_types($c, 'est2genome', 'est_gff' );539 my $ests_in_cluster = get_selected_types($c, 'est2genome', 'est_gff', 'blastn'); 522 540 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 523 541 … … 552 570 553 571 554 my $ests_in_cluster = get_selected_types($c, 'est2genome', 'est_gff' );572 my $ests_in_cluster = get_selected_types($c, 'est2genome', 'est_gff', 'blastn'); 555 573 my $ps_in_cluster = get_selected_types($c,'protein2genome'); 556 574 my $bx_in_cluster = get_selected_types($c,'blastx', 'protein_gff'); … … 608 626 $v_seq_ref, 609 627 $CTL_OPTIONS->{single_exon}, 610 $CTL_OPTIONS->{single_length} 628 $CTL_OPTIONS->{single_length}, 629 $CTL_OPTIONS->{organism_type} 611 630 ); 612 631 … … 881 900 foreach my $p (@{$CTL_OPTIONS->{_predictor}}){ 882 901 foreach my $g (@{$annotations->{$p}}){ 883 if($p ne 'est2genome' && $ g->{g_strand} == 1){902 if($p ne 'est2genome' && $p ne 'protein2genome' && $g->{g_strand} == 1){ 884 903 push(@$p_list, $g) if($g->{AED} < 1 || $p eq 'model_gff'); 885 904 } 886 elsif($p ne 'est2genome' && $ g->{g_strand} == -1) {905 elsif($p ne 'est2genome' && $p ne 'protein2genome' && $g->{g_strand} == -1) { 887 906 push(@$m_list, $g) if($g->{AED} < 1 || $p eq 'model_gff'); 888 907 } 889 908 elsif($g->{g_strand} == 1){ 890 push(@p_est2g, $g); #seperate est2genome genes909 push(@p_est2g, $g); #seperate est2genome and protein2genome genes 891 910 } 892 911 elsif($g->{g_strand} == -1){ 893 push(@m_est2g, $g); #seperate est2genome genes912 push(@m_est2g, $g); #seperate est2genome and protein2genome genes 894 913 } 895 914 else{ … … 1178 1197 my $predictor = shift; 1179 1198 my $predictions = shift; 1180 my $CTL_OPT IONS= shift;1199 my $CTL_OPT = shift; 1181 1200 1182 1201 my $q_id = Fasta::def2SeqID($def); … … 1187 1206 my $ests = $set->{ests}; 1188 1207 my $model = $set->{model}; 1208 my $gomiph = $set->{gomiph}; 1189 1209 1190 1210 #------gff passthrough … … 1221 1241 next if (! defined $mia); 1222 1242 my $transcript = pneu($ests, $mia, $seq); 1243 1244 next if !$transcript; 1245 1246 my $all_preds = get_overlapping_hits($transcript, $predictions); 1247 $set->{all_preds} = $all_preds; 1223 1248 1224 next if(! $transcript);1225 1226 my $all_preds = get_overlapping_hits($transcript, $predictions);1227 $set->{all_preds} = $all_preds;1228 1229 1249 push(@transcripts, [$transcript, $set, $mia]); 1230 1250 $i++; 1251 1252 next; 1253 } 1254 1255 #------protein2genome 1256 if ($predictor eq 'protein2genome') { 1257 next if($CTL_OPT->{organism_type} eq 'eukaryotic'); 1258 next if(! @$gomiph); 1259 1260 my $utr = []; 1261 push(@$utr, $mia) if($mia); 1262 my $miphs = PhatHit_utils::make_flat_hits($gomiph, $seq); 1263 1264 foreach my $miph (@$miphs){ 1265 my $transcript = pneu($utr, $miph, $seq); 1266 1267 next if(! $transcript); 1268 1269 my $all_preds = get_overlapping_hits($transcript, $predictions); 1270 $set->{all_preds} = $all_preds; 1271 1272 push(@transcripts, [$transcript, $set, $miph]); 1273 $i++; 1274 } 1275 1231 1276 next; 1232 1277 } … … 1236 1281 1237 1282 #------default hint based behavior 1238 my $gomiph = $set->{gomiph};1239 1283 my $blastx = get_selected_types($gomiph,'blastx', 'protein_gff'); 1240 1284 my $pol_p = get_selected_types($gomiph,'protein2genome'); … … 1248 1292 $set, 1249 1293 $predictor, 1250 $CTL_OPT IONS,1294 $CTL_OPT, 1251 1295 $LOG 1252 1296 ); … … 1380 1424 1381 1425 my $pol_p_hits = get_selected_types($evi->{gomiph}, 'protein2genome'); 1382 my $pol_e_hits = get_selected_types($evi->{ests}, 'est2genome', 'est_gff' );1426 my $pol_e_hits = get_selected_types($evi->{ests}, 'est2genome', 'est_gff', 'blastn'); 1383 1427 my $blastx_hits = get_selected_types($evi->{gomiph},'blastx', 'protein_gff'); 1384 1428 my $tblastx_hits = get_selected_types($evi->{alt_ests},'tblastx', 'altest_gff'); … … 1506 1550 my $the_void = shift; 1507 1551 my $CTL_OPTIONS = shift; 1552 1553 #fix weird sequence names 1554 $seq_id = Fasta::seqID2SafeID($seq_id); 1508 1555 1509 1556 #place evidence and p_bases in index for easy retrieval … … 1586 1633 } 1587 1634 elsif ($predictor eq 'abinit') { 1635 #now check for preexisting name 1588 1636 if ($c->[0]->name =~ /^maker-$seq_id|$seq_id-abinit/) { 1589 1637 $g_name = $c->[0]->name;
