Changeset 268

Show
Ignore:
Timestamp:
09/23/09 11:23:55 (2 months ago)
Author:
cholt
Message:

added AED_threshold to MAKER

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • lib/GI.pm

    r266 r268  
    22372237      $CTL_OPT{'single_length'} = 250; 
    22382238      $CTL_OPT{'min_protein'} = 0; 
     2239      $CTL_OPT{'AED_threshod'} = 1; 
    22392240      $CTL_OPT{'keep_preds'} = 0; 
    22402241      $CTL_OPT{'map_forward'} = 0; 
     
    26612662      "min_protein will be reset to 0\n\n"; 
    26622663      $CTL_OPT{min_protein} = 0; 
     2664   } 
     2665   if ($CTL_OPT{AED_threshold} < 0 || $CTL_OPT{AED_threshold} > 1) { 
     2666      warn "WARNING: \'AED_threshold\' must be set to a value betweeb 0 and 1.\n". 
     2667      "AED_threshold will be reset to 1\n\n"; 
     2668      $CTL_OPT{AED_threshold} = 1; 
    26632669   } 
    26642670   if ($CTL_OPT{retry} < 0) { 
     
    28312837   print OUT "min_contig:$O{min_contig} #all contigs from the input genome file below this size will be skipped\n"; 
    28322838   print OUT "min_protein:$O{min_protein} #all gene annotations must produce a protein of at least this many amino acids in length\n" if(!$ev); 
     2839   print OUT "AED_threshold:$O{AED_threshold} #Maximum Annotation Edit Distance allowed for annotations (bound by 0 and 1)\n" if(!$ev); 
    28332840   print OUT "softmask:$O{softmask} #use soft-masked rather than hard-masked seg filtering for wublast\n"; 
    28342841   print OUT "split_hit:$O{split_hit} #length for the splitting of hits (expected max intron size for evidence alignments)\n"; 
  • lib/clean.pm

    r151 r268  
    168168} 
    169169#------------------------------------------------------------------------ 
     170#this function throws out hits that result from multiple overlapping HSPs 
     171#from the same region (caused by low complexity repeats)  
     172sub complexity_filter { 
     173    my $candidates = shift; 
     174    my $seq        = shift; 
     175     
     176    my @keepers; 
     177     
     178    foreach my $f (@$candidates){ 
     179        my $qOff = $f->start('query'); 
     180        my $hOff = $f->start('hit'); 
     181         
     182        my @q_cov; 
     183        my @h_cov; 
     184        foreach my $hsp ($f->hsps){ 
     185            my $qS = $hsp->start('query') - $qOff; 
     186            my $qE = $hsp->end('query') - $qOff; 
     187             
     188            my $hS = $hsp->start('hit') - $hOff; 
     189            my $hE = $hsp->end('hit') - $hOff; 
     190             
     191            @q_cov[$qS..$qE] = map {(defined $_) ? $_ + 1 : 1} (@q_cov[$qS..$qE]); 
     192                @h_cov[$hS..$hE] = map {(defined $_) ? $_ + 1 : 1} (@h_cov[$hS..$hE]); 
     193        } 
     194         
     195        #average coverage for qeury 
     196        my $count = 0; 
     197        my $total = 0; 
     198        foreach my $c (@q_cov){ 
     199            next if(! defined $c); 
     200            $count++; 
     201            $total += $c;                
     202        } 
     203         
     204        my $q_ave = ($count) ? $total/$count : 0; 
     205         
     206        #average coverage for hit 
     207        $count = 0; 
     208        $total = 0; 
     209        foreach my $c (@h_cov){ 
     210            next if(! defined $c); 
     211            $count++; 
     212            $total += $c;                
     213        } 
     214 
     215        my $h_ave = ($count) ? $total/$count : 0; 
     216 
     217        #if average basepair hits twice on both query and subject, skip 
     218        push(@keepers, $f) unless(($q_ave + $h_ave)/2 > 2); 
     219    } 
     220     
     221    return \@keepers; 
     222} 
     223#------------------------------------------------------------------------ 
    1702241; 
    171225 
  • lib/cluster.pm

    r194 r268  
    4646                print STDERR "total clusters:$num_c now processing $counter\n" 
    4747                        unless $main::quiet; 
    48                 my $alts = clean::get_best_alt_splices($c, $seq, 10); 
     48                my $clean = clean::complexity_filter($c, $seq); 
     49                my $alts = clean::get_best_alt_splices($clean, $seq, 10); 
    4950                my $i = 0; 
    5051                my @new_cluster; 
     
    5455                        $i++; 
    5556                } 
     57                next if(! @new_cluster); 
    5658                push(@{$clean_clusters[$counter]}, @new_cluster); 
    5759                $counter++; 
    58         } 
    59                  
     60        }                
    6061 
    6162        return \@clean_clusters; 
    62  
    6363} 
    6464#------------------------------------------------------------------------ 
  • lib/maker/auto_annotator.pm

    r243 r268  
    16901690         my $t_struct = load_transcript_struct($f, $g_name, $i, $seq, $evidence, $p_base, $the_void, $CTL_OPTIONS); 
    16911691 
    1692          push(@t_structs, $t_struct) unless ($t_struct->{p_length} < $CTL_OPTIONS->{min_protein}); 
     1692         push(@t_structs, $t_struct) unless ($t_struct->{p_length} < $CTL_OPTIONS->{min_protein} || 
     1693                                             $t_struct->{AED} > $CTL_OPTIONS->{AED_threshold} 
     1694                                             ); 
    16931695 
    16941696         $AED = $t_struct->{AED} if($t_struct->{AED} < $AED); 
  • lib/runlog.pm

    r243 r268  
    3838                  'pred_flank', 
    3939                  'min_protein', 
     40                  'AED_threshold', 
    4041                  'single_exon', 
    4142                  'single_length',