root/bin/maker_map_ids

Revision 270, 4.1 kB (checked in by bmoore, 2 months ago)

Modifications to some scripts by Barry

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6 #-----------------------------------------------------------------------------
7 #----------------------------------- MAIN ------------------------------------
8 #-----------------------------------------------------------------------------
9 my $usage = "
10
11 Synopsis:
12
13 maker_map_ids --prefix PYU1_ --justify 6 genome.all.gff > genome.all.id.map
14
15 Description:
16
17 This script wil take a GFF3 file and create a mapping file of gene and
18 transcript IDs to more numerically incremented human friendly unique
19 IDs.
20
21 Options:
22
23   --prefix      The prefix to use for all IDs
24   --justify     The unique integer portion of the ID will be right justified
25                 with '0's to this length.
26   --sort_order  A tab delimited file containing two columns: contig_id
27                 and sort_order.  Genes and transcripts will be named
28                 in consecutive order along the contigs, and the
29                 contigs will be sorted in the order specified by the
30                 file.  If sort_order is not given and there are
31                 ##sequence-region directives at the top of the gff
32                 file then naming will be ordered by decreasing contig
33                 length.
34
35 Assumptions:
36
37   The script limits ID mapping to gene and mRNA features, and in
38   includes 'G' and 'T' abbreviations within the IDs for genes and
39   mRNAs respectively.  These behaviors are hard coded.
40
41 ";
42
43
44 my ($help, $prefix, $justify, $sort_order);
45 my $opt_success = GetOptions('help'         => \$help,
46                              'prefix=s'     => \$prefix,
47                              'justify=s'    => \$justify,
48                              'sort_order=s' => \$sort_order,
49                               );
50
51 die $usage if $help || ! $opt_success  || ! $prefix;
52 $justify ||= 6;
53
54 my $gff_file = shift;
55 die $usage unless $gff_file;
56
57 my $sort_map = parse_sort_order($sort_order, $gff_file);
58
59 open (my $IN, '<', $gff_file) or die "Can't open $gff_file for reading\n$!\n";
60
61 my %counter;
62 my %ids;
63 # Build a hash of ids that need to be mapped;
64 while (<$IN>) {
65
66         last if /^\#\#FASTA/;
67         next if /^\s*\#/;
68
69         my ($seq, $source, $type, $start, $end, $score, $strand,
70             $phase, $attrb_text) = split /\t/, $_;
71
72         # Here we explicity limit the mapping to genes and mRNAs
73         if ($type =~ /^(gene|mRNA)$/) {
74                 my ($id) = $attrb_text =~ /ID=(.*?);/;
75                 $ids{$seq}{$start}{$id} = $type;
76         }
77 }
78 # Create the new ID map.  Sort contigs by $sort_map in outer loop and
79 # features by $start in second loop.
80 for my $contig_id (sort {sort_contigs($a, $b, $sort_map)} keys %ids) {
81         my $contig = $ids{$contig_id};
82         for my $start (sort {$b <=> $a} keys %{$contig}) {
83                 for my $id (keys %{$contig->{$start}}) {
84                         my $type  = $contig->{$start}{$id};
85                         #Here we create gene and mRNA abbreviation.
86                         my $abrv  = $type eq 'gene' ? 'G' : 'T';
87                         my $count = sprintf '%06s', ++$counter{$type};
88                         my $new_id    = 'PYU1_' . $abrv . $count;
89                         print "$id\t$new_id\n";
90                         print '';
91                 }
92         }
93 }
94 #-----------------------------------------------------------------------------
95 #-------------------------------- SUBROUTINES --------------------------------
96 #-----------------------------------------------------------------------------
97 sub parse_sort_order {
98
99         my ($sort_order_file, $gff_file) = @_;
100
101         my %sort_order;
102
103         if ($sort_order_file) {
104                 open(my $IN, '<', $sort_order_file) or die "Can't open $sort_order_file\n$!\n";
105                 my %sort_order = (<$IN>);
106                 close $IN;
107         }
108         elsif (`grep -P '\#\#sequence-region' $gff_file`) {
109                 open(my $IN, '<', $gff_file) or die "Can't open $gff_file\n$!\n";
110                 my $flag;
111                 while (my $line = <$IN>) {
112                         if ($line =~ /\#\#sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/) {
113                                 $flag++;
114                                 my ($contig_id, $start, $end) = ($1, $2, $3);
115                                 $sort_order{$contig_id} = ($end - $start);
116                         }
117                         last if $flag;
118                 }
119                 close $IN;
120         }
121         else {
122                 open(my $IN, '<', $gff_file) or die "Can't open $gff_file\n$!\n";
123
124                 while (<$IN>) {
125                         my @fields = split /\t/, $_;
126                         next unless @fields == 9;
127                         my $seq = $fields[0];
128                         $sort_order{$seq}++;
129                 }
130         }
131         return \%sort_order;
132 }
133 #-----------------------------------------------------------------------------
134 sub sort_contigs {
135         my ($a, $b, $sort_map) = @_;
136         return ($sort_order->{$a} <=> $sort_map->{$b}) if defined $sort_order;
137         return ($sort_order->{$b} <=> $sort_map->{$a});
138 }
Note: See TracBrowser for help on using the browser.