root/bin/map_gff_ids

Revision 199, 4.3 kB (checked in by bmoore, 8 months ago)

updates to ID mapping scripts and reformatted README for printer and screen

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6 #-----------------------------------------------------------------------------
7 #----------------------------------- MAIN ------------------------------------
8 #-----------------------------------------------------------------------------
9 my $usage = "
10
11 Synopsis:
12
13 map_gff_ids genome.all.id.map genome.all.gff
14
15 Description:
16
17 This script takes a id map file and changes the name of the
18 appropriate attributes in a gff file.  The map file is tab delimited
19 file with two columns: old_name and new_name.The script requires that
20 old_name (or the part of old_name before the first colon) be equal to
21 an attribute value before it will map it.
22
23 ";
24
25
26 my ($help);
27 my $opt_success = GetOptions('help'    => \$help,
28                               );
29
30 my ($map_file, $gff_file) = @ARGV;
31 die $usage unless $map_file && $gff_file;
32
33 # Read map file and create a map hash
34 open (my $MAP, '<', $map_file) or die "Can't open $map_file for reading\n$!\n";
35 my %map;
36 map {my ($old, $new) = split;$map{$old} = $new} (<$MAP>);
37 close $MAP;
38
39 # Open $gff_file for reading, unlink it to avoid clobbering it and then open
40 # the same file for writing.  This allows in-place editing.
41 open (my $IN, '<', $gff_file) or die "Can't open $gff_file for reading\n$!\n";
42 unlink($gff_file);
43 open(my $OUT, '>', $gff_file) or die "Can't open $gff_file for writing\n$!\n";
44
45 # Set the order in which attributes will appear in the attribute text.
46 # Attributes not listed here will be sorted alphabetically.
47 my %order = (ID     => 1,
48              Parent => 2,
49              Name   => 3,
50              Alias  => 4,
51             );
52
53 my $fasta_start;
54 LINE:
55 while (my $line = <$IN>) {
56         $fasta_start++ if $line =~ /^\#\#FASTA/;
57         my ($seq, $source, $type, $start, $end, $score,
58             $strand, $phase, $attrb_text) = split /\t/, $line;
59         if (! $fasta_start    && # If we're not in the fasta section...
60             $line !~ /^\s*\#/ && # and we're not a comment line...
61             $start =~ /^\d+$/ && # and we have a valid start column...
62             $end =~ /^\d+$/   && # and a valid end column...
63             $attrb_text          # and attribute text
64            ) {
65                 chomp $attrb_text;
66                 my %attrb = parse_attributes($attrb_text);
67
68                 # Create an alias with the old_name for gene and mRNA features.
69                 my $alias;
70                 if ($type eq 'mRNA' || $type eq 'gene') {
71                         $alias = $attrb{ID}[0] if $type =~ /^(gene|mRNA)$/;
72                 }
73
74                 # Collect all IDs that should be mapped for this feature
75                 # (the values of Parent and ID tags)
76                 my @ids;
77                 push @ids, @{$attrb{Parent}} if $attrb{Parent};
78                 push @ids, @{$attrb{ID}}     if $attrb{ID};
79
80                 # Keep only the IDs that are in the mapping file.
81                 my @map_ids;
82                 map {push @map_ids, $_ if $map{$_}}  @ids;
83
84                 # If there is nothing to map, print the line and go to
85                 # the next feature.
86                 if (! @map_ids) {
87                         print $OUT $line;
88                         next LINE;
89                 };
90
91                 # Map old_name to new_name for each tag value...
92                 for my $tag (keys %attrb) {
93                         for my $value (@{$attrb{$tag}}) {
94                                 for my $id (@map_ids) {
95                                         # Only if the value (or the portion preceding
96                                         # the first colon) is equal to the map key.
97                                         next unless ($value eq $id || $value =~ /$id:/);
98                                         $value =~ s/$id/$map{$id}/;
99                                 }
100                         }
101                 }
102                 # Now that we're done mapping, add the alias to the attributes
103                 # if we have one.
104                 $attrb{Alias} = [$alias] if $alias;
105
106                 # Make sure we have all attribute keys added to sort order to
107                 # avoid undef in <=>
108                 map {$order{$_} ||= 99} keys %attrb;
109
110                 # Create new attribute text.
111                 my $new_attrb_text;
112                 for my $tag (sort {$order{$a} <=> $order{$b} || $a cmp $b} keys %attrb) {
113                         my $value_text = join ',', @{$attrb{$tag}};
114                         $new_attrb_text .= "$tag=$value_text;"
115                 }
116                 $new_attrb_text .= "\n";
117
118                 # Create a new $line.
119                 $line = join "\t", ($seq,
120                                  $source,
121                                  $type,
122                                  $start,
123                                  $end,
124                                  $score,
125                                  $strand,
126                                  $phase,
127                                  $new_attrb_text,
128                                  );
129         }
130         print $OUT $line;
131 }
132 #-----------------------------------------------------------------------------
133 #-------------------------- Subroutines --------------------------------------
134 #-----------------------------------------------------------------------------
135 sub parse_attributes {
136         my $attrb_text = shift;
137
138         my %attrb;
139         my @attrb_list = split /\s*;\s*/, $attrb_text;
140         for my $attrb_pair (@attrb_list) {
141                 my ($tag, $value_text) = split /\s*=\s*/, $attrb_pair;
142                 my @values = split /\s*,\s*/, $value_text;
143                 $attrb{$tag} = \@values;
144         }
145         return %attrb;
146 }
Note: See TracBrowser for help on using the browser.