| 1 |
#! /usr/bin/perl -w |
|---|
| 2 |
use strict; |
|---|
| 3 |
|
|---|
| 4 |
my $file = shift; |
|---|
| 5 |
|
|---|
| 6 |
#parse file |
|---|
| 7 |
open (IN, "< $file"); |
|---|
| 8 |
my @genes; |
|---|
| 9 |
my @mRNAs; |
|---|
| 10 |
my @stuff; |
|---|
| 11 |
|
|---|
| 12 |
while (defined(my $line = <IN>)){ |
|---|
| 13 |
#skip comments and fasta |
|---|
| 14 |
last if($line =~ /^\#\#FASTA/); |
|---|
| 15 |
next if($line =~ /^\#/); |
|---|
| 16 |
|
|---|
| 17 |
#add contig lines to header |
|---|
| 18 |
my $f = parse($line); |
|---|
| 19 |
next if($f->{type} eq 'contig'); |
|---|
| 20 |
|
|---|
| 21 |
push(@genes, $f) if($f->{type} eq 'gene'); |
|---|
| 22 |
push(@mRNAs, $f) if($f->{type} eq 'mRNA'); |
|---|
| 23 |
push(@stuff, $f) if($f->{type} eq 'CDS'); |
|---|
| 24 |
push(@stuff, $f) if($f->{type} eq 'five_prime_UTR'); |
|---|
| 25 |
push(@stuff, $f) if($f->{type} eq 'three_prime_UTR'); |
|---|
| 26 |
push(@stuff, $f) if($f->{type} eq 'start_codon'); |
|---|
| 27 |
push(@stuff, $f) if($f->{type} eq 'stop_codon'); |
|---|
| 28 |
} |
|---|
| 29 |
close(IN); |
|---|
| 30 |
|
|---|
| 31 |
#organize transcript features |
|---|
| 32 |
my %stuffinx; |
|---|
| 33 |
foreach my $f (@stuff){ |
|---|
| 34 |
my $parent = $f->{Parent}; |
|---|
| 35 |
push (@{$stuffinx{$parent}}, $f); |
|---|
| 36 |
} |
|---|
| 37 |
|
|---|
| 38 |
#organize mRNAs |
|---|
| 39 |
my %mRNAinx; |
|---|
| 40 |
foreach my $f (@mRNAs){ |
|---|
| 41 |
my $parent = $f->{Parent}; |
|---|
| 42 |
push (@{$mRNAinx{$parent}}, $f); |
|---|
| 43 |
} |
|---|
| 44 |
|
|---|
| 45 |
foreach my $g (@genes){ |
|---|
| 46 |
my $gene_id = $g->{ID}; |
|---|
| 47 |
my $g_mRNAs = $mRNAinx{$g->{ID}}; |
|---|
| 48 |
|
|---|
| 49 |
foreach my $t (@$g_mRNAs){ |
|---|
| 50 |
my $transcript_id = $t->{ID}; |
|---|
| 51 |
my $t_stuff = $stuffinx{$t->{ID}}; |
|---|
| 52 |
|
|---|
| 53 |
my @UTRs = grep {$_->{type} =~ /UTR$/} @$t_stuff; |
|---|
| 54 |
my @CDSs = grep {$_->{type} eq 'CDS'} @$t_stuff; |
|---|
| 55 |
my @stops = grep {$_->{type} eq 'stop_codon'} @$t_stuff; |
|---|
| 56 |
my @starts = grep {$_->{type} eq 'start_codon'} @$t_stuff; |
|---|
| 57 |
|
|---|
| 58 |
@CDSs = fix_CDS(\@CDSs,\@stops); |
|---|
| 59 |
|
|---|
| 60 |
print_it($gene_id, $transcript_id, @$t_stuff); |
|---|
| 61 |
} |
|---|
| 62 |
} |
|---|
| 63 |
|
|---|
| 64 |
#------------------------SUBS------------------------ |
|---|
| 65 |
sub fix_CDS { |
|---|
| 66 |
my $CDSs = shift; |
|---|
| 67 |
my $stops = shift; |
|---|
| 68 |
|
|---|
| 69 |
my @keepers; |
|---|
| 70 |
foreach my $CDS (@$CDSs){ |
|---|
| 71 |
my $cB = $CDS->{start}; |
|---|
| 72 |
my $cE = $CDS->{end}; |
|---|
| 73 |
|
|---|
| 74 |
foreach my $stop (@$stops){ |
|---|
| 75 |
my $sB = $stop->{start}; |
|---|
| 76 |
my $sE = $stop->{end}; |
|---|
| 77 |
|
|---|
| 78 |
if($cB == $sB){ |
|---|
| 79 |
$CDS->{start} += 3; |
|---|
| 80 |
} |
|---|
| 81 |
elsif($cE == $sE){ |
|---|
| 82 |
$CDS->{end} -= 3; |
|---|
| 83 |
} |
|---|
| 84 |
} |
|---|
| 85 |
|
|---|
| 86 |
push(@keepers, $CDS); |
|---|
| 87 |
} |
|---|
| 88 |
|
|---|
| 89 |
return @keepers |
|---|
| 90 |
} |
|---|
| 91 |
|
|---|
| 92 |
sub print_it { |
|---|
| 93 |
my $gene_id = shift; |
|---|
| 94 |
my $transcript_id = shift; |
|---|
| 95 |
|
|---|
| 96 |
($transcript_id) = $transcript_id =~ /^([^\s\t\n]+)/; |
|---|
| 97 |
while(my $f = shift @_){ |
|---|
| 98 |
$f->{type} = '5UTR' if($f->{type} eq 'five_prime_UTR'); |
|---|
| 99 |
$f->{type} = '3UTR' if($f->{type} eq 'three_prime_UTR'); |
|---|
| 100 |
|
|---|
| 101 |
print $f->{seqid} . "\t"; |
|---|
| 102 |
print $f->{source} . "\t"; |
|---|
| 103 |
print $f->{type} . "\t"; |
|---|
| 104 |
print $f->{start} . "\t"; |
|---|
| 105 |
print $f->{end} . "\t"; |
|---|
| 106 |
print $f->{score} . "\t"; |
|---|
| 107 |
print $f->{strand} . "\t"; |
|---|
| 108 |
print $f->{phase} . "\t"; |
|---|
| 109 |
print "gene_id \"".$gene_id."\"\;"; |
|---|
| 110 |
print " transcript_id \"".$transcript_id."\"\;"; |
|---|
| 111 |
print "\n"; |
|---|
| 112 |
} |
|---|
| 113 |
} |
|---|
| 114 |
|
|---|
| 115 |
sub parse { |
|---|
| 116 |
my $l = shift; |
|---|
| 117 |
|
|---|
| 118 |
chomp $l; |
|---|
| 119 |
my @F = split("\t", $l); |
|---|
| 120 |
my @att = attrib($F[8]); |
|---|
| 121 |
|
|---|
| 122 |
my $f = {seqid => $F[0], |
|---|
| 123 |
source => $F[1], |
|---|
| 124 |
type => $F[2], |
|---|
| 125 |
start => $F[3], |
|---|
| 126 |
end => $F[4], |
|---|
| 127 |
score => $F[5], |
|---|
| 128 |
strand => $F[6], |
|---|
| 129 |
phase => $F[7], |
|---|
| 130 |
ID => $att[0], |
|---|
| 131 |
Name => $att[1], |
|---|
| 132 |
Parent => $att[2]}; |
|---|
| 133 |
|
|---|
| 134 |
return $f; |
|---|
| 135 |
} |
|---|
| 136 |
|
|---|
| 137 |
sub attrib { |
|---|
| 138 |
my $att = shift; |
|---|
| 139 |
|
|---|
| 140 |
my ($ID) = $att =~ /ID\=([^\;\n]+)/; |
|---|
| 141 |
my ($name) = $att =~ /Name\=([^\;\n]+)/; |
|---|
| 142 |
my ($parent) = $att =~ /Parent\=([^\;\n]+)/; |
|---|
| 143 |
|
|---|
| 144 |
$ID = $name if(! defined $ID); |
|---|
| 145 |
$name = $ID if(! defined $name); |
|---|
| 146 |
|
|---|
| 147 |
return ($ID, $name, $parent); |
|---|
| 148 |
} |
|---|