| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
|
|---|
| 4 |
use Bio::DB::GFF; |
|---|
| 5 |
use Bio::Tools::GFF; |
|---|
| 6 |
use Bio::FeatureIO::gff; |
|---|
| 7 |
|
|---|
| 8 |
my $dir = $ARGV[0]; |
|---|
| 9 |
if (not $dir) { |
|---|
| 10 |
die "add_utr_gff.pl <directory>\n |
|---|
| 11 |
where directory is the location of your gff files.\n" |
|---|
| 12 |
} |
|---|
| 13 |
|
|---|
| 14 |
opendir DIR, "$dir" or die $!; |
|---|
| 15 |
my @files = grep/.+\.gff$/, readdir DIR; |
|---|
| 16 |
close DIR; |
|---|
| 17 |
|
|---|
| 18 |
foreach my $file (@files) { |
|---|
| 19 |
chomp($file); |
|---|
| 20 |
my $outfile = "$dir/$file"; |
|---|
| 21 |
$outfile =~ s/\.gff/\.wutr\.gff3/; |
|---|
| 22 |
open NEW, ">$outfile" or die $!; |
|---|
| 23 |
print NEW "##gff-version 3\n"; |
|---|
| 24 |
my %exon = (); |
|---|
| 25 |
my %parent = (); |
|---|
| 26 |
my %id2name = (); |
|---|
| 27 |
my %cds = (); |
|---|
| 28 |
my %strand = (); |
|---|
| 29 |
|
|---|
| 30 |
my $parser = new Bio::Tools::GFF(-gff_version => 3, |
|---|
| 31 |
-file => "$dir/$file"); |
|---|
| 32 |
|
|---|
| 33 |
while (my $feature = $parser->next_feature()) { |
|---|
| 34 |
my $tag = $feature->primary_tag; |
|---|
| 35 |
my $start = $feature->start; |
|---|
| 36 |
my $end = $feature->end; |
|---|
| 37 |
if ($feature->source_tag eq "maker") { |
|---|
| 38 |
if ($tag eq "mRNA") { |
|---|
| 39 |
my @id = $feature->get_tag_values('ID'); |
|---|
| 40 |
my $id = $id[0]; |
|---|
| 41 |
my @name = $feature->get_tag_values('Name'); |
|---|
| 42 |
my $name = $name[0]; |
|---|
| 43 |
$id2name{$id} = $name; |
|---|
| 44 |
$strand{$name} = $feature->strand; |
|---|
| 45 |
}elsif ($tag eq "CDS") { |
|---|
| 46 |
my @parents = $feature->get_tag_values('Parent'); |
|---|
| 47 |
foreach my $p (@parents) { |
|---|
| 48 |
my $pid = $p; |
|---|
| 49 |
my $pname = $id2name{$pid}; |
|---|
| 50 |
$strand{$pname} = $feature->strand; |
|---|
| 51 |
push @{$cds{$pname}}, ($start, $end); |
|---|
| 52 |
} |
|---|
| 53 |
}elsif ($tag eq "exon") { |
|---|
| 54 |
my @parents = $feature->get_tag_values('Parent'); |
|---|
| 55 |
foreach my $p (@parents) { |
|---|
| 56 |
my $pid = $p; |
|---|
| 57 |
my $pname = $id2name{$pid}; |
|---|
| 58 |
push @{$exons{$pname}}, ($start, $end); |
|---|
| 59 |
} |
|---|
| 60 |
} |
|---|
| 61 |
} |
|---|
| 62 |
} |
|---|
| 63 |
my %utr = (); |
|---|
| 64 |
F1:foreach my $mrna (keys %exons) { |
|---|
| 65 |
my @exons = sort {$a <=> $b} @{$exons{$mrna}}; |
|---|
| 66 |
|
|---|
| 67 |
next F1 unless $cds{$mrna}; |
|---|
| 68 |
my @cds = sort {$a <=> $b} @{$cds{$mrna}}; |
|---|
| 69 |
|
|---|
| 70 |
$cds_start = $cds[0]; |
|---|
| 71 |
$cds_end = $cds[$#cds]; |
|---|
| 72 |
$num_exons = scalar(@exons)/2; |
|---|
| 73 |
|
|---|
| 74 |
my $i = 0; |
|---|
| 75 |
my $j = 1; |
|---|
| 76 |
foreach (1..$num_exons) { |
|---|
| 77 |
my $exon_start = $exons[$i]; |
|---|
| 78 |
my $exon_end = $exons[$j]; |
|---|
| 79 |
if (not $exon_start) { |
|---|
| 80 |
print "no exon start\n"; |
|---|
| 81 |
} |
|---|
| 82 |
if ($exon_start < $cds_start) { |
|---|
| 83 |
if ($exon_end < $cds_start) { |
|---|
| 84 |
push @{$utr{$mrna}}, ["lower_utr",$exon_start, $exon_end]; |
|---|
| 85 |
}else { |
|---|
| 86 |
push @{$utr{$mrna}}, ["lower_utr",$exon_start, $cds_start - 1]; |
|---|
| 87 |
} |
|---|
| 88 |
}elsif ($exon_end > $cds_end) { |
|---|
| 89 |
if ($exon_start > $cds_end) { |
|---|
| 90 |
push @{$utr{$mrna}}, ["upper_utr",$exon_start, $exon_end]; |
|---|
| 91 |
}else { |
|---|
| 92 |
push @{$utr{$mrna}}, ["upper_utr",$cds_end+1, $exon_end]; |
|---|
| 93 |
} |
|---|
| 94 |
} |
|---|
| 95 |
$i += 2; |
|---|
| 96 |
$j += 2; |
|---|
| 97 |
} |
|---|
| 98 |
} |
|---|
| 99 |
open GFF, "<$dir/$file" or die $!; |
|---|
| 100 |
my $line = <GFF>; |
|---|
| 101 |
W1:while ($line = <GFF>) { |
|---|
| 102 |
chomp($line); |
|---|
| 103 |
print NEW $line,"\n"; |
|---|
| 104 |
my @features = split(/\t/, $line); |
|---|
| 105 |
my $seqid = $features[0]; |
|---|
| 106 |
my $source = $features[1]; |
|---|
| 107 |
next W1 unless $source; |
|---|
| 108 |
if ($source eq "maker") { |
|---|
| 109 |
my $tag = $features[2]; |
|---|
| 110 |
my $start = $features[3]; |
|---|
| 111 |
my $end = $features[4]; |
|---|
| 112 |
my $strand = $features[6]; |
|---|
| 113 |
if ($tag eq "mRNA") { |
|---|
| 114 |
%annotations = split(/[=;]/, $features[8]); |
|---|
| 115 |
my $name = (split(/\s+/, $annotations{'Name'}))[0]; |
|---|
| 116 |
$strand = $strand{$name}; |
|---|
| 117 |
if ($utr{$name}) { |
|---|
| 118 |
my $utr_id = 1; |
|---|
| 119 |
foreach $utr (@{$utr{$name}}) { |
|---|
| 120 |
($type, $start, $end) = @{$utr}; |
|---|
| 121 |
my $utr_start = $start; |
|---|
| 122 |
my $utr_end = $end; |
|---|
| 123 |
if ($strand == 1 && $type eq "lower_utr") { |
|---|
| 124 |
$utrtag = "five_prime_UTR"; |
|---|
| 125 |
$gstrand = "+"; |
|---|
| 126 |
}elsif ($strand == 1 && $type eq "upper_utr") { |
|---|
| 127 |
$utrtag = "three_prime_UTR"; |
|---|
| 128 |
$gstrand = "+"; |
|---|
| 129 |
}elsif ($strand == -1 && $type eq "lower_utr") { |
|---|
| 130 |
$utrtag = "three_prime_UTR"; |
|---|
| 131 |
$gstrand = "-"; |
|---|
| 132 |
}else { |
|---|
| 133 |
$utrtag = "five_prime_UTR"; |
|---|
| 134 |
$gstrand = "-"; |
|---|
| 135 |
} |
|---|
| 136 |
my $id = "ID=$name\:utr\:$utr_id"; |
|---|
| 137 |
my $parent = "Parent=$name"; |
|---|
| 138 |
$annot = join(";", $id, $parent); |
|---|
| 139 |
print NEW join("\t",$seqid,"maker",$utrtag,$utr_start,$utr_end,".", $gstrand,".",$annot),"\n"; |
|---|
| 140 |
$utr_id ++; |
|---|
| 141 |
} |
|---|
| 142 |
} |
|---|
| 143 |
} |
|---|
| 144 |
} |
|---|
| 145 |
} |
|---|
| 146 |
} |
|---|