root/bin/maker_functional_fasta

Revision 270, 3.0 kB (checked in by bmoore, 2 months ago)

Modifications to some scripts by Barry

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6 #-----------------------------------------------------------------------------
7 #----------------------------------- MAIN ------------------------------------
8 #-----------------------------------------------------------------------------
9
10 my $usage = "
11
12 Synopsis:
13
14 maker_functional_fasta uniprot-sprot.fasta uniprot-sprot.blast.out \
15                        maker.transcripts.fasta maker.proteins.fasta ....
16
17 Description:
18
19 This script will take a uniprot-sprot fasta file, a wu-blast -mformat
20 2 blast output file from a blast of maker proteins against
21 uniprot-sprot proteins and add functional annotations to the maker
22 fasta header changing it from this:
23
24 >PYU1_T015316 transcript offset:0 AED:0.0476397966594044 \
25 QI:0|-1|0|1|-1|1|1|0|255
26
27 to this:
28
29 >PYU1_T015316 transcript name:\"Serine-aspartate repeat-containing protein I \
30 (Staphylococcus saprophyticus)\" offset:0 AED:0.0476397966594044             \
31 QI:0|-1|0|1|-1|1|1|0|255
32
33 ";
34
35
36 my ($help);
37 my $opt_success = GetOptions('help'    => \$help,
38                             );
39
40 die $usage if $help || ! $opt_success;
41
42 my ($uniprot_file, $blast_file, @fasta_files) = @ARGV;
43 die $usage unless $uniprot_file && $blast_file && @fasta_files;
44
45 my $blast = parse_blast($uniprot_file, $blast_file);
46
47 for my $file (@fasta_files) {
48
49 open (my $IN,  '<', $file) or die "Can't open $file for reading\n$!\n";
50
51 LINE:
52 while (<$IN>) {
53
54     if ($_ !~ /^>/) {
55         print $_;
56         next LINE;
57     }
58
59     my ($id, $mol_type, @attrbs) = split /\s+/, $_;
60     $id =~ s/^>//;
61
62     my $func = $blast->{$id} || 'Name:"Protein of unknown function"';
63
64     my $header = ">$id $mol_type ";
65     $header .= "$func " if $func;
66     $header .= join " ", @attrbs if @attrbs;
67
68     print "$header\n";
69
70     }
71 }
72 #-----------------------------------------------------------------------------
73 #-------------------------------- SUBROUTINES --------------------------------
74 #-----------------------------------------------------------------------------
75
76 sub parse_blast{
77
78         my ($fasta_file, $blast_file) = @_;
79
80         open(my $IN, '<', $fasta_file) or
81           die "Can't open $fasta_file for reading\n$!\n";
82
83         my %fasta;
84         while (<$IN>) {
85
86                 next unless /^>/;
87
88                 my ($id, $desc, $org, $name);
89                 #>sp|P18560|1101L_ASFB7 Protein MGF 110-1L OS=African swine fever virus (strain Badajoz 1971 Vero-adapted) GN=BA71V-008 PE=3 SV=1
90                 if (/>(\S+)\s+(.*?)\s+OS=(.*?)\s+(GN=(.*?)\s+)?PE=/) {
91                         $id   = $1;
92                         $desc = $2;
93                         $org  = $3;
94                         $name = $5 || '';
95                 }
96                 else {
97                         warn "Can't parse details from FASTA header: $_\n";
98                 }
99
100                 my $func = '';
101                 $func .= "Similar to $name: " if $name;
102                 if ($func =~ /Similar to/) {
103                         $func .= "$desc "  if $desc;
104                 }
105                 else {
106                         $func .= "Similar to $desc "  if $desc;
107                 }
108                 $func .= "($org)"  if $org;
109                 $func  = ('Name:"' . $func . '"') if $func;
110
111                 $fasta{$id} = $func;
112
113                 print '';
114         }
115         close $IN;
116
117         open($IN, '<', $blast_file) or
118           die "Can't open $blast_file for reading\n$!\n";
119
120         my (%blast, %seen);
121         while (<$IN>) {
122
123                 my ($qid, $sid, $e_val) = split;
124
125                 next if $seen{$qid};
126
127                 $seen{$qid}++;
128
129                 $blast{$qid} = $fasta{$sid} || '';
130         }
131         return \%blast;
132 }
Note: See TracBrowser for help on using the browser.