gbkInfo script

GenBank flat file을 열어서 locus tag, product 등 기본 정보를 tab-delimited text file로 출력하고, CDS 정보는 표준출력으로 내보내는 스크립트이다. tr 옵션을 주면 아미노산 서열로 번역이 되어 출력된다. 용도에 따라 여러 가지 형태로 변형하여 사용했었지만, 아래에 소개한 gbkInfo_working.pl은 가장 포괄적인 기능을 갖도록 작성하였다.
#!/usr/bin/perl
# 
# Usage - gbkInfo_working.pl GenBank_file [tr]
#

use Bio::SeqIO;

$seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
$infoFile = $ARGV[0] . '.txt';
$seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');

# Can handle multiple sequence objects in a single GenBank file
while (my $seqObj = $seqIn->next_seq()) {
    @features = $seqObj->get_SeqFeatures();
    foreach my $feat ( @features ) {
        # types of primary_tag: source, gene, CDS, rRNA, tRNA
        if ($feat->primary_tag eq 'source') {
            print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
        }

# For each gene feature, check if it is pseudo or not.
# Extract function info if possible.
# Values will be stored in %annot hash.

        if ($feat->primary_tag eq 'gene') {
            $num++;
            $locus = eval {($feat->get_tag_values(locus_tag))[0]};
            $annot{$locus}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
            $annot{$locus}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
            if ($feat->has_tag(pseudo)) {
                $annot{$locus}->{'isPseudo'} = 'pseudo';
                push @pseudo, $locus;
            }
            if ($feat->strand == '1') {
                $annot{$locus}->{'strand'} = '+';
                $annot{$locus}->{'stop'} = $feat->end;
            } else {
                $annot{$locus}->{'strand'} = '-';
                $annot{$locus}->{'stop'} = $feat->start;
            }
# start < end
            $annot{$locus}->{'start'} = $feat->start;
            $annot{$locus}->{'end'} = $feat->end;
        }

# For each CDS feature, product/gene/EC_number will be extracted.
# There can be multiple EC numbers in one CDS feature.
#

        if ($feat->primary_tag eq 'CDS') {
            $num2++ unless $feat->has_tag(pseudo);
            my $locus = eval {($feat->get_tag_values(locus_tag))[0]};

# If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
# process the first one only. Remaining CDS feature(s) has the same information.
#
            next if exists $seen{$locus};
            $seen{$locus} = '';

            $annot{$locus}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
            $annot{$locus}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
            $annot{$locus}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);

# If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
#
            if ($feat->has_tag(EC_number)) {
                my @tag = $feat->get_tag_values(EC_number);
                $annot{$locus}->{EC_number} = join "; ", @tag;
            }

# Extracting gene sequences (not pseudo). 
# If you want to process specified genes only, then use $locus variable.
#
            if ($feat->has_tag(translation)) {
                my $NTsequence = $feat->seq->seq();
                my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
                my $sequence = $NTsequence;
                $sequence = $AAsequence if $ARGV[1] eq 'tr';
                my $product = $annot{$locus}->{'product'};
# You can add any other information to 'desc'.
                my $seqObj = Bio::Seq->new(-display_id => $locus,
                                           -desc       => $product,
                                           -seq        => $sequence
                                           );
                $seqOut->write_seq($seqObj);
            }
        }
    }
}

print STDERR "[STDERR] $ARGV[0] is $num gene features\n";
print STDERR "[STDERR] $ARGV[0] is $num2 active CDS features (not marked as pseudo)\n";

# Now print the entire information!
#


open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n";
print STDERR "[STDERR] Feature information is being written to $infoFile (pre-existing file will be overwritten!)\n";

print INFO join "\t", '#source', 'feature', 'locus tag', 'old locus tag', 'isPseudo?', 'start', 'end', 'straind',
           'stop', 'gene', 'product', 'protein id', 'EC number', 'function' . "\n";
for my $locus (keys %annot) {
    print INFO join "\t", $ARGV[0], 'protein-coding gene', $locus, $annot{$locus}->{'old_locus_tag'}, 
               $annot{$locus}->{'isPseudo'},
               $annot{$locus}->{'start'}, $annot{$locus}->{'end'}, $annot{$locus}->{'strand'},
               $annot{$locus}->{'stop'}, $annot{$locus}->{'gene'}, $annot{$locus}->{'product'},
               $annot{$locus}->{'protein_id'}, 
               $annot{$locus}->{'EC_number'}, $annot{$locus}->{'function'}
               . "\n";
}