bioinfo:gbkinfo_script
This is an old revision of the document!
gbkInfo script
GenBank flat file을 열어서 locus tag, product 등 기본 정보를 tab-delimited text file로 출력하고, CDS 정보는 표준출력으로 내보내는 스크립트이다. tr 옵션을 주면 아미노산 서열로 번역이 되어 출력된다. 용도에 따라 여러 가지 형태로 변형하여 사용했었지만, 아래에 소개한 gbkInfo_working.pl은 가장 포괄적인 기능을 갖도록 작성하였다.
#!/usr/bin/perl
#
# Usage - gbkInfo_working.pl GenBank_file [tr]
#
use Bio::SeqIO;
$seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
$infoFile = $ARGV[0] . '.txt';
$seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');
# Can handle multiple sequence objects in a single GenBank file
while (my $seqObj = $seqIn->next_seq()) {
@features = $seqObj->get_SeqFeatures();
foreach my $feat ( @features ) {
# types of primary_tag: source, gene, CDS, rRNA, tRNA
if ($feat->primary_tag eq 'source') {
print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
}
# For each gene feature, check if it is pseudo or not.
# Extract function info if possible.
# Values will be stored in %annot hash.
if ($feat->primary_tag eq 'gene') {
$num++;
$locus = eval {($feat->get_tag_values(locus_tag))[0]};
$annot{$locus}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
$annot{$locus}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
if ($feat->has_tag(pseudo)) {
$annot{$locus}->{'isPseudo'} = 'pseudo';
push @pseudo, $locus;
}
if ($feat->strand == '1') {
$annot{$locus}->{'strand'} = '+';
$annot{$locus}->{'stop'} = $feat->end;
} else {
$annot{$locus}->{'strand'} = '-';
$annot{$locus}->{'stop'} = $feat->start;
}
# start < end
$annot{$locus}->{'start'} = $feat->start;
$annot{$locus}->{'end'} = $feat->end;
}
# For each CDS feature, product/gene/EC_number will be extracted.
# There can be multiple EC numbers in one CDS feature.
#
if ($feat->primary_tag eq 'CDS') {
$num2++ unless $feat->has_tag(pseudo);
my $locus = eval {($feat->get_tag_values(locus_tag))[0]};
# If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
# process the first one only. Remaining CDS feature(s) has the same information.
#
next if exists $seen{$locus};
$seen{$locus} = '';
$annot{$locus}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
$annot{$locus}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
$annot{$locus}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);
# If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
#
if ($feat->has_tag(EC_number)) {
my @tag = $feat->get_tag_values(EC_number);
$annot{$locus}->{EC_number} = join "; ", @tag;
}
# Extracting gene sequences (not pseudo).
# If you want to process specified genes only, then use $locus variable.
#
if ($feat->has_tag(translation)) {
my $NTsequence = $feat->seq->seq();
my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
my $sequence = $NTsequence;
$sequence = $AAsequence if $ARGV[1] eq 'tr';
my $product = $annot{$locus}->{'product'};
# You can add any other information to 'desc'.
my $seqObj = Bio::Seq->new(-display_id => $locus,
-desc => $product,
-seq => $sequence
);
$seqOut->write_seq($seqObj);
}
}
}
}
print STDERR "[STDERR] $ARGV[0] is $num gene features\n";
print STDERR "[STDERR] $ARGV[0] is $num2 active CDS features (not marked as pseudo)\n";
# Now print the entire information!
#
open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n";
print STDERR "[STDERR] Feature information is being written to $infoFile (pre-existing file will be overwritten!)\n";
print INFO join "\t", '#source', 'feature', 'locus tag', 'old locus tag', 'isPseudo?', 'start', 'end', 'straind',
'stop', 'gene', 'product', 'protein id', 'EC number', 'function' . "\n";
for my $locus (keys %annot) {
print INFO join "\t", $ARGV[0], 'protein-coding gene', $locus, $annot{$locus}->{'old_locus_tag'},
$annot{$locus}->{'isPseudo'},
$annot{$locus}->{'start'}, $annot{$locus}->{'end'}, $annot{$locus}->{'strand'},
$annot{$locus}->{'stop'}, $annot{$locus}->{'gene'}, $annot{$locus}->{'product'},
$annot{$locus}->{'protein_id'},
$annot{$locus}->{'EC_number'}, $annot{$locus}->{'function'}
. "\n";
}
bioinfo/gbkinfo_script.1559695826.txt.gz · Last modified: (external edit)
