#!/usr/bin/perl
#
# Written by Haeyoung Jeong
#
use Bio::SeqIO;
use File::Basename;
# latest update: 2022-05-06
$scriptname = basename($0);
$modtime = (stat($scriptname))[9];
$timestamp = localtime($modtime);
if ($#ARGV == -1) {
print STDERR "Usage : $scriptname <GenBank file> [-seq]\n";
print STDERR " script last modified at $timestamp\n";
exit;
}
# debug mode is for additional analysis using the 'data.txt' file
$debug = 0;
if ($debug) {
print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n";
}
$seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
$infoFile = basename($ARGV[0]) . '.txt';
if ($ARGV[1] eq '-seq') {
print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n";
}
$seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');
$LOCUS_num = $num = $num2 = 0;
# Can handle multiple sequence objects in a single GenBank file
while (my $seqObj = $seqIn->next_seq()) {
my $LOCUS = $seqObj->display_id();
my $VERSION = $seqObj->version();
print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n";
$LOCUS_num++;
$whole_seq{$LOCUS} = $seqObj;
@features = $seqObj->get_SeqFeatures();
foreach my $feat ( @features ) {
# types of primary_tag: source, gene, CDS, rRNA, tRNA
if ($feat->primary_tag eq 'source') {
print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
}
# For each gene feature, check if it is pseudo or not.
# Extract function info if possible.
# Values will be stored in %annot hash.
if ($feat->primary_tag eq 'gene') {
$num++;
# $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727)
$locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
$geneFeaturePresence{$locus_tag} = "";
$annot{$locus_tag}->{'locus'} = $LOCUS;
$annot{$locus_tag}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
$annot{$locus_tag}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
$seqref{$locus_tag} = $feat->seq;
if ($feat->has_tag(pseudo)) {
$annot{$locus_tag}->{'isPseudo'} = 'pseudo';
push @pseudo, $locus_tag;
}
# $feat->start is always smaller than $feat->end.
if ($feat->strand == '1') {
$annot{$locus_tag}->{'strand'} = '+';
} else {
$annot{$locus_tag}->{'strand'} = '-';
}
$annot{$locus_tag}->{'start'} = $feat->start;
$annot{$locus_tag}->{'end'} = $feat->end;
}
# For each CDS feature, product/gene/EC_number will be extracted.
# There can be multiple EC numbers in one CDS feature.
#
# if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') {
if ($feat->primary_tag eq 'CDS') {
$num2++ unless $feat->has_tag(pseudo);
my $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
# This is for gene feature-less GenBank file produced by Prokka.
# If possible, use subroutine.
#
unless (exists $geneFeaturePresence{$locus_tag}) {
$annot{$locus_tag}->{'locus'} = $LOCUS;
if ($feat->strand == '1') {
$annot{$locus_tag}->{'strand'} = '+';
} else {
$annot{$locus_tag}->{'strand'} = '-';
}
$annot{$locus_tag}->{'start'} = $feat->start;
$annot{$locus_tag}->{'end'} = $feat->end;
}
# If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
# process the first one only. Remaining CDS feature(s) has the same information.
#
next if exists $seen{$locus_tag};
$seen{$locus_tag} = '';
$annot{$locus_tag}->{'type'} = $feat->primary_tag;
$annot{$locus_tag}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
$annot{$locus_tag}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
$annot{$locus_tag}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);
# If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
#
if ($feat->has_tag(EC_number)) {
my @tag = $feat->get_tag_values(EC_number);
$annot{$locus_tag}->{EC_number} = join "; ", @tag;
}
# Extracting gene sequences if not pseudo.
# If you want to process specified genes only, then use $locus_tag variable.
# Sequence features will not be printed if $debug is nonzero.
#
if ($feat->has_tag(translation)) {
my $NTsequence = $feat->seq->seq();
my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
my $product = $annot{$locus_tag}->{'product'};
$annot{$locus_tag}->{'NTseq'} = $NTsequence;
$annot{$locus_tag}->{'AAseq'} = $AAsequence;
# You can add any other information to 'desc'.
# Uncomment the following five lines if you want print seq to STDOUT and
# specific -seq to be ether $NTsequence or $AAsequence.
# my $seqObj = Bio::Seq->new(-display_id => $locus_tag,
# -desc => $product,
# -seq => $NTsequence
# );
# $seqOut->write_seq($seqObj);
}
}
}
}
$argv0 = basename($ARGV[0]);
print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n";
print STDERR "[STDERR] $argv0 has $num gene features\n";
print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n";
print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n";
# Now print the entire information!
# sort required!
@final = ();
open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n";
print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n";
print INFO join "\t", '#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand',
'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence' , 'AA_sequence' . "\n";
for my $locus_tag (sort keys %annot) {
my $temp = join "\t", $argv0, $annot{$locus_tag}->{'locus'}, $annot{$locus_tag}->{'type'},
$locus_tag, $annot{$locus_tag}->{'old_locus_tag'},
$annot{$locus_tag}->{'isPseudo'},
$annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'},
$annot{$locus_tag}->{'gene'}, $annot{$locus_tag}->{'product'},
$annot{$locus_tag}->{'protein_id'},
$annot{$locus_tag}->{'EC_number'}, $annot{$locus_tag}->{'function'};
if ($ARGV[1] eq '-seq') {
$temp .= "\t" . $annot{$locus_tag}->{'NTseq'} . "\t" . $annot{$locus_tag}->{'AAseq'};
}
$temp .= "\n";
push @final, $temp;
}
@sorted_final = map { $_->[0] }
sort {
$a->[1] cmp $b->[1] # First sort by locus
||
$a->[2] <=> $b->[2] # Second sort by start position (numeric)
}
map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start
print INFO @sorted_final;
if ($debug) {
# process using <data.txt> file
# data.txt example (tab-delimited files; three columns)
# seqID locus_tag SNP_position
# NC_009698 CLC_0737 758733
# NC_009698 CLC_0847 867070
# Sorting data.txt using the third column as a key before running this script is desirable
open DATA, "data.txt" or die "Can't open data.txt file for reading!\n";
print join "\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)',
'AA_position (gene)', 'position_in_codon',
'codon', 'AA', 'nucleotide (genome)' . "\n";
while (<DATA>) {
next if /^#/;
chomp;
my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_;
if ($LOCUS ne $annot{$locus_tag}->{'locus'}) {
print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n";
exit;
}
($start, $end, $strand) = ($annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'});
if ($strand eq '+') {
$NA_position_in_feature = $SNP - $annot{$locus_tag}->{'start'}; # zero-base position, i.e., offset
} else {
$NA_position_in_feature = $annot{$locus_tag}->{'end'} - $SNP;
}
if (exists $annot{$locus_tag}->{'isPseudo'} || $locus_tag eq 'NA') {
print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA',
$whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
} else {
$AA_position_in_feature = int($NA_position_in_feature / 3) + 1;
$position_in_codon = ($NA_position_in_feature + 1) % 3;
$position_in_codon = 3 if $position_in_codon == 0;
$codon = $seqref{$locus_tag}->trunc($AA_position_in_feature * 3 - 2, $AA_position_in_feature * 3);
$AA = $codon->translate();
print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature,
$position_in_codon, $codon->seq(), $AA->seq(),
$whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
}
}
}