Table of Contents

my_rename1.pl

#!/usr/bin/perl
#
# my_rename1.pl
#
# Usage: my_rename1.pl <tab-delimited table>
#
# Reads the table named by $ARGV[0]. For every line that starts with GCF or
# GCA it prints "<key>\t<name>" to STDOUT, where
#   <key>  is the last '/'-separated component of column 19, and
#   <name> is built from columns 7 and 8 (column 15, the asm_name, is
#          appended when the name consists of just two words) and then
#          cleaned up so that it can be used as a file name.
# Column numbers are 0-based.
#
use strict;
use warnings;

die "Usage: $0 <tab-delimited table>\n" unless defined $ARGV[0];

open my $in, '<', $ARGV[0] or die "Can't open file $ARGV[0]! ($!)";
while (my $line = <$in>) {
    chomp $line;
    next unless $line =~ /^(?:GCF|GCA)/;
    my @data = split /\t/, $line;

    # Short lines leave some columns undefined; treat those as empty strings.
    my ($organism, $strain, $asm_name, $path) =
        map { defined $_ ? $_ : '' } @data[7, 8, 15, 19];

    $strain =~ s/strain=//;    # infraspecific_name

    # \Q...\E makes the strain literal text: strain names may contain
    # regex metacharacters such as '(', '+' or '.'.
    $organism =~ s/ \Q$strain\E$//;    # remove redundant strain name

    my $name = $organism . ' ' . $strain;
    $name .= $asm_name if $name =~ /^\S+ \S+ $/;    # add asm_name if required
    $name =~ s/:/-/g;
    $name =~ s/\s+/_/g;
    $name =~ s/;.*$//;      # remove semicolon and following information
    $name =~ s{/}{_}g;      # every slash, not only the first one
    $name =~ s/_$//;

    # Key = last path component of column 19.
    my $key = (split m{/}, $path)[-1];
    $key = '' unless defined $key;

    print $key, "\t", $name, "\n";
}
close $in;

my_rename2.pl

#!/usr/bin/perl
#
# Copy a downloaded sequence file to a human-readable file name.
#
# $ARGV[0] : id2name file (tab-delimited: key, name)
#    GCF_000015065.1_ASM1506v1       Bt_str._Al_Hakam
#    GCF_000092165.1_ASM9216v1       Bt_BMB171
#    ...
# $ARGV[1] : fna file
#    GCF_000015065.1_ASM1506v1_genomic.fna
#
# The file name is split on '_': everything before the last piece is the
# lookup key, the first two pieces form the accession, and the text after the
# last '.' of the last piece is the extension. The copy is named
#    <name>_<accession>.<extension>
#
use strict;
use warnings;

die "Usage: $0 <id2name file> <fna file>\n" unless @ARGV >= 2;
my ($map_file, $fna) = @ARGV;

# Load the key -> name table.
my %key2name;
open my $in, '<', $map_file or die "Can't open file $map_file! ($!)";
while (my $line = <$in>) {
    chomp $line;
    my ($id, $name) = split /\t/, $line;
    next unless defined $id && defined $name;
    $key2name{$id} = $name;
}
close $in;

my @parts  = split /_/, $fna;
my $suffix = pop @parts;
die "Unexpected file name '$fna' (expected e.g. GCF_000015065.1_ASM1506v1_genomic.fna)\n"
    unless @parts >= 2;

my $accession = $parts[0] . '_' . $parts[1];
my $key       = join '_', @parts;

# Take the extension from the match itself rather than from a possibly
# stale $1.
my ($end) = $suffix =~ /^.+\.(.+)$/;
die "Can't find a file extension in '$fna'\n" unless defined $end;

# The target name is a concatenation and therefore always defined, so the
# lookup result itself has to be checked before copying.
my $name = $key2name{$key};
die "No name for '$key' in $map_file; $fna was not copied\n"
    unless defined $name && length $name;

my $file = $name . '_' . $accession . '.' . $end;
print "$fna ===> $file\n";

# List form: no shell is involved, so unusual characters in the names are
# passed to cp unchanged.
system('cp', $fna, $file) == 0
    or die "cp $fna $file failed (status $?)\n";

safe_download.sh

#!/usr/bin/bash

# Download one file from an assembly directory and verify it against the
# directory's md5checksums.txt. A verified file is moved to TARGET_DIR;
# a failed download is appended to failed_download.txt.
#
# argument (FtpPath_RefSeq) example
# ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/741/935/GCF_000741935.1_ASM74193v1
#
# How to run:
#   while read -r path; do bash ./THIS_SCRIPT "$path"; done < download_path.txt

TARGET_DIR=download

# Modify 'genomic.gbff.gz' as you want!
SUFFIX=genomic.gbff.gz

if [ -z "$1" ]; then
    echo "Usage: $0 <assembly directory URL>" >&2
    exit 1
fi

# <URL>/<last component>_<SUFFIX>; a trailing slash on the argument is ignored.
BASE_URL=${1%/}
ASSEMBLY=${BASE_URL##*/}
DOWNLOAD_PATH="${BASE_URL}/${ASSEMBLY}_${SUFFIX}"
FILE=${DOWNLOAD_PATH##*/}
MD5SUM_FILE="${BASE_URL}/md5checksums.txt"
CHECKSUM_FILE="$$checksum.txt"   # process ID keeps the temporary name unique

# Without this, 'mv FILE download' would rename the file to 'download'.
mkdir -p "${TARGET_DIR}" || exit 1

# -O writes to exactly this name, so a leftover file from an earlier run is
# replaced instead of wget saving the new copy under a '.1' suffix.
wget -O md5checksums.txt "${MD5SUM_FILE}"
# -F: the file name is a fixed string, not a regular expression.
grep -F -- "${FILE}" md5checksums.txt > "${CHECKSUM_FILE}"
wget -O "${FILE}" "${DOWNLOAD_PATH}"

if md5sum -c "${CHECKSUM_FILE}"; then
    echo "${FILE} download OK! Moving to ${TARGET_DIR}..."
    mv -- "${FILE}" "${TARGET_DIR}/"
else
    echo "${FILE} download FAIL!"
    echo "${DOWNLOAD_PATH}" >> failed_download.txt
fi

rm -f md5checksums.txt* "${CHECKSUM_FILE}"

gbkInfo.pl

#!/usr/bin/perl
# 
#    Written by Haeyoung Jeong
#    

use Bio::SeqIO;
use File::Basename;

# latest update: 2022-05-06

# Script name and last-modification time, both shown in the usage message.
# stat() is given $0 itself: the bare basename only resolves when the script
# is started from its own directory, and a failed stat would otherwise be
# reported as the epoch (1970).
$scriptname = basename($0);
$modtime = (stat($0))[9];
$timestamp = defined $modtime ? scalar localtime($modtime) : 'unknown';

# No arguments: print the usage message and stop.
if ($#ARGV == -1) {
    print STDERR "Usage : $scriptname <GenBank file> [-seq]\n";
    print STDERR "        script last modified at $timestamp\n";
    exit;
}

# Counters: sequences read, gene features, CDS features without a 'pseudo' tag.
($LOCUS_num, $num, $num2) = (0, 0, 0);

# Set $debug to a true value to run the additional analysis that reads
# the 'data.txt' file.
$debug = 0;
print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n"
    if $debug;

# Input: the GenBank file given as the first argument.
$seqIn = Bio::SeqIO->new(-format => 'genbank', -file => $ARGV[0]);

# The feature table is named after the input file (directory part removed).
$infoFile = basename($ARGV[0]) . '.txt';
print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n"
    if $ARGV[1] eq '-seq';

# FASTA writer on STDOUT.
$seqOut = Bio::SeqIO->new(-format => 'fasta', -fh => \*STDOUT);

# Can handle multiple sequence objects in a single GenBank file.
#
# For every sequence the loop
#   - stores the sequence object in %whole_seq (keyed by LOCUS),
#   - reports the organism of each 'source' feature on STDERR,
#   - fills $annot{$locus_tag} from 'gene' and 'CDS' features, and
#   - counts sequences ($LOCUS_num), gene features ($num) and CDS features
#     without a 'pseudo' tag ($num2).
while (my $seqObj = $seqIn->next_seq()) {
    my $LOCUS = $seqObj->display_id();
    my $VERSION = $seqObj->version();
    print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n";
    $LOCUS_num++;
    $whole_seq{$LOCUS} = $seqObj;
    @features = $seqObj->get_SeqFeatures();
    foreach my $feat ( @features ) {
        # types of primary_tag: source, gene, CDS, rRNA, tRNA
        my $primary_tag = $feat->primary_tag;

        if ($primary_tag eq 'source') {
            print STDERR "[STDERR] Organism: ", $feat->get_tag_values('organism'), "\n"
                if $feat->has_tag('organism');
        }

# For each gene feature, check if it is pseudo or not.
# Extract function info if possible.
# Values will be stored in %annot hash.

        if ($primary_tag eq 'gene') {
            $num++;
# $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727)
            my $locus_tag = first_tag_value($feat, 'locus_tag');
            $geneFeaturePresence{$locus_tag} = "";
# $feat->start is always smaller than $feat->end.
            record_location($locus_tag, $LOCUS, $feat);
            for my $tag (qw(function old_locus_tag)) {
                $annot{$locus_tag}->{$tag} = first_tag_value($feat, $tag) if $feat->has_tag($tag);
            }
            $seqref{$locus_tag} = $feat->seq;
            if ($feat->has_tag('pseudo')) {
                $annot{$locus_tag}->{'isPseudo'} = 'pseudo';
                push @pseudo, $locus_tag;
            }
        }

# For each CDS feature, product/gene/EC_number will be extracted.
# There can be multiple EC numbers in one CDS feature.
#

#        if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') {
        if ($primary_tag eq 'CDS') {
            $num2++ unless $feat->has_tag('pseudo');
            my $locus_tag = first_tag_value($feat, 'locus_tag');

# This is for gene feature-less GenBank file produced by Prokka.
# No gene feature has recorded the location (or the feature sequence that
# the debug analysis reads from %seqref), so take both from the CDS.
#
            unless (exists $geneFeaturePresence{$locus_tag}) {
                record_location($locus_tag, $LOCUS, $feat);
                $seqref{$locus_tag} = $feat->seq;
            }

# If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
# process the first one only. Remaining CDS feature(s) has the same information.
#
            next if exists $seen{$locus_tag};
            $seen{$locus_tag} = '';

            $annot{$locus_tag}->{'type'} = $primary_tag;
            for my $tag (qw(product gene protein_id)) {
                $annot{$locus_tag}->{$tag} = first_tag_value($feat, $tag) if $feat->has_tag($tag);
            }

# If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
#
            if ($feat->has_tag('EC_number')) {
                my @tag = $feat->get_tag_values('EC_number');
                $annot{$locus_tag}->{EC_number} = join "; ", @tag;
            }

# Extracting gene sequences if a translation is present.
# If you want to process specified genes only, then use $locus_tag variable.
#
            if ($feat->has_tag('translation')) {
                my $NTsequence = $feat->seq->seq();
                my $AAsequence = first_tag_value($feat, 'translation');
                my $product = $annot{$locus_tag}->{'product'};
                $annot{$locus_tag}->{'NTseq'} = $NTsequence;
                $annot{$locus_tag}->{'AAseq'} = $AAsequence;
# You can add any other information to 'desc'.
# Uncomment the following five lines if you want print seq to STDOUT and
# specific -seq to be ether $NTsequence or $AAsequence.
#                my $seqObj = Bio::Seq->new(-display_id => $locus_tag,
#                                           -desc       => $product,
#                                           -seq        => $NTsequence
#                                           );
#                $seqOut->write_seq($seqObj);
            }
        }
    }
}

# Return the first value of tag $tag on feature $feat. The eval traps any
# exception raised while reading the tag; undef is returned in that case.
sub first_tag_value {
    my ($feat, $tag) = @_;
    my $value = eval { ($feat->get_tag_values($tag))[0] };
    return $value;
}

# Store sequence ID, strand ('+' when the feature strand is 1, '-' otherwise),
# start and end of $feat in $annot{$locus_tag}.
sub record_location {
    my ($locus_tag, $LOCUS, $feat) = @_;
    $annot{$locus_tag}->{'locus'}  = $LOCUS;
    $annot{$locus_tag}->{'strand'} = $feat->strand == 1 ? '+' : '-';
    $annot{$locus_tag}->{'start'}  = $feat->start;
    $annot{$locus_tag}->{'end'}    = $feat->end;
    return;
}

# Summary on STDERR, then one tab-delimited row per locus_tag in $infoFile,
# ordered by sequence ID and, within a sequence, by start position.
$argv0 = basename($ARGV[0]);
print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n";
print STDERR "[STDERR] $argv0 has $num gene features\n";
print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n";
print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n";

# Now print the entire information!
# sort required!

@final = ();
# Three-argument open: the file name can never be taken for a mode.
open my $info_fh, '>', $infoFile or die "Can't open $infoFile for writing! ($!)\n";
print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n";

my @header = ('#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand',
              'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence', 'AA_sequence');
print {$info_fh} join("\t", @header), "\n";

# Sequences are appended to each row only when '-seq' was given.
my $with_seq = defined $ARGV[1] && $ARGV[1] eq '-seq';
my @columns  = qw(old_locus_tag isPseudo start end strand gene product protein_id EC_number function);

# Each row travels with its own sort keys, so the keys never have to be
# recovered by splitting the finished line again.
my @keyed;
for my $locus_tag (sort keys %annot) {
    my $rec    = $annot{$locus_tag};
    my @fields = ($argv0, $rec->{'locus'}, $rec->{'type'}, $locus_tag, @{$rec}{@columns});
    push @fields, @{$rec}{qw(NTseq AAseq)} if $with_seq;

    # Missing values become empty columns.
    my $temp = join("\t", map { defined $_ ? $_ : '' } @fields) . "\n";
    push @final, $temp;
    push @keyed, [ $temp,
                   defined $rec->{'locus'} ? $rec->{'locus'} : '',
                   $rec->{'start'} ? $rec->{'start'} : 0 ];
}

@sorted_final = map  { $_->[0] }
                sort {
                       $a->[1] cmp $b->[1] # First sort by locus
                               ||
                       $a->[2] <=> $b->[2] # Second sort by start position (numeric)
                     }
                @keyed;

print {$info_fh} @sorted_final;
# Buffered write errors (e.g. a full disk) only show up when the file is closed.
close $info_fh or die "Can't close $infoFile! ($!)\n";
   
if ($debug) {
# process using <data.txt> file
# data.txt example (tab-delimited files; three columns)
# seqID	locus_tag	SNP_position
# NC_009698	CLC_0737	758733
# NC_009698	CLC_0847	867070
# Sorting data.txt using the third column as a key before running this script is desirable
#
# For every row the position of the SNP inside the feature, its codon and the
# translated amino acid are printed to STDOUT. Rows whose locus_tag is 'NA' or
# whose feature is marked as pseudo get 'NA' in the feature-specific columns.
    open my $data_fh, '<', 'data.txt' or die "Can't open data.txt file for reading! ($!)\n";
    print join("\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)',
                     'AA_position (gene)', 'position_in_codon',
                     'codon', 'AA', 'nucleotide (genome)'), "\n";
    while (my $line = <$data_fh>) {
        next if $line =~ /^#/;
        next unless $line =~ /\S/;      # skip blank lines
        chomp $line;

        my ($LOCUS, $locus_tag, $SNP) = split /\t/, $line;
        unless (defined $SNP) {
            print STDERR "[STDERR] Skipping malformed data.txt line: $line\n";
            next;
        }

        # An 'NA' row has no entry in %annot, so it must bypass the sequence-ID
        # consistency check; otherwise it would always abort before its own
        # branch below is reached.
        my $no_locus_tag = $locus_tag eq 'NA';
        my $rec = $no_locus_tag ? {} : ($annot{$locus_tag} || {});
        if (!$no_locus_tag && $LOCUS ne $rec->{'locus'}) {
            print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n";
            exit 1;
        }

        my $genome = $whole_seq{$LOCUS};
        unless (defined $genome) {
            print STDERR "[FATAL ERROR] Unknown sequence ID '$LOCUS' in data.txt! Aborting....\n";
            exit 1;
        }

        my ($start, $end, $strand) = @{$rec}{qw(start end strand)};
        $strand = '' unless defined $strand;

        if (exists $rec->{'isPseudo'} || $no_locus_tag) {
            print join("\t", $LOCUS, $locus_tag, $strand, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA',
                             $genome->subseq($SNP, $SNP)), "\n";
        } else {
            # zero-base position, i.e., offset, counted from the feature's own 5' end
            my $NA_position_in_feature = $strand eq '+' ? $SNP - $start : $end - $SNP;
            my $AA_position_in_feature = int($NA_position_in_feature / 3) + 1;
            my $position_in_codon = ($NA_position_in_feature + 1) % 3;
            $position_in_codon = 3 if $position_in_codon == 0;

            # Codon containing the SNP, cut from the feature sequence stored in %seqref.
            my $codon = $seqref{$locus_tag}->trunc($AA_position_in_feature * 3 - 2, $AA_position_in_feature * 3);
            my $AA = $codon->translate();
            print join("\t", $LOCUS, $locus_tag, $strand, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature,
                             $position_in_codon, $codon->seq(), $AA->seq(),
                             $genome->subseq($SNP, $SNP)), "\n";
        }
    }
    close $data_fh;
}