====== my_rename1.pl ======
  #!/usr/bin/perl
  #
  # Build an "assembly key -> readable name" table from the tab-delimited
  # summary file given as $ARGV[0] and print it to STDOUT, one
  # "<key>\t<name>" line per accepted row.
  #
  # Columns used (0-based):
  #   [7]  organism name
  #   [8]  infraspecific_name (a leading "strain=" label is removed)
  #   [15] asm_name (used only when there is no infraspecific name)
  #   [19] '/'-separated path; its last component becomes the key
  # NOTE(review): this layout looks like NCBI's assembly_summary.txt -- confirm.
  die "Usage: $0 <assembly summary file>\n" unless defined $ARGV[0];

  # Three-argument open: the file name is never interpreted as a mode or a pipe.
  open my $summary_fh, '<', $ARGV[0] or die "Can't open file $ARGV[0]: $!";
  while (my $line = <$summary_fh>) {
      chomp $line;
      next unless $line =~ /^(?:GCF|GCA)/;   # only rows starting with GCF or GCA
      my @data = split /\t/, $line;
      $data[8] =~ s/strain=//; # infraspecific_name
      my $strain = $data[8];
      # The strain is literal text, not a pattern: \Q...\E keeps characters
      # such as '(', ')', '.' or '+' from being read as regex syntax.
      $data[7] =~ s/ \Q$strain\E$//; # remove redundant strain name
      my $name = $data[7] . ' ' . $data[8];
      $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required
      $name =~ s/:/-/g;
      $name =~ s/\s+/_/g;
      $name =~ s/;.*$//;     # remove semicolon and following information
      $name =~ s/\//_/g;     # every slash, so the name never contains a path separator
      $name =~ s/_$//;
      my @temp = split /\//, $data[19];
      my $key = pop @temp;
      $key2name{$key} = $name;   # filled for parity with the original; not read in this script
      print $key, "\t", $name, "\n";
  }
  close $summary_fh;

====== my_rename2.pl ======

  #!/usr/bin/perl
  #
  # $ARGV[0] : id2name file
  #    GCF_000015065.1_ASM1506v1       Bt_str._Al_Hakam
  #    GCF_000092165.1_ASM9216v1       Bt_BMB171
  #    ...
  # $ARGV[1] : fna file 
  #    GCF_000015065.1_ASM1506v1_genomic.fna
  #
  
  # Copy the sequence file named in $ARGV[1] to
  # "<name>_<accession>.<extension>", where <name> is looked up in the
  # two-column, tab-delimited id2name file given as $ARGV[0].
  #
  # The lookup key is the file name with its last '_'-separated piece removed
  # (e.g. "..._genomic.fna" -> "GCF_000015065.1_ASM1506v1"); the accession is
  # the first two '_'-separated pieces; the extension is whatever follows the
  # last '.' of the removed piece.
  # NOTE: the key is derived from the argument exactly as given, so a leading
  # directory component would become part of the key.
  my ($map_file, $fna_file) = @ARGV;
  die "Usage: $0 <id2name file> <fna file>\n"
      unless defined $map_file && defined $fna_file;

  my %key2name;
  open my $map_fh, '<', $map_file or die "Can't open file $map_file: $!";
  while (my $line = <$map_fh>) {
      chomp $line;
      my @data = split /\t/, $line;
      next unless @data >= 2;            # blank or one-column line: nothing to map
      $key2name{$data[0]} = $data[1];
  }
  close $map_fh;

  my @temp = split /_/, $fna_file;
  my $suffix = pop @temp;
  my $accession = $temp[0] . '_' . $temp[1];
  # Capture in list context so a failed match yields undef instead of a stale $1.
  my ($end) = $suffix =~ /^.+\.(.+)$/;
  my $key = join '_', @temp;

  #print $key, " ", $key2name{$key}, " ", $end, "\n";

  # The target name is a concatenation and therefore always defined; what has
  # to be checked is whether the lookup and the extension match succeeded.
  unless (defined $key2name{$key}) {
      warn "No name found for key '$key' ($fna_file); nothing copied\n";
      exit 1;
  }
  unless (defined $end) {
      warn "Can't determine the file extension of $fna_file; nothing copied\n";
      exit 1;
  }

  my $file = $key2name{$key} . '_' . $accession . '.' . $end;
  print "$fna_file ===> $file\n";

  # List form bypasses the shell, so names containing spaces, parentheses or
  # other shell metacharacters are passed to cp unchanged.
  system('cp', '--', $fna_file, $file) == 0
      or die "cp $fna_file $file failed (status $?)\n";

====== safe_download.sh ======
  #!/usr/bin/bash
  
  # Download one file from an assembly directory, verify it against the
  # directory's md5checksums.txt, and move it to TARGET_DIR on success.
  # A failed verification appends the download URL to failed_download.txt.
  #
  # argument (FtpPath_RefSeq) example
  # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/741/935/GCF_000741935.1_ASM74193v1
  #
  # How to run:
  #   while read path; do bash ./THIS_SCRIPT "$path"; done < download_path.txt
  
  TARGET_DIR=download
  
  if [ -z "$1" ]; then
      echo "Usage: $0 <FtpPath_RefSeq>" >&2
      exit 1
  fi
  
  # Modify 'genomic.gbff.gz' as you want!
  DOWNLOAD_PATH=$(echo "$1" | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/.+\/)(GC._.+)|\1\2\/\2_genomic.gbff.gz|')
  FILE=${DOWNLOAD_PATH##*/}
  MD5SUM_FILE=${1}/md5checksums.txt
  # Per-process scratch file: "$$" is the shell's PID, so this is "<PID>checksum.txt".
  CHECKSUM_FILE=$$checksum.txt
  
  wget "${MD5SUM_FILE}"
  # -F: the file name is a fixed string, not a regular expression.
  grep -F -- "${FILE}" md5checksums.txt > "${CHECKSUM_FILE}"
  wget "${DOWNLOAD_PATH}"
  
  md5sum -c "${CHECKSUM_FILE}"
  if [ $? -eq 0 ]; then
      echo "${FILE} download OK! Moving to ${TARGET_DIR}..."
      # Without the directory, mv would rename the file to "download" and the
      # next successful download would overwrite it.
      mkdir -p "${TARGET_DIR}"
      mv "${FILE}" "${TARGET_DIR}/"
  else
      echo "${FILE} download FAIL!"
      echo "${DOWNLOAD_PATH}" >> failed_download.txt
  fi
  
  # -f: do not complain when a download failed and a file is missing.
  rm -f md5checksums.txt* "${CHECKSUM_FILE}"

====== gbkInfo.pl ======

  #!/usr/bin/perl
  # 
  #    Written by Haeyoung Jeong
  #    
  
  use Bio::SeqIO;
  use File::Basename;
  
  # latest update: 2022-05-06
  
  # Start-up: work out the script's name and modification time for the usage
  # message, then open the GenBank input ($ARGV[0]) and name the output table
  # "<basename of input>.txt". An optional second argument '-seq' asks for
  # nucleotide and amino acid sequences to be added to that table.
  $scriptname = basename($0);
  # stat() the path the script was invoked with; the bare basename only
  # resolves when the current directory happens to be the script's directory.
  $modtime = (stat($0))[9];
  $timestamp = defined $modtime ? scalar localtime($modtime) : 'unknown';
  
  unless (@ARGV) {
      print STDERR "Usage : $scriptname <GenBank file> [-seq]\n";
      print STDERR "        script last modified at $timestamp\n";
      exit;
  }
  
  # debug mode is for additional analysis using the 'data.txt' file
  $debug = 0;
  if ($debug) {
      print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n";
  }
  
  $seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
  $infoFile = basename($ARGV[0]) . '.txt';
  # The second argument is optional, so test for its presence before comparing.
  if (defined $ARGV[1] && $ARGV[1] eq '-seq') {
      print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n";
  }
  # FASTA writer on STDOUT; only the commented-out write_seq() call in the
  # CDS section refers to it.
  $seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');
  
  # Counters reported on STDERR after the loop:
  #   $LOCUS_num - sequence records read
  #   $num       - 'gene' features
  #   $num2      - 'CDS' features that carry no 'pseudo' tag
  $LOCUS_num = $num = $num2 = 0;
  
  # Main pass over the input. Can handle multiple sequence objects in a single
  # GenBank file. Results are collected in package-level hashes that later
  # parts of the script read:
  #   %annot{locus_tag}   - per-locus fields (locus, type, start, end, strand, ...)
  #   %whole_seq{LOCUS}   - the sequence object of each record
  #   %seqref{locus_tag}  - value of $feat->seq for each gene feature
  #   @pseudo             - locus_tags of gene features tagged 'pseudo'
  while (my $seqObj = $seqIn->next_seq()) {
      my $LOCUS = $seqObj->display_id();
      my $VERSION = $seqObj->version();
      print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n";
      $LOCUS_num++;
      $whole_seq{$LOCUS} = $seqObj;
      @features = $seqObj->get_SeqFeatures();
      foreach my $feat ( @features ) {
          # types of primary_tag: source, gene, CDS, rRNA, tRNA
          # Only source, gene and CDS are acted upon below.
          if ($feat->primary_tag eq 'source') {
              print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
          }
  
  # For each gene feature, check if it is pseudo or not.
  # Extract function info if possible.
  # Values will be stored in %annot hash.
  
          if ($feat->primary_tag eq 'gene') {
              $num++;
  # $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727)
  # The eval yields undef when get_tag_values() dies; an undef $locus_tag is
  # then used as the hash key '' below.
  # NOTE(review): $locus_tag is a package variable here but a 'my' variable in
  # the CDS branch.
              $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
              # Mark this locus as having a gene feature (checked in the CDS branch).
              $geneFeaturePresence{$locus_tag} = "";
              $annot{$locus_tag}->{'locus'} = $LOCUS;
              $annot{$locus_tag}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
              $annot{$locus_tag}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
              $seqref{$locus_tag} = $feat->seq;
              if ($feat->has_tag(pseudo)) {
                  $annot{$locus_tag}->{'isPseudo'} = 'pseudo';
                  push @pseudo, $locus_tag;
              }
  # $feat->start is always smaller than $feat->end.
  # Any strand value other than 1 is recorded as '-'.
              if ($feat->strand == '1') {
                  $annot{$locus_tag}->{'strand'} = '+';
              } else {
                  $annot{$locus_tag}->{'strand'} = '-';
              }
              $annot{$locus_tag}->{'start'} = $feat->start;
              $annot{$locus_tag}->{'end'} = $feat->end;
          }
  
  # For each CDS feature, product/gene/EC_number will be extracted.
  # There can be multiple EC numbers in one CDS feature.
  #
  
  #        if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') {
          if ($feat->primary_tag eq 'CDS') {
              # Counted before the duplicate check below, so every CDS feature
              # without a 'pseudo' tag is counted.
              $num2++ unless $feat->has_tag(pseudo);
              my $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
  
  # This is for gene feature-less GenBank file produced by Prokka.
  # If possible, use subroutine.
  # When no gene feature was seen for this locus_tag, take locus, strand,
  # start and end from the CDS feature itself. %seqref is not filled here.
  #
              unless (exists $geneFeaturePresence{$locus_tag}) {
                  $annot{$locus_tag}->{'locus'} = $LOCUS;
                  if ($feat->strand == '1') {
                      $annot{$locus_tag}->{'strand'} = '+';
                  } else {
                      $annot{$locus_tag}->{'strand'} = '-';
                  }
                  $annot{$locus_tag}->{'start'} = $feat->start;
                  $annot{$locus_tag}->{'end'} = $feat->end;
              }
  
  # If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
  # process the first one only. Remaining CDS feature(s) has the same information.
  # 'next' moves on to the next feature of the current record.
  #
              next if exists $seen{$locus_tag};
              $seen{$locus_tag} = '';
  
              $annot{$locus_tag}->{'type'} = $feat->primary_tag;
              $annot{$locus_tag}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
              $annot{$locus_tag}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
              $annot{$locus_tag}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);
  
  # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
  #
              if ($feat->has_tag(EC_number)) {
                  my @tag = $feat->get_tag_values(EC_number);
                  $annot{$locus_tag}->{EC_number} = join "; ", @tag;
              }
  
  # Extracting gene sequences if not pseudo.
  # If you want to process specified genes only, then use $locus_tag variable.
  # Sequence features will not be printed if $debug is nonzero.
  # Sequences are stored only for CDS features that carry a 'translation' tag.
  #
              if ($feat->has_tag(translation)) {
                  my $NTsequence = $feat->seq->seq();
                  my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
                  # $product is referenced only by the commented-out code below.
                  my $product = $annot{$locus_tag}->{'product'};
                  $annot{$locus_tag}->{'NTseq'} = $NTsequence;
                  $annot{$locus_tag}->{'AAseq'} = $AAsequence;
  # You can add any other information to 'desc'.
  # Uncomment the following five lines if you want to print seq to STDOUT and
  # set -seq to be either $NTsequence or $AAsequence.
  #                my $seqObj = Bio::Seq->new(-display_id => $locus_tag,
  #                                           -desc       => $product,
  #                                           -seq        => $NTsequence
  #                                           );
  #                $seqOut->write_seq($seqObj);
              }
          }
      }
  }
  
  # Report the counters on STDERR, then write one tab-delimited row per
  # locus_tag in %annot to $infoFile, ordered by sequence ID and then by
  # numeric start position. With '-seq' as the second command-line argument,
  # the nucleotide and amino acid sequences are appended to every row.
  $argv0 = basename($ARGV[0]);
  print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n";
  print STDERR "[STDERR] $argv0 has $num gene features\n";
  print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n";
  print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n";
  
  # Now print the entire information!
  # sort required!
  
  @final = ();
  # Three-argument open with a lexical handle; $! says why it failed.
  open my $info_fh, '>', $infoFile or die "Can't open $infoFile for writing: $!\n";
  print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n";
  
  # The second argument is optional, so test for its presence before comparing.
  my $with_seq = defined $ARGV[1] && $ARGV[1] eq '-seq';
  
  print {$info_fh} join "\t", '#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand',
             'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence' , 'AA_sequence' . "\n";
  
  # Column order after locus_tag; must stay in step with the header above.
  my @tail_fields = qw(old_locus_tag isPseudo start end strand
                       gene product protein_id EC_number function);
  
  for my $locus_tag (sort keys %annot) {
      my $rec = $annot{$locus_tag};
      my $temp = join "\t", $argv0, $rec->{'locus'}, $rec->{'type'}, $locus_tag,
                            @{$rec}{@tail_fields};
      if ($with_seq) {
          $temp .= "\t" . $rec->{'NTseq'} . "\t" . $rec->{'AAseq'};
      }
      $temp .= "\n";
      push @final, $temp;
  }
  
  # Decorate each row with its sort keys, sort, then strip the keys again.
  @sorted_final = map  { $_->[0] }
                  sort { 
                         $a->[1] cmp $b->[1] # First sort by locus
                                 ||
                         $a->[2] <=> $b->[2] # Second sort by start position (numeric)
                       }
                  map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start
  
  print {$info_fh} @sorted_final;
  # Buffered write errors (e.g. a full disk) only surface at close.
  close $info_fh or die "Error while writing $infoFile: $!\n";
     
  if ($debug) {
  # Optional extra analysis: for every position listed in <data.txt>, report
  # where it falls inside its gene (nucleotide offset, codon number, position
  # in the codon), the codon and its translation, and the genome nucleotide.
  #
  # data.txt example (tab-delimited files; three columns)
  # seqID	locus_tag	SNP_position
  # NC_009698	CLC_0737	758733
  # NC_009698	CLC_0847	867070
  # Sorting data.txt using the third column as a key before running this script is desirable
  #
  # Rows whose locus_tag is 'NA', and rows whose gene is marked pseudo, get
  # 'NA' in all gene-level columns and only the genome nucleotide.
      open my $data_fh, '<', 'data.txt' or die "Can't open data.txt file for reading: $!\n";
      print join "\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)', 
                       'AA_position (gene)', 'position_in_codon',
                       'codon', 'AA', 'nucleotide (genome)' . "\n";
      while (my $line = <$data_fh>) {
          next if $line =~ /^#/;
          chomp $line;
  
          my ($LOCUS, $locus_tag, $SNP) = split /\t/, $line;
          next unless defined $SNP;      # blank or incomplete line
  
          # A 'NA' locus_tag has no entry in %annot, so it cannot be held to the
          # locus consistency check; it only needs a known sequence ID.
          my $no_gene = $locus_tag eq 'NA';
          my $consistent = $no_gene
                         ? exists $whole_seq{$LOCUS}
                         : (exists $annot{$locus_tag} && $LOCUS eq $annot{$locus_tag}->{'locus'});
          unless ($consistent) {
              print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n";
              exit 1;                    # non-zero status: this is a failure
          }
  
          if ($no_gene || exists $annot{$locus_tag}->{'isPseudo'}) {
              my $strand = $no_gene ? '' : $annot{$locus_tag}->{'strand'};
              print join "\t", $LOCUS, $locus_tag, $strand, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA',
                               $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
              next;
          }
  
          my ($start, $end, $strand) = @{ $annot{$locus_tag} }{qw(start end strand)};
  
          # Zero-based offset of the position within the gene, counted from the
          # gene's own first base: from 'start' on '+', from 'end' otherwise.
          my $NA_position_in_feature = $strand eq '+' ? $SNP - $start : $end - $SNP;
  
          my $AA_position_in_feature = int($NA_position_in_feature / 3) + 1;
          my $position_in_codon = ($NA_position_in_feature + 1) % 3;
          $position_in_codon = 3 if $position_in_codon == 0;
  
          # 1-based coordinates of the codon inside the stored gene sequence.
          my $codon_end = $AA_position_in_feature * 3;
          my $codon = $seqref{$locus_tag}->trunc($codon_end - 2, $codon_end);
          my $AA = $codon->translate();
          print join "\t", $LOCUS, $locus_tag, $strand, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature,
                           $position_in_codon, $codon->seq(), $AA->seq(), 
                           $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
      }
      close $data_fh;
  }