Differences

This shows you the differences between two versions of the page.

--- custom_scripts [2023/06/22 17:12] – removed - external edit (Unknown date) 127.0.0.1
+++ custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong
@@ Line 1: / Line 1: @@
+====== my_rename1.pl ======
+  #!/usr/bin/perl
+  #
+  open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!";
+  while (<ALL>) {
+      chomp;
+      next unless /^(GCF|GCA)/;
+      my @data = split /\t/, $_;
+      $data[8] =~ s/strain=//; # infraspefic_name
+      my $strain = $data[8];
+      $data[7] =~ s/ $strain$//; # remove redundant strain name
+      my $name = $data[7] . ' ' . $data[8];
+      $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required
+      $name =~ s/:/-/g;
+      $name =~ s/\s+/_/g;
+      $name =~ s/;.*$//;     # remove semicolon and following information
+      $name =~ s/\//_/;
+      $name =~ s/_$//;
+      my @temp = split /\//, $data[19];
+      $key = pop @temp;
+      $key2name{$key} = $name;
+      print $key, "\t", $name, "\n";
+  }
+====== my_rename2.pl ======
+  #!/usr/bin/perl
+  #
+  # $ARGV[0] : id2name file
+  #    GCF_000015065.1_ASM1506v1       Bt_str._Al_Hakam
+  #    GCF_000092165.1_ASM9216v1       Bt_BMB171
+  #    ...
+  # $ARGV[1] : fna file
+  #    GCF_000015065.1_ASM1506v1_genomic.fna
+  #
+  open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!";
+  while (<ALL>) {
+      chomp;
+      my @data = split /\t/, $_;
+      $key2name{$data[0]} = $data[1];
+  }
+  my @temp = split /_/, $ARGV[1];
+  my $suffix = pop @temp;
+  my $accession = $temp[0] . '_' . $temp[1];
+  $suffix =~ /^.+\.(.+)$/;
+  $end = $1;
+  $key = join '_', @temp;
+  #print $key, " ", $key2name{$key}, " ", $end, "\n";
+  $file = $key2name{$key} . '_' . $accession . '.' . $end;
+  print "$ARGV[1] ===> $file\n";
+  system("cp $ARGV[1] $file") if defined $file;
+====== safe_download.sh ======
+  #!/usr/bin/bash
+  # argument (FtpPath_RefSeq) example
+  # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/741/935/GCF_000741935.1_ASM74193v1
+  #
+  # How to run:
+  #   while read path; do bash ./THIS_SCRIPT; done < download_path.txt
+  TARGET_DIR=download
+  # Modify 'genomic.gbff.gz' as you want!
+  DOWNLOAD_PATH=$(echo $1 | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/.+\/)(GC._.+)|\1\2\/\2_genomic.gbff.gz|')
+  FILE=${DOWNLOAD_PATH##*/}
+  MD5SUM_FILE=${1}/md5checksums.txt
+  wget ${MD5SUM_FILE}
+  grep ${FILE} md5checksums.txt > $$checksum.txt
+  wget ${DOWNLOAD_PATH}
+  md5sum -c $$checksum.txt
+  if [ $? -eq 0 ]; then
+      echo ${FILE} download OK! Moving to ${TARGET_DIR}...
+      mv ${FILE} $TARGET_DIR
+  else
+      echo ${FILE} download FAIL!
+      echo ${DOWNLOAD_PATH} >> failed_download.txt
+  fi
+  rm md5checksums.txt* $$checksum.txt
+====== gbkInfo.pl ======
+  #!/usr/bin/perl
+  #
+  #    Written by Haeyoung Jeong
+  #
+  use Bio::SeqIO;
+  use File::Basename;
+  # latest update: 2022-05-06
+  $scriptname = basename($0);
+  $modtime = (stat($scriptname))[9];
+  $timestamp = localtime($modtime);
+  if ($#ARGV == -1) {
+      print STDERR "Usage : $scriptname <GenBank file> [-seq]\n";
+      print STDERR "        script last modified at $timestamp\n";
+      exit;
+  }
+  # debug mode is for additional analysis using the 'data.txt' file
+  $debug = 0;
+  if ($debug) {
+      print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n";
+  }
+  $seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
+  $infoFile = basename($ARGV[0]) . '.txt';
+  if ($ARGV[1] eq '-seq') {
+      print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n";
+  }
+  $seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');
+  $LOCUS_num = $num = $num2 = 0;
+  # Can handle multiple sequence objects in a single GenBank file
+  while (my $seqObj = $seqIn->next_seq()) {
+      my $LOCUS = $seqObj->display_id();
+      my $VERSION = $seqObj->version();
+      print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n";
+      $LOCUS_num++;
+      $whole_seq{$LOCUS} = $seqObj;
+      @features = $seqObj->get_SeqFeatures();
+      foreach my $feat ( @features ) {
+          # types of primary_tag: source, gene, CDS, rRNA, tRNA
+          if ($feat->primary_tag eq 'source') {
+              print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
+          }
+  # For each gene feature, check if it is pseudo or not.
+  # Extract function info if possible.
+  # Values will be stored in %annot hash.
+          if ($feat->primary_tag eq 'gene') {
+              $num++;
+  # $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727)
+              $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
+              $geneFeaturePresence{$locus_tag} = "";
+              $annot{$locus_tag}->{'locus'} = $LOCUS;
+              $annot{$locus_tag}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
+              $annot{$locus_tag}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
+              $seqref{$locus_tag} = $feat->seq;
+              if ($feat->has_tag(pseudo)) {
+                  $annot{$locus_tag}->{'isPseudo'} = 'pseudo';
+                  push @pseudo, $locus_tag;
+              }
+  # $feat->start is always smaller than $feat->end.
+              if ($feat->strand == '1') {
+                  $annot{$locus_tag}->{'strand'} = '+';
+              } else {
+                  $annot{$locus_tag}->{'strand'} = '-';
+              }
+              $annot{$locus_tag}->{'start'} = $feat->start;
+              $annot{$locus_tag}->{'end'} = $feat->end;
+          }
+  # For each CDS feature, product/gene/EC_number will be extracted.
+  # There can be multiple EC numbers in one CDS feature.
+  #
+  #        if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') {
+          if ($feat->primary_tag eq 'CDS') {
+              $num2++ unless $feat->has_tag(pseudo);
+              my $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
+  # This is for gene feature-less GenBank file produced by Prokka.
+  # If possible, use subroutine.
+  #
+              unless (exists $geneFeaturePresence{$locus_tag}) {
+                  $annot{$locus_tag}->{'locus'} = $LOCUS;
+                  if ($feat->strand == '1') {
+                      $annot{$locus_tag}->{'strand'} = '+';
+                  } else {
+                      $annot{$locus_tag}->{'strand'} = '-';
+                  }
+                  $annot{$locus_tag}->{'start'} = $feat->start;
+                  $annot{$locus_tag}->{'end'} = $feat->end;
+              }
+  # If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
+  # process the first one only. Remaining CDS feature(s) has the same information.
+  #
+              next if exists $seen{$locus_tag};
+              $seen{$locus_tag} = '';
+              $annot{$locus_tag}->{'type'} = $feat->primary_tag;
+              $annot{$locus_tag}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
+              $annot{$locus_tag}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
+              $annot{$locus_tag}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);
+  # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
+  #
+              if ($feat->has_tag(EC_number)) {
+                  my @tag = $feat->get_tag_values(EC_number);
+                  $annot{$locus_tag}->{EC_number} = join "; ", @tag;
+              }
+  # Extracting gene sequences if not pseudo.
+  # If you want to process specified genes only, then use $locus_tag variable.
+  # Sequence features will not be printed if $debug is nonzero.
+  #
+              if ($feat->has_tag(translation)) {
+                  my $NTsequence = $feat->seq->seq();
+                  my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
+                  my $product = $annot{$locus_tag}->{'product'};
+                  $annot{$locus_tag}->{'NTseq'} = $NTsequence;
+                  $annot{$locus_tag}->{'AAseq'} = $AAsequence;
+  # You can add any other information to 'desc'.
+  # Uncomment the following five lines if you want print seq to STDOUT and
+  # specific -seq to be ether $NTsequence or $AAsequence.
+  #                my $seqObj = Bio::Seq->new(-display_id => $locus_tag,
+  #                                           -desc       => $product,
+  #                                           -seq        => $NTsequence
+  #                                           );
+  #                $seqOut->write_seq($seqObj);
+              }
+          }
+      }
+  }
+  $argv0 = basename($ARGV[0]);
+  print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n";
+  print STDERR "[STDERR] $argv0 has $num gene features\n";
+  print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n";
+  print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n";
+  # Now print the entire information!
+  # sort required!
+  @final = ();
+  open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n";
+  print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n";
+  print INFO join "\t", '#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand',
+             'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence' , 'AA_sequence' . "\n";
+  for my $locus_tag (sort keys %annot) {
+      my $temp = join "\t", $argv0, $annot{$locus_tag}->{'locus'}, $annot{$locus_tag}->{'type'},
+               $locus_tag, $annot{$locus_tag}->{'old_locus_tag'},
+               $annot{$locus_tag}->{'isPseudo'},
+               $annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'},
+               $annot{$locus_tag}->{'gene'}, $annot{$locus_tag}->{'product'},
+               $annot{$locus_tag}->{'protein_id'},
+               $annot{$locus_tag}->{'EC_number'}, $annot{$locus_tag}->{'function'};
+               if ($ARGV[1] eq '-seq') {
+                   $temp .= "\t" . $annot{$locus_tag}->{'NTseq'} . "\t" . $annot{$locus_tag}->{'AAseq'};
+               }
+               $temp .= "\n";
+      push @final, $temp;
+  }
+  @sorted_final = map  { $_->[0] }
+                  sort {
+                         $a->[1] cmp $b->[1] # First sort by locus
+                                 ||
+                         $a->[2] <=> $b->[2] # Second sort by start position (numeric)
+                       }
+                  map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start
+  print INFO @sorted_final;
+  if ($debug) {
+  # process using <data.txt> file
+  # data.txt example (tab-delimited files; three columns)
+  # seqID	locus_tag	SNP_position
+  # NC_009698	CLC_0737	758733
+  # NC_009698	CLC_0847	867070
+  # Sorting data.txt using the third column as a key before running this script is desirable
+      open DATA, "data.txt" or die "Can't open data.txt file for reading!\n";
+      print join "\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)',
+                       'AA_position (gene)', 'position_in_codon',
+                       'codon', 'AA', 'nucleotide (genome)' . "\n";
+      while (<DATA>) {
+          next if /^#/;
+          chomp;
+          my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_;
+          if ($LOCUS ne $annot{$locus_tag}->{'locus'}) {
+              print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n";
+              exit;
+          }
+          ($start, $end, $strand) = ($annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'});
+          if ($strand eq '+') {
+             $NA_position_in_feature = $SNP - $annot{$locus_tag}->{'start'}; # zero-base position, i.e., offset
+          } else {
+             $NA_position_in_feature = $annot{$locus_tag}->{'end'} - $SNP;
+          }
+          if (exists $annot{$locus_tag}->{'isPseudo'} || $locus_tag eq 'NA') {
+              print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA',
+                               $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
+          } else {
+              $AA_position_in_feature = int($NA_position_in_feature / 3) + 1;
+              $position_in_codon = ($NA_position_in_feature + 1) % 3;
+              $position_in_codon = 3 if $position_in_codon == 0;
+              $codon = $seqref{$locus_tag}->trunc($AA_position_in_feature * 3 - 2, $AA_position_in_feature * 3);
+              $AA = $codon->translate();
+              print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature,
+                               $position_in_codon, $codon->seq(), $AA->seq(),
+                               $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
+          }
+      }
+  }