User Tools

Site Tools


custom_scripts

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
custom_scripts [2023/06/22 17:12] – removed - external edit (Unknown date) 127.0.0.1custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong
Line 1: Line 1:
 +====== my_rename1.pl ======
 +  #!/usr/bin/perl
 +  #
 +  open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!";
 +  while (<ALL>) {
 +      chomp;
 +      next unless /^(GCF|GCA)/;
 +      my @data = split /\t/, $_;
 +      $data[8] =~ s/strain=//; # infraspefic_name
 +      my $strain = $data[8];
 +      $data[7] =~ s/ $strain$//; # remove redundant strain name
 +      my $name = $data[7] . ' ' . $data[8];
 +      $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required
 +      $name =~ s/:/-/g;
 +      $name =~ s/\s+/_/g;
 +      $name =~ s/;.*$//;     # remove semicolon and following information
 +      $name =~ s/\//_/;
 +      $name =~ s/_$//;
 +      my @temp = split /\//, $data[19];
 +      $key = pop @temp;
 +      $key2name{$key} = $name;
 +      print $key, "\t", $name, "\n";
 +  }
  
 +====== my_rename2.pl ======
 +
 +  #!/usr/bin/perl
 +  #
 +  # $ARGV[0] : id2name file
 +  #    GCF_000015065.1_ASM1506v1       Bt_str._Al_Hakam
 +  #    GCF_000092165.1_ASM9216v1       Bt_BMB171
 +  #    ...
 +  # $ARGV[1] : fna file 
 +  #    GCF_000015065.1_ASM1506v1_genomic.fna
 +  #
 +  
 +  open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!";
 +  while (<ALL>) {
 +      chomp;
 +      my @data = split /\t/, $_;
 +      $key2name{$data[0]} = $data[1];
 +  }
 +  
 +  my @temp = split /_/, $ARGV[1];
 +  my $suffix = pop @temp;
 +  my $accession = $temp[0] . '_' . $temp[1];
 +  $suffix =~ /^.+\.(.+)$/;
 +  $end = $1;
 +  $key = join '_', @temp;
 +  
 +  #print $key, " ", $key2name{$key}, " ", $end, "\n";
 +  $file = $key2name{$key} . '_' . $accession . '.' . $end;
 +  print "$ARGV[1] ===> $file\n";
 +  system("cp $ARGV[1] $file") if defined $file;
 +
 +====== safe_download.sh ======
 +  #!/usr/bin/bash
 +  
 +  # argument (FtpPath_RefSeq) example
 +  # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/741/935/GCF_000741935.1_ASM74193v1
 +  #
 +  # How to run:
 +  #   while read path; do bash ./THIS_SCRIPT; done < download_path.txt
 +  
 +  TARGET_DIR=download
 +  
 +  # Modify 'genomic.gbff.gz' as you want!
 +  DOWNLOAD_PATH=$(echo $1 | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/.+\/)(GC._.+)|\1\2\/\2_genomic.gbff.gz|')
 +  FILE=${DOWNLOAD_PATH##*/}
 +  MD5SUM_FILE=${1}/md5checksums.txt
 +  
 +  wget ${MD5SUM_FILE}
 +  grep ${FILE} md5checksums.txt > $$checksum.txt
 +  wget ${DOWNLOAD_PATH}
 +  
 +  md5sum -c $$checksum.txt
 +  if [ $? -eq 0 ]; then
 +      echo ${FILE} download OK! Moving to ${TARGET_DIR}...
 +      mv ${FILE} $TARGET_DIR
 +  else
 +      echo ${FILE} download FAIL!
 +      echo ${DOWNLOAD_PATH} >> failed_download.txt
 +  fi
 +  
 +  rm md5checksums.txt* $$checksum.txt
 +
 +====== gbkInfo.pl ======
 +
 +  #!/usr/bin/perl
 +  # 
 +  #    Written by Haeyoung Jeong
 +  #    
 +  
 +  use Bio::SeqIO;
 +  use File::Basename;
 +  
 +  # latest update: 2022-05-06
 +  
 +  $scriptname = basename($0);
 +  $modtime = (stat($scriptname))[9];
 +  $timestamp = localtime($modtime);
 +  
 +  if ($#ARGV == -1) {
 +      print STDERR "Usage : $scriptname <GenBank file> [-seq]\n";
 +      print STDERR "        script last modified at $timestamp\n";
 +      exit;
 +  }
 +  
 +  # debug mode is for additional analysis using the 'data.txt' file
 +  $debug = 0;
 +  if ($debug) {
 +      print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n";
 +  }
 +  
 +  $seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank');
 +  $infoFile = basename($ARGV[0]) . '.txt';
 +  if ($ARGV[1] eq '-seq') {
 +      print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n";
 +  }
 +  $seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta');
 +  
 +  $LOCUS_num = $num = $num2 = 0;
 +  
 +  # Can handle multiple sequence objects in a single GenBank file
 +  while (my $seqObj = $seqIn->next_seq()) {
 +      my $LOCUS = $seqObj->display_id();
 +      my $VERSION = $seqObj->version();
 +      print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n";
 +      $LOCUS_num++;
 +      $whole_seq{$LOCUS} = $seqObj;
 +      @features = $seqObj->get_SeqFeatures();
 +      foreach my $feat ( @features ) {
 +          # types of primary_tag: source, gene, CDS, rRNA, tRNA
 +          if ($feat->primary_tag eq 'source') {
 +              print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism);
 +          }
 +  
 +  # For each gene feature, check if it is pseudo or not.
 +  # Extract function info if possible.
 +  # Values will be stored in %annot hash.
 +  
 +          if ($feat->primary_tag eq 'gene') {
 +              $num++;
 +  # $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727)
 +              $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
 +              $geneFeaturePresence{$locus_tag} = "";
 +              $annot{$locus_tag}->{'locus'} = $LOCUS;
 +              $annot{$locus_tag}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function);
 +              $annot{$locus_tag}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag);
 +              $seqref{$locus_tag} = $feat->seq;
 +              if ($feat->has_tag(pseudo)) {
 +                  $annot{$locus_tag}->{'isPseudo'} = 'pseudo';
 +                  push @pseudo, $locus_tag;
 +              }
 +  # $feat->start is always smaller than $feat->end.
 +              if ($feat->strand == '1') {
 +                  $annot{$locus_tag}->{'strand'} = '+';
 +              } else {
 +                  $annot{$locus_tag}->{'strand'} = '-';
 +              }
 +              $annot{$locus_tag}->{'start'} = $feat->start;
 +              $annot{$locus_tag}->{'end'} = $feat->end;
 +          }
 +  
 +  # For each CDS feature, product/gene/EC_number will be extracted.
 +  # There can be multiple EC numbers in one CDS feature.
 +  #
 +  
 +  #        if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') {
 +          if ($feat->primary_tag eq 'CDS') {
 +              $num2++ unless $feat->has_tag(pseudo);
 +              my $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]};
 +  
 +  # This is for gene feature-less GenBank file produced by Prokka.
 +  # If possible, use subroutine.
 +  #
 +              unless (exists $geneFeaturePresence{$locus_tag}) {
 +                  $annot{$locus_tag}->{'locus'} = $LOCUS;
 +                  if ($feat->strand == '1') {
 +                      $annot{$locus_tag}->{'strand'} = '+';
 +                  } else {
 +                      $annot{$locus_tag}->{'strand'} = '-';
 +                  }
 +                  $annot{$locus_tag}->{'start'} = $feat->start;
 +                  $annot{$locus_tag}->{'end'} = $feat->end;            
 +              }
 +  
 +  # If there are multiple "pseudo" CDS features (fragmentary) in one given locus,
 +  # process the first one only. Remaining CDS feature(s) has the same information.
 +  #
 +              next if exists $seen{$locus_tag};
 +              $seen{$locus_tag} = '';
 +  
 +              $annot{$locus_tag}->{'type'} = $feat->primary_tag;
 +              $annot{$locus_tag}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product);
 +              $annot{$locus_tag}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene);
 +              $annot{$locus_tag}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id);
 +  
 +  # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18)
 +  #
 +              if ($feat->has_tag(EC_number)) {
 +                  my @tag = $feat->get_tag_values(EC_number);
 +                  $annot{$locus_tag}->{EC_number} = join "; ", @tag;
 +              }
 +  
 +  # Extracting gene sequences if not pseudo.
 +  # If you want to process specified genes only, then use $locus_tag variable.
 +  # Sequence features will not be printed if $debug is nonzero.
 +  #
 +              if ($feat->has_tag(translation)) {
 +                  my $NTsequence = $feat->seq->seq();
 +                  my $AAsequence = eval {($feat->get_tag_values(translation))[0]};
 +                  my $product = $annot{$locus_tag}->{'product'};
 +                  $annot{$locus_tag}->{'NTseq'} = $NTsequence;
 +                  $annot{$locus_tag}->{'AAseq'} = $AAsequence;
 +  # You can add any other information to 'desc'.
 +  # Uncomment the following five lines if you want print seq to STDOUT and
 +  # specific -seq to be ether $NTsequence or $AAsequence.
 +  #                my $seqObj = Bio::Seq->new(-display_id => $locus_tag,
 +  #                                           -desc       => $product,
 +  #                                           -seq        => $NTsequence
 +  #                                           );
 +  #                $seqOut->write_seq($seqObj);
 +              }
 +          }
 +      }
 +  }
 +  
 +  $argv0 = basename($ARGV[0]);
 +  print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n";
 +  print STDERR "[STDERR] $argv0 has $num gene features\n";
 +  print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n";
 +  print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n";
 +  
 +  # Now print the entire information!
 +  # sort required!
 +  
 +  @final = ();
 +  open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n";
 +  print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n";
 +  
 +  print INFO join "\t", '#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand',
 +             'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence' , 'AA_sequence' . "\n";
 +  for my $locus_tag (sort keys %annot) {
 +      my $temp = join "\t", $argv0, $annot{$locus_tag}->{'locus'}, $annot{$locus_tag}->{'type'}, 
 +               $locus_tag, $annot{$locus_tag}->{'old_locus_tag'}, 
 +               $annot{$locus_tag}->{'isPseudo'},
 +               $annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'},
 +               $annot{$locus_tag}->{'gene'}, $annot{$locus_tag}->{'product'},
 +               $annot{$locus_tag}->{'protein_id'}, 
 +               $annot{$locus_tag}->{'EC_number'}, $annot{$locus_tag}->{'function'};
 +               if ($ARGV[1] eq '-seq') {
 +                   $temp .= "\t" . $annot{$locus_tag}->{'NTseq'} . "\t" . $annot{$locus_tag}->{'AAseq'};
 +               }
 +               $temp .= "\n";
 +      push @final, $temp;
 +  }
 +  
 +  @sorted_final = map  { $_->[0] }
 +                  sort { 
 +                         $a->[1] cmp $b->[1] # First sort by locus
 +                                 ||
 +                         $a->[2] <=> $b->[2] # Second sort by start position (numeric)
 +                       }
 +                  map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start
 +  
 +  print INFO @sorted_final;
 +     
 +  if ($debug) { 
 +  # process using <data.txt> file
 +  # data.txt example (tab-delimited files; three columns)
 +  # seqID locus_tag SNP_position
 +  # NC_009698 CLC_0737 758733
 +  # NC_009698 CLC_0847 867070
 +  # Sorting data.txt using the third column as a key before running this script is desirable
 +      open DATA, "data.txt" or die "Can't open data.txt file for reading!\n";
 +      print join "\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)', 
 +                       'AA_position (gene)', 'position_in_codon',
 +                       'codon', 'AA', 'nucleotide (genome)' . "\n";
 +      while (<DATA>) {
 +          next if /^#/;
 +          chomp;
 +  
 +          my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_;
 +          if ($LOCUS ne $annot{$locus_tag}->{'locus'}) {
 +              print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n";
 +              exit;
 +          }
 +  
 +          ($start, $end, $strand) = ($annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'});
 +          if ($strand eq '+') {
 +             $NA_position_in_feature = $SNP - $annot{$locus_tag}->{'start'}; # zero-base position, i.e., offset
 +          } else {
 +             $NA_position_in_feature = $annot{$locus_tag}->{'end'} - $SNP;
 +          }
 +          if (exists $annot{$locus_tag}->{'isPseudo'} || $locus_tag eq 'NA') {
 +              print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA',
 +                               $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
 +          } else {
 +              $AA_position_in_feature = int($NA_position_in_feature / 3) + 1;
 +              $position_in_codon = ($NA_position_in_feature + 1) % 3;
 +              $position_in_codon = 3 if $position_in_codon == 0;
 +              $codon = $seqref{$locus_tag}->trunc($AA_position_in_feature * 3 - 2, $AA_position_in_feature * 3);
 +              $AA = $codon->translate();
 +              print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature,
 +                               $position_in_codon, $codon->seq(), $AA->seq(), 
 +                               $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n";
 +          }
 +      }
 +  }