====== my_rename1.pl ====== #!/usr/bin/perl # open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!"; while () { chomp; next unless /^(GCF|GCA)/; my @data = split /\t/, $_; $data[8] =~ s/strain=//; # infraspefic_name my $strain = $data[8]; $data[7] =~ s/ $strain$//; # remove redundant strain name my $name = $data[7] . ' ' . $data[8]; $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required $name =~ s/:/-/g; $name =~ s/\s+/_/g; $name =~ s/;.*$//; # remove semicolon and following information $name =~ s/\//_/; $name =~ s/_$//; my @temp = split /\//, $data[19]; $key = pop @temp; $key2name{$key} = $name; print $key, "\t", $name, "\n"; } ====== my_rename2.pl ====== #!/usr/bin/perl # # $ARGV[0] : id2name file # GCF_000015065.1_ASM1506v1 Bt_str._Al_Hakam # GCF_000092165.1_ASM9216v1 Bt_BMB171 # ... # $ARGV[1] : fna file # GCF_000015065.1_ASM1506v1_genomic.fna # open ALL, $ARGV[0] or die "Can't open file $ARGV[0]!"; while () { chomp; my @data = split /\t/, $_; $key2name{$data[0]} = $data[1]; } my @temp = split /_/, $ARGV[1]; my $suffix = pop @temp; my $accession = $temp[0] . '_' . $temp[1]; $suffix =~ /^.+\.(.+)$/; $end = $1; $key = join '_', @temp; #print $key, " ", $key2name{$key}, " ", $end, "\n"; $file = $key2name{$key} . '_' . $accession . '.' . $end; print "$ARGV[1] ===> $file\n"; system("cp $ARGV[1] $file") if defined $file; ====== safe_download.sh ====== #!/usr/bin/bash # argument (FtpPath_RefSeq) example # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/741/935/GCF_000741935.1_ASM74193v1 # # How to run: # while read path; do bash ./THIS_SCRIPT; done < download_path.txt TARGET_DIR=download # Modify 'genomic.gbff.gz' as you want! DOWNLOAD_PATH=$(echo $1 | sed -r 's|(ftp://ftp.ncbi.nlm.nih.gov/genomes/all/.+\/)(GC._.+)|\1\2\/\2_genomic.gbff.gz|') FILE=${DOWNLOAD_PATH##*/} MD5SUM_FILE=${1}/md5checksums.txt wget ${MD5SUM_FILE} grep ${FILE} md5checksums.txt > $$checksum.txt wget ${DOWNLOAD_PATH} md5sum -c $$checksum.txt if [ $? -eq 0 ]; then echo ${FILE} download OK! Moving to ${TARGET_DIR}... mv ${FILE} $TARGET_DIR else echo ${FILE} download FAIL! echo ${DOWNLOAD_PATH} >> failed_download.txt fi rm md5checksums.txt* $$checksum.txt ====== gbkInfo.pl ====== #!/usr/bin/perl # # Written by Haeyoung Jeong # use Bio::SeqIO; use File::Basename; # latest update: 2022-05-06 $scriptname = basename($0); $modtime = (stat($scriptname))[9]; $timestamp = localtime($modtime); if ($#ARGV == -1) { print STDERR "Usage : $scriptname [-seq]\n"; print STDERR " script last modified at $timestamp\n"; exit; } # debug mode is for additional analysis using the 'data.txt' file $debug = 0; if ($debug) { print STDERR "[STDERR] Entering debug mode...[ \$debug = $debug ; data.txt file will be used ]\n"; } $seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank'); $infoFile = basename($ARGV[0]) . '.txt'; if ($ARGV[1] eq '-seq') { print STDERR "[STDERR] Nucleotide and amino acid sequence will be written to the $infoFile file.\n"; } $seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta'); $LOCUS_num = $num = $num2 = 0; # Can handle multiple sequence objects in a single GenBank file while (my $seqObj = $seqIn->next_seq()) { my $LOCUS = $seqObj->display_id(); my $VERSION = $seqObj->version(); print STDERR "[STDERR] LOCUS: $LOCUS (VERSION: $VERSION)\n"; $LOCUS_num++; $whole_seq{$LOCUS} = $seqObj; @features = $seqObj->get_SeqFeatures(); foreach my $feat ( @features ) { # types of primary_tag: source, gene, CDS, rRNA, tRNA if ($feat->primary_tag eq 'source') { print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism); } # For each gene feature, check if it is pseudo or not. # Extract function info if possible. # Values will be stored in %annot hash. if ($feat->primary_tag eq 'gene') { $num++; # $annot{$locus_tag}->{'locus'} holds 'LOCUS' (e.g., CP000727) $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]}; $geneFeaturePresence{$locus_tag} = ""; $annot{$locus_tag}->{'locus'} = $LOCUS; $annot{$locus_tag}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function); $annot{$locus_tag}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag); $seqref{$locus_tag} = $feat->seq; if ($feat->has_tag(pseudo)) { $annot{$locus_tag}->{'isPseudo'} = 'pseudo'; push @pseudo, $locus_tag; } # $feat->start is always smaller than $feat->end. if ($feat->strand == '1') { $annot{$locus_tag}->{'strand'} = '+'; } else { $annot{$locus_tag}->{'strand'} = '-'; } $annot{$locus_tag}->{'start'} = $feat->start; $annot{$locus_tag}->{'end'} = $feat->end; } # For each CDS feature, product/gene/EC_number will be extracted. # There can be multiple EC numbers in one CDS feature. # # if ($feat->primary_tag eq 'CDS' || $feat->primary_tag eq 'rRNA' || $feat->primary_tag eq 'tRNA') { if ($feat->primary_tag eq 'CDS') { $num2++ unless $feat->has_tag(pseudo); my $locus_tag = eval {($feat->get_tag_values(locus_tag))[0]}; # This is for gene feature-less GenBank file produced by Prokka. # If possible, use subroutine. # unless (exists $geneFeaturePresence{$locus_tag}) { $annot{$locus_tag}->{'locus'} = $LOCUS; if ($feat->strand == '1') { $annot{$locus_tag}->{'strand'} = '+'; } else { $annot{$locus_tag}->{'strand'} = '-'; } $annot{$locus_tag}->{'start'} = $feat->start; $annot{$locus_tag}->{'end'} = $feat->end; } # If there are multiple "pseudo" CDS features (fragmentary) in one given locus, # process the first one only. Remaining CDS feature(s) has the same information. # next if exists $seen{$locus_tag}; $seen{$locus_tag} = ''; $annot{$locus_tag}->{'type'} = $feat->primary_tag; $annot{$locus_tag}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product); $annot{$locus_tag}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene); $annot{$locus_tag}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id); # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) # if ($feat->has_tag(EC_number)) { my @tag = $feat->get_tag_values(EC_number); $annot{$locus_tag}->{EC_number} = join "; ", @tag; } # Extracting gene sequences if not pseudo. # If you want to process specified genes only, then use $locus_tag variable. # Sequence features will not be printed if $debug is nonzero. # if ($feat->has_tag(translation)) { my $NTsequence = $feat->seq->seq(); my $AAsequence = eval {($feat->get_tag_values(translation))[0]}; my $product = $annot{$locus_tag}->{'product'}; $annot{$locus_tag}->{'NTseq'} = $NTsequence; $annot{$locus_tag}->{'AAseq'} = $AAsequence; # You can add any other information to 'desc'. # Uncomment the following five lines if you want print seq to STDOUT and # specific -seq to be ether $NTsequence or $AAsequence. # my $seqObj = Bio::Seq->new(-display_id => $locus_tag, # -desc => $product, # -seq => $NTsequence # ); # $seqOut->write_seq($seqObj); } } } } $argv0 = basename($ARGV[0]); print STDERR "[STDERR] $argv0 has $LOCUS_num sequence(s)\n"; print STDERR "[STDERR] $argv0 has $num gene features\n"; print STDERR "[STDERR] $argv0 has $num2 active CDS features (not marked as 'pseudo')\n"; print STDERR "[STDERR] $argv0 has ", scalar @pseudo, " pseudo genes\n"; # Now print the entire information! # sort required! @final = (); open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n"; print STDERR "[STDERR] Feature information is being written to [ $infoFile ] (pre-existing file was overwritten!)\n"; print INFO join "\t", '#source', 'seqID', 'feature', 'locus_tag', 'old_locus_tag', 'isPseudo?', 'start', 'end', 'strand', 'gene', 'product', 'protein_id', 'EC_number', 'function', 'NT_sequence' , 'AA_sequence' . "\n"; for my $locus_tag (sort keys %annot) { my $temp = join "\t", $argv0, $annot{$locus_tag}->{'locus'}, $annot{$locus_tag}->{'type'}, $locus_tag, $annot{$locus_tag}->{'old_locus_tag'}, $annot{$locus_tag}->{'isPseudo'}, $annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'}, $annot{$locus_tag}->{'gene'}, $annot{$locus_tag}->{'product'}, $annot{$locus_tag}->{'protein_id'}, $annot{$locus_tag}->{'EC_number'}, $annot{$locus_tag}->{'function'}; if ($ARGV[1] eq '-seq') { $temp .= "\t" . $annot{$locus_tag}->{'NTseq'} . "\t" . $annot{$locus_tag}->{'AAseq'}; } $temp .= "\n"; push @final, $temp; } @sorted_final = map { $_->[0] } sort { $a->[1] cmp $b->[1] # First sort by locus || $a->[2] <=> $b->[2] # Second sort by start position (numeric) } map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start print INFO @sorted_final; if ($debug) { # process using file # data.txt example (tab-delimited files; three columns) # seqID locus_tag SNP_position # NC_009698 CLC_0737 758733 # NC_009698 CLC_0847 867070 # Sorting data.txt using the third column as a key before running this script is desirable open DATA, "data.txt" or die "Can't open data.txt file for reading!\n"; print join "\t", '#SeqID', 'locus_tag', 'strand', 'feature_start', 'SNP_position', 'NT_position (gene)', 'AA_position (gene)', 'position_in_codon', 'codon', 'AA', 'nucleotide (genome)' . "\n"; while () { next if /^#/; chomp; my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_; if ($LOCUS ne $annot{$locus_tag}->{'locus'}) { print STDERR "[FATAL ERROR] Something went wrong! Aborting....\n"; exit; } ($start, $end, $strand) = ($annot{$locus_tag}->{'start'}, $annot{$locus_tag}->{'end'}, $annot{$locus_tag}->{'strand'}); if ($strand eq '+') { $NA_position_in_feature = $SNP - $annot{$locus_tag}->{'start'}; # zero-base position, i.e., offset } else { $NA_position_in_feature = $annot{$locus_tag}->{'end'} - $SNP; } if (exists $annot{$locus_tag}->{'isPseudo'} || $locus_tag eq 'NA') { print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, 'NA', $SNP, 'NA', 'NA', 'NA', 'NA', 'NA', $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n"; } else { $AA_position_in_feature = int($NA_position_in_feature / 3) + 1; $position_in_codon = ($NA_position_in_feature + 1) % 3; $position_in_codon = 3 if $position_in_codon == 0; $codon = $seqref{$locus_tag}->trunc($AA_position_in_feature * 3 - 2, $AA_position_in_feature * 3); $AA = $codon->translate(); print join "\t", $LOCUS, $locus_tag, $annot{$locus_tag}->{'strand'}, $start, $SNP, $NA_position_in_feature + 1, $AA_position_in_feature, $position_in_codon, $codon->seq(), $AA->seq(), $whole_seq{$LOCUS}->subseq($SNP, $SNP) . "\n"; } } }