custom_scripts
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| custom_scripts [2023/06/22 17:12] – removed - external edit (Unknown date) 127.0.0.1 | custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| + | ====== my_rename1.pl ====== | ||
| + | # | ||
| + | # | ||
| + | open ALL, $ARGV[0] or die " | ||
| + | while (< | ||
| + | chomp; | ||
| + | next unless / | ||
| + | my @data = split /\t/, $_; | ||
| + | $data[8] =~ s/ | ||
| + | my $strain = $data[8]; | ||
| + | $data[7] =~ s/ $strain$//; # remove redundant strain name | ||
| + | my $name = $data[7] . ' ' . $data[8]; | ||
| + | $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required | ||
| + | $name =~ s/:/-/g; | ||
| + | $name =~ s/\s+/_/g; | ||
| + | $name =~ s/; | ||
| + | $name =~ s/\//_/; | ||
| + | $name =~ s/_$//; | ||
| + | my @temp = split /\//, $data[19]; | ||
| + | $key = pop @temp; | ||
| + | $key2name{$key} = $name; | ||
| + | print $key, " | ||
| + | } | ||
| + | ====== my_rename2.pl ====== | ||
| + | |||
| + | # | ||
| + | # | ||
| + | # $ARGV[0] : id2name file | ||
| + | # GCF_000015065.1_ASM1506v1 | ||
| + | # GCF_000092165.1_ASM9216v1 | ||
| + | # ... | ||
| + | # $ARGV[1] : fna file | ||
| + | # GCF_000015065.1_ASM1506v1_genomic.fna | ||
| + | # | ||
| + | | ||
| + | open ALL, $ARGV[0] or die " | ||
| + | while (< | ||
| + | chomp; | ||
| + | my @data = split /\t/, $_; | ||
| + | $key2name{$data[0]} = $data[1]; | ||
| + | } | ||
| + | | ||
| + | my @temp = split /_/, $ARGV[1]; | ||
| + | my $suffix = pop @temp; | ||
| + | my $accession = $temp[0] . ' | ||
| + | $suffix =~ / | ||
| + | $end = $1; | ||
| + | $key = join ' | ||
| + | | ||
| + | #print $key, " ", $key2name{$key}, | ||
| + | $file = $key2name{$key} . ' | ||
| + | print " | ||
| + | system(" | ||
| + | |||
| + | ====== safe_download.sh ====== | ||
| + | # | ||
| + | | ||
| + | # argument (FtpPath_RefSeq) example | ||
| + | # ftp:// | ||
| + | # | ||
| + | # How to run: | ||
| + | # while read path; do bash ./ | ||
| + | | ||
| + | TARGET_DIR=download | ||
| + | | ||
| + | # Modify ' | ||
| + | DOWNLOAD_PATH=$(echo $1 | sed -r ' | ||
| + | FILE=${DOWNLOAD_PATH## | ||
| + | MD5SUM_FILE=${1}/ | ||
| + | | ||
| + | wget ${MD5SUM_FILE} | ||
| + | grep ${FILE} md5checksums.txt > $$checksum.txt | ||
| + | wget ${DOWNLOAD_PATH} | ||
| + | | ||
| + | md5sum -c $$checksum.txt | ||
| + | if [ $? -eq 0 ]; then | ||
| + | echo ${FILE} download OK! Moving to ${TARGET_DIR}... | ||
| + | mv ${FILE} $TARGET_DIR | ||
| + | else | ||
| + | echo ${FILE} download FAIL! | ||
| + | echo ${DOWNLOAD_PATH} >> failed_download.txt | ||
| + | fi | ||
| + | | ||
| + | rm md5checksums.txt* $$checksum.txt | ||
| + | |||
| + | ====== gbkInfo.pl ====== | ||
| + | |||
| + | # | ||
| + | # | ||
| + | # Written by Haeyoung Jeong | ||
| + | # | ||
| + | | ||
| + | use Bio::SeqIO; | ||
| + | use File:: | ||
| + | | ||
| + | # latest update: 2022-05-06 | ||
| + | | ||
| + | $scriptname = basename($0); | ||
| + | $modtime = (stat($scriptname))[9]; | ||
| + | $timestamp = localtime($modtime); | ||
| + | | ||
| + | if ($#ARGV == -1) { | ||
| + | print STDERR "Usage : $scriptname <GenBank file> [-seq]\n"; | ||
| + | print STDERR " | ||
| + | exit; | ||
| + | } | ||
| + | | ||
| + | # debug mode is for additional analysis using the ' | ||
| + | $debug = 0; | ||
| + | if ($debug) { | ||
| + | print STDERR " | ||
| + | } | ||
| + | | ||
| + | $seqIn = Bio:: | ||
| + | $infoFile = basename($ARGV[0]) . ' | ||
| + | if ($ARGV[1] eq ' | ||
| + | print STDERR " | ||
| + | } | ||
| + | $seqOut = Bio:: | ||
| + | | ||
| + | $LOCUS_num = $num = $num2 = 0; | ||
| + | | ||
| + | # Can handle multiple sequence objects in a single GenBank file | ||
| + | while (my $seqObj = $seqIn-> | ||
| + | my $LOCUS = $seqObj-> | ||
| + | my $VERSION = $seqObj-> | ||
| + | print STDERR " | ||
| + | $LOCUS_num++; | ||
| + | $whole_seq{$LOCUS} = $seqObj; | ||
| + | @features = $seqObj-> | ||
| + | foreach my $feat ( @features ) { | ||
| + | # types of primary_tag: | ||
| + | if ($feat-> | ||
| + | print STDERR " | ||
| + | } | ||
| + | | ||
| + | # For each gene feature, check if it is pseudo or not. | ||
| + | # Extract function info if possible. | ||
| + | # Values will be stored in %annot hash. | ||
| + | | ||
| + | if ($feat-> | ||
| + | $num++; | ||
| + | # $annot{$locus_tag}-> | ||
| + | $locus_tag = eval {($feat-> | ||
| + | $geneFeaturePresence{$locus_tag} = ""; | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $seqref{$locus_tag} = $feat-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | push @pseudo, $locus_tag; | ||
| + | } | ||
| + | # $feat-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } else { | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # For each CDS feature, product/ | ||
| + | # There can be multiple EC numbers in one CDS feature. | ||
| + | # | ||
| + | | ||
| + | # if ($feat-> | ||
| + | if ($feat-> | ||
| + | $num2++ unless $feat-> | ||
| + | my $locus_tag = eval {($feat-> | ||
| + | | ||
| + | # This is for gene feature-less GenBank file produced by Prokka. | ||
| + | # If possible, use subroutine. | ||
| + | # | ||
| + | unless (exists $geneFeaturePresence{$locus_tag}) { | ||
| + | $annot{$locus_tag}-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } else { | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # If there are multiple " | ||
| + | # process the first one only. Remaining CDS feature(s) has the same information. | ||
| + | # | ||
| + | next if exists $seen{$locus_tag}; | ||
| + | $seen{$locus_tag} = ''; | ||
| + | | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | | ||
| + | # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) | ||
| + | # | ||
| + | if ($feat-> | ||
| + | my @tag = $feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # Extracting gene sequences if not pseudo. | ||
| + | # If you want to process specified genes only, then use $locus_tag variable. | ||
| + | # Sequence features will not be printed if $debug is nonzero. | ||
| + | # | ||
| + | if ($feat-> | ||
| + | my $NTsequence = $feat-> | ||
| + | my $AAsequence = eval {($feat-> | ||
| + | my $product = $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | # You can add any other information to ' | ||
| + | # Uncomment the following five lines if you want print seq to STDOUT and | ||
| + | # specific -seq to be ether $NTsequence or $AAsequence. | ||
| + | # my $seqObj = Bio:: | ||
| + | # | ||
| + | # | ||
| + | # ); | ||
| + | # $seqOut-> | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | | ||
| + | $argv0 = basename($ARGV[0]); | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | | ||
| + | # Now print the entire information! | ||
| + | # sort required! | ||
| + | | ||
| + | @final = (); | ||
| + | open INFO, "> | ||
| + | print STDERR " | ||
| + | | ||
| + | print INFO join " | ||
| + | ' | ||
| + | for my $locus_tag (sort keys %annot) { | ||
| + | my $temp = join " | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | if ($ARGV[1] eq ' | ||
| + | $temp .= " | ||
| + | } | ||
| + | $temp .= " | ||
| + | push @final, $temp; | ||
| + | } | ||
| + | | ||
| + | @sorted_final = map { $_->[0] } | ||
| + | sort { | ||
| + | | ||
| + | || | ||
| + | | ||
| + | } | ||
| + | map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start | ||
| + | | ||
| + | print INFO @sorted_final; | ||
| + | |||
| + | if ($debug) { | ||
| + | # process using < | ||
| + | # data.txt example (tab-delimited files; three columns) | ||
| + | # seqID locus_tag SNP_position | ||
| + | # NC_009698 CLC_0737 758733 | ||
| + | # NC_009698 CLC_0847 867070 | ||
| + | # Sorting data.txt using the third column as a key before running this script is desirable | ||
| + | open DATA, " | ||
| + | print join " | ||
| + | ' | ||
| + | ' | ||
| + | while (< | ||
| + | next if /^#/; | ||
| + | chomp; | ||
| + | | ||
| + | my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_; | ||
| + | if ($LOCUS ne $annot{$locus_tag}-> | ||
| + | print STDERR " | ||
| + | exit; | ||
| + | } | ||
| + | | ||
| + | ($start, $end, $strand) = ($annot{$locus_tag}-> | ||
| + | if ($strand eq ' | ||
| + | | ||
| + | } else { | ||
| + | | ||
| + | } | ||
| + | if (exists $annot{$locus_tag}-> | ||
| + | print join " | ||
| + | | ||
| + | } else { | ||
| + | $AA_position_in_feature = int($NA_position_in_feature / 3) + 1; | ||
| + | $position_in_codon = ($NA_position_in_feature + 1) % 3; | ||
| + | $position_in_codon = 3 if $position_in_codon == 0; | ||
| + | $codon = $seqref{$locus_tag}-> | ||
| + | $AA = $codon-> | ||
| + | print join " | ||
| + | | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | } | ||
