custom_scripts
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
custom_scripts [2023/06/22 17:12] – removed - external edit (Unknown date) 127.0.0.1 | custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== my_rename1.pl ====== | ||
+ | # | ||
+ | # | ||
+ | open ALL, $ARGV[0] or die " | ||
+ | while (< | ||
+ | chomp; | ||
+ | next unless / | ||
+ | my @data = split /\t/, $_; | ||
+ | $data[8] =~ s/ | ||
+ | my $strain = $data[8]; | ||
+ | $data[7] =~ s/ $strain$//; # remove redundant strain name | ||
+ | my $name = $data[7] . ' ' . $data[8]; | ||
+ | $name .= $data[15] if $name =~ /^\S+ \S+ $/; # add asm_name if required | ||
+ | $name =~ s/:/-/g; | ||
+ | $name =~ s/\s+/_/g; | ||
+ | $name =~ s/; | ||
+ | $name =~ s/\//_/; | ||
+ | $name =~ s/_$//; | ||
+ | my @temp = split /\//, $data[19]; | ||
+ | $key = pop @temp; | ||
+ | $key2name{$key} = $name; | ||
+ | print $key, " | ||
+ | } | ||
+ | ====== my_rename2.pl ====== | ||
+ | |||
+ | # | ||
+ | # | ||
+ | # $ARGV[0] : id2name file | ||
+ | # GCF_000015065.1_ASM1506v1 | ||
+ | # GCF_000092165.1_ASM9216v1 | ||
+ | # ... | ||
+ | # $ARGV[1] : fna file | ||
+ | # GCF_000015065.1_ASM1506v1_genomic.fna | ||
+ | # | ||
+ | | ||
+ | open ALL, $ARGV[0] or die " | ||
+ | while (< | ||
+ | chomp; | ||
+ | my @data = split /\t/, $_; | ||
+ | $key2name{$data[0]} = $data[1]; | ||
+ | } | ||
+ | | ||
+ | my @temp = split /_/, $ARGV[1]; | ||
+ | my $suffix = pop @temp; | ||
+ | my $accession = $temp[0] . ' | ||
+ | $suffix =~ / | ||
+ | $end = $1; | ||
+ | $key = join ' | ||
+ | | ||
+ | #print $key, " ", $key2name{$key}, | ||
+ | $file = $key2name{$key} . ' | ||
+ | print " | ||
+ | system(" | ||
+ | |||
+ | ====== safe_download.sh ====== | ||
+ | # | ||
+ | | ||
+ | # argument (FtpPath_RefSeq) example | ||
+ | # ftp:// | ||
+ | # | ||
+ | # How to run: | ||
+ | # while read path; do bash ./ | ||
+ | | ||
+ | TARGET_DIR=download | ||
+ | | ||
+ | # Modify ' | ||
+ | DOWNLOAD_PATH=$(echo $1 | sed -r ' | ||
+ | FILE=${DOWNLOAD_PATH## | ||
+ | MD5SUM_FILE=${1}/ | ||
+ | | ||
+ | wget ${MD5SUM_FILE} | ||
+ | grep ${FILE} md5checksums.txt > $$checksum.txt | ||
+ | wget ${DOWNLOAD_PATH} | ||
+ | | ||
+ | md5sum -c $$checksum.txt | ||
+ | if [ $? -eq 0 ]; then | ||
+ | echo ${FILE} download OK! Moving to ${TARGET_DIR}... | ||
+ | mv ${FILE} $TARGET_DIR | ||
+ | else | ||
+ | echo ${FILE} download FAIL! | ||
+ | echo ${DOWNLOAD_PATH} >> failed_download.txt | ||
+ | fi | ||
+ | | ||
+ | rm md5checksums.txt* $$checksum.txt | ||
+ | |||
+ | ====== gbkInfo.pl ====== | ||
+ | |||
+ | # | ||
+ | # | ||
+ | # Written by Haeyoung Jeong | ||
+ | # | ||
+ | | ||
+ | use Bio::SeqIO; | ||
+ | use File:: | ||
+ | | ||
+ | # latest update: 2022-05-06 | ||
+ | | ||
+ | $scriptname = basename($0); | ||
+ | $modtime = (stat($scriptname))[9]; | ||
+ | $timestamp = localtime($modtime); | ||
+ | | ||
+ | if ($#ARGV == -1) { | ||
+ | print STDERR "Usage : $scriptname <GenBank file> [-seq]\n"; | ||
+ | print STDERR " | ||
+ | exit; | ||
+ | } | ||
+ | | ||
+ | # debug mode is for additional analysis using the ' | ||
+ | $debug = 0; | ||
+ | if ($debug) { | ||
+ | print STDERR " | ||
+ | } | ||
+ | | ||
+ | $seqIn = Bio:: | ||
+ | $infoFile = basename($ARGV[0]) . ' | ||
+ | if ($ARGV[1] eq ' | ||
+ | print STDERR " | ||
+ | } | ||
+ | $seqOut = Bio:: | ||
+ | | ||
+ | $LOCUS_num = $num = $num2 = 0; | ||
+ | | ||
+ | # Can handle multiple sequence objects in a single GenBank file | ||
+ | while (my $seqObj = $seqIn-> | ||
+ | my $LOCUS = $seqObj-> | ||
+ | my $VERSION = $seqObj-> | ||
+ | print STDERR " | ||
+ | $LOCUS_num++; | ||
+ | $whole_seq{$LOCUS} = $seqObj; | ||
+ | @features = $seqObj-> | ||
+ | foreach my $feat ( @features ) { | ||
+ | # types of primary_tag: | ||
+ | if ($feat-> | ||
+ | print STDERR " | ||
+ | } | ||
+ | | ||
+ | # For each gene feature, check if it is pseudo or not. | ||
+ | # Extract function info if possible. | ||
+ | # Values will be stored in %annot hash. | ||
+ | | ||
+ | if ($feat-> | ||
+ | $num++; | ||
+ | # $annot{$locus_tag}-> | ||
+ | $locus_tag = eval {($feat-> | ||
+ | $geneFeaturePresence{$locus_tag} = ""; | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $seqref{$locus_tag} = $feat-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | push @pseudo, $locus_tag; | ||
+ | } | ||
+ | # $feat-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } else { | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # For each CDS feature, product/ | ||
+ | # There can be multiple EC numbers in one CDS feature. | ||
+ | # | ||
+ | | ||
+ | # if ($feat-> | ||
+ | if ($feat-> | ||
+ | $num2++ unless $feat-> | ||
+ | my $locus_tag = eval {($feat-> | ||
+ | | ||
+ | # This is for gene feature-less GenBank file produced by Prokka. | ||
+ | # If possible, use subroutine. | ||
+ | # | ||
+ | unless (exists $geneFeaturePresence{$locus_tag}) { | ||
+ | $annot{$locus_tag}-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } else { | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # If there are multiple " | ||
+ | # process the first one only. Remaining CDS feature(s) has the same information. | ||
+ | # | ||
+ | next if exists $seen{$locus_tag}; | ||
+ | $seen{$locus_tag} = ''; | ||
+ | | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | | ||
+ | # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) | ||
+ | # | ||
+ | if ($feat-> | ||
+ | my @tag = $feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # Extracting gene sequences if not pseudo. | ||
+ | # If you want to process specified genes only, then use $locus_tag variable. | ||
+ | # Sequence features will not be printed if $debug is nonzero. | ||
+ | # | ||
+ | if ($feat-> | ||
+ | my $NTsequence = $feat-> | ||
+ | my $AAsequence = eval {($feat-> | ||
+ | my $product = $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | # You can add any other information to ' | ||
+ | # Uncomment the following five lines if you want print seq to STDOUT and | ||
+ | # specific -seq to be ether $NTsequence or $AAsequence. | ||
+ | # my $seqObj = Bio:: | ||
+ | # | ||
+ | # | ||
+ | # ); | ||
+ | # $seqOut-> | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | | ||
+ | $argv0 = basename($ARGV[0]); | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | | ||
+ | # Now print the entire information! | ||
+ | # sort required! | ||
+ | | ||
+ | @final = (); | ||
+ | open INFO, "> | ||
+ | print STDERR " | ||
+ | | ||
+ | print INFO join " | ||
+ | ' | ||
+ | for my $locus_tag (sort keys %annot) { | ||
+ | my $temp = join " | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | if ($ARGV[1] eq ' | ||
+ | $temp .= " | ||
+ | } | ||
+ | $temp .= " | ||
+ | push @final, $temp; | ||
+ | } | ||
+ | | ||
+ | @sorted_final = map { $_->[0] } | ||
+ | sort { | ||
+ | | ||
+ | || | ||
+ | | ||
+ | } | ||
+ | map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start | ||
+ | | ||
+ | print INFO @sorted_final; | ||
+ | |||
+ | if ($debug) { | ||
+ | # process using < | ||
+ | # data.txt example (tab-delimited files; three columns) | ||
+ | # seqID locus_tag SNP_position | ||
+ | # NC_009698 CLC_0737 758733 | ||
+ | # NC_009698 CLC_0847 867070 | ||
+ | # Sorting data.txt using the third column as a key before running this script is desirable | ||
+ | open DATA, " | ||
+ | print join " | ||
+ | ' | ||
+ | ' | ||
+ | while (< | ||
+ | next if /^#/; | ||
+ | chomp; | ||
+ | | ||
+ | my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_; | ||
+ | if ($LOCUS ne $annot{$locus_tag}-> | ||
+ | print STDERR " | ||
+ | exit; | ||
+ | } | ||
+ | | ||
+ | ($start, $end, $strand) = ($annot{$locus_tag}-> | ||
+ | if ($strand eq ' | ||
+ | | ||
+ | } else { | ||
+ | | ||
+ | } | ||
+ | if (exists $annot{$locus_tag}-> | ||
+ | print join " | ||
+ | | ||
+ | } else { | ||
+ | $AA_position_in_feature = int($NA_position_in_feature / 3) + 1; | ||
+ | $position_in_codon = ($NA_position_in_feature + 1) % 3; | ||
+ | $position_in_codon = 3 if $position_in_codon == 0; | ||
+ | $codon = $seqref{$locus_tag}-> | ||
+ | $AA = $codon-> | ||
+ | print join " | ||
+ | | ||
+ | | ||
+ | } | ||
+ | } | ||
+ | } |