custom_scripts
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| custom_scripts [2023/06/29 10:00] – [safe_download.sh] hyjeong | custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong | ||
|---|---|---|---|
| Line 87: | Line 87: | ||
| ====== gbkInfo.pl ====== | ====== gbkInfo.pl ====== | ||
| + | # | ||
| + | # | ||
| + | # Written by Haeyoung Jeong | ||
| + | # | ||
| + | | ||
| + | use Bio::SeqIO; | ||
| + | use File:: | ||
| + | | ||
| + | # latest update: 2022-05-06 | ||
| + | | ||
| + | $scriptname = basename($0); | ||
| + | $modtime = (stat($scriptname))[9]; | ||
| + | $timestamp = localtime($modtime); | ||
| + | | ||
| + | if ($#ARGV == -1) { | ||
| + | print STDERR "Usage : $scriptname <GenBank file> [-seq]\n"; | ||
| + | print STDERR " | ||
| + | exit; | ||
| + | } | ||
| + | | ||
| + | # debug mode is for additional analysis using the ' | ||
| + | $debug = 0; | ||
| + | if ($debug) { | ||
| + | print STDERR " | ||
| + | } | ||
| + | | ||
| + | $seqIn = Bio:: | ||
| + | $infoFile = basename($ARGV[0]) . ' | ||
| + | if ($ARGV[1] eq ' | ||
| + | print STDERR " | ||
| + | } | ||
| + | $seqOut = Bio:: | ||
| + | | ||
| + | $LOCUS_num = $num = $num2 = 0; | ||
| + | | ||
| + | # Can handle multiple sequence objects in a single GenBank file | ||
| + | while (my $seqObj = $seqIn-> | ||
| + | my $LOCUS = $seqObj-> | ||
| + | my $VERSION = $seqObj-> | ||
| + | print STDERR " | ||
| + | $LOCUS_num++; | ||
| + | $whole_seq{$LOCUS} = $seqObj; | ||
| + | @features = $seqObj-> | ||
| + | foreach my $feat ( @features ) { | ||
| + | # types of primary_tag: | ||
| + | if ($feat-> | ||
| + | print STDERR " | ||
| + | } | ||
| + | | ||
| + | # For each gene feature, check if it is pseudo or not. | ||
| + | # Extract function info if possible. | ||
| + | # Values will be stored in %annot hash. | ||
| + | | ||
| + | if ($feat-> | ||
| + | $num++; | ||
| + | # $annot{$locus_tag}-> | ||
| + | $locus_tag = eval {($feat-> | ||
| + | $geneFeaturePresence{$locus_tag} = ""; | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $seqref{$locus_tag} = $feat-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | push @pseudo, $locus_tag; | ||
| + | } | ||
| + | # $feat-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } else { | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # For each CDS feature, product/ | ||
| + | # There can be multiple EC numbers in one CDS feature. | ||
| + | # | ||
| + | | ||
| + | # if ($feat-> | ||
| + | if ($feat-> | ||
| + | $num2++ unless $feat-> | ||
| + | my $locus_tag = eval {($feat-> | ||
| + | | ||
| + | # This is for gene feature-less GenBank file produced by Prokka. | ||
| + | # If possible, use subroutine. | ||
| + | # | ||
| + | unless (exists $geneFeaturePresence{$locus_tag}) { | ||
| + | $annot{$locus_tag}-> | ||
| + | if ($feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } else { | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # If there are multiple " | ||
| + | # process the first one only. Remaining CDS feature(s) has the same information. | ||
| + | # | ||
| + | next if exists $seen{$locus_tag}; | ||
| + | $seen{$locus_tag} = ''; | ||
| + | | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | | ||
| + | # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) | ||
| + | # | ||
| + | if ($feat-> | ||
| + | my @tag = $feat-> | ||
| + | $annot{$locus_tag}-> | ||
| + | } | ||
| + | | ||
| + | # Extracting gene sequences if not pseudo. | ||
| + | # If you want to process specified genes only, then use $locus_tag variable. | ||
| + | # Sequence features will not be printed if $debug is nonzero. | ||
| + | # | ||
| + | if ($feat-> | ||
| + | my $NTsequence = $feat-> | ||
| + | my $AAsequence = eval {($feat-> | ||
| + | my $product = $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | $annot{$locus_tag}-> | ||
| + | # You can add any other information to ' | ||
| + | # Uncomment the following five lines if you want print seq to STDOUT and | ||
| + | # specific -seq to be ether $NTsequence or $AAsequence. | ||
| + | # my $seqObj = Bio:: | ||
| + | # | ||
| + | # | ||
| + | # ); | ||
| + | # $seqOut-> | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | | ||
| + | $argv0 = basename($ARGV[0]); | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | print STDERR " | ||
| + | | ||
| + | # Now print the entire information! | ||
| + | # sort required! | ||
| + | | ||
| + | @final = (); | ||
| + | open INFO, "> | ||
| + | print STDERR " | ||
| + | | ||
| + | print INFO join " | ||
| + | ' | ||
| + | for my $locus_tag (sort keys %annot) { | ||
| + | my $temp = join " | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | | ||
| + | if ($ARGV[1] eq ' | ||
| + | $temp .= " | ||
| + | } | ||
| + | $temp .= " | ||
| + | push @final, $temp; | ||
| + | } | ||
| + | | ||
| + | @sorted_final = map { $_->[0] } | ||
| + | sort { | ||
| + | | ||
| + | || | ||
| + | | ||
| + | } | ||
| + | map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start | ||
| + | | ||
| + | print INFO @sorted_final; | ||
| + | |||
| + | if ($debug) { | ||
| + | # process using < | ||
| + | # data.txt example (tab-delimited files; three columns) | ||
| + | # seqID locus_tag SNP_position | ||
| + | # NC_009698 CLC_0737 758733 | ||
| + | # NC_009698 CLC_0847 867070 | ||
| + | # Sorting data.txt using the third column as a key before running this script is desirable | ||
| + | open DATA, " | ||
| + | print join " | ||
| + | ' | ||
| + | ' | ||
| + | while (< | ||
| + | next if /^#/; | ||
| + | chomp; | ||
| + | | ||
| + | my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_; | ||
| + | if ($LOCUS ne $annot{$locus_tag}-> | ||
| + | print STDERR " | ||
| + | exit; | ||
| + | } | ||
| + | | ||
| + | ($start, $end, $strand) = ($annot{$locus_tag}-> | ||
| + | if ($strand eq ' | ||
| + | | ||
| + | } else { | ||
| + | | ||
| + | } | ||
| + | if (exists $annot{$locus_tag}-> | ||
| + | print join " | ||
| + | | ||
| + | } else { | ||
| + | $AA_position_in_feature = int($NA_position_in_feature / 3) + 1; | ||
| + | $position_in_codon = ($NA_position_in_feature + 1) % 3; | ||
| + | $position_in_codon = 3 if $position_in_codon == 0; | ||
| + | $codon = $seqref{$locus_tag}-> | ||
| + | $AA = $codon-> | ||
| + | print join " | ||
| + | | ||
| + | | ||
| + | } | ||
| + | } | ||
| + | } | ||
custom_scripts.1688000438.txt.gz · Last modified: by hyjeong
