custom_scripts
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
custom_scripts [2023/06/22 17:12] – ↷ Page name changed from custom_perl_scripts to custom_scripts hyjeong | custom_scripts [2023/06/29 12:13] (current) – [gbkInfo.pl] hyjeong | ||
---|---|---|---|
Line 84: | Line 84: | ||
| | ||
rm md5checksums.txt* $$checksum.txt | rm md5checksums.txt* $$checksum.txt | ||
+ | |||
+ | ====== gbkInfo.pl ====== | ||
+ | |||
+ | # | ||
+ | # | ||
+ | # Written by Haeyoung Jeong | ||
+ | # | ||
| | ||
+ | use Bio::SeqIO; | ||
+ | use File:: | ||
+ | | ||
+ | # latest update: 2022-05-06 | ||
+ | | ||
+ | $scriptname = basename($0); | ||
+ | $modtime = (stat($scriptname))[9]; | ||
+ | $timestamp = localtime($modtime); | ||
+ | | ||
+ | if ($#ARGV == -1) { | ||
+ | print STDERR "Usage : $scriptname <GenBank file> [-seq]\n"; | ||
+ | print STDERR " | ||
+ | exit; | ||
+ | } | ||
+ | | ||
+ | # debug mode is for additional analysis using the ' | ||
+ | $debug = 0; | ||
+ | if ($debug) { | ||
+ | print STDERR " | ||
+ | } | ||
+ | | ||
+ | $seqIn = Bio:: | ||
+ | $infoFile = basename($ARGV[0]) . ' | ||
+ | if ($ARGV[1] eq ' | ||
+ | print STDERR " | ||
+ | } | ||
+ | $seqOut = Bio:: | ||
+ | | ||
+ | $LOCUS_num = $num = $num2 = 0; | ||
+ | | ||
+ | # Can handle multiple sequence objects in a single GenBank file | ||
+ | while (my $seqObj = $seqIn-> | ||
+ | my $LOCUS = $seqObj-> | ||
+ | my $VERSION = $seqObj-> | ||
+ | print STDERR " | ||
+ | $LOCUS_num++; | ||
+ | $whole_seq{$LOCUS} = $seqObj; | ||
+ | @features = $seqObj-> | ||
+ | foreach my $feat ( @features ) { | ||
+ | # types of primary_tag: | ||
+ | if ($feat-> | ||
+ | print STDERR " | ||
+ | } | ||
+ | | ||
+ | # For each gene feature, check if it is pseudo or not. | ||
+ | # Extract function info if possible. | ||
+ | # Values will be stored in %annot hash. | ||
+ | | ||
+ | if ($feat-> | ||
+ | $num++; | ||
+ | # $annot{$locus_tag}-> | ||
+ | $locus_tag = eval {($feat-> | ||
+ | $geneFeaturePresence{$locus_tag} = ""; | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $seqref{$locus_tag} = $feat-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | push @pseudo, $locus_tag; | ||
+ | } | ||
+ | # $feat-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } else { | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # For each CDS feature, product/ | ||
+ | # There can be multiple EC numbers in one CDS feature. | ||
+ | # | ||
+ | | ||
+ | # if ($feat-> | ||
+ | if ($feat-> | ||
+ | $num2++ unless $feat-> | ||
+ | my $locus_tag = eval {($feat-> | ||
+ | | ||
+ | # This is for gene feature-less GenBank file produced by Prokka. | ||
+ | # If possible, use subroutine. | ||
+ | # | ||
+ | unless (exists $geneFeaturePresence{$locus_tag}) { | ||
+ | $annot{$locus_tag}-> | ||
+ | if ($feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } else { | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # If there are multiple " | ||
+ | # process the first one only. Remaining CDS feature(s) has the same information. | ||
+ | # | ||
+ | next if exists $seen{$locus_tag}; | ||
+ | $seen{$locus_tag} = ''; | ||
+ | | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | | ||
+ | # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) | ||
+ | # | ||
+ | if ($feat-> | ||
+ | my @tag = $feat-> | ||
+ | $annot{$locus_tag}-> | ||
+ | } | ||
+ | | ||
+ | # Extracting gene sequences if not pseudo. | ||
+ | # If you want to process specified genes only, then use $locus_tag variable. | ||
+ | # Sequence features will not be printed if $debug is nonzero. | ||
+ | # | ||
+ | if ($feat-> | ||
+ | my $NTsequence = $feat-> | ||
+ | my $AAsequence = eval {($feat-> | ||
+ | my $product = $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | $annot{$locus_tag}-> | ||
+ | # You can add any other information to ' | ||
+ | # Uncomment the following five lines if you want print seq to STDOUT and | ||
+ | # specific -seq to be ether $NTsequence or $AAsequence. | ||
+ | # my $seqObj = Bio:: | ||
+ | # | ||
+ | # | ||
+ | # ); | ||
+ | # $seqOut-> | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | | ||
+ | $argv0 = basename($ARGV[0]); | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | print STDERR " | ||
+ | | ||
+ | # Now print the entire information! | ||
+ | # sort required! | ||
+ | | ||
+ | @final = (); | ||
+ | open INFO, "> | ||
+ | print STDERR " | ||
+ | | ||
+ | print INFO join " | ||
+ | ' | ||
+ | for my $locus_tag (sort keys %annot) { | ||
+ | my $temp = join " | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | if ($ARGV[1] eq ' | ||
+ | $temp .= " | ||
+ | } | ||
+ | $temp .= " | ||
+ | push @final, $temp; | ||
+ | } | ||
+ | | ||
+ | @sorted_final = map { $_->[0] } | ||
+ | sort { | ||
+ | | ||
+ | || | ||
+ | | ||
+ | } | ||
+ | map { [ $_, (split /\t/)[1,6] ] } @final; # [1] locus, [6] start | ||
+ | | ||
+ | print INFO @sorted_final; | ||
+ | |||
+ | if ($debug) { | ||
+ | # process using < | ||
+ | # data.txt example (tab-delimited files; three columns) | ||
+ | # seqID locus_tag SNP_position | ||
+ | # NC_009698 CLC_0737 758733 | ||
+ | # NC_009698 CLC_0847 867070 | ||
+ | # Sorting data.txt using the third column as a key before running this script is desirable | ||
+ | open DATA, " | ||
+ | print join " | ||
+ | ' | ||
+ | ' | ||
+ | while (< | ||
+ | next if /^#/; | ||
+ | chomp; | ||
+ | | ||
+ | my ($LOCUS, $locus_tag, $SNP) = split /\t/, $_; | ||
+ | if ($LOCUS ne $annot{$locus_tag}-> | ||
+ | print STDERR " | ||
+ | exit; | ||
+ | } | ||
+ | | ||
+ | ($start, $end, $strand) = ($annot{$locus_tag}-> | ||
+ | if ($strand eq ' | ||
+ | | ||
+ | } else { | ||
+ | | ||
+ | } | ||
+ | if (exists $annot{$locus_tag}-> | ||
+ | print join " | ||
+ | | ||
+ | } else { | ||
+ | $AA_position_in_feature = int($NA_position_in_feature / 3) + 1; | ||
+ | $position_in_codon = ($NA_position_in_feature + 1) % 3; | ||
+ | $position_in_codon = 3 if $position_in_codon == 0; | ||
+ | $codon = $seqref{$locus_tag}-> | ||
+ | $AA = $codon-> | ||
+ | print join " | ||
+ | | ||
+ | | ||
+ | } | ||
+ | } | ||
+ | } |
custom_scripts.1687421549.txt.gz · Last modified: 2023/06/22 17:12 by hyjeong