bioinfo:gbkinfo_script
gbkInfo script
GenBank flat file을 열어서 locus tag, product 등 기본 정보를 tab-delimited text file로 출력하고, CDS 정보는 표준출력으로 내보내는 스크립트이다. tr 옵션을 주면 아미노산 서열로 번역이 되어 출력된다. 용도에 따라 여러 가지 형태로 변형하여 사용했었지만, 아래에 소개한 gbkInfo_working.pl은 가장 포괄적인 기능을 갖도록 작성하였다.
#!/usr/bin/perl # # Usage - gbkInfo_working.pl GenBank_file [tr] # use Bio::SeqIO; $seqIn = Bio::SeqIO->new(-file => $ARGV[0], -format=>'genbank'); $infoFile = $ARGV[0] . '.txt'; $seqOut = Bio::SeqIO->new(-fh=> \*STDOUT, -format=>'fasta'); # Can handle multiple sequence objects in a single GenBank file while (my $seqObj = $seqIn->next_seq()) { @features = $seqObj->get_SeqFeatures(); foreach my $feat ( @features ) { # types of primary_tag: source, gene, CDS, rRNA, tRNA if ($feat->primary_tag eq 'source') { print STDERR "[STDERR] Organism: ", $feat->get_tag_values(organism), "\n" if $feat->has_tag(organism); } # For each gene feature, check if it is pseudo or not. # Extract function info if possible. # Values will be stored in %annot hash. if ($feat->primary_tag eq 'gene') { $num++; $locus = eval {($feat->get_tag_values(locus_tag))[0]}; $annot{$locus}->{'function'} = eval {($feat->get_tag_values(function))[0]} if $feat->has_tag(function); $annot{$locus}->{'old_locus_tag'} = eval {($feat->get_tag_values(old_locus_tag))[0]} if $feat->has_tag(old_locus_tag); if ($feat->has_tag(pseudo)) { $annot{$locus}->{'isPseudo'} = 'pseudo'; push @pseudo, $locus; } if ($feat->strand == '1') { $annot{$locus}->{'strand'} = '+'; $annot{$locus}->{'stop'} = $feat->end; } else { $annot{$locus}->{'strand'} = '-'; $annot{$locus}->{'stop'} = $feat->start; } # start < end $annot{$locus}->{'start'} = $feat->start; $annot{$locus}->{'end'} = $feat->end; } # For each CDS feature, product/gene/EC_number will be extracted. # There can be multiple EC numbers in one CDS feature. # if ($feat->primary_tag eq 'CDS') { $num2++ unless $feat->has_tag(pseudo); my $locus = eval {($feat->get_tag_values(locus_tag))[0]}; # If there are multiple "pseudo" CDS features (fragmentary) in one given locus, # process the first one only. Remaining CDS feature(s) has the same information. # next if exists $seen{$locus}; $seen{$locus} = ''; $annot{$locus}->{'product'} = eval {($feat->get_tag_values(product))[0]} if $feat->has_tag(product); $annot{$locus}->{'gene'} = eval {($feat->get_tag_values(gene))[0]} if $feat->has_tag(gene); $annot{$locus}->{'protein_id'} = eval {($feat->get_tag_values(protein_id))[0]} if $feat->has_tag(protein_id); # If a CDS has several EC_numbers, join them into one (ex: 3.2.2.23; 4.2.99.18) # if ($feat->has_tag(EC_number)) { my @tag = $feat->get_tag_values(EC_number); $annot{$locus}->{EC_number} = join "; ", @tag; } # Extracting gene sequences (not pseudo). # If you want to process specified genes only, then use $locus variable. # if ($feat->has_tag(translation)) { my $NTsequence = $feat->seq->seq(); my $AAsequence = eval {($feat->get_tag_values(translation))[0]}; my $sequence = $NTsequence; $sequence = $AAsequence if $ARGV[1] eq 'tr'; my $product = $annot{$locus}->{'product'}; # You can add any other information to 'desc'. my $seqObj = Bio::Seq->new(-display_id => $locus, -desc => $product, -seq => $sequence ); $seqOut->write_seq($seqObj); } } } } print STDERR "[STDERR] $ARGV[0] is $num gene features\n"; print STDERR "[STDERR] $ARGV[0] is $num2 active CDS features (not marked as pseudo)\n"; # Now print the entire information! # open INFO, ">$infoFile" or die "Can't open $infoFile for writing!\n"; print STDERR "[STDERR] Feature information is being written to $infoFile (pre-existing file will be overwritten!)\n"; print INFO join "\t", '#source', 'feature', 'locus tag', 'old locus tag', 'isPseudo?', 'start', 'end', 'straind', 'stop', 'gene', 'product', 'protein id', 'EC number', 'function' . "\n"; for my $locus (keys %annot) { print INFO join "\t", $ARGV[0], 'protein-coding gene', $locus, $annot{$locus}->{'old_locus_tag'}, $annot{$locus}->{'isPseudo'}, $annot{$locus}->{'start'}, $annot{$locus}->{'end'}, $annot{$locus}->{'strand'}, $annot{$locus}->{'stop'}, $annot{$locus}->{'gene'}, $annot{$locus}->{'product'}, $annot{$locus}->{'protein_id'}, $annot{$locus}->{'EC_number'}, $annot{$locus}->{'function'} . "\n"; }
bioinfo/gbkinfo_script.txt · Last modified: 2021/03/17 13:09 by 127.0.0.1