#!/usr/local/bin/perl # # Dr. Xiaodong Bai # It may be freely distributed under GNU General Public License. # This script will parse a NCBI blastx output file and output the top N hits of each blast search result. # For each hit, the following results are reported: # accesion number, length, description, E value, bit score, query frame, query start, query end, hit start, hit end, positives, and identical # The results are tab-deliminated and ready for import into a spreadsheet program for browsing and further analysis. # use strict; use warnings; use Bio::SearchIO; local $| = 1; # Usage information die "Usage: $0 \n", if (@ARGV != 2); my ($infile,$outfile) = @ARGV; print "Parsing the BLAST result ..."; my $in = Bio::SearchIO->new(-format => 'blast', -file => $infile); open (OUT,">$outfile") or die "Cannot open $outfile: $!"; # print the header info for tab-deliminated columns print OUT "query_name\tquery_length\taccession_number\tlength\tdescription\tE value\tbit score\tframe\tquery_start\t"; print OUT "query_end\thit_start\thit_end\tpositives\tidentical\n"; # extraction of information for each result recursively while ( my $result = $in->next_result ) { # if there is no hit, go to the next result next if ( $result->num_hits == 0 ); # get the top hit my $hit = $result->next_hit; # get the E value of the hit my $e = $hit->significance; # E VALUE CRITERION GOES HERE!!! # the length of the query sequence my $qlength = $result->query_length; # get the top HSP my $hsp = $hit->next_hsp; # get the start and the end of the query sequence in the alignment my $qstart = $hsp->start('query'); my $qend = $hsp->end('query'); # FRACTION CRITERION GOES HERE!!! # get the start and the end of the hit sequence in the alignment my $hstart = $hsp->start('hit'); my $hend = $hsp->end('hit'); # get the similarity value my $conserved = sprintf("%.1f",($hsp->frac_conserved * 100)); # get the identity value my $identical = sprintf("%.1f",($hsp->frac_identical * 100)); # SIMILARITY CRITERION GOES HERE!!! # the name of the query sequence my $qname = $result->query_name; # get the accession numbers of the hits my $acc = $hit->accession; # get the lengths of the hit sequences my $hlength = $hit->length; # get the description of the hit sequences my $hdesc = $hit->description; #get the bit score of the hit my $bitscore = $hit->bits; # get the frame of the query sequence my $frame = $hsp->query->frame . "\t"; print OUT join("\t",$qname,$qlength,$acc,$hlength,$hdesc,$e,$bitscore,$frame,$qstart,$qend,$hstart,$hend,$conserved,$identical),"\n"; } close OUT; print " DONE!!!\n";