#!/usr/bin/perl -w # PPath@Cornell # Surya Saha # http://citrusgreening.org/pred_cutoff use strict; use warnings; use Getopt::Long; use POSIX; eval {require Bio::SearchIO::hmmer}; if ($@){print STDERR "Cannot find Bio::SearchIO::hmmer\n"; exit 1;} use Bio::SearchIO; if(!($^O=~ /linux/)){ print STDERR "Script has not been tested on non-linux operating systems. Exiting.."; exit 1;} =head1 NAME Pred_cutoff.pl - Add error estimates to HMMER/patser report (GFF) and prints out a new GFF file. =head1 SYNOPSIS % Pred_cutoff.pl --file seq.fna --copies copies --engine HMMER/patser --report report.gff =head1 DESCRIPTION This script reads in a HMMER (for HMM search) or patser (for PWM search) report file in GFF format. The appropriate search engine (HMMER or patser) is then run against artificial genomes created using Create_art_genomes.pl script. Error estimates are computed for each prediction and the predictions are reported in GFF format. =head2 NOTES Please make sure this script is run in the directory containing the genomes directory created by Create_art_genomes.pl. This script requires patser (http://ural.wustl.edu/patser-v3e.1.tar.gz) and HMMER 2.3.2 (ftp://selab.janelia.org/pub/software/hmmer/2.3.2/hmmer-2.3.2.tar.gz) to be installed and accessible. We do not recommend using HMMER 3.0 since it is not optimized for DNA/DNA comparisons. You need to have the BioPerl library installed and accessible (http://www.bioperl.org/wiki/Installing_BioPerl). This script has been tested on Linux. =head1 COMMAND-LINE OPTIONS Command-line options can be abbreviated to single-letter options, e.g. -f instead of --file. Some options are mandatory (see below). --file Fasta file whole genome DNA sequence (required) --copies Number of artificial genomes to create (required) --engine Tool used to create the report (HMMER or patser) (required) --model Model created from motifs. Hidden Markov Model (.hmm) for HMMER or Position Weight Matrix(.pwm) for patser --report File name of GFF file generated by Search_pred.pl on whole genome DNA sequence (required) --domE Domain E-value cutoff for hmmsearch on artificial genomes. Default is 0.3. All predictions with lower E-values will be recorded. --score Score cutoff for patser on artificial genomes. Default is 1. All predictions with higher scores will be recorded. --verbose Print progress messages =head1 AUTHOR Surya Saha, ss2489@cornell.edu =cut my ($i,$file,$copies,$engine,$report,$model,$verbose,$domE,$pscore); GetOptions ( 'file=s' => \$file, 'copies=s' => \$copies, 'engine=s' => \$engine, 'report=s' => \$report, 'model:s' => \$model, 'domE:f' => \$domE, 'score:i' => \$pscore, verbose => \$verbose) or (system('pod2text',$0), exit 1); # defaults and checks defined($file) or (system('pod2text',$0), exit 1); if (!(-e $file)){print STDERR "$file not found: $!\n"; exit 1;} defined($copies) or (system('pod2text',$0), exit 1); defined($engine) or (system('pod2text',$0), exit 1); if(($engine ne 'HMMER') && ($engine ne 'patser')){print STDERR "Incorrect engine name supplied. $engine should be HMMER or patser.\n"; exit 1;} if($verbose){print STDERR "Setting engine type to $engine\n";} defined($report) or (system('pod2text',$0), exit 1); if (!(-e $report)){print STDERR "$report not found: $!\n"; exit 1;} if($engine eq 'HMMER'){ $model ||= "$file.hmm";} elsif($engine eq 'patser'){ $model ||= "$file.pwm";} if (!(-e $model)){print STDERR "$model not found: $!\nPlease supply Hidden Markov Model (.hmm) for HMMER or Position Weight Matrix(.pwm) for patser on the command line\n"; exit 1;} if($verbose){print STDERR "Setting model name to $model\n";} if (!(-e 'genomes')){print STDERR "genomes directory containing artificial genomes not found: $!\n"; exit 1;} if($engine eq 'HMMER'){ defined($domE) or ($domE=0.3); if($verbose){print STDERR "Setting hmmsearch domain E value cutoff to $domE\n";}} elsif($engine eq 'patser'){ defined($pscore) or ($pscore=1); if($verbose){print STDERR "Setting patser score cutoff to $pscore\n";}} for $i (1..$copies){ if (!(-e 'genomes/'.$i.'.'.$file)){print STDERR "genomes/$i.$file not found: $!\n"; exit 1;} } # Supporting functions # get the complement sub comp{ my $DNA; $DNA=$_[0]; $DNA=~ s/\s*//g;# clean it $DNA=~ tr/ACGTacgt/TGCAtgca/; return $DNA; } # Prep work my ($rec,%counts,$ctr,$j,$k,@temp,$seq,$mlen,$highctr,$lowctr); $i=localtime(); if (-e 'out'){rename 'out',"out_Pred_cutoff_$i"; unlink glob "out/* out/.*";rmdir ("out");} mkdir ('out', 0755) or warn "Cannot make out directory: $!\n"; unless(open(IN,$file)){print "not able to open $file\n\n";exit 1;} $seq=''; while ($rec=){ if ($rec=~ /^>/){ next;} else{ chomp $rec; $seq=$seq.$rec;} } close(IN); $highctr=$lowctr=0; #run engine on artificial genomes if($engine eq 'patser'){ #create alphabet file if($verbose){print STDERR "Creating alphabet file for patser..\n";} $i= length $seq; $counts{'A'}=($seq =~ tr/A//); if($counts{'A'}==0){ $counts{'A'}=($seq =~ tr/a//);} $counts{'T'}=($seq =~ tr/T/X/); if($counts{'T'}==0){ $counts{'T'}=($seq =~ tr/t//);} $counts{'G'}=($seq =~ tr/G/Y/); if($counts{'G'}==0){ $counts{'G'}=($seq =~ tr/g//);} $counts{'C'}=($seq =~ tr/C/Z/); if($counts{'C'}==0){ $counts{'C'}=($seq =~ tr/c//);} if (-e "$file.alphabet"){ unlink "$file.alphabet" or warn "cannot delete old alphabet file: $!\n";} unless(open(OUT,">$file.alphabet")){print "not able to open $file.alphabet\n";exit 1;} $j=$counts{'A'}+$counts{'T'}; $j=sprintf("%.1f", $j/$i); print OUT "a:t $j\n"; $j=$counts{'G'}+$counts{'C'}; $j=sprintf("%.1f", $j/$i); print OUT "g:c $j\n"; close (OUT); %counts=(); #create patser compatible artificial genome files, run patser on them and get counts from reports for $i (1..$copies){ unless(open(IN,"genomes/$i.$file")){print "not able to open $i.$file\n\n";exit 1;} unless(open(OUT,">genomes/$i.$file.temp")){print "not able to open $i.$file.temp\n\n";exit 1;} while($rec=){ if($rec =~ /^>/){ print OUT "temp \\\n";} else {print OUT $rec;} } print OUT "\\"; close (OUT); close (IN); if($verbose){print STDERR "Running patser on artificial genome $i ..";} $k=system "patser-v3e -m $model -a $file.alphabet -f genomes/$i.$file.temp -c -ls $pscore > out/$i.$file.patser.out"; if($k!=0){ print STDERR "patser execution failed: $?\n"; exit 1;} unlink "genomes/$i.$file.temp"; if($verbose){print STDERR " getting results..";} unless(open(IN,"out/$i.$file.patser.out")){print "not able to open out/$i.$file.patser.out\n\n";exit 1;} while($rec=){ if($rec =~ /^ temp/){ @temp = split(' ',$rec); if (exists $counts{sprintf("%.2f", $temp[4])}){ $counts{sprintf("%.2f", $temp[4])}++;} else{ $counts{sprintf("%.2f", $temp[4])}=1;} @temp=(); #NOTE we will have some 0 counts } } close (IN); if($verbose){ print STDERR ".\n";} } for($j=0.00;$j<=30.00;$j+=0.01){#fix 0 counts $j=sprintf("%.2f", $j); if(!(exists $counts{$j})){ $counts{$j}=0;} } unlink glob 'out/*.out'; rmdir ('out'); unlink "$file.alphabet"; #make new gff file with predictions and error estimates if($verbose){print STDERR "\nCreating GFF file with predictions..\n\n";} unless(open(OUT,">err-est.$report")){print "not able to open err-est.$report\n";exit 1;} unless(open(PSGFF, $report)){print "not able to open $report\n";exit 1;} while($rec=){ if($rec =~ /^#/){ print OUT $rec; next;} else{ chomp $rec; @temp = split("\t",$rec); print OUT $rec; print OUT " Err_rate ",sprintf("%.2f",($counts{$temp[5]}/$copies)),";\n"; if(($counts{$temp[5]}/$copies) > 1){$highctr++;} else{$lowctr++;} @temp=(); } } close (PSGFF); close (OUT); } elsif($engine eq 'HMMER'){ #run hmmsearch on artificial genomes and get counts from reports my($result,$hit,$hsp); for $i (1..$copies){ if($verbose){print STDERR "Running hmmsearch on artificial genome $i ..";} #pos strand $k=system "hmmsearch --domE $domE $model genomes/$i.$file > out/$i.$file.hmmsearch.pos.out"; if($k!=0){ print STDERR "hmmsearch execution failed: $?\n"; exit 1;} #comp strand unless(open(OUT,">genomes/comp.$i.$file")){print "not able to open genomes/comp.$i.$file\n";exit 1;} print OUT ">comp_seq\n"; print OUT &comp($seq); close(OUT); $k=system "hmmsearch --domE $domE $model genomes/comp.$i.$file > out/$i.$file.hmmsearch.comp.out"; if($k!=0){ print STDERR "hmmsearch execution failed: $?\n"; exit 1;} unlink "genomes/comp.$i.$file"; if($verbose){print STDERR " getting results..";} #read in hmmsearch report for artificial genomes and increment %counts (pos and comp) $j=Bio::SearchIO->new(-format => 'hmmer',-file => "out/$i.$file.hmmsearch.pos.out"); while($result=$j->next_result()){# class of $result: Bio::Search::Result::HMMERResult # print STDERR "class of \$result: ",ref($result),"\n"; #print STDERR $result->query_name, " for HMM ", $result->hmm_name, "\n"; #RirA_Rodionov2006.fas for HMM RirA_NC_012985.hmm [RirA_Rodionov2006.fas] while ($hit = $result->next_hit()){# class of $hit: Bio::Search::Hit::HMMERHit while ($hsp = $hit->next_hsp()){# class of $hsp: Bio::Search::HSP::HMMERHSP if (exists $counts{$hsp->score()}){ $counts{$hsp->score()}++;} else{ $counts{$hsp->score()}=1;} } } } if($verbose){print STDERR '.';} $j=Bio::SearchIO->new(-format => 'hmmer',-file => "out/$i.$file.hmmsearch.comp.out"); while($result=$j->next_result()){# class of $result: Bio::Search::Result::HMMERResult while ($hit = $result->next_hit()){# class of $hit: Bio::Search::Hit::HMMERHit while ($hsp = $hit->next_hsp()){# class of $hsp: Bio::Search::HSP::HMMERHSP if (exists $counts{$hsp->score()}){ $counts{$hsp->score()}++;} else{ $counts{$hsp->score()}=1;} } } } if($verbose){print STDERR ".\n";} } for($j=0.0;$j<=30.0;$j+=0.1){#fix 0 counts $j=sprintf("%.1f", $j); if(!(exists $counts{$j})){ $counts{$j}=0;} } unlink glob 'out/*.out'; rmdir ('out'); #read in hmmsearch report GFF for actual genome, make new gff file with error estimates if($verbose){print STDERR "\nCreating GFF file with predictions..\n\n";} unless(open(OUT,">err-est.$report")){print "not able to open err-est.$report\n";exit 1;} unless(open(HMMGFF, $report)){print "not able to open $report\n";exit 1;} while($rec=){ if($rec =~ /^#/){ print OUT $rec; next;} else{ chomp $rec; @temp = split("\t",$rec); print OUT $rec; $temp[5]=~ s/00000$//; print OUT " Err_rate ",sprintf("%.2f",($counts{$temp[5]}/$copies)),";\n"; if(($counts{$temp[5]}/$copies) > 1){$highctr++;} else{$lowctr++;} @temp=(); } } close(HMMGFF); close (OUT); } if($verbose){ print STDERR "Predictions with error rate < 1.0 : $lowctr\n"; print STDERR "Predictions with error rate > 1.0 : $highctr\n"; my($user_t,$system_t,$cuser_t,$csystem_t); ($user_t,$system_t,$cuser_t,$csystem_t) = times; print STDERR "\n\nSystem time for process: $system_t\n"; print STDERR "User time for process: $user_t\n"; }