package CPDPredict;
=head1 NAME
CPDPredict
=head1 SYNOPSIS
use CPDPredict;
CPDPredict::loadPosteriorValues("CPDPosterior.txt");
my ($minDistance, $candidateCompensations) =
CPDPredict::findCPD(
\@alignedSequences,
$pos, $aa1, $aa2);
die "$aa1$pos$aa2 isn't found in the alignment!" unless
$CPDprobability;
my $CPDprobability =
CPDPredict::getPosteriorProbability($minDistance);
print "Probability $aa1$pos$aa2 is a CPD: $CPDprobability\n";
print "\n";
print "Candidate compensations for $aa1$pos$aa2:\n";
print "pos\taa1\aa2\tnumSpecies\n";
foreach my $candidate (@$candidateCompensations) {
print join "\t", @$candidate;
}
=head1 DESCRIPTION
Module containing functions to predict the properties of CPDs.
=head1 AUTHOR
d m jordanus scripsit MMXIV
=head1 METHODS
=cut
use Set::Scalar;
use Scalar::Util qw(looks_like_number);
use Carp;
use version; our $VERSION = version->declare("v1.0.0");
our %POSTERIOR_TABLE;
sub findCPD {
=head2 C<findCPD(\@alignedSequences, $pos, $aa1, $aa2)>
Takes four arguments:
=over 12
=item An arrayref containing a list of aligned sequences as strings of
one-letter amino acid codes.
=item A one-based position
=item The reference amino acid, as a one-letter code (used for sanity
checking, will fail if this doesn't agree with the first sequence in
the alignment)
=item The variant amino acid, also as a one-letter code
If the variant is not found in the alignment, returns nothing. If the
variant is found in the alignment, returns two values:
=over 12
=item The minimum distance to the CPD in the alignment
=item An arrayref containing list of candidate compensations. Each
candidate is formatted as an arrayref containing four values: pos,
aa1, aa2, and numSpecies. Compensations are sorted by number of
compensations in descending order, so that the best candidates are at
the top of the list. The aa2 value may be a string containing multiple
amino acids.
=back
When called in scalar context, just returns the first value.
=cut
my $alignedSequences = shift;
my $pos = shift;
my $aa1 = shift;
my $aa2 = shift;
# validate input
my $humanSequence = $alignedSequences->[0];
croak "No sequences in alignment?" unless $humanSequence;
for (my $i; $i < @$alignedSequences; $i++) {
croak "Length of aligned sequence $i (" .
length($alignedSequences->[$i])
. ") does not match length of reference
sequence ("
. length($humanSequence) . ")"
unless length($alignedSequences->[$i]) ==
length($humanSequence);
}
croak "Invalid pos $pos given (must be between 0 and "
. length($humanSequence) . ")"
unless $pos > 0
and $pos <= length($humanSequence);
for my $aa ($aa1, $aa2) {
croak "Invalid amino acid $aa" unless $aa =~ /
^[ACDEFGHIKLMNPQRSTVWY]$/;
}
my $human_aa = substr($humanSequence, $pos - 1, 1);
croak "AA1 ($aa1) does not correspond to reference allele
($human_aa)"
unless $aa1 eq $human_aa;
# get sequences containing variant aa
my @variantSeqs = grep {
substr($_, $pos - 1, 1) eq $aa2;
} @$alignedSequences;
return unless @variantSeqs;
# collect sites with differences
my $minDist = 1.0;
my %candidateSitesCounts;
my %candidateSitesAAs;
SEQUENCE: foreach my $seq (@variantSeqs) {
my $differences = 0;
my $alignedPositions = 0;
LETTER: for (my $i = 1; $i <= length($seq); $i++) {
$refLetter = substr($humanSequence, $i - 1,
1);
$thisLetter = substr($seq, $i - 1, 1);
next LETTER unless $thisLetter =~ /
^[ACDEFGHIKLMNPQRSTVWY]$/
and $refLetter =~ /
^[ACDEFGHIKLMNPQRSTVWY]$/;
# only get here if position is aligned
$alignedPositions += 1;
next LETTER if $thisLetter eq $refLetter or
$i == $pos;
# only get here if position is aligned and
mismatched
$differences += 1;
$candidateSitesCounts{$i} += 1;
if ($candidateSitesCounts{$i} == 1) {
$candidateSitesAAs{$i} =
Set::Scalar->new($thisLetter);
} else {
$candidateSitesAAs{$i}-
>insert($thisLetter);
}
}
my $dist = $differences / $alignedPositions;
$minDist = $dist if $dist < $minDist;
}
return $minDist unless wantarray;
# don't need to collect candidates if caller only expects
minDist
my @orderedCandidateSites = sort {
$candidateSitesCounts{$b} <=>
$candidateSitesCounts{$a} ||
$a <=> $b;
} keys(%candidateSitesCounts);
my @returnCandidates;
foreach my $site (@orderedCandidateSites) {
push(@returnCandidates, [$site,
substr($humanSequence, $site - 1, 1),
join('',$candidateSitesAAs{$site}->elements),
$candidateSitesCounts{$site}]);
}
return $minDist, \@returnCandidates;
}
sub loadPosteriorValues {
=head2 C<loadPosteriorValues($filename)>
Loads the table of posterior values in $filename into the
%CPDPredict::POSTERIOR_TABLE variable. Expects a tab-delimited file in
two columns, with the first column being sequence distance and the
second being the posterior value. Returns the loaded %POSTERIOR_TABLE
hash in list context, and the number of loaded values in scalar
context. If posteriors are already loaded, warns and overwrites
existing values.
=cut
carp "posteriors already loaded! overwriting" if
%POSTERIOR_TABLE;
%POSTERIOR_TABLE = ();
my $filename = shift;
open (my $fh, "<", $filename) or croak "Can't open file
$filename: $!";
while (<$fh>) {
next if (/^#/);
my ($x, $y) = split '\t';
if (looks_like_number($x) and looks_like_number($y))
{
$POSTERIOR_TABLE{$x} = $y;
} else {
carp "perl doesn't think ($x, $y) is a pair
of numbers, skipping";
}
}
wantarray ? %POSTERIOR_TABLE : scalar keys %POSTERIOR_TABLE;
}
sub getPosteriorProbability {
=head2 C<getPosteriorProbability($minDistance)>
Returns the posterior probability of a variant being a CPD based on
the minimum sequence distance found in the alignment. Specifically,
looks up the passed value in %CPDPredict::POSTERIOR_TABLE, and uses
linear interpolation between values in the hash if it's not there.
Must call C<loadPosteriorValues()> to load values first.
=cut
# validate input
croak "called getPosteriorProbability() without first loading
posteriors"
unless %POSTERIOR_TABLE;
$minDistance = shift;
carp "negative sequence distance makes no sense" if
$minDistance < 0;
carp "sequence distance greater than 1 makes no sense" if
$minDistance > 1;
# return the exact value if we can
return $POSTERIOR_TABLE{$minDistance}
if defined $POSTERIOR_TABLE{$minDistance};
# otherwise, find the closest two values with a linear search
# I may recode to use binary search if this is too slow
my @x_values = sort keys %POSTERIOR_TABLE;
# deal with special cases
return 0.0 if @x_values[0] > $minDistance;
return 0.1 if @x_values[-1] < $minDistance;
# shift values off the front of @x_values
# until the first two values are the values we're
interpolating between
while ($x_values[1] < $minDistance) {
shift @x_values;
}
# extract values to interpolate between
my $x0 = $x_values[0];
my $x1 = $x_values[1];
my $y0 = $POSTERIOR_TABLE{$x0};
my $y1 = $POSTERIOR_TABLE{$x1};
# do interpolation
my $interpolation_slope = ($y1 - $y0) / ($x1 - $x0);
my $dx = $minDistance - $x0;
return $y0 + $interpolation_slope * dx;
}
1;

created 2 years ago

Perl Online Compiler

Write, Run & Share Perl code online using OneCompiler's Perl online compiler for free. It's one of the robust, feature-rich online compilers for Perl language, running on the latest version 5.22.1. Getting started with the OneCompiler's Perl compiler is simple and pretty fast. The editor shows sample boilerplate code when you choose language as Perl and start coding.

Taking inputs (stdin)

OneCompiler's Perl online editor supports stdin and users can give inputs to programs using the STDIN textbox under the I/O tab. Following is a sample Perl program which takes name as input and prints hello message with your name.

my $name = <STDIN>;             
print "Hello $name.\n";

About Perl

Perl(Practical Extraction and Report Language) is especially desined for text processing by Larry Wall.

Key features

Cross-platform
Efficient for mission critical applications.
Open-source
Supports both procedural and object-oriented programming.
Perl interpreter is embeddable with other systems.
Loosely typed language

Syntax help

Data types

There is no need to specify the type of the data in Perl as it is loosely typed language.

Type	Description	Usage
Scalar	Scalar is either a number or a string or an address of a variable(reference)	$var
Arrays	Array is an ordered list of scalars, you can access arrays with indexes which starts from 0	@arr = (1,2,3)
Hash	Hash is an unordered set of key/value pairs	%ul = (1,'foo', 2, 'bar)

Variables

In Perl, there is no need to explicitly declare variables to reserve memory space. When you assign a value to a variable, declaration happens automatically.

$var-name =value; #scalar-variable
@arr-name = (values); #Array-variables
%hashes = (key-value pairs); # Hash-variables

Loops

1. If family:

If, If-else, Nested-Ifs are used when you want to perform a certain set of operations based on conditional expressions.

If

if(conditional-expression){    
//code    
}

If-else

if(conditional-expression){  
//code if condition is true  
}else{  
//code if condition is false  
}

Nested-If-else

if(condition-expression1){  
//code if above condition is true  
}else if(condition-expression2){  
//code if above condition is true  
}  
else if(condition-expression3){  
//code if above condition is true  
}  
...  
else{  
//code if all the conditions are false  
}

2. Switch:

There is no case or switch in perl, instead we use given and when to check the code for multiple conditions.

given(expr){    
when (value1)  
{//code if above value is matched;}    
when (value2)  
{//code if above value is matched;}   
when (value3)  
{//code if above value is matched;}  
default  
{//code if all the above cases are not matched.}     
}

3. For:

For loop is used to iterate a set of statements based on a condition.

for(Initialization; Condition; Increment/decrement){  
  // code  
}

4. While:

While is also used to iterate a set of statements based on a condition. Usually while is preferred when number of iterations are not known in advance.

while(condition) {  
 // code 
}

5. Do-While:

Do-while is also used to iterate a set of statements based on a condition. It is mostly used when you need to execute the statements atleast once.

do {
  // code 
} while (condition);

Sub-routines

Sub-routines are similar to functions which contains set of statements. Usually sub-routines are written when multiple calls are required to same set of statements which increases re-usuability and modularity.

How to define a sub-routine

sub subroutine_name 
{
	# set of Statements
}

How to call a sub-routine

subroutine_name();
subroutine_name(arguments-list); // if arguments are present