package TrypticDigestor; use 5.008; use strict; use warnings; use SBEAMS::Proteomics::AvgMolWgt::AvgMolWgt; require Exporter; our @ISA = qw(Exporter); # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. # This allows declaration use TrypticDigestor ':all'; # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK # will save memory. our %EXPORT_TAGS = ( 'all' => [ qw( ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw( ); our $VERSION = '0.01'; # Preloaded methods go here. ####################################################################### # digestSequence - tryptically digest a sequence # # @param sequence is protein sequence to digest # @param n_allowed_missed_cleavages is number of allowed missed cleavages # @param min_avg_mol_wgt is lower limit of peptide avg mol wgt to keep # @param max_avg_mol_wgt is upper limit of peptide avg mol wgt to keep # @return hash with key = peptide sequence # value = avg mol wgt ####################################################################### sub digestSequence { my %args = @_; my $sequence= $args{'sequence'} || "die need digestSequence needs sequence"; my $n_allowed_missed_cleavages = $args{'n_allowed_missed_cleavages'} || "0"; my $min_avg_mol_wgt = $args{'min_avg_mol_wgt'} || "0"; my $max_avg_mol_wgt = $args{'max_avg_mol_wgt'} || "40000"; ## chomp sequence, and remove trailing "*" if present: chomp($sequence); #$sequence =~ s/^(.*)\*$/$1/g; ## changing to remove all occurences of "*" as SGD sequences have them embedded $sequence =~ s/\*//g; ## length of sequence: my $len = length($sequence); ## hash to fill: my %peptide_mass_hash; ## array to hold positions of cleavages in sequence: my @tryptic_positions; ######## get array of K and R positions, excluding when before a proline: ############## for (my $i=0; $i <= $len - 1; $i++) { my $amino_acid = substr($sequence, $i, 1); if ( ($amino_acid eq "K") || ($amino_acid eq "R") ) { my $next_amino_acid; ## handle over-running sequence: if ( $i == $len - 1) { $next_amino_acid = ""; } else { $next_amino_acid = substr($sequence, $i+1, 1); } unless ( ($next_amino_acid eq "P") || ($next_amino_acid eq "") ) { push(@tryptic_positions, $i); } } } ########### In chops below, handling: #################### ## (1) sub-sequence from first letter to first cleavage allowing $n_allowed_missed_cleavages ## (2) sub-sequence from second cleavage to last cleavage allowing $n_allowed_missed_cleavages ## (3) sub-sequence from last cleavage to last letter my $first_amino_acid_position; my $peptide; my $avgMolWgt; for (my $i=0; $i <= $#tryptic_positions; $i++) { my $nmc = $n_allowed_missed_cleavages; ## params for case (1) if ($i == 0) { ## position of first amino-acid letter in protein sequence: $first_amino_acid_position = 0; } else { ## params for case (2) ## position of (value of last index) + 1 $first_amino_acid_position = ($tryptic_positions[$i-1]) + 1; } ## params for case (3) ## if on last tryptic position, don't look further for missed cleavages: if ( $i == $#tryptic_positions) { $nmc = 0; } ## parse out peptides given params of cases, and get their avg mol wgts: for (my $j = 0; $j <= $nmc; $j++) { if ( $tryptic_positions[$i + $j] < $len) { my $seq_length = $tryptic_positions[$i + $j] - $first_amino_acid_position + 1; if ( $seq_length > 0 ) { $peptide = substr($sequence, $first_amino_acid_position, $seq_length); ## now get avg mass of peptide sequence: $avgMolWgt = AvgMolWgt::calcAvgMolWgt( sequence => $peptide ); ## if between lower and upper mass limits, store it in hash: if ( ($avgMolWgt >= $min_avg_mol_wgt) && ($avgMolWgt <= $max_avg_mol_wgt) ) { $peptide_mass_hash{$peptide} = $avgMolWgt; } ## print "$peptide:$first_amino_acid_position:$tryptic_positions[$i + $j]\n"; } } } } ## handling the last case, trailing peptide sequence: ## if tryptic_positions[$#tryptic_positions] is last character, done, else get trailing ## peptide if ( $#tryptic_positions > 0) { my $seq_length = $len - $tryptic_positions[$#tryptic_positions] + 1; if ( $seq_length > 0 ) { $first_amino_acid_position = ($tryptic_positions[$#tryptic_positions]) + 1; $peptide = substr($sequence, $first_amino_acid_position, $seq_length); $avgMolWgt = AvgMolWgt::calcAvgMolWgt( sequence => $peptide ); ## if between lower and upper mass limits, store it in hash: if ( ($avgMolWgt >= $min_avg_mol_wgt) && ($avgMolWgt <= $max_avg_mol_wgt) ) { $peptide_mass_hash{$peptide} = $avgMolWgt; ## print "$peptide\n"; } } } return %peptide_mass_hash; } 1; __END__ # Below is stub documentation for your module. You'd better edit it! =head1 NAME TrypticDigestor - Perl extension for insilico tryptic digestion of a sequence, allowing n missed cleavages =head1 SYNOPSIS use TrypticDigestor; my %peptide_hash = TrypticDigestor::digestSequence( sequence => $sequence, n_allowed_missed_cleavages => '1', min_avg_mol_wgt = 200, max_avg_mol_wgt = 5000 ); returns a hash with with key = tryptically digested peptide, value = avg mol wgt =head1 ABSTRACT Module to parse a protein sequence into peptides following tryptic digestion rules and filtered by supplied parameters. =head1 DESCRIPTION Module to parse a protein sequence into peptides following tryptic digestion rules and filtered by supplied parameters. Tryptic digest rules: -- cleaves on K's and R's, cleaving on C-terminus -- exception: no cleavage with a trailing N-terminus P (proline) so not cleaving on KP nor RP Additional parameters accepted by the digestSequence method are: n_allowed_missed_cleavages, min_avg_mol_wgt, and max_avg_mol_wgt. The parameters default to the following if not specified: n_allowed_missed_cleavages = 0 min_avg_mol_wgt = 0 max_avg_mol_wgt = 40,000 [for QStar...] =head2 EXPORT None by default. =head1 AUTHOR Nichole King =head1 SEE ALSO uses AvgMolWgt L. =cut