{package SBEAMS::Glycopeptide::Glyco_peptide_load; # Module handles loading peptide/protein information into the unipep # database (AKA glycopeptide). Due to a change in input file format # there are currently some unused subroutines, will cull when possible. use strict; use File::Basename; use File::Find; use File::stat; use Data::Dumper; use Carp; use FindBin; use POSIX qw(strftime); use Benchmark; use SBEAMS::Connection qw($log); use SBEAMS::Connection::Tables; use SBEAMS::Connection::Settings; use SBEAMS::Glycopeptide::Tables; use SBEAMS::Glycopeptide; my $module = SBEAMS::Glycopeptide->new(); ####################################################### # Constructor ####################################################### sub new { my $class = shift; my %args = @_; my $sbeams = $args{sbeams} || SBEAMS::Connection->new(); my $verbose = $args{verbose} || 0; my $debug = $args{debug}; my $testonly = $args{testonly}; my $file = $args{file}; my $release = $args{release} || $args{file}; my $ambiguous = $args{ambiguous} || ''; my $self = { _file => $file, _release => $release, _ambiguous => $ambiguous, }; bless $self, $class; $self->setSBEAMS($sbeams); $self->verbose($verbose); $self->debug($debug); $self->testonly($testonly); return $self; } ############################################################################### # Cache the main SBEAMS object ############################################################################### sub setSBEAMS { my $self = shift; my $sbeams = shift; $self->{_SBEAMS} = $sbeams; } ############################################################################### # Provide the main SBEAMS object ############################################################################### sub getSBEAMS { my $self = shift; return $self->{_SBEAMS}; } ############################################################################### # Get/Set the VERBOSE status # # ############################################################################### sub verbose { my $self = shift; my $verbose = shift; if (defined $verbose){ #it's a setter $self->{_VERBOSE} = $verbose; } return $self->{_VERBOSE}; } ############################################################################### # Get/Set the DEBUG status # # ############################################################################### sub debug { my $self = shift; if (@_){ #it's a setter $self->{_DEBUG} = $_[0]; }else{ #it's a getter $self->{_DEBUG}; } } ############################################################################### # Get/Set the TESTONLY status # # ############################################################################### sub testonly { my $self = shift; if (@_){ #it's a setter $self->{_TESTONLY} = $_[0]; }else{ #it's a getter return $self->{_TESTONLY} || 0; } } ############################################################################## # Get file to parse # # ############################################################################### sub getfile { my $method = 'getfile'; my $self = shift; return $self->{_file}; } sub insert_ipi_db { my $self = shift; my %args = @_; my @all_data = (); my $file = $self->getfile(); open DATA, $file || die "Unable to open file $file $!\n"; my %heads; my $count = 0; my $insert_count = 1; my $t0 = new Benchmark; my %stats; my $sbeams = $self->getSBEAMS(); my %time; # Hash of info for IPI records my %file_data; my %seq_to_accession; $self->{_seq_to_id} = {}; print "Processing file\n"; while() { chomp; my @tokens = split( /\t/, $_, -1); $count ++; # populate global hash of col_name => col_index. if ($count == 1){ # Make them lower case for consistancy @tokens = map {lc($_)} @tokens; # Build header index hash @heads{@tokens} = 0..$#tokens; # See if col headers have changed $self->checkHeaders(\%heads); # Everything has checked out, start inserts. # Does this version already exist? $self->check_version(%args); next; } # print progress 'bar' unless ( $count % 100 ){ print '*'; # print "\n"; last; # Short circuit! } unless ( $count % 5000 ){ my $t1 = new Benchmark; my $time = timestr(timediff($t1, $t0)); $time =~ s/^[^\d]*(\d+) wallclock secs.*/$1/; my $verb = ( $args{testonly} ) ? 'Found' : 'Processed'; print "$verb $count records, $time seconds\n"; } my $ipi = $tokens[$heads{'ipi'}]; if ( $file_data{$ipi} ) { $ipi = $ipi . '__' . $sbeams->getRandomString(num_chars=>12); # print "duplicate id changed to $ipi\n"; } if ( $stats{$ipi} || $self->check_ipi($ipi) ) { $stats{$ipi}++; } else { $stats{$ipi}++; $stats{total}++; my $motif = 'N[^P][ST]'; my $sites = $module->get_site_positions( seq => $tokens[$heads{'protein sequences'}], pattern => $motif, index_base => 1); # Get theoretical digestion of protein my $peptides = $module->do_tryptic_digestion( aa_seq => $tokens[$heads{'protein sequences'}], flanking => 1 ); foreach my $pep ( @$peptides ) { my $match = $module->clean_pepseq( $pep ); $match = substr( $match, 0, 900 ) if length($match) > 900; # hash peptide sequence to ipi_data_id info $self->{_seq_to_id}->{uc($match)} ||= {}; $self->{_seq_to_id}->{uc($match)}->{$ipi}++; } $file_data{$ipi} = { rowdata => \@tokens, peptides => $peptides, sites => $sites }; } # last if $stats{total} > 100; } # End file reading loop print "Read $count records\n"; # return if $args{testonly}; # Rubber, meet road... print "Inserting data\n"; $sbeams->initiate_transaction(); my $commit_interval = 20; # Insert version record $self->add_ipi_version( %args ) unless $args{testonly}; $count = 0; for my $acc ( sort(keys( %file_data) ) ) { $count++; # print progress 'bar' unless ( $count % 100 ){ print '*'; } unless ( $count % 5000 ){ my $t1 = new Benchmark; my $time = timestr(timediff($t1, $t0)); $time =~ s/^[^\d]*(\d+) wallclock secs.*/$1/; my $verb = ( $args{testonly} ) ? 'loaded' : 'processed'; print "$verb $count records, $time seconds\n"; } my $tokens = $file_data{$acc}->{rowdata}; eval { my $ipi_id = $self->add_ipi_record( $tokens ); my $site_idx = $self->add_glycosites( glyco_score => $tokens->[$heads{'nxt/s score'}], ipi_data_id => $ipi_id, protein_sequence => $tokens->[$heads{'protein sequences'}], site_idx => $file_data{$acc}->{sites} ); my $original_acc = $acc; $original_acc =~ s/__.*$//; if ( $acc ne $original_acc ) { # print "Changed $acc back to $original_acc\n"; } $self->add_predicted_peptides( sequence => $tokens->[$heads{'protein sequences'}], ipi_data_id => $ipi_id, site_idx => $file_data{$acc}->{sites}, ipi => $original_acc, peptides => $file_data{$acc}->{peptides} ); }; if ( $@ ) { $sbeams->rollback_transaction(); die( "$@" ); } else { # Space commits to once per n sequences unless ( $count % $commit_interval ) { $sbeams->commit_transaction(); $sbeams->initiate_transaction(); } } } $sbeams->commit_transaction(); # $stats{max} = ( $stats{$ipi} > $stats{max} ) ? $stats{$ipi} : $stats{max}; # for my $k ( keys(%time) ) { print "$k => $time{$k}\n"; } # $count -= 1; # print "\n\n$stats{total} unique ipi entries in $count total rows (max count was $stats{max})\n"; # my $t2 = new Benchmark; # print "\nFinished in " . timestr(timediff($t2, $t0)) . "\n"; } ############################################################################### # process_data_file # ############################################################################### sub process_data_file { my $self = shift; my %args = @_; $args{load_peptides} = 1 if !defined $args{load_peptides}; my @all_data = (); my $file = $self->getfile(); open DATA, $file || die "Unable to open file $file $!\n"; my %heads; my $count = 0; my $insert_count = 1; my $t0 = new Benchmark; while() { chomp; my @tokens = split( /\t/, $_, -1); # populate global hash of col_name => col_index. if ($count == 0){ # Make them lower case for consistancy @tokens = map {lc($_)} @tokens; # Build header index hash @heads{@tokens} = 0..$#tokens; $count ++; # See if col headers have changed $self->checkHeaders(\%heads); } $count ++; # print progress 'bar' unless ( $count % 100 ){ print '*'; } unless ( $count % 5000 ){ my $t1 = new Benchmark; my $time = timestr(timediff($t1, $t0)); $time =~ s/^[^\d]*(\d+) wallclock secs.*/$1/; my $verb = ( $args{testonly} ) ? 'Found' : 'Processed'; print "$verb $count records, elapsed time $time seconds\n"; } $self->add_ipi_record( \@tokens ) unless $self->check_ipi($tokens[$heads{'ipi'}]); my $glyco_pk = $self->add_glyco_site( \@tokens ); if ( $tokens[$heads{'predicted tryptic nxt/s peptide sequence'}] ) { $self->add_erpdicted_peptide( glyco_pk => $glyco_pk, line_parts => \@tokens); } # add identifed peptide iff there is one. if ( $args{load_peptides} && $tokens[$heads{'identified sequences'}] ) { $self->add_identified_peptides( glyco_pk => $glyco_pk, line_parts => \@tokens); } } if ( $args{testonly} ) { print "\nIPI file had correct headers, $count total rows\n"; exit; } my $t2 = new Benchmark; my $pcnt = $self->{_id_peps}; print "\nTotal peptides was " . scalar(keys(%$pcnt) ) ."\n"; print "\n\nLoaded $count total records in " . timestr(timediff($t2, $t0)) . "\n"; } sub checkHeaders { my $self = shift; my $heads = shift; my %heads = %$heads; my @known_columns = ( 'IPI', 'Protein Name', 'Protein Sequences', 'Protein Symbol', 'Swiss-Prot', 'Summary', 'Synonyms', 'Protein Location', 'signalP', 'TM', 'TM location', 'Peptide IPI', 'NXT/S Location', 'NXT/S Score', 'Predicted Tryptic NXT/S Peptide Sequence', 'Predicted Peptide Mass', 'Database Hits', 'Database Hit IPIs', 'Min Similarity Score', 'Detection Probability', 'Identified Sequences', 'Tryptic Ends', 'Peptide ProPhet', 'Identified Peptide Mass', 'Identified Tissues', 'Number Observations', ); @known_columns = map { lc($_) } @known_columns; my @current_cols = keys(%heads); for my $curr_col ( @current_cols ) { print "Checking $curr_col\n"; unless( grep /$curr_col/, @known_columns ) { die "Column $curr_col is not known by the parser\n"; } } for my $parser_col ( @known_columns ) { unless( grep /$parser_col/, @current_cols ) { die "Column $parser_col is missing in this file\n"; } } # We got past the checks, cache the header values. $self->{_heads} = \%heads; return 1; } =head1 example columns 0 IPI => IPI00015102 1 Protein Name => CD166 antigen precursor 2 Protein Sequences => MESKGASSCRLLFCLLISATVFRPGLGWYTVNSAYGDTIIIPCRLDVPQNLMFGKWKYEKPDGSPVFIAFRSSTKKSVQYDDVPEYKDRLNLSENYTLSISNARISDEKRFVCMLVTEDNVFEAPTIVKVFKQPSKPEIVSKALFLETEQLKKLGDCISEDSYPDGNITWYRNGKVLHPLEGAVVIIFKKEMDPVTQLYTMTSTLEYKTTKADIQMPFTCSVTYYGPSGQKTIHSEQAVFDIYYPTEQVTIQVLPPKNAIKEGDNITLKCLGNGNPPPEEFLFYLPGQPEGIRSSNTYTLMDVRRNATGDYKCSLIDKKSMIASTAITVHYLDLSLNPSGEVTRQIGDALPVSCTISASRNATVVWMKDNIRLRSSPSFSSLHYQDAGNYVCETALQEVEGLKKRESLTLIVEGKPQIKMTKKTDPSGLSKTIICHVEGFPKPAIQWTITGSGSVINQTEESPYINGRYYSKIIISPEENVTLTCTAENQLERTVNSLNVSAISIPEHDEADEISDENREKVNDQAKLIVGIVVGLLLAALVAGVVYWLYMKKSKTASKHVNKDLGNMEENKKLEENNHKTEA 3 Protein Symbol => ALCAM 4 Swiss-Prot => Q13740 5 Summary => 6 Synonyms => CD166 antigen precursor (Activated leukocyte-cell adhesion molecule) (ALCAM). 7 Protein Location => S 8 signalP => 28 Y 0.988 Y 9 TM => 1 10 TM location => o528-550i 11 Num Nxts Sites => 10 12 Nxts Sites => 91,95,167,265,306,337,361,457,480,499 13 Peptide IPI => IPI00015102 14 NXT/S Location => 265 15 NXT/S Score => 0.6163598427746358 16 Predicted Tryptic NXT/S Peptide Sequenc.e => K.EGDN#ITLK.C 17 Predicted Peptide Mass => 888.444684 18 Database Hits => 1 19 Database Hit IPIs => IPI00015102 20 Min Similarity Score => 1.0 21 Detection Probability => 22 Identified Sequences => K.NAIKEGDN#ITLK.C 23 Tryptic Ends => 2 24 Peptide ProPhet => 0.8636 25 Identified Peptide Mass => 1315.7 =cut sub insert_peptides { my $self = shift; my %args = @_; my $err; for my $opt ( qw( build peptide_file format ipi_version_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; $self->testonly( $args{testonly} ); $self->{_format} = $args{format}; my $observed; # check which format was specified, read file if ( $args{format} eq 'interact-tsv' ) { $observed = $self->read_tsv( %args ); } elsif ( $args{format} eq 'phosphogigolo' ) { $observed = $self->read_tsv( %args ); } else { die "Unsupported file format\n"; } # insert peptide search record my $psid = $self->insert_peptide_search( %args ); # insert observed peptide $self->insert_observed_peptides( %args, peptides => $observed, pep_search_id => $psid ); # insert observed_to_ipi record(s) - indicate original # What the heck is this? my $match = $module->clean_pepseq( $args{aa_seq} ); if ( length($match) > 900 ) { print STDERR "Truncating long value for matching sequence\n"; $match = substr( $match, 0, 900 ); } print "what the heck?\n"; } sub insert_peptide_search { my $self = shift; my %args = @_; my $err; for my $opt ( qw( peptide_file sample_id ipi_version_id build_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; my $fname = basename( $args{peptide_file} ); my $sbeams = $self->getSBEAMS(); # Make sure this file hasn't already been loaded my ($dup) = $sbeams->selectrow_array( <<" END" ); SELECT COUNT(*) FROM $TBGP_PEPTIDE_SEARCH WHERE search_file = '$fname' AND sample_id = $args{sample_id} AND ref_db_id = $args{ipi_version_id} END # die "Duplicate input file detected: $fname\n" if $dup; my $rowdata = { search_file => $fname, sample_id => $args{sample_id}, ref_db_id => $args{ipi_version_id}, comment => "Loaded " . $sbeams->get_datetime(), }; # Insert row my $sid = $sbeams->updateOrInsertRow( table_name => $TBGP_PEPTIDE_SEARCH, rowdata_ref => $rowdata, return_PK => 1, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'peptide_search_id', ); my $btsid = $sbeams->updateOrInsertRow( table_name => $TBGP_BUILD_TO_SEARCH, rowdata_ref => {build_id => $args{build_id}, search_id => $sid }, return_PK => 1, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'peptide_search_id', ); return $sid; } sub insert_observed_peptides { my $self = shift; my %args = @_; my $err; for my $opt ( qw( ipi_version_id peptides pep_search_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; # this call will populate 3 hashes, seq->data_id, data_id => sequence, and ipi_id -> data_id $self->get_ipi_seqs( ipi_version_id => $args{ipi_version_id}, build_id => $args{build_id} ); my $seqs = $self->{_seq_to_id}; my $accs = $self->{_acc_to_id}; my $ids = $self->{_id_to_seq}; my $pepseqs = $self->{_pepseqs}; my $idx_file = $PHYSICAL_BASE_DIR . "/usr/Glycopeptide/" . $module->getSpectraSTLib( build_id => $args{build_id} ) . ".pepidx"; print STDERR "Using library file: $idx_file\n"; # my $sp_pepidx = $module->readSpectrastPepidxFile( $idx_file ); my $sp_pepidx = {}; my @keys = keys( %{$seqs} ); my $heads = $self->{_heads} || die("Doh"); my $sbeams = $self->getSBEAMS(); my %no_maps = ( potential => [], exact => [] ); my $cnt; my $insert_cnt = 0; my $t0 = time(); my $pcnt = scalar( @{$args{peptides}} ); print "Inserting peptides ($pcnt candidates)\n"; # Loop over peptides, insert if they pass muster for my $obs ( @{$args{peptides}} ) { $cnt++; # for my $k ( keys( %$heads ) ) { print "$k => $heads->{$k} => $obs->[$heads->{$k}]\n"; } # print "peptide from pos $heads->{Peptide} is $obs->[$heads->{Peptide}]\n"; # print "prot from pos $heads->{Protein} is $obs->[$heads->{Protein}]\n"; my $clean_pep = $module->clean_pepseq( $obs->[$heads->{Peptide}] ); my $clean_prot = $self->trim_space( $obs->[$heads->{Protein}] ); my $mapteins = []; my $seq = uc( $clean_pep ); my @map = grep( /$seq/, @keys ); for my $k ( @map ) { foreach my $ipi ( @{$seqs->{$k}} ) { push @$mapteins, $ipi; } } # Test for error conditions. if ( !scalar(@$mapteins) ) { # Can't map peptide to reference db print STDERR "Unable to map sequence: $clean_pep\t$clean_prot\n"; next; } elsif ( $obs->[$heads->{prob}] < 0.5 ) { # detection probability too low print STDERR "Low probability: $obs->[$heads->{prob}]\n"; next; } elsif ( grep /^$obs->[$heads->{Peptide}]$/, @$pepseqs ) { # Duplicato # FIXME - should we update here? # FIXME - Case sensitive? # FIXME - this picks up dbl charge states # print STDERR "Duplicate detected: $obs->[$heads->{Peptide}]\n"; # next; } $insert_cnt++; # Fight dups in same file push @$pepseqs, $obs->[$heads->{Peptide}]; my $calc_mass = $module->calculatePeptideMass( sequence => $clean_pep ); my ( $delta, $exp_mass, $scan, $charge, $mass2ch, $mh_plus, $motif_score ); # print "Choans dCn is $args{dCn_cutoff}\n"; if ( $self->{_format} eq 'interact-tsv' ) { # for my $h (keys( %$heads ) ) { print "head is $h, idx is $heads->{$h}, val is $obs->[$heads->{$h}]\n"; } # exit; $exp_mass = $module->mh_plus_to_mass($obs->[$heads->{'MH+'}]); my $out = basename( $obs->[$heads->{file}] ); my @name = split( /\./, $out ); unless ( scalar(@name) == 4 && $name[1] == $name[2] ) { # warn "Mismatch in out $out: " . join( ", ", @name ) . "\n"; } $delta = $self->extract_delta( $obs->[$heads->{'MH error'}] ); $mass2ch = ( $name[3] ) ? $exp_mass/$name[3] : undef; $scan = $name[2]; $charge = $name[3] ; $mh_plus = $obs->[$heads->{'MH+'}]; } elsif ( $self->{_format} eq 'phosphogigolo' ) { # print "Pogig in da house\n"; my $mh = $obs->[$heads->{'MH+'}]; $charge = $obs->[$heads->{'chg_state'}]; $exp_mass = ($mh * $charge) - ( $charge - 1 ); $mass2ch = $obs->[$heads->{'MH+'}]; $mh_plus = $obs->[$heads->{'MH+'}] + 1.0078; if ( $args{gygi_ascore} ) { # print "Getting gygi with it!\n"; $motif_score = $args{gygi_ascore}->{$obs->[$heads->{Peptide}]}; if ( !$motif_score || $motif_score < 10 ) { $obs->[$heads->{Peptide}] =~ s/\*/\&/g; } # print STDERR "Ascore is $motif_score for $obs->[$heads->{Peptide}]\n"; } elsif ( $args{dCn_cutoff} > 0 ) { # print "chizzle de dizzle\n"; if ( $obs->[$heads->{dCn}] < $args{dCn_cutoff} ) { my $opep = $obs->[$heads->{Peptide}]; $obs->[$heads->{Peptide}] =~ s/\*/\&/g; # print "dCn of $obs->[$heads->{dCn}] caused $opep to become $obs->[$heads->{Peptide}]"; } else { # print "dCn of $obs->[$heads->{dCn}] caused $obs->[$heads->{Peptide}] to stand pat"; } } # print STDERR "we be nexting\n"; # next; # scan and delta are not defined } # Set ipi data id to search match if available, else first db match my $ipi = $accs->{$obs->[$heads->{Protein}]} || $mapteins->[0]; my $rowdata = { observed_peptide_sequence => $obs->[$heads->{Peptide}], # sample_id => $args{sample_id}, peptide_search_id => $args{pep_search_id}, ipi_data_id => $ipi, peptide_prophet_score => $obs->[$heads->{prob}], experimental_mass => $exp_mass, spectrum_path => $obs->[$heads->{file}], mass_to_charge => $mass2ch, mh_plus => $mh_plus, mh_delta => $delta, matching_sequence => $clean_pep, scan_number => $scan, n_obs => $obs->[$heads->{n_obs}], charge_state => $charge, peptide_mass => $calc_mass, motif_score => $motif_score, delta_cn => $obs->[$heads->{dCn}], }; my $obspep; my $newpep = uc( $clean_pep ); my $has_match = 0; if ( $sp_pepidx->{$newpep} ) { my @matches = @{$sp_pepidx->{$newpep}}; for my $m ( @matches ) { $newpep = uc( $clean_pep ); my $obspep = $obs->[$heads->{Peptide}]; $obspep =~ s/\&/\*/g; my @mods = split( "/", $m->[1], -1 ); my $pcnt = 1; # start loop at 1 to skip first (charge/nmods) for ( my $i = 1; $i <= $#mods; $i++ ) { $mods[$i] =~ /(\d+),([STY]),(Phospho)/; next unless $2; my $posn = $1 + $pcnt++; substr( $newpep, $posn, 0, '*' ); } if ( $newpep eq $obspep ) { # print "Found a match for $newpep\n"; $has_match++; last; # } else { # print "Still looking for $newpep\n"; } } # print 'Found ' . scalar( @matches ) . ' potential matches for ' . $clean_pep . ' ( ' . $obs->[$heads->{Peptide}] . " )\n"; } else { print STDERR 'No potential matches for ' . $clean_pep . ' ( ' . $obs->[$heads->{Peptide}] . " )\n"; push @{$no_maps{potential}}, "$obs->[$heads->{Peptide}] / $clean_pep"; # next; $has_match++; } if ( !$has_match ) { # If we haven't been seen in the consensus lib, skip print STDERR "No exact match for $obs->[$heads->{Peptide}] ($obspep)\n"; push @{$no_maps{exact}}, "$obs->[$heads->{Peptide}] / $clean_pep"; next; } # Insert row my $obs_id = $sbeams->updateOrInsertRow( table_name => $TBGP_OBSERVED_PEPTIDE, rowdata_ref => $rowdata, return_PK => 1, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'peptide_search_id', ); unless ( $cnt % 25 ){ print '*'; } unless ( $cnt % 500 ){ print "\n"; } my %seen; for my $id ( @$mapteins ) { # For each IPI that this peptide maps to next if $seen{$id}; $seen{$id}++; # Insert rows into observed_to_ipi table $sbeams->updateOrInsertRow( table_name => $TBGP_OBSERVED_TO_IPI, rowdata_ref => { ipi_data_id => $id, observed_peptide_id => $obs_id }, return_PK => 0, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'observed_to_ipi_id', ); # Insert row(s) into observed_to_glycosite table my $coords = $module->map_peptide_to_protein( protseq => $ids->{$id}, multiple_mappings => 1, pepseq => uc($clean_pep) ); # print "Does $clean_pep map to $ids->{$id}\n"; # for my $cd ( @$coords ) { print "$cd->[0], $cd->[1]\n"; } # print "Done coords\n"; my $gsites = $module->getIPIGlycosites( ipi_data_id => $id ); # for my $gs ( @$gsites ) { print "$gs->[0] => $gs->[1]\n"; } my $mapped = 0; for my $coord_pair ( @$coords ) { for my $ipi_sites ( @$gsites ) { if ( $coord_pair->[0] <= $ipi_sites->[1] && $coord_pair->[1] >= $ipi_sites->[1] ) { $mapped++; # Insert rows into observed_to_glycosite table $sbeams->updateOrInsertRow( table_name => $TBGP_OBSERVED_TO_GLYCOSITE, rowdata_ref => { observed_peptide_id => $obs_id, glycosite_id => $ipi_sites->[0], site_start => $ipi_sites->[1], site_stop => $ipi_sites->[1] + length($clean_pep), }, return_PK => 0, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'observed_to_glycosite_id', ); } } # If a peptide spans multiple sites, we'll map them all, but we won't try # to map a peptide to a protein in multiple places. last if $mapped; } } # End of foreach mapped protein # last if $cnt >= 100; } # End of foreach peptide my $tdiff = time() - $t0; print "\n\n"; print "Inserted $insert_cnt rows of $cnt in $tdiff seconds\n"; print "No potential matches\n" if $no_maps{potential}; for my $pep ( @{$no_maps{potential}} ) { print "$pep\n"; } print "No exact matches\n" if $no_maps{exact}; for my $pep ( @{$no_maps{exact}} ) { print "$pep\n"; } } sub extract_delta { my $self = shift; my $delta = shift; # has the form of (+2.4) or (-0.4) $delta = s/\(//g; $delta = s/\)//g; $delta = s/\+//g; return $delta; } sub map_proteins { my $self = shift; my %args = @_; my $err; for my $opt ( qw( peptide ipi_version_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; if ( !$self->{_db_seqs} ) { $self->{_db_seqs} = $self->get_ipi_seqs( ipi_version_id => $args{ipi_version_id} ); $self->{_db_keys} = [ keys( %{$self->{_db_seqs}} ) ]; } my $seq = uc( $args{peptide} ); my @mapteins; # array of ipi's to which this peptide maps my @map = grep( /$seq/, @{$self->{_db_keys}} ); for my $k ( @map ) { foreach my $ipi ( @{$self->{_db_seqs}->{$k}} ) { push @mapteins, $ipi; } } return \@mapteins; # DB lookup was too dang slow my $peptide = '%' . $args{peptide} . '%'; my $sql =<<" END"; SELECT ipi_data_id, ipi_accession_number FROM $TBGP_IPI_DATA WHERE ipi_version_id = $args{ipi_version_id} AND protein_sequence like '$peptide' END my $sbeams = $self->getSBEAMS(); my @ipi_data; while( my $row = $sbeams->selectSeveralColumnsRow( sql => $sql ) ) { if ( $row->[1] eq $args{protein} ) { unshift @ipi_data, $row->[0]; } else { push @ipi_data, $row->[0]; } } return \@ipi_data; } sub get_ipi_seqs { my $self = shift; my %args = @_; my $err; for my $opt ( qw( ipi_version_id build_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; my $peptide = '%' . $args{peptide} . '%'; my $sql =<<" END"; SELECT ipi_data_id, protein_sequence, ipi_accession_number FROM $TBGP_IPI_DATA WHERE ipi_version_id = $args{ipi_version_id} END my %seq2id; my %acc2id; my %id2seq; my $sbeams = $self->getSBEAMS(); while( my $row = $sbeams->selectSeveralColumnsRow( sql => $sql ) ) { # $seq2ipi{$row->[1]} ||= []; push @{$seq2id{$row->[1]}}, $row->[0]; $acc2id{$row->[2]} = $row->[0]; $id2seq{$row->[0]} = $row->[1]; } $self->{_seq_to_id} = \%seq2id; $self->{_acc_to_id} = \%acc2id; $self->{_id_to_seq} = \%id2seq; $sql =<<" END"; SELECT matching_sequence FROM $TBGP_OBSERVED_PEPTIDE OP JOIN $TBGP_BUILD_TO_SEARCH BTS ON OP.peptide_search_id = BTS.search_id WHERE build_id = $args{build_id} END my @pepseqs; while( my $row = $sbeams->selectSeveralColumnsRow( sql => $sql ) ) { push @pepseqs, $row->[0]; } $self->{_pepseqs} = \@pepseqs; return 1; } sub get_interact_tsv_headers { my $self = shift; my %heads = ( prob => 0, ignore => 1, file => 2, 'MH+' => 3, 'MH error' => 4, XCorr => 5, dCn => 6, Sp => 7, SpRank => 8, IonsMatch => 9, IonsTot => 10, Protein => 11, '#DupProt' => 12, Peptide => 13, num_obs => 99, chg_state => 99 ); %heads = ( 'index' => 0, prob => 1, file => 2, 'MH+' => 99, 'MH error' => 99, XCorr => 3, dCn => 4, Sp => 99, SpRank => 5, IonsMatch => 9, IonsTot => 6, Protein => 8, '#DupProt' => 8, Peptide => 7, num_obs => 99, chg_state => 99 ); for my $k (keys( %heads ) ) { $heads{lc($k)} = $heads{$k}; } return \%heads; } sub get_phosgigolo_headers { my $self = shift; # 0 Found in DB na - # 1 DB-Name AA na - # 2 AA na - # 3 Pos. na - # 4 Search-AC Protein 11 # 5 SQ Peptide 13 # 6 Mr MH+ 3 # 7 Charge state na - # 8 Pep.prob prob 0 # 9 Xcorr Xcorr 5 # 10 DeltaCN dCn 6 # 11 # phosporylations na - # 12 # in LC/MS detected n_obs - # 13 LC/MS ID na - # my %heads = ( prob => 8, ignore => 99, file => 99, 'MH+' => 6, 'MH error' => 99, XCorr => 9, dCn => 10, Sp => 99, SpRank => 99, IonsMatch => 99, IonsTot => 99, Protein => 4, '#DupProt' => 99, Peptide => 5, n_obs => 12, chg_state => 7 ); for my $k (keys( %heads ) ) { $heads{lc($k)} = $heads{$k}; } return \%heads; } #### sub read_tsv { my $self = shift; my %args = @_; my $sbeams = $self->getSBEAMS(); my $file = $args{peptide_file}; open DATA, $file || die "Unable to open file $file $!\n"; $self->{_heads} ||= ( $args{format} eq 'interact-tsv' ) ? get_interact_tsv_headers() : get_phosgigolo_headers(); my $count = 0; my @peptides; print "Processing peptides\n"; while() { chomp; next if $_ =~ /^\s*$/; my @tokens = split( /\t/, $_, -1); for my $t ( @tokens ) { $t =~ s/^\s*\"*//; $t =~ s/\"*\s*$//; } if ( !scalar( @peptides ) && $self->is_header( \@tokens ) ) { # this column has a header row print "$tokens[0] is a header for format type $self->{_format}\n"; next; # } elsif( !scalar( @peptides ) ) { # print "$tokens[4] and $tokens[9] aren't SQ and DeltaCN?\n"; } $count ++; push @peptides, \@tokens; # print progress 'bar' unless ( $count % 25 ){ print '*'; } unless ( $count % 500 ){ print "\n"; } } print "\n\n"; return \@peptides; } sub is_header { my $self = shift; my $tokens = shift; if ( $self->{_format} eq 'interact-tsv' ) { if ( $tokens->[0] eq 'prob' && $tokens->[6] eq 'dCn' ) { return 1; } } elsif ( $self->{_format} eq 'phosphogigolo' ) { if ( $tokens->[5] eq 'SQ' && $tokens->[10] eq 'DeltaCN' ) { return 1; } } return 0; } ############################################################################## #Add the identifed_peptide for a row ############################################################################### sub add_identified_peptides{ # deprecated my $self = shift; my $method = 'add_identified_peptides'; my %args = @_; my $row = $args{line_parts}; my $glyco_pk = $args{glyco_pk}; my %heads = %{$self->{_heads}}; my $sbeams = $self->getSBEAMS; my $id_seq = $row->[$heads{'identified sequences'}]; # make sure we have an identifed peptide otherwise do nothing return unless ( $id_seq ); # Special case, some peptides have the sequence 'N.AME.?', has some meaning # Can't cope with this now, log it and return FIXME if ( $id_seq eq 'N.AME.?' || $id_seq eq '...' ) { # print STDERR "Got an oddball, $row->[$heads{'ipi'}]: $id_seq ($row->[$heads{'predicted tryptic nxt/s peptide sequence'}])\n"; return; } my $ipi_acc = $row->[$heads{'ipi'}]; my $clean_seq = $self->clean_seq($id_seq); my ($start, $stop) = $self->map_peptide_to_protein(peptide=> $clean_seq, protein_seq => $row->[$heads{'protein sequences'}]); my $iden_pep_id; # Insert new id'd peptide or use cached version if ( !$self->{_id_peps}->{$id_seq} ) { my $matching_sequence = $module->clean_pepseq( $id_seq ); # First, add row to the identified peptide table my %id_pep_row = ( identified_peptide_sequence => $id_seq, tryptic_end => $row->[$heads{'tryptic ends'}], peptide_prophet_score => $row->[$heads{'peptide prophet'}], peptide_mass => $row->[$heads{'identified peptide mass'}], glyco_site_id => $glyco_pk, matching_sequence => $matching_sequence, n_obs => $row->[$heads{'number observations'}] ); my $sbeams = $self->getSBEAMS(); # returns identified_peptide_id for new row $iden_pep_id = $sbeams->updateOrInsertRow( table_name => $TBGP_IDENTIFIED_PEPTIDE, rowdata_ref => \%id_pep_row, return_PK => 1, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'identified_peptide_id', ); # Cache value for later use! $self->{_id_peps}->{$id_seq} = $iden_pep_id; } else { $iden_pep_id = $self->{_id_peps}->{$id_seq}; # print "Using cached, $row->[$heads{'identified sequences'}] => $iden_pep_id\n"; } # Now, add row to identified_to_ipi lookup(join) table my %iden_to_ipi_row = ( ipi_data_id => $self->get_ipi_data_id($ipi_acc), glyco_site_id => $glyco_pk, identified_peptide_id => $iden_pep_id, identified_start => $start, identified_stop => $stop, ); # Insert row $sbeams->updateOrInsertRow( table_name => $TBGP_IDENTIFIED_TO_IPI, rowdata_ref => \%iden_to_ipi_row, return_PK => 0, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'identified_to_ipi_id', ); if ($self->verbose()>0){ print (__PACKAGE__."::$method Added IDENTIFIED PEPTIDE pk '$iden_pep_id'\n"); } $self->peptide_to_tissue($iden_pep_id, $row); return $iden_pep_id; } ############################################################################## #Add peptide_to_tissue information ############################################################################### sub peptide_to_tissue { my $self = shift; my $identified_peptide_id = shift; my $row = shift; my %heads = %{$self->{_heads}}; my $method = 'peptide_to_tissue'; my $samples = $row->[$heads{'identified tissues'}]; my @samples = split( ",", $samples, -1 ); my $sbeams = $self->getSBEAMS(); if ( !$self->{_sample_tissues} ) { my $sql = "SELECT sample_name, sample_id FROM $TBGP_UNIPEP_SAMPLE"; $self->{_sample_tissues} = $sbeams->selectTwoColumnHashref( $sql ); # foreach my $k ( keys ( %{$self->{_sample_tissues}} ) ) { print "$k\n"; } } foreach my $sample ( @samples ) { # Trim leading and trailing space $sample =~ s/^\s*//g; $sample =~ s/\s*$//g; # bernd means lymphocyte for the time being $sample =~ s/bernd/lymphocytes/g; # We didn't find tissue type in lookup, try to autocreate unless ( $self->{_sample_tissues}->{$sample} ) { $self->{_sample_tissues}->{$sample} = $self->newGlycoSample( $sample ) || die "Unable to create new sample type: $sample"; } my %rowdata = ( identified_peptide_id => $identified_peptide_id, sample_id => $self->{_sample_tissues}->{$sample} ); $sbeams->updateOrInsertRow( return_PK => 0, table_name => $TBGP_PEPTIDE_TO_SAMPLE, rowdata_ref => \%rowdata, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'peptide_to_tissue_id', ); } } sub newGlycoSample { my $self = shift; my $sample = shift; my $sbeams = $self->getSBEAMS(); my $tissue_sql = $sbeams->evalSQL ( <<" END" ); SELECT tissue_type_id FROM $TBGP_TISSUE_TYPE WHERE tissue_type_name = 'unknown' END my $dbh = $sbeams->getDBHandle(); my ( $tissue_id ) = $dbh->selectrow_array( $tissue_sql ) || die "Unable to find 'unknown' tissue type, cannot insert new samples"; my %rowdata = ( tissue_type_id => $tissue_id, sample_name => $sample ); my $sample_id = $sbeams->updateOrInsertRow( return_PK => 1, table_name => $TBGP_UNIPEP_SAMPLE, rowdata_ref => \%rowdata, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'sample_id' ); if ( $sample_id ) { print STDERR "Created new sample entry for $sample: $sample_id\n"; return $sample_id; } else { return undef; } } ############################################################################## #Add the predicted peptide for a row ############################################################################### sub add_erpdicted_peptide { my $method = 'add_erpdicted_peptide'; my $self = shift; my %args = @_; my $row = $args{line_parts}; my $glyco_pk = $args{glyco_pk}; my %heads = %{$self->{_heads}}; my $ipi_acc = $row->[$heads{'ipi'}]; my $peptide_sequence = $row->[$heads{'predicted tryptic nxt/s peptide sequence'}]; if ( length($peptide_sequence) > 900 ) { $peptide_sequence = substr( $peptide_sequence, 0, 900 ); } my $clean_seq = $self->clean_seq($peptide_sequence); my $sbeams = $self->getSBEAMS(); # We may now be getting proteins only, skip this bloc if no predicted peptide if ( $row->[$heads{'predicted tryptic nxt/s peptide sequence'}] && $ipi_acc && $clean_seq ) { #my $fixed_predicted_seq = $self->fix_predicted_peptide_seq($row->[16]); my ($start, $stop) = $self->map_peptide_to_protein(peptide=> $clean_seq, protein_seq => $row->[$heads{'protein sequences'}]); my $det_prob = 0 ; #$row->[$heads{'detection probablility'}] || 0; my $matching_sequence = $module->clean_pepseq( $peptide_sequence ); #TODO WARNING DETECTION PROBABLITY IS FAKE> DATA IS NOT COMPLETE my %rowdata_h = ( ipi_data_id => $self->get_ipi_data_id($ipi_acc), predicted_peptide_sequence => $row->[$heads{'predicted tryptic nxt/s peptide sequence'}], predicted_peptide_mass => $row->[$heads{'predicted peptide mass'}], detection_probability => $det_prob, # number_proteins_match_peptide => $row->[$heads{'database hits'}], matching_protein_ids => $row->[$heads{'database hit ipis'}], protein_similarity_score => $row->[$heads{'min similarity score'}], predicted_start => $start, predicted_stop => $stop, glyco_site_id => $glyco_pk, matching_sequence => $matching_sequence ); #TODO REMOVE SIZE LIMIT OF DATA #my %rowdata_h = $self->truncate_data(record_href => \%rowdata_h); #some of the data will need to truncated to make it easy to put all data in varchar 255 or less my $rowdata_ref = \%rowdata_h; my $predicted_peptide_id = $sbeams->updateOrInsertRow( table_name=>$TBGP_PREDICTED_PEPTIDE, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'predicted_peptide_id', ); if ($self->verbose()>0){ print (__PACKAGE__."::$method Added PREDICTED PEPTIDE pk '$predicted_peptide_id'\n"); } return $predicted_peptide_id; } } ############################################################################### # fix_predicted_peptide_seq # predicted peptide sequences did not have a trailing . denoting the peptide/protein #cleavage site. Need to add it back in ############################################################################### sub fix_predicted_peptide_seq { my $method = 'fix_predicted_peptide_seq'; my $self = shift; my $pep_seq = shift; if ($pep_seq =~ s/(.)$/.$1/){ if($self->verbose){ print (__PACKAGE__."::$method ADDED TRAILING CUT SITE '$pep_seq'\n"); } return $pep_seq; }else{ confess(__PACKAGE__."$method COULD NOT REPLACE THE TRAILING CUT SITE in '$pep_seq'\n"); } } sub trim_space { my $self = shift; my $seq = shift; $seq =~ s/^\s*//g; $seq =~ s/\s*$//g; return $seq; } ############################################################################### # clean_seq remove the start and finish protein aa. Remove any non aa from # from a peptide sequence ############################################################################### sub clean_seq { my $method = 'clean_seq'; my $self = shift; my $pep_seq = shift; unless($pep_seq){ confess(__PACKAGE__."$method MUST PROVIDE A PEPTIDE SEQUENCE YOU GAVE '$pep_seq'\n"); } $pep_seq =~ s/^.//; #remove first aa unless($pep_seq){ # return ''; confess(__PACKAGE__."$method PEP SEQ IS GONE'$pep_seq'\n"); } $pep_seq =~ s/.$//; #remove last aa $pep_seq =~ s/\W//g; #remove any '*' '.' '#' signs unless($pep_seq){ # return ''; confess(__PACKAGE__."$method PEP SEQ IS GONE'$pep_seq'\n"); } if($self->verbose){ print (__PACKAGE__."::$method CLEAN SEQ '$pep_seq'\n"); } if ($pep_seq =~ /\W/){ confess(__PACKAGE__."$method PEPTIDE SEQUENCE IS NOT CLEAN '$pep_seq'\n"); } return $pep_seq; } ############################################################################### #map_peptide_to_protein ############################################################################### sub map_peptide_to_protein { my $method = 'map_peptide_to_protein'; my $self = shift; my %args = @_; my $pep_seq = $args{peptide}; my $protein_seq = $args{protein_seq}; # return ( 0, 0 ) unless $args{peptide}; if ( $protein_seq =~ /$pep_seq/ ) { #add one for the starting position since we want the start of the peptide location my $start_pos = length($`) +1; # subtract 1 since we want the true end my $stop_pos = length($pep_seq) + $start_pos - 1 ; if($self->verbose){ print (__PACKAGE__."::$method $pep_seq START '$start_pos' STOP '$stop_pos'\n"); } if ($start_pos >= $stop_pos){ confess(__PACKAGE__. "::$method STOP LESS THAN START START '$start_pos' STOP '$stop_pos'\n"); } return ($start_pos, $stop_pos); }else{ print STDERR "No mapping possible: PEPTIDE '$pep_seq' DOES NOT MATCH '$protein_seq'\n"; return( 0, 0 ); confess(__PACKAGE__. "::$method PEPTIDE '$pep_seq' DOES NOT MATCH '$protein_seq'\n"); } } # # Add predicted tryptic peptide # sub insert_predicted { my $self = shift; my %args = @_; my $err; for my $opt ( qw( ipi ipi_data_id aa_seq start_idx stop_idx ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; my $match = $module->clean_pepseq( $args{aa_seq} ); if ( length($match) > 900 ) { print STDERR "Truncating long value for matching sequence\n"; $match = substr( $match, 0, 900 ); } $self->{_loaded_peptide} ||= {}; if ( $self->{_loaded_peptide}->{$args{aa_seq}} ) { # This code kept multiple instances of the same aa sequence from being loaded. # return $self->{_loaded_peptide}->{$args{aa_seq}}; } # $self->{_seq_to_id}->{$match}->{$args{ipi}}++; my $n_match = 0; my $match_str = ''; for my $k ( keys( %{$self->{_seq_to_id}->{$match}} ) ) { next unless $k; $n_match++; $match_str = ( $match_str ) ? "$match_str, $k" : $k; } my $mass = $module->calculatePeptideMass( sequence => $match); # Fixme use methyl Cys & N => D? my $detection_prob = -1; my $prot_sim_score = -1; my $rowdata = { ipi_data_id => $args{ipi_data_id}, predicted_peptide_sequence => $args{aa_seq}, predicted_peptide_mass => $mass, detection_probability => $detection_prob, n_proteins_match_peptide => $n_match, matching_protein_ids => $match_str, protein_similarity_score => $prot_sim_score, predicted_start => $args{start_idx}, predicted_stop => $args{stop_idx}, matching_sequence => $match}; my $sbeams = $self->getSBEAMS(); my $predicted_id = $sbeams->updateOrInsertRow( return_PK => 1, table_name => $TBGP_PREDICTED_PEPTIDE, rowdata_ref => $rowdata, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'predicted_peptide_id', ); $self->{_loaded_peptide}->{$args{aa_seq}} = $predicted_id; return $predicted_id; } sub whatsub { my $self = shift; my $package = shift || 0; my @call = caller(1); $call[3] =~ s/.*:+([^\:]+)$/$1/ unless $package; return $call[3]; } sub add_predicted2glycosite { my $self = shift; my %args = @_; my $err; for my $opt ( qw( ipi_data_id site predicted_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; my $key = $args{ipi_data_id} . $args{site}; my $glycosite_id = $self->{_ipi2gs}->{$key} || die "Unknown site"; my $rowdata = { glycosite_id => $glycosite_id, site_start => $args{site}, site_stop => $args{site} + 3, predicted_peptide_id => $args{predicted_id} }; my $sb = $self->getSBEAMS(); my $id = $sb->updateOrInsertRow( return_PK => 1, table_name => $TBGP_PREDICTED_TO_GLYCOSITE, rowdata_ref => $rowdata, insert => 1, PK_name => 'predicted_to_glycosite_id', ); return $id; } sub update_peptide_sequence { my $self = shift; my %args = @_; my $err; for my $opt ( qw( aa_seq predicted_id ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; my $sbeams = $self->getSBEAMS(); my $result = $sbeams->updateOrInsertRow( table_name => $TBGP_PREDICTED_PEPTIDE, rowdata_ref => { predicted_peptide_sequence => $args{aa_seq} }, update => 1, PK_name => 'predicted_peptide_id', PK_value => $args{predicted_id}, ); return $result; } # # Add predicted tryptic peptides, plus glycosite record if appropriate. # sub add_predicted_peptides { my $self = shift; my %args = @_; my $err; for my $opt ( qw( ipi ipi_data_id site_idx sequence peptides ) ) { $err = ( $err ) ? $err . ', ' . $opt : $opt if !defined $args{$opt}; } die ( "Missing required parameter(s): $err in " . $self->whatsub() ) if $err; # Index into protein my $pidx = 0; # Array of glycosite locations my $site = shift( @{$args{site_idx}} ); for my $peptide ( @{$args{peptides}} ) { # index of glycosite in peptide my $sidx = $pidx; # Increment index $pidx += (length( $peptide ) - 4); my $num_motifs = 0; my $predicted_id; # If peptide is 5 aa's or more -or- is a glycosite # my $do_insert = ( length($peptide) > 8 || ( $site && $pidx >= $site + 1 ) ) ? 1 : 0; # # If peptide is a glycosite my $do_insert = ( $site && $pidx >= $site + 1 ) ? 1 : 0; $predicted_id = $self->insert_predicted( %args, # pass through ipi_data_id, ipi accession aa_seq => $peptide, start_idx => $sidx, stop_idx => $pidx ) or die "Insert failed $!" if $do_insert; # If we are in a glyco while ( defined $site && $pidx >= $site ) { my $pepidx = ( $site - $sidx ) + 1 + $num_motifs; my $annot_pep = $peptide; substr( $annot_pep, $pepidx, 1 ) = 'N#'; # print "$peptide => $annot_pep\n"; my $aa = substr( $peptide, $pepidx , 3 ); # print "\tDataID: $args{ipi_data_id}\t$pidx\t$pepidx\t$aa\t$site\n"; $self->add_predicted2glycosite( %args, site => $site, predicted_id => $predicted_id ) if $predicted_id; # print "Added 2 for $predicted_id\n" if $num_motifs > 1; $site = shift( @{$args{site_idx}} ); # print "\t\t$pidx\t$pepidx\t$aa\t$site\n"; $peptide = $annot_pep; $num_motifs++; last if !defined $site; } if ( $predicted_id && $peptide =~ /#/ ) { $self->update_peptide_sequence( predicted_id => $predicted_id, aa_seq => $peptide ); } } } # # Add the glycosite(s) for current protein # sub add_glycosites { my $self = shift; my %args = @_; my $missing; for my $opt ( qw( protein_sequence ipi_data_id site_idx ) ) { $missing = ( $missing ) ? $missing . ', ' . $opt : $opt unless defined $args{$opt}; } die ( "Missing required parameter(s): $missing in " . $self->whatsub() ) if $missing; my $sbeams = $self->getSBEAMS(); $self->{_ipi2gs} ||= {}; for my $site ( @{$args{site_idx}} ) { my $glyco_score = -1; my $motif_context = $self->motif_context( seq => $args{protein_sequence}, site => $site ); my $rowdata = { protein_glycosite_position => $site, site_context => $motif_context, glyco_score => $glyco_score, # Fixme ipi_data_id => $args{ipi_data_id} }; my $pk = $sbeams->updateOrInsertRow( table_name => $TBGP_GLYCOSITE, rowdata_ref => $rowdata, return_PK => 1, verbose => $self->verbose(), testonly => $self->testonly(), insert => 1, PK => 'glycosite_id' ) || die "putresence"; $self->{_ipi2gs}->{$args{ipi_data_id} . $site} = $pk; } print return 1; } sub motif_context { my $self = shift; my %args = @_; my $missing; for my $opt ( qw( seq site ) ) { $missing = ( $missing ) ? $missing . ', ' . $opt : $opt unless defined $args{$opt}; } die ( "Missing required parameter(s): $missing in " . $self->whatsub() ) if $missing; my $lpad = 0; my $rpad = 0; my $beg = $args{site} - 5; my $end = 13; my $len = length( $args{seq} ); if ( $args{site} <= 5 ) { $lpad = 5 - $args{site}; $beg = 0; $end -= $lpad; } if ( $args{site} + 8 > $len ) { $rpad = ($args{site} + 8) - $len; $end -= $rpad; } my $context = substr( $args{seq}, $beg, $end ); $context = '-' x $lpad . $context . '-' x $rpad; my $slen = length($context); return $context; } ############################################################################### #Add the glycosite for this row ############################################################################### sub add_glyco_site_old { my $method = 'add_glyco_site'; my $self = shift; my $row = shift; my %heads = %{$self->{_heads}}; my $ipi_id = $row->[$heads{'ipi'}]; my %rowdata_h = ( protein_glycosite_position => $row->[$heads{'nxt/s location'}], glyco_score =>$row->[$heads{'nxt/s score'}], ipi_data_id => $self->get_ipi_data_id( $ipi_id ), ); my $rowdata_ref = \%rowdata_h; my $sbeams = $self->getSBEAMS(); my $glyco_site_id = $sbeams->updateOrInsertRow( table_name=>$TBGP_GLYCOSITE, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'glyco_site_id', ); if ($self->verbose()>0){ print (__PACKAGE__."::$method Added GLYCOSITE pk '$glyco_site_id'\n"); } return $glyco_site_id; } ############################################################################### #Add the main info for an ipi record ############################################################################### sub add_ipi_record { my $self = shift; my $row = shift; die 'Did not pass data ref' unless ( $row && ref($row) =~/ARRAY/ ); my $sbeams = $self->getSBEAMS(); my %heads = %{$self->{_heads}}; my $cellular_location_id = $self->find_cellular_location_id($row->[$heads{'protein location'}]); my $ipi_id = $row->[$heads{'ipi'}]; my $ipi_version_id = $self->ipi_version_id(); my %rowdata_h = ( ipi_version_id => $ipi_version_id, ipi_accession_number =>$ipi_id, protein_name =>$row->[$heads{'protein name'}], protein_symbol =>$row->[$heads{'protein symbol'}], swiss_prot_acc =>$row->[$heads{'swiss-prot'}], cellular_location_id =>$cellular_location_id, transmembrane_info =>$row->[$heads{'tm location'}], signal_sequence_info =>$row->[$heads{'signalp'}], synonyms => $row->[$heads{'synonyms'}], ); %rowdata_h = $self->truncate_data(record_href => \%rowdata_h); #some of the data will need to truncated to make it easy to put all data in varchar 255 or less ##Add in the big columns that should not be truncated $rowdata_h{protein_sequence} = $row->[$heads{'protein sequences'}]; $rowdata_h{protein_summary} = $row->[$heads{'summary'}]; my $rowdata_ref = \%rowdata_h; my $ipi_data_id = $sbeams->updateOrInsertRow( table_name=>$TBGP_IPI_DATA, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'ipi_data_id', ); $self->{All_records}{$ipi_id} = {ipi_data_id => $ipi_data_id}; return $ipi_data_id; } ############################################################################### #get ipi_data_id #given a ipi_accession_number #return id if present #die otherwise ############################################################################### sub get_ipi_data_id{ my $method = 'get_ipi_data-id'; my $self = shift; my $ipi_acc = shift; if (exists $self->{All_records}{$ipi_acc}){ return $self->{All_records}{$ipi_acc}{ipi_data_id}; }else{ confess(__PACKAGE__. "::$method COULD NOT FIND ID '$ipi_acc'\n"); } } ############################################################################### #Given the name of a tissue look return a tissue id ############################################################################### sub find_tissue_id { my $self = shift; my $tissue_name = shift; my $code = ''; if ($self->tissue_code_id($tissue_name)){ #print "I SEE THE CODE **\n"; return $self->tissue_code_id($tissue_name); }else{ return $self->find_tissue_code($tissue_name); } } ############################################################################### #Get/Set the tissue code_id ############################################################################### sub tissue_code_id { my $self = shift; if (@_){ #it's a setter $self->{_TISSUE_NAMES}{$_[0]} = $_[1]; }else{ #it's a getter $self->{_CELLULAR_NAMES}{$_[0]}; } } ############################################################################## #Query the database for the tissue code ############################################################################### sub find_tissue_code { my $method = 'find_tissue_code'; my $self = shift; my $tissue = shift; my $sbeams = $self->getSBEAMS(); my $tissue_name = ( $tissue =~ /serum/i ) ? 'serum' : ( $tissue =~ /prostate/i ) ? 'prostate' : ( $tissue =~ /ovary/i ) ? 'ovary' : ( $tissue =~ /breast/i ) ? 'breast' : 'unknown'; my $sql = qq~ SELECT tissue_id FROM $TBGP_TISSUE_TYPE WHERE tissue_name = '$tissue_name' ~; my ($id) = $sbeams->selectOneColumn($sql); if ($self->verbose){ print __PACKAGE__. "::$method FOUND TISSUE ID '$id' FOR TISSUE '$tissue_name'\n"; } unless ($id) { confess(__PACKAGE__ ."::$method CANNOT FIND ID FOR FOR TISSUE '$tissue_name'\n"); } $self->tissue_code_id($tissue_name, $id); return $id; } ############################################################################### #If the ipi_protein has been seen return 0 otherwise retrun 1 ############################################################################### sub find_cellular_location_id{ my $self = shift; my $cellular_code = shift; my $code = ''; if ($self->cellular_code_id($cellular_code)){ return $self->cellular_code_id($cellular_code); }else{ return $self->find_cellular_code($cellular_code); } } ############################################################################### #Convert the cellular code to and a name and find it in the database ############################################################################### sub find_cellular_code { my $method = 'find_cellular_code'; my $self = shift; my $code = shift; my $sbeams = $self->getSBEAMS(); # Lets not look it up every single time, eh? my $cached = $self->cellular_code_id( $code ); return $cached if $cached; my $full_name = ''; if ($code eq 'S'){ $full_name = 'Secreted'; }elsif($code eq 'TM'){ $full_name = 'Transmembrane'; }elsif($code eq 'A'){ $full_name = 'Anchor'; }elsif($code eq '0'){ $full_name = 'Intracellular'; }elsif($code eq 'A_low' ){ $full_name = 'Anchor'; }elsif( !$code ){ # FIXME $full_name = 'Cytoplasmic'; } else { die "Unknown cellular code $code\n"; } my $sql =<<" END"; SELECT cellular_location_id FROM $TBGP_CELLULAR_LOCATION WHERE cellular_location_name = '$full_name' END print $sbeams->evalSQL( $sql ); my ($id) = $sbeams->selectOneColumn($sql); if ( $full_name ) { unless ($id) { die "DB lookup failed for cellular code $code ($full_name)\n"; } } $self->cellular_code_id($code, $id); return $id; } ############################################################################### #Get/Set the cellular code_id cellular_code ############################################################################### sub cellular_code_id { my $self = shift; my ( $code, $id ) = @_; if ( defined $id ){ #it's a setter print "Code $code is getting set to $id\n" if $self->verbose(); $self->{_CELLULAR_CODES}{$code} = $id; } return $self->{_CELLULAR_CODES}{$code}; } ############################################################################### #If the ipi_protein has been seen return 0 otherwise retrun 1 ############################################################################### sub check_ipi { my $method = 'check_ipi'; my $self = shift; my $ipi_id = shift; # confess(__PACKAGE__ . "::$method Need to provide IPI id '$ipi_id' is not good \n")unless $ipi_id =~ /^IPI/; if (exists $self->{All_records}{$ipi_id} ){ return 1; }else{ return 0; } } ############################################################################### #check_version ############################################################################### sub check_version { my $method = 'check_version'; my $self = shift; my %args = @_; my $sbeams = $self->getSBEAMS(); my $sql = qq~ SELECT ipi_version_id FROM $TBGP_IPI_VERSION WHERE ipi_version_name = '$args{release}' ~; my ($id) = $sbeams->selectOneColumn($sql); # print STDERR "$sql Found matching version $id\n"; if ($id){ die "Version $args{version} already exists, quitting\n"; } return 1; } ############################################################################### #add_new_ipi_version/set ipi_version_id ############################################################################### sub add_ipi_version{ my $self = shift; my %args = @_; my $file = $self->getfile(); my $file_name = basename($file); my $sbeams = $self->getSBEAMS(); my $st = stat($file); my $mod_time_string = strftime "%F %H:%M:%S.00", localtime($st->mtime); my $release = $self->{_release} || $file; my $orgID = $sbeams->get_organism_id( organism => $args{organism} ); die "Unable to find organism $args{organism} in the database" unless $orgID; my $is_default = ( $args{default} ) ? 1 : 0; # FIXME Add to schema # ipi_file_name => $file, my %rowdata_h = ( ipi_version_name => $args{release}, ipi_version_date => $mod_time_string, ipi_version_file => $file_name, organism_id => $orgID, is_default => $is_default, comment => $args{comment}, ); my $ipi_version_id = $sbeams->updateOrInsertRow( table_name=>$TBGP_IPI_VERSION, rowdata_ref=> \%rowdata_h, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'ipi_version_id', add_audit_parameters => 1, ); return $self->ipi_version_id($ipi_version_id); } ############################################################################### #get/set ipi_version_id #return the ipi_version_id in either case ############################################################################### sub ipi_version_id { my $self = shift; if (@_){ #it's a setter $self->{_IPI_VERSION_ID} = $_[0]; return $_[0]; }else{ #it's a getter return $self->{_IPI_VERSION_ID}; } } ############################################################################### #Column headers ############################################################################### sub column_headers { my $self = shift; my $line_aref = shift; my %headers = (); my $count = 0; foreach my $name (@{$line_aref}){ $headers{$count} = $name; $count ++; } return %headers; } ############################################################################### #truncate_data #used to truncate any long fields. Will truncate everything in a hash or a single value to 254 char. Also will #write out to the error log if any extra fields are truncated ############################################################################### sub truncate_data { my $method = 'truncate_data'; my $self = shift; my %args = @_; my $record_href = $args{record_href}; my $data_aref = $args{data_aref}; confess(__PACKAGE__ . "::$method Need to provide key value pair 'record_href' OR 'data_aref'\n") unless ( ref($record_href) eq 'HASH' || ref($data_aref) eq 'ARRAY' ); my %record_h = (); my @data = (); if ($record_href){ %record_h = %{$record_href}; foreach my $key ( keys %record_h){ if (length $record_h{$key} > 255){ my $big_val = $record_h{$key}; $record_h{$key} = substr($record_h{$key}, 0, 255); $self->anno_error(error => 'trunc'); } } return %record_h; }elsif($data_aref){ @data = @$data_aref; for(my $i=0; $i<=$#data; $i++){ if (length $data[$i] > 255){ my $big_val = $data[$i]; $data[$i] = substr($data[$i], 0, 255); $self->anno_error(error => 'trunc'); } } return @data; }else{ die "Unknown DATA TYPE FOR $method\n"; } } ############################################################################## # anno_error ############################################################################### sub anno_error { my $self = shift; my %args = @_; $self->{_anno_error} ||= { trunc => 0, }; if ( $args{error} ) { $self->{_anno_error}->{$args{error}}++; return; } my $errstr = 'No errors reported'; if ( $self->{_anno_error}->{trunc} ) { $errstr .= "Warning: $self->{_anno_error}->{trunc} values were truncated"; } return $errstr; } }#closing bracket for the package 1;