#!/usr/local/bin/perl ############################################################################### # $Id: peptideSearch.cgi 4670 2006-04-22 01:54:05Z dcampbel $ # # SBEAMS is Copyright (C) 2000-2005 Institute for Systems Biology # This program is governed by the terms of the GNU General Public License (GPL) # version 2 as published by the Free Software Foundation. It is provided # WITHOUT ANY WARRANTY. See the full description of GPL terms in the # LICENSE file distributed with this software. ############################################################################### ############################################################################### # Get the script set up with everything it will need ############################################################################### use strict; use vars qw ($q $sbeams $sbeamsMOD $PROG_NAME $current_contact_id $current_username $glyco_query_o); use lib qw (../../lib/perl); use CGI::Carp qw(fatalsToBrowser croak); use Data::Dumper; use SBEAMS::Connection qw($q $log); use SBEAMS::Connection::Settings; use SBEAMS::Connection::Tables; use SBEAMS::Connection::DataTable; use SBEAMS::BioLink::Tables; use SBEAMS::BioLink::KeggMaps; use SBEAMS::BioLink::MSF; use SBEAMS::Glycopeptide; use SBEAMS::Glycopeptide::Settings; use SBEAMS::Glycopeptide::Tables; use SBEAMS::Glycopeptide::Get_glyco_seqs; use SBEAMS::Glycopeptide::Glyco_query; ############################################################################### # Global Variables ############################################################################### # $sbeams = new SBEAMS::Connection; $sbeamsMOD = new SBEAMS::Glycopeptide; $sbeamsMOD->setSBEAMS($sbeams); my $kegg = new SBEAMS::BioLink::KeggMaps; $glyco_query_o = new SBEAMS::Glycopeptide::Glyco_query; $glyco_query_o->setSBEAMS($sbeams); $sbeams->setSBEAMS_SUBDIR($SBEAMS_SUBDIR); my $base_url = "$CGI_BASE_DIR/$SBEAMS_SUBDIR/massSearch"; { # Main # Authenticate or exit exit unless ($current_username = $sbeams->Authenticate( # permitted_work_groups_ref=>['Glycopeptide_user','Glycopeptide_admin', 'Glycopeptide_readonly'], # connect_read_only=>1, allow_anonymous_access=>1, )); #### Read in the default input parameters my %params; $sbeams->parse_input_parameters( q=>$q, parameters_ref=>\%params ); $sbeams->processStandardParameters(parameters_ref=>\%params); if ( $params{unipep_build_id} ) { my $build_id = $sbeamsMOD->get_current_build( build_id => $params{unipep_build_id} ); if ( $build_id != $params{unipep_build_id} ) { $sbeams->set_page_message( type => 'Error', msg => 'You must log in to access specified build' ); } } ## get project_id to send to HTMLPrinter display my $project_id = $sbeams->getCurrent_project_id(); my $page = $sbeams->getGifSpacer( 800 ) . "
\n"; if ( $params{ipi_data_id} ) { $page .= get_ortholog_display( %params ); } elsif ( $params{list_groups} ) { $page .= get_ortholog_list( %params ); } else { $page .= $sbeams->makeErrorText( "Missing required parameter ipi_data_id" ); } # Display page $sbeamsMOD->display_page_header(project_id => $project_id); print $sbeamsMOD->get_phospho_css(); print $page; $sbeamsMOD->display_page_footer(); } # end main sub get_ortholog_display { my %args = @_; my $content; for my $arg ( qw( ipi_data_id ) ) { unless ( $args{$arg} ) { $content .= $sbeams->makeErrorText("Missing required parameter: $arg
"); return $content; } } # Set up display table my $table = SBEAMS::Connection::DataTable->new( BORDER => 0 ); $table->addRow( ['Ortholog group', 'Accession','Protein Name', 'Organism name', '# Phospho-peptides', '# Peptides' ] ); $table->setRowAttr( ROWS => [1], BGCOLOR => '#C0D0C0' ); $table->setHeaderAttr( BOLD => 1 ); my $all_builds = $sbeamsMOD->getPhosphopepBuilds( as_string => 1 ); # SQL to fetch orthologs my $sql =<<" END_SQL"; SELECT ortholog_group, ipi_accession_number, protein_name, organism_name, I.ipi_data_id, observed_peptide_sequence, ORG.organism_id, protein_sequence FROM $TBBL_ORTHOLOG O JOIN $TBGP_ORTHOLOG_TO_IPI OTI ON O.ortholog_id = OTI.ortholog_id JOIN $TBGP_IPI_DATA I ON I.ipi_data_id = OTI.ipi_data_id LEFT JOIN $TBGP_OBSERVED_TO_IPI OTIPI ON OTIPI.ipi_data_id = OTI.ipi_data_id LEFT JOIN $TBGP_OBSERVED_PEPTIDE OP ON OTIPI.observed_peptide_id = OP.observed_peptide_id LEFT JOIN $TBGP_BUILD_TO_SEARCH BTS ON OP.peptide_search_id = BTS.search_id JOIN $TB_ORGANISM ORG ON O.ortholog_organism_id = ORG.organism_id WHERE ortholog_group IN ( SELECT DISTINCT ortholog_group FROM $TBBL_ORTHOLOG O JOIN $TBGP_ORTHOLOG_TO_IPI OTI ON O.ortholog_id = OTI.ortholog_id WHERE ipi_data_id = $args{ipi_data_id} ) AND ( build_id IN ( $all_builds ) OR build_id IS NULL ) -- GROUP BY ortholog_group, ipi_accession_number, protein_name, organism_name, I.ipi_data_id, ORG.organism_id, CAST( protein_sequence AS VARCHAR(4000) ) ORDER BY organism_name, ipi_accession_number END_SQL my $org_hash = get_org_hash(); for my $o ( keys( %{$org_hash} ) ) { $log->debug( "org $o has id $org_hash->{$o}" ); } my $current; my %subject; my $bgcolor = '#F1F1F1'; my $sth = $sbeams->get_statement_handle( $sql ); my $fasta = ''; my %prot_data; my @prot_order; while ( my @row = $sth->fetchrow_array() ) { if ( !$prot_data{$row[1]} ) { push @prot_order, $row[1]; } $log->debug( "org id is $row[6], name is $row[3]" ); $prot_data{$row[1]} ||= {}; $prot_data{$row[1]}->{peps} ||= []; push @{$prot_data{$row[1]}->{peps}}, $row[5]; $log->debug( "prot_data for $row[1] getting pushed, org_id is $row[6]" ); $prot_data{$row[1]}->{row} = \@row; next unless $row[5]; $prot_data{$row[1]}->{pcnt}++; $prot_data{$row[1]}->{ppcnt}++ if $row[5] =~ /\*|\&/; } # Will populate the $prot_data->{accession}->{sites} hashref, site => type # and the $prot_data->{allsites} hash... get_phospho_sites( \%prot_data ); my %bgcolor; for my $prot ( @prot_order ) { # my $curr_prot = $prot_data{$prot}; my @row = @{$prot_data{$prot}->{row}}; $fasta .= ">$row[1]\n$row[7]\n"; $prot_data{$prot}->{ppcnt} ||= 0; $prot_data{$prot}->{pcnt} ||= 0; # Fixing bug, why on earth were 5 and 6 overwritten! my $org_id = $row[6]; $row[5] = $prot_data{$prot}->{pcnt}; $row[6] = $prot_data{$prot}->{ppcnt}; my $original_name = $row[1]; if ( $prot_data{$prot}->{pcnt} ) { $row[1] = " $row[1] "; } $table->addRow( [@row[0..3,5,6]] ); $current ||= $row[3]; if ( $current ne $row[3] ) { # New organism $bgcolor = ( $bgcolor eq '#E0E0E0' ) ? '#F1F1F1' : '#E0E0E0'; $current = $row[3]; } if ( $row[4] == $args{ipi_data_id} ) { $subject{name} = $row[2]; $subject{acc} = $row[1]; $table->setRowAttr( ROWS => [$table->getRowNum()], BGCOLOR => '#FFFACD' ); $bgcolor{$original_name} = '#FFFACD'; } else { $table->setRowAttr( ROWS => [$table->getRowNum()], BGCOLOR => $bgcolor ); $bgcolor{$original_name} = $bgcolor; } } if ( $table->getRowNum() == 1 ) { return $sbeams->makeInfoText( "Unable to find an OrthoMCL group for this protein" ); } $table->setColAttr( COLS => [5,6], ROWS => [1..$table->getRowNum()], ALIGN => 'RIGHT' ); # $table->setColAttr( COLS => [2], ROWS => [1..3], ALIGN => 'LEFT' ); my $MSF = SBEAMS::BioLink::MSF->new(); my $clustal = $MSF->runClustalW( sequences => $fasta ); my $clustal_display = get_clustal_display( alignments => $clustal, data => \%prot_data, bgcolor => \%bgcolor ); my $alignment_text = get_alignment_text(); my $spc = ' ' x 50; return( <<" END" );

The table below shows ortholog/homolog information about the target protein, $subject{name} ($subject{acc}), which is highlighted in yellow. The ortholog groups were computed using OrthoMCL version 2, based on BLAST homology.

$table

Ortholog Sequence Alignment

$spc Legend: X: Confident phosphorylation site assignment X: Ambiguous phosphorylation site assignment

$clustal_display

$alignment_text
END } sub get_alignment_text { my $content =<<" END";

Alignment was created by the clustalw program, which is maintained at the Conway Institute UCD Dublin.  
The alignment considers physical properties of the amino acids in the protein sequence, and the 
program was invoked using the following command-line:

clustalw -tree -align -outorder=input -infile=clustal_file`;

The consensus 'sequence' uses symbols to represent the level of conservation of amino acids at any given 
position.  The text below, adapted from the clustal documentation, describes the various symbols used.

    CONSENSUS SYMBOLS:

       An alignment will display by default the following symbols denoting the degree of conservation observed in each column:

      "*" means that the residues or nucleotides in that column are identical in all sequences in the alignment.

      ":" means that conserved substitutions have been observed

      "." means that semi-conserved substitutions are observed. 

      " " means that substitutions are not conservative.

END my @toggle = $sbeams->make_toggle_section( textlink => 1, hidetext => "Less Info", showtext => "More Info", content => $content ); return "Alignment created using ClustalW ( $toggle[1] )
$toggle[0] "; } sub get_phospho_sites { my $prot_data = shift; # $prot_data{$row[1]} ||= {}; # $prot_data{$row[1]}->{peps} ||= []; # push @{$prot_data{$row[1]}->{peps}}, $row[5]; # $prot_data{$row[1]}->{row} = \@row; # next unless $row[5]; # $prot_data{$row[1]}->{pcnt}++; # $prot_data{$row[1]}->{ppcnt}++ if $row[5] =~ /\*|\&/; # } for my $acc ( keys( %$prot_data ) ) { $prot_data->{$acc}->{sites} ||= {}; $prot_data->{allsites} ||= {}; # $prot_data->{$acc}->{conf} ||= []; my $prot_seq = $prot_data->{$acc}->{row}->[7]; for my $pep ( @{$prot_data->{$acc}->{peps}} ) { # $log->debug( "pep is $pep, prot is $prot_seq" ); # $pep = 'AAAS&FFFT*CCY*F'; # $prot_seq = 'XXXXXAAASFFFTCCYFXXXXXXXXXXXXXXXXXAAASFFFTCCYFXXXXXXXXXXXXXXXX'; # $log->debug( "pep is $pep, prot is $prot_seq" ); next unless $pep =~ /\*|\&/; # posns of ambi phos in protein my $ambi = $sbeamsMOD->get_site_positions( seq => $pep, pattern => '\&' ); # posns of conf phos in protein my $conf = $sbeamsMOD->get_site_positions( seq => $pep, pattern => '\*' ); my $clean_pep = $pep; $clean_pep =~ s/\*//g; $clean_pep =~ s/\&//g; my $prot_posns = $sbeamsMOD->get_site_positions( seq => $prot_seq, pattern => $clean_pep ); # Set up so that conf overrides ambi my %conf; for my $c ( @$conf ) { $conf{$c}++; } my $decr = 1; # first, loop over phosphorylation sites for my $p ( sort { $a <=> $b } @$conf, @$ambi ) { my $adj = $p - $decr; my $phosAA = substr( $clean_pep, $adj, 1 ); my $type = ( $conf{$p} ) ? 'phosnobo' : 'darkambinobo'; # $log->debug( "posn is $p, adj posn is $adj, phosphoAA is $phosAA, type is $type for seq $pep, cleaned to $clean_pep" ); # next, loop over peptide sites in protein for my $prot_posn( sort {$a <=> $b} @$prot_posns ) { my $adj_prot = $adj + $prot_posn; my $prot_phosAA = substr( $prot_seq, $adj_prot, 1 ); # push @{$prot_data->{$acc}->{ambi}}; $prot_data->{$acc}->{sites}->{$adj_prot} = $type; $prot_data->{allsites}->{$adj_prot}++; # $log->debug( "Added site $adj_prot for protein $acc, type is $type" ); } $decr++; } } } } sub get_org_hash { my $name2org = $kegg->getKeggNameToOrganism(); my %org2name; @org2name{values(%{$name2org})} = keys( %{$name2org} ); return \%org2name; } sub get_clustal_display { my %args = @_; my $display = qq~

~; for my $seq ( @{$args{alignments}} ) { # $log->debug( "After passing we have $seq->[0] and $seq->[1]!" ); my $sequence = $seq->[1]; if ( $seq->[0] eq 'consensus' ) { # $log->debug( "Consensus!@!!!!!" ); $sequence =~ s/ / /g } else { # $log->debug( "$seq->[0] isn't consensus, commence to highlighting $sequence" ); $sequence = highlight_sites( seq => $sequence, sites => $args{data}->{$seq->[0]}->{sites}, allsites => $args{data}->{allsites}); # my $clustal_display = get_clustal_display( alignments => $clustal, data => \%prot_data ); } $display .= "\n"; } $display .= "

$seq->[0]:

{$seq->[0]}>$sequence

\n"; return $display; } sub highlight_sites { my %args = @_; my @aa = split( '', $args{seq} ); my $cnt = 0; my $return_seq; # for my $s ( sort { $a <=> $b } keys( %{$args{sites}} )) { $log->debug( "Got a site at $s!" ); } for my $aa ( @aa ) { # $log->debug( "Consider $aa" ); if ( $aa =~ /\w/ ) { # $log->debug( " $aa is a letter" ); if ( $args{sites}->{$cnt} ) { # $log->debug( " $aa has a site at $cnt!" ); $return_seq .= "{$cnt}>$aa"; # $return_seq .= "$aa"; # } elsif ( $args{allsites}->{$cnt} ) { # $return_seq .= "$aa"; } else { $return_seq .= $aa; } # Increment counter on aa, not on - # $log->debug( "count is $cnt, is there no site here?" ); $cnt++; } else { $return_seq .= $aa; } } return $return_seq; }