#!/usr/local/bin/perl
###############################################################################
# $Id: peptideSearch.cgi 4670 2006-04-22 01:54:05Z dcampbel $
#
# SBEAMS is Copyright (C) 2000-2005 Institute for Systems Biology
# This program is governed by the terms of the GNU General Public License (GPL)
# version 2 as published by the Free Software Foundation. It is provided
# WITHOUT ANY WARRANTY. See the full description of GPL terms in the
# LICENSE file distributed with this software.
###############################################################################
###############################################################################
# Get the script set up with everything it will need
###############################################################################
use strict;
use vars qw ($q $sbeams $sbeamsMOD $PROG_NAME
$current_contact_id $current_username $glyco_query_o);
use lib qw (../../lib/perl);
use CGI::Carp qw(fatalsToBrowser croak);
use Data::Dumper;
use SBEAMS::Connection qw($q $log);
use SBEAMS::Connection::Settings;
use SBEAMS::Connection::Tables;
use SBEAMS::Connection::DataTable;
use SBEAMS::BioLink::Tables;
use SBEAMS::BioLink::KeggMaps;
use SBEAMS::BioLink::MSF;
use SBEAMS::Glycopeptide;
use SBEAMS::Glycopeptide::Settings;
use SBEAMS::Glycopeptide::Tables;
use SBEAMS::Glycopeptide::Get_glyco_seqs;
use SBEAMS::Glycopeptide::Glyco_query;
###############################################################################
# Global Variables
###############################################################################
#
$sbeams = new SBEAMS::Connection;
$sbeamsMOD = new SBEAMS::Glycopeptide;
$sbeamsMOD->setSBEAMS($sbeams);
my $kegg = new SBEAMS::BioLink::KeggMaps;
$glyco_query_o = new SBEAMS::Glycopeptide::Glyco_query;
$glyco_query_o->setSBEAMS($sbeams);
$sbeams->setSBEAMS_SUBDIR($SBEAMS_SUBDIR);
my $base_url = "$CGI_BASE_DIR/$SBEAMS_SUBDIR/massSearch";
{ # Main
# Authenticate or exit
exit unless ($current_username = $sbeams->Authenticate(
# permitted_work_groups_ref=>['Glycopeptide_user','Glycopeptide_admin', 'Glycopeptide_readonly'],
# connect_read_only=>1,
allow_anonymous_access=>1,
));
#### Read in the default input parameters
my %params;
$sbeams->parse_input_parameters( q=>$q, parameters_ref=>\%params );
$sbeams->processStandardParameters(parameters_ref=>\%params);
if ( $params{unipep_build_id} ) {
my $build_id = $sbeamsMOD->get_current_build( build_id => $params{unipep_build_id} );
if ( $build_id != $params{unipep_build_id} ) {
$sbeams->set_page_message( type => 'Error', msg => 'You must log in to access specified build' );
}
}
## get project_id to send to HTMLPrinter display
my $project_id = $sbeams->getCurrent_project_id();
my $page = $sbeams->getGifSpacer( 800 ) . "
\n";
if ( $params{ipi_data_id} ) {
$page .= get_ortholog_display( %params );
} elsif ( $params{list_groups} ) {
$page .= get_ortholog_list( %params );
} else {
$page .= $sbeams->makeErrorText( "Missing required parameter ipi_data_id" );
}
# Display page
$sbeamsMOD->display_page_header(project_id => $project_id);
print $sbeamsMOD->get_phospho_css();
print $page;
$sbeamsMOD->display_page_footer();
} # end main
sub get_ortholog_display {
my %args = @_;
my $content;
for my $arg ( qw( ipi_data_id ) ) {
unless ( $args{$arg} ) {
$content .= $sbeams->makeErrorText("Missing required parameter: $arg
");
return $content;
}
}
# Set up display table
my $table = SBEAMS::Connection::DataTable->new( BORDER => 0 );
$table->addRow( ['Ortholog group', 'Accession','Protein Name', 'Organism name', '# Phospho-peptides', '# Peptides' ] );
$table->setRowAttr( ROWS => [1], BGCOLOR => '#C0D0C0' );
$table->setHeaderAttr( BOLD => 1 );
my $all_builds = $sbeamsMOD->getPhosphopepBuilds( as_string => 1 );
# SQL to fetch orthologs
my $sql =<<" END_SQL";
SELECT ortholog_group, ipi_accession_number, protein_name,
organism_name, I.ipi_data_id, observed_peptide_sequence, ORG.organism_id,
protein_sequence
FROM $TBBL_ORTHOLOG O
JOIN $TBGP_ORTHOLOG_TO_IPI OTI ON O.ortholog_id = OTI.ortholog_id
JOIN $TBGP_IPI_DATA I ON I.ipi_data_id = OTI.ipi_data_id
LEFT JOIN $TBGP_OBSERVED_TO_IPI OTIPI ON OTIPI.ipi_data_id = OTI.ipi_data_id
LEFT JOIN $TBGP_OBSERVED_PEPTIDE OP ON OTIPI.observed_peptide_id = OP.observed_peptide_id
LEFT JOIN $TBGP_BUILD_TO_SEARCH BTS ON OP.peptide_search_id = BTS.search_id
JOIN $TB_ORGANISM ORG ON O.ortholog_organism_id = ORG.organism_id
WHERE ortholog_group IN (
SELECT DISTINCT ortholog_group
FROM $TBBL_ORTHOLOG O
JOIN $TBGP_ORTHOLOG_TO_IPI OTI ON O.ortholog_id = OTI.ortholog_id
WHERE ipi_data_id = $args{ipi_data_id} )
AND ( build_id IN ( $all_builds ) OR build_id IS NULL )
-- GROUP BY ortholog_group, ipi_accession_number, protein_name, organism_name, I.ipi_data_id, ORG.organism_id, CAST( protein_sequence AS VARCHAR(4000) )
ORDER BY organism_name, ipi_accession_number
END_SQL
my $org_hash = get_org_hash();
for my $o ( keys( %{$org_hash} ) ) {
$log->debug( "org $o has id $org_hash->{$o}" );
}
my $current;
my %subject;
my $bgcolor = '#F1F1F1';
my $sth = $sbeams->get_statement_handle( $sql );
my $fasta = '';
my %prot_data;
my @prot_order;
while ( my @row = $sth->fetchrow_array() ) {
if ( !$prot_data{$row[1]} ) {
push @prot_order, $row[1];
}
$log->debug( "org id is $row[6], name is $row[3]" );
$prot_data{$row[1]} ||= {};
$prot_data{$row[1]}->{peps} ||= [];
push @{$prot_data{$row[1]}->{peps}}, $row[5];
$log->debug( "prot_data for $row[1] getting pushed, org_id is $row[6]" );
$prot_data{$row[1]}->{row} = \@row;
next unless $row[5];
$prot_data{$row[1]}->{pcnt}++;
$prot_data{$row[1]}->{ppcnt}++ if $row[5] =~ /\*|\&/;
}
# Will populate the $prot_data->{accession}->{sites} hashref, site => type
# and the $prot_data->{allsites} hash...
get_phospho_sites( \%prot_data );
my %bgcolor;
for my $prot ( @prot_order ) {
# my $curr_prot = $prot_data{$prot};
my @row = @{$prot_data{$prot}->{row}};
$fasta .= ">$row[1]\n$row[7]\n";
$prot_data{$prot}->{ppcnt} ||= 0;
$prot_data{$prot}->{pcnt} ||= 0;
# Fixing bug, why on earth were 5 and 6 overwritten!
my $org_id = $row[6];
$row[5] = $prot_data{$prot}->{pcnt};
$row[6] = $prot_data{$prot}->{ppcnt};
my $original_name = $row[1];
if ( $prot_data{$prot}->{pcnt} ) {
$row[1] = " $row[1] ";
}
$table->addRow( [@row[0..3,5,6]] );
$current ||= $row[3];
if ( $current ne $row[3] ) { # New organism
$bgcolor = ( $bgcolor eq '#E0E0E0' ) ? '#F1F1F1' : '#E0E0E0';
$current = $row[3];
}
if ( $row[4] == $args{ipi_data_id} ) {
$subject{name} = $row[2];
$subject{acc} = $row[1];
$table->setRowAttr( ROWS => [$table->getRowNum()], BGCOLOR => '#FFFACD' );
$bgcolor{$original_name} = '#FFFACD';
} else {
$table->setRowAttr( ROWS => [$table->getRowNum()], BGCOLOR => $bgcolor );
$bgcolor{$original_name} = $bgcolor;
}
}
if ( $table->getRowNum() == 1 ) {
return $sbeams->makeInfoText( "Unable to find an OrthoMCL group for this protein" );
}
$table->setColAttr( COLS => [5,6], ROWS => [1..$table->getRowNum()], ALIGN => 'RIGHT' );
# $table->setColAttr( COLS => [2], ROWS => [1..3], ALIGN => 'LEFT' );
my $MSF = SBEAMS::BioLink::MSF->new();
my $clustal = $MSF->runClustalW( sequences => $fasta );
my $clustal_display = get_clustal_display( alignments => $clustal, data => \%prot_data, bgcolor => \%bgcolor );
my $alignment_text = get_alignment_text();
my $spc = ' ' x 50;
return( <<" END" );
The table below shows ortholog/homolog information about the target protein, $subject{name} ($subject{acc}), which is highlighted in yellow. The ortholog groups were computed using OrthoMCL version 2, based on BLAST homology.
$table| $spc Legend: X: Confident phosphorylation site assignment X: Ambiguous phosphorylation site assignment |
Alignment was created by the clustalw program, which is maintained at the Conway Institute UCD Dublin.
The alignment considers physical properties of the amino acids in the protein sequence, and the
program was invoked using the following command-line:
clustalw -tree -align -outorder=input -infile=clustal_file`;
The consensus 'sequence' uses symbols to represent the level of conservation of amino acids at any given
position. The text below, adapted from the clustal documentation, describes the various symbols used.
CONSENSUS SYMBOLS:
An alignment will display by default the following symbols denoting the degree of conservation observed in each column:
"*" means that the residues or nucleotides in that column are identical in all sequences in the alignment.
":" means that conserved substitutions have been observed
"." means that semi-conserved substitutions are observed.
" " means that substitutions are not conservative.
| $seq->[0]: | {$seq->[0]}>$sequence |