###############################################################################
# $Id: KeggMaps.pm $
#
# Description : Module to support fetch and display gene expression on Kegg pathway maps.
#
# SBEAMS is Copyright (C) 2000-2006 Institute for Systems Biology
# This program is governed by the terms of the GNU General Public License (GPL)
# version 2 as published by the Free Software Foundation. It is provided
# WITHOUT ANY WARRANTY. See the full description of GPL terms in the
# LICENSE file distributed with this software.
#
###############################################################################
# package provides interface to various functionality for KEGG pathway maps
package SBEAMS::BioLink::KeggMaps;
use strict;
#require Exporter;
#our @ISA = qw( Exporter );
#our @EXPORT_OK = qw();
use LWP::UserAgent;
use SOAP::Lite;
use lib "../..";
use SBEAMS::Connection qw( $log );
use SBEAMS::Connection::Settings;
use SBEAMS::Connection::Tables;
use SBEAMS::BioLink;
use SBEAMS::BioLink::KGMLParser;
#use SBEAMS::BioLink::Settings;
#use SBEAMS::BioLink::TableInfo;
use SBEAMS::BioLink::Tables;
use lib qw (../../lib/perl);
### Globals ###
my $sbeams = new SBEAMS::Connection;
$sbeams->setRaiseError(1);
my $sbeamsMOD = new SBEAMS::BioLink;
$sbeamsMOD->setSBEAMS($sbeams);
### Class Methods ###
#+
# Constructor
#
#-
sub new {
my $class = shift;
my $this = { _wsdl => 'http://soap.genome.jp/KEGG.wsdl',
@_
};
# Objectification.
bless $this, $class;
return $this;
}
## General gettor/settor methods
#
#+
# Set URL for WSDL
#-
sub setWSDL {
my $this = shift;
$this->{_wsdl} = shift;
}
#+
# Get URL for WSDL
#-
sub getWSDL {
my $this = shift;
return $this->{_wsdl};
}
#+
# Get SOAP service object
#-
sub get_service {
my $this = shift;
my $wsdl = $this->getWSDL() || die "Missing required WSDL";
eval {
$this->{_service} ||= SOAP::Lite->service($wsdl);
};
if ( $@ ) {
$log->error( $@ );
print( $sbeams->makeErrorText("Temporarily unable to connect to KEGG SOAP service, please try again later") );
exit;
}
return $this->{_service};
}
#+
# reset SOAP service object
#-
sub reset_service {
my $this = shift;
my $wsdl = shift || $this->getWSDL() || die "Missing required WSDL";
$this->{_service} ||= SOAP::Lite->service($wsdl);
return $this->{_service};
}
#+
# Set current pathway
#-
sub setPathway {
my $this = shift;
my %args = @_;
$this->{_pathway} = $args{pathway} if $args{pathway};
}
#+
# Get current pathway
#-
sub getPathway {
my $this = shift;
return $this->{_pathway};
}
#+
# Set current organism (kegg org name, not db organism_id, e.g. hsa, sce)
#-
sub setOrganism {
my $this = shift;
my %args = @_;
$this->{_organism} = $args{organism} if $args{organism};
}
#+
# Get current organism (kegg org name, not db organism_id, e.g. hsa, sce)
#-
sub getOrganism {
my $this = shift;
return $this->{_organism};
}
#+
# Return hash of kegg name to sbeams organism_id
#-
sub getKeggNameToOrganism {
my $self = shift;
my %name2org = $sbeams->selectTwoColumnHash( <<" END" );
SELECT kegg_organism_name, organism_id
FROM $TBBL_KEGG_ORGANISM
END
return \%name2org;
}
#+
# Get current organism (kegg org name, not db organism_id, e.g. hsa, sce)
#-
sub getOrganismID {
my $this = shift;
my $org = $this->getOrganism();
my ($org_id) = $sbeams->selectrow_array( <<" END" );
SELECT kegg_organism_id FROM $TBBL_KEGG_ORGANISM where
kegg_organism_name = '$org'
END
return $org_id;
}
## KGML processing section
#+
# Process parsed KGML into useful data objects for building image map
#-
sub processEntries {
my $self = shift;
my %args = @_;
my @entries; # Array of entry IDs
my @coordinates; # Array of coordinates synced with entries
my %gene2entries; # Lookup of gene_id => entries
my %entry2genes; # Lookup of entry_id => genes
my %allgenes; # Non-redundant list of gene IDs
for my $en ( @{$args{entries}} ) {
# For now we'll skip all but genes
next unless $en->{type} eq 'gene';
# cache entry id
push @entries, $en->{id};
# cache coordinates
my $bl_x = ($en->{gr_x} - int($en->{gr_width}/2)); # bottom left X
my $bl_y = ($en->{gr_y} - int($en->{gr_height}/2)); # bottom left Y
my $tr_x = $bl_x + $en->{gr_width}; # top right X
my $tr_y = $bl_y + $en->{gr_height}; # top right Y
push @coordinates, "$bl_x, $bl_y, $tr_x, $tr_y";
my $genelist = $en->{name};
$genelist =~ s/\s+/ /g;
my @genes = split( " ", $genelist );
for my $gene ( @genes ) {
$allgenes{$gene}++;
$gene2entries{$gene} ||= [];
push @{$gene2entries{$gene}}, $en->{id};
$entry2genes{$en->{id}} ||= [];
push @{$entry2genes{$en->{id}}}, $gene;
}
}
my %processed = ( entries => \@entries,
coords => \@coordinates,
allgenes => [keys(%allgenes)],
gene2entries => \%gene2entries,
entry2genes => \%entry2genes );
return \%processed;
}
#+
# Returns image map HTML
#-
sub get_image_map {
my $self = shift;
my %args = @_;
return undef unless defined $args{coords};
my $name = $args{name} || 'sbeams_map';
my $src = $args{img_src} || 'src.gif';
my $colors = "";
my $link = "
";
my $map = "\n";
# return ( wantarray ) ? ($link, $map) : $link . $map;
return "$colors\n$link\n$map";
}
sub kegg_tables_exist {
my $self = shift;
# return 0;
unless ( defined $self->{_kegg_tables_exist} ) {
eval {
$sbeams->selectrow_array( "SELECT TOP 1 * FROM $TBBL_KEGG_GENE" );
};
if ( $@ ) {
$self->{_kegg_tables_exist} = 0;
} else {
$self->{_kegg_tables_exist} = 1;
}
}
return $self->{_kegg_tables_exist};
}
#+
# Method to fetch pathway info from KEGG
#-
sub getKeggPathways {
my $self = shift;
my %args = @_;
$self->{_organism} = $args{organism} if $args{organism};
if ( !$self->{_organism} ) {
$log->error( "Missing required parameter organism" );
exit;
} elsif ( !$self->validateOrganism() ) {
$log->error( "Invalid organism" );
exit;
}
# Check args and set defaults
$args{source} ||= 'db';
if ( $args{source} eq 'db' ) {
unless ( $self->kegg_tables_exist() ) {
$log->warn( "Kegg tables not installed, proceeding manually" );
$args{source} = 'kegg';
}
}
my $result;
# kegg api-based method, to be replaced with caching version
if ( $args{source} =~ /kegg/i ) {
my $service = $self->get_service();
$result = $service->list_pathways( $args{organism} );
} elsif ( $args{source} =~ /db/i ) {
# Fetch info from the database
$result = $self->list_db_pathways( %args );
} else {
}
# ref to array of hashrefs
return $result;
}
#+
# Method to fetch gene info from KEGG
#-
sub getGeneInfo {
my $self = shift;
my %args = @_;
# Check args and set defaults
unless ( $args{genes} && ref($args{genes}) eq 'ARRAY' ) {
$log->error( "missing required parameter genes" );
exit;
}
my $org = $self->getOrganism();
my @genes;
my $cnt = 0;
while ( @{$args{genes}} ) {
$cnt++;
my $gene_string;
for ( my $i = 0; $i < 50; $i++ ) {
$gene_string .= ' ' . "$org:" . shift( @{$args{genes}} );
last unless scalar @{$args{genes}};
}
my $service = $self->get_service();
my $gene_info = $service->btit( $gene_string );
$log->debug( "processing batch # $cnt" );
foreach my $line ( split /\n/, $gene_info ) {
my ( $gene_id, $symbol, $defn ) = ($line =~ /$org:(\S+)\s+([^;]+);\s+(.*)/);
# Multiple gene symbols
if ( $symbol =~ /,/ ) {
my @syms = split( /,/, $symbol );
$symbol = $syms[0];
$defn = "(AKA " . join /,/, @syms[1..$#syms] . ") $defn";
}
push @genes, { gene_id => $gene_id, symbol => $symbol,
defn => $defn, annotline => $line };
}
}
$log->debug( "got info for " . scalar( @genes ) . " total genes" );
return \@genes;
}
#+
# Check specified organism to see if it is supported.
#-
sub validateOrganism {
my $self = shift;
my $organisms = $self->getSupportedOrganisms();
unless ( grep( /$self->{_organism}/, @{$organisms} ) ) {
$log->error( "Organism $self->{_organism} not supported" );
return 0;
}
return 1;
}
sub fetch_image {
my $self = shift;
my %args = @_;
# Sort out the arguments
my $path = $args{pathway} || $self->{_pathway};
$path =~ s/path://g;
my $org = $args{organism} || $self->{_organism};
for my $req ( $path, $org ) {
die "Missing required parameter" unless $req;
}
my $base = $CONFIG_SETTING{KEGG_IMAGE_URL} ||
"ftp://ftp.genome.ad.jp/pub/kegg/pathways/__KEGG_ORG__/BASE.gif";
$base =~ s/BASE/$path/g;
$base =~ s/__KEGG_ORG__/$org/g;
my $image_path = "/net/dblocal/data/sbeams/KEGG_MAPS/$org/$path.gif";
return $image_path if -e $image_path; #short circuit if image is already there.
# Fetch response
my $ua = LWP::UserAgent->new();
my $response = $ua->get( $base );
my $image = $response->content;
open( IMAGE, ">$image_path" );
print IMAGE $image;
close IMAGE;
return $image_path;
}
sub fetchPathwayXML {
my $self = shift;
my %args = @_;
# Sort out the arguments
my $path = $args{pathway} || $self->{_pathway};
$path =~ s/path://g;
my $org = $args{organism} || $self->{_organism};
for my $req ( $path, $org ) {
die "Missing required parameter" unless $req;
}
my $base = $CONFIG_SETTING{KGML_URL} ||
"ftp://ftp.genome.jp/pub/kegg/xml/KGML_v0.6.1/$org/BASE.xml";
# $base = "ftp://ftp.genome.jp/pub/kegg/xml/organisms/$org/BASE.xml";
# $log->debug( "kegg earl is $base" );
$base =~ s/BASE/$path/g;
$base =~ s/__KEGG_ORG__/$org/g;
# Fetch response
my $ua = LWP::UserAgent->new();
my $response = $ua->get( $base );
$self->{xml} = $response->content;
my %results = ( xml => $response->content(),
url => $base );
return \%results;
# return $response->content();
}
#+
# Fetch and parse KGML file for this pathway
#-
sub parsePathwayXML {
my $self = shift;
my %args = @_;
# Sort out the arguments
my $path = $args{pathway} || $self->{_pathway};
$path =~ s/path://g;
my $org = $args{organism} || $self->{_organism};
for my $req ( $path, $org ) {
die "Missing required parameter" unless $req;
}
my $parser = SBEAMS::BioLink::KGMLParser->new();
$args{source} ||= 'db';
if ( $args{source} eq 'db' && $self->kegg_tables_exist() ) {
my $sql =<<" END";
SELECT kgml FROM $TBBL_KEGG_PATHWAY
WHERE kegg_pathway_name = 'path:$path'
END
my ( $kgml ) = $sbeams->selectrow_array( $sql );
if ( !$kgml ) {
$log->warn( "Failed to fetch KGML db:\n $sql" );
$log->warn( "Falling back to direct fetch from KEGG" );
my $from_kegg = $self->fetchPathwayXML( %args );
$kgml = $from_kegg->{xml};
}
if ( $kgml ) {
$parser->set_string( xml => $kgml );
} else {
$log->error( "Unable to retrieve KGML from db or KEGG" );
return;
}
} else {
my $base = $CONFIG_SETTING{KGML_URL} ||
"ftp://ftp.genome.jp/pub/kegg/xml/KGML_v0.6.1/$org/BASE.xml";
$base =~ s/BASE/$path/g;
$base =~ s/__KEGG_ORG__/$org/g;
$parser->set_url( url => $base );
if ( !$parser->fetch_url() ) {
$log->error( "Failed to fetch KGML from KEGG" );
return undef;
}
}
$parser->parse() || $log->debug("xml failed to parse, eh");
return $self->processEntries( entries => $parser->{_entries} );
}
#+
# Get list of genes in pathway, currently via KEGG API.
#-
sub getPathwayGenes {
my $self = shift;
my %args = @_;
$args{source} ||= 'db';
my $pathway = $args{pathway} || $self->{_pathway};
if ( !$pathway ) {
$log->error( "Pathway not set" );
exit;
} elsif ( !$self->{_organism} ) {
$pathway =~ /path:(\w{3})(\d*)/;
$self->{_organism} = $1;
$self->validateOrganism( $self->{_organism} );
}
my $result;
if ( $args{source} eq 'db' && $self->kegg_tables_exist() ) {
$result = $self->get_db_pathway_genes( pathway => $pathway,
organism => $self->{_organism} );
} else {
my $service = $self->get_service();
$result = $service->get_genes_by_pathway($pathway);
}
my @gene_list;
for my $gene ( @$result ) {
$gene =~ s/$self->{_organism}://g;
if ( $self->{_organism} eq 'hsa' || $args{not_keys} ) {
push @gene_list, $gene;
} else {
push @gene_list, "'" . $gene . "'";
}
}
return \@gene_list;
}
#+
# Routine to get relationships between genes in pathway
#-
sub getPathwayRelationships {
my $self = shift;
my %args = @_;
my $pathway = $args{pathway} || $self->{_pathway};
if ( !$pathway ) {
$log->error( "Pathway not set" );
exit;
} elsif ( !$self->{_organism} ) {
$pathway =~ /path:(\w{3})(\d*)/;
$self->{_organism} = $1;
$self->validateOrganism( $self->{_organism} );
}
my $service = $self->get_service();
# Fetch elements and relationships from KEGG
my $elements_by_path = $service->get_elements_by_pathway($pathway);
my $relations_by_path = $service->get_element_relations_by_pathway($pathway);
my %elements; # hash of elements and associated genes/connections
my @groups; # Temp storage for 'groups' of genes
my %genes; # List of gene entities, we return only these
# Loop over the elements, store genes and save groups for subsequent analysis.
for my $result ( @{$elements_by_path} ) {
my $type = $result->{type};
my $element_id = $result->{element_id};
if ( $type eq 'group' ) {
push @groups, $result;
}
if ( $type !~ /gene/ ) {
next;
}
my $names = $result->{names};
$genes{$element_id}++;
$elements{$element_id} ||= {};
for my $gene ( @$names ) {
$elements{$element_id}->{genes} ||= [];
push @{$elements{$element_id}->{genes}}, $gene;
}
}
# Now that we know which elements are genes, we can process the groups
for my $group ( @groups ) {
my $element_id = $group->{element_id};
$elements{$element_id} ||= {};
$elements{$element_id}->{genes} ||= [];
for my $component ( @{$group->{components}} ) {
if( $genes{$component} ) { # only process if this group component is a gene
my $c_genes = $elements{$component}->{genes} || [];
for my $gene ( @$c_genes ) {
push @{$elements{$element_id}->{genes}}, $gene;
}
}
}
}
# Loop over reported relationships
for my $result ( @{$relations_by_path} ) {
my $type = $result->{type};
# Skip out unless it is a protein-protein relationship
next unless $type && $type eq 'PPrel';
# Each relationship follows a e1 description e2 pattern
my $element1 = $result->{element_id1};
next unless $elements{$element1}; # Exit unless we've stored info about this
my $element2 = $result->{element_id2};
next unless $elements{$element2}; # Exit unless we've stored info about this
# Record connection, relation, and type ( e.g. 25 inhibition 22 --| ) -
$elements{$element1} ||= {};
$elements{$element1}->{connex} ||= [];
$elements{$element1}->{relation} ||= [];
$elements{$element1}->{relation_type} ||= [];
my $rel = '';
my $rel_type = '';
my $subtypes = $result->{subtypes}->[0];
if ( $subtypes ) {
$rel = $subtypes->{relation};
$rel_type = $subtypes->{type};
}
# Store relationship info in 3 parallel arrayrefs
push @{$elements{$element1}->{connex}}, $element2;
push @{$elements{$element1}->{relation}}, $rel;
push @{$elements{$element1}->{relation_type}}, $rel_type;
}
# Array of derived relationships
my @relationships;
# Loop through keys of elements array, cacheing the ones with needed info.
for my $e1 ( sort { $a <=> $b } ( keys( %elements ) ) ) {
# Skip those without connection info
next unless $elements{$e1}->{connex};
my @connex = @{$elements{$e1}->{connex}};
for ( my $i = 0; $i <= $#connex; $i++ ) { # For each connection record
my $e2 = $connex[$i];
my $rel = $elements{$e1}->{relation}->[$i];
my $rel_type = $elements{$e1}->{relation_type}->[$i];
my $e1_genes = $elements{$e1}->{genes};
my $e2_genes = $elements{$e2}->{genes};
for my $g1 ( @$e1_genes ) { # For each gene defined by element_id 1
for my $g2 ( @$e2_genes ) { # For each gene defined by element_id 2
push @relationships, [ $g1, $g2, $rel_type, $rel ];
}
}
}
}
# Return reference to array of relationships
return( \@relationships );
}
#+
# Routine to get Kegg to color pathway with expressed genes
#-
sub getColoredPathway {
my $self = shift;
my %args = @_;
$log->debug( 'here' );
for my $key ( qw( genes bg fg ) ) {
if ( !$args{$key} ) {
$log->error( "Missing required parameter $key" );
exit;
} elsif ( ref $args{$key} ne 'ARRAY' ) {
}
}
$log->debug( 'there' );
my $pathway = $args{pathway} || $self->{_pathway};
if ( !$pathway ) {
$log->error( "Pathway not set" );
exit;
}
if ( $pathway !~ /^path:/ ) {
$pathway = 'path:' . $pathway;
}
my $cnt = 0;
my $all = '';
for my $g ( @{$args{genes}} ) {
$cnt++;
$all .= " $g";
}
$cnt = 0;
$all = '';
for my $g ( @{$args{fg}} ) {
$cnt++;
$all .= " $g";
}
$cnt = 0;
$all = '';
for my $g ( @{$args{bg}} ) {
$cnt++;
$all .= " $g";
}
$log->debug( 'everywhere' );
my $genes = SOAP::Data->type( array => $args{genes} );
my $fg = SOAP::Data->type( array => $args{fg} );
my $bg = SOAP::Data->type( array => $args{bg} );
my $service = $self->get_service();
my $url = $service->color_pathway_by_objects("$pathway", $genes, $fg, $bg ) ;
return $url if $url;
$log->info( "Initial fetch for $pathway failed!: $!" );
$url = $service->color_pathway_by_objects("$pathway", $genes, $fg, $bg ) ;
return $url if $url;
print "Unable to fetch pathway from KEGG, please try again later.