############################################################################### # Program : SBEAMS::Microarray::Affy_Analysis # Author : Pat Moss # $Id$ # # Description : Module which implements methods for parsing Affymetrix # annotation files. # # SBEAMS is Copyright (C) 2000-2005 Institute for Systems Biology # ############################################################################### {package SBEAMS::Microarray::Affy_Annotation; our $VERSION = '1.00'; ############################################################## use strict; use vars qw($sbeams $self); #HACK within the read_dir method had to set self to global: read below for more info use File::Basename; use File::Find; use Data::Dumper; use Carp; use FindBin; use SBEAMS::Connection qw($log); use SBEAMS::Connection::Tables; use SBEAMS::Microarray::Tables; use base qw(SBEAMS::Microarray::Affy); #declare superclass ####################################################### # Constructor ####################################################### sub new { my $class = shift; my @all_records =(); my $self = {ALL_RECORDS =>\@all_records}; bless $self, $class; } ############################################################################### # Receive the main SBEAMS object ############################################################################### sub setSBEAMS { my $self = shift; $sbeams = shift; return($sbeams); } ############################################################################### # Provide the main SBEAMS object ############################################################################### sub getSBEAMS { my $self = shift; return($sbeams); } ############################################################################### # Get/Set the VERBOSE status # # ############################################################################### sub verbose { my $self = shift; if (@_){ #it's a setter $self->{_VERBOSE} = $_[0]; }else{ #it's a getter $self->{_VERBOSE}; } } ############################################################################### # Get/Set the DEBUG status # # ############################################################################### sub debug { my $self = shift; if (@_){ #it's a setter $self->{_DEBUG} = $_[0]; }else{ #it's a getter $self->{_DEBUG}; } } ############################################################################### # Get/Set the TESTONLY status # # ############################################################################### sub testonly { my $self = shift; if (@_){ #it's a setter $self->{_TESTONLY} = $_[0]; }else{ #it's a getter $self->{_TESTONLY}; } } ############################################################################### # Get/Set the DATABASE status # # ############################################################################### sub database { my $self = shift; if (@_){ #it's a setter $self->{_DATABASE} = $_[0]; }else{ #it's a getter $self->{_DATABASE}; } } ############################################################################### # Get/Set the run_mode status # # ############################################################################### sub run_mode { my $self = shift; if (@_){ #it's a setter $self->{_RUN_MODE} = $_[0]; }else{ #it's a getter $self->{_RUN_MODE}; } } ############################################################################### #Set Record # ############################################################################### sub set_record { my $method = 'set_record'; my $self = shift; my $record = shift; confess(__PACKAGE__ . "::$method Need to provide a record ") unless ($record); push @ { $self->{ALL_RECORDS}} , $record; } ############################################################################### #Get Record # ############################################################################### sub get_record { my $method = 'get_record'; my $self = shift; return pop @{ $self->{ALL_RECORDS}} } ############################################################################### #set_annotation_set_id #Set the annotation set id. Will first go grab the slide type id then check to see # if this annotation set has been loaded. If so the user will have to choose to delete the previous #version. ############################################################################### sub set_annotation_set_id { my $method = 'set_annotation_set_id'; my $self = shift; my %args = @_; my $annotation_date = $args{anno_date}; my $genome_version = $args{genome_version}; my $file_name = $args{file_name}; confess(__PACKAGE__ . "::$method Need to provide key value pair 'anno_date', 'genome_version', 'file_name'") unless (%args); my $slide_type_id = $self->find_slide_type_id(slide_name => $file_name); my $id = $self->check_previous_annotation_set( slide_type_id => $slide_type_id, genome_version => $genome_version, annotation_date => $annotation_date, ); $self->{ANNO_SET_ID} = $id #set the ANNOTATION SET ID into the annotation object } ############################################################################### #get_annotation_set_id #Return the annotation_set_id ############################################################################### sub get_annotation_set_id { my $method = 'get_annotation_set_id'; my $self = shift; return $self->{ANNO_SET_ID} } ############################################################################### #get_record_count #Return the number of records ############################################################################### sub get_record_count { my $method = 'get_record_count'; my $self = shift; return scalar @ { $self->{ALL_RECORDS}}; } ############################################################################### # parse_data_file # # Simple parse of the data file foreach row of the data file make a record hash keys will be the data column names and the value will be the piece of data from the particular row column intersection #sets all the data into little objects, basically an array of hashes ############################################################################### sub parse_data_file { my $self = shift; my $file_name = shift; open DATA, $file_name or die "CANNOT OPEN ANNOTATION FILE '$!' \n"; my @col_names = (); my $count = 0; while(){ chomp; s/^"//; #remove the first quote from the line s/"$//; my @line_parts = split /","/, $_; #split on the quotes and comma. Commas do exists in some of the data fields if ($count == 0){ #grab the column names @col_names = @line_parts; if ($self->verbose() > 0){ print "@col_names\n"; } $count ++; next; } if($count == 1){ # find Annotation Date and Genome Version columns my $annotation_date_column; my $genome_version_column; for(my $i=0;$i<$#col_names;$i++) { if($col_names[$i] =~ /Annotation\ Date/i) { $annotation_date_column = $i; } elsif($col_names[$i] =~ /Genome\ Version/i) { $genome_version_column = $i; } } my ($annotation_date, $genome_version) = @line_parts[$annotation_date_column, $genome_version_column]; my $file_base_name = basename($file_name); #will use the file name to find the slide_type_id unless ($file_base_name =~ s/_anno.*//){ #all files should contain _anno.csv die "THE FILE NAME '$file_base_name' DOES NOT LOOK GOOD\n", "All file names must have '_anno.csv' AND the name must be in entered into MA_slide_type.name column\n"; } if ( $self->verbose() > 0 ){ print "ANNO FILE ROOT NAME '$file_base_name'\n"; } $self->set_annotation_set_id( anno_date => $annotation_date, genome_version => $genome_version, file_name => $file_base_name, ); } my %record_h = (); for (my $i = 0; $i <= $#line_parts; $i ++){ my $name = (); if ($name = $col_names[$i]){ }else{ $self->anno_error(error => "LINE $count FIELD '$i' DOES NOT HAVE A COLUMN NAME"); } if ($self->verbose() > 3 ) { print "_${i}__ $name => $line_parts[$i]\n"; } $record_h{$name} = $line_parts[$i]; } $self->set_record( \%record_h); $count ++; } } ############################################################################### #add_record_to_annotation_table #Given a record hash_ref take the columns needed and upload into Affy annotation table #Return the PK ############################################################################### sub add_record_to_annotation_table { my $method = 'add_record_to_annotation_table'; my $self = shift; my %args = @_; my $record_href = $args{record}; my %record_h= %{$record_href}; #setup the rowdata_ref need to insert the record #table_column_name => value : The keys for the record_h are taken from the Affy annotation file confess(__PACKAGE__ . "::$method Need to provide reference to a record hash ") unless (ref($record_href)); my %rowdata_h = ( affy_annotation_set_id => $self->get_annotation_set_id(), probe_set_id => $record_h{'Probe Set ID'}, sequence_type => $record_h{'Sequence Type'}, sequence_source => $record_h{'Sequence Source'}, transcript_id => $record_h{'Transcript ID'}, target_description_feature => $record_h{'Target Description'} =~ m!FEA=(.+?)/! ? $1: undef , target_description => $record_h{'Target Description'} =~ m!DEF=(.+?)/! ? $1: undef , target_description_note => $record_h{'Target Description'} =~ m!NOTE=(.+?)/!? $1: undef , representative_public_id => $record_h{'Representative Public ID'}, archival_unigene_cluster => $record_h{'Archival UniGene Cluster'}, gene_title => $record_h{'Gene Title'}, gene_symbol => $record_h{'Gene Symbol'}, chromosomal_location => $record_h{'Chromosomal Location'}, pathway => $record_h{'Pathway'}, qtl => $record_h{'QTL'}, annotation_description => $record_h{'Annotation Description'}, transcript_assignment => $record_h{'Transcript Assignments'}, annotation_transcript_cluster => $record_h{'Annotation Transcript Cluster'}, annotation_notes => $record_h{'Annotation Notes'}, ); %rowdata_h = $self->truncate_data(record_href => \%rowdata_h); #some of the data will need to truncated to make it easy to put all data in varchar 255 or less my $rowdata_ref = \%rowdata_h; my $affy_annotation_set_id = $sbeams->updateOrInsertRow( table_name=>$TBMA_AFFY_ANNOTATION, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'affy_annotation_id', ); return $affy_annotation_set_id } ############################################################################### #parse_alignment #Given the data from the aligment field #FORMAT:chromosome // start-end // strand // percentage identity" #Example: "chr9:109422168-109422413 (-) // 100" #Add the data to the current record ############################################################################### sub parse_alignment_OLD { my $method = 'parse_alignment'; my $self = shift; my %record_h = @_; my $alignemt = $record_h{'Alignments'}; confess(__PACKAGE__ . "::$method Need to provide a record hash ") unless (%record_h); if ($alignemt =~ m!:(\d+)-(\d+)\s #find the genomic bp Start Stop Example chr6:109422168-109422413 \((\+|-)\)\s+ #pull out the strand orientaion Example (+) Escape the first parenthesis //\s(.+?)\s #pull out the percent Identity !x){ $record_h{gene_start} = $1; $record_h{gene_stop} = $2; $record_h{gene_orientation} = $3; $record_h{percent_identity} = $4; if ($self->verbose() > 0) { print "PARSED ALIGNMENT '$alignemt'\n"; print "\t'$record_h{gene_start}' '$record_h{gene_stop}' '$record_h{gene_orientation}' '$record_h{percent_identity}'\n"; } return %record_h; }else{ if ( $alignemt && $alignemt ne '---'){ $self->anno_error(error => "ERROR: ALIGNMENT NOT PARSED '$alignemt'"); } } return %record_h; #return the same hash } ############################################################################### #add_record_to_affy_db_links #Given a record hash_ref take the columns needed and upload into Affy annotation table #Return the PK ############################################################################### sub add_record_to_affy_db_links { my $method = 'add_record_to_affy_db_links'; my $self = shift; my %args = @_; my $record_href = $args{record}; my $affy_anno_pk = $args{affy_annotation_pk}; confess(__PACKAGE__ . "::$method Need to provide 'record' and 'affy_annotation_pk' ") unless ($record_href && $affy_anno_pk); my %record_h= %{$record_href}; #dbxref.dbxref_tag #value from Affy annotation file my %external_links = ( LocusLink => $record_h{'LocusLink'}, SwissProt => $record_h{'SwissProt'}, FlyBase => $record_h{'FlyBase'}, WormProt => $record_h{'WormBase'}, EC => $record_h{'EC'}, NCBIProt => $record_h{'RefSeq Protein ID'}, ENSEMBL => $record_h{'Ensembl'}, OMIM => $record_h{'OMIM'}, MGI => $record_h{'MGI Name'}, RGD => $record_h{'RGD Name'}, ); ###NEED TO ADD TO DBXREF TABLE #UNKNOWN => $record_h{'AGI'} foreach my $db_ref_tag (keys %external_links){ #loop through all the fields to upload my $db_id = $external_links{$db_ref_tag}; #actual accession number for external database my $dbxref_id = $self->get_xref_id($db_ref_tag); if ( !defined $db_id || $db_id =~ /---/ ) { #skip uploading blank data next; } my @parts = split "///", $db_id; #each field could hold multiple ids foreach my $val (@parts){ $val =~ s/\s//g; #remove any white space $val =~ s/^EC://; #remove the EC tag from the EC accession number it is not needed if ( $self->debug ){ print "EXTERNAL DBID '$val'\n"; } my $rowdata_ref = { affy_annotation_id => $affy_anno_pk, db_id => $val, dbxref_id => $dbxref_id, }; $sbeams->updateOrInsertRow( table_name=>$TBMA_AFFY_DB_LINKS, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'affy_links_id', ); } } } ############################################################################### #add_data_child_tables #Take data from a record_h parse and add to child tables of the affy_annotation table. All the sub tables will hold a FK to the affy_annotaion_table #Will load data into a bunch of tables ############################################################################### sub add_data_child_tables { my $method = 'add_data_child_tables'; my $self = shift; my %args = @_; my $record_href = $args{record}; my $affy_anno_pk = $args{affy_annotation_pk}; my %record_h = %{$record_href}; confess(__PACKAGE__ . "::$method Need to provide 'record' and 'affy_annotation_pk' ") unless (%record_h && $affy_anno_pk); my %ontology_column_id = $self->get_gene_ontolgy_types(); #maps the affy_column heads to a db id #################### #A column from the affy annotation .csv file will be parsed and added to a child table. The hash below will be used to #pull the data from the record hash, parse the data and supply all the data data need to insert the record into the child tables ###Example hash #Data_column name => { REG_EXP used to parse affy annotation line. regular expression will be used with the 'x' flag so white space will be ignored # MULTI_RECORDS < NO> if the piece of data has a /// it means there are multiple records that do not need to be split apart, the default is to split records into individual chunks # TABLE_NAME <$TABLE_VAR> Name of the table to insert data into # PK Name of the primary key column of the table to insert into # COLUMN_NAMES Foreach piece of data to capture have a column name as the value for each numeric ID. The number ID's is the order of the data being captured by the REG_EXP # RUN_METHOD Some inserts will need to gather data from various other tables this can be done by using a method. #} my $ontlogy_reg_exp = qr!(\d+)\s+// #grab the ID \s+(\w.+?)\s// #grab the gene_ontology_description \s+(\w.*) #grab the gene_ontology_evidence !x; my %child_tables = ( ### Ontology Suff ### "Gene Ontology Biological Process"=>{ REG_EXP => $ontlogy_reg_exp, COLUMN_NAMES => { 1 => 'affy_gene_ontology_id', 2 => 'gene_ontology_description', 3 => 'gene_ontology_evidence', }, TABLE_NAME => $TBMA_GENE_ONTOLOGY, PK => 'gene_ontology_type_id', RUN_METHOD => 'add_affy_db_links', }, "Gene Ontology Cellular Component"=>{ REG_EXP => $ontlogy_reg_exp, COLUMN_NAMES => { 1 => 'affy_gene_ontology_id', 2 => 'gene_ontology_description', 3 => 'gene_ontology_evidence', }, TABLE_NAME => $TBMA_GENE_ONTOLOGY, PK => 'gene_ontology_type_id', RUN_METHOD => 'add_affy_db_links', }, "Gene Ontology Molecular Function"=>{ REG_EXP => $ontlogy_reg_exp, COLUMN_NAMES => { 1 => 'affy_gene_ontology_id', 2 => 'gene_ontology_description', 3 => 'gene_ontology_evidence', }, TABLE_NAME => $TBMA_GENE_ONTOLOGY, PK => 'gene_ontology_type_id', RUN_METHOD => 'add_affy_db_links', }, ### End of Ontology ## Alignments =>{ REG_EXP => qr!\s?(.+?): #pull the Chromsome Information Example 'chr16:' (\d+)-(\d+)\s #find the genomic bp Start Stop Example chr6:109422168-109422413 \((\+|-)\)\s+ #pull out the strand orientaion Example (+) Escape the first parenthesis //\s(.+?)\s!x, #pull out the percent Identity COLUMN_NAMES => { 1 => 'match_chromosome', 2 => 'gene_start', 3 => 'gene_stop', 4 => 'gene_orientation', 5 => 'percent_identity', }, TABLE_NAME => $TBMA_ALIGNMENT, PK => 'alignment_id', }, "Trans Membrane_NOT_USED" =>{### THIS IS WOULD PARSE WHAT THE AFFY PROBE SET DATA IN TABULAR FORMAT DESCRIBES, BUT THE FILE IT SELF HAS A MUCH DIFFERNT FORMAT REG_EXP => qr!numtm:(\d+); #grab the number of transmemebrane domains Nin-prob:(\w+); #grab the the probability the the N term is interior Type:(\w+); #grab the type values :SIGNAL, ANCHOR, empty "" !x, COLUMN_NAMES => { 1 => 'number_of_domains', 2 => 'nin_prob', 3 => 'type', }, TABLE_NAME => $TBMA_TRANS_MEMBRANE, PK => 'trans_membrane_id', #sub table info ST_REG_EXP => qr!TM from (\d+) to (\d+): #grab the start and stop of the TM domain Eval:(\d\.\d+) #grab the e-val for the TM domain prediction !x, ST_COLUMN_NAMES => { 1 => 'tm_protein_start', 2 => 'tm_protein_end', 3 => 'tm_e_value', }, ST_TABLE_NAME =>$TBMA_TRANS_MEMBRANE_DOMAIN, ST_PK =>'trans_membrane_domain_id', ST_FK =>'trans_membrane_id', }, "Trans Membrane" =>{ #Example 'NP_054700.1 // span:417-439 // numtm:1' REG_EXP => qr!(\w.+?)\s.*numtm:(\d+)!x, #grab the number of predicted domains. This format just repeats the same information over and over for each predicted TM domain COLUMN_NAMES => { 1 => 'protein_accession_numb', 2 => 'number_of_domains', }, TABLE_NAME => $TBMA_TRANS_MEMBRANE, PK => 'trans_membrane_id', #MULTIPULE_RECORDS => 'NO', }, "Overlapping Transcripts" =>{ #Example 'NM_173599 // hypothetical protein FLJ40126 // chr12:38306286-38588369 (+)' REG_EXP => qr!(\w+)\s// #grab the accession number \s(.+?)\s// #grab the title \s(.+?): #pull the Chromsome Information Example 'chr16:' (\d+)-(\d+)\s #find the genomic bp Start Stop Example chr6:109422168-109422413 \((\+|-)\) #pull out the strand orientaion Example (+) Escape the first parenthesis !x, COLUMN_NAMES => { 1 => 'accession_number', 2 => 'title', 3 => 'overlapping_choromosome', 4 => 'overlapping_gene_start', 5 => 'overlapping_gene_end', 6 => 'gene_orientation', }, TABLE_NAME => $TBMA_OVERLAPPING_TRANSCRIPT, PK => 'overlapping_transcript_id', }, "Protein Families" =>{ #Example 'ec // ACRO_HUMAN // ACRO_HUMAN EC:3.4.21.10:ACROSIN PRECURSOR (EC 3.4.21.10). // ---' REG_EXP => qr!(EC):(\d.+?): #grab the database_type AND grab the EC number (\w.+?)\s// #grab the description !ix, #Example 'Hanks // ECK // KECK_HUMAN (ECK) KINASES:5.2.1 | PTK Group B membrane spanning protein tyrosine kinases.PTK XI Eph/Elk/Eck orphan receptor family .ECK // ---' REG_EXP_2 => qr!(Hanks)\s// #grab the database type \s(\w.+?)\s// #grab the HANK accession number \s(\w.+?)// #grab the description !x, #Example 'P450 // Q62397 // CYP2B-10 Cyt P450: Animalia.CYP2B-10.mou // ---' REG_EXP_3 => qr!(P450)\s// #grab the database type \s(\w.+?)\s// #grab the P450 accession number \s(\w.+?)// #grab the description !x, COLUMN_NAMES => { 1 => 'database_type', 2 => 'accession_number', 3 => 'description', #4 => 'e_value', #turned off None of the inital annotation files had field }, TABLE_NAME => $TBMA_PROTEIN_FAMILIES, PK => 'protein_families_id', RUN_METHOD => '$self->add_affy_db_links(accession_number =>$accession_number, affy_anno_id =>$affy_anno_pk, dbxref_tag =>$database_type, )', }, InterPro =>{ #Example 'IPR003309 // Transcriptional regulator SCAN /// IPR007087 // Zn-finger, C2H2 type' REG_EXP => qr!(IPR\d+)\s// #grab the Accession number \s(\w.*)(?:\s+//|$) #grab the description !x, COLUMN_NAMES => { 1 => 'accession_number', 2 => 'interpro_description', }, TABLE_NAME => $TBMA_INTERPRO, PK => 'interpro_id', RUN_METHOD => '$self->add_affy_db_links(accession_number =>$accession_number, affy_anno_id =>$affy_anno_pk, dbxref_tag => "InterPro" )', }, "Protein Domains" =>{ #OLD Example 'scop // d1bupa1 // d1bupa1 SCOP:c.55.1.1:| Heat shock protein 70kDa, ATPase fragment // --- /// scop // d1dkza_ // d1dkza_ SCOP:e.20.1.1:' # REG_EXP => qr!(scop)\s// #grab the Database # \s(\w.+?)\s// #grab the Accession number # .+?\s(\w.*)\s// #grab the description # !x, #example ' pfam // IL4 // Interleukin 4 // 6.5E-97' REG_EXP_2 => qr!(pfam)\s// #grab and Accession number \s(\w.+?)\s// #grab the Database type .+?\s(\w.*)\s//\s #grab the description (.*) #grab the e-value !x, # New scop example 'scop // a.4.1.Paired domain // All alpha proteins; DNA/RNA-binding 3-helical bundle; Homeodomain-like; Paired domain' REG_EXP => qr!(scop)\s// #grab the Database \s(\w(?:\.\d)+).+?\s// #grab the Accession number .+?\s(\w.*)\s// #grab the description !x, COLUMN_NAMES => { 1 => 'database_type', 2 => 'accession_number', 3 => 'protein_domain_description', 4 => 'protein_domain_e_value', }, TABLE_NAME => $TBMA_PROTEIN_DOMAIN, PK => 'protein_domanin_id', RUN_METHOD => '$self->add_affy_db_links(accession_number =>$accession_number, affy_anno_id =>$affy_anno_pk, dbxref_tag =>$database_type, )', }, ); foreach my $record_key (keys %child_tables){ #start looping through child_tables keys #next unless $record_key =~ 'Gene Ontology '; #DEBUG ONLY my $full_record_value = $record_h{$record_key}; #pull the piece of data from the record hash if ($self->verbose() > 0){ print "RECORD_KEY '$record_key' VALUE '$full_record_value'\n"; } my @all_reg_exp = grep {/^REG_EXP/} keys % {$child_tables{$record_key} }; #pull all the reg exp keys that might be needed most will only have one... my $rowdata_ref = {}; my @multi_records = (); # Set a default $child_tables{$record_key}{MULTIPULE_RECORDS} ||= 'NO'; if ( $child_tables{$record_key}{MULTIPULE_RECORDS} eq 'NO'){ push @multi_records, $full_record_value; }else{ @multi_records = split /\/\/\//,$full_record_value; #split on the /// to make individual records } @multi_records = $self->truncate_data(data_aref => \@multi_records); RECORD_VAL:foreach my $record_value (@multi_records) { next if $record_value && $record_value eq '---'; #skip the blank fields if ($self->verbose() > 0){ print "RECORD PART '$record_value'\n"; } for( my $i = 0; $i<=$#all_reg_exp; $i++) { #loop through the regular expression my $reg_exp_key = $all_reg_exp[$i]; my $reg_exp = $child_tables{$record_key}{$reg_exp_key}; if ($record_value && $record_value =~ /$reg_exp/){ #run the regular expression for (my $i = 1; $i <= $#+; $i ++){ # @+ = LAST_MATCH_END perl internal variable, array holds the offsets of the ends of the last successful sub matches, which can be use to find out how many matches where made my $column_name = $child_tables{$record_key}{COLUMN_NAMES}{$i}; no strict 'refs'; $$rowdata_ref{$column_name} = ${$i}; #add the data found to the hash-ref. ${$i} = to $1.... from the data the reg_exp found if ($self->verbose() > 0){ print "REG EXP FOUND POSITION $i '${$i}' INSERT INTO '$column_name'\n"; } } last; }else{ if ($i == $#all_reg_exp){ $record_value = '' if !defined $record_value; $self->anno_error(error => "ERROR: Cannot MATCH VAL '$record_value'\n\tREG_EXP '$reg_exp'"); next RECORD_VAL ; #if no more regular expression are in the @all_reg_exp array move on to the next record since there is not data to insert for this particular chunk } } } #################################################################### ### Specific for Gene Ontology columns only ..... if ($record_key =~ /Gene Ontology/){ #Need to convert the affy GO accession to a real GO accession number then add a record to the DBREF table my $affy_go_accession_numb = $$rowdata_ref{affy_gene_ontology_id}; my $pad = '0' x (7 - length($affy_go_accession_numb)); my $go_accession_numb = "GO:" . $pad . $affy_go_accession_numb;#GO accession numbers are 7 digit long, need to pad affy accession numbers to the correct format #example Affy 1234 -> GO:0001234 my $affy_db_linds_id = $self->add_affy_db_links(accession_number =>$go_accession_numb, affy_anno_id =>$affy_anno_pk, dbxref_tag => 'GO', ); $$rowdata_ref{affy_db_links_id} = $affy_db_linds_id; my $ontology_type_id = $ontology_column_id{$record_key}; #pull the ontology_type_id $$rowdata_ref{gene_ontology_type_id} = $ontology_type_id; if ($self->verbose() >0 ){ print "ADDING ONTOLOGY INFO RECORD_TYPE '$record_key' AFFY_DB_LINK_ID '$affy_db_linds_id' GO ACCESSION '$go_accession_numb'\n ONTOLOGY_TYPE_ID '$ontology_type_id'\n"; } } ################################################################### ### Add in link to Affy_Db_links table if needed if ( defined $record_key && defined $child_tables{$record_key}{RUN_METHOD} && $child_tables{$record_key}{RUN_METHOD} =~ /add_affy_db_links/ && $record_key !~ /Gene Ontology/ ) { my $accession_number = $$rowdata_ref{accession_number}; delete($$rowdata_ref{accession_number}); #need to delete the key since the data will be kept in the affy_db_links table my $database_type = $$rowdata_ref{database_type}; #need to delete this key too delete($$rowdata_ref{database_type}); # only the "Protein Familiy" record keys should have this field my $method = $child_tables{$record_key}{RUN_METHOD}; if ($accession_number && $accession_number !~ /NULL/i ){ #some of the data does not have a valid accession number in the file so Affymetrix puts in NULL my $affy_db_linds_id = eval $method; die "EVAL ERROR '$@'\n"if $@; $$rowdata_ref{affy_db_links_id} = $affy_db_linds_id; if ($self->verbose() >0 ){ print "ADDING AFFY_DB_ID INTO '$record_key' AFFY_DB_LINK_ID '$affy_db_linds_id'\n"; } } } ################################################################### ### Add the data to the database my $table_name = $child_tables{$record_key}{TABLE_NAME}; my $PK = $child_tables{$record_key}{PK}; $$rowdata_ref{affy_annotation_id} = $affy_anno_pk; #add the affy_annotation_id FK column my $inserted_PK = $sbeams->updateOrInsertRow( # insert the data in to the first child table, most record_values will only be parse into one table but some will have two tables that will need to be dealt with table_name=>$table_name, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>$PK, ); if ($self->verbose() > 0) { print "INSERTED DATA IN TO PRIMARAY CHILD TABLE '$table_name' PK '$inserted_PK'\n\n"; } #################################################################### ### Add the sub table data if present #basically built to upload the TM information if (my $sub_table = $child_tables{$record_key}{ST_TABLE_NAME}){ if ($self->verbose() >0) { print "ADDING SUB TABLE INFO FOR '$sub_table'\n"; } my $PK = $child_tables{$record_key}{ST_PK}; my $FK_column = $child_tables{$record_key}{ST_FK}; #subtable foregin key my $st_reg_exp = $child_tables{$record_key}{ST_REG_EXP}; my %column_names_h = %{$child_tables{$record_key}{ST_COLUMN_NAMES} }; #key is the order which the reg exp will find the data, value is the column to put the data into my @column_name_keys = sort keys %column_names_h; my $rowdata_ref = {}; $$rowdata_ref{$FK_column} = $inserted_PK; if (my @all_hits = ($record_value =~ /$st_reg_exp/)) { #gather all the hits while (@all_hits){ foreach my $key (@column_name_keys){ #loop the array of column keys, which are just numbers my $column_name = $column_names_h{$key}; #convert to a column name my $value = shift @all_hits; #get a value from things that where found $$rowdata_ref{$column_name} = $value; if ($self->verbose() > 0){ print "SUB TABLE HIT VALUE '$value' INSERT INTO COL '$column_name'\n"; } } my $st_inserted_PK = $sbeams->updateOrInsertRow( # insert the data table_name=>$sub_table, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>$PK, ); } }else{ $self->anno_error(error => "ERROR:SUB TABLE REGEXP '$st_reg_exp' DID NOT MATCH\n\tRECORD_VAL'$record_value'\n"); } } } } } ############################################################################### #get_gene_ontolgy_types # #retrun hash of with keys of the differnt gene ontology types and the value equal to the PK for the row in the db ############################################################################### sub get_gene_ontolgy_types { my $self = shift; my $sql = qq~ SELECT gene_ontology_name_type, gene_ontology_type_id FROM $TBMA_GENE_ONTOLOGY_TYPE qq~; return $sbeams->selectTwoColumnHash($sql) } ############################################################################### #add_affy_db_links #Given a dbref_tag turn it into a PK #retrun the PK from $TBMA_AFFY_DB_LINKS ############################################################################### sub add_affy_db_links { my $method = 'add_affy_db_links'; my $self = shift; my %args = @_; my $db_id = $args{accession_number}; my $affy_anno_id = $args{affy_anno_id}; my $dbxref_tag = $args{dbxref_tag}; my $dbxref_id = $self->get_xref_id($dbxref_tag); confess(__PACKAGE__ . "::$method Need to provide 'accession_number' & 'affy_anno_id' & 'dbxref_tag") unless ($db_id =~ /^\w/ && $affy_anno_id && $dbxref_tag); my $rowdata_ref = {affy_annotation_id => $affy_anno_id, dbxref_id => $dbxref_id, db_id => $db_id, }; my $inserted_pk = $sbeams->updateOrInsertRow( # insert the data table_name=>$TBMA_AFFY_DB_LINKS, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'affy_db_links_id', ); if ($self->verbose() > 0){ print "INSERTED AFFY_DB_LINK FOR '$dbxref_tag' DB_ID '$db_id' PK '$inserted_pk'\n"; } return $inserted_pk; } ############################################################################### #get_xref_id #Given a dbref_tag turn it into a PK #retrun the PK from $TB_DBXREF ############################################################################### sub get_xref_id { my $method = 'get_xref_id'; my $self = shift; my $dbxref_tag = shift; if (exists $self->{$dbxref_tag}){ #if we have already seen this dbxref tag before send back the PK return $self->{$dbxref_tag}; } confess(__PACKAGE__ . "::$method Need to provide 'dbxref_tag' => YOU GAVE '$dbxref_tag'") unless ($dbxref_tag =~ /^\w/); my $sql = qq~ SELECT dbxref_id FROM $TB_DBXREF WHERE dbxref_tag = '$dbxref_tag' ~; my @rows = $sbeams->selectOneColumn($sql); die "*** CANNOT FIND DBXREF_ID FOR '$dbxref_tag' '$sql' ***\n" unless ($rows[0] =~ /^\d/); $self->{$dbxref_tag} = $rows[0]; #add the data into a hash for future lookups return $rows[0]; } ############################################################################### #truncate_data #used to truncate any long fields. Will truncate everything in a hash or a single value to 254 char. Also will #write out to the error log if any extra fields are truncated ############################################################################### sub truncate_data { my $self = shift; my %args = @_; my $method = 'truncate_data'; unless ( $args{record_href} || $args{data_aref} ) { $log->error( "$method requires either record or data ref" ); return undef; } unless ( ref($args{record_href}) eq 'HASH' || ref($args{data_aref}) eq 'ARRAY' ) { $log->error( "Invalid reference type in $method" ); return undef; } if ($args{record_href}){ # Cleaned up 6/06 DSC my %record_h = %{$args{record_href}}; my %fieldsize = ( gene_symbol => 50, chromosomal_location => 50, archival_unigene_cluster => 50, qtl => 100 ); # Trim values to match data type foreach my $key ( keys %record_h){ next unless defined $key; $fieldsize{$key} ||= 255; my $orig_len = length( $record_h{$key} ); if ( $record_h{$key} && $fieldsize{$key} && $orig_len > $fieldsize{$key} ){ my $trunc = substr($record_h{$key}, 0, $fieldsize{$key} ); my $trunc_len = length( $trunc ); $self->anno_error( error => "Hash value truncated from $orig_len to $trunc_len ($record_h{$key})"); $record_h{$key} = $trunc; } } return %record_h; } elsif($args{data_aref}){ # left alone for now my @data = @{$args{data_aref}}; for(my $i=0; $i<=$#data; $i++){ if ( $data[$i] && length $data[$i] > 255){ my $big_val = $data[$i]; my $truncated_val = substr($data[$i], 0, 254); $self->anno_error(error => "Warning DATA Val truncated\n,ORIGINAL VAL SIZE:". length($big_val). "'$big_val'\nNEW VAL SIZE:" . length($truncated_val) . "'$truncated_val'"); #print "VAL '$record_h{$key}'\n" $data[$i] = $truncated_val; } } return @data; } } ############################################################################### #check_previous_annotation_set #Check to see if annotation set already exists in the db #if not upload a record into affy_annotation_set and return the PK #if yes delete old data if user wants too REQUIRES USER INPUT ############################################################################### sub check_previous_annotation_set { my $method = 'check_previous_annotation_set'; my $self = shift; my %args = @_; my $slide_type_id = $args{slide_type_id}; my $genome_version = $args{genome_version}; my $annotation_date = $args{annotation_date}; my $sql = qq~ SELECT affy_annotation_set_id FROM $TBMA_AFFY_ANNOTATION_SET WHERE annotation_date = '$annotation_date' AND slide_type_id = $slide_type_id ~; my @rows = $sbeams->selectOneColumn($sql); if ( scalar(@rows) && $rows[0] =~ /^\d/ ) { #Record exists so ask the user if previous data should be deleted # if they say they wanna delete, accept the default since they can still bail. my $answer = ( $self->run_mode() =~ /delete/i ) ? 'Y' : ''; QUESTION:{ unless ( $answer ) { print "\n\n\n********* WARNING THIS AFFYMETRIX ANNOTATION FILE HAS ALREADY BEEN UPLOADED*****\n" . "re-uploading it will delete the previous version, are you sure you want to delete the annotation for\n" . "'$genome_version - $annotation_date', annotation_set_id '$rows[0]'\n" . "Enter Y or [N]:"; $answer = ; chomp $answer; # Did they just hit enter (default)? $answer = 'N' if $answer eq ''; } if ($answer =~ /^[nN]/){ print "OK I WILL NOT DELETE ANYTHING\n"; die "Data already exists in the database and you do not want to updated it so there is nothing to do\n"; }elsif($answer =~ /^[Yy]/) { print "OK I WILL DELETE ALL THE ANNOTATION IN 5 secs... LAST CHANCE push crtl-c TO ABORT....\n"; sleep 5; if ($self->verbose() > 0){ print "AFFY ANNOTATION ID '$rows[0]'\n"; } $self->delete_affy_annotation_data(affy_annotation_set_id => $rows[0]); }else{ print "Sorry I do not understand your answer, Type Y or N\n"; $answer = ''; QUESTION:redo; } } if ($self->run_mode() =~ /delete/i){ #if we are in delete mode return the annotation_set_id print "RETRUN ANNOTATION SET ID FROM '$method' ID = '$rows[0]'\n" if ( $self->verbose() ); return $rows[0]; } $self->check_previous_annotation_set( slide_type_id => $slide_type_id, #Now that the old date is gone redo the method which will insert a new record genome_version => $genome_version, annotation_date => $annotation_date, ); }else{ #record does not exists need to insert new annotation_set record my $rowdata_ref = { annotation_date => $annotation_date, slide_type_id => $slide_type_id, genome_version => $genome_version, }; my $affy_annotation_set_id = $sbeams->updateOrInsertRow( table_name=>$TBMA_AFFY_ANNOTATION_SET, rowdata_ref=>$rowdata_ref, return_PK=>1, verbose=>$self->verbose(), testonly=>$self->testonly(), insert=>1, PK=>'affy_annotation_set_id', add_audit_parameters=>1, ); ##DEBUG print "ADDING ANNOTATION RECORD '$affy_annotation_set_id'\n"; return $affy_annotation_set_id } } ############################################################################### #delete_affy_annotation_data #give annotation set id #delete all the annotation from all the annotation tables ############################################################################### sub delete_affy_annotation_data { my $method = 'delete_affy_annotation_data'; my $self = shift; my %args = @_; my $affy_annotation_set_id = $args{affy_annotation_set_id}; print "ID '$affy_annotation_set_id'\n"; confess(__PACKAGE__ . "::$method Need to provide key value pair 'affy_annotation_set_id' => YOU GAVE '$affy_annotation_set_id'") unless ($affy_annotation_set_id =~ /^\d/); my %table_child_relationship = (); my $table_name = ''; %table_child_relationship = ( #table relationship needed by deleteRecordsAndChildren affy_annotation_set => 'affy_annotation(C)', #Below it will utilize this hash to delete the affy_array rows then loop back and take out the sample rows #add more PKLC tables here affy_annotation => "protein_domain(PKLC),interpro(PKLC),protein_families(PKLC),gene_ontology(PKLC),alignment(PKLC),trans_membrane(PKLC),overlapping_transcript(PKLC),affy_db_links(PKLC)", trans_membrane => ("trans_membrane_domain(C)"), ); #DELETE THE annotation_set record and delete all the childern in affy_annotation and it's PKLC childeren if ($self->verbose() > 0){ print "Affy annotation set ID TO DELETE'$affy_annotation_set_id'\n"; } print "Starting to delete Old data\n"; my $result = $sbeams->deleteRecordsAndChildren( table_name => "affy_annotation_set", table_child_relationship => \%table_child_relationship, delete_PKs => [ $affy_annotation_set_id ], delete_batch => 1000, database => $self->database(), verbose => $self->verbose(), testonly => $self->testonly(), ); print "Done deleting old records\n"; } ############################################################################### #find_annotation_ids #Given a FK to affy_annotation_set collect all the annotation_id #Return array of PK from affy_annotation ############################################################################### sub find_annotation_ids { my $method = 'find_annotation_ids'; my $self = shift; my %args = @_; my $FK = $args{affy_annotation_set_id}; confess(__PACKAGE__ . "::$method Need to provide key value pair 'error'") unless ($FK); my $sql = qq~ SELECT affy_annotation_id FROM $TBMA_AFFY_ANNOTATION WHERE affy_annotation_set_id = $FK ~; my @rows = $sbeams->selectOneColumn($sql); if ($self->verbose() > 0){ print "ANNOTATION KEYS '@rows'\n"; } return @rows; } ############################################################################### #get_annotation_sql #Given an affy_probe_set_id and an annotation_set_id get a sql statment to pull out all the information we know #about this particular probe #Return a sql string ############################################################################### sub get_annotation_sql { my $self = shift; my %args = @_; my $sql = qq~ SELECT anno_set.genome_version AS "Genome_Version", anno_set.annotation_date AS "Annotation_Date", st.name AS "Affy_Chip", anno.affy_annotation_id, anno.probe_set_id AS "probe_set_id", anno.gene_title AS "gene_title", anno.gene_symbol AS "gene_symbol", anno.sequence_type, anno.sequence_source, anno.transcript_id, anno.target_description_feature, anno.target_description, anno.target_description_note, anno.representative_public_id, anno.archival_unigene_cluster, anno.chromosomal_location, anno.pathway, anno.qtl, anno.annotation_description, anno.transcript_assignment, anno.annotation_transcript_cluster, anno.annotation_notes FROM $TBMA_AFFY_ANNOTATION anno JOIN $TBMA_AFFY_ANNOTATION_SET anno_set ON (anno.affy_annotation_set_id = anno_set.affy_annotation_set_id) JOIN $TBMA_SLIDE_TYPE st ON (st.slide_type_id = anno_set.slide_type_id) WHERE 1=1 ~; return $sql; } ############################################################################### #get_dbxref_accessor_urls #Give nothing #return results as a hash example 16 => LocusLink__http://www.ncbi.nlm.nih.gov/LocusLink/ #dbxref_PK dbref_tag 2 underscores __ accessor url ############################################################################### sub get_dbxref_accessor_urls { my $self = shift; return $sbeams->selectTwoColumnHash( <<" END" ); SELECT dbxref_id , dbxref_tag || '__' || accessor FROM $TB_DBXREF END } ############################################################################### #get_protein_family_info #Give affy_annotation_id #return results as an array of hrefs ############################################################################### sub get_protein_family_info { my $self = shift; my $affy_annotation_id = shift; my $sql = qq~ SELECT pf.description, pf.e_value, db.dbxref_id, db.db_id FROM $TBMA_PROTEIN_FAMILIES pf JOIN $TBMA_AFFY_DB_LINKS db ON (pf.affy_db_links_id = db.affy_db_links_id) WHERE pf.affy_annotation_id = $affy_annotation_id ~; #$sbeams->display_sql(sql=>$sql); return $sbeams->selectHashArray($sql) } ############################################################################### #get_protein_domain_info #Give affy_annotation_id #return results as an array of hrefs ############################################################################### sub get_protein_domain_info { my $self = shift; my $affy_annotation_id = shift; my $sql = qq~ SELECT pd.protein_domain_description, db.dbxref_id, db.db_id FROM $TBMA_PROTEIN_DOMAIN pd JOIN $TBMA_AFFY_DB_LINKS db ON (pd.affy_db_links_id = db.affy_db_links_id) WHERE pd.affy_annotation_id = $affy_annotation_id ~; #$sbeams->display_sql(sql=>$sql); return $sbeams->selectHashArray($sql) } ############################################################################### #get_interpro_info #Give affy_annotation_id #return results as an array of hrefs ############################################################################### sub get_interpro_info { my $self = shift; my $affy_annotation_id = shift; my $sql = qq~ SELECT ip.interpro_description, db.dbxref_id, db.db_id FROM $TBMA_INTERPRO ip JOIN $TBMA_AFFY_DB_LINKS db ON (ip.affy_db_links_id = db.affy_db_links_id) WHERE ip.affy_annotation_id = $affy_annotation_id ~; #$sbeams->display_sql(sql=>$sql); return $sbeams->selectHashArray($sql) } ############################################################################### #get_transmembrane_info # #Give a affy_annotaion_id #Return number of transmembrane domains ############################################################################### sub get_transmembrane_info { my $self = shift; my $anno_id = shift; my $sql = qq~SELECT number_of_domains, protein_accession_numb FROM $TBMA_TRANS_MEMBRANE WHERE affy_annotation_id = $anno_id ~; return $sbeams->selectHashArray($sql) } ############################################################################### #get_alignment_info # #Give a affy_annotaion_id #Return all the alignment info as an array of hrefs ############################################################################### sub get_alignment_info { my $self = shift; my $anno_id = shift; my $sql = qq~ SELECT a.gene_start, a.gene_stop, a.percent_identity, a.gene_orientation,a.match_chromosome FROM $TBMA_ALIGNMENT a WHERE a.affy_annotation_id = $anno_id ~; return $sbeams->selectHashArray($sql); } ############################################################################### #get_go_info # #Give a affy_annotaion_id #Return all the go info as an array of hrefs ############################################################################### sub get_go_info { my $self = shift; my $anno_id = shift; my $sql = qq~SELECT gt.gene_ontology_name_type, go.gene_ontology_description, go.gene_ontology_evidence, db.dbxref_id, db.db_id FROM $TBMA_GENE_ONTOLOGY go JOIN $TBMA_GENE_ONTOLOGY_TYPE gt ON (go.gene_ontology_type_id = gt.gene_ontology_type_id) JOIN $TBMA_AFFY_DB_LINKS db ON(go.affy_db_links_id = db.affy_db_links_id) WHERE go.affy_annotation_id = $anno_id ORDER BY gt.gene_ontology_type_id ~; #print STDERR ($sql); return $sbeams->selectHashArray($sql); } ############################################################################### #get_db_acc_numbers # #Give a affy_annotaion_id #retrun all ids as a hash of db_id => dbxref_id Example 12345 => 16 ############################################################################### sub get_db_acc_numbers { my $self = shift; my $anno_id = shift; return $sbeams->selectTwoColumnHash(" SELECT db_id, dbxref_id FROM $TBMA_AFFY_DB_LINKS WHERE affy_annotation_id = $anno_id" ); } ############################################################################### # anno_error ############################################################################### sub anno_error { my $method = 'anno_error'; my $self = shift; my %args = @_; if (exists $args{error} ){ if ($self->verbose() > 0){ print "$args{error}\n"; } return $self-> {ERROR} .= "\n$args{error}"; #might be more then one error so append on new errors }else{ $self->{ERROR}; } } #+ # Affy annotations can have multiple id's in the same field seperated with /// # Truncate string to first value and return. #- sub clean_id{ my $self = shift; my $val = shift; $val =~ s!///.*!!; return $val; } #+ # Logic for turning affy annotation values into gene_expression canonical_name #- sub getCanonicalName { my $self = shift; my %args = @_; my $canonical = 'undefined'; # Refseq Prot Id should start with N/X . P_ddddd if ( $args{refseq} && $args{refseq} =~ /^\s*[NX]P_\d+/ ) { $canonical = $args{refseq}; $canonical =~ s/^\s*(.*)\.\d+\s*$/$1/; } elsif ( $args{gene_id} && $args{gene_id} =~ /^\s*\d+\s*$/ ) { $canonical = $args{gene_id}; } elsif ( $args{public} ) { $canonical = $args{public}; } return $canonical; } }#closing bracket for the package 1;