package SBEAMS::Biomarker::ParseSampleWorkbook; ############################################################################### # # Description : Library code for parsing sample workbook files # Copywrite 2005 ############################################################################### use Spreadsheet::ParseExcel::Simple; use Spreadsheet::ParseExcel; use SBEAMS::Connection; use strict; #### Set up new variables use vars qw(@ISA @EXPORT); require Exporter; @ISA = qw (Exporter); @EXPORT = qw ( ); sub new { my $class = shift; my $this = { verbose => 0, @_ }; bless $this, $class; return $this; } #+ # #- sub parseFile { my $this = shift; my %args = @_; # Check for required params for ( qw( type wbook ) ) { die "Missing required parameter $_" unless $args{$_}; } # Make sure workbook exists and is readable unless( -e $args{wbook} && -r $args{wbook} ) { die "Workbook file, $args{wbook}, unreadable or does not exist"; } # Call appropriate parsing method return ( $args{type} eq 'tsv' ) ? $this->_parseTabFile( %args ) : $this->_parseExcelFile( %args ); } # End parseFile #+ # # - sub process_data { my $this = shift; my ( @sou_attr, @sam_attr, @dis ); my $idx = $this->getHeadingsIndex(); # Array to hold list of separated objects my @items; # Get headers for various sub-groups my @tags = qw( biosample biosample_attr prep_params ms_params analysis_params biosource biosource_attr biosource_disease biosource_disease_stage tissue_type ); # Get headers for all columns my %head; for my $tag ( @tags ) { $head{$tag} = $this->_getDb2ParserMap($tag); } # accumulator for processed lines my @all_items; # print "Saw a total of " . scalar @{$this->{_data}} . " lines\n"; # Each row represents one sample foreach my $row ( @{$this->{_data}} ) { # hash representing item (line in file) my %item; # For each of the categories (tables) for my $tag ( @tags ) { my %subitem; # Confused yet? # The item hash is keyed by tags, i.e. biosource, biosample, etc. # Each of these points at an anonymous hashref. # The referenced hash holds the actual spreadsheet values, keyed by db # column name (keys of %{$head{$tag}} rather than by the spreadsheet # column header, which are the values of %{$head{$tag}} # @{$item{$tag}}{keys(%{$head{$tag}})} = @$row[@$idx{values(%{$head{$tag}})}]; # Had to rewrite as a loop to avoid undefined cols getting $row[0]. foreach my $key ( keys(%{$head{$tag}}) ) { $subitem{$key} = ( !defined( $idx->{${$head{$tag}}{$key}} ) ? undef : $row->[$idx->{${$head{$tag}}{$key}}] ); # print "looping items: $key, $head{$tag}->{$key}\n"; } $item{$tag} = \%subitem; } push @all_items, \%item; } # We have them all, cache and return. $this->{processed_items} = \@all_items; } # End process_data #+ # Returns reference to hash of column index values keyed by column heading # - sub getHeadingsIndex { my $this = shift; return $this->{_headidx} || {}; } #+ # Returns reference to array of arrayrefs which each represent worksheet row # - sub getDataRef { my $this = shift; return $this->{_data} || []; } sub getProcessedItems { my $this = shift; return $this->{processed_items} || undef; } #+ # cache sbeams object #- sub set_sbeams { my $this = shift; my $sbeams = shift || die "Must pass sbeams object"; $this->{_sbeams} = $sbeams; } #+ # Fetch cached sbeams object #- sub get_sbeams { my $this = shift; return $this->{_sbeams}; } #+ # Set the 'verbose' level for parser # named argument verbose (integer) #- sub setVerbosity { my $this = shift; my %args = @_; $this->{verbose} = $args{verbose} || 0; } # # # ####################### # # # # # ## Private methods ## # # # ################### #+ # Fetch the 'verbose' level for parser #- sub _verbose { my $this = shift; return $this->{verbose}; } #+ # Rudimentry method to avoid dangerous SQL stmts #- sub _safeSQL { my $this = shift; my $value = shift; return ( $value =~ /DELETE|DROP|UPDATE|INSERT|ALTER/gi ); } #+ # Based on file format as of 2005-08-01 #- sub _parseTabFile { my $this = shift; my %args = @_; # Array of column headings my @headings; # column heading indicies my %idx; my %seen; open( WBOOK, "<$args{wbook}" ) || die "unable to open $args{wbook}"; $this->{_wbook} = $args{wbook}; die; while( my $line = ){ chomp $line; next if $line =~ /^\s*$/; my @line = split( "\t", $line, -1 ); unless( @headings ) { # Meaningful headings fingerprint? next unless ( $line[0] =~ /Sample Setup Order/ && $line[1] =~ /MS Sample Run Number/ ); for my $h ( @line ) { $h =~ s/\s+$//g; push @headings, $h; } my $i = 0; foreach (@headings) { $idx{$_} = $i++; } $this->{_headidx} = \%idx; next; } push @{$this->{_data}}, \@line; next; } return; } # #+ # Based on file format(s) as of 2005-08-25 #- sub _parseExcelFile { my $this = shift; my %args = @_; # Messages to return to caller about the parse my @msg; my $xls = Spreadsheet::ParseExcel::Simple->read( $args{wbook} ) || die 'Unable to parse workbook'; $this->{_wbook} = $args{wbook}; # Array of column headings my @headings; # column heading indicies my %idx; my %seen; my $active_sheet; my $cnt = 1; foreach my $sheet ( $xls->sheets() ) { $active_sheet ||= $sheet; $cnt++; } push @msg, 'workbook has multiple sheets, using only the first' if $cnt > 1; while ( $active_sheet->has_data() ) { my @line = $active_sheet->next_row(); # Sweet, the excel parser loves blank lines. Doh! if ( !$line[0] && !$line[1] && !$line[2] && !$line[3] && !$line[4] && !$line[5] && !$line[6] && !$line[7] && !$line[8] && !$line[9] ) { next; } unless( @headings ) { # Meaningful headings fingerprint? next unless ( $line[0] =~ /Sample Setup Order/ && $line[1] =~ /MS Sample Run Number/ ); for my $h ( @line ) { # $h =~ s/\s+$//g; # $h =~ s/^\s+//g; push @headings, $h; } my $i = 0; foreach (@headings) { $idx{$_} = $i++; } $this->{_headidx} = \%idx; next; } push @{$this->{_data}}, \@line; next; } push @msg, 'Unable to find heading signature' unless @headings; push @msg, 'No data found in first sheet' unless @{$this->{_data}}; return \@msg; } #+ # returns reversed hash; dangerous as always! #- sub _getDb2ParserMap { my $this = shift; my $mode = shift || 'all'; my $map = $this->_getParser2DbMap( $mode ); my %reverse = reverse( %$map ); for( keys( %reverse ) ) { #print "$_ => $reverse{$_}\n"; } return \%reverse; } sub _getParser2DbMap { my $this = shift; my $mode = shift || 'all'; # Biosource my %biosource = ( 'ISB sample ID' => 'biosource_name', # to be used as biosample_name also? 'Patient_id' => 'patient_id', 'External Sample ID' => 'external_id', 'Name of Institute' => 'organization_id', # Do ID lookup 'Name of Investigators' => 'investigators', # need to add 'Sample type ' => 'tissue_type_id', 'species' => 'organism_id', # Do ID lookup 'age' => 'age', # will have to split 'gender' => 'gender', # need to add ); return \%biosource if $mode eq 'biosource'; # Biosource Attributes my %biosource_attr = ( 'PARAM:time of sample collection' => 'collection_time', # type => PARAM, name => collection_time, value => value 'PARAM:meal' => 'meal', # 'PARAM:alcohole' => 'alcohol', 'PARAM:alcohol' => 'alcohol', # 'PARAM:smoke' => 'smoke', 'PARAM:Date of Sample Collection' => 'collection_date', 'PARAM:Others' => 'other', 'PARAM:Strain' => 'strain', 'Cat_who_Dx' => 'Diagnosis category', 'others' => 'other', # type => general, name => others, value => value # 'Plate Layout' => 'plate_layout', 'Study Histology' => 'study_histology', 'Disease Info: Group' => 'group_disease', ); return \%biosource_attr if $mode eq 'biosource_attr'; # biosource_disease.disease_stage my %biosource_disease_stage = ( 'Disease Stage' => 'disease_stage', # temp for diabetes data # 'Cat_who_Dx' => 'disease_stage', ); return \%biosource_disease_stage if $mode eq 'biosource_disease_stage'; # Diseases my %biosource_disease = ( 'Disease:Breast cancer' => 'breast_cancer', # disease_type => cancer, disease_name => breast_cancer, value => value 'Disease:Ovarian cancer' => 'ovarian_cancer', # source disease 'Disease:Prostate cancer' => 'prostate_cancer', # source disease 'Disease:Blader Cancer' => 'bladder_cancer', # source disease 'Disease:Skin cancer' => 'skin_cancer', # source disease 'Disease:Lung cancer' => 'lung_cancer', # source disease "Disease: Huntington's Disease" => 'huntingtons_disease', # source disease 'diabetic' => 'diabetic', 'Disease:other cancers' => 'other_cancers', ); return \%biosource_disease if $mode eq 'biosource_disease'; # Biosample my %biosample = ( # 'ISB sample ID' => 'biosample_name', # to be used as biosample_name also? 'Location of orginal sample' => 'storage_location_id', 'amount of sample received' => 'original_volume', 'amount of sample received' => 'original_volume', 'Plate Layout' => 'well_id' ); return \%biosample if $mode eq 'biosample'; # Biosample Attributes my %biosample_attr = ( 'Sample Setup Order' => 'sample_setup',# biosample_attribute, type is sample_order name is sample_setup_order, value is value 'MS Sample Run Number' => 'ms_run_number', ); return \%biosample_attr if $mode eq 'biosample_attr'; # Tissue types my %tissue_type = ( 'heart' => 'heart', 'blood' => 'blood', 'liver' => 'liver', 'neuron' => 'neuron', 'lung' => 'lung', 'bone' => 'bone' ); return \%tissue_type if $mode eq 'tissue_type'; # Prep stuff, samples -> treatment my %prep_params = ( 'Prep Replicate id' => 'xxxx', # biosample_attribute, type is sample_order name is prep_replicate_id, value is value 'Sample Prep Name' => 'xxxx', # treatement_name =>> deprecated, still a possibility??? # biogroup.bio_group_name, group type is sample_prep 'status of sample prep' => 'xxxx', # treatement.status 'date of finishing prep' => 'xxxx', # treatment.date_completed 'amount of sample used in prep' => 'xxxx', # input_volume 'Sample prep method' => 'xxxx', # protocol 'person prepared the samples' => 'xxxx', # treatment.processed_by 'Volume of re-suspended sample' => 'xxxx', # New sample.original_volume 'location of finished sample prep' => 'xxxx', # New sample.storage_location ); return \%prep_params if $mode eq 'prep_params'; # MS stuff, all downstream! my %ms_params = ( 'MS Replicate Number' => 'xxxx', 'MS Run Name' => 'xxxx', 'status of MS' => 'xxxx', 'date finishing MS' => 'xxxx', 'Random Sample Run order' => 'xxxx', 'order of samples ran per day' => 'xxxx', 'MS run protocol' => 'xxxx', 'Volume Injected' => 'xxxx', ); return \%ms_params if $mode eq 'ms_params'; # analysis stuff, all downstream! my %analysis_params = ( 'location of data' => 'xxxx', 'status of Conversion' => 'xxxx', 'Date finishing conversion' => 'xxxx', 'name of raw files' => 'xxxx', 'location of raw files' => 'xxxx', 'name of mzXML' => 'xxxx', 'location of mzXML' => 'xxxx', 'person for MS analysis' => 'xxxx', 'date finishing alignment' => 'xxxx', 'location of alignment files' => 'xxxx', 'person for data analysis' => 'xxxx', 'peplist peptide peaks file location' => 'xxxx' ); return \%analysis_params if $mode eq 'analysis_params'; my %map = ( %biosource, %biosource_attr, %biosource_disease, %biosource_disease_stage, %biosample, %biosample_attr, %tissue_type, %prep_params, %ms_params, %analysis_params ); return \%map if $mode eq 'all'; die "Unknown mode $mode, try again"; } 1; __DATA__ Biosource Biosample Attribute Disease Orphan Treatment Storage_location Analysis_file 'Sample Setup Order' => xxxx 'MS Sample Run Number' => xxxx 'Plate Layout' => xxxx 'ISB sample ID' => xxxx 'Patient_id' => xxxx 'External Sample ID' => xxxx 'Name of Institute' => xxxx 'Name of Investigators' => xxxx 'Sample type' => xxxx 'species' => xxxx 'age' => xxxx 'gender' => xxxx 'amount of sample received' => xxxx 'Location of orginal sample' => xxxx 'PARAM:time of sample collection' => xxxx 'PARAM:meal' => xxxx 'PARAM:alcohole' => xxxx 'PARAM:smoke' => xxxx 'PARAM:Date of Sample Collection' => xxxx 'PARAM:Others' => xxxx 'Disease:Breast cancer' => xxxx 'Disease:Ovarian cancer' => xxxx 'Disease:Prostate cancer' => xxxx 'Disease:Blader Cancer' => xxxx 'Disease:Skin cancer' => xxxx 'Disease:Lung cancer' => xxxx 'Disease: Huntington's Disease' => xxxx 'Disease:other cancers' => xxxx 'heart' => xxxx 'blood' => xxxx 'liver' => xxxx 'diabetic' => xxxx 'neuron' => xxxx 'lung' => xxxx 'bone' => xxxx 'Disease Stage' => xxxx 'others' => xxxx 'Study Histology' => xxxx 'Disease Info: Group' => xxxx 'Prep Replicate id' => xxxx 'Sample Prep Name' => xxxx 'status of sample prep' => xxxx 'date of finishing prep' => xxxx 'amount of sample used in prep' => xxxx 'Sample prep method' => xxxx 'person prepared the samples' => xxxx 'Volume of re-suspended sample' => xxxx 'location of finished sample prep' => xxxx 'MS Replicate Number' => xxxx 'MS Run Name' => xxxx 'status of MS' => xxxx 'date finishing MS' => xxxx 'Random Sample Run order' => xxxx 'order of samples ran per day' => xxxx 'MS run protocol' => xxxx 'Volume Injected' => xxxx 'location of data' => xxxx 'status of Conversion' => xxxx 'Date finishing conversion' => xxxx 'name of raw files' => xxxx 'location of raw files' => xxxx 'name of mzXML' => xxxx 'location of mzXML' => xxxx 'person for MS analysis' => xxxx 'date finishing alignment' => xxxx 'location of alignment files' => xxxx 'person for data analysis' => xxxx 'peplist peptide peaks file location' => xxxx # Attributes 'Sample Setup Order' # 'MS Sample Run Number' # same 'Name of Investigators' # same 'PARAM:time of sample collection ' # param 'PARAM:meal' # param 'PARAM:alcohole' # param 'PARAM:smoke' # param 'PARAM:Date of Sample Collection' # param 'Study Histology' # anonymous # Bioource 'ISB sample ID' 'Patient_id' 'External Sample ID' 'Name of Institute' 'species' 'age' 'gender' 'Sample type' # Biosample 'amount of sample received' 'Location of orginal sample' # Disease 'Disease:Breast cancer' 'Disease:Ovarian cancer' 'Disease:Prostate cancer' 'Disease:Blader Cancer' 'Disease:Skin cancer' 'Disease:Lung cancer' 'Disease: Huntington\'s Disease' 'diabetic' # tissue_type 'heart' 'blood' 'liver' 'neuron' 'lung' 'bone' biosource_disease 'Disease Stage' #orphan 'Disease Info: Group' 'Prep Replicate id' 'Sample Prep Name' 'status of sample prep' 'date of finishing prep' 'amount of sample used in prep' 'Sample prep method' 'person prepared the samples' 'Volume of re-suspended sample' 'location of finished sample prep' 'MS Replicate Number' 'MS Run Name' 'status of MS' 'date finishing MS' 'Random Sample Run order' 'order of samples ran per day' 'MS run protocol' 'Volume Injected' 'location of data' 'status of Conversion' 'Date finishing conversion' 'name of raw files' 'location of raw files' 'name of mzXML' 'location of mzXML' 'person for MS analysis' 'date finishing alignment' 'location of alignment files' 'person for data analysis' 'peplist peptide peaks file location' # Odd bits, tries to unify column indices between worksheets if ( 0 ) { if ( $_ eq 'ISB sample ID' ){ $i = 4; } elsif ( $_ eq 'PARAM:Others' ) { $i = 20; } elsif ( $_ eq 'Disease:Breast cancer' ) { $i = 21; } elsif ( $_ eq 'Disease:other cancers' ) { $i = 28; } elsif ( $_ eq 'heart' ) { $i = 29; } elsif ( $_ eq 'others' ) { $i = 37; } elsif ( $_ eq 'Study Histology' ) { $i = 38; } elsif ( $_ eq 'Prep Replicate id' ) { $i = 40; } elsif ( $_ eq 'date of finishing prep' ) { $i = 43; } elsif ( $_ eq 'amount of sample used in prep' ) { $i = 44; } }