#!/usr/local/bin/perl -w

###############################################################################
# Program     : load_atlas_build.pl
# Author      : Eric Deutsch <edeutsch@systemsbiology.org>
# $Id$
#
# Description : This script loads a build of the PeptideAtlas into the
#               database from the build process data products files
#
###############################################################################


###############################################################################
   # Generic SBEAMS setup for all the needed modules and objects
###############################################################################
use strict;
use Getopt::Long;
use FindBin;
use lib "$FindBin::Bin/../../perl/SBEAMS/PeptideAtlas";
use PAxmlContentHandler;
use SpectraDescriptionSetParametersParser;
use SearchResultsParametersParser;

use XML::Parser;

use lib "$FindBin::Bin/../../perl";
use vars qw ($sbeams $sbeamsMOD $q $current_username 
             $ATLAS_BUILD_ID %spectra
             $PROG_NAME $USAGE %OPTIONS $QUIET $VERBOSE $DEBUG $TESTONLY
             $TESTVARS $CHECKTABLES
             $sbeamsPROT $SSRCalculator $GlycoPeptideModule
	     $massCalculator
            );


#### Set up SBEAMS core module
use SBEAMS::Connection qw($q);
use SBEAMS::Connection::Settings;
use SBEAMS::Connection::Tables;

use SBEAMS::PeptideAtlas;
use SBEAMS::PeptideAtlas::Settings;
use SBEAMS::PeptideAtlas::Tables;

use SBEAMS::Proteomics;
use SBEAMS::Proteomics::Settings;
use SBEAMS::Proteomics::Tables;

$sbeams = new SBEAMS::Connection;
$sbeamsMOD = new SBEAMS::PeptideAtlas;
$sbeamsMOD->setSBEAMS($sbeams);
$sbeams->setSBEAMS_SUBDIR($SBEAMS_SUBDIR);

$sbeamsPROT = new SBEAMS::Proteomics;
$sbeamsPROT->setSBEAMS($sbeams);

use SBEAMS::Glycopeptide;
$GlycoPeptideModule = new SBEAMS::Glycopeptide();

#### Create and initialize SSRCalc object with 3.0
use lib '/net/db/src/SSRCalc/ssrcalc';
$ENV{SSRCalc} = '/net/db/src/SSRCalc/ssrcalc';
use SSRCalculator;
my $SSRCalculator = new SSRCalculator;
$SSRCalculator->initializeGlobals3();
$SSRCalculator->ReadParmFile3();

use SBEAMS::Proteomics::PeptideMassCalculator;
$massCalculator = new SBEAMS::Proteomics::PeptideMassCalculator;


###############################################################################
# Set program name and usage banner for command like use
###############################################################################
$PROG_NAME = $FindBin::Script;
$USAGE = <<EOU;
Usage: $PROG_NAME [OPTIONS]
Options:
  --verbose n                 Set verbosity level.  default is 0
  --quiet                     Set flag to print nothing at all except errors
  --debug n                   Set debug flag
  --testonly                  If set, rows in the database are not changed or added
  --testvars                  If set, makes sure all vars are filled
  --check_tables              will not insert or update.  does a check of peptide_instance_sample

  --delete                    Delete an atlas build (does not build an atlas).
  --purge                     Delete child records in atlas build (retains parent atlas record).
  --load                      Build an atlas (can be used in conjunction with --purge).

  --organism_abbrev           Abbreviation of organism like Hs

  --atlas_build_name          Name of the atlas build (already entered by hand in
                              the atlas_build table) into which to load the data
  --default_sample_project_id default project_id  needed for auto-creation of tables (best to set
                              default to dev/private access and open access later)

 e.g.:  ./load_atlas_build.pl --atlas_build_name \'Human_P0.9_Ens26_NCBI35\' --organism_abbrev \'Hs\' --purge --load --default_sample_project_id 476

 e.g.: ./load_atlas_build.pl --atlas_build_name \'TestAtlas\' --delete
EOU

#### Process options
unless (GetOptions(\%OPTIONS,"verbose:s","quiet","debug:s","testonly",
        "testvars","delete", "purge", "load", "check_tables",
        "atlas_build_name:s", "organism_abbrev:s", "default_sample_project_id:s"
    )) {

    die "\n$USAGE";

}


$VERBOSE = $OPTIONS{"verbose"} || 0;

$QUIET = $OPTIONS{"quiet"} || 0;

$DEBUG = $OPTIONS{"debug"} || 0;

$TESTONLY = $OPTIONS{"testonly"} || 0;

$TESTVARS = $OPTIONS{"testvars"} || 0;

$CHECKTABLES = $OPTIONS{"check_tables"} || 0;

$TESTONLY = 1 if ($CHECKTABLES);

if ($DEBUG) {

    print "Options settings:\n";

    print "  VERBOSE = $VERBOSE\n";

    print "  QUIET = $QUIET\n";

    print "  DEBUG = $DEBUG\n";

    print "  TESTONLY = $TESTONLY\n";

    print "  TESTVARS = $TESTVARS\n";

    print "  CHECKTABLES = $CHECKTABLES\n";

}
   
   
###############################################################################
# Set Global Variables and execute main()
###############################################################################

main();

exit(0);

###############################################################################
# Main Program:
#
# Call $sbeams->Authenticate() and exit if it fails or continue if it works.
###############################################################################
sub main {

  #### Do the SBEAMS authentication and exit if a username is not returned
  exit unless (
      $current_username = $sbeams->Authenticate(work_group=>'PeptideAtlas_admin')
  );


  $sbeams->printPageHeader() unless ($QUIET);

  handleRequest();

  $sbeams->printPageFooter() unless ($QUIET);

} # end main


###############################################################################
# handleRequest
###############################################################################
sub handleRequest {

  my %args = @_;

  ##### PROCESS COMMAND LINE OPTIONS AND RESTRICTIONS #####

  #### Set the command-line options
  my $del = $OPTIONS{"delete"} || '';

  my $purge = $OPTIONS{"purge"} || '';

  my $load = $OPTIONS{"load"} || '';

  my $organism_abbrev = $OPTIONS{"organism_abbrev"} || '';

  my $atlas_build_name = $OPTIONS{"atlas_build_name"} || '';

  my $default_sample_project_id = $OPTIONS{"default_sample_project_id"} || '';


  #### Verify required parameters
  unless ($atlas_build_name) {
    print "\nERROR: You must specify an --atlas_build_name\n\n";
    die "\n$USAGE";
  }


  ## --delete with --load will not work
  if ($del && $load) {
      print "ERROR: --delete --load will not work.\n";
      print "  use: --purge --load instead\n\n";
      die "\n$USAGE";
      exit;
  }


  ## --delete with --purge will not work
  if ($del && $purge) {
      print "ERROR: select --delete or --purge, but not both\n";
      print "$USAGE";
      exit;
  }


  #### If there are any unresolved parameters, exit
  if ($ARGV[0]){
    print "ERROR: Unresolved command line parameter '$ARGV[0]'.\n";
    print "$USAGE";
    exit;
  }


  ## get ATLAS_BUILD_ID:
  $ATLAS_BUILD_ID = get_atlas_build_id(atlas_build_name=>$atlas_build_name);


  ## handle --purge:
  if ($purge) {

       print "Removing child records in $atlas_build_name ($ATLAS_BUILD_ID): \n";

       removeAtlas(atlas_build_id => $ATLAS_BUILD_ID,
           keep_parent_record => 1);

  }#end --purge


  ## handle --load:
  if ($load) {

      loadAtlas( atlas_build_id=>$ATLAS_BUILD_ID,
          organism_abbrev => $organism_abbrev,
          default_sample_project_id => $default_sample_project_id,
      );

      populateSampleRecordsWithSampleAccession();

  } ## end --load


  #### Print out the header
  unless ($QUIET) {
    $sbeams->printUserContext();
    print "\n";
  }


  ## handle --delete:
  if ($del) {

     print "Removing atlas $atlas_build_name ($ATLAS_BUILD_ID) \n";

     removeAtlas(atlas_build_id => $ATLAS_BUILD_ID);

  }#end --delete


  if ($TESTONLY || $purge) {
      print "\a done\n";
  }


} # end handleRequest


###############################################################################
# get_atlas_build_id  --  get atlas build id
# @param atlas_build_name
# @return atlas_build_id
###############################################################################
sub get_atlas_build_id {

    my %args = @_;

    my $name = $args{atlas_build_name} or die "need atlas build name($!)";

    my $id;

    my $sql = qq~
        SELECT atlas_build_id
        FROM $TBAT_ATLAS_BUILD
        WHERE atlas_build_name = '$name'
        AND record_status != 'D'
    ~;

    ($id) = $sbeams->selectOneColumn($sql) or
        die "\nERROR: Unable to find the atlas_build_name ". 
        $name." with $sql\n\n";

    return $id;

}


###############################################################################
# get_atlas_build_directory  --  get atlas build directory
# @param atlas_build_id
# @return atlas_build:data_path
###############################################################################
sub get_atlas_build_directory
{
    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die "need atlas build id($!)";

    my $path;

    my $sql = qq~
        SELECT data_path
        FROM $TBAT_ATLAS_BUILD
        WHERE atlas_build_id = '$atlas_build_id'
        AND record_status != 'D'
    ~;

    ($path) = $sbeams->selectOneColumn($sql) or
        die "\nERROR: Unable to find the data_path in atlas_build record". 
        " with $sql\n\n";

    ## get the global variable PeptideAtlas_PIPELINE_DIRECTORY
    my $pipeline_dir = $CONFIG_SETTING{PeptideAtlas_PIPELINE_DIRECTORY};

    $path = "$pipeline_dir/$path";

    ## check that path exists
    unless ( -e $path) 
    {
        die "\n Can't find path $path in file system.  Please check ".
        " the record for atlas_build with atlas_build_id=$atlas_build_id";

    }

    return $path;

}

###############################################################################
# get_biosequence_set_id  --  get biosequence_set_id
# @param atlas_build_id
# @return atlas_build:biosequence_set_id
###############################################################################
sub get_biosequence_set_id
{

    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die "need atlas build id($!)";

    my $b_id;

    my $sql = qq~
        SELECT biosequence_set_id
        FROM $TBAT_ATLAS_BUILD
        WHERE atlas_build_id = '$atlas_build_id'
        AND record_status != 'D'
    ~;

    ($b_id) = $sbeams->selectOneColumn($sql) or
        die "\nERROR: Unable to find the biosequence_set_id in atlas_build record". 
        " with $sql\n\n";

    return $b_id;

}


###############################################################################
# removeAtlas -- removes atlas build records
#
#   has option --keep_parent to keep parent record (purge uses this feature)
###############################################################################
sub removeAtlas {
   my %args = @_;
   my $atlas_build_id = $args{'atlas_build_id'};
   my $keep_parent_record = $args{'keep_parent_record'} || 0;

   my $database_name = $DBPREFIX{PeptideAtlas};
   my $table_name = "atlas_build";
   my $full_table_name = "$database_name$table_name";

   my %table_child_relationship = (
      atlas_build => 'peptide_instance(C),atlas_build_sample(C),atlas_build_search_batch(C),spectra_description_set(C)',
      peptide_instance => 'peptide_mapping(C),peptide_instance_sample(C),peptide_instance_search_batch(C),modified_peptide_instance(C)',
      modified_peptide_instance => 'modified_peptide_instance_sample(C),modified_peptide_instance_search_batch(C)',
   );

   #my $TESTONLY = "0";
   my $VERBOSE = "1" unless ($VERBOSE);

   if ($keep_parent_record) {
      my $result = $sbeams->deleteRecordsAndChildren(
         table_name => 'atlas_build',
         table_child_relationship => \%table_child_relationship,
         delete_PKs => [ $atlas_build_id ],
         delete_batch => 1000,
         database => $database_name,
         verbose => $VERBOSE,
         testonly => $TESTONLY,
         keep_parent_record => 1,
      );
   } else {
      my $result = $sbeams->deleteRecordsAndChildren(
         table_name => 'atlas_build',
         table_child_relationship => \%table_child_relationship,
         delete_PKs => [ $atlas_build_id ],
         delete_batch => 1000,
         database => $database_name,
         verbose => $VERBOSE,
         testonly => $TESTONLY,
      );
   }
} # end removeAtlas


###############################################################################
# loadAtlas -- load an atlas
# @param atlas_build_id
# @param organism_abbrev
# @param default_sample_project_id  project_id to be used when creating new samples
###############################################################################
sub loadAtlas {

    my %args = @_;

    my $atlas_build_id = $args{'atlas_build_id'} or die
        " need atlas_build_id";

    my $organism_abbrev = $args{'organism_abbrev'} or die
        " need organism_abbrev";

    my $default_sample_project_id = $args{'default_sample_project_id'} or
       die "need default_sample_project_id";


    {
        ## check if atlas has peptide_instance entries (checking for 1 entry):
        my $sql =qq~
            SELECT *
            FROM $TBAT_PEPTIDE_INSTANCE
            WHERE atlas_build_id = '$atlas_build_id'
            ~;
    
        print "Checking whether peptide_instance records exist for atlas...\n"
            if ($TESTONLY);
    
        my @peptide_instance_array = $sbeams->selectOneColumn($sql);
    
        ## if has entries, tell user...atlas_build_name might be a user error
        if (@peptide_instance_array && $TESTONLY == 0) { 
            print "ERROR: Records already exist in atlas $\atlas_build name\n";
            print "To purge existing records and load new records\n";
            print "  use: --purge --load \n";
            print "$USAGE";
            return;
        }

    }


    my $builds_directory = get_atlas_build_directory (atlas_build_id =>
        $atlas_build_id);

    $builds_directory = "$builds_directory/";

    my $biosequence_set_id = get_biosequence_set_id (atlas_build_id =>
        $atlas_build_id);


    ## build atlas:
    print "Building atlas $atlas_build_id: \n";
  
    buildAtlas(atlas_build_id => $atlas_build_id,
               biosequence_set_id => $biosequence_set_id,
               source_dir => $builds_directory,
               default_sample_project_id => $default_sample_project_id,
               organism_abbrev => $organism_abbrev,
    );


    print "\nFinished loading atlas \n";

}


###############################################################################
# buildAtlas -- populates PeptideAtlas records in requested atlas_build
# @param atlas_build_id 
# @param default_sample_project_id
# @param biosequence_set_id
# @param source_dir
# @param organism_abbrev
###############################################################################
sub buildAtlas {

    print "building atlas...\n" if ($TESTONLY);

    my %args = @_;

    my $atlas_build_id = $args{'atlas_build_id'} or die "need atlas_build_id ($!)";

    my $default_sample_project_id = $args{'default_sample_project_id'} or die
        "need default_sample_project_id ($!)";

    my $biosequence_set_id = $args{'biosequence_set_id'} or 
        die "need biosequence_set_id ($!)";

    my $source_dir = $args{'source_dir'} or 
        die "need source directory containing coordinate_mapping.txt etc. ($!)";

    my $organism_abbrev = $args{'organism_abbrev'} or 
        die "need organism_abbrev ($!)";
 

    ## hash with key = search_batch_id, value = $search_dir
    my %loading_sbid_searchdir_hash = getInfoFromExperimentsList(
        infile => "$source_dir../Experiments.list" );

    ## the content handler for loadFromPAxmlFile is using search_batch_id's 
    ## from proteomics so we need to send it hash to look up atlas_search_batch_ids
    ## and sample_ids when needed.  
    ## note, this method creates sample and *search_batch records when necessary
    my %proteomicsSBID_hash =  get_search_batch_and_sample_id_hash(
        atlas_build_id => $atlas_build_id,
        default_sample_project_id => $default_sample_project_id,
        loading_sbid_searchdir_hash_ref => \%loading_sbid_searchdir_hash,
        source_dir => $source_dir,
    );

        
    #### Load from .PAxml file
    my $PAxmlfile = $source_dir . "APD_" . $organism_abbrev . "_all.PAxml";

    if (-e $PAxmlfile) 
    {
        loadFromPAxmlFile(
            infile => $PAxmlfile,
            sbid_asbid_sid_hash_ref => \%proteomicsSBID_hash,
            atlas_build_id => $ATLAS_BUILD_ID,
        );

    } else 
    {
        die("ERROR: Unable to find '$PAxmlfile' to load data from.");
    }


    ## set infile to coordinate mapping file
    my $mapping_file = "$source_dir/coordinate_mapping.txt";

    print "\n Begin calc coordinates \n";

    #### Update the build data already loaded with genomic coordinates
    readCoords_updateRecords_calcAttributes(
        infile => $mapping_file,
        atlas_build_id => $ATLAS_BUILD_ID,
        biosequence_set_id => $biosequence_set_id,
        organism_abbrev => $organism_abbrev
    );


}# end buildAtlas


###############################################################################
# get_search_batch_and_sample_id_hash --  get complex hash, and create all
# sample and *search_batch records in the process
#
# @param atlas_build_id
# @param default_sample_project_id default project_id used in creating new samples
# @param search_batch_hash_ref reference to hash with key = search_batch_id
# @return complex hash accessed as:
#     $hash{$proteomics_search_batch_id}->{atlas_search_batch_id}
#     $hash{$proteomics_search_batch_id}->{sample_id}
###############################################################################
sub get_search_batch_and_sample_id_hash
{
    my $METHOD='get_search_batch_and_sample_id_hash';

    my %loaded_sbid_asbid_sid_hash;

    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die("need atlas_build_id");

    my $default_sample_project_id = $args{'default_sample_project_id'} or
        die "need default_sample_project_id ($!)";

    my $source_dir = $args{source_dir} or die("need source_dir ($!)");

    my $loading_sbid_searchdir_hash_ref = $args{loading_sbid_searchdir_hash_ref} 
       or die "need loading_sbid_searchdir_hash_ref ($)";

    ## hash with key = search_batch_id, value = $search_dir
    my %loading_sbid_searchdir_hash = %{$loading_sbid_searchdir_hash_ref};

    ## loading_sbid_searchdir_hash is used to get the sbid's to access
    ## the Proteomic records

    my $loading_sbid_list = get_string_list_of_keys(
        hash_ref => \%loading_sbid_searchdir_hash);

    foreach my $loading_sb_id (keys %loading_sbid_searchdir_hash )
    {
        my ($atlas_search_batch_id, $sample_id);

        my $asb_exists = "false";

        my $sample_exists = "false";

        ##########  handle sample records ##################
        ## since there are so few sample_records, will query one at a 
        ## time instead of large select

        #### see if an atlas_search_batch record exists ####
        my $sql = qq~
            SELECT ASB.sample_id, ASB.atlas_search_batch_id
            FROM $TBAT_ATLAS_SEARCH_BATCH ASB
            WHERE ASB.proteomics_search_batch_id = '$loading_sb_id'
            AND ASB.record_status != 'D'
        ~;

        my @rows = $sbeams->selectSeveralColumns($sql);

        foreach my $row (@rows)
        {
            my ($s_id, $asb_id) = @{$row};
            $sample_id = $s_id;
            $atlas_search_batch_id = $asb_id;
            $asb_exists = "true";
        }


        if ( $asb_exists eq "true" )
        { ## If $TBAT_ATLAS_SEARCH_BATCH exists, then so does a sample, and just got it in last query
            $sample_exists = "true";
        } else
        {   ## [cases: either this is a new sample, or the new table/records for any atlas_search_batch
            ## of this sample haven't been written yet (adjusting to new schema)

            ## For both cases, need to look for a sample record.  Will have to make assumption
            ## that PeptideAtlas sample_tag equals Proteomics exp_tag, until enough atlases
            ## have been built to have populated atlas_search_batch table, then we
            ## can skip this sample_exists check section
            $sql = qq~
                SELECT S.sample_id
                FROM $TBAT_SAMPLE S
                JOIN $TBPR_PROTEOMICS_EXPERIMENT PE ON (S.sample_tag = PE.experiment_tag)
                JOIN $TBPR_SEARCH_BATCH PSB ON (PE.experiment_id = PSB.experiment_id)
                WHERE PSB.search_batch_id = '$loading_sb_id'
                AND PE.record_status != 'D'
                AND S.record_status != 'D'
            ~;

            @rows = $sbeams->selectSeveralColumns($sql);
            
            foreach my $row (@rows)
            {
                my ($s_id) = @{$row};
                $sample_id = $s_id;
                $sample_exists = "true";
            }
        }

        ## Lastly, if no sample_id, create one from protomics record.  
        ## if this is true, then also will be missing [atlas_search_batch] and 
        ## [atlas_search_batch_parameter] and [atlas_search_batch_parameter_set]
        if ( ($asb_exists eq "false") || ($sample_exists eq "false") )
        {
            $sql = qq~
                SELECT distinct SB.search_batch_id, PE.experiment_name, PE.experiment_tag,
                SB.data_location
                FROM $TBPR_PROTEOMICS_EXPERIMENT PE
                JOIN $TBPR_SEARCH_BATCH SB
                ON ( PE.experiment_id = SB.experiment_id)
                WHERE SB.search_batch_id = '$loading_sb_id'
                AND PE.record_status != 'D'
            ~;

            @rows = $sbeams->selectSeveralColumns($sql) or die
                "Could not find proteomics experiment record for ".
                " search batch id $loading_sb_id \n$sql\n ($!)";

            foreach my $row (@rows)
            {
                my ($sb_id, $exp_name, $exp_tag, $d_l) = @{$row};

                ## create [sample] record if it doesn't exist:
                if ($sample_exists eq "false")
                {
                    my %rowdata = (
                        search_batch_id => $sb_id,
                        sample_tag => $exp_tag,
                        original_experiment_tag => $exp_tag,
                        sample_title => $exp_name,
                        sample_description => $exp_name,
                        is_public => 'N',
                        project_id => $default_sample_project_id,
                    );

                    $sample_id = insert_sample( rowdata_ref => \%rowdata );
                }


                ## create [atlas_search_batch]
                my $search_batch_path = $loading_sbid_searchdir_hash{$loading_sb_id};
        
                $atlas_search_batch_id = create_atlas_search_batch(
                    proteomics_search_batch_id => $loading_sb_id,
                    sample_id => $sample_id,
                    search_batch_path => $search_batch_path
                );

                ## create [atlas_search_batch_parameter]s and [atlas_search_batch_parameter_set]
                my $successful = create_atlas_search_batch_parameter_recs(
                    atlas_search_batch_id => $atlas_search_batch_id,
                    search_batch_path => $search_batch_path
                );

            }

        } ## end of the "no sample_id" loop

        ### okay, all [sample] records exist now, and [atlas_search_batch]s

        ## load 'em into hash
        $loaded_sbid_asbid_sid_hash{$loading_sb_id}->{sample_id} = $sample_id;

        $loaded_sbid_asbid_sid_hash{$loading_sb_id}->{atlas_search_batch_id} = 
            $atlas_search_batch_id;


        ## create [atlas_build_search_batch] record
        my $atlas_build_search_batch_id = 
            create_atlas_build_search_batch(
                atlas_build_id => $atlas_build_id,
                sample_id => $sample_id,
                atlas_search_batch_id => $atlas_search_batch_id,
            );

        ## create a [spectra_description_set] record
        insert_spectra_description_set( 
            atlas_build_id => $atlas_build_id,
            sample_id => $sample_id,
            atlas_search_batch_id => $atlas_search_batch_id,
            search_batch_dir_path =>$loading_sbid_searchdir_hash{$loading_sb_id}
        );
         

        ## create [atlas_build_sample] record
        my $atlas_build_sample_id = createAtlasBuildSampleLink(
            sample_id => $sample_id,
            atlas_build_id => $atlas_build_id,
        );


#       my $path = "$dl/$sub_dir";
#
#       ## if doesn't exist, use global var for archive area
#       unless (-e $path)
#       {
#           $path = $RAW_DATA_DIR{Proteomics} . "/path";
#       }
#
#       unless (-e $path)
#       {
#           print "[WARN] cannot locate $path for " .
#           "atlas_build_search_batch w/ id=$absbid ";
#       }

    } ## end iterate over load 

    return %loaded_sbid_asbid_sid_hash;

}


###############################################################################
#  createAtlasBuildSampleLink -- create atlas build sample record
# @param sample_id
# @param atlas_build_id
# @return atlas_build_sample_id
###############################################################################
sub createAtlasBuildSampleLink {

    my $METHOD = "createAtlasBuildSampleLink";

    my %args = @_;

    my $sample_id = $args{sample_id} or die "need sample_id ($!)";

    my $atlas_build_id = $args{atlas_build_id} or 
        die "need atlas_build_id ($!)";


    ## Populate atlas_build_sample table
    my %rowdata = (   ##   atlas_build_sample    table attributes
        atlas_build_id => $atlas_build_id,
        sample_id => $sample_id,
    );


    my $atlas_build_sample_id = $sbeams->updateOrInsertRow(
        insert=>1,
        table_name=>$TBAT_ATLAS_BUILD_SAMPLE,
        rowdata_ref=>\%rowdata,
        PK => 'atlas_build_sample_id',
        return_PK => 1,
        add_audit_parameters => 1,
        verbose=>$VERBOSE,
        testonly=>$TESTONLY,
    );

    print "INFO[$METHOD]: created atlas_build_sample_id=".
        "$atlas_build_sample_id\n";

    return $atlas_build_sample_id;

} ## end createAtlasBuildSampleLink


#######################################################################
# getProteomicsExpTag -- get proteomics experiment tag, given search
#     batch id
#
# @param search_batch_id
# @return exp_tag
#######################################################################
sub getProteomicsExpTag
{

    my %args = @_;

    my $search_batch_id = $args{search_batch_id} or die
        "need search_batch_id ($!)";

    my $exp_tag;
    
    my $sql = qq~
        SELECT PE.experiment_tag, SB.search_batch_id
        FROM $TBPR_SEARCH_BATCH SB 
        JOIN $TBPR_PROTEOMICS_EXPERIMENT PE
        ON (PE.experiment_id = SB.experiment_id)
        WHERE SB.search_batch_id = '$search_batch_id'
        AND PE.record_status != 'D'
    ~;

    my @rows = $sbeams->selectSeveralColumns($sql) or 
        print "[WARN] could not find proteomics experiment tag for "
        . "search_batch_id = $search_batch_id\n";

    foreach my $row (@rows)
    {
        my ($et, $sbid) = @{$row};

        $exp_tag = $et;

#       print "$et, $sbid\n";
    }

    return $exp_tag;

}


#######################################################################
# get_string_list_of_keys -- get string list of keys
# @param hash_ref 
# @return string of a list of the hash keys, separated by commas
#######################################################################
sub get_string_list_of_keys
{

    my %args=@_;

    my $hash_ref = $args{hash_ref} or die "need hash_ref";

    my %hash = %{$hash_ref};

    my $str_list;

    foreach my $key ( keys %hash)
    {
        if ($str_list eq "")
        {
            $str_list = "$key";
        } else
        {
             $str_list = "$str_list,$key";
        }
    }

    return $str_list;

}

#######################################################################
# create_atlas_search_batch -- create atlas_search_batch record
# @param proteomics_search_batch_id  search batch id to look-up Proteomics info
# @param sample_id
# @param search_batch_path absolute path to search_batch information
# @return atlas_search_batch_id
#######################################################################
sub create_atlas_search_batch
{
    my %args = @_;

    my $sbid = $args{proteomics_search_batch_id} or die "need search_batch_id ($!)";

    my $sid = $args{sample_id} or die "need sample_id ($!)";

    my $proteomics_search_batch_path = $args{search_batch_path}
        or die "need search_batch_path ($!)";

    my $atlas_search_batch_id;


#   my $nspec = getNSpecFromProteomics( search_batch_id => $sbid );

    my $nspec = getNSpecFromFlatFiles( search_batch_path =>
        $proteomics_search_batch_path );

    my $search_batch_subdir =  $proteomics_search_batch_path;

    ##trim off all except last directory
    $search_batch_subdir =~ s/^(.+)\/(.+)/$2/gi;

    my $experiment_path = $proteomics_search_batch_path;

    $experiment_path =~ s/^(.+)\/(.+)\/(.+)\/(.+)\/(.+)/$2\/$3\/$4/gi;

    my $TPP_version = getTPPVersion( directory => $proteomics_search_batch_path);

    ## attributes for atlas_search_batch_record
    my %rowdata = (             ##  atlas_search_batch
        proteomics_search_batch_id => $sbid,
        sample_id => $sid,
        n_searched_spectra => $nspec,
        data_location => $experiment_path,
        search_batch_subdir => $search_batch_subdir,
        TPP_version => $TPP_version,
    );

    my $atlas_search_batch_id = $sbeams->updateOrInsertRow(
        insert=>1,
        table_name=>$TBAT_ATLAS_SEARCH_BATCH,
        rowdata_ref=>\%rowdata,
        PK => 'atlas_search_batch_id',
        return_PK => 1,
        verbose=>$VERBOSE,
        testonly=>$TESTONLY,
    );

    print "[INFO] Created  atlas_search_batch record $atlas_search_batch_id\n";

    return $atlas_search_batch_id;

}


#######################################################################
# create_atlas_build_search_batch -- create atlas_build_search_batch record
# @param sample_id  sample_id
# @param atlas_build_id
# @param atlas_search_batch_id  atlas_search_batch_id
# @return atlas_build_search_batch_id
#######################################################################
sub create_atlas_build_search_batch
{

    my %args = @_;

    my $sid = $args{sample_id} or die "need sample_id ($!)";

    my $atlas_build_id = $args{atlas_build_id} or die "need atlas_build_id ($!)";

    my $atlas_search_batch_id = $args{atlas_search_batch_id}
        or die "need atlas_search_batch_id ($!)";


    ## attributes for atlas_build_search_batch_record
    my %rowdata = (             ##  atlas_search_batch
        sample_id => $sid,
        atlas_search_batch_id => $atlas_search_batch_id,
        atlas_build_id => $atlas_build_id,
    );

    my $atlas_build_search_batch_id = $sbeams->updateOrInsertRow(
        insert => 1,
        table_name => $TBAT_ATLAS_BUILD_SEARCH_BATCH,
        rowdata_ref => \%rowdata,
        PK => 'atlas_build_search_batch_id',
        return_PK => 1,
        verbose => $VERBOSE,
        testonly => $TESTONLY,
    );

    return $atlas_build_search_batch_id;

}


#######################################################################
# create_atlas_search_batch_parameter_recs - create records for
# atlas_search_batch_parameter_set and atlas_search_batch_parameter
#
# @param $atlas_search_batch_id
# @param $search_batch_path absolute path to search_batch directory
# @return successful returns 1 for "true" 
#######################################################################
sub create_atlas_search_batch_parameter_recs
{
    my %args = @_;

    my $atlas_search_batch_id  = $args{atlas_search_batch_id}
        or die "need atlas_search_batch_id ($!)";

    my $search_batch_path = $args{search_batch_path}
        or die "need search_batch_path ($!)";


    my ($peptide_mass_tolerance, $peptide_ion_tolerance, $peptide_ion_tolerance,
        $enzyme, $num_enzyme_termini);


    #### Assume the location of the search parameters file
    my $infile = "$search_batch_path/sequest.params";

    #### Complain and return if the file does not exist
    if ( ! -e "$infile" ) 
    {
        #### Also try the parent directory
        $infile = "$search_batch_path/../sequest.params";

        if ( ! -e "$infile" ) 
        {
            print "ERROR: Unable to find sequest parameter file: '$infile'\n";
            return;
        }

    }

    ## if can't find readParamsFile, instantiate Protoeomics module 
    ## $sbeamsPROT = SBEAMS::Proteomics->new();
    ## $sbeamsPROT->setSBEAMS($sbeams);

    #### Read in the search parameters file
    my $result = $sbeamsPROT->readParamsFile(inputfile => "$infile");

    unless ($result) 
    {
        print "ERROR: Unable to read sequest parameter file: '$infile'\n";

        return;
    }

    #### Loop over each returned row
    my ($key,$value,$tmp);

    my $counter = 0;

    ## store returned results from sequest.params as key value pair records in
    ## ATLAS_SEARCH_BATCH_PARAMETER table
    foreach $key (@{${$result}{keys_in_order}}) 
    {

        #### Define the data for this row
        my %rowdata;
        $rowdata{atlas_search_batch_id} = $atlas_search_batch_id;
        $rowdata{key_order} = $counter;
        $rowdata{parameter_key} = $key;
        $rowdata{parameter_value} = ${$result}{parameters}->{$key};

        #### INSERT it
        $sbeams->insert_update_row(
            insert => 1,
            table_name => $TBAT_ATLAS_SEARCH_BATCH_PARAMETER,
            rowdata_ref => \%rowdata,
            verbose => $VERBOSE,
            testonly => $TESTONLY,
        );

        $counter++;

        ## pick up params needed for ATLAS_SEARCH_BATCH_PARAMETER_SET
        if ($key eq "peptide_mass_tolerance")
        {
            $peptide_mass_tolerance = $rowdata{parameter_value};
        }

        if ($key eq "fragment_ion_tolerance")
        {
            $peptide_ion_tolerance = $rowdata{parameter_value};
        }

        if ($key eq "enzyme_number")
        {
            $enzyme = $rowdata{parameter_value};
        }

        if ($key eq "NumEnzymeTermini")
        {
            $num_enzyme_termini = $rowdata{parameter_value};
        }

    }

    #### If there was no NumEnzymeTermini specified, the default is 0
    $num_enzyme_termini = 0 unless ($num_enzyme_termini);


    my %rowdata = ( ##   ATLAS_SEARCH_BATCH_PARAMETER_SET attributes
        peptide_mass_tolerance => $peptide_mass_tolerance,
        peptide_ion_tolerance => $peptide_ion_tolerance,
        enzyme => $enzyme,
        num_enzyme_termini => $num_enzyme_termini,
    );


    #### INSERT it
    $sbeams->insert_update_row(
        insert => 1,
        table_name => $TBAT_ATLAS_SEARCH_BATCH_PARAMETER_SET,
        rowdata_ref => \%rowdata,
        verbose => $VERBOSE,
        testonly => $TESTONLY,
    );

    return 1;

}


###############################################################################
#  insert_sample -- get or create sample record
#     with minimum info and some default settings 
# @param $rowdata_ref  reference to hash holding table attributes
###############################################################################
sub insert_sample
{
    my $METHOD='insert_sample';

    my %args = @_;

    my $rowdata_ref = $args{'rowdata_ref'} or die("need rowdata_ref");

    my %rowdata = %{$rowdata_ref};


    ## create a sample record:
    my $sample_id = $sbeams->updateOrInsertRow(
        table_name=>$TBAT_SAMPLE,
        insert=>1,
        rowdata_ref=>\%rowdata,
        PK => 'sample_id',
        return_PK=>1,
        add_audit_parameters => 1,
        verbose=>$VERBOSE,
        testonly=>$TESTONLY,
    );

    print "INFO[$METHOD]: created sample record $sample_id\n";

    return $sample_id;

} ## end insert_sample


#######################################################################
#  get_peptide_accession_id_hash -- get hash 
#      key = peptide accession
#      value = peptide_id
#######################################################################
sub get_peptide_accession_id_hash {

    #### Get the current list of peptides in the peptide table
    my $sql = qq~
        SELECT peptide_accession,peptide_id
        FROM $TBAT_PEPTIDE
        ~;

    my %peptide_ids_hash = $sbeams->selectTwoColumnHash($sql);

    return %peptide_ids_hash;

}

#######################################################################
# getInfoFromExperimentsList - get hash of experiment list contents
# @param infile full path to Experiments.list file
# @return hash with key = search_batch_id,
#                 value = full path to search dir
#######################################################################
sub getInfoFromExperimentsList
{

    my $METHOD = "getInfoFromExperimentsList";

    my %args = @_;

    my $infile = $args{infile} or die
        "need path to Experiments.list file($!)";

    my %hash;

    open(INFILE, "<$infile") or die "cannot open $infile for reading ($!)";

    while (my $line = <INFILE>) 
    {

        chomp($line);

        if ($line =~ /^(\d+)(\s+)(.+)/) 
        {
            $hash{$1} = $3;

        }

    }

    close(INFILE) or die "Cannot close $infile";

    return %hash;

}


#######################################################################
# get_peptide_accession_instance_id_hash -- get hash
#     key = peptide accession
#     value = peptide_instance_id
#######################################################################
sub get_peptide_accession_instance_id_hash {

    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die
        "need atlas build id ($!)";

    my %hash;

    my $sql = qq~
        SELECT P.peptide_accession, PEPI.peptide_instance_id
        FROM $TBAT_PEPTIDE_INSTANCE PEPI
        JOIN $TBAT_PEPTIDE P ON (P.peptide_id = PEPI.peptide_id)
        WHERE PEPI.atlas_build_id = $atlas_build_id
        ~;

    %hash = $sbeams->selectTwoColumnHash($sql) or die
        "unable to execute statement:\n$sql\n($!)";

    return %hash;

}


#######################################################################
# get_peptide_accession_sequence_hash - get hash with key = peptide
#    accession, value = peptide sequence
#######################################################################
sub get_peptide_accession_sequence_hash
{

    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die
        "need atlas build id ($!)";

    my %hash;

    my $sql = qq~
        SELECT P.peptide_accession, P.peptide_sequence
        FROM $TBAT_PEPTIDE_INSTANCE PEPI
        JOIN $TBAT_PEPTIDE P ON (P.peptide_id = PEPI.peptide_id)
        WHERE PEPI.atlas_build_id = $atlas_build_id
        ~;

    %hash = $sbeams->selectTwoColumnHash($sql);

    return %hash;

}


#######################################################################
#  get_biosequence_name_id_hash -- get hash
#      key = biosequence_name
#      value = biosequence_id
# @param biosequence_set_id
#######################################################################
sub get_biosequence_name_id_hash {

    my %args = @_;

    my $biosequence_set_id = $args{'biosequence_set_id'};

    my %hash;


    #### Get the current biosequences in this set
    my $sql = qq~
        SELECT biosequence_name,biosequence_id
        FROM $TBAT_BIOSEQUENCE
        WHERE biosequence_set_id = '$biosequence_set_id'
        AND record_status != 'D'
    ~;


    %hash = $sbeams->selectTwoColumnHash($sql) or
        die "unable to get biosequence name and id from biosequence".
        " set $biosequence_set_id ($!)";


    ## creates hash with key=biosequence_name, value=biosequence_id
    return %hash;

}

#######################################################################
# getTPPVersion - get TPP version used in PeptideProphet scoring
# @param directory full path to search_batch directory
# @return string holding TPP version
#######################################################################
sub getTPPVersion
{
    my %args = @_;

    my $directory = $args{directory} or die
        "need path to search_batch directory file($!)";

    my $results_parser = new SearchResultsParametersParser();

    $results_parser->setSearch_batch_directory($directory);

    $results_parser->parse();

    my $TPP_version = $results_parser->getTPP_version();

    return $TPP_version;

}

#######################################################################
# insert_spectra_description_set - insert or update a record
#    for [spectra_description_set] table.  
# @param atlas_build_id
# @param sample_id 
# @param atlas_search_batch_id
#######################################################################
sub insert_spectra_description_set
{

    my $METHOD = "insert_spectra_description_set";

    my %args = @_;

    my $atlas_build_id = $args{atlas_build_id} or die
        "need atlas_build_id ($!)";

    my $sample_id = $args{sample_id} or die
        "need sample_id ($!)";

    my $atlas_search_batch_id = $args{atlas_search_batch_id} or die
        "need atlas_search_batch_id ($!)";

    my $search_batch_dir_path = $args{search_batch_dir_path} or die
        "need search_batch_dir_path ($!)";

    ## There could be [spectra_description_set] records for this
    ## sample_id and atlas_search_batch
    my $sql = qq~
        SELECT distinct sample_id, atlas_search_batch_id, 
        instrument_model_id, instrument_model_name, conversion_software_name, 
        conversion_software_version, mzXML_schema, n_spectra
        FROM $TBAT_SPECTRA_DESCRIPTION_SET
        WHERE atlas_search_batch_id = '$atlas_search_batch_id'
        AND sample_id = '$sample_id'
        AND record_status != 'D'
    ~;

    my @rows = $sbeams->selectSeveralColumns($sql);

    my ($instrument_model_id, $conversion_software_name, 
    $instrument_model_id, $instrument_model_name, $conversion_software_name,
    $conversion_software_version, $mzXML_schema, $n_spectra);


    ## If record exists, use attributes to create new record afterwards
    if ($#rows > -1)
    {
        foreach my $row (@rows)
        {
            my ($sid, $asbid, $imid, $imn, $csn, $csv, $ms, $ns) = @{$row}; 

            $instrument_model_id = $imid;

            $instrument_model_name = $imn;

            $conversion_software_name = $csn;

            $conversion_software_version = $csv;

            $mzXML_schema = $ms;

            $n_spectra = $ns;

        }

    } else
    { 

        ## read experiment's pepXML file to get an mzXML file name
        my @mzXMLFileNames = getMzXMLFileNames( search_batch_dir_path => $search_batch_dir_path);


        #### read an mzXML file to get needed attributes... could make a sax content handler for this, 
        #### but only need the first dozen or so lines of file, and the mzXML files are huge...
        my $infile = $mzXMLFileNames[0];

        my $spectrum_parser = new SpectraDescriptionSetParametersParser();

        $spectrum_parser->setMzXML_file($infile);

        $spectrum_parser->parse();

        $mzXML_schema = $spectrum_parser->getMzXML_schema();

        $conversion_software_name = $spectrum_parser->getConversion_software_name();

        $conversion_software_version = $spectrum_parser->getConversion_software_version();


        ## count the number of MS/MS in the mzXML files: ##
        my $sum = 0;

        foreach my $mzXMLFile (@mzXMLFileNames)
        {
            my $nspec = `cat $mzXMLFile | grep 'msLevel="2"' | wc -l`;

            $sum = $sum + $nspec;

        }

        $n_spectra = $sum;

    }


    ## insert [spectra_description_set] record
    my %rowdata = (             
        atlas_build_id => $atlas_build_id,
        sample_id => $sample_id,
        atlas_search_batch_id => $atlas_search_batch_id,
        instrument_model_id  => $instrument_model_id,
        instrument_model_name  => $instrument_model_name,
        conversion_software_name => $conversion_software_name,
        conversion_software_version => $conversion_software_version,
        mzXML_schema => $mzXML_schema,
        n_spectra => $n_spectra,
    );

    my $spectra_description_set_id = $sbeams->updateOrInsertRow(
        insert=>1,
        table_name=>$TBAT_SPECTRA_DESCRIPTION_SET,
        rowdata_ref=>\%rowdata,
        PK => 'spectra_description_id',
        return_PK => 1,
        verbose=>$VERBOSE,
        testonly=>$TESTONLY,
    );

}


#######################################################################
# getNSpecFromProteomics - get the number of spectra with P>=0 loaded 
#    into Proteomics for this search batch
# @param search_batch_id
# @return number of spectra with P>=0 loaded into Proteomics for search_batch
#######################################################################
sub getNSpecFromProteomics
{

    my %args = @_;

    my $search_batch_id = $args{search_batch_id} or die 
        "need search_batch_id ($!)";

    ## get nspec for P>=0.0
    my $sql = qq~
        SELECT count(*)
        FROM $TBPR_PROTEOMICS_EXPERIMENT PE, $TBPR_SEARCH_BATCH SB,
        $TBPR_FRACTION F, $TBPR_MSMS_SPECTRUM MSS
        WHERE SB.search_batch_id = $search_batch_id
        AND PE.experiment_id = SB.experiment_id
        AND PE.experiment_id = F.experiment_id
        AND F.fraction_id = MSS.fraction_id
        AND PE.record_status != 'D'
    ~;

    my @rows = $sbeams->selectOneColumn($sql) or die
        "Could not complete query (No spectra loaded?): $sql ($!)";

    my $n0 = $rows[0];

    return $n0;

}

#######################################################################
# getNSpecFromFlatFiles
# @param search_batch_path
# @return number of spectra with P>=0 for search_batch
#######################################################################
sub getNSpecFromFlatFiles
{
    my %args = @_;

    my $search_batch_path = $args{search_batch_path} or die
        "need search_batch_path ($!)";

    my $pepXMLfile = "$search_batch_path/interact-prob.xml";

    my $n0;

#   print "    file: $pepXMLfile\n";

##  Need to make this a global var, as perl doesn't have nested
##  subroutines (i.e., can't include sub local_start_handler
##  in this subroutine and have %spectra be accessible to it)

    %spectra = ();

    if (-e $pepXMLfile)
    {
        my $parser = new XML::Parser( );

        $parser->setHandlers(Start => \&local_start_handler);

        $parser->parsefile($pepXMLfile);
    }

    $n0 = keys %spectra;

    print "Num spectra searched: $n0\n" if ($TESTONLY);

    return $n0;
}


###################################################################
# local_start_handler -- local content handler for parsing of a
# pepxml to get number of spectra in interact-prob.xml file
###################################################################
sub local_start_handler
{
    my ($expat, $element, %attrs) = @_;

    ## need to get attribute spectrum from spectrum_query element,
    ## drop the last .\d from the string to get the spectrum name,
    ## then count the number of unique spectrum names in the file
    if ($element eq 'spectrum_query')
    {
        my $spectrum = $attrs{spectrum};

        ## drop the last . followed by number
        $spectrum =~ s/(.*)(\.)(\d)/$1/;

        $spectra{$spectrum} = $spectrum;
    }
}


#######################################################################
# readCoords_updateRecords_calcAttributes -- read coordinate mapping
# file, calculate mapping attributes, update peptide_instance
# record, and create peptide_mapping records 
# @infile
# @atlas_build_id 
# @biosequence_set_id 
# @organism_abbrev 
#######################################################################
sub readCoords_updateRecords_calcAttributes {

    my %args = @_;

    my $infile = $args{'infile'} or die "need infile ($!)";

#   my $proteomicsSBID_hash_ref = $args{'proteomicsSBID_hash_ref'} or die
#       " need proteomicsSBID_hash_ref ($!)";
#
#   my %proteomicsSBID_hash = %{$proteomicsSBID_hash_ref};

    my $atlas_build_id = $args{atlas_build_id} or die "need atlas_build_id ($!)";

    my $biosequence_set_id = $args{biosequence_set_id} or die 
        "need biosequence_set_id ($!)";

    my $organism_abbrev = $args{organism_abbrev} or die
        "need organism_abbrev ($!)";

    ## get hash with key=peptide_accession, value=peptide_id
    my %peptides = get_peptide_accession_id_hash();
        

    ## get hash with key=biosequence_name, value=biosequence_id
    my %biosequence_ids = get_biosequence_name_id_hash(
        biosequence_set_id => $biosequence_set_id);


    my (@peptide_accession, @biosequence_name);

    my (@chromosome, @strand, @start_in_chromosome, @end_in_chromosome);

    my (@n_protein_mappings, @n_genome_locations, @is_exon_spanning);

    my (@start_in_biosequence, @end_in_biosequence);


    ## hash with key = $peptide_accession[$ind], 
    ##           value = string of array indices holding given peptide_accession
    my %index_hash; 
                   

    open(INFILE, $infile) or die "ERROR: Unable to open for reading $infile ($!)";
 
    #### READ  coordinate_mapping.txt  ##############
    print "\nReading $infile\n";

    my $line;


    ## hash with key = peptide_accession, value = peptide_sequence
    my %peptideAccession_peptideSequence = get_peptide_accession_sequence_hash( 
        atlas_build_id => $atlas_build_id );

    #### Load information from the coordinate mapping file:
    my $ind=0;

    while ($line = <INFILE>) {

        chomp($line);

        my @columns = split(/\t/,$line);
  
        my $pep_acc = $columns[0];

        push(@peptide_accession, $columns[0]);

        push(@biosequence_name, $columns[2]);

        push(@start_in_biosequence, $columns[5]);

        push(@end_in_biosequence, $columns[6]);

        my $tmp_chromosome = $columns[8];

        ## parsing for chromosome:   this is set for Ens 21 and 22 notation...
        if ($tmp_chromosome =~ /^(chromosome:)(NCBI.+:)(.+)(:.+:.+:.+)/ ) {
            $tmp_chromosome = $3;
        }
        if ($tmp_chromosome =~ /^(chromosome:)(DROM.+:)(.+)(:.+:.+:.+)/ ) {
            $tmp_chromosome = $3;
        }
        ## and for yeast:
        if ($tmp_chromosome =~ /^S/) {
            $tmp_chromosome = $columns[12];
        }

        ### For Ens 32 and on, we are storing chromsome in column 8, so no need for parsing
        #if ( ($tmp_chromosome =~ /^(\d+)$/) || ($tmp_chromosome =~ /^((X)|(Y))$/) ) {
        #    $tmp_chromosome = $tmp_chromosome;
        #}
        ### Additional match for the novel chromosomes such as 17_NT_079568 X_NT_078116
        #if ( ($tmp =~ /^(\d+)_NT_(\d+)$/) || ($tmp_chromosome =~ /^((X)|(Y))_NT_(\d+)$/) ) {
        #    $tmp_chromosome = $tmp_chromosome;
        #}


        push(@chromosome, $tmp_chromosome);

        push(@strand,$columns[9]);

        push(@start_in_chromosome,$columns[10]);
 
        push(@end_in_chromosome,$columns[11]);

 
        ## hash with
        ## keys =  pep_accession
        ## values = the array indices (of the arrays above, holding pep_accession)
        if ( exists $index_hash{$pep_acc} ) {
 
            $index_hash{$pep_acc} = 
                join " ", $index_hash{$pep_acc}, $ind;

        } else {

           $index_hash{$pep_acc} = $ind;

        }


        push(@n_protein_mappings, 1);  ##unless replaced in next section

        push(@n_genome_locations, 1);  ##unless replaced in next section

        push(@is_exon_spanning, 'N');  ##unless replaced in next section

        $ind++;

    }   ## END reading coordinate mapping file

    close(INFILE) or die "Cannot close $infile ($!)";

    print "\nFinished reading .../coordinate_mapping.txt\n";


    if ($TESTONLY) { 

        foreach my $tmp_ind_str (values ( %index_hash ) ) {

            my @tmp_ind_array = split(" ", $tmp_ind_str);
 
            my %unique_protein_hash;

            my %unique_coord_hash;
 
            for (my $ii = 0; $ii <= $#tmp_ind_array; $ii++) {
 
               my $i_ind=$tmp_ind_array[$ii];
 
                reset %unique_protein_hash; ## necessary?
                reset %unique_coord_hash;   ## necessary?
 
                ##what is key of index_hash where value = $tmp_ind_str  ?
                ##is that key = $pep_accession[$i_ind]  ?
                my %inverse_index_hash = reverse %index_hash;
 
                die "check accession numbers in index_hash" 
                    if ($inverse_index_hash{$tmp_ind_str} != $peptide_accession[$i_ind]); 

            }
        }
    } ## end TEST
 

    ## n_protein_mappings -- Number of distinct proteins that a peptide maps to
    ## n_genome_locations -- Number of mappings to Genome where protein is not the same
    ##                       (in other words, counts span over exons only once)
    ## is_exon_spanning   -- Whether a peptide has been mapped to a protein more than
    ##                       once (with different chromosomal coordinates)...each coord
    ##                       set should be shorter in length than the entire sequence...
    ##                       This avoids counting repeat occurrences of an entire
    ##                       peptide in a protein or ORF.

    ## For each peptide:
    ## protein_mappings_hash:  key = protein, value="$chrom:$start:$end"
    ## --> n_protein_mappings = keys ( %protein_mappings_hash )
    ##
    ## --> n_genome_locations =  keys of reverse protein_mappings_hash
    ##
    ## is_exon_spanning_hash:  key = protein,
    ##                         value = 'Y' if (  (diff_coords + 1) != (seq_length*3);

    print "\nCalculating n_protein_mappings, n_genome_locations, and is_exon_spanning\n";

    ## looping through a peptide's array indices to calculate these:
    foreach my $tmp_ind_str (values ( %index_hash ) ) {  #key = peptide_accession

        my @tmp_ind_array = split(" ", $tmp_ind_str);
 
        my (%protein_mappings_hash, %is_exon_spanning_hash);

        ## will skip 1st key = peptide in hashes below, and instead, 
        ## reset hash for each peptide
        reset %protein_mappings_hash; ## necessary?  above declaration should have cleared these?
        reset %is_exon_spanning_hash;

        my $peptide = $peptide_accession[$tmp_ind_array[0]];
        my $protein;

        ## initialize to 'N'
        $is_exon_spanning_hash{$protein} = 'N';


        ## for each index:
        for (my $ii = 0; $ii <= $#tmp_ind_array; $ii++) {

            my $i_ind=$tmp_ind_array[$ii];

            $protein = $biosequence_name[$i_ind];

            my $chrom = $chromosome[$i_ind];

            my $start = $start_in_chromosome[$i_ind];

            my $end = $end_in_chromosome[$i_ind];

            my $coord_str = "$chrom:$start:$end";

          
            $protein_mappings_hash{$protein} = $coord_str;


            my $diff_coords = abs($start - $end );
 
            my $seq_length = 
                length($peptideAccession_peptideSequence{$peptide});

            ## If entire sequence fits between coordinates, the protein has
            ## redundant sequences.  If the sequence doesn't fit between
            ## coordinates, it's exon spanning:
            if (  ($diff_coords + 1) != ($seq_length * 3) ) {

                $is_exon_spanning_hash{$protein} = 'Y';

                ## update larger scope variable too:
                $is_exon_spanning[$i_ind] = 'Y';

            }

        } 

       ## Another iteration through indices to count n_genome_locations
       ## want the number of unique coord strings, corrected to count is_exon_spanning peptides as only 1 addition.
       my %inverse_protein_hash = reverse %protein_mappings_hash;

       my $pep_n_genome_locations = keys( %inverse_protein_hash);


       my $pep_n_protein_mappings = keys( %protein_mappings_hash );


       ## need to assign values to array members now:
       foreach my $tmpind (@tmp_ind_array) {

           $protein = $biosequence_name[$tmpind];


           $n_protein_mappings[$tmpind] = $pep_n_protein_mappings;

           $n_genome_locations[$tmpind] = $pep_n_genome_locations;

       }
     
   } ## end calculate n_genome_locations, n_protein_mappings and is_exon_spanning loop
 
   ### ABOVE modeled following rules:
   ##
   ## PAp00011291  9       ENSP00000295561 ... 24323279        24323305
   ## PAp00011291  9       ENSP00000336741 ... 24323279        24323305
   ## ---> n_genome_locations = 1 for both
   ## ---> n_protein_mappings = 2 for both
   ## ---> is_exon_spanning   = n for both
   ##
   ## PAp00004221  13      ENSP00000317473 ... 75675871        75675882
   ## PAp00004221  13      ENSP00000317473 ... 75677437        75677463
   ## ---> n_genome_locations = 1 for both
   ## ---> n_protein_mappings = 1 for both
   ## ---> is_exon_spanning   = y for both
   ## 
   ## PAp00004290  16      ENSP00000306222 ...   1151627         1151633
   ## PAp00004290  16      ENSP00000306222 ...   1151067         1151107
   ## PAp00004290  16      ENSP00000281456 ... 186762937       186762943
   ## PAp00004290  16      ENSP00000281456 ... 186763858       186763898
   ## ---> n_genome_locations = 2 for all
   ## ---> n_protein_mappings = 2 for both
   ## ---> is_exon_spanning   = y for all

   ## testing match to above rules:
   ## the above cases have P=1.0, and tested for Ens build 22 - 26
   if ($TESTONLY && ($organism_abbrev eq 'Hs') ) {  
       for (my $ii = 0; $ii <= $#peptide_accession; $ii++) {
           my @test_pep = ("PAp00011291", "PAp00004221", "PAp00004290", "PAp00005006");
           my @test_n_protein_mappings = ("2", "1", "2", "5");
           my @test_n_genome_locations = ("1", "1", "2", "5");
           my @test_is_exon_spanning = ('N', 'Y', 'Y', 'N');

           for (my $jj = 0; $jj <= $#test_pep; $jj++) {
               if ($peptide_accession[$ii] eq $test_pep[$jj]) {
                   if ($n_genome_locations[$ii] != $test_n_genome_locations[$jj]) {
                       print "!! $test_pep[$jj] : " .
                       "expected n_genome_locations=$test_n_genome_locations[$jj]" .
                       " but calculated $n_genome_locations[$ii]\n";
                   }
                   if ($n_protein_mappings[$ii] != $test_n_protein_mappings[$jj]) {
                       print "!! $test_pep[$jj] : " .
                       " expected n_protein_mappings=$test_n_protein_mappings[$jj]",
                       " but calculated $n_protein_mappings[$ii]\n";
                   }
                   if ($is_exon_spanning[$ii] != $test_is_exon_spanning[$jj]) {
                       print "!! $test_pep[$jj] : " .
                       " expected is_exon_spanning=$test_is_exon_spanning[$jj]",
                       " but calculated $is_exon_spanning[$ii]\n";
                   }
               }
           }
       }
   }


    ## Calculate is_subpeptide_of

    ## hash with key = peptide_accession, value = sub-peptide string list
    my %peptideAccession_subPeptide;

    foreach my $sub_pep_acc (keys %peptideAccession_peptideSequence)
    {        

        for my $super_pep_acc (keys %peptideAccession_peptideSequence)
        {        

            if ( ( index($peptideAccession_peptideSequence{$super_pep_acc}, 
                $peptideAccession_peptideSequence{$sub_pep_acc}) >= 0) 
                && ($super_pep_acc ne $sub_pep_acc) ) {
   
                if ( exists $peptideAccession_subPeptide{$sub_pep_acc} )
                {

                    $peptideAccession_subPeptide{$sub_pep_acc} =
                        join ",", $peptideAccession_subPeptide{$sub_pep_acc},
                        $peptides{$super_pep_acc};

                } else { 

                    $peptideAccession_subPeptide{$sub_pep_acc} = 
                    $peptides{$super_pep_acc};

                }

            }

        }

    }


   if ($TESTVARS) {

       my $n = ($#peptide_accession) + 1;

       print "\nChecking storage of values after calcs.  $n entries\n";

       for(my $i =0; $i <= $#peptide_accession; $i++){

           my $tmp_pep_acc = $peptide_accession[$i];

           my $tmp_strand = $strand[$i] ;
           my $tmp_peptide_id = $peptides{$tmp_pep_acc};
           my $tmp_n_genome_locations = $n_genome_locations[$i];
           my $tmp_is_exon_spanning = $is_exon_spanning[$i];
           my $tmp_n_protein_mappings = $n_protein_mappings[$i];
           my $tmp_start_in_biosequence = $start_in_biosequence[$i];
           my $tmp_end_in_biosequence = $end_in_biosequence[$i];
           my $tmp_chromosome = $chromosome[$i];
           my $tmp_start_in_chromosome = $start_in_chromosome[$i];
           my $tmp_end_in_chromosome = $end_in_chromosome[$i];
           my $tmp_strand = $strand[$i];


           if (!$tmp_pep_acc) {
         
               print "PROBLEM:  missing \$tmp_pep_acc for index $i\n";

           } elsif (!$tmp_strand) {

               print "PROBLEM:  missing \$tmp_strand for $tmp_pep_acc \n";

           } elsif (!$tmp_n_genome_locations) {

               print "PROBLEM:  missing \$tmp_n_genome_locations for $tmp_pep_acc \n";

           } elsif (!$tmp_is_exon_spanning) {

               print "PROBLEM:  missing \$tmp_is_exon_spanning  for $tmp_pep_acc \n";

           } elsif (!$tmp_n_protein_mappings){

               print "PROBLEM:  missing \$tmp_n_protein_mappings for $tmp_pep_acc \n";

           } elsif (!$tmp_start_in_biosequence) {

               print "PROBLEM:  missing \$tmp_start_in_biosequence for $tmp_pep_acc \n";

           } elsif (!$tmp_end_in_biosequence ) {

               print "PROBLEM:  missing \$tmp_end_in_biosequence for $tmp_pep_acc \n";

           } elsif (!$tmp_chromosome ){

               print "PROBLEM:  missing \$tmp_chromosome for $tmp_pep_acc \n";

           } elsif (!$tmp_start_in_chromosome ) {

               print "PROBLEM:  missing \$tmp_start_in_chromosome for $tmp_pep_acc \n";

           } elsif (!$tmp_end_in_chromosome ) {

               print "PROBLEM:  missing \$tmp_end_in_chromosome  for $tmp_pep_acc \n";

           } elsif (!$tmp_strand) {

               print "PROBLEM:  missing \$tmp_strand for $tmp_pep_acc \n";

           }
       }

       print "-->end third stage test of vars\n";
    }


    ####----------------------------------------------------------------------------
    ## Creating peptide_mapping records, and updating peptide_instance records
    ####----------------------------------------------------------------------------
    print "\nCreating peptide_mapping records, and updating peptide_instance records\n";


    ## hash with key = peptide_accesion, value = peptide_instance_id
    my %peptideAccession_peptideInstanceID = 
        get_peptide_accession_instance_id_hash( 
        atlas_build_id => $atlas_build_id );


    my %strand_xlate = ( '-' => '-',   '+' => '+',  '-1' => '-',
                        '+1' => '+',   '1' => '+'
    );


    for (my $i =0; $i <= $#peptide_accession; $i++){

        my $tmp = $strand_xlate{$strand[$i]}
            or die("ERROR: Unable to translate strand $strand[$i]");

        $strand[$i] = $tmp;
 
        #### Make sure we can resolve the biosequence_id
        my $biosequence_id = $biosequence_ids{$biosequence_name[$i]}
            || die("ERROR: BLAST matched biosequence_name $biosequence_name[$i] ".
            "does not appear to be in the biosequence table!!");

        my $tmp_pep_acc = $peptide_accession[$i];
 

        my $peptide_id = $peptides{$tmp_pep_acc} ||
            die("ERROR: Wanted to insert data for peptide $peptide_accession[$i] ".
            "which is in the BLAST output summary, but not in the input ".
            "peptide file??");

        my $peptide_instance_id = $peptideAccession_peptideInstanceID{$tmp_pep_acc};


        ## UPDATE peptide_instance record
        my %rowdata = (   ##   peptide_instance    table attributes
            n_genome_locations => $n_genome_locations[$i],
            is_exon_spanning => $is_exon_spanning[$i],
            n_protein_mappings => $n_protein_mappings[$i],
            is_subpeptide_of => $peptideAccession_subPeptide{$tmp_pep_acc},
        );
 

        my $success = $sbeams->updateOrInsertRow(
            update=>1,
            table_name=>$TBAT_PEPTIDE_INSTANCE,
            rowdata_ref=>\%rowdata,
            PK => 'peptide_instance_id',
            PK_value => $peptide_instance_id,
            verbose=>$VERBOSE,
            testonly=>$TESTONLY,
        );


        ## CREATE peptide_mapping record
        my %rowdata = (   ##   peptide_mapping      table attributes
            peptide_instance_id => $peptide_instance_id,
            matched_biosequence_id => $biosequence_id,
            start_in_biosequence => $start_in_biosequence[$i],
            end_in_biosequence => $end_in_biosequence[$i],
            chromosome => $chromosome[$i],
            start_in_chromosome => $start_in_chromosome[$i],
            end_in_chromosome => $end_in_chromosome[$i],
            strand => $strand[$i],
        );
        
        $sbeams->updateOrInsertRow(
            insert=>1,
            table_name=>$TBAT_PEPTIDE_MAPPING,
            rowdata_ref=>\%rowdata,
            PK => 'peptide_mapping_id',
            verbose=>$VERBOSE,
            testonly=>$TESTONLY,
        );


        print "$i...";
 
    }  ## end  create peptide_mapping records and update peptide_instance records


    print "\n";
 

   ## LAST TEST to assert that all peptides of an atlas in
   ## peptide_instance  are associated with a peptide_instance_sample record
   ## ALSO checks that atlas_build_sample is filled...uses this as start point
   if ($CHECKTABLES) {

       print "\nChecking peptide_instance_sample records\n";

       my $sql = qq~
           SELECT peptide_instance_id, sample_ids
           FROM $TBAT_PEPTIDE_INSTANCE
           WHERE atlas_build_id ='$ATLAS_BUILD_ID'
       ~;

       ## make hash with key = peptide_instance_id, value = sample_ids
       my %peptide_instance_hash = $sbeams->selectTwoColumnHash($sql)
           or die "In last test, unable to complete query:\n$sql\n($!)";

       my $n = keys ( %peptide_instance_hash ) ;

       print "\n$n peptide_instance_ids in atlas build $ATLAS_BUILD_ID\n";

       
       ## get an array of sample_id for a given atlas_id
       $sql = qq~
           SELECT sample_id
           FROM $TBAT_ATLAS_BUILD_SAMPLE
           WHERE atlas_build_id ='$ATLAS_BUILD_ID'
           AND record_status != 'D'
        ~;

       my @sample_id_array =  $sbeams->selectOneColumn($sql) 
           or die "could not find sample id's for atlas_build_id= ".
           "$ATLAS_BUILD_ID ($!)";

       my $sample_id_string = join("\',\'",@sample_id_array);
       ## add a ' to beginning of string and to end
       $sample_id_string =~ s/^(\w+)/\'$1/;
       $sample_id_string =~ s/(\w+)$/$1\'/;


       ## grab all records in peptide_instance_sample that contain
       ## sample_id = one of "sample_id_string"
       $sql = qq~
           SELECT peptide_instance_id, sample_id
           FROM $TBAT_PEPTIDE_INSTANCE_SAMPLE
           WHERE sample_id IN ( $sample_id_string )
           AND record_status != 'D'
       ~;
       ## returns an array of references to arrays (each of which is a returned row?)
       my @peptide_instance_id_ref_array = $sbeams->selectSeveralColumns($sql)
           or die "In last test, unable to complete query:\n$sql\n($!)";


       foreach my $peptide_instance_id ( keys %peptide_instance_hash) {

           my $tmp_sample_ids = $peptide_instance_hash{$peptide_instance_id};
           
           ## split sample_ids into sample_id:
           my @sample_id = split(",", $tmp_sample_ids );

            foreach (my $i; $i <= $#sample_id; $i++) {
 
                ## check that there's an entry in @peptide_instance_id_ref_array
                ## matching $peptide_instance_id and $sample_id[$i]
                my $match_flag = 0;

                foreach my $row (@peptide_instance_id_ref_array) {
             
                    my ($tmp_peptide_instance_id, $tmp_sample_id) = @{$row};

                    if ( ($peptide_instance_id == $tmp_peptide_instance_id)
                    && ($tmp_sample_id == $sample_id[$i]) ) {

                        $match_flag = 1;

                        last;
                    }
                    
                }
 
                unless ($match_flag == 1) {

                    print "couldn't find peptide_instance_sample record for ".
                        "peptide_instance_id $peptide_instance_id in sample $sample_id[$i]\n";

                }
 
            }

       }

   }


}


#######################################################################
# loadFromPAxmlFile - uses SAX Content handler to parse APD_*all.PAxml 
# and call methods within this code to insert [peptide], 
# [peptide_instance*], and [modified_peptide_instance*] records
#
# @param atlas_build_id
# @param sbid_asbid_sid_hash_ref reference to complex hash holding
#    atlas_search_batch_id and sample_id using key = protomics
#    search_batch_id
# @param $infile the APD_<organism>_all.PAxml file
#######################################################################
sub loadFromPAxmlFile {
  my %args = @_;

  my $infile = $args{'infile'} or die "need infile ($!)";

  my $sbid_asbid_sid_hash_ref = $args{'sbid_asbid_sid_hash_ref'} or die
    "need sample_id_hash reference ($!)";

  my $atlas_build_id = $args{'atlas_build_id'} or die
    "need atlas_build_id ($!)";

  ## hash with key = atlas_search_batch_id, value = sample_id
  my %sbid_asbid_sid_hash = %{ $sbid_asbid_sid_hash_ref };

  ## get hash with key=peptide_accession, value=peptide_id
  my %peptide_acc_id_hash = get_peptide_accession_id_hash();

  #### Process parser options
  my $validate = $OPTIONS{validate} || 'auto';
  my $namespace = $OPTIONS{namespaces} || 0;
  my $schema = $OPTIONS{schemas} || 0;

  if (uc($validate) eq 'ALWAYS') {
    $validate = $XML::Xerces::SAX2XMLReader::Val_Always;
  } elsif (uc($validate) eq 'NEVER') {
    $validate = $XML::Xerces::SAX2XMLReader::Val_Never;
  } elsif (uc($validate) eq 'AUTO') {
    $validate = $XML::Xerces::SAX2XMLReader::Val_Auto;
  } else {
    die("Unknown value for -v: $validate\n$USAGE");
  }

  #### Set up the Xerces parser
  my $parser = XML::Xerces::XMLReaderFactory::createXMLReader();
  $parser->setFeature("http://xml.org/sax/features/namespaces", $namespace);

  if ($validate eq $XML::Xerces::SAX2XMLReader::Val_Auto) {
    $parser->setFeature("http://xml.org/sax/features/validation", 1);
    $parser->setFeature("http://apache.org/xml/features/validation/dynamic",1);

  } elsif ($validate eq $XML::Xerces::SAX2XMLReader::Val_Never) {
    $parser->setFeature("http://xml.org/sax/features/validation", 0);

  } elsif ($validate eq $XML::Xerces::SAX2XMLReader::Val_Always) {
    $parser->setFeature("http://xml.org/sax/features/validation", 1);
    $parser->setFeature("http://apache.org/xml/features/validation/dynamic",0);
  }

  $parser->setFeature("http://apache.org/xml/features/validation/schema",
    $schema);


  #### Create the error handler and content handler
  my $error_handler = XML::Xerces::PerlErrorHandler->new();
  $parser->setErrorHandler($error_handler);

  my $CONTENT_HANDLER = PAxmlContentHandler->new();
  $parser->setContentHandler($CONTENT_HANDLER);

  $CONTENT_HANDLER->setVerbosity($VERBOSE);
  $CONTENT_HANDLER->{counter} = 0;
  $CONTENT_HANDLER->{atlas_build_id} = $atlas_build_id;

  $CONTENT_HANDLER->{sbid_asbid_sid_hash} = 
    \%sbid_asbid_sid_hash;

  $CONTENT_HANDLER->{peptide_acc_id_hash} = \%peptide_acc_id_hash;


  $parser->parse(XML::Xerces::LocalFileInputSource->new($infile));

  return(1);

}


#######################################################################
#  getMzXMLFileNames -- get mzXML File Names used in the interact pepXML
# @param search_batch_dir_path absolute path to search_batch_dir
# @return mzXMLFileNames
#######################################################################
sub getMzXMLFileNames
{
    my %args = @_;

    my $search_batch_dir_path = $args{search_batch_dir_path} or die
        " need search_batch_dir_path ($!)";

    my $infile = "$search_batch_dir_path/interact-prob.xml";

    #### Sometimes search_batch_dir_path is actually a file??
    if ($search_batch_dir_path =~ /\.xml/) {
      $infile = $search_batch_dir_path;
    }

    ## to handle both older and newer formats:
    my ($msRunPepXMLFileName, $mzXMLFileName);
    my (@msRunPepXMLFileNames, @mzXMLFileNames);
    my $guessed_experiment_dir;

    unless(-e $infile)
    {
        print "[WARN] could not find $infile\n";

        $infile = "$search_batch_dir_path/interact.xml";
    }
    unless(-e $infile)
    {
        die "could not find $infile either. Abort.\n";
    }

    open(INFILE, "<$infile") or die "cannot open $infile for reading ($!)";

    while (my $line = <INFILE>)
    {
        chomp($line);

        if ($line =~ /^(\<inputfile name=\")(.+)(\/)(.+)(\/)(.+)(\")(\/\>)/)
        {
            my $exp_dir = $2;

            my $tmp = $6;

            if ($tmp =~ /(.+)\.xml/)
            {
                $tmp = $1;
            }

            $mzXMLFileName = "$exp_dir/$tmp" . ".mzXML";

            push (@mzXMLFileNames, $mzXMLFileName);

        }

        #### Workaround for problem Qstar experiments, specifically
        #### /sbeams/archive/rossola/HUPO-ISB/b1-CIT_glyco_qstar
	if ($line =~ m~directory="(.+)/.+">~) {
	  $guessed_experiment_dir = $1;
	}
        if ($line =~ m~<inputfile name="(.+)\.xml"/>~) {
	  $mzXMLFileName = "$guessed_experiment_dir/$1.mzXML";
	  push (@mzXMLFileNames, $mzXMLFileName);
	}


	#### Finish parsing if we get to <roc> element and we've found something
        if ($line =~ /(\<roc)(.+)/ && $mzXMLFileName)
        {
            last;
        }

	#### Finish parsing that <spectrum_query> for sure
        last if ($line =~ /\<spectrum_query/);

    }

    close(INFILE) or die "Cannot close $infile";

    return @mzXMLFileNames;

}


###############################################################################
# insert_peptide
###############################################################################
sub insert_peptide {
  my %args = @_;

  my $rowdata_ref = $args{'rowdata_ref'} or die("need rowdata_ref");

  my $sequence = $rowdata_ref->{peptide_sequence};
  my $mw = $GlycoPeptideModule->calculatePeptideMass( sequence => $sequence );
  my $pI = $GlycoPeptideModule->calculatePeptidePI( sequence => $sequence );
  my $hp;
  if ($SSRCalculator->checkSequence($sequence)) {
    $hp = $SSRCalculator->TSUM3($sequence);
  }

  $rowdata_ref->{molecular_weight} = $mw;
  $rowdata_ref->{peptide_isoelectric_point} = $pI;
  $rowdata_ref->{SSRCalc_relative_hydrophobicity} = $hp;

  my $peptide_id = $sbeams->updateOrInsertRow(
    insert=>1,
    table_name=>$TBAT_PEPTIDE,
    rowdata_ref=>$rowdata_ref,
    PK => 'peptide_id',
    return_PK => 1,
    verbose=>$VERBOSE,
    testonly=>$TESTONLY,
  );

  return($peptide_id);

} # end insert_peptide


###############################################################################
# insert_peptide_instance
###############################################################################
sub insert_peptide_instance {
  my %args = @_;

  my $rowdata_ref = $args{'rowdata_ref'} or die("need rowdata_ref");

  my $peptide_instance_id = $sbeams->updateOrInsertRow(
    insert=>1,
    table_name=>$TBAT_PEPTIDE_INSTANCE,
    rowdata_ref=>$rowdata_ref,
    PK => 'peptide_instance_id',
    return_PK => 1,
    verbose=>$VERBOSE,
    testonly=>$TESTONLY,
  );

  return($peptide_instance_id);

} # end insert_peptide_instance


###############################################################################
# insert_modified_peptide_instance
###############################################################################
sub insert_modified_peptide_instance {
  my %args = @_;

  my $rowdata_ref = $args{'rowdata_ref'} or die("need rowdata_ref");


  #### Calculate some mass values based on the sequence
  my $modified_peptide_sequence = $rowdata_ref->{modified_peptide_sequence};
  my $peptide_charge = $rowdata_ref->{peptide_charge};

  $rowdata_ref->{monoisotopic_peptide_mass} =
    $massCalculator->getPeptideMass(
				    sequence => $modified_peptide_sequence,
				    mass_type => 'monoisotopic',
				   );

  $rowdata_ref->{average_peptide_mass} =
    $massCalculator->getPeptideMass(
				    sequence => $modified_peptide_sequence,
				    mass_type => 'average',
				   );

  $rowdata_ref->{monoisotopic_parent_mz} =
    $massCalculator->getPeptideMass(
				    sequence => $modified_peptide_sequence,
				    mass_type => 'monoisotopic',
				    charge => $peptide_charge,
				   );

  $rowdata_ref->{average_parent_mz} =
    $massCalculator->getPeptideMass(
				    sequence => $modified_peptide_sequence,
				    mass_type => 'average',
				    charge => $peptide_charge,
				   );


  #### INSERT the record
  my $modified_peptide_instance_id = $sbeams->updateOrInsertRow(
    insert=>1,
    table_name=>$TBAT_MODIFIED_PEPTIDE_INSTANCE,
    rowdata_ref=>$rowdata_ref,
    PK => 'modified_peptide_instance_id',
    return_PK => 1,
    verbose=>$VERBOSE,
    testonly=>$TESTONLY,
  );

  return($modified_peptide_instance_id);

} # end insert_modified_peptide_instance


###############################################################################
# insert_peptide_instance_samples
#
# @param $peptide_instance_id
# @param $sample_ids string of sample_ids separated by commas
###############################################################################
sub insert_peptide_instance_samples {

  my %args = @_;

  my $peptide_instance_id = $args{'peptide_instance_id'}
    or die("need peptide_instance_id");

  my $sample_ids = $args{'sample_ids'} or die("need sample_ids");

  my @sample_ids = split(/,/,$sample_ids);

  foreach my $sample_id ( @sample_ids ) {
    my %rowdata = (
      peptide_instance_id => $peptide_instance_id,
      sample_id => $sample_id,
    );

    $sbeams->updateOrInsertRow(
      insert=>1,
      table_name=>$TBAT_PEPTIDE_INSTANCE_SAMPLE,
      rowdata_ref=>\%rowdata,
      PK => 'peptide_instance_sample_id',
      add_audit_parameters => 1,
      verbose=>$VERBOSE,
      testonly=>$TESTONLY,
    );

  }

  return(1);

} # end insert_peptide_instance_samples


###############################################################################
# insert_peptide_instance_search_batches
#  
# @param $peptide_instance_id
# @param $atlas_search_batch_ids
###############################################################################
sub insert_peptide_instance_search_batches 
{
    my %args = @_;

    my $peptide_instance_id = $args{'peptide_instance_id'}
        or die("need peptide_instance_id");

    my $atlas_search_batch_ids = $args{'atlas_search_batch_ids'} 
        or die("need atlas_search_batch_ids");

    my @atlas_search_batch_id_array = split(/,/,$atlas_search_batch_ids);

    foreach my $atlas_search_batch_id ( @atlas_search_batch_id_array ) 
    {
        my %rowdata = (
            peptide_instance_id => $peptide_instance_id,
            atlas_search_batch_id => $atlas_search_batch_id,
        );

        $sbeams->updateOrInsertRow(
            insert=>1,
            table_name=>$TBAT_PEPTIDE_INSTANCE_SEARCH_BATCH,
            rowdata_ref=>\%rowdata,
            PK => 'peptide_instance_search_batch_id',
            add_audit_parameters => 1,
            verbose=>$VERBOSE,
            testonly=>$TESTONLY,
        );

    }

    return(1);

} # end insert_peptide_instance_search_batches


###############################################################################
# insert_modified_peptide_instance_samples
###############################################################################
sub insert_modified_peptide_instance_samples 
{

  my %args = @_;

  my $modified_peptide_instance_id = $args{'modified_peptide_instance_id'}
    or die("need modified_peptide_instance_id");

  my $sample_ids = $args{'sample_ids'} or die("need sample_ids");

  my @sample_ids = split(/,/,$sample_ids);

  foreach my $sample_id ( @sample_ids ) {
    my %rowdata = (
      modified_peptide_instance_id => $modified_peptide_instance_id,
      sample_id => $sample_id,
    );

    $sbeams->updateOrInsertRow(
      insert=>1,
      table_name=>$TBAT_MODIFIED_PEPTIDE_INSTANCE_SAMPLE,
      rowdata_ref=>\%rowdata,
      PK => 'modified_peptide_instance_sample_id',
      add_audit_parameters => 1,
      verbose=>$VERBOSE,
      testonly=>$TESTONLY,
    );

  }

  return(1);

} # end insert_modified_peptide_instance_samples


###############################################################################
# insert_modified_peptide_instance_search_batches
#  
# @param $modified_peptide_instance_id
# @param $atlas_search_batch_ids
###############################################################################
sub insert_modified_peptide_instance_search_batches 
{
    my %args = @_;

    my $modified_peptide_instance_id = $args{'modified_peptide_instance_id'}
        or die("need modified_peptide_instance_id");

    my $atlas_search_batch_ids = $args{'atlas_search_batch_ids'} 
        or die("need atlas_search_batch_ids");

    my @atlas_search_batch_id_array = split(/,/,$atlas_search_batch_ids);

    foreach my $atlas_search_batch_id ( @atlas_search_batch_id_array ) 
    {
        my %rowdata = (
            modified_peptide_instance_id => $modified_peptide_instance_id,
            atlas_search_batch_id => $atlas_search_batch_id,
        );

        $sbeams->updateOrInsertRow(
            insert=>1,
            table_name=>$TBAT_MODIFIED_PEPTIDE_INSTANCE_SEARCH_BATCH,
            rowdata_ref=>\%rowdata,
            PK => 'modified_peptide_instance_search_batch_id',
            add_audit_parameters => 1,
            verbose=>$VERBOSE,
            testonly=>$TESTONLY,
        );

    }

    return(1);

} # end insert_modified_peptide_instance_search_batches

#######################################################################
# populateSampleRecordsWithSampleAccession - 
# update sample records that need accession numbers
#######################################################################
sub populateSampleRecordsWithSampleAccession
{

    ## get hash with key=sample_id, value=sample_accession
    ##                              (where values may be '')
    my %sampleId_sampleAccession_hash = getSampleIdSampleAccessionHash();


    ## accession root name and number of digits:
    my $root_name = "PAe";

    my $num_digits = 6;

    ## get last number used, will return 0 if none were set yet
    my $last_number_used = getLastNumberFromAccessions(
        hash_ref => \%sampleId_sampleAccession_hash,
        root_name => $root_name,
    ); 


    ## sort numerically by key:
    foreach my $sample_id (sort { $a <=> $b } keys %sampleId_sampleAccession_hash)
    {
 
        my $existing_accession = $sampleId_sampleAccession_hash{$sample_id};

        ## execute only if there isn't a sample_accession
        unless ($existing_accession )
        {
            my $next_accession;

            my $next_number = $last_number_used + 1;

            my $next_number_length =  length($next_number);

            # exit with error if num digits needed is larger than num digits expected
            if ($next_number_length > $num_digits)
            {
                print "number of digits exceeds $num_digits\n";

                exit(0);
            }


            my $next_accession = $root_name;

            for (my $i=0; $i < ($num_digits - $next_number_length); $i++ )
            {
                $next_accession = $next_accession . "0";
            }

            $next_accession = $next_accession . $next_number;
     
            updateSampleRecord(
                sample_id => $sample_id,
                sample_accession => $next_accession,
            );
 
            $last_number_used = $last_number_used + 1;
 
        }
 
    }
 
}


###############################################################################
#  getSampleIdSampleAccessionHash
#
# @return hash with key = sample_id, value = sample_accession
###############################################################################
sub getSampleIdSampleAccessionHash
{
    my %args = @_;

    ## having to go around through peptide records, then using hash to
    ## store distinct returned rows
    my $sql = qq~
        SELECT sample_id, sample_accession
        FROM $TBAT_SAMPLE 
        WHERE record_status != 'D'
    ~;

    my %hash = $sbeams->selectTwoColumnHash($sql) or die
        "unable to execute statement:\n$sql\n($!)";

    return %hash;

}

#######################################################################
# getLastNumberFromAccessions
# @param ref to sample_id, sample_accession hash
# @param default_first_accession
# @return the latest accession in hash, or zero if none present
#######################################################################
sub getLastNumberFromAccessions
{
    my %args = @_;

    my $hash_ref = $args{hash_ref} or die "need hash_ref";

    my $root_name = $args{root_name} or die "need root_name";

    my %hash = %{$hash_ref};

    my $last_number = 0;

    my $last_written_accession;

    my $n = keys %hash;

    if ($n > 0)
    {
        ## sort by hash values in descending asciibetical order:
        my @sorted_keys = sort { $hash{$b} cmp $hash{$a} } keys %hash;

        if (@sorted_keys)
        {
            $last_written_accession = $hash{$sorted_keys[0]};

            $last_number = $last_written_accession;

            $last_number =~ s/($root_name)(\d+)$/$2/;

            $last_number = int($last_number);
        }

    }

    return $last_number;
}


###############################################################################
# updateSampleRecord -- update sample record...
# @param sample_id
# @param sample_accession
###############################################################################
sub updateSampleRecord 
{

    my %args = @_;

    my $sample_id = $args{sample_id} or die "need sample_id";

    my $sample_accession = $args{sample_accession} 
        or die "need sample_accession";


    my %rowdata = (  
        sample_accession => $sample_accession,
    );

    my $success = $sbeams->updateOrInsertRow(
        update=>1,
        table_name=>$TBAT_SAMPLE,
        rowdata_ref=>\%rowdata,
        PK => 'sample_id',
        PK_value => $sample_id,
        verbose=>$VERBOSE,
        testonly=>$TESTONLY,
    );

    return $success;

} 


#######################################################################
# getNextAccession - get next accession
# @param root_name - root of accession name (e.g. "PAe")
# @param num_digits - number of digits in accession string
# @param last_num_used - last number used
# @return next_accession
#######################################################################
sub getNextAccession
{
    my %args = @_;

    my $num_digits = $args{num_digits} or die "need num_digits";

    my $last_number_used = $args{last_num_used} or die "need last_num_used";

    my $root_name = $args{root_name} or die "need root_name";

    my $next_accession;

    my $next_number = $last_number_used + 1;

    my $next_number_length =  length($next_number);

    # exit with error if num digits needed is larger than num digits expected
    if ($next_number_length > $num_digits)
    {
        print "number of digits exceeds $num_digits\n";

        exit(0);
    }


    my $next_accession = $root_name;

    for (my $i=0; $i < ($num_digits - $next_number_length); $i++ )
    {
        $next_accession = $next_accession . "0";
    }

    $next_accession = $next_accession . $next_number;

    return $next_accession;

}


#######################################################################
#  print_hash
#
# @param hash_ref to hash to print
#######################################################################
sub print_hash
{

    my %args = @_;

    my $hr = $args{hash_ref};

    my %h = %{$hr};

    foreach my $k (keys %h)
    {
        print "key: $k   value:$h{$k}\n";

    }

}