function tripal_feature_load_fasta

2.x tripal_feature.fasta_loader.inc tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type, $job = NULL)
1.x fasta_loader.inc tripal_feature_load_fasta($dfile, $organism_id, $type, $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type, $job = NULL)

Loads a FASTA file. This is the function called by Tripal jobs.

Parameters

$dfile: The full path to the fasta file to load

$organism_id: The organism_id of the organism these features are from

$type: The type of features contained in the fasta file

$re_name: A regular expression to extract the feature.name from the fasta header

$re_uname: A regular expression to extract the feature.uniquename from the fasta header

$re_accession: A regular expression to extract the accession of the feature.dbxref_id

$db_id: The db_id of the above dbxref

$rel_type: The type of relationship when creating a feature_relationship between this feature (object) and an extracted subject

$re_subject: The regular expression to extract the uniquename of the feature to be the subject of the above specified relationship

$parent_type: The type of the parent feature

$method: The method of feature adding (i.e. 'Insert only', 'Update only', 'Insert and update')

$uid: The user id of the user who submitted the job

$analysis_id: The analysis_id to associate the features in this fasta file with

$match_type: Whether to match existing features based on the 'Name' or 'Unique name'

$job = NULL: The Tripal job

Related topics

2 string references to 'tripal_feature_load_fasta'
tripal_feature_fasta_load_form_submit in tripal_feature/includes/tripal_feature.fasta_loader.inc
Submit a fasta loading job
tripal_feature_job_describe_args in tripal_feature/tripal_feature.module
Implements hook_job_describe_args() in order to describe the various feature jobs to the tripal jobs interface.

File

tripal_feature/includes/tripal_feature.fasta_loader.inc, line 388
Provides fasta loading functionality. Creates features based on their specification in a fasta file.

Code

/**
 * Loads the sequences of a FASTA file as features.
 *
 * This is the function called by Tripal jobs. The file is read in two passes:
 * the first pass scans every line, extracts the name, unique name, accession
 * and relationship subject from each definition line and records the byte
 * offsets at which the sequence residues start and end; the second pass hands
 * each recorded sequence to tripal_feature_load_fasta_feature(). Everything
 * runs inside a single database transaction that is rolled back if an
 * exception is thrown.
 *
 * @param $dfile
 *   The full path to the fasta file to load.
 * @param $organism_id
 *   The organism_id of the organism these features are from.
 * @param $type
 *   The type of features contained in the fasta file.
 * @param $re_name
 *   A regular expression to extract the feature.name from the fasta header.
 *   The value is taken from the first capture group.
 * @param $re_uname
 *   A regular expression to extract the feature.uniquename from the fasta
 *   header. The value is taken from the first capture group.
 * @param $re_accession
 *   A regular expression to extract the accession of the feature.dbxref_id.
 *   The value is taken from the first capture group.
 * @param $db_id
 *   The db_id of the above dbxref.
 * @param $rel_type
 *   The type of relationship when creating a feature_relationship between
 *   this feature (object) and an extracted subject.
 * @param $re_subject
 *   The regular expression to extract the uniquename of the feature to be the
 *   subject of the above specified relationship.
 * @param $parent_type
 *   The type of the parent feature.
 * @param $method
 *   The method of feature adding (i.e. 'Insert only', 'Update only',
 *   'Insert and update').
 * @param $uid
 *   The user id of the user who submitted the job. Not used by this function.
 * @param $analysis_id
 *   The analysis_id to associate the features in this fasta file with.
 * @param $match_type
 *   Whether to match existing features based on the 'Name' or 'Unique name'.
 * @param $job
 *   The tripal job. When empty no job progress is reported.
 *
 * @return
 *   0 when one of the requested terms cannot be found or the file cannot be
 *   opened. Nothing is returned otherwise.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
  $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
  $job = NULL) {
  $transaction = db_transaction();

  // Defined before the try block so the exception handler can tell whether
  // the file was ever opened.
  $fh = NULL;

  print "\nNOTE: Loading of this Fasta file is performed using a database transaction. \n" .
    "If the load fails or is terminated prematurely then the entire set of \n" .
    "insertions/updates is rolled back and will not be found in the database\n\n";
  try {

    // First get the type for this sequence.
    $cvtermsql = "
      SELECT CVT.cvterm_id
      FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
    ";
    $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
    if (!$cvterm) {
      tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
      "Cannot find the term type: '%type'", array('%type' => $type));
      return 0;
    }

    // Second, if there is a parent type then get that.
    $parentcvterm = NULL;
    if ($parent_type) {
      $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
      if (!$parentcvterm) {
        // Report the requested type name, not the (empty) lookup result.
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the parent term type: '%type'", array(
          '%type' => $parent_type
        ));
        return 0;
      }
    }

    // Third, if there is a relationship type then get that.
    $relcvterm = NULL;
    if ($rel_type) {
      $relcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
      if (!$relcvterm) {
        // Report the requested type name, not the (empty) lookup result.
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the relationship term type: '%type'", array(
          '%type' => $rel_type
        ));
        return 0;
      }
    }

    // We need to get the table schema to make sure we don't overrun the
    // size of fields with what our regular expressions retrieve
    $feature_tbl = chado_get_schema('feature');
    $dbxref_tbl = chado_get_schema('dbxref');

    print "Step 1: finding sequences\n";
    $filesize = filesize($dfile);
    $fh = fopen($dfile, 'r');
    if (!$fh) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "cannot open file: %dfile", array(
        '%dfile' => $dfile
      ));
      return 0;
    }

    // Calculate the interval (in bytes, 1% of the file) at which we will
    // print the status to the screen.
    $interval = intval($filesize * 0.01);
    if ($interval < 1) {
      $interval = 1;
    }
    $num_read = 0;

    // Iterate through the lines of the file. Keep a record for
    // where in the file each line is at for later import.
    $seqs = array();
    $num_seqs = 0;
    $prev_pos = 0;
    $intv_read = 0;
    $line_num = 0;

    // Compare against FALSE explicitly so that a line whose content is falsy
    // (for example a final "0" without a newline) does not end the loop.
    while (($line = fgets($fh)) !== FALSE) {
      $line_num++;
      $num_read += strlen($line);
      $intv_read += strlen($line);

      // If we encounter a definition line then get the name, uniquename,
      // accession and relationship subject from the definition line.
      if (preg_match('/^>/', $line)) {

        // Remove the > symbol from the defline.
        $defline = preg_replace("/^>/", '', $line);

        // Get the feature name if a regular expression is provided.
        $name = "";
        if ($re_name) {
          if (!preg_match("/$re_name/", $defline, $matches)) {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature name finds nothing. Line %line.", array(
              '%line' => $line_num
            ));
          }
          elseif (isset($matches[1]) && strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
            tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array(
              '%line' => $line_num
            ));
          }
          elseif (isset($matches[1])) {
            $name = trim($matches[1]);
          }
        }
        // If the match_type is name and no regular expression was provided
        // then use the first word as the name, otherwise we don't set the name.
        elseif (strcmp($match_type, 'Name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array(
                '%line' => $line_num));
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature name. Line %line.", array(
              '%line' => $line_num));
          }
        }

        // Get the feature uniquename if a regular expression is provided.
        $uname = "";
        if ($re_uname) {
          if (!preg_match("/$re_uname/", $defline, $matches)) {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array(
              '%line' => $line_num));
          }
          // Only read the capture group when the expression produced one.
          elseif (isset($matches[1])) {
            $uname = trim($matches[1]);
          }
        }
        // If the match_type is unique name and no regular expression was
        // provided then use the first word as the unique name, otherwise, we
        // don't set the uniquename.
        elseif (strcmp($match_type, 'Unique name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            $uname = trim($matches[1]);
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature unique name. Line %line.", array(
              '%line' => $line_num));
          }
        }

        // Get the accession if a regular expression is provided. A defline
        // that does not match simply gets no accession.
        $accession = "";
        if (!empty($re_accession)) {
          if (preg_match("/$re_accession/", $defline, $matches) && isset($matches[1])) {
            if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. " .
                "Cannot add cross reference. Line %line.", array('%line' => $line_num
              ));
            }
            else {
              $accession = trim($matches[1]);
            }
          }
        }

        // Get the relationship subject. Unlike the other expressions this one
        // is applied to the raw line, which still carries the leading '>'.
        $subject = "";
        if (!empty($re_subject)) {
          if (preg_match("/$re_subject/", $line, $matches) && isset($matches[1])) {
            $subject = trim($matches[1]);
          }
        }

        // Add the details to the sequence. The residues start right after
        // the definition line that was just read.
        $seqs[$num_seqs] = array(
          'name' => $name,
          'uname' => $uname,
          'accession' => $accession,
          'subject' => $subject,
          'seq_start' => ftell($fh)
        );
        // If this isn't the first sequence, then we want to specify where
        // the previous sequence ended.
        if ($num_seqs > 0) {
          $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
        }
        $num_seqs++;
      }
      // Keep the current file position so we can use it to set the sequence
      // ending position
      $prev_pos = ftell($fh);

      // update the job status every % bytes
      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        // Guard the division: the reported file size may be zero.
        $fraction = $filesize > 0 ? $num_read / $filesize : 1;
        $percent = sprintf("%.2f", $fraction * 100);
        print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
          " bytes.\r";
        tripal_set_job_progress($job, intval($fraction * 100));
      }
    }
    // An empty file is reported as fully parsed instead of dividing by zero.
    $fraction = $filesize > 0 ? $num_read / $filesize : 1;
    $percent = sprintf("%.2f", $fraction * 100);
    print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
      " bytes.\r";
    // Only report progress when a job was supplied, matching the check used
    // inside the parsing loop.
    if ($job) {
      tripal_set_job_progress($job, 50);
    }

    // Set the end position for the last sequence: the total number of bytes
    // read. Skipped when the file contained no definition line at all.
    if ($num_seqs > 0) {
      $seqs[$num_seqs - 1]['seq_end'] = $num_read;
    }

    // Now that we know where the sequences are in the file we need to add them.
    print "\nStep 2: Importing sequences\n";
    for ($j = 0; $j < $num_seqs; $j++) {
      $seq = $seqs[$j];
      print "Importing " . ($j + 1) . " of $num_seqs. ";
      // Label the sequence with its own name, falling back to its own
      // unique name when it has none.
      if ($seq['name']) {
        print "Current feature: " . $seq['name'] . ".\n";
      }
      else {
        print "Current feature: " . $seq['uname'] . ".\n";
      }

      $source = NULL;
      tripal_feature_load_fasta_feature($fh, $seq['name'], $seq['uname'], $db_id, $seq['accession'], $seq['subject'], $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name, $match_type, $parentcvterm, $relcvterm, $seq['seq_start'], $seq['seq_end']);
    }
    if ($job) {
      tripal_set_job_progress($job, 100);
    }
    fclose($fh);
  }
  catch (Exception $e) {
    // The exception may have been thrown before the file was opened.
    if ($fh) {
      fclose($fh);
    }
    $transaction->rollback();
    print "\n"; // make sure we start errors on new line
    watchdog_exception('T_fasta_loader', $e);
    print "FAILED: Rolling back database changes...\n";
  }

  print "\nDone\n";
}