function tripal_feature_load_fasta

2.x tripal_feature.fasta_loader.inc tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type, $job = NULL)
1.x fasta_loader.inc tripal_feature_load_fasta($dfile, $organism_id, $type, $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type, $job = NULL)

Related topics

2 string references to 'tripal_feature_load_fasta'

File

tripal_feature/includes/fasta_loader.inc, line 424
@todo Add file header description

Code

/**
 * Parses a FASTA file and hands each sequence record to
 * tripal_feature_fasta_loader_handle_feature().
 *
 * The file is read line by line. A line starting with '>' is a definition
 * line: the record collected so far is handed to the handler and the name,
 * unique name, accession and relationship subject of the next record are
 * extracted from the definition line. Every other line is appended (trimmed)
 * to the residues of the current record. The work is wrapped between
 * tripal_db_start_transaction() and tripal_db_commit_transaction().
 *
 * @param $dfile
 *   Path of the FASTA file to read.
 * @param $organism_id
 *   Passed through to the feature handler.
 * @param $type
 *   Name (or synonym) of a term in the 'sequence' vocabulary; the loader
 *   aborts if it cannot be found.
 * @param $library_id
 *   Not used by this function.
 * @param $re_name
 *   Regular expression (without delimiters) whose first capture group is the
 *   feature name. If empty and $match_type is 'Name', the first word of the
 *   definition line is used.
 * @param $re_uname
 *   Regular expression (without delimiters) whose first capture group is the
 *   feature unique name. If empty and $match_type is 'Unique name', the first
 *   word of the definition line is used.
 * @param $re_accession
 *   Regular expression (without delimiters) whose first capture group is the
 *   accession. May be empty.
 * @param $db_id
 *   Passed through to the feature handler.
 * @param $rel_type
 *   Optional name (or synonym) of a term in the 'relationship' vocabulary;
 *   the loader aborts if it is given but cannot be found.
 * @param $re_subject
 *   Regular expression (without delimiters) whose first capture group is the
 *   relationship subject. May be empty.
 * @param $parent_type
 *   Optional name (or synonym) of a term in the 'sequence' vocabulary; the
 *   loader aborts if it is given but cannot be found.
 * @param $method
 *   Passed through to the feature handler.
 * @param $uid
 *   Not used by this function.
 * @param $analysis_id
 *   Passed through to the feature handler.
 * @param $match_type
 *   'Name' or 'Unique name'; selects the first-word fallback described above
 *   and is passed through to the feature handler.
 * @param $job
 *   Optional job identifier. When set, progress is printed and reported
 *   through tripal_job_set_progress().
 *
 * @return
 *   0 when a required term cannot be found or the file cannot be opened;
 *   nothing otherwise.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type,
  $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
  $re_subject, $parent_type, $method, $uid, $analysis_id,
  $match_type, $job = NULL) {

  // begin the transaction
  // NOTE(review): the early "return 0" paths below leave without committing
  // or rolling back this transaction -- confirm that is acceptable.
  $connection = tripal_db_start_transaction();

  // if we cannot get a connection then let the user know the loading will be slow
  if (!$connection) {
    print "A persistant connection was not obtained. Loading will be slow\n";
  }
  else {
    print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
      "If the load fails or is terminated prematurely then the entire set of \n" .
      "insertions/updates is rolled back and will not be found in the database\n\n";
  }

  // first get the type for this sequence
  $cvtermsql = "SELECT CVT.cvterm_id
               FROM {cvterm} CVT
                  INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                  LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
               WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
  if (!$cvterm) {
    watchdog("T_fasta_loader", "Cannot find the term type: '%type'", array('%type' => $type), WATCHDOG_ERROR);
    return 0;
  }

  // the parent and relationship terms are optional, so make sure the
  // variables exist even when they are not requested
  $parentcvterm = NULL;
  if ($parent_type) {
    $parentcvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
    if (!$parentcvterm) {
      // report the term that was asked for, not the (empty) lookup result
      watchdog("T_fasta_loader", "Cannot find the paretne term type: '%type'", array('%type' => $parent_type), WATCHDOG_ERROR);
      return 0;
    }
  }
  $relcvterm = NULL;
  if ($rel_type) {
    $relcvterm = db_fetch_object(chado_query($cvtermsql, 'relationship', $rel_type, $rel_type));
    if (!$relcvterm) {
      // report the term that was asked for, not the (empty) lookup result
      watchdog("T_fasta_loader", "Cannot find the relationship term type: '%type'", array('%type' => $rel_type), WATCHDOG_ERROR);
      return 0;
    }
  }

  print "Opening FASTA file $dfile\n";

  $fh = fopen($dfile, 'r');
  if (!$fh) {
    watchdog('T_fasta_loader', "cannot open file: %dfile", array('%dfile' => $dfile), WATCHDOG_ERROR);
    return 0;
  }
  $filesize = filesize($dfile);

  // progress is reported roughly every 1% of the file
  $interval = intval($filesize * 0.01);
  if ($interval < 1) {
    $interval = 1;
  }

  $i = 0;          // current line number
  $num_read = 0;   // bytes read so far
  $intv_read = 0;  // bytes read since the last progress report

  // state of the record currently being collected
  $name = '';
  $uname = '';
  $accession = '';
  $subject = '';
  $residues = '';
  // NOTE(review): $source is never assigned anywhere in this function; it is
  // passed to the handler as NULL -- confirm what the handler expects.
  $source = NULL;

  // we need to get the table schema to make sure we don't overrun the
  // size of fields with what our regular expressions retrieve
  $feature_tbl = tripal_core_get_chado_table_schema('feature');
  $dbxref_tbl = tripal_core_get_chado_table_schema('dbxref');

  // compare against FALSE explicitly so a line that evaluates to false
  // (e.g. a final "0" without a newline) does not end the loop early
  while (($line = fgets($fh)) !== FALSE) {
    $i++; // update the line count

    // count bytes, because the total we compare against comes from filesize()
    $line_len = strlen($line);
    $num_read += $line_len;
    $intv_read += $line_len;

    // if we encounter a definition line then get the name, uniquename,
    // accession and relationship subject from the definition line
    if (preg_match('/^>/', $line)) {
      // if we have a feature name then we are starting a new sequence
      // so lets handle the previous one before moving on
      if ($name or $uname) {
        tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
          $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
          $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
      }

      // always start the new record from a clean slate so that nothing
      // (residues of an unnamed record, a previous accession or subject)
      // leaks into the next feature
      $residues = '';
      $name = '';
      $uname = '';
      $accession = '';
      $subject = '';

      $line = preg_replace("/^>/", '', $line); // remove the > symbol from the defline

      // get the feature name
      if ($re_name) {
        if (!preg_match("/$re_name/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        elseif (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        else {
          $name = trim($matches[1]);
        }
      }
      // if the match_type is name and no regular expression was provided
      // then use the first word as the name, otherwise we don't set the name
      elseif (strcmp($match_type, 'Name') == 0) {
        if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
          if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
            watchdog('trp-fasta', "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
          }
          else {
            $name = trim($matches[1]);
          }
        }
        else {
          watchdog('trp-fasta', "ERROR: Cannot find a feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
      }

      // get the feature unique name
      if ($re_uname) {
        if (!preg_match("/$re_uname/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        else {
          // only read the capture group when the expression really matched
          $uname = trim($matches[1]);
        }
      }
      // if the match_type is unique name and no regular expression was provided
      // then use the first word as the name, otherwise, we don't set the uniquename
      elseif (strcmp($match_type, 'Unique name') == 0) {
        if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
          $uname = trim($matches[1]);
        }
        else {
          watchdog('trp-fasta', "ERROR: Cannot find a feature unique name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
      }

      // get the accession (optional: skipped when no expression is given
      // or when it does not capture anything)
      if ($re_accession and preg_match("/$re_accession/", $line, $matches) and isset($matches[1])) {
        if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves an accession too long for the feature name. Cannot add cross reference. Line %line.", array('%line' => $i), WATCHDOG_WARNING);
        }
        else {
          $accession = trim($matches[1]);
        }
      }

      // get the relationship subject (optional, same rules as the accession)
      if ($re_subject and preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
        $subject = trim($matches[1]);
      }
    }
    else {
      $residues .= trim($line);

      // update the job status once enough of the file has been read
      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        // guard the division in case the file size could not be determined
        $fraction = ($filesize > 0) ? ($num_read / $filesize) : 0;
        $percent = sprintf("%.2f", $fraction * 100);
        $current = $name ? $name : $uname;
        print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $current\r";
        tripal_job_set_progress($job, intval($fraction * 100));
      }
    }
  }

  // now load the last sequence in the file (if one was found at all)
  if ($name or $uname) {
    tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
      $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
      $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
  }

  fclose($fh);

  // commit the transaction
  tripal_db_commit_transaction();
  print "\nDone\n";
}