function tripal_feature_load_fasta
2.x tripal_feature.fasta_loader.inc | tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
$db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
$job = NULL) |
1.x fasta_loader.inc | tripal_feature_load_fasta($dfile, $organism_id, $type,
$library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
$re_subject, $parent_type, $method, $uid, $analysis_id,
$match_type, $job = NULL) |
Actually load a fasta file. This is the function called by tripal jobs
Parameters
$dfile: The full path to the fasta file to load
$organism_id: The organism_id of the organism these features are from
$type: The type of features contained in the fasta file
$re_name: A regular expression to extract the feature.name from the fasta header
$re_uname: A regular expression to extract the feature.uniquename from the fasta header
$re_accession: A regular expression to extract the accession of the feature.dbxref_id
$db_id: The db_id of the above dbxref
$rel_type: The type of relationship when creating a feature_relationship between this feature (object) and an extracted subject
$re_subject: The regular expression to extract the uniquename of the feature to be the subject of the above specified relationship
$parent_type: The type of the parent feature
$method: The method of feature adding (i.e., 'Insert only', 'Update only', 'Insert and update')
$uid: The user id of the user who submitted the job
$analysis_id: The analysis_id to associate the features in this fasta file with
$match_type: Whether to match existing features based on the 'Name' or 'Unique name'
$job = NULL: The tripal job
Related topics
- tripal_feature_fasta_load_form_submit in tripal_feature/
includes/ tripal_feature.fasta_loader.inc - Submit a fasta loading job
- tripal_feature_job_describe_args in tripal_feature/
tripal_feature.module - Implements hook_job_describe_args() in order to describe the various feature jobs to the tripal jobs interface.
File
- tripal_feature/
includes/ tripal_feature.fasta_loader.inc, line 388 - Provides fasta loading functionality. Creates features based on their specification in a fasta file.
Code
/**
 * Loads the sequences of a FASTA file as Chado features.
 *
 * The load runs in two passes inside a single database transaction:
 *  - Step 1 scans the file, parses every definition line (">...") and records
 *    the byte offsets at which each sequence starts and ends.
 *  - Step 2 hands each recorded sequence to
 *    tripal_feature_load_fasta_feature(), which reads the residues back from
 *    the still-open file handle using those offsets.
 *
 * If an exception is thrown the transaction is rolled back.
 *
 * @param $dfile
 *   The full path to the fasta file to load.
 * @param $organism_id
 *   The organism_id of the organism these features are from.
 * @param $type
 *   The type (name or synonym of a term in the 'sequence' cv) of the features
 *   contained in the fasta file.
 * @param $re_name
 *   A regular expression (without delimiters) whose first capture group
 *   extracts the feature name from the definition line.
 * @param $re_uname
 *   A regular expression (without delimiters) whose first capture group
 *   extracts the feature unique name from the definition line.
 * @param $re_accession
 *   A regular expression (without delimiters) whose first capture group
 *   extracts the dbxref accession from the definition line.
 * @param $db_id
 *   The db_id for the accession.
 * @param $rel_type
 *   The relationship type (looked up in the 'sequence' cv).
 * @param $re_subject
 *   A regular expression (without delimiters) whose first capture group
 *   extracts the relationship subject. Note that it is applied to the raw
 *   line, including the leading '>'.
 * @param $parent_type
 *   The type of the parent feature (looked up in the 'sequence' cv).
 * @param $method
 *   The method of feature adding; passed through unchanged.
 * @param $uid
 *   The user id of the user who submitted the job (not used here).
 * @param $analysis_id
 *   The analysis_id to associate the features with.
 * @param $match_type
 *   Either 'Name' or 'Unique name'. When the corresponding regular expression
 *   is not supplied the first word of the definition line is used instead.
 * @param $job
 *   The tripal job; when set, progress is reported while parsing.
 *
 * @return
 *   0 when a required cvterm cannot be found or the file cannot be opened,
 *   otherwise nothing.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
  $db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
  $job = NULL) {

  $transaction = db_transaction();
  // Declared before the try block so the catch block can tell whether the
  // file was ever opened.
  $fh = NULL;

  print "\nNOTE: Loading of this Fasta file is performed using a database transaction. \n" .
    "If the load fails or is terminated prematurely then the entire set of \n" .
    "insertions/updates is rolled back and will not be found in the database\n\n";

  try {

    // First get the type for this sequence.
    $cvtermsql = "
      SELECT CVT.cvterm_id
      FROM {cvterm} CVT
        INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
        LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
      WHERE cv.name = :cvname and (CVT.name = :name or CVTS.synonym = :synonym)
    ";
    $cvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $type, ':synonym' => $type))->fetchObject();
    if (!$cvterm) {
      tripal_report_error("T_fasta_loader", TRIPAL_ERROR,
        "Cannot find the term type: '%type'", array('%type' => $type));
      return 0;
    }

    // Second, if there is a parent type then get that.
    $parentcvterm = NULL;
    if ($parent_type) {
      $parentcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $parent_type, ':synonym' => $parent_type))->fetchObject();
      if (!$parentcvterm) {
        // Report the requested type name, not the (empty) lookup result.
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the parent term type: '%type'", array(
          '%type' => $parent_type
        ));
        return 0;
      }
    }

    // Third, if there is a relationship type then get that.
    $relcvterm = NULL;
    if ($rel_type) {
      $relcvterm = chado_query($cvtermsql, array(':cvname' => 'sequence', ':name' => $rel_type, ':synonym' => $rel_type))->fetchObject();
      if (!$relcvterm) {
        // Report the requested type name, not the (empty) lookup result.
        tripal_report_error("T_fasta_loader", TRIPAL_ERROR, "Cannot find the relationship term type: '%type'", array(
          '%type' => $rel_type
        ));
        return 0;
      }
    }

    // We need to get the table schema to make sure we don't overrun the
    // size of fields with what our regular expressions retrieve
    $feature_tbl = chado_get_schema('feature');
    $dbxref_tbl = chado_get_schema('dbxref');

    print "Step 1: finding sequences\n";
    $filesize = filesize($dfile);
    $fh = fopen($dfile, 'r');
    if (!$fh) {
      tripal_report_error('T_fasta_loader', TRIPAL_ERROR, "cannot open file: %dfile", array(
        '%dfile' => $dfile
      ));
      return 0;
    }

    // Calculate the interval at which we will print to the screen that status.
    $interval = intval($filesize * 0.01);
    if ($interval < 1) {
      $interval = 1;
    }
    $num_read = 0;

    // Iterate through the lines of the file. Keep a record for
    // where in the file each line is at for later import.
    $seqs = array();
    $num_seqs = 0;
    $prev_pos = 0;
    $intv_read = 0;
    $line_num = 0;

    // Compare against FALSE explicitly: a final line containing only "0"
    // (no newline) is falsy and would otherwise end the loop early.
    while (($line = fgets($fh)) !== FALSE) {
      $line_num++;
      $num_read += strlen($line);
      $intv_read += strlen($line);

      // If we encounter a definition line then get the name, uniquename,
      // accession and relationship subject from the definition line.
      if (preg_match('/^>/', $line)) {

        // Remove the > symbol from the defline.
        $defline = preg_replace("/^>/", '', $line);

        // Get the feature name if a regular expression is provided.
        $name = "";
        if ($re_name) {
          $matches = array();
          if (!preg_match("/$re_name/", $defline, $matches) or !isset($matches[1])) {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature name finds nothing. Line %line.", array(
              '%line' => $line_num
            ));
          }
          elseif (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
            tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array(
              '%line' => $line_num
            ));
          }
          else {
            $name = trim($matches[1]);
          }
        }
        // If the match_type is name and no regular expression was provided
        // then use the first word as the name, otherwise we don't set the name.
        elseif (strcmp($match_type, 'Name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array(
                '%line' => $line_num));
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature name. Line %line.", array(
              '%line' => $line_num));
          }
        }

        // Get the feature uniquename if a regular expression is provided.
        $uname = "";
        if ($re_uname) {
          $matches = array();
          if (!preg_match("/$re_uname/", $defline, $matches) or !isset($matches[1])) {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array(
              '%line' => $line_num));
          }
          else {
            // Only read the capture group when the expression matched.
            $uname = trim($matches[1]);
          }
        }
        // If the match_type is unique name and no regular expression was
        // provided then use the first word as the unique name, otherwise, we
        // don't set the uniquename.
        elseif (strcmp($match_type, 'Unique name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $defline, $matches)) {
            $uname = trim($matches[1]);
          }
          else {
            tripal_report_error('trp-fasta', TRIPAL_ERROR, "ERROR: Cannot find a feature unique name. Line %line.", array(
              '%line' => $line_num));
          }
        }

        // Get the accession if a regular expression is provided. A defline
        // that does not match simply yields no accession.
        $accession = "";
        if (!empty($re_accession)) {
          $matches = array();
          if (preg_match("/$re_accession/", $defline, $matches) and isset($matches[1])) {
            if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
              tripal_report_error('trp-fasta', TRIPAL_WARNING, "WARNING: Regular expression retrieves an accession too long for the feature name. " .
                "Cannot add cross reference. Line %line.", array('%line' => $line_num
              ));
            }
            else {
              $accession = trim($matches[1]);
            }
          }
        }

        // Get the relationship subject. This expression is applied to the
        // raw line (leading '>' included), unlike the expressions above.
        $subject = "";
        if (!empty($re_subject)) {
          $matches = array();
          if (preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
            $subject = trim($matches[1]);
          }
        }

        // Add the details to the sequence. The residues start right after
        // the definition line, which is the current file position.
        $seqs[$num_seqs] = array(
          'name' => $name,
          'uname' => $uname,
          'accession' => $accession,
          'subject' => $subject,
          'seq_start' => ftell($fh)
        );

        // If this isn't the first sequence, then we want to specify where
        // the previous sequence ended.
        if ($num_seqs > 0) {
          $seqs[$num_seqs - 1]['seq_end'] = $prev_pos;
        }
        $num_seqs++;
      }

      // Keep the current file position so we can use it to set the sequence
      // ending position
      $prev_pos = ftell($fh);

      // update the job status every % bytes
      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        // Guard against an unknown or zero file size.
        $fraction = $filesize > 0 ? $num_read / $filesize : 1;
        $percent = sprintf("%.2f", $fraction * 100);
        print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
          " bytes.\r";
        tripal_set_job_progress($job, intval($fraction * 100));
      }
    }

    // An empty file has nothing left to read, so report it as fully parsed
    // instead of dividing by zero.
    $fraction = $filesize > 0 ? $num_read / $filesize : 1;
    $percent = sprintf("%.2f", $fraction * 100);
    print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) .
      " bytes.\r";
    tripal_set_job_progress($job, 50);

    // Set the end position for the last sequence: it runs to the end of what
    // was read. Skipped when the file contained no definition lines.
    if ($num_seqs > 0) {
      $seqs[$num_seqs - 1]['seq_end'] = $num_read;
    }

    // Now that we know where the sequences are in the file we need to add them.
    print "\nStep 2: Importing sequences\n";
    for ($j = 0; $j < $num_seqs; $j++) {
      $seq = $seqs[$j];
      print "Importing " . ($j + 1) . " of $num_seqs. ";
      // Label with this sequence's own name, falling back to its unique name.
      if ($seq['name']) {
        print "Current feature: " . $seq['name'] . ".\n";
      }
      else {
        print "Current feature: " . $seq['uname'] . ".\n";
      }
      $source = NULL;
      tripal_feature_load_fasta_feature($fh, $seq['name'], $seq['uname'], $db_id, $seq['accession'], $seq['subject'], $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm, $source, $method, $re_name, $match_type, $parentcvterm, $relcvterm, $seq['seq_start'], $seq['seq_end']);
    }
    tripal_set_job_progress($job, 100);
    fclose($fh);
  }
  catch (Exception $e) {
    // The exception may have been thrown before the file was opened.
    if (is_resource($fh)) {
      fclose($fh);
    }
    $transaction->rollback();
    print "\n"; // make sure we start errors on new line
    watchdog_exception('T_fasta_loader', $e);
    print "FAILED: Rolling back database changes...\n";
  }
  print "\nDone\n";
}