function tripal_feature_load_fasta
2.x tripal_feature.fasta_loader.inc | tripal_feature_load_fasta($dfile, $organism_id, $type, $re_name, $re_uname, $re_accession,
$db_id, $rel_type, $re_subject, $parent_type, $method, $uid, $analysis_id, $match_type,
$job = NULL) |
1.x fasta_loader.inc | tripal_feature_load_fasta($dfile, $organism_id, $type,
$library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
$re_subject, $parent_type, $method, $uid, $analysis_id,
$match_type, $job = NULL) |
Related topics
2 string references to 'tripal_feature_load_fasta'
- tripal_feature_fasta_load_form_submit in tripal_feature/includes/fasta_loader.inc
- tripal_feature_job_describe_args in tripal_feature/tripal_feature.module
File
- tripal_feature/includes/fasta_loader.inc, line 424
- @todo Add file header description
Code
/**
 * Parses a FASTA file and hands each sequence record to
 * tripal_feature_fasta_loader_handle_feature() for loading.
 *
 * The whole load is wrapped in a transaction that is started at the top of
 * the function and committed after the last record has been handled.
 *
 * @param $dfile
 *   Path of the FASTA file to read.
 * @param $organism_id
 *   Passed through unchanged to the feature handler.
 * @param $type
 *   Name (or synonym) of a term in the 'sequence' vocabulary; the matching
 *   cvterm record is passed to the feature handler.
 * @param $library_id
 *   Not used by this function.
 * @param $re_name
 *   Optional regular expression body (no delimiters). Its first capture group
 *   is used as the feature name.
 * @param $re_uname
 *   Optional regular expression body (no delimiters). Its first capture group
 *   is used as the feature unique name.
 * @param $re_accession
 *   Optional regular expression body (no delimiters). Its first capture group
 *   is used as the accession.
 * @param $db_id
 *   Passed through unchanged to the feature handler.
 * @param $rel_type
 *   Optional name (or synonym) of a term in the 'relationship' vocabulary.
 * @param $re_subject
 *   Optional regular expression body (no delimiters). Its first capture group
 *   is used as the relationship subject.
 * @param $parent_type
 *   Optional name (or synonym) of a term in the 'sequence' vocabulary.
 * @param $method
 *   Passed through unchanged to the feature handler.
 * @param $uid
 *   Not used by this function.
 * @param $analysis_id
 *   Passed through unchanged to the feature handler.
 * @param $match_type
 *   When no name / unique name expression is supplied, 'Name' or
 *   'Unique name' selects which of the two is taken from the first word of
 *   the definition line.
 * @param $job
 *   Optional job identifier. When set, progress is printed and reported with
 *   tripal_job_set_progress().
 *
 * @return
 *   0 when a vocabulary term cannot be found or the file cannot be opened;
 *   nothing otherwise.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type,
  $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
  $re_subject, $parent_type, $method, $uid, $analysis_id,
  $match_type, $job = NULL) {

  // begin the transaction
  // NOTE(review): the early "return 0" paths below leave this transaction
  // open; confirm whether an explicit rollback is required there.
  $connection = tripal_db_start_transaction();

  // if we cannot get a connection then let the user know the loading will be slow
  if (!$connection) {
    print "A persistant connection was not obtained. Loading will be slow\n";
  }
  else {
    print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
          "If the load fails or is terminated prematurely then the entire set of \n" .
          "insertions/updates is rolled back and will not be found in the database\n\n";
  }

  // first get the type for this sequence
  $cvtermsql = "SELECT CVT.cvterm_id
FROM {cvterm} CVT
INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
  if (!$cvterm) {
    watchdog("T_fasta_loader", "Cannot find the term type: '%type'", array('%type' => $type), WATCHDOG_ERROR);
    return 0;
  }

  // the optional terms default to NULL so the handler always receives a
  // defined value
  $parentcvterm = NULL;
  if ($parent_type) {
    $parentcvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
    if (!$parentcvterm) {
      // report the requested term name, not the (empty) lookup result
      watchdog("T_fasta_loader", "Cannot find the paretne term type: '%type'", array('%type' => $parent_type), WATCHDOG_ERROR);
      return 0;
    }
  }
  $relcvterm = NULL;
  if ($rel_type) {
    $relcvterm = db_fetch_object(chado_query($cvtermsql, 'relationship', $rel_type, $rel_type));
    if (!$relcvterm) {
      // report the requested term name, not the (empty) lookup result
      watchdog("T_fasta_loader", "Cannot find the relationship term type: '%type'", array('%type' => $rel_type), WATCHDOG_ERROR);
      return 0;
    }
  }

  print "Opening FASTA file $dfile\n";

  $fh = fopen($dfile, 'r');
  if (!$fh) {
    watchdog('T_fasta_loader', "cannot open file: %dfile", array('%dfile' => $dfile), WATCHDOG_ERROR);
    return 0;
  }
  $filesize = filesize($dfile);

  $i = 0;            // current line number
  $name = '';
  $uname = '';
  $accession = '';
  $subject = '';
  $source = NULL;    // never populated here; forwarded to the handler as-is
  $residues = '';
  $num_read = 0;     // characters read so far
  $intv_read = 0;    // characters read since the last progress update

  // progress is reported roughly every 1% of the file
  $interval = intval($filesize * 0.01);
  if ($interval < 1) {
    $interval = 1;
  }

  // we need to get the table schema to make sure we don't overrun the
  // size of fields with what our regular expressions retrieve
  $feature_tbl = tripal_core_get_chado_table_schema('feature');
  $dbxref_tbl = tripal_core_get_chado_table_schema('dbxref');

  // compare against FALSE so that a line whose content is falsy (e.g. a
  // final "0" without a newline) does not end the loop early
  while (($line = fgets($fh)) !== FALSE) {
    $i++; // update the line count
    $line_length = drupal_strlen($line);
    $num_read += $line_length;
    $intv_read += $line_length;

    // if we encounter a definition line then get the name, uniquename,
    // accession and relationship subject from the definition line
    if (preg_match('/^>/', $line)) {

      // if we have a feature name then we are starting a new sequence
      // so lets handle the previous one before moving on
      if ($name or $uname) {
        tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
          $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
          $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
        $residues = '';
        $name = '';
        $uname = '';
      }

      // values taken from a previous definition line must never leak into
      // the record that starts here
      $accession = '';
      $subject = '';

      $line = preg_replace("/^>/", '', $line); // remove the > symbol from the defline

      // get the feature name
      if ($re_name) {
        if (!preg_match("/$re_name/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        elseif (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        else {
          $name = trim($matches[1]);
        }
      }
      else {
        // if the match_type is name and no regular expression was provided
        // then use the first word as the name, otherwise we don't set the name
        if (strcmp($match_type, 'Name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
            if (strlen($matches[1]) > $feature_tbl['fields']['name']['length']) {
              watchdog('trp-fasta', "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
            }
            else {
              $name = trim($matches[1]);
            }
          }
          else {
            watchdog('trp-fasta', "ERROR: Cannot find a feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
          }
        }
      }

      // get the feature unique name
      if ($re_uname) {
        if (!preg_match("/$re_uname/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        else {
          // only use the capture when the expression actually matched
          $uname = trim($matches[1]);
        }
      }
      else {
        // if the match_type is name and no regular expression was provided
        // then use the first word as the name, otherwise, we don't set the unqiuename
        if (strcmp($match_type, 'Unique name') == 0) {
          if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
            $uname = trim($matches[1]);
          }
          else {
            watchdog('trp-fasta', "ERROR: Cannot find a feature unique name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
          }
        }
      }

      // get the accession (only when an expression was supplied and it
      // captured something)
      if ($re_accession and preg_match("/$re_accession/", $line, $matches) and isset($matches[1])) {
        if (strlen($matches[1]) > $dbxref_tbl['fields']['accession']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves an accession too long for the feature name. Cannot add cross reference. Line %line.", array('%line' => $i), WATCHDOG_WARNING);
        }
        else {
          $accession = trim($matches[1]);
        }
      }

      // get the relationship subject (same guards as the accession)
      if ($re_subject and preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
        $subject = trim($matches[1]);
      }
    }
    else {
      // a sequence line: append it to the residues of the current record
      $residues .= trim($line);

      // update the job status every % features
      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
        if ($name) {
          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.  Current feature: $name\r";
        }
        else {
          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.  Current feature: $uname\r";
        }
        tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
      }
    }
  }
  fclose($fh);

  // now load the last sequence in the file
  tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
    $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
    $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);

  // commit the transaction
  tripal_db_commit_transaction();
  print "\nDone\n";
}