tripal_phylogeny.import_tree.inc

  1. 2.x tripal_phylogeny/includes/tripal_phylogeny.import_tree.inc
  2. 3.x legacy/tripal_phylogeny/includes/tripal_phylogeny.import_tree.inc

File

tripal_phylogeny/includes/tripal_phylogeny.import_tree.inc
View source
  1. <?php
  /**
   * Imports a tree file into an existing phylotree record.
   *
   * This function is a wrapper for loading a phylogenetic tree using a file
   * parser selected by $format. The nodes are inserted inside a database
   * transaction which is rolled back if an exception is thrown while loading.
   *
   * @param $file_name
   *   The name of the file containing the phylogenetic tree to import.
   * @param $format
   *   The format of the file. Currently only the 'newick' file format is
   *   supported. Any other value is reported as an error.
   * @param $options
   *   An associative array with the following keys:
   *   - 'phylotree_id': Required. The imported nodes will be associated with
   *     this tree, which must already exist.
   *   - 'leaf_type': A sequence ontology term or the word 'organism'. If the
   *     type is 'organism' then this tree represents a taxonomic tree. The
   *     default, if not specified, is the term 'polypeptide'.
   *   - 'name_re': If the leaf type is NOT 'organism', then the value of this
   *     field can be a regular expression to pull out the name of the feature
   *     from the node label in the input tree. If no value is provided the
   *     entire label is used.
   *   - 'match': Set to 'uniquename' if the leaf nodes should be matched with
   *     the feature uniquename. The default, if not specified, is 'name'.
   * @param $job_id
   *   Optional. Accepted so callers may pass a job identifier; the value is
   *   not used by this function.
   *
   * @return
   *   TRUE if the tree was imported. FALSE if a required option is missing,
   *   the format is unsupported, the phylotree cannot be found, or the import
   *   failed and was rolled back.
   */
  function tripal_phylogeny_import_tree_file($file_name, $format, $options = array(), $job_id = NULL) {
    // Fill in defaults for any option the caller did not provide. The array
    // union keeps every key that is already present in $options.
    $options += array(
      'leaf_type' => 'polypeptide',
      'match' => 'name',
      'name_re' => '^(.*)$',
    );
    $options['name_re'] = trim($options['name_re']);

    // This function never creates a phylotree record, so the ID is always
    // required, even when a 'name' option was supplied.
    if (!array_key_exists('phylotree_id', $options)) {
      tripal_report_error('tripal_phylogeny', TRIPAL_ERROR,
        'The phylotree_id is required for importing the tree.');
      return FALSE;
    }

    // Reject unsupported formats before touching the database. Without this
    // check there would be no parsed tree to import further below.
    if ($format !== 'newick') {
      tripal_report_error('tripal_phylogeny', TRIPAL_ERROR,
        'Unsupported tree file format: %format. Currently only the newick format is supported.',
        array('%format' => $format));
      return FALSE;
    }

    // Get the phylotree record.
    $values = array('phylotree_id' => $options['phylotree_id']);
    $phylotree = chado_generate_var('phylotree', $values);
    if (!$phylotree) {
      tripal_report_error('tripal_phylogeny', TRIPAL_ERROR,
        'Could not find the phylotree using the ID provided: %phylotree_id.',
        array('%phylotree_id' => $options['phylotree_id']));
      return FALSE;
    }

    $transaction = db_transaction();
    print "\nNOTE: Loading of this tree file is performed using a database transaction. \n" .
          "If the load fails or is terminated prematurely then the entire set of \n" .
          "insertions/updates is rolled back and will not be found in the database\n\n";
    try {
      // Parse the tree into the expected nested node format.
      module_load_include('inc', 'tripal_phylogeny', 'includes/parsers/tripal_phylogeny.newick_parser');
      $tree = tripal_phylogeny_parse_newick_file($file_name);

      // Assign the right and left indices to the tree nodes.
      tripal_phylogeny_assign_tree_indices($tree);

      // Iterate through the tree nodes and add them to Chado in accordance
      // with the details in the $options array.
      tripal_phylogeny_import_tree($tree, $phylotree, $options);
    }
    catch (Exception $e) {
      $transaction->rollback();
      watchdog_exception('tripal_phylogeny', $e);
      print "\nFAILED: Rolling back database changes...\n";
      // Do not fall through to the success message below.
      return FALSE;
    }

    print "\nDone Importing Tree.\n";
    return TRUE;
  }
  /**
   * Retrieves the controlled vocabulary terms used to type tree nodes.
   *
   * Looks up the 'phylo_leaf', 'phylo_interior' and 'phylo_root' terms of the
   * 'tripal_phylogeny' vocabulary, in that order, and stops at the first term
   * that cannot be found.
   *
   * @return
   *   An associative array with the keys 'leaf', 'internal' and 'root', each
   *   holding the value returned by chado_generate_var() for the matching
   *   term. FALSE if any of the terms cannot be found; the missing term is
   *   reported with tripal_report_error().
   */
  function tripal_phylogeny_get_node_types_vocab() {
    // Maps the key used in the returned array to the cvterm name.
    $term_names = array(
      'leaf' => 'phylo_leaf',
      'internal' => 'phylo_interior',
      'root' => 'phylo_root',
    );

    $vocab = array();
    foreach ($term_names as $node_type => $term_name) {
      $values = array(
        'name' => $term_name,
        'cv_id' => array(
          'name' => 'tripal_phylogeny',
        ),
      );
      $term = chado_generate_var('cvterm', $values);
      if (!$term) {
        // Name the node type that is actually missing rather than always
        // calling it the "leaf" term.
        tripal_report_error('tripal_phylogeny', TRIPAL_ERROR,
          "Could not find the $node_type vocabulary term: '$term_name'. It should " .
          "already be present as part of the tripal_phylogeny vocabulary.");
        return FALSE;
      }
      $vocab[$node_type] = $term;
    }

    return $vocab;
  }
  /**
   * Walks the tree and sets the left and right indices of its nodes.
   *
   * The running index is advanced by 100 before every assignment so that
   * there is room for nodes that might be added later. A node's left index is
   * assigned before its children are visited and its right index afterwards,
   * so the indices of a child fall between those of its parent.
   *
   * @param $tree
   *   The tree array. It is modified in place: 'left_index' and 'right_index'
   *   entries are added to the nodes.
   * @param $index
   *   This parameter is not needed when the function is first called. It
   *   carries the running index through the recursive calls.
   */
  function tripal_phylogeny_assign_tree_indices(&$tree, &$index = 1) {
    // Only nodes that carry a 'name' key receive a left index here.
    if (array_key_exists('name', $tree)) {
      $index += 100;
      $tree['left_index'] = $index;

      // A node with an 'is_leaf' key is closed immediately.
      if (array_key_exists('is_leaf', $tree)) {
        $index += 100;
        $tree['right_index'] = $index;
      }
    }

    // Nothing more to do for a node without children.
    if (!array_key_exists('branch_set', $tree)) {
      return;
    }

    foreach ($tree['branch_set'] as $child_key => $child) {
      // Recurse on the stored element so the child is updated in place.
      tripal_phylogeny_assign_tree_indices($tree['branch_set'][$child_key], $index);

      // The right index is rewritten after every child, so the value left
      // behind is the one assigned after the last child.
      $index += 100;
      $tree['right_index'] = $index;
    }
  }
  /**
   * Iterates through the tree array and creates phylonodes in Chado.
   *
   * The function iterates through the tree in a top-down approach adding
   * parent internal nodes prior to leaf nodes. Each node of the tree should
   * have the following fields:
   *
   *   - name: The name (or label) for this node.
   *   - length: Optional. The branch length, stored as the node distance.
   *   - is_root: Set to 1 if this node is a root node.
   *   - is_leaf: Set to 1 if this node is a leaf node.
   *   - is_internal: Set to 1 if this node is an internal node.
   *   - left_index: The left index of the node, stored as left_idx.
   *   - right_index: The right index of the node, stored as right_idx.
   *   - branch_set: An array containing the nodes that are children of the
   *     node.
   *   - organism_id: Optional. The organism_id for associating the node with
   *     an organism.
   *   - properties: Optional. An array of key/value pairs where the key is the
   *     cvterm_id and the value is the property value. These properties will
   *     be associated with the phylonode.
   *
   * Prior to importing the tree the indices can be set by using the
   * tripal_phylogeny_assign_tree_indices() function.
   *
   * @param $tree
   *   The tree array. It is modified in place: the 'phylonode_id' of every
   *   inserted node is added to the node.
   * @param $phylotree
   *   The phylotree record the nodes belong to. Only its phylotree_id
   *   property is read.
   * @param $options
   *   The options provide some direction for how the tree is imported. The
   *   following keys can be used:
   *   - taxonomy: Set to 1 if this tree is a taxonomic tree. Set to 0, or
   *     leave out, otherwise.
   *   - leaf_type: Set to the leaf type name. If this is a non-taxonomic tree
   *     that is associated with features, then this should be the Sequence
   *     Ontology term for the feature (e.g. polypeptide). If this is a
   *     taxonomic tree then this option is not needed.
   *   - match: Set to either 'name' or 'uniquename'. This is used for
   *     matching the feature name or uniquename with the node name. This is
   *     not needed for taxonomic trees.
   *   - name_re: Set to a regular expression whose first capture group
   *     extracts the feature name from the node name, if the node name is not
   *     identical to the feature name.
   * @param $vocab
   *   Optional. An array containing a set of key/value pairs that maps node
   *   types to CV terms. The keys must be 'root', 'internal' or 'leaf'. If
   *   no vocab is provided then the terms provided by the tripal_phylogeny
   *   CV will be used.
   * @param $parent
   *   This argument is not needed when the function is first called. This
   *   function is recursive and this argument is used on recursive calls.
   *
   * @throws Exception
   *   If the node type vocabulary cannot be loaded or a phylonode cannot be
   *   inserted. Continuing in either case would store nodes without a type or
   *   children without a parent.
   */
  function tripal_phylogeny_import_tree(&$tree, $phylotree, $options, $vocab = array(), $parent = NULL) {
    // Get the vocabulary terms used to describe nodes in the tree if one
    // wasn't provided.
    if (empty($vocab)) {
      $vocab = tripal_phylogeny_get_node_types_vocab();
      if (!$vocab) {
        throw new Exception('Could not load the vocabulary terms used to describe the nodes of the tree.');
      }
    }

    if (!is_array($tree)) {
      return;
    }

    if (array_key_exists('name', $tree)) {
      $values = array(
        'phylotree_id' => $phylotree->phylotree_id,
        'left_idx' => $tree['left_index'],
        'right_idx' => $tree['right_index'],
      );

      // Add in any optional values if they are present. Strict comparisons
      // are used so that a label of "0" or a branch length of 0 is kept.
      $has_label = isset($tree['name']) && $tree['name'] !== '';
      if ($has_label) {
        $values['label'] = $tree['name'];
      }
      if (isset($tree['length']) && $tree['length'] !== '') {
        $values['distance'] = $tree['length'];
      }

      // The top-level call has no parent node.
      $parent_id = (is_array($parent) && isset($parent['phylonode_id'])) ? $parent['phylonode_id'] : NULL;

      // Set the type of node.
      if (!empty($tree['is_root'])) {
        $values['type_id'] = $vocab['root']->cvterm_id;
      }
      elseif (!empty($tree['is_internal'])) {
        $values['type_id'] = $vocab['internal']->cvterm_id;
        $values['parent_phylonode_id'] = $parent_id;
        // TODO: a feature may be associated here but it is recommended that it
        // be a feature of type SO:match and should represent the alignment of
        // all features beneath it.
      }
      elseif (!empty($tree['is_leaf'])) {
        $values['type_id'] = $vocab['leaf']->cvterm_id;
        $values['parent_phylonode_id'] = $parent_id;

        // For a sequence-based tree try to match the leaf node with a
        // feature. That is not possible if the node has no name.
        if ($has_label && empty($options['taxonomy'])) {
          $feature_id = _tripal_phylogeny_find_leaf_feature_id($tree['name'], $options);
          if ($feature_id !== NULL) {
            $values['feature_id'] = $feature_id;
          }
        }
      }

      // Insert the new node and then add its assigned phylonode_id to the
      // node so that the children can refer to it.
      $phylonode = chado_insert_record('phylonode', $values);
      if (!is_array($phylonode) || !isset($phylonode['phylonode_id'])) {
        throw new Exception('Could not insert the phylonode with left index ' .
          $values['left_idx'] . ' and right index ' . $values['right_idx'] . '.');
      }
      $tree['phylonode_id'] = $phylonode['phylonode_id'];

      // Associate this node with an organism if one is provided.
      if (array_key_exists('organism_id', $tree)) {
        chado_insert_record('phylonode_organism', array(
          'phylonode_id' => $tree['phylonode_id'],
          'organism_id' => $tree['organism_id'],
        ));
      }

      // Associate any properties.
      if (array_key_exists('properties', $tree)) {
        foreach ($tree['properties'] as $type_id => $value) {
          chado_insert_record('phylonodeprop', array(
            'phylonode_id' => $tree['phylonode_id'],
            'type_id' => $type_id,
            'value' => $value,
          ));
        }
      }
    }

    if (array_key_exists('branch_set', $tree)) {
      foreach ($tree['branch_set'] as $key => $node) {
        // Recurse on the stored element so the child receives its
        // phylonode_id in place.
        tripal_phylogeny_import_tree($tree['branch_set'][$key], $phylotree, $options, $vocab, $tree);
      }
    }
  }

  /**
   * Finds the single feature that matches the label of a leaf node.
   *
   * @param $label
   *   The non-empty label of the leaf node.
   * @param $options
   *   The import options. The 'match', 'name_re' and 'leaf_type' keys are
   *   read.
   *
   * @return
   *   The feature_id if exactly one feature matches. NULL if no feature or
   *   more than one feature matches, because no association can be made.
   */
  function _tripal_phylogeny_find_leaf_feature_id($label, $options) {
    // Match on the feature name only when explicitly asked to, otherwise on
    // the uniquename.
    $column = (isset($options['match']) && $options['match'] == 'name') ? 'name' : 'uniquename';

    // Use the first capture group of the expression as the feature name. The
    // whole label is used if there is no expression, it does not match, or
    // it has no capture group.
    $lookup = $label;
    $re = isset($options['name_re']) ? $options['name_re'] : '';
    if ($re !== '') {
      $matches = array();
      if (preg_match("/$re/", $label, $matches) && isset($matches[1])) {
        $lookup = $matches[1];
      }
    }

    $sel_values = array(
      $column => $lookup,
      'type_id' => array(
        'name' => $options['leaf_type'],
        'cv_id' => array(
          'name' => 'sequence',
        ),
      ),
    );
    $features = chado_select_record('feature', array('feature_id'), $sel_values);
    if (is_array($features) && count($features) == 1) {
      return $features[0]->feature_id;
    }
    return NULL;
  }