tripal_chado.pub_importer_PMID.inc

This file provides support for importing and parsing of results from the NCBI PubMed database. The functions here are used by both the publication importer setup form and the publication importer.

File

tripal_chado/includes/loaders/tripal_chado.pub_importer_PMID.inc
View source
  1. <?php
  2. /**
  3. * @file
  4. * This file provides support for importing and parsing of results from the
  5. * NCBI PubMed database. The functions here are used by
  6. * both the publication importer setup form and the publication importer.
  7. *
  8. */
  /**
   * Alters the publication importer form for the PubMed (PMID) importer.
   *
   * PubMed has no stand-alone 'Abstract' search field, so the label of the
   * 'abstract' scope option of every criterion is changed to 'Abstract/Title'.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array (not used by this function).
   * @param $num_criteria
   *   The number of criteria the user currently has added to the form.
   *
   * @return
   *   The altered form array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_alter_form_PMID($form, $form_state, $num_criteria = 1) {
    // Criteria are numbered from 1, and each one owns a "scope-N" select box.
    $criterion = 1;
    while ($criterion <= $num_criteria) {
      $scope_key = 'scope-' . $criterion;
      $form['themed_element']['criteria'][$criterion][$scope_key]['#options']['abstract'] = 'Abstract/Title';
      $criterion++;
    }

    return $form;
  }
  /**
   * Performs PubMed-specific validation of the importer setup form.
   *
   * For every criterion whose scope is 'id' the search terms must be a PubMed
   * accession of the form 'PMID:<digits>'. Anything else is flagged with
   * form_set_error() on the matching search terms field.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array. Reads 'num_criteria', 'search_terms-N' and
   *   'scope-N' from its 'values' element.
   *
   * @return
   *   The form array, unchanged.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_validate_form_PMID($form, $form_state) {
    $values = $form_state['values'];
    $total = $values['num_criteria'];

    for ($n = 1; $n <= $total; $n++) {
      $field = "search_terms-$n";
      $terms = trim($values[$field]);

      // Only accession searches have a format that can be checked.
      if ($values["scope-$n"] != 'id') {
        continue;
      }
      if (preg_match('/^PMID:\d+$/', $terms)) {
        continue;
      }
      form_set_error($field, "The PubMed accession must be a numeric value, prefixed with 'PMID:' (e.g. PMID:23024789).");
    }

    return $form;
  }
  /**
   * Performs a search on the PubMed database.
   *
   * The criteria are converted into a PubMed query string, the query is
   * registered with tripal_pub_PMID_search_init(), the requested page of PMIDs
   * is fetched and then every PMID is fetched and parsed individually.
   *
   * @param $search_array
   *   An array containing the search criteria. Reads 'num_criteria', the
   *   optional 'days', and for each criterion N (numbered from 1)
   *   'criteria'[N] with the keys 'search_terms', 'scope', 'is_phrase' and
   *   'operation'.
   * @param $num_to_retrieve
   *   The maximum number of publications to retrieve from the remote database.
   * @param $page
   *   The page to retrieve. This corresponds to a paged table, where each page
   *   has $num_to_retrieve publications.
   *
   * @return
   *   An array with the keys 'total_records' (number of matches reported by
   *   PubMed), 'search_str' (the query string that was sent) and 'pubs' (an
   *   array of parsed publications as returned by
   *   tripal_pub_PMID_parse_pubxml()).
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_PMID($search_array, $num_to_retrieve, $page) {
    $num_criteria = $search_array['num_criteria'];
    $days = isset($search_array['days']) ? $search_array['days'] : NULL;

    // PubMed field tags for the scopes offered by the importer form.
    $scope_tags = array(
      'title' => '[Title]',
      'author' => '[Author]',
      'abstract' => '[Title/Abstract]',
      'journal' => '[Journal]',
      'id' => '[Uid]',
    );

    // Whole-word boolean operators. Matching the bare substrings 'and'/'or'
    // would misfire on terms such as "candida" or "chloroplast".
    $and_pattern = '/\s+and\s+/i';
    $or_pattern = '/\s+or\s+/i';

    $search_str = '';
    for ($i = 1; $i <= $num_criteria; $i++) {
      $criterion = $search_array['criteria'][$i];
      $search_terms = trim($criterion['search_terms']);
      $scope = isset($criterion['scope']) ? $criterion['scope'] : '';
      $is_phrase = !empty($criterion['is_phrase']);
      $op = isset($criterion['operation']) ? $criterion['operation'] : '';

      if ($op) {
        $search_str .= "$op ";
      }

      // If this is a phrase make sure the search terms are surrounded by quotes.
      if ($is_phrase) {
        $search_str .= "(\"$search_terms\" |SCOPE|)";
      }
      // Otherwise each operand of an 'AND' or 'OR' becomes its own scoped
      // criteria so that the field tag applies to every operand.
      else {
        if (preg_match($and_pattern, $search_terms)) {
          $elements = preg_split($and_pattern, $search_terms);
          $joiner = ' AND ';
        }
        elseif (preg_match($or_pattern, $search_terms)) {
          $elements = preg_split($or_pattern, $search_terms);
          $joiner = ' OR ';
        }
        else {
          $elements = array($search_terms);
          $joiner = '';
        }
        $parts = array();
        foreach ($elements as $element) {
          $parts[] = "($element |SCOPE|)";
        }
        $search_str .= '(' . implode($joiner, $parts) . ')';
      }

      // Accessions are entered as 'PMID:123' but PubMed wants the bare number.
      if ($scope == 'id') {
        $search_str = preg_replace('/PMID:([^\s]*)/', '$1', $search_str);
      }
      // Replace the placeholder with the field tag (or nothing for 'any').
      $tag = (is_string($scope) && isset($scope_tags[$scope])) ? $scope_tags[$scope] : '';
      $search_str = str_replace('|SCOPE|', $tag, $search_str);
    }

    if ($days) {
      // Limit the results to records created within the last $days days.
      $past_date = date('Y/m/d', time() - ($days * 86400));
      $search_str .= " AND (\"" . $past_date . "\"[Date - Create] : \"3000\"[Date - Create])";
    }

    // Now initialize the query. On a connection failure the helper does not
    // return an array, so the result is checked before it is indexed.
    $results = tripal_pub_PMID_search_init($search_str, $num_to_retrieve);
    $total_records = (is_array($results) && isset($results['Count'])) ? $results['Count'] : 0;
    $have_history = is_array($results) && isset($results['QueryKey']) && isset($results['WebEnv']);

    // If we have no records (or no history to fetch from) return no pubs.
    if ($total_records == 0 || !$have_history) {
      return array(
        'total_records' => $total_records,
        'search_str' => $search_str,
        'pubs' => array(),
      );
    }
    $query_key = $results['QueryKey'];
    $web_env = $results['WebEnv'];

    // Initialize the pager.
    $start = $page * $num_to_retrieve;

    // Now get the list of PMIDs from the initialized search.
    $pmids_txt = tripal_pub_PMID_fetch($query_key, $web_env, 'uilist', 'text', $start, $num_to_retrieve);

    // Iterate through each PMID and get the publication record. This requires
    // a new fetch per record.
    $pubs = array();
    foreach (explode("\n", trim($pmids_txt)) as $pmid) {
      $pmid = trim($pmid);
      // A failed or empty fetch yields blank (or non-numeric) lines; fetching
      // those would return an unrelated record.
      if (!preg_match('/^\d+$/', $pmid)) {
        continue;
      }
      $pub_xml = tripal_pub_PMID_fetch($query_key, $web_env, 'null', 'xml', 0, 1, array('id' => $pmid));
      $pubs[] = tripal_pub_PMID_parse_pubxml($pub_xml);
    }

    return array(
      'total_records' => $total_records,
      'search_str' => $search_str,
      'pubs' => $pubs,
    );
  }
  /**
   * Initializes a PubMed search using a given search string.
   *
   * The search is run through NCBI's esearch utility with 'usehistory=y' so
   * that the result set is kept on the NCBI history server and can later be
   * paged through with tripal_pub_PMID_fetch().
   *
   * @param $search_str
   *   The PubMed search string.
   * @param $retmax
   *   The maximum number of records to return.
   *
   * @return
   *   An array containing the 'Count', 'QueryKey' and 'WebEnv' values found in
   *   the esearch response (keys are absent when the response did not contain
   *   them). Returns 0 if the connection to Entrez could not be opened.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_search_init($search_str, $retmax){
    // Do a search so that we can establish a history and get the number of
    // records. Once we have the number of records we can retrieve those
    // requested in the range. NCBI only serves the E-utilities over HTTPS.
    $query_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?" .
      "db=Pubmed" .
      "&retmax=" . urlencode((string) $retmax) .
      "&usehistory=y" .
      "&term=" . urlencode($search_str);

    $rfh = fopen($query_url, "r");
    if (!$rfh) {
      drupal_set_message('Could not perform Pubmed query. Cannot connect to Entrez.', 'error');
      tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Could not perform Pubmed query. Cannot connect to Entrez.",
        array());
      return 0;
    }

    // Retrieve the XML results.
    $query_xml = stream_get_contents($rfh);
    fclose($rfh);

    $result = array();

    // XMLReader refuses empty input, so an empty response simply yields no
    // values.
    if ($query_xml === FALSE || trim($query_xml) === '') {
      return $result;
    }

    $xml = new XMLReader();
    $xml->xml($query_xml);

    // Iterate through the child nodes of the <eSearchResult> tag and get the
    // count, history and query_id.
    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'WebEnv') {
        // We've read as much as we need. If we go too much further our counts
        // will get messed up by other 'Count' elements, so we're done.
        break;
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        switch ($element) {
          case 'Count':
          case 'WebEnv':
          case 'QueryKey':
            // Only the first occurrence is the overall result; this also
            // protects the count if the early exit above is never reached.
            if (!isset($result[$element])) {
              $result[$element] = $xml->readString();
            }
            break;
        }
      }
    }
    $xml->close();

    return $result;
  }
  /**
   * Retrieves from PubMed a set of publications from the previously
   * initiated query.
   *
   * @param $query_key
   *   The esearch QueryKey.
   * @param $web_env
   *   The esearch WebEnv.
   * @param $rettype
   *   The efetch return type.
   * @param $retmod
   *   The efetch return mode.
   * @param $start
   *   The start of the range to retrieve.
   * @param $limit
   *   The number of publications to retrieve.
   * @param $args
   *   Any additional arguments to add to the efetch query URL, keyed by
   *   argument name. An array value is sent as a comma-separated list.
   *
   * @return
   *   The raw body of the efetch response as a string, or an empty string if
   *   the request could not be performed.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_fetch($query_key, $web_env, $rettype = 'null',
    $retmod = 'null', $start = 0, $limit = 10, $args = array()){

    // Repeat the search performed previously (using WebEnv & QueryKey) to
    // retrieve the records within the range specified. Every value is
    // URL-encoded so that special characters cannot corrupt the query string.
    // NCBI only serves the E-utilities over HTTPS.
    $fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" .
      "rettype=" . urlencode((string) $rettype) .
      "&retmode=" . urlencode((string) $retmod) .
      "&retstart=" . urlencode((string) $start) .
      "&retmax=" . urlencode((string) $limit) .
      "&db=Pubmed" .
      "&query_key=" . urlencode((string) $query_key) .
      "&WebEnv=" . urlencode((string) $web_env);

    foreach ($args as $key => $value) {
      if (is_array($value)) {
        // Lists are sent comma separated; the commas themselves stay literal.
        $items = array();
        foreach ($value as $item) {
          $items[] = urlencode((string) $item);
        }
        $fetch_url .= '&' . urlencode((string) $key) . '=' . implode(',', $items);
      }
      else {
        $fetch_url .= '&' . urlencode((string) $key) . '=' . urlencode((string) $value);
      }
    }

    $rfh = fopen($fetch_url, "r");
    if (!$rfh) {
      drupal_set_message('ERROR: Could not perform PubMed query.', 'error');
      tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Could not perform PubMed query: %fetch_url.",
        array('%fetch_url' => $fetch_url));
      return '';
    }

    $results = stream_get_contents($rfh);
    fclose($rfh);

    return ($results === FALSE) ? '' : $results;
  }
  /**
   * Parses the XML of a single PubMed publication into an associative array.
   *
   * The keys of the returned array are Tripal Pub ontology terms and the
   * values are extracted from the XML. The XML should contain only a single
   * publication record.
   *
   * Information about the valid elements in the PubMed XML can be found here:
   * http://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
   *
   * Information about PubMed's citation format can be found here
   * http://www.nlm.nih.gov/bsd/policy/cit_format.html
   *
   * @param $pub_xml
   *   An XML string describing a single publication.
   *
   * @return
   *   An array describing the publication. It is empty when $pub_xml is
   *   empty; otherwise it also holds the generated 'Citation' and the
   *   original XML under 'raw'.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_pubxml($pub_xml) {
    $pub = array();
    if (!$pub_xml) {
      return $pub;
    }

    // Walk the document node by node; only element start tags matter here.
    $reader = new XMLReader();
    $reader->xml(trim($pub_xml));
    while ($reader->read()) {
      if ($reader->nodeType != XMLReader::ELEMENT) {
        continue;
      }
      $tag = $reader->name;

      if ($tag === 'ERROR') {
        // Step onto the text node to get the value for this element.
        $reader->read();
        tripal_report_error('tripal_pubmed', TRIPAL_ERROR, "Error: %err", array('%err' => $reader->value));
      }
      elseif ($tag === 'PMID') {
        // There are multiple places where a PMID is present in the XML and
        // since this code does not descend into every branch of the XML tree
        // we will encounter many of them here (for example the PMIDs of the
        // articles that cite this one). Only the first one is kept.
        $reader->read();
        if (!array_key_exists('Publication Dbxref', $pub)) {
          $pub['Publication Dbxref'] = 'PMID:' . $reader->value;
        }
      }
      elseif ($tag === 'Article') {
        $pub['Publication Model'] = $reader->getAttribute('PubModel');
        tripal_pub_PMID_parse_article($reader, $pub);
      }
      elseif ($tag === 'MedlineJournalInfo') {
        tripal_pub_PMID_parse_medline_journal_info($reader, $pub);
      }
      // TODO: the following elements are recognized but not yet handled:
      //   ChemicalList, CommentsCorrections, GeneSymbolList, KeywordList,
      //   GeneralNote.
      //   SupplMeshList: meant for protocol list.
      //   CitationSubset: not sure this is needed.
      //   MeshHeadingList: medical subject headings.
      //   NumberOfReferences: not sure we should keep this as it changes
      //     frequently.
      //   PersonalNameSubjectList: for works about an individual or with
      //     biographical note/obituary.
      //   OtherID: IDs from another NLM partner.
      //   OtherAbstract: when the journal does not contain an abstract for
      //     the publication.
      //   InvestigatorList: personal names of individuals who are not authors
      //     (can be used with collection).
      //   DeleteCitation: need to know how to handle this.
    }

    $pub['Citation'] = chado_pub_create_citation($pub);
    $pub['raw'] = $pub_xml;

    return $pub;
  }
  /**
   * Parses the <MedlineJournalInfo> section of the XML returned from PubMed.
   *
   * Only the country of publication is extracted; it is stored in $pub under
   * 'Journal Country'.
   *
   * @param $xml
   *   The XMLReader, positioned on the <MedlineJournalInfo> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_medline_journal_info($xml, &$pub) {
    while ($xml->read()) {
      $node_name = $xml->name;
      $node_type = $xml->nodeType;

      // Reaching </MedlineJournalInfo> means this section is finished.
      if ($node_type == XMLReader::END_ELEMENT && $node_name == 'MedlineJournalInfo') {
        return;
      }

      if ($node_type == XMLReader::ELEMENT && $node_name == 'Country') {
        // The place of publication of the journal.
        $xml->read();
        $pub['Journal Country'] = $xml->value;
      }
      // TODO: elements that are currently ignored:
      //   MedlineTA: not sure how this is different from ISOAbbreviation.
      //   NlmUniqueID: the journal's unique ID in medline.
      //   ISSNLinking: not sure how this is different from ISSN.
    }
  }
  /**
   * Parses the <Article> section of the XML returned from PubMed.
   *
   * Simple values are read with XMLReader::readString(), which returns the
   * complete text of an element. This keeps titles that contain inline markup
   * (such as <i> or <sub>) intact and leaves the cursor on the element, so a
   * self-closing element can never make the parser run past an end tag.
   *
   * @param $xml
   *   The XMLReader, positioned on the <Article> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_article($xml, &$pub) {
    while ($xml->read()) {
      // Get this element name.
      $element = $xml->name;

      // If we're at the </Article> element then we're done with the article.
      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Article') {
        return;
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        switch ($element) {
          case 'Journal':
            tripal_pub_PMID_parse_journal($xml, $pub);
            break;

          case 'ArticleTitle':
            // Remove any trailing period from the title.
            $pub['Title'] = preg_replace('/\.$/', '', trim($xml->readString()));
            break;

          case 'Abstract':
            tripal_pub_PMID_parse_abstract($xml, $pub);
            break;

          case 'Pagination':
            tripal_pub_PMID_parse_pagination($xml, $pub);
            break;

          case 'ELocationID':
            $type = $xml->getAttribute('EIdType');
            $valid = $xml->getAttribute('ValidYN');
            $elocation = $xml->readString();
            if ($type == 'doi' and $valid == 'Y') {
              $pub['DOI'] = $elocation;
            }
            if ($type == 'pii' and $valid == 'Y') {
              $pub['PII'] = $elocation;
            }
            $pub['Elocation'] = $elocation;
            break;

          case 'Affiliation':
            // The affiliation tag at this level is meant solely for the first
            // author.
            $pub['Author List'][0]['Affiliation'] = $xml->readString();
            break;

          case 'AuthorList':
            tripal_pub_PMID_parse_authorlist($xml, $pub);
            break;

          case 'InvestigatorList':
            // TODO: perhaps handle this one day. The investigator list is to
            // list the names of people who are members of a collective or
            // corporate group that is an author in the paper.
            break;

          case 'Language':
            $lang_abbr = $xml->readString();
            // There may be multiple languages so we store these in an array.
            $pub['Language'][] = tripal_pub_remote_search_get_language($lang_abbr);
            $pub['Language Abbr'][] = $lang_abbr;
            break;

          case 'DataBankList':
            // TODO: handle this case.
            break;

          case 'GrantList':
            // TODO: handle this case.
            break;

          case 'PublicationTypeList':
            tripal_pub_PMID_parse_publication_type($xml, $pub);
            break;

          case 'VernacularTitle':
            $pub['Vernacular Title'][] = $xml->readString();
            break;

          case 'ArticleDate':
            // TODO: figure out what to do with this element. We already have
            // the published date in the <PubDate> field, but this date should
            // be in numeric form and may have more information.
            break;

          default:
            break;
        }
      }
    }
  }
  /**
   * Parses the <PublicationTypeList> section of the XML returned from PubMed.
   *
   * A full list of publication types can be found here:
   * http://www.nlm.nih.gov/mesh/pubtypes.html.
   *
   * Each publication type is looked up in the 'tripal_pub' vocabulary, first
   * by term name and then by synonym. The name of the matching term is added
   * to $pub['Publication Type']; types with no matching term are reported as
   * errors and skipped.
   *
   * @param $xml
   *   The XMLReader, positioned on the <PublicationTypeList> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_publication_type($xml, &$pub) {
    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'PublicationTypeList') {
        // We've reached the </PublicationTypeList> element so we're done.
        return;
      }
      if ($xml->nodeType != XMLReader::ELEMENT or $element != 'PublicationType') {
        continue;
      }

      $value = $xml->readString();
      $options = array('case_insensitive_columns' => array('name'));

      // First try to match the publication type against a term name.
      $identifiers = array(
        'name' => $value,
        'cv_id' => array(
          'name' => 'tripal_pub',
        ),
      );
      $pub_cvterm = chado_get_cvterm($identifiers, $options);

      // Otherwise see if we can find the term using a synonym.
      if (!$pub_cvterm) {
        $identifiers = array(
          'synonym' => array(
            'name' => $value,
            'cv_name' => 'tripal_pub',
          ),
        );
        $pub_cvterm = chado_get_cvterm($identifiers, $options);
      }

      // Record the term no matter which of the two lookups found it.
      if ($pub_cvterm) {
        $pub['Publication Type'][] = $pub_cvterm->name;
      }
      else {
        tripal_report_error('tripal_pubmed', TRIPAL_ERROR,
          'Cannot find a valid vocabulary term for the publication type: "%term".',
          array('%term' => $value));
      }
    }
  }
  /**
   * Parses the <Abstract> section of the XML returned from PubMed.
   *
   * The abstract text can be a single paragraph or be broken into multiple
   * labelled parts (a structured abstract). All parts are combined, in the
   * order they arrive, into one HTML string stored in $pub['Abstract'].
   * Labelled parts are additionally collected in
   * $pub['Structured Abstract Part'] and the copyright statement, if present,
   * is stored in $pub['Copyright'].
   *
   * Text is read with XMLReader::readString() so that inline markup inside
   * the abstract (for example <i> or <sup>) does not truncate the text.
   *
   * @param $xml
   *   The XMLReader, positioned on the <Abstract> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_abstract($xml, &$pub) {
    $abstract = '';

    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Abstract') {
        // We've reached the </Abstract> element so store the result.
        $pub['Abstract'] = $abstract;
        return;
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        switch ($element) {
          case 'AbstractText':
            $label = $xml->getAttribute('Label');
            $text = $xml->readString();
            if ($label) {
              $part = "<p><b>$label</b><br />" . $text . '</p>';
              $abstract .= $part;
              $pub['Structured Abstract Part'][] = $part;
            }
            else {
              $abstract .= '<p>' . $text . '</p>';
            }
            break;

          case 'CopyrightInformation':
            $pub['Copyright'] = $xml->readString();
            break;

          default:
            break;
        }
      }
    }
  }
  /**
   * Parses the <Pagination> section of the XML returned from PubMed.
   *
   * A non-blank <MedlinePgn> value is stored in $pub['Pages'].
   *
   * The value is read with XMLReader::readString(), which leaves the cursor
   * on the element. A self-closing <MedlinePgn/> therefore cannot make the
   * parser step over </Pagination> and consume the rest of the document.
   *
   * @param $xml
   *   The XMLReader, positioned on the <Pagination> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_pagination($xml, &$pub) {
    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'Pagination') {
        // We've reached the </Pagination> element so we're done.
        return;
      }

      if ($xml->nodeType == XMLReader::ELEMENT and $element == 'MedlinePgn') {
        $pages = $xml->readString();
        // Online-only articles may have no page range at all.
        if (trim($pages) !== '') {
          $pub['Pages'] = $pages;
        }
      }
    }
  }
  /**
   * Parses the <Journal> section of the XML returned from PubMed.
   *
   * Stores the ISSN (also as 'eISSN' or 'pISSN' depending on its type), the
   * journal name and its ISO abbreviation in $pub, and delegates the
   * <JournalIssue> element to tripal_pub_PMID_parse_journal_issue().
   *
   * @param $xml
   *   The XMLReader, positioned on the <Journal> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_journal($xml, &$pub) {
    while ($xml->read()) {
      $tag = $xml->name;
      $is_start = ($xml->nodeType == XMLReader::ELEMENT);
      $is_end = ($xml->nodeType == XMLReader::END_ELEMENT);

      // </Journal> closes this section.
      if ($is_end && $tag == 'Journal') {
        return;
      }
      if (!$is_start) {
        continue;
      }

      if ($tag == 'ISSN') {
        $issn_type = $xml->getAttribute('IssnType');
        // Step onto the text node that holds the number.
        $xml->read();
        $pub['ISSN'] = $xml->value;
        if ($issn_type == 'Electronic') {
          $pub['eISSN'] = $pub['ISSN'];
        }
        if ($issn_type == 'Print') {
          $pub['pISSN'] = $pub['ISSN'];
        }
      }
      elseif ($tag == 'JournalIssue') {
        // The 'CitedMedium' attribute ('Internet' or 'Print') is not used.
        tripal_pub_PMID_parse_journal_issue($xml, $pub);
      }
      elseif ($tag == 'Title') {
        $xml->read();
        $pub['Journal Name'] = $xml->value;
      }
      elseif ($tag == 'ISOAbbreviation') {
        $xml->read();
        $pub['Journal Abbreviation'] = $xml->value;
      }
    }
  }
  /**
   * Parses the <JournalIssue> section of the XML returned from PubMed.
   *
   * Stores 'Volume', 'Issue', 'Year' and 'Publication Date' in $pub. Simple
   * values are read with XMLReader::readString() so that a self-closing
   * element (e.g. <Issue/>) cannot make the parser step over
   * </JournalIssue>.
   *
   * @param $xml
   *   The XMLReader, positioned on the <JournalIssue> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_journal_issue($xml, &$pub) {
    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == 'JournalIssue'){
        // If we're at the </JournalIssue> element then we're done.
        return;
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        switch ($element) {
          case 'Volume':
            $pub['Volume'] = $xml->readString();
            break;

          case 'Issue':
            $pub['Issue'] = $xml->readString();
            break;

          case 'PubDate':
            $date = tripal_pub_PMID_parse_date($xml, 'PubDate');
            // Be tolerant of a date with missing parts (or no date at all).
            if (!is_array($date)) {
              $date = array();
            }
            $year = isset($date['year']) ? $date['year'] : NULL;
            $month = isset($date['month']) ? $date['month'] : '';
            $day = isset($date['day']) ? $date['day'] : '';
            $medline = isset($date['medline']) ? $date['medline'] : '';

            $pub['Year'] = $year;
            if ($month and $day and $year) {
              $pub['Publication Date'] = "$year $month $day";
            }
            elseif ($month and !$day and $year) {
              $pub['Publication Date'] = "$year $month";
            }
            elseif (!$month and !$day and $year) {
              // NOTE(review): a <MedlineDate> also supplies a year, so such
              // dates end up here and only the year is kept. Confirm whether
              // the full medline string was intended instead.
              $pub['Publication Date'] = $year;
            }
            elseif ($medline) {
              $pub['Publication Date'] = $medline;
            }
            else {
              $pub['Publication Date'] = "Date Unknown";
            }
            break;

          default:
            break;
        }
      }
    }
  }
  /**
   * Parses a date section of the XML returned from PubMed.
   *
   * @param $xml
   *   The XMLReader, positioned on the start tag of the date element.
   * @param $element_name
   *   The name of the date element being parsed (e.g. 'PubDate'). Parsing
   *   stops at the matching end tag.
   *
   * @return
   *   An array that may contain the keys 'year', 'month' and 'day'. When the
   *   date is given as a <MedlineDate> the original string is returned under
   *   'medline' and 'year' holds its leading four digits. The values collected
   *   so far are returned even if the end tag is never encountered.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_date($xml, $element_name) {
    $date = array();

    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT and $element == $element_name){
        // If we're at the </$element_name> then we're done.
        return $date;
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        switch ($element) {
          case 'Year':
            $date['year'] = $xml->readString();
            break;

          case 'Month':
            $date['month'] = $xml->readString();
            break;

          case 'Day':
            $date['day'] = $xml->readString();
            break;

          case 'MedlineDate':
            // The medline date is used when the date cannot be broken into
            // distinct month, day and year.
            $medline = $xml->readString();
            $date['year'] = preg_replace('/^(\d{4}).*$/', '\1', $medline);
            $date['medline'] = $medline;
            break;

          default:
            break;
        }
      }
    }

    // The document ended before the closing tag was seen.
    return $date;
  }
  /**
   * Parses the <AuthorList> section of the XML returned from PubMed.
   *
   * Every author is added to $pub['Author List'] with the keys 'valid',
   * 'Surname', 'Given Name', 'First Initials', 'Suffix' and 'Collective' as
   * far as they are present in the XML. When the list ends, a comma separated
   * string of the valid authors ("Surname Initials" or the collective name)
   * is stored in $pub['Authors'].
   *
   * @param $xml
   *   The XMLReader, positioned on the <AuthorList> start tag.
   * @param $pub
   *   The publication array to which additional details will be added.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_PMID_parse_authorlist($xml, &$pub) {
    $num_authors = 0;

    while ($xml->read()) {
      $element = $xml->name;

      if ($xml->nodeType == XMLReader::END_ELEMENT){
        // If we're at the </AuthorList> element then we're done.
        if ($element == 'AuthorList') {
          // Build the author string before returning.
          $author_list = isset($pub['Author List']) ? $pub['Author List'] : array();
          $names = array();
          foreach ($author_list as $author) {
            if (isset($author['valid']) and $author['valid'] == 'N') {
              // Skip non-valid entries. A non-valid entry should have a
              // corresponding corrected entry so we can safely skip it.
              continue;
            }
            if (array_key_exists('Collective', $author)) {
              $names[] = $author['Collective'];
            }
            else {
              // Either part may be missing (e.g. authors with a single name).
              $surname = isset($author['Surname']) ? $author['Surname'] : '';
              $initials = isset($author['First Initials']) ? $author['First Initials'] : '';
              $name = trim($surname . ' ' . $initials);
              if ($name !== '') {
                $names[] = $name;
              }
            }
          }
          $pub['Authors'] = implode(', ', $names);
          return;
        }
        // If we're at the end </Author> element then we're done with the
        // author and we can start a new one.
        if ($element == 'Author') {
          $num_authors++;
        }
      }

      if ($xml->nodeType == XMLReader::ELEMENT) {
        // readString() leaves the cursor on the element, so an empty or
        // self-closing element cannot swallow the following end tag.
        switch ($element) {
          case 'Author':
            $pub['Author List'][$num_authors]['valid'] = $xml->getAttribute('ValidYN');
            break;

          case 'LastName':
            $pub['Author List'][$num_authors]['Surname'] = $xml->readString();
            break;

          case 'ForeName':
            $pub['Author List'][$num_authors]['Given Name'] = $xml->readString();
            break;

          case 'Initials':
            $pub['Author List'][$num_authors]['First Initials'] = $xml->readString();
            break;

          case 'Suffix':
            $pub['Author List'][$num_authors]['Suffix'] = $xml->readString();
            break;

          case 'CollectiveName':
            $pub['Author List'][$num_authors]['Collective'] = $xml->readString();
            break;

          case 'Identifier':
            // Author identifiers are not captured.
            break;

          default:
            break;
        }
      }
    }
  }
  /**
   * Gets the name of a language based on its abbreviation.
   *
   * Language abbreviations were obtained here:
   * http://www.nlm.nih.gov/bsd/language_table.html
   *
   * @param $lang_abbr
   *   The abbreviation of the language to return. The lookup ignores case and
   *   surrounding whitespace.
   *
   * @return
   *   The full name of the language, or NULL if the abbreviation is not in the
   *   table below.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_get_language($lang_abbr) {
    static $languages = array(
      'afr' => 'Afrikaans',
      'alb' => 'Albanian',
      'amh' => 'Amharic',
      'ara' => 'Arabic',
      'arm' => 'Armenian',
      'aze' => 'Azerbaijani',
      'ben' => 'Bengali',
      'bos' => 'Bosnian',
      'bul' => 'Bulgarian',
      'cat' => 'Catalan',
      'chi' => 'Chinese',
      'cze' => 'Czech',
      'dan' => 'Danish',
      'dut' => 'Dutch',
      'eng' => 'English',
      'epo' => 'Esperanto',
      'est' => 'Estonian',
      'fin' => 'Finnish',
      'fre' => 'French',
      'geo' => 'Georgian',
      'ger' => 'German',
      'gla' => 'Scottish Gaelic',
      'gre' => 'Greek, Modern',
      'heb' => 'Hebrew',
      'hin' => 'Hindi',
      'hrv' => 'Croatian',
      'hun' => 'Hungarian',
      'ice' => 'Icelandic',
      'ind' => 'Indonesian',
      'ita' => 'Italian',
      'jpn' => 'Japanese',
      'kin' => 'Kinyarwanda',
      'kor' => 'Korean',
      'lat' => 'Latin',
      'lav' => 'Latvian',
      'lit' => 'Lithuanian',
      'mac' => 'Macedonian',
      'mal' => 'Malayalam',
      'mao' => 'Maori',
      'may' => 'Malay',
      'mul' => 'Multiple languages',
      'nor' => 'Norwegian',
      'per' => 'Persian',
      'pol' => 'Polish',
      'por' => 'Portuguese',
      'pus' => 'Pushto',
      'rum' => 'Romanian, Rumanian, Moldovan',
      'rus' => 'Russian',
      'san' => 'Sanskrit',
      'slo' => 'Slovak',
      'slv' => 'Slovenian',
      'spa' => 'Spanish',
      'srp' => 'Serbian',
      'swe' => 'Swedish',
      'tha' => 'Thai',
      'tur' => 'Turkish',
      'ukr' => 'Ukrainian',
      'und' => 'Undetermined',
      'urd' => 'Urdu',
      'vie' => 'Vietnamese',
      'wel' => 'Welsh',
    );

    $key = strtolower(trim((string) $lang_abbr));

    // Unknown codes yield NULL without triggering an undefined index notice.
    return isset($languages[$key]) ? $languages[$key] : NULL;
  }