tripal_pub.AGL.inc

Importer for the USDA Agricultural Library (Agricola).

This file provides support for importing and parsing results from the USDA National Agricultural Library (AGL) database. The functions here are used by both the publication importer setup form and the publication importer. The USDA AGL database is queried, and its records are retrieved, over the Z39.50 protocol using PHP's YAZ extension.

File

tripal_pub/includes/importers/tripal_pub.AGL.inc
View source
  1. <?php
  2. /**
  3. * @file
  4. *
  5. * Importer for the USDA Agricultural Library (Agricola).
  6. *
  7. * This file provides support for importing and parsing of results from the
  8. * USDA National Agricultural Library (AGL) database. The functions here are
  9. * used by both the publication importer setup form and the publication
  10. * importer. The USDA AGL database uses a YAZ protocol for querying and
  11. * retrieving records.
  12. *
  13. */
  /**
   * Adjusts the publication importer form for the AGL (Agricola) database.
   *
   * The 'days' element is relabelled as a 'Year' filter and the 'journal'
   * choice is removed from the scope selector of every search criterion.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array (not read by this function).
   * @param $num_criteria
   *   The number of criteria the user currently has added to the form.
   *
   * @return
   *   The altered form array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_alter_form_AGL($form, $form_state, $num_criteria = 1) {
    // AGL has not been made to filter by "the last XX days", so the 'days'
    // element is repurposed to hold the year to query.
    $year_properties = array(
      '#title' => t('Year'),
      '#description' => t('Please enter a year to limit records by the year they were published, created or modified in the database.'),
    );
    foreach ($year_properties as $property => $text) {
      $form['themed_element']['days'][$property] = $text;
    }

    // The Journal Name filter doesn't seem to work for AGL, so drop it from
    // the scope options of each criterion row.
    $row = 1;
    while ($row <= $num_criteria) {
      unset($form['themed_element']['criteria'][$row]["scope-$row"]['#options']['journal']);
      $row++;
    }

    return $form;
  }
  /**
   * Provides additional validation of the importer setup form for AGL.
   *
   * Checks that the 'days' element (repurposed as a year for AGL) holds a four
   * digit year, that every accession criterion has the form 'AGL:<digits>',
   * and that at most one accession criterion is present.
   *
   * @param $form
   *   The Drupal form array.
   * @param $form_state
   *   The form state array. Reads 'days', 'num_criteria' and, for each
   *   criterion $i, "search_terms-$i" and "scope-$i" from its 'values'.
   *
   * @return
   *   The form array, unchanged. Problems are reported with form_set_error().
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_validate_form_AGL($form, $form_state) {
    $days = trim($form_state['values']["days"]);
    $num_criteria = $form_state['values']['num_criteria'];

    // Compare against the empty string rather than relying on truthiness so
    // that an entered '0' is still rejected as an invalid year.
    if ($days !== '' and !preg_match('/^\d\d\d\d$/', $days)) {
      form_set_error("days", "Please enter a four digit year.");
    }

    $num_ids = 0;
    for ($i = 1; $i <= $num_criteria; $i++) {
      // Only accession criteria need extra checks.
      if ($form_state['values']["scope-$i"] != 'id') {
        continue;
      }

      $search_terms = trim($form_state['values']["search_terms-$i"]);
      if (!preg_match('/^AGL:\d+$/', $search_terms)) {
        form_set_error("search_terms-$i", "The AGL accession must be a numeric value, prefixed with 'AGL:' (e.g. AGL:3890740).");
      }

      // Flag the second and any later accession rows, but never rows of
      // another scope that merely follow them.
      $num_ids++;
      if ($num_ids > 1) {
        form_set_error("search_terms-$i", "Unfortunately, the AGL importer can only support a single accession at a time. Please remove the others.");
      }
    }

    return $form;
  }
  /**
   * Builds the CCL query string for an AGL search.
   *
   * The search form lets the user add several rows for the same field, but the
   * CCL string carries a single clause per field plus a single negated clause
   * per field. Rows are therefore grouped by field (and by negation). Within a
   * group the first row has no operator and every later row is joined with its
   * own operator; the operator of a group's first row is used to join the
   * whole group to the clauses before it. Groups are emitted in the order in
   * which they first appear.
   *
   * @param $search_array
   *   The search criteria. Reads 'num_criteria', the optional 'days' (a year
   *   for AGL) and, for each criterion in 'criteria' (indexed from 1),
   *   'search_terms', 'scope', 'is_phrase' and 'operation'.
   *
   * @return
   *   The CCL query string, with any leading 'and'/'or' removed.
   */
  function _tripal_pub_AGL_build_ccl($search_array) {
    $num_criteria = $search_array['num_criteria'];
    $days = array_key_exists('days', $search_array) ? $search_array['days'] : '';

    // The fields a criterion may target. 'any' is emitted without a "field="
    // qualifier. Criteria with any other scope are ignored.
    $fields = array('title', 'author', 'abstract', 'journal', 'id', 'any');

    // One entry per field / negated field, keyed so that PHP's insertion order
    // records the order of first appearance.
    $groups = array();

    for ($i = 1; $i <= $num_criteria; $i++) {
      $criterion = $search_array['criteria'][$i];
      $field = $criterion['scope'];
      if (!in_array($field, $fields, TRUE)) {
        continue;
      }

      $op = $criterion['operation'];
      if ($op) {
        $op = strtolower($op);
      }

      $search_terms = trim($criterion['search_terms']);
      if (!$criterion['is_phrase']) {
        // Not a phrase: make sure the AND and OR keywords are lower-case.
        $search_terms = preg_replace('/ OR /', ' or ', $search_terms);
        $search_terms = preg_replace('/ AND /', ' and ', $search_terms);
      }
      else {
        // A phrase: surround the terms with quotes.
        $search_terms = "\"$search_terms\"";
      }
      if ($field == 'id') {
        // The accession is searched without its 'AGL:' prefix.
        $search_terms = preg_replace('/AGL:([^\s]*)/', '$1', $search_terms);
      }

      // Negated rows of one field are collected into a single "not field=(...)"
      // clause, so inside that clause they are joined with 'or'.
      $negated = ($op === 'not');
      if ($negated) {
        $op = 'or';
      }
      $key = $negated ? "negate_$field" : $field;

      if (!isset($groups[$key])) {
        $groups[$key] = array(
          'field' => $field,
          'negated' => $negated,
          'op' => $op,
          'terms' => "($search_terms) ",
        );
      }
      else {
        $groups[$key]['terms'] .= "$op ($search_terms) ";
      }
    }

    // Put the groups together in order of first appearance.
    $ccl = '';
    foreach ($groups as $group) {
      $lead = $group['negated'] ? 'not' : $group['op'];
      $target = ($group['field'] == 'any') ? '' : $group['field'] . '=';
      $ccl .= "$lead $target(" . $group['terms'] . ") ";
    }

    // For AGL the 'days' form element was converted to represent the year.
    if ($days) {
      $ccl .= "and year=($days)";
    }

    // Remove any preceding 'and' or 'or'.
    return preg_replace('/^\s*(and|or)/', '', $ccl);
  }

  /**
   * A hook for performing the search on the AGL database.
   *
   * Builds a CCL query from the search criteria, has YAZ translate it to an
   * RPN query, counts the matching records and retrieves the requested page.
   *
   * @param $search_array
   *   An array containing the search criteria for the search.
   * @param $num_to_retrieve
   *   Indicates the maximum number of publications to retrieve from the remote
   *   database.
   * @param $page
   *   Indicates the page to retrieve. This corresponds to a paged table, where
   *   each page has $num_to_retrieve publications.
   *
   * @return
   *   An array with the keys 'total_records', 'search_str' and 'pubs'. When
   *   the connection cannot be prepared or the query cannot be parsed,
   *   'total_records' is 0, 'search_str' is empty and 'pubs' is an empty array.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL($search_array, $num_to_retrieve, $page) {
    $no_results = array(
      'total_records' => 0,
      'search_str' => '',
      'pubs' => array(),
    );

    $ccl = _tripal_pub_AGL_build_ccl($search_array);

    // yaz_connect() prepares for a connection to a Z39.50 server. This function
    // is non-blocking and does not attempt to establish a connection - it merely
    // prepares a connect to be performed later when yaz_wait() is called.
    // NAL Catalog
    // $yazc = yaz_connect('agricola.nal.usda.gov:7090/voyager');
    // NAL Article Citation Database
    $yazc = yaz_connect('agricola.nal.usda.gov:7190/voyager');
    if (!$yazc) {
      drupal_set_message('Error connecting to the AGL database.', 'error');
      watchdog('tpub_import', 'Error connecting to the AGL database.', array(), WATCHDOG_ERROR);
      return $no_results;
    }

    // Use the USMARC record type. But OPAC is also supported by Agricola.
    yaz_syntax($yazc, "usmarc");

    // The search query is built using CCL, we need to first
    // configure it so it can map the attributes to defined identifiers
    // The attribute set used by AGL can be found at the bottom of this page:
    // http://agricola.nal.usda.gov/help/z3950.html
    //
    // More in depth details: http://www.loc.gov/z3950/agency/bib1.html
    //
    // CCL Syntax: http://www.indexdata.com/yaz/doc/tools.html#CCL
    //
    $fields = array(
      "title" => "u=4",
      "author" => "u=1003",
      "abstract" => "u=62",
      "id" => "u=12",
      "year" => "u=30 r=o",
      "journal" => "u=1033"
    );
    yaz_ccl_conf($yazc, $fields);

    if (!yaz_ccl_parse($yazc, $ccl, $cclresult)) {
      drupal_set_message('Error parsing search string: ' . $cclresult["errorstring"], "error");
      watchdog('tpub_import', 'Error: %errstr', array('%errstr' => $cclresult["errorstring"]), WATCHDOG_ERROR);
      // Release the connection on this early exit as well.
      yaz_close($yazc);
      return $no_results;
    }
    $search_str = $cclresult["rpn"];

    // Get the total number of records.
    $total_records = tripal_pub_AGL_count($yazc, $search_str);

    // Get the pubs in the specified range.
    $start = $page * $num_to_retrieve;
    $results = tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records);

    // Close the connection.
    yaz_close($yazc);

    return $results;
  }
  /**
   * Retrieves a range of publications from AGL.
   *
   * Only the records of the requested page are asked for from the server,
   * rather than the whole result set.
   *
   * @param $yazc
   *   The YAZ connection on which the search has already been run.
   * @param $search_str
   *   The search string that was used for searching. It is only echoed back in
   *   the return value.
   * @param $start
   *   The zero-based offset of the first record of the range.
   * @param $num_to_retrieve
   *   The number of publications to retrieve.
   * @param $total_records
   *   The total number of records in the dataset. This value should have
   *   been retrieved by the tripal_pub_AGL_count() function.
   *
   * @return
   *   An array containing the total_records in the dataset, the search string
   *   and an array of the publications that were retrieved. If retrieval
   *   fails, total_records is 0 and the list of publications is empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_range($yazc, $search_str, $start, $num_to_retrieve, $total_records) {
    // Never ask for records beyond the end of the result set.
    if ($start + $num_to_retrieve > $total_records) {
      $num_to_retrieve = $total_records - $start;
    }
    if ($num_to_retrieve <= 0) {
      return array(
        'total_records' => $total_records,
        'search_str' => $search_str,
        'pubs' => array(),
      );
    }

    // Record positions on the server are 1-based. Request only this page and
    // let yaz_wait() carry out the prepared present request.
    yaz_range($yazc, $start + 1, $num_to_retrieve);
    $prepared = yaz_present($yazc);
    if ($prepared) {
      yaz_wait();
    }
    if (!$prepared or yaz_errno($yazc)) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR waiting on search at AGL: ($error_no) $error_msg", "error");
      watchdog('tpub_import', "ERROR waiting on search at AGL: (%error_no) %error_msg",
        array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);
      return array(
        'total_records' => 0,
        'search_str' => $search_str,
        'pubs' => array(),
      );
    }

    $pubs = array();
    $end = $start + $num_to_retrieve;
    for ($i = $start; $i < $end; $i++) {
      // Retrieve the record as XML.
      $pub_xml = yaz_record($yazc, $i + 1, 'xml; charset=marc-8,utf-8');
      if (!$pub_xml) {
        $error_no = yaz_errno($yazc);
        $error_msg = yaz_error($yazc);
        drupal_set_message("ERROR retrieving records from AGL: ($error_no) $error_msg", "error");
        watchdog('tpub_import', "ERROR retrieving records from AGL: (%error_no) %error_msg",
          array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);
        return array(
          'total_records' => 0,
          'search_str' => $search_str,
          'pubs' => array(),
        );
      }

      // Parse the pub XML.
      $pubs[] = tripal_pub_AGL_parse_pubxml($pub_xml);
    }

    return array(
      'total_records' => $total_records,
      'search_str' => $search_str,
      'pubs' => $pubs,
    );
  }
  /**
   * Retrieves the total number of publications that match the search string.
   *
   * Runs the search on the connection and waits for it to complete. Because
   * yaz_wait() can report success even when the target failed, the connection
   * is also checked with yaz_error() before the hit count is read.
   *
   * @param $yazc
   *   The YAZ connection.
   * @param $search_str
   *   The RPN search string to use for searching.
   *
   * @return
   *   A count of the total number of publications that match the search
   *   string, or 0 if the search could not be prepared or carried out.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_count($yazc, $search_str) {
    // Sort by publication date descending.
    // yaz_sort($yazc, "1=31 id");

    // Name of the step that failed, if any; used in the error messages.
    $failed_step = '';
    if (!yaz_search($yazc, "rpn", $search_str)) {
      $failed_step = 'preparing';
    }
    elseif (!yaz_wait() or yaz_error($yazc)) {
      $failed_step = 'waiting on';
    }

    if ($failed_step) {
      $error_no = yaz_errno($yazc);
      $error_msg = yaz_error($yazc);
      $additional = yaz_addinfo($yazc);
      if ($additional != $error_msg) {
        $error_msg .= " $additional";
      }
      drupal_set_message("ERROR $failed_step search at AGL: ($error_no) $error_msg", "error");
      watchdog('tpub_import', "ERROR $failed_step search at AGL: (%error_no) %error_msg",
        array('%error_no' => $error_no, '%error_msg' => $error_msg), WATCHDOG_ERROR);
      return 0;
    }

    // Get the total number of results from the search.
    return yaz_hits($yazc);
  }
  /**
   * Converts HTML entities in a value and strips symbols not meant for display.
   *
   * @param $text
   *   The text to clean.
   *
   * @return
   *   The text with characters of the Unicode 'So' (other symbol) category
   *   removed after conversion from 'HTML-ENTITIES' to UTF-8.
   */
  function _tripal_pub_AGL_clean_text($text) {
    // NOTE(review): PHP 8.2 deprecates the 'HTML-ENTITIES' pseudo-encoding of
    // mbstring. It is kept here so that the conversion result does not change;
    // confirm the replacement before switching to html_entity_decode().
    return preg_replace('/[\p{So}]/u', '', mb_convert_encoding($text, 'UTF-8', 'HTML-ENTITIES'));
  }

  /**
   * Parses the MARC XML of a single publication.
   *
   * Description of the record format:
   * http://www.loc.gov/marc/bibliographic/bdsummary.html
   *
   * @param $pub_xml
   *   A string containing the XML for a single publication.
   *
   * @return
   *   An array containing the details of the publication. 'Publication Type'
   *   is always present. For non-empty input the array also holds 'Title',
   *   'Author List', 'Authors', 'Citation' and 'raw' (the XML that was passed
   *   in), plus whichever other properties the record provides.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_AGL_parse_pubxml($pub_xml) {
    $pub = array();

    // We will set the default publication type as a journal article. The NAL
    // dataset doesn't specify an article type so we'll have to glean the type
    // from other information (e.g. series name has 'Proceedings' in it).
    $pub['Publication Type'][0] = 'Journal Article';
    if (!$pub_xml) {
      return $pub;
    }

    $month_names = array(
      '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
      '04' => 'Apr', '05' => 'May', '06' => 'Jun',
      '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
      '10' => 'Oct', '11' => 'Nov', '12' => 'Dec'
    );

    // Read the XML and iterate through it. Only <controlfield> and <datafield>
    // elements are of interest; everything else is skipped.
    $xml = new XMLReader();
    $xml->xml(trim($pub_xml));
    while ($xml->read()) {
      if ($xml->nodeType != XMLReader::ELEMENT) {
        continue;
      }
      $element = $xml->name;

      if ($element == 'controlfield') {
        $tag = $xml->getAttribute('tag');
        $xml->read();
        $value = $xml->value;
        switch ($tag) {
          case '001': // Control number.
            $pub['Publication Accession'] = $value;
            break;

          case '008': // Fixed-length data elements.
            $date1 = substr($value, 7, 4);  // Date 1: year of publication.
            $date2 = substr($value, 11, 4); // Date 2: read as month and day.
            $place = substr($value, 15, 3);
            $lang = substr($value, 35, 3);
            if (preg_match('/\d\d\d\d/', $date1)) {
              $pub['Year'] = $date1;
              $pub['Publication Date'] = $date1;
              // Date 2 only refines the date when it starts with a real month
              // number; it may instead hold a year or blanks.
              $parts = array();
              if (preg_match('/^(\d\d)(\d\d)?/', $date2, $parts) and isset($month_names[$parts[1]])) {
                $pub['Publication Date'] .= ' ' . $month_names[$parts[1]];
                if (isset($parts[2])) {
                  $pub['Publication Date'] .= ' ' . $parts[2];
                }
              }
            }
            // Codes padded with blanks (or missing altogether) are not used.
            if (preg_match('/^\S+\z/', (string) $place)) {
              $pub['Published Location'] = $place;
            }
            if (preg_match('/^\S+\z/', (string) $lang)) {
              $pub['Language Abbr'] = $lang;
            }
            break;

          // 003, 005, 006 and 007 carry nothing that is imported.
        }
      }
      elseif ($element == 'datafield') {
        $tag = $xml->getAttribute('tag');
        $ind1 = $xml->getAttribute('ind1');
        // The following tags are recognised but deliberately not imported:
        // 016, 072, 111, 130, 210, 222, 240, 242, 243, 246, 247, 250, 254,
        // 255, 256, 257, 258, 263, 264, 270, 504 and 852. They, like any other
        // unlisted tag, are simply skipped.
        switch ($tag) {
          case '035': // System Control Number.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Publication Accession'] = $codes['a'];
            }
            break;

          case '040': // Cataloging Source (NR).
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              // Original cataloging agency.
              $pub['Publication Database'] = $codes['a'];
            }
            break;

          case '100': // Main Entry-Personal Name.
          case '700': // Added Entry-Personal Name.
            $pub['Author List'][] = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
            break;

          case '110': // Main Entry-Corporate Name.
          case '710': // Added Entry-Corporate Name.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            // $a is the corporate or jurisdiction name, $b a subordinate unit.
            $name_parts = array();
            if (isset($codes['a'])) {
              $name_parts[] = $codes['a'];
            }
            if (isset($codes['b'])) {
              $name_parts[] = $codes['b'];
            }
            $author = array();
            if ($name_parts) {
              $author['Collective'] = implode(' ', $name_parts);
            }
            $pub['Author List'][] = $author;
            break;

          case '245': // Title Statement.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Title'] = trim(preg_replace('/\.$/', '', $codes['a']));
            }
            if (isset($codes['b'])) {
              $pub['Title'] = (isset($pub['Title']) ? $pub['Title'] : '') . ' ' . $codes['b'];
            }
            if (isset($codes['h'])) {
              $pub['Publication Model'] = $codes['h'];
            }
            break;

          case '260': // Publication, Distribution, etc. (Imprint).
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Published Location'] = $codes['a'];
            }
            if (isset($codes['b'])) {
              $pub['Publisher'] = $codes['b'];
            }
            if (isset($codes['c'])) {
              $pub['Publication Date'] = $codes['c'];
            }
            break;

          case '300': // Physical Description.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pages = preg_replace('/^p\. /', '', $codes['a']);
              $pages = preg_replace('/\.$/', '', $pages);
              // A value ending in 'p' is the number of pages rather than the
              // page numbers, so it is skipped.
              if (!preg_match('/p$/', $pages)) {
                $pub['Pages'] = $pages;
              }
            }
            break;

          case '500': // General Note.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Notes'] = $codes['a'];
            }
            break;

          case '520': // Summary, etc.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Abstract'] = $codes['a'];
            }
            break;

          case '650': // Subject Added Entry-Topical Term.
          case '653': // Index Term-Uncontrolled.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['a'])) {
              $pub['Keywords'][] = $codes['a'];
            }
            break;

          case '773': // Host Item Entry.
            // Handled in document order because $a and $t can both set the
            // journal name and the later one wins.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            foreach ($codes as $code => $value) {
              switch ($code) {
                case 'a':
                  if (preg_match('/Proceedings/i', $value)) {
                    $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                    $pub['Publication Type'][0] = 'Conference Proceedings';
                  }
                  else {
                    $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                  }
                  break;

                case 't':
                  if (preg_match('/Proceedings/i', $value)) {
                    $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                    $pub['Publication Type'][0] = 'Conference Proceedings';
                  }
                  // NOTE(review): unlike $a, the journal name is set here even
                  // for proceedings - confirm whether that is intended.
                  $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                  break;

                case 'g':
                  // Related parts: publication date, volume and issue.
                  $matches = array();
                  if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
                    $pub['Publication Date'] = $matches[1];
                  }
                  elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
                    $pub['Publication Date'] = $matches[4] . ' ' . $matches[1] . ' ' . $matches[3];
                  }
                  elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
                    $pub['Publication Date'] = $matches[3] . ' ' . $matches[1];
                  }
                  elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
                    $pub['Publication Date'] = $matches[2] . ' ' . $matches[1];
                  }
                  if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
                    $pub['Volume'] = $matches[1];
                  }
                  if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
                    $pub['Volume'] = $matches[1];
                    $pub['Issue'] = $matches[3];
                  }
                  if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
                    $pub['Issue'] = $matches[1];
                  }
                  break;

                case 'p':
                  $pub['Journal Abbreviation'] = $value;
                  break;

                case 'z':
                  $pub['ISBN'] = $value;
                  break;
              }
            }
            break;

          case '856': // Electronic Location and Access.
            $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
            if (isset($codes['u'])) {
              $pub['URL'] = $codes['u'];
            }
            break;
        }
      }
    }
    $xml->close();

    // Build the Dbxref from the cataloging agency and the accession.
    if (!empty($pub['Publication Accession']) and !empty($pub['Publication Database'])) {
      $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
      unset($pub['Publication Accession']);
      unset($pub['Publication Database']);
    }

    // For Title, Abstract and the authors, convert the html entities and
    // remove special unicode chars that are not meant for display.
    $pub['Title'] = _tripal_pub_AGL_clean_text(isset($pub['Title']) ? $pub['Title'] : '');
    if (array_key_exists('Abstract', $pub)) {
      $pub['Abstract'] = _tripal_pub_AGL_clean_text($pub['Abstract']);
    }
    $author_list = array();
    if (isset($pub['Author List']) and is_array($pub['Author List'])) {
      foreach ($pub['Author List'] as $author) {
        foreach ($author as $key => $text) {
          $author[$key] = _tripal_pub_AGL_clean_text($text);
        }
        $author_list[] = $author;
      }
    }
    $pub['Author List'] = $author_list;

    // Build the full authors list from the cleaned names.
    $names = array();
    foreach ($author_list as $author) {
      if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
        // Skip non-valid entries. A non-valid entry should have a
        // corresponding corrected entry so we can safely skip it.
        continue;
      }
      if (array_key_exists('Collective', $author)) {
        $names[] = $author['Collective'];
      }
      elseif (array_key_exists('Surname', $author)) {
        $name = $author['Surname'];
        if (array_key_exists('First Initials', $author)) {
          $name .= ' ' . $author['First Initials'];
        }
        $names[] = $name;
      }
    }
    $pub['Authors'] = implode(', ', $names);

    // Build the citation.
    $pub['Citation'] = tripal_pub_create_citation($pub);
    $pub['raw'] = $pub_xml;

    return $pub;
  }
  /**
   * Reads the subfields of the datafield the XML reader is positioned on.
   *
   * The reader is advanced until the closing tag of the datafield (or the end
   * of the document) is reached.
   *
   * @param $xml
   *   The XMLReader object, positioned on a <datafield> element.
   *
   * @return
   *   An array of subfield values keyed by subfield code. If a code occurs
   *   more than once the last value is kept.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_subfield($xml) {
    $codes = array();

    // A self-closing <datafield/> has no closing tag to stop at; reading on
    // would consume the subfields of the datafields that follow it.
    if ($xml->nodeType == XMLReader::ELEMENT and $xml->name == 'datafield' and $xml->isEmptyElement) {
      return $codes;
    }

    while ($xml->read()) {
      $sub_element = $xml->name;

      // When we've reached the end of the datafield element we are done.
      if ($xml->nodeType == XMLReader::END_ELEMENT and $sub_element == 'datafield') {
        return $codes;
      }

      // If on a subfield element then get its code and value.
      if ($xml->nodeType == XMLReader::ELEMENT and $sub_element == 'subfield') {
        $code = $xml->getAttribute('code');
        if ($xml->isEmptyElement) {
          // A self-closing <subfield/> has no content node; advancing the
          // reader here could step over the end of the datafield.
          $codes[$code] = '';
          continue;
        }
        $xml->read();
        $codes[$code] = $xml->value;
      }
    }

    return $codes;
  }
  /**
   * Reads the details of an author from a personal name datafield.
   *
   * @param $xml
   *   The XMLReader object, positioned on the <datafield> element of the name.
   * @param $ind1
   *   Indicates how an author record is stored; 0 means given name is first,
   *   1 means surname is first, 3 means a family name is given.
   *
   * @return
   *   An array describing the author, built from subfield $a. When the given
   *   name is first it holds 'Given Name'. When the surname is first it holds
   *   'Surname', 'Given Name' and 'First Initials'. For any other indicator,
   *   or when there is no subfield $a, the array is empty.
   *
   * @ingroup tripal_pub
   */
  function tripal_pub_remote_search_AGL_get_author($xml, $ind1) {
    $author = array();
    $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
    if (!isset($codes['a'])) {
      return $author;
    }

    // Compare the indicator as text: a blank or missing indicator must not be
    // taken for '0'.
    $ind1 = trim((string) $ind1);

    // Remove any trailing comma.
    $value = preg_replace('/,$/', '', $codes['a']);

    if ($ind1 === '0') {
      // Given name is first: the whole value is the name.
      $author['Given Name'] = $value;
    }
    elseif ($ind1 === '1') {
      // Surname is first: split the parts of the name using a comma.
      $names = explode(',', $value);
      $author['Surname'] = array_shift($names);
      $author['Given Name'] = '';
      foreach ($names as $name) {
        $author['Given Name'] .= $name . ' ';
      }
      // Take the first character, not the first byte, of each given name.
      $author['First Initials'] = '';
      foreach (explode(' ', $author['Given Name']) as $name) {
        $author['First Initials'] .= mb_substr($name, 0, 1, 'UTF-8');
      }
    }
    // A family name (indicator 3) is not imported.

    return $author;
  }