function tripal_pub_AGL_parse_pubxml

2.x tripal_pub.AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)
3.x tripal_chado.pub_importer_AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)
1.x AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)

Parse publication XML for a single publication

Description of XML format: http://www.loc.gov/marc/bibliographic/bdsummary.html

Parameters

$pub_xml: A string containing the XML for a single publication

Return value

An array containing the details of the publication

1 call to tripal_pub_AGL_parse_pubxml()
tripal_pub_AGL_range in tripal_chado/includes/loaders/tripal_chado.pub_importer_AGL.inc
Retrieves a range of publications from AGL

File

tripal_chado/includes/loaders/tripal_chado.pub_importer_AGL.inc, line 537
This file provides support for importing and parsing of results from the USDA National Agricultural Library (AGL) database. The functions here are used by both the publication importer setup form and the publication importer. The USDA AGL database…

Code

/**
 * Parses the MARC XML of a single AGL publication into a publication array.
 *
 * Description of XML format:
 * http://www.loc.gov/marc/bibliographic/bdsummary.html
 *
 * @param $pub_xml
 *   A string containing the XML for a single publication.
 *
 * @return
 *   An array containing the details of the publication. 'Publication Type'
 *   is always present. When $pub_xml is empty (or only whitespace) nothing
 *   else is set. Otherwise 'Title', 'Authors', 'Author List', 'Citation' and
 *   'raw' are always set, and the remaining keys (e.g. 'Year', 'Abstract',
 *   'Keywords', 'Volume', 'Publication Dbxref') are set only when the
 *   corresponding MARC field is found.
 */
function tripal_pub_AGL_parse_pubxml($pub_xml) {
  $pub = array();

  // The NAL dataset doesn't specify an article type, so default to a journal
  // article and refine the type later from other information (e.g. a host
  // item name containing 'Proceedings').
  $pub['Publication Type'][0] = 'Journal Article';

  if (!$pub_xml) {
    return $pub;
  }
  // Whitespace-only input must be rejected as well: XMLReader cannot load an
  // empty document.
  $xml_string = trim($pub_xml);
  if ($xml_string === '') {
    return $pub;
  }

  // Month number (as found in control field 008) to abbreviated month name.
  $months = array(
    '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
    '04' => 'Apr', '05' => 'May', '06' => 'Jun',
    '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
    '10' => 'Oct', '11' => 'Nov', '12' => 'Dec',
  );

  // Read the XML and iterate through it.
  $xml = new XMLReader();
  $xml->xml($xml_string);
  while ($xml->read()) {
    // Only opening tags are of interest; text and closing nodes are skipped.
    if ($xml->nodeType != XMLReader::ELEMENT) {
      continue;
    }
    $element = $xml->name;

    if ($element == 'controlfield') {
      $tag = $xml->getAttribute('tag');
      // readString() returns the element's text without moving the cursor,
      // so an empty control field cannot swallow the node that follows it.
      $value = $xml->readString();
      switch ($tag) {
        case '001': // control number
          $pub['Publication Accession'] = $value;
          break;

        case '008': // fixed length data elements
          $date1 = substr($value, 7, 4);  // year of publication
          $date2 = substr($value, 11, 4); // month and day of publication (mmdd)
          $place = substr($value, 15, 3);
          $lang = substr($value, 35, 3);
          if (preg_match('/^\d{4}$/', $date1)) {
            $pub['Year'] = $date1;
            $pub['Publication Date'] = $date1;
            // Date 2 may also hold a year or filler characters, so only
            // treat it as month/day when it starts with a real month number.
            $month_code = substr($date2, 0, 2);
            if (isset($months[$month_code])) {
              $pub['Publication Date'] .= ' ' . $months[$month_code];
              $day = substr($date2, 2, 2);
              if (preg_match('/^\d\d$/', $day)) {
                $pub['Publication Date'] .= ' ' . $day;
              }
            }
          }
          // Codes containing blanks are treated as "not provided".
          if ($place and !preg_match('/\s/', $place)) {
            $pub['Published Location'] = $place;
          }
          if ($lang and !preg_match('/\s/', $lang)) {
            $pub['Language Abbr'] = $lang;
          }
          break;

        default:
          // 003 (control number identifier), 005 (date and time of latest
          // transaction), 006 (fixed-length data elements), 007 (physical
          // description fixed field) and any other tags are ignored.
          break;
      }
    }
    elseif ($element == 'datafield') {
      $tag = $xml->getAttribute('tag');
      $ind1 = $xml->getAttribute('ind1');

      // NOTE: the short case labels ('16', '35', '40', '72') rely on the
      // loose comparison of switch: numeric strings compare numerically, so
      // a zero-padded tag such as '035' still matches '35'. Presumably the
      // feed delivers zero-padded MARC tags -- do not switch to strict
      // comparison without confirming.
      switch ($tag) {
        case '35': // System Control Number
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a': // System control number
                $pub['Publication Accession'] = $value;
                break;
            }
          }
          // This break is required: falling through into the 040 handler
          // would request subfields from the reader a second time.
          break;

        case '40': // Cataloging Source (NR)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a': // original cataloging agency
                $pub['Publication Database'] = $value;
                break;
            }
          }
          break;

        case '100': // main entry-personal name
        case '700': // Added Entry-Personal Name
          $pub['Author List'][] = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
          break;

        case '110': // main entry-corporate name
        case '710': // Added Entry-Corporate Name
          $author = array();
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a': // Corporate name or jurisdiction name as entry element
                $author['Collective'] = $value;
                break;
              case 'b': // Subordinate unit
                $author['Collective'] = isset($author['Collective']) ? $author['Collective'] . ' ' . $value : $value;
                break;
            }
          }
          $pub['Author List'][] = $author;
          break;

        case '245': // title statement
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
                break;
              case 'b':
                $pub['Title'] = isset($pub['Title']) ? $pub['Title'] . ' ' . $value : $value;
                break;
              case 'h':
                $pub['Publication Model'] = $value;
                break;
            }
          }
          break;

        case '260': // publication, distribution, etc (imprint)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Published Location'] = $value;
                break;
              case 'b':
                $pub['Publisher'] = $value;
                break;
              case 'c':
                $pub['Publication Date'] = $value;
                break;
            }
          }
          break;

        case '300': // physical description
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pages = preg_replace('/^p\. /', '', $value);
                $pages = preg_replace('/\.$/', '', $pages);
                // A value ending in 'p' is the number of pages rather than
                // the page numbers, so it is skipped.
                if (!preg_match('/p$/', $pages)) {
                  $pub['Pages'] = $pages;
                }
                break;
            }
          }
          break;

        case '500': // general note
          // The note text has to come from this field's own subfields.
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Notes'] = $value;
                break;
            }
          }
          break;

        case '520': // Summary, etc
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Abstract'] = $value;
                break;
            }
          }
          break;

        case '650': // Subject Added Entry-Topical Term
        case '653': // Index Term-Uncontrolled
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Keywords'][] = $value;
                break;
            }
          }
          break;

        case '773': // host item entry
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                else {
                  $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                }
                break;
              case 't':
                // NOTE(review): unlike subfield 'a', the journal name is set
                // here even for proceedings -- confirm whether intentional.
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                break;
              case 'g':
                // Free-text "related parts" string: try several date layouts
                // in turn, then pick out volume and issue.
                $matches = array();
                if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[1];
                }
                elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
                  $year = $matches[4];
                  $month_name = $matches[1];
                  $day = $matches[3];
                  $pub['Publication Date'] = "$year $month_name $day";
                }
                elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
                  $year = $matches[3];
                  $month_name = $matches[1];
                  $pub['Publication Date'] = "$year $month_name";
                }
                elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
                  $year = $matches[2];
                  $month_name = $matches[1];
                  $pub['Publication Date'] = "$year $month_name";
                }
                if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                }
                if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                  $pub['Issue'] = $matches[3];
                }
                if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
                  $pub['Issue'] = $matches[1];
                }
                break;
              case 'p':
                $pub['Journal Abbreviation'] = $value;
                break;
              case 'z':
                $pub['ISBN'] = $value;
                break;
            }
          }
          break;

        case '856': // Electronic Location and Access
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'u':
                $pub['URL'] = $value;
                break;
            }
          }
          break;

        // Recognised fields that are intentionally ignored.
        case '16':  // National Bibliographic Agency Control Number
        case '72':  // Subject Category Code
        case '111': // main entry-meeting name
        case '130': // main entry-uniform title
        case '210': // abbreviated title
        case '222': // key title
        case '240': // uniform title
        case '242': // translation of title by cataloging agency
        case '243': // collective uniform title
        case '246': // varying form of title
        case '247': // former title
        case '250': // edition statement
        case '254': // musical presentation statement
        case '255': // cartographic mathematical data
        case '256': // computer file characteristics
        case '257': // country of producing entity
        case '258': // philatelic issue data
        case '263': // projected publication date
        case '264': // production, publication, distribution, manufacture and copyright notice
        case '270': // Address
        case '504': // Bibliography, Etc. Note
        case '852': // Location (Where is the publication held)
          break;

        default:
          // Unhandled tag: the subfields are still requested from the helper
          // (as before) but the result is not used.
          tripal_pub_remote_search_AGL_get_subfield($xml);
          break;
      }
    }
  }
  $xml->close();

  // Build the Dbxref once both of its parts are known.
  if (!empty($pub['Publication Accession']) and !empty($pub['Publication Database'])) {
    $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
    unset($pub['Publication Accession']);
    unset($pub['Publication Database']);
  }

  // A record without any author field leaves 'Author List' unset; treat that
  // as an empty list so the steps below never iterate over NULL.
  $author_list = array();
  if (isset($pub['Author List']) and is_array($pub['Author List'])) {
    $author_list = $pub['Author List'];
  }

  // Build the full authors list.
  $names = array();
  foreach ($author_list as $author) {
    if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
      // Skip non-valid entries. A non-valid entry should have a
      // corresponding corrected entry so we can safely skip it.
      continue;
    }
    if (array_key_exists('Collective', $author)) {
      $names[] = $author['Collective'];
    }
    elseif (array_key_exists('Surname', $author)) {
      $name = $author['Surname'];
      if (array_key_exists('First Initials', $author)) {
        $name .= ' ' . $author['First Initials'];
      }
      $names[] = $name;
    }
  }
  $pub['Authors'] = implode(', ', $names);

  // For Title, Abstract and the author fields, convert HTML entities and
  // remove special unicode characters that are not meant for display.
  // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated as of
  // PHP 8.2; it is kept here so the decoded output does not change.
  $title = isset($pub['Title']) ? $pub['Title'] : '';
  $pub['Title'] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($title, 'UTF-8', 'HTML-ENTITIES'));
  if (array_key_exists('Abstract', $pub)) {
    $pub['Abstract'] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($pub['Abstract'], 'UTF-8', 'HTML-ENTITIES'));
  }
  $clean_authors = array();
  foreach ($author_list as $author) {
    foreach ($author as $key => $text) {
      $author[$key] = preg_replace('/[\p{So}]/u', '', mb_convert_encoding($text, 'UTF-8', 'HTML-ENTITIES'));
    }
    $clean_authors[] = $author;
  }
  $pub['Author List'] = $clean_authors;

  // Build the citation.
  $pub['Citation'] = chado_pub_create_citation($pub);

  $pub['raw'] = $pub_xml;

  return $pub;
}