function tripal_pub_AGL_parse_pubxml

2.x tripal_pub.AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)
3.x tripal_chado.pub_importer_AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)
1.x AGL.inc tripal_pub_AGL_parse_pubxml($pub_xml)
1 call to tripal_pub_AGL_parse_pubxml()
tripal_pub_AGL_range in tripal_pub/includes/importers/AGL.inc

File

tripal_pub/includes/importers/AGL.inc, line 444

Code

/**
 * Parses one AGL MARC-XML publication record into a publication array.
 *
 * The record is walked with XMLReader. Every <controlfield> and <datafield>
 * element whose MARC tag is understood by this importer is copied into an
 * associative array keyed by publication property name.
 *
 * @param $pub_xml
 *   A string holding the XML of a single publication record.
 *
 * @return
 *   An associative array describing the publication. 'Publication Type' is
 *   always present (defaulting to 'Journal Article'). When $pub_xml is empty
 *   or contains only whitespace that is the only key returned. Otherwise the
 *   array may also contain, depending on the record: 'Publication Dbxref'
 *   (or 'Publication Accession' / 'Publication Database' when only one of
 *   the two was found), 'Year', 'Publication Date', 'Published Location',
 *   'Language Abbr', 'Author List', 'Authors', 'Title', 'Publication Model',
 *   'Publisher', 'Pages', 'Notes', 'Abstract', 'Keywords', 'Series Name',
 *   'Journal Name', 'Journal Abbreviation', 'Volume', 'Issue', 'ISBN', 'URL',
 *   plus 'Citation' and 'raw' (the unmodified $pub_xml).
 */
function tripal_pub_AGL_parse_pubxml($pub_xml) {
  $pub = array();

  // We will set the default publication type as a journal article. The NAL
  // dataset doesn't specify an article type so we'll have to glean the type
  // from other information (e.g. series name has 'Proceedings' in it).
  $pub['Publication Type'][0] = 'Journal Article';

  // Nothing to parse. Whitespace-only input is treated like empty input so
  // that an empty string is never handed to XMLReader::xml().
  if (!$pub_xml || trim($pub_xml) === '') {
    return $pub;
  }

  // Month numbers as they appear in the 008 field, mapped to abbreviations.
  $month_names = array(
    '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
    '04' => 'Apr', '05' => 'May', '06' => 'Jun',
    '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
    '10' => 'Oct', '11' => 'Nov', '12' => 'Dec'
  );

  // Read the XML and iterate through it. If the document cannot be loaded
  // the loop is skipped and only the defaults are returned.
  $xml = new XMLReader();
  $loaded = $xml->xml(trim($pub_xml));
  while ($loaded && $xml->read()) {
    if ($xml->nodeType != XMLReader::ELEMENT) {
      continue;
    }
    $element = $xml->name;

    if ($element == 'controlfield') {
      $tag = $xml->getAttribute('tag');
      // Step onto the text node of the control field. An empty element
      // (<controlfield/>) has no child, so stepping would skip the next node.
      $value = '';
      if (!$xml->isEmptyElement) {
        $xml->read();
        $value = $xml->value;
      }
      switch ($tag) {
        case '001': // control number
          $pub['Publication Accession'] = $value;
          break;
        case '003': // control number identifier
        case '005': // date and time of latest transaction
        case '006': // fixed-length data elements
        case '007': // physical description fixed field
          break;
        case '008': // fixed length data elements
          $pub_year = (string) substr($value, 7, 4);   // date 1: year of publication
          $month_day = (string) substr($value, 11, 4); // date 2: month and day (mmdd)
          $place = substr($value, 15, 3);
          $lang = substr($value, 35, 3);
          if (preg_match('/\d\d\d\d/', $pub_year)) {
            $pub['Year'] = $pub_year;
            $pub['Publication Date'] = $pub_year;
            // Date 2 is only used when it starts with a real month number;
            // otherwise it is something else (e.g. an end year) and is
            // ignored.
            $mm = (string) substr($month_day, 0, 2);
            $dd = (string) substr($month_day, 2, 2);
            if (isset($month_names[$mm])) {
              $pub['Publication Date'] = $pub_year . " " . $month_names[$mm];
              if (preg_match('/^\d\d$/', $dd)) {
                $pub['Publication Date'] .= " " . $dd;
              }
            }
          }
          // Codes that contain blanks are not recorded.
          if (is_string($place) && $place !== '' && !preg_match('/\s+/', $place)) {
            $pub['Published Location'] = $place;
          }
          if (is_string($lang) && $lang !== '' && !preg_match('/\s+/', $lang)) {
            $pub['Language Abbr'] = $lang;
          }
          break;
        default: // unhandled tag
          break;
      }
    }
    elseif ($element == 'datafield') {
      $tag = $xml->getAttribute('tag');
      $ind1 = $xml->getAttribute('ind1');
      // NOTE: switch compares loosely, so two numeric strings are compared
      // as numbers and a tag attribute of '035' matches the label '35'.
      // NOTE(review): tripal_pub_remote_search_AGL_get_subfield() is defined
      // elsewhere; it is assumed to return an array of subfield code => value
      // for the current datafield and to advance the reader -- confirm.
      switch ($tag) {
        // Recognized tags that carry nothing this importer stores.
        case '16':  // National Bibliographic Agency Control Number
        case '72':  // Subject Category Code
        case '111': // main entry-meeting name
        case '130': // main entry-uniform title
        case '210': // abbreviated title
        case '222': // key title
        case '240': // uniform title
        case '242': // translation of title by cataloging agency
        case '243': // collective uniform title
        case '246': // varying form of title
        case '247': // former title
        case '250': // edition statement
        case '254': // musical presentation statement
        case '255': // cartographic mathematical data
        case '256': // computer file characteristics
        case '257': // country of producing entity
        case '258': // philatelic issue data
        case '263': // projected publication date
        case '264': // production, publication, distribution, manufacture and copyright notice
        case '270': // address
        case '504': // Bibliography, Etc. Note
        case '852': // Location (Where is the publication held)
          break;

        case '35': // System Control Number
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Publication Accession'] = $codes['a'];
          }
          // This break was missing: control used to fall into case '40' and
          // read subfields a second time.
          break;

        case '40': // Cataloging Source (NR)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            // original cataloging agency
            $pub['Publication Database'] = $codes['a'];
          }
          break;

        case '100': // main entry-personal name
        case '700': // Added Entry-Personal Name
          $pub['Author List'][] = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
          break;

        case '110': // main entry-corporate name
        case '710': // Added Entry-Corporate Name
          $author = array();
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $sub_value) {
            // Cast so that an integer key (subfield code "0") can never be
            // loosely equal to a letter label.
            switch ((string) $code) {
              case 'a': // Corporate name or jurisdiction name as entry element
                $author['Collective'] = $sub_value;
                break;
              case 'b': // Subordinate unit
                $author['Collective'] = (isset($author['Collective']) ? $author['Collective'] : '') . ' ' . $sub_value;
                break;
            }
          }
          $pub['Author List'][] = $author;
          break;

        case '245': // title statement
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $sub_value) {
            switch ((string) $code) {
              case 'a':
                $pub['Title'] = trim(preg_replace('/\.$/', '', $sub_value));
                break;
              case 'b':
                $pub['Title'] = (isset($pub['Title']) ? $pub['Title'] : '') . ' ' . $sub_value;
                break;
              case 'h':
                $pub['Publication Model'] = $sub_value;
                break;
            }
          }
          break;

        case '260': // publication, distribution, etc (imprint)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Published Location'] = $codes['a'];
          }
          if (isset($codes['b'])) {
            $pub['Publisher'] = $codes['b'];
          }
          if (isset($codes['c'])) {
            $pub['Publication Date'] = $codes['c'];
          }
          break;

        case '300': // physical description
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pages = preg_replace('/^p\. /', '', $codes['a']);
            $pages = preg_replace('/\.$/', '', $pages);
            // A value ending in 'p' is the number of pages, not the page
            // numbers, so it is skipped.
            if (!preg_match('/p$/', $pages)) {
              $pub['Pages'] = $pages;
            }
          }
          break;

        case '500': // general note
          // Previously this stored whatever value was left over from an
          // earlier field; read the note text from this field instead.
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Notes'] = $codes['a'];
          }
          break;

        case '520': // Summary, etc
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Abstract'] = $codes['a'];
          }
          break;

        case '650': // Subject Added Entry-Topical Term
        case '653': // Index Term-Uncontrolled
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Keywords'][] = $codes['a'];
          }
          break;

        case '773': // host item entry
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $sub_value) {
            switch ((string) $code) {
              case 'a':
                if (preg_match('/Proceedings/i', $sub_value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $sub_value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                else {
                  $pub['Journal Name'] = preg_replace('/\.$/', '', $sub_value);
                }
                break;
              case 't':
                if (preg_match('/Proceedings/i', $sub_value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $sub_value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                // NOTE(review): unlike subfield 'a', the journal name is set
                // even for proceedings. Kept as is -- confirm it is intended.
                $pub['Journal Name'] = preg_replace('/\.$/', '', $sub_value);
                break;
              case 'g':
                // Publication date: try the known layouts in turn.
                $matches = array();
                if (preg_match('/^(\d\d\d\d)/', $sub_value, $matches)) {
                  $pub['Publication Date'] = $matches[1];
                }
                elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $sub_value, $matches)) {
                  // month, day and year
                  $pub['Publication Date'] = $matches[4] . ' ' . $matches[1] . ' ' . $matches[3];
                }
                elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $sub_value, $matches)) {
                  // month and year inside parentheses
                  $pub['Publication Date'] = $matches[3] . ' ' . $matches[1];
                }
                elseif (preg_match('/^(.*?) (\d\d\d\d)/', $sub_value, $matches)) {
                  // leading month followed by the year
                  $pub['Publication Date'] = $matches[2] . ' ' . $matches[1];
                }
                // Volume and issue; later patterns override earlier ones.
                if (preg_match('/v\. (.*?)(,|\s+)/', $sub_value, $matches)) {
                  $pub['Volume'] = $matches[1];
                }
                if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $sub_value, $matches)) {
                  $pub['Volume'] = $matches[1];
                  $pub['Issue'] = $matches[3];
                }
                if (preg_match('/no\. (.*?)(\s|$)/', $sub_value, $matches)) {
                  $pub['Issue'] = $matches[1];
                }
                break;
              case 'p':
                $pub['Journal Abbreviation'] = $sub_value;
                break;
              case 'z':
                $pub['ISBN'] = $sub_value;
                break;
            }
          }
          break;

        case '856': // Electronic Location and Access
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['u'])) {
            $pub['URL'] = $codes['u'];
          }
          break;

        default:
          // Unhandled tag: the subfields are still read, as before, but the
          // result is not kept.
          tripal_pub_remote_search_AGL_get_subfield($xml);
          break;
      }
    }
  }
  if ($loaded) {
    $xml->close();
  }

  // Build the Dbxref once both of its parts are known.
  if (!empty($pub['Publication Accession']) && !empty($pub['Publication Database'])) {
    $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
    unset($pub['Publication Accession']);
    unset($pub['Publication Database']);
  }

  // Build the full authors list.
  if (isset($pub['Author List']) && is_array($pub['Author List'])) {
    $names = array();
    foreach ($pub['Author List'] as $author) {
      if (isset($author['valid']) && $author['valid'] == 'N') {
        // Skip non-valid entries. A non-valid entry should have a
        // corresponding corrected entry so we can safely skip it.
        continue;
      }
      if (!empty($author['Collective'])) {
        $names[] = $author['Collective'];
      }
      else {
        $surname = isset($author['Surname']) ? $author['Surname'] : '';
        $initials = isset($author['First Initials']) ? $author['First Initials'] : '';
        $names[] = $surname . ' ' . $initials;
      }
    }
    $pub['Authors'] = implode(', ', $names);
  }
  else {
    // No authors were found in the record.
    $pub['Authors'] = NULL;
  }

  // Build the citation.
  $pub['Citation'] = tripal_pub_create_citation($pub);

  $pub['raw'] = $pub_xml;
  return $pub;
}