function tripal_pub_AGL_parse_pubxml
2.x tripal_pub.AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
3.x tripal_chado.pub_importer_AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
1.x AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
Parse publication XML for a single publication
Description of XML format: http://www.loc.gov/marc/bibliographic/bdsummary.html
Parameters
$pub_xml: A string containing the XML for a single publication
Return value
An array containing the details of the publication
1 call to tripal_pub_AGL_parse_pubxml()
- tripal_pub_AGL_range in tripal_chado/
includes/ loaders/ tripal_chado.pub_importer_AGL.inc - Retrieves a range of publications from AGL
File
- tripal_chado/
includes/ loaders/ tripal_chado.pub_importer_AGL.inc, line 537 - This file provides support for importing and parsing of results from the USDA National Agricultural Library (AGL) database. The functions here are used by both the publication importer setup form and the publication importer. The USDA AGL database…
Code
/**
 * Parse publication XML for a single publication.
 *
 * Description of XML format:
 * http://www.loc.gov/marc/bibliographic/bdsummary.html
 *
 * The record is walked with XMLReader. 'controlfield' elements supply the
 * accession and the fixed-length date/place/language data; 'datafield'
 * elements supply title, authors, keywords, host-item (journal) details, etc.
 * After the walk the Dbxref, the flat 'Authors' string and the citation are
 * derived from the collected values.
 *
 * @param $pub_xml
 *   A string containing the XML for a single publication.
 *
 * @return
 *   An array containing the details of the publication. When $pub_xml is
 *   empty only the default 'Publication Type' entry is returned. Otherwise
 *   the array always contains 'Publication Type', 'Title', 'Author List',
 *   'Authors', 'Citation' and 'raw' (the unmodified input), plus whichever
 *   other keys the record provided.
 */
function tripal_pub_AGL_parse_pubxml($pub_xml) {
  $pub = array();

  // We will set the default publication type as a journal article. The NAL
  // dataset doesn't specify an article type so we'll have to glean the type
  // from other information (e.g. series name has 'Proceedings' in it).
  $pub['Publication Type'][0] = 'Journal Article';

  if (!$pub_xml) {
    return $pub;
  }

  // Lookup used to turn the two-digit month of control field 008 into text.
  $month_names = array(
    '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
    '04' => 'Apr', '05' => 'May', '06' => 'Jun',
    '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
    '10' => 'Oct', '11' => 'Nov', '12' => 'Dec',
  );

  // Read the XML and iterate through it.
  $xml = new XMLReader();
  $xml->xml(trim($pub_xml));
  while ($xml->read()) {
    if ($xml->nodeType != XMLReader::ELEMENT) {
      continue;
    }
    $element = $xml->name;

    if ($element == 'controlfield') {
      $tag = $xml->getAttribute('tag');

      // Only step into the element when it can have content. Reading past a
      // self-closing element would land on the following node, which the
      // loop's own read() would then skip.
      $value = '';
      if (!$xml->isEmptyElement) {
        $xml->read();
        $value = $xml->value;
      }

      switch ($tag) {
        case '001': // control number
          $pub['Publication Accession'] = $value;
          break;

        case '008': // fixed length data elements
          $date1 = substr($value, 7, 4);  // year of publication
          $date2 = substr($value, 11, 4); // month and day of publication (mmdd)
          $place = substr($value, 15, 3);
          $lang = substr($value, 35, 3);
          if (preg_match('/\d\d\d\d/', $date1)) {
            $pub['Year'] = $date1;
            $pub['Publication Date'] = $date1;

            // Refine the date only when the month is one we can name; the
            // day is appended only when it is two digits.
            $mm = substr($date2, 0, 2);
            if (preg_match('/^\d\d$/', $mm) and isset($month_names[$mm])) {
              $pub_date = $date1 . " " . $month_names[$mm];
              $dd = substr($date2, 2, 2);
              if (preg_match('/^\d\d$/', $dd)) {
                $pub_date .= " " . $dd;
              }
              $pub['Publication Date'] = $pub_date;
            }
          }
          // A short field makes substr() return '' (or FALSE on older PHP);
          // neither is a usable code.
          if (is_string($place) and $place !== '' and !preg_match('/\s+/', $place)) {
            $pub['Published Location'] = $place;
          }
          if (is_string($lang) and $lang !== '' and !preg_match('/\s+/', $lang)) {
            $pub['Language Abbr'] = $lang;
          }
          break;

        case '003': // control number identifier
        case '005': // date and time of latest transaction
        case '006': // fixed-length data elements
        case '007': // physical description fixed field
        default:    // unhandled tag
          break;
      }
    }
    elseif ($element == 'datafield') {
      $tag = $xml->getAttribute('tag');
      $ind1 = $xml->getAttribute('ind1');

      // PHP's switch compares loosely, so a numeric-string tag such as '035'
      // matches case '35'.
      switch ($tag) {
        case '35': // System Control Number
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') { // System control number
              $pub['Publication Accession'] = $value;
            }
          }
          break;

        case '40': // Cataloging Source (NR)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') { // original cataloging agency
              $pub['Publication Database'] = $value;
            }
          }
          break;

        case '100': // main entry-personal name
        case '700': // Added Entry-Personal Name
          $author = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
          $pub['Author List'][] = $author;
          break;

        case '110': // main entry-corporate name
        case '710': // Added Entry-Corporate Name
          $author = array();
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a': // Corporate name or jurisdiction name as entry element
                $author['Collective'] = $value;
                break;
              case 'b': // Subordinate unit
                // Append to the corporate name when there is one, otherwise
                // the subordinate unit stands on its own.
                $author['Collective'] = isset($author['Collective'])
                  ? $author['Collective'] . ' ' . $value
                  : $value;
                break;
            }
          }
          $pub['Author List'][] = $author;
          break;

        case '245': // title statement
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
                break;
              case 'b':
                $pub['Title'] = isset($pub['Title'])
                  ? $pub['Title'] . ' ' . $value
                  : $value;
                break;
              case 'h':
                $pub['Publication Model'] = $value;
                break;
            }
          }
          break;

        case '260': // publication, distribution, etc (imprint)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                $pub['Published Location'] = $value;
                break;
              case 'b':
                $pub['Publisher'] = $value;
                break;
              case 'c':
                $pub['Publication Date'] = $value;
                break;
            }
          }
          break;

        case '300': // physical description
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') {
              $pages = preg_replace('/^p\. /', '', $value);
              $pages = preg_replace('/\.$/', '', $pages);
              // A trailing 'p' means this is the number of pages, not the
              // page numbers, so it is skipped.
              if (!preg_match('/p$/', $pages)) {
                $pub['Pages'] = $pages;
              }
            }
          }
          break;

        case '500': // general note
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') {
              $pub['Notes'] = $value;
            }
          }
          break;

        case '520': // Summary, etc
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') {
              $pub['Abstract'] = $value;
            }
          }
          break;

        case '650': // Subject Added Entry-Topical Term
        case '653': // Index Term-Uncontrolled
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'a') {
              $pub['Keywords'][] = $value;
            }
          }
          break;

        case '773': // host item entry
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ($code) {
              case 'a':
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                else {
                  $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                }
                break;
              case 't':
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                break;
              case 'g':
                // The date appears in several layouts; the first pattern
                // that matches wins.
                $matches = array();
                if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[1];
                }
                elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
                  $g_year = $matches[4];
                  $g_month = $matches[1];
                  $g_day = $matches[3];
                  $pub['Publication Date'] = "$g_year $g_month $g_day";
                }
                elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
                  $g_year = $matches[3];
                  $g_month = $matches[1];
                  $pub['Publication Date'] = "$g_year $g_month";
                }
                elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
                  $g_year = $matches[2];
                  $g_month = $matches[1];
                  $pub['Publication Date'] = "$g_year $g_month";
                }
                // Volume and issue; later patterns override earlier ones.
                if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                }
                if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                  $pub['Issue'] = $matches[3];
                }
                if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
                  $pub['Issue'] = $matches[1];
                }
                break;
              case 'p':
                $pub['Journal Abbreviation'] = $value;
                break;
              case 'z':
                $pub['ISBN'] = $value;
                break;
            }
          }
          break;

        case '856': // Electronic Location and Access
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            if ($code == 'u') {
              $pub['URL'] = $value;
            }
          }
          break;

        // Recognized tags that are intentionally ignored.
        case '16':  // National Bibliographic Agency Control Number
        case '72':  // Subject Category Code
        case '111': // main entry-meeting name
        case '130': // main entry-uniform title
        case '210': // abbreviated title
        case '222': // key title
        case '240': // uniform title
        case '242': // translation of title by cataloging agency
        case '243': // collective uniform title
        case '246': // varying form of title
        case '247': // former title
        case '250': // edition statement
        case '254': // musical presentation statement
        case '255': // cartographic mathematical data
        case '256': // computer file characteristics
        case '257': // country of producing entity
        case '258': // philatelic issue data
        case '263': // projected publication date
        case '264': // production, publication, distribution, manufacture and copyright notice
        case '270': // Address
        case '504': // Bibliography, Etc. Note
        case '852': // Location (Where is the publication held)
          break;

        default:
          // Unhandled tag: the subfields are still fetched, as before, but
          // the result is not used.
          // NOTE(review): presumably the helper advances the reader past the
          // subfields -- confirm before removing this call.
          tripal_pub_remote_search_AGL_get_subfield($xml);
          break;
      }
    }
  }
  $xml->close();

  // Build the Dbxref. Both parts are required; when present they are
  // replaced by the combined value.
  if (!empty($pub['Publication Accession']) and !empty($pub['Publication Database'])) {
    $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
    unset($pub['Publication Accession']);
    unset($pub['Publication Database']);
  }

  // For Title, Abstract and Authors, convert the HTML entities and remove
  // special unicode chars that are not meant for display.
  // NOTE(review): the 'HTML-ENTITIES' encoding of mbstring is deprecated as
  // of PHP 8.2 -- consider html_entity_decode() once its output is verified.
  $clean = function ($text) {
    return preg_replace('/[\p{So}]/u', '', mb_convert_encoding($text, 'UTF-8', 'HTML-ENTITIES'));
  };

  $pub['Title'] = $clean(isset($pub['Title']) ? $pub['Title'] : '');
  if (array_key_exists('Abstract', $pub)) {
    $pub['Abstract'] = $clean($pub['Abstract']);
  }

  // Clean every author entry. A record without authors yields an empty list.
  $author_list = array();
  if (isset($pub['Author List']) and is_array($pub['Author List'])) {
    foreach ($pub['Author List'] as $author) {
      // NOTE(review): assumes the author helper returns an array; anything
      // else cannot be processed below and is dropped.
      if (!is_array($author)) {
        continue;
      }
      foreach ($author as $key => $val) {
        $author[$key] = $clean($val);
      }
      $author_list[] = $author;
    }
  }
  $pub['Author List'] = $author_list;

  // Build the full authors list as a comma separated string.
  $names = array();
  foreach ($author_list as $author) {
    if (array_key_exists('valid', $author) and $author['valid'] == 'N') {
      // Skip non-valid entries. A non-valid entry should have a
      // corresponding corrected entry so we can safely skip it.
      continue;
    }
    if (array_key_exists('Collective', $author)) {
      $names[] = $author['Collective'];
    }
    elseif (array_key_exists('Surname', $author)) {
      $name = $author['Surname'];
      if (array_key_exists('First Initials', $author)) {
        $name .= ' ' . $author['First Initials'];
      }
      $names[] = $name;
    }
  }
  $pub['Authors'] = implode(', ', $names);

  // Build the citation.
  $pub['Citation'] = chado_pub_create_citation($pub);
  $pub['raw'] = $pub_xml;

  return $pub;
}