function tripal_pub_AGL_parse_pubxml
2.x tripal_pub.AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
3.x tripal_chado.pub_importer_AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
1.x AGL.inc | tripal_pub_AGL_parse_pubxml($pub_xml) |
1 call to tripal_pub_AGL_parse_pubxml()
- tripal_pub_AGL_range in tripal_pub/
includes/ importers/ AGL.inc
File
- tripal_pub/
includes/ importers/ AGL.inc, line 444
Code
/**
 * Parses the XML of a single AGL publication record into a publication array.
 *
 * The record is walked with XMLReader as a flat sequence of 'controlfield'
 * and 'datafield' elements. Each element is dispatched on its 'tag' attribute
 * (the per-case comments name the tags using MARC terminology). Subfields of a
 * datafield are obtained from tripal_pub_remote_search_AGL_get_subfield(),
 * which is used here as returning an array of subfield code => value, and
 * personal-name authors from tripal_pub_remote_search_AGL_get_author().
 *
 * @param $pub_xml
 *   A string holding the XML for one publication record. When empty, only the
 *   default 'Publication Type' is returned.
 *
 * @return
 *   An associative array describing the publication. 'Publication Type' is
 *   always present (defaults to 'Journal Article'). For non-empty input the
 *   keys 'Authors', 'Citation' and 'raw' are always set; other keys such as
 *   'Title', 'Year', 'Publication Date', 'Publication Dbxref', 'Author List',
 *   'Keywords', 'Volume', 'Issue' and 'Pages' are set only when the record
 *   provides them.
 */
function tripal_pub_AGL_parse_pubxml($pub_xml) {
  $pub = array();

  // We will set the default publication type as a journal article. The NAL
  // dataset doesn't specify an article type so we'll have to glean the type
  // from other information (e.g. series name has 'Proceedings' in it).
  $pub['Publication Type'][0] = 'Journal Article';
  if (!$pub_xml) {
    return $pub;
  }
  $source = trim($pub_xml);
  if ($source === '') {
    // Whitespace-only input carries no record; XMLReader rejects it anyway.
    return $pub;
  }

  // Month number => abbreviation, used to expand the MMDD part of field 008.
  $month_names = array(
    '01' => 'Jan', '02' => 'Feb', '03' => 'Mar',
    '04' => 'Apr', '05' => 'May', '06' => 'Jun',
    '07' => 'Jul', '08' => 'Aug', '09' => 'Sep',
    '10' => 'Oct', '11' => 'Nov', '12' => 'Dec'
  );

  // Read the XML and iterate through it. If the document cannot be loaded the
  // loop is skipped and only the derived fields below are filled in.
  $xml = new XMLReader();
  $loaded = $xml->xml($source);
  while ($loaded and $xml->read()) {
    if ($xml->nodeType != XMLReader::ELEMENT) {
      continue;
    }
    $element = $xml->name;

    if ($element == 'controlfield') {
      $tag = $xml->getAttribute('tag');
      // readString() returns the element's text without advancing the reader,
      // so an empty controlfield cannot swallow the element that follows it.
      $value = (string) $xml->readString();
      switch ($tag) {
        case '001': // control number
          $pub['Publication Accession'] = $value;
          break;

        case '003': // control number identifier
        case '005': // date and time of latest transaction
        case '006': // fixed-length data elements
        case '007': // physical description fixed field
          break;

        case '008': // fixed length data elements
          // Fixed offsets: 7-10 year of publication, 11-14 month and day
          // (MMDD), 15-17 place of publication, 35-37 language.
          $pub_year = (string) substr($value, 7, 4);
          $month_day = (string) substr($value, 11, 4);
          $place = (string) substr($value, 15, 3);
          $lang = (string) substr($value, 35, 3);
          if (preg_match('/^\d\d\d\d$/', $pub_year)) {
            $pub['Year'] = $pub_year;
            $pub['Publication Date'] = $pub_year;
            // Only refine the date when the first two digits are a real
            // month; the day (offset 2 of MMDD) is appended when present.
            $md = array();
            if (preg_match('/^(\d\d)(\d\d)?/', $month_day, $md) and isset($month_names[$md[1]])) {
              $date = $pub_year . ' ' . $month_names[$md[1]];
              if (isset($md[2]) and (int) $md[2] > 0) {
                $date .= ' ' . $md[2];
              }
              $pub['Publication Date'] = $date;
            }
          }
          // Codes containing blanks are treated as "not supplied".
          if ($place !== '' and !preg_match('/\s+/', $place)) {
            $pub['Published Location'] = $place;
          }
          if ($lang !== '' and !preg_match('/\s+/', $lang)) {
            $pub['Language Abbr'] = $lang;
          }
          break;

        default: // unhandled tag
          break;
      }
    }
    elseif ($element == 'datafield') {
      $tag = $xml->getAttribute('tag');
      $ind1 = $xml->getAttribute('ind1');
      // NOTE: the short tags below ('16', '35', '40', '72') rely on the
      // switch's loose comparison to also match zero-padded tags like '035'.
      switch ($tag) {
        case '16': // National Bibliographic Agency Control Number
          break;

        case '35': // System Control Number
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Publication Accession'] = $codes['a'];
          }
          // This break was missing: execution fell into case '40' and read
          // subfields from the reader a second time.
          break;

        case '40': // Cataloging Source (NR)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            // original cataloging agency
            $pub['Publication Database'] = $codes['a'];
          }
          break;

        case '72': // Subject Category Code
          break;

        case '100': // main entry-personal name
        case '700': // Added Entry-Personal Name
          $pub['Author List'][] = tripal_pub_remote_search_AGL_get_author($xml, $ind1);
          break;

        case '110': // main entry-corporate name
        case '710': // Added Entry-Corporate Name
          $author = array();
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            // Cast: numeric subfield codes become integer array keys.
            switch ((string) $code) {
              case 'a': // Corporate name or jurisdiction name as entry element
                $author['Collective'] = $value;
                break;

              case 'b': // Subordinate unit
                $author['Collective'] = isset($author['Collective']) ? $author['Collective'] . ' ' . $value : $value;
                break;
            }
          }
          $pub['Author List'][] = $author;
          break;

        case '111': // main entry-meeting name
        case '130': // main entry-uniform title
        case '210': // abbreviated title
        case '222': // key title
        case '240': // uniform title
        case '242': // translation of title by cataloging agency
        case '243': // collective uniform title
          break;

        case '245': // title statement
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ((string) $code) {
              case 'a':
                $pub['Title'] = trim(preg_replace('/\.$/', '', $value));
                break;

              case 'b':
                $pub['Title'] = isset($pub['Title']) ? $pub['Title'] . ' ' . $value : $value;
                break;

              case 'h':
                $pub['Publication Model'] = $value;
                break;
            }
          }
          break;

        case '246': // varying form of title
        case '247': // former title
        case '250': // edition statement
        case '254': // musical presentation statement
        case '255': // cartographic mathematical data
        case '256': // computer file characteristics
        case '257': // country of producing entity
        case '258': // philatelic issue data
          break;

        case '260': // publication, distribution, etc (imprint)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ((string) $code) {
              case 'a':
                $pub['Published Location'] = $value;
                break;

              case 'b':
                $pub['Publisher'] = $value;
                break;

              case 'c':
                $pub['Publication Date'] = $value;
                break;
            }
          }
          break;

        case '263': // projected publication date
        case '264': // production, publication, distribution, manufacture and copyright notice
        case '270': // Address
          break;

        case '300': // physical description (used here for the page range)
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pages = preg_replace('/^p\. /', '', $codes['a']);
            $pages = preg_replace('/\.$/', '', $pages);
            // A value ending in 'p' is the number of pages, not the page
            // numbers, so it is skipped.
            if (!preg_match('/p$/', $pages)) {
              $pub['Pages'] = $pages;
            }
          }
          break;

        case '500': // general note
          // Previously this stored a stale $value left over from an earlier
          // field; the note text is taken from subfield 'a' instead.
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Notes'] = $codes['a'];
          }
          break;

        case '504': // Bibliography, Etc. Note
          break;

        case '520': // Summary, etc
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Abstract'] = $codes['a'];
          }
          break;

        case '650': // Subject Added Entry-Topical Term
        case '653': // Index Term-Uncontrolled
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['a'])) {
            $pub['Keywords'][] = $codes['a'];
          }
          break;

        case '773': // host item entry
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          foreach ($codes as $code => $value) {
            switch ((string) $code) {
              case 'a':
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                else {
                  $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                }
                break;

              case 't':
                if (preg_match('/Proceedings/i', $value)) {
                  $pub['Series Name'] = preg_replace('/\.$/', '', $value);
                  $pub['Publication Type'][0] = 'Conference Proceedings';
                }
                // NOTE(review): unlike subfield 'a', the journal name is set
                // even for proceedings. Kept as-is; confirm this is intended.
                $pub['Journal Name'] = preg_replace('/\.$/', '', $value);
                break;

              case 'g':
                // Free-text "related parts" string: try progressively looser
                // date patterns, then pull out volume and issue.
                $matches = array();
                if (preg_match('/^(\d\d\d\d)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[1];
                }
                elseif (preg_match('/(.*?)(\.|\s+)\s*(\d+),\s(\d\d\d\d)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[4] . ' ' . $matches[1] . ' ' . $matches[3];
                }
                elseif (preg_match('/\((.*?)(\.|\s+)(\d\d\d\d)\)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[3] . ' ' . $matches[1];
                }
                elseif (preg_match('/^(.*?) (\d\d\d\d)/', $value, $matches)) {
                  $pub['Publication Date'] = $matches[2] . ' ' . $matches[1];
                }
                if (preg_match('/v\. (.*?)(,|\s+)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                }
                if (preg_match('/v\. (.*?)(,|\s+)\((.*?)\)/', $value, $matches)) {
                  $pub['Volume'] = $matches[1];
                  $pub['Issue'] = $matches[3];
                }
                if (preg_match('/no\. (.*?)(\s|$)/', $value, $matches)) {
                  $pub['Issue'] = $matches[1];
                }
                break;

              case 'p':
                $pub['Journal Abbreviation'] = $value;
                break;

              case 'z':
                $pub['ISBN'] = $value;
                break;
            }
          }
          break;

        case '852': // Location (Where is the publication held)
          break;

        case '856': // Electronic Location and Access
          $codes = tripal_pub_remote_search_AGL_get_subfield($xml);
          if (isset($codes['u'])) {
            $pub['URL'] = $codes['u'];
          }
          break;

        default:
          // Unhandled tag: its subfield elements are simply skipped by the
          // main loop.
          break;
      }
    }
  }
  if ($loaded) {
    $xml->close();
  }

  // Build the Dbxref from the database name and accession, then drop the two
  // source keys.
  if (!empty($pub['Publication Accession']) and !empty($pub['Publication Database'])) {
    $pub['Publication Dbxref'] = $pub['Publication Database'] . ":" . $pub['Publication Accession'];
    unset($pub['Publication Accession']);
    unset($pub['Publication Database']);
  }

  // Build the full authors list.
  if (isset($pub['Author List']) and is_array($pub['Author List'])) {
    $names = array();
    foreach ($pub['Author List'] as $author) {
      if (isset($author['valid']) and $author['valid'] == 'N') {
        // Skip non-valid entries. A non-valid entry should have a
        // corresponding corrected entry so we can safely skip it.
        continue;
      }
      if (!empty($author['Collective'])) {
        $names[] = $author['Collective'];
      }
      else {
        $surname = isset($author['Surname']) ? $author['Surname'] : '';
        $initials = isset($author['First Initials']) ? $author['First Initials'] : '';
        $names[] = $surname . ' ' . $initials;
      }
    }
    $pub['Authors'] = implode(', ', $names);
  }
  else {
    // No authors in the record; the key is still present (NULL), as before.
    $pub['Authors'] = NULL;
  }

  // Build the citation.
  $pub['Citation'] = tripal_pub_create_citation($pub);
  $pub['raw'] = $pub_xml;
  return $pub;
}