#!/usr/bin/perl

# search.pl - query id.loc.gov for authorities
#
# Usage: search.pl <list> <text|db>
#
#   <list>     tab-delimited file; the first four columns are read as
#              id, brief name, date, and full name
#   <text|db>  output style: 'text' prints a labelled stanza per record,
#              'db' prints one tab-delimited row per record (with a header)
#
# For each record the script searches id.loc.gov up to TRIES times: first with
# the brief name alone, then with the brief name plus the leading digits of
# the date. A record is reported as soon as a search returns zero or exactly
# one entry, or after the last try.

# Eric Lease Morgan
# March 24, 2016 - first cut
# March 25, 2016 - added second search, which added a lot of logic

# configure
use constant TEMPLATE => 'http://id.loc.gov/search/?q=##QUERY##&q=&format=atom';
use constant CURL     => '/usr/bin/curl';
use constant TRIES    => 2;

# require
use strict;
use warnings;
use XML::XPath;

# sanity check
my $db     = $ARGV[ 0 ];
my $output = $ARGV[ 1 ];
if ( ! $db or ! $output ) {

    print "Usage: $0 <list> <text|db>\n";
    exit;

}

# reject an unknown output style before doing any (network) work
if ( $output ne 'text' and $output ne 'db' ) {

    print "Error: Unknown value for output ($output). Call Eric.\n";
    exit;

}

# initialize
# NOTE(review): input fields are read as raw bytes but written through a
# :utf8 layer; if the list is UTF-8 encoded, non-ASCII names may come out
# double-encoded - confirm the list's encoding before changing this.
binmode STDOUT, ':utf8';
if ( $output eq 'db' ) { print "id\tbrief name\tdate\tfull name\ttries\ttotal\tauthorized name\tURI\tLCCN\n" }

# process each record in the list
open( my $list, '<', $db ) or die "Can't open database file ($db): $!. Call Eric.\n";
RECORD: while ( my $line = <$list> ) {

    # parse; chomp (not chop) so a final line lacking a newline stays intact
    chomp $line;
    my ( $id, $briefName, $date, $fullName ) = split /\t/, $line;
    for ( $id, $briefName, $date, $fullName ) { $_ = '' unless defined }

    # monitor processing
    if ( $output eq 'db' ) { print STDERR "$id\n" }

    # skip items with single quotes or square bracket(s) or parentheses; weird
    next RECORD if ( $briefName =~ /'/ );
    next RECORD if ( $briefName =~ /\[/ );
    next RECORD if ( $briefName =~ /\(/ );

    # try multiple times to find a single match
    for my $tries ( 1 .. TRIES ) {

        # simple query with only simple name
        my $query = $briefName;

        # query with name and date
        if ( $tries == 2 ) {

            # remove beginning non-digits
            $date =~ s/^\D+//;

            # remove everything after the initial digits
            $date =~ s/(\d+)\W+.*/$1/;

            # build a query, even if the date is empty
            $query = "$briefName $date";

        }

        # search
        my $url     = build_url( $query );
        my $results = fetch( $url );
        if ( ! defined $results ) {

            print STDERR "Warning: search failed for record $id ($url). Skipping.\n";
            next RECORD;

        }

        # evaluate; the parser dies on malformed XML, so trap that
        my $total          = 0;
        my $authorizedName = '';
        my $uri            = '';
        my $lccn           = '';
        my $done           = 0;
        my $parsed         = eval {

            my $xpath   = XML::XPath->new( xml => $results );
            my $entries = $xpath->find( '/feed/entry' );
            $total      = $entries->size;

            # if necessary, parse
            if ( $total == 1 ) {

                # extract authorized headings; the uri is taken from the first link of the entry
                $authorizedName = $xpath->findvalue( '/feed/entry/title' );
                $uri            = $xpath->findvalue( '@href', $xpath->find( '/feed/entry/link' )->shift )->value;

                # the LCCN is the last path segment of the uri
                my @elements = split /\//, $uri;
                $lccn        = @elements ? $elements[ -1 ] : '';
                $done        = 1;

            }

            1;

        };
        if ( ! $parsed ) {

            print STDERR "Warning: unparsable response for record $id ($url). Skipping.\n";
            next RECORD;

        }

        # branch according to desired OUTPUT
        if ( $total == 0 or $done or $tries == TRIES ) {

            # human-readable output
            if ( $output eq 'text' ) {

                # echo
                print " id = $id\n";
                print " try = $tries\n";
                print " brief name = $briefName\n";
                print " date = $date\n";
                print " full name = $fullName\n";
                print " url = $url\n";
                print " total = $total\n";
                print " authorized name = $authorizedName\n";
                print " uri = $uri\n";
                print " lccn = $lccn\n";
                print "\n";

            }

            # database output
            else {

                print "$id\t$briefName\t$date\t$fullName\t$tries\t$total\t$authorizedName\t$uri\t$lccn\n";

            }

        }

        # stop searching if none were found
        last if ( $total == 0 );

        # stop searching if done
        last if ( $done );

    }

}
close $list;

# done
exit;


# build_url( $query ) - return TEMPLATE with ##QUERY## replaced by the
# percent-encoded query; spaces become "+", as in the original URLs
sub build_url {

    my ( $query ) = @_;

    # work on bytes so every character encodes to %XX pairs
    utf8::encode( $query ) if utf8::is_utf8( $query );

    # encode everything except RFC 3986 unreserved characters and the space
    $query =~ s/([^A-Za-z0-9\-_.~ ])/sprintf( '%%%02X', ord $1 )/ge;
    $query =~ tr/ /+/;

    my $url = TEMPLATE;
    $url =~ s/##QUERY##/$query/;
    return $url;

}


# fetch( $url ) - run curl on the url and return the response body, or
# nothing (undef) when curl cannot be started, exits non-zero, or returns
# an empty body
sub fetch {

    my ( $url ) = @_;

    # list-form pipe open: no shell is involved, so the url needs no quoting;
    # --silent replaces the old 2>/dev/null, --fail turns HTTP errors into a
    # non-zero exit instead of an HTML error page
    open( my $curl, '-|', CURL, '--silent', '--fail', $url ) or return;
    binmode $curl;
    my $body = do { local $/; <$curl> };

    # closing a pipe reports the child's exit status
    my $ok = close $curl;
    return unless ( $ok and defined $body and length $body );
    return $body;

}