#!/usr/bin/perl

# index.pl - index the corpus of great books
#
# Reads the catalog of books from an XML file, loads the plain-text version
# of each book from the corpus directory, and adds one document per book
# (id, creator, title, url, fulltext) to a Solr index.

# Eric Lease Morgan
# May 30, 2010 - first investigations


# define
use constant SOLR   => 'http://localhost:8983/solr/great-books';
use constant XML    => '/var/www/html/main/sandbox/great-books/great-books.xml';
use constant CORPUS => '/var/www/html/main/sandbox/great-books/corpus/';

# require
use strict;
use warnings;
use WebService::Solr;
use XML::XPath;

# initialize
my $solr   = WebService::Solr->new( SOLR );
my $parser = XML::XPath->new( filename => XML );
my $books  = $parser->find( '//book' );

# process each book listed in the catalog
foreach my $book ( $books->get_nodelist ) {

    # get metadata
    my $identifier = $book->getAttribute( 'id' );
    my $title      = $book->findvalue( 'title' );
    my $creator    = $book->findvalue( 'author' );
    my $url        = $book->findvalue( 'local_url' );

    # get full text; the file is named after the book's identifier
    my $filename = CORPUS . $identifier . '.txt';
    my $fulltext = escape_entities( slurp( $filename ) );

    # echo
    print " identifier: $identifier\n";
    print " creator: $creator\n";
    print " title: $title\n";
    print " url: $url\n";
    print ' length of text: ' , length( $fulltext ), "\n";
    print "\n";

    # create solr/lucene fields; the values are interpolated into strings
    # on purpose (presumably because findvalue can return an object rather
    # than a plain scalar -- confirm against the XML::XPath documentation)
    my $solr_id       = WebService::Solr::Field->new( id       => "$identifier" );
    my $solr_creator  = WebService::Solr::Field->new( creator  => "$creator" );
    my $solr_title    = WebService::Solr::Field->new( title    => "$title" );
    my $solr_url      = WebService::Solr::Field->new( url      => "$url" );
    my $solr_fulltext = WebService::Solr::Field->new( fulltext => "$fulltext" );

    # fill up a document
    my $doc = WebService::Solr::Document->new;
    $doc->add_fields(( $solr_id, $solr_creator, $solr_title, $solr_url, $solr_fulltext ));

    # save
    $solr->add( $doc );

}

# done; make the added documents visible to searches
$solr->commit;
exit;


# escape_entities - replace the five XML special characters with entities
#
# Input:   a string
# Returns: a copy of the string with & < > " ' replaced by
#          &amp; &lt; &gt; &quot; &apos;
sub escape_entities {

    # get the input
    my $s = shift;

    # escape; the ampersand must go first so that the ampersands introduced
    # by the later replacements are not escaped a second time
    $s =~ s/&/&amp;/g;
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s/"/&quot;/g;
    $s =~ s/'/&apos;/g;

    # done
    return $s;

}


# slurp - open a file named by the input and return its contents
#
# Input:   the name of a file
# Returns: the entire content of the file as a single string
# Dies:    with "Can't slurp: <reason>" when the file cannot be opened
sub slurp {

    my $f = shift;

    # three-argument open with a lexical handle, so the file name can never
    # be interpreted as an open mode and no global handle is left behind
    open my $fh, '<', $f or die "Can't slurp: $!\n";

    # undefine the input record separator locally to read everything at once
    my $r = do { local $/; <$fh> };

    close $fh;
    return $r;

}