#!/usr/bin/perl

# build-corpus.pl - create a corpus of the great books complete with indexes
#
# Reads a bookmark file, derives an identifier and file name for every
# bookmarked book, writes an XML index and an HTML index of the books, and
# (optionally, see MIRROR) saves a local copy of each text in the corpus
# directory.

# Eric Lease Morgan
# May 27, 2010 - first investigations
# May 28, 2010 - started downloading texts and creating indexes


# require
use strict;
use warnings;
use XML::XPath;
use LWP::UserAgent;
use CGI;

# define
use constant BOOKMARKS => '/disk01/www/html/main/sandbox/great-books/etc/bookmarks.xml';
use constant HTTPROOT  => 'http://infomotions.com/sandbox/great-books/corpus/';
use constant CORPUS    => '/disk01/www/html/main/sandbox/great-books/corpus/';
use constant HTML      => '/disk01/www/html/main/sandbox/great-books/great-books.html';
use constant XML       => '/disk01/www/html/main/sandbox/great-books/great-books.xml';

# The original loop executed an unconditional "next" just before the
# download step, so no book was ever fetched. That behaviour is kept as the
# default; set MIRROR to a true value to (re)download the texts.
use constant MIRROR    => 0;

# build a list of books from a bookmark file; keys are the HREF attributes
# of the A elements, values are the anchors' text
my $parser  = XML::XPath->new( filename => BOOKMARKS );
my $anchors = $parser->find( '//A' );
my %books   = ();
foreach my $anchor ( $anchors->get_nodelist ) {
    $books{ $anchor->getAttribute( 'HREF' ) } = $anchor->string_value;
}

# initialize
my $ua   = LWP::UserAgent->new;
my $cgi  = CGI->new;
my $list = '';

# NOTE(review): the XML markup in this script is reconstructed; the element
# names (great-books, book, author, title, local_url, original_url) are
# assumptions -- confirm them against whatever reads great-books.xml.
my $xml = "<great-books>\n";

# process each book, ordered by bookmark text
foreach my $original_url ( sort { $books{ $a } cmp $books{ $b } } keys %books ) {

    # get the book's metadata; the bookmark text is split on " / " into
    # author and title
    my ( $author, $title ) = split / \/ /, $books{ $original_url };

    # a bookmark without the separator would otherwise leave undefined values
    $author = '' unless defined $author;
    $title  = '' unless defined $title;

    # normalize the title: capitalize each word, then undo the "'S" that the
    # capitalization produces for possessives
    $title =~ s/(\w+)/\u\L$1/g;
    $title =~ s/'S/'s/g;

    # generate meaningful file name from the first word of the author, the
    # first word of the title (minus its first apostrophe), and a checksum
    my $author_word = first_word( $author );
    my $title_word  = first_word( $title );
    $title_word =~ s/'//;
    my $integer    = unpack( "%32C*", "$author$title" ) % 65535;
    my $identifier = "$author_word-$title_word-$integer";
    my $filename   = "$identifier.txt";
    my $local_url  = HTTPROOT . $filename;

    # echo
    print " author: $author\n";
    print " title: $title\n";
    print " original url: $original_url\n";
    print " identifier: $identifier\n";
    print " filename: $filename\n";
    print " local url: $local_url\n";
    print "\n";

    # build xml and html lists; text is escaped so characters such as "&"
    # and "<" in a name, title, or URL cannot break the generated markup
    $xml .= '  <book>'
          . '<author>'       . $cgi->escapeHTML( $author )       . '</author>'
          . '<title>'        . $cgi->escapeHTML( $title )        . '</title>'
          . '<local_url>'    . $cgi->escapeHTML( $local_url )    . '</local_url>'
          . '<original_url>' . $cgi->escapeHTML( $original_url ) . '</original_url>'
          . "</book>\n";
    $list .= $cgi->li(
        $cgi->escapeHTML( "$author / $title" ) . ' ('
        . $cgi->a( { href => $original_url }, "original location" )
        . ' | '
        . $cgi->a( { href => $local_url }, "local mirror" )
        . ')'
    );

    # mirror the book's content
    next unless MIRROR;
    mirror_book( $ua, $original_url, $filename );

}

# save xml
$xml .= "</great-books>\n";
write_file( XML, $xml );

# save html
my $html = template();
$html =~ s/##LIST##/$cgi->ol( $list )/e;
write_file( HTML, $html );

# done
exit;


# first_word - return the lower-cased text before the first space of the
# given string, or the empty string when there is none
sub first_word {

    my ( $text ) = @_;
    my ( $word ) = split / /, $text;
    return defined $word ? lc $word : '';

}


# mirror_book - download a URL with the given user agent and save the
# response body under the given file name in the corpus directory; a failed
# request is reported with a warning and nothing is written, so an error
# page never ends up in the corpus. Returns 1 on success and 0 on failure.
sub mirror_book {

    my ( $agent, $url, $filename ) = @_;

    my $response = $agent->get( $url );
    unless ( $response->is_success ) {
        warn "Can't fetch $url (" . $response->status_line . ")\n";
        return 0;
    }

    write_file( CORPUS . $filename, $response->content );
    return 1;

}


# write_file - replace the contents of the file at the given path with the
# given string; dies when the file can not be opened, written, or closed
sub write_file {

    my ( $path, $content ) = @_;

    open my $handle, '>', $path or die "Can't open $path ($!)\n";
    print {$handle} $content    or die "Can't write $path ($!)\n";
    close $handle               or die "Can't close $path ($!)\n";
    return;

}


# template - return the HTML page skeleton; the ##LIST## token marks where
# the caller inserts the list of books
#
# NOTE(review): the HTML tags are reconstructed around the surviving text;
# confirm the intended page structure. The heredoc is not interpolated, so
# the "@" in the e-mail address needs no escaping.
sub template {

    return <<'EOF';
<html>
<head>
<title>Great Books of the Western World</title>
</head>
<body>

<h1>Great Books of the Western World</h1>

<p>This is the beginnings of a list. (Librarians love lists.) Specifically, it is a list of the Great Books of the Western World. The ultimate goal is to create a corpus of "great books", do some text analysis against them, and actually measure the "greatness" of each. The list is incomplete -- about 191 of the 250 some odd titles:</p>

##LIST##

<p>
Creator: Eric Lease Morgan &lt;eric_morgan@infomotions.com&gt;<br />
Date created: May 28, 2010<br />
Date updated: May 28, 2010<br />
URL: http://infomotions.com/sandbox/great-books/
</p>

</body>
</html>
EOF

}