#!/usr/bin/perl # measure.pl - calculate "great ideas coefficient" (and readability scores) against a corpus # Eric Lease Morgan # May 28, 2010 - first investigations # May 31, 2010 - started adding tfidf # define use constant CORPUS => '/disk01/www/html/main/sandbox/great-books/corpus/'; use constant D => 223; use constant IDEAS => '/disk01/www/html/main/sandbox/great-books/etc/ideas.txt'; use constant SOLR => 'http://localhost:210/solr/great-books'; use constant XML => '/disk01/www/html/main/sandbox/great-books/great-books.xml'; # require use Lingua::EN::Fathom; use Lingua::Stem::Snowball; use Lingua::StopWords qw( getStopWords ); use Math::Round; use strict; use WebService::Solr; use XML::XPath; # initialize my $parser = XML::XPath->new( filename => XML ); my $stemmer = Lingua::Stem::Snowball->new( lang => 'en' ); my $solr = WebService::Solr->new( SOLR ); my @ideas = &slurp_words( IDEAS ); my $count = $ARGV[ 0 ]; $| = 1; # sanity check if ( ! $count ) { print "Usage: $0 integer\n"; exit; } # get stop words my @stopwords = (); my $stoplist = getStopWords( 'en' ); foreach my $stopword ( keys %$stoplist ) { push @stopwords, $stopword } # process each book in the XML (index) file my $index = 0; my $books = $parser->find( '//book' ); foreach my $book ( $books->get_nodelist ) { # pass over previously processed books next if ( $book->getAttribute( 'aristocracy' ) ne '' ); # increment and check $index++; last if ( $index > $count ); # build a file name my $identifier = $book->getAttribute( 'id' ); my $file = CORPUS . $identifier . '.txt'; # build list of word stems (library, librarian, libraries, etc.) my $fathom = Lingua::EN::Fathom->new; $fathom->analyse_file( $file ); my %words = $fathom->unique_words; my %stems = (); foreach my $word ( keys %words ) { $stems{ $stemmer->stem( $word ) } += $words{ $word } } # count total stop words; seemingly around 50%, wow! my $total_stopwords = 0; foreach my $stopword ( @stopwords ) { if ( $words{ $stopword } ) { $total_stopwords += $words{ $stopword } } } # calculate readability scores my $number_of_words = $fathom->num_words; my $fog = round( $fathom->fog ); my $kincaid = round( $fathom->kincaid ); my $flesch = round( $fathom->flesch ); # echo them print " identifier: $identifier\n"; print " words: $number_of_words\n"; print " kincaid: $kincaid\n"; print " fog: $fog\n"; print " flesch: $flesch\n"; # update attributes; (would rather create elements?) $book->setAttribute( 'words', $number_of_words ); $book->setAttribute( 'fog', $fog ); $book->setAttribute( 'kincaid', $kincaid ); $book->setAttribute( 'flesch', $flesch ); # calculate coefficients; the really hard work is here my $great_ideas = 0; print " ideas: "; foreach my $idea ( @ideas ) { # process each idea if ( $words{ $idea } ) { # stem the idea and search for it my $idea_stem = $stemmer->stem( $idea ); if ( $idea eq 'universal' ) { $idea_stem = 'universe' } # a hack print $idea, ' (' , $stems{ $idea_stem }; my $d = $solr->search( $idea_stem )->content->{ response }->{ numFound }; next if ( $d == 0 ); # should never be true but is; stop word list needs updating? # calculate tfidf and add it to the coefficient my $tfidf = round( &tfidf( $stems{ $idea_stem }, ( $number_of_words - $total_stopwords ), D, $d ) * 10000 ); $great_ideas += $tfidf; # finish dislay and update print ", $tfidf); "; $book->setAttribute( $idea, $tfidf ); } # update attributes; (would rather create elements) else { $book->setAttribute( $idea, 0 ) } } print "\n"; # display and update print " coefficient: $great_ideas\n"; print "\n"; $book->setAttribute( 'coefficient', $great_ideas ); # save open OUT, " > " . XML or die "Can't open XML ($!)\n"; print OUT $parser->findnodes_as_string( '/' ); close OUT; } # done exit; # calculate tfidf sub tfidf { my $n = shift; # C my $t = shift; # T my $d = shift; # D my $h = shift; # DT my $tfidf = 0; if ( $d == $h ) { $tfidf = ( $n / $t ) } else { $tfidf = ( $n / $t ) * log( $d / $h ) } return $tfidf; } sub slurp_words { my $file = shift; my @words = (); open ( S, " < $file" ) or die "Can't open $file ($!)\n"; while ( ) { chop; next if ( ! $_ ); # blank line next if ( /^#/ ); # comments push @words, $_; } close S; return @words; }