#!/usr/bin/perl # evaluate.pl - do simple tabulation against a text and calcualte it Great Ideas Coefficient # Eric Lease Morgan # July 6, 2010 - for THATCamp @ DH2010 # July 7, 2010 - included Great Ideas comparison, but it seems incorrect # configure use constant READABILITY => ( '12-15', '55' ); use constant D => 223; # require use Lingua::EN::Bigram; use Lingua::EN::Fathom; use Lingua::Stem::Snowball; use Lingua::StopWords qw( getStopWords ); use Math::Round; use strict; # sanity check my $corpus = $ARGV[ 0 ]; if ( ! $corpus ) { print "Usage: $0 \n"; exit; } # initialize my $stopwords = &getStopWords( 'en' ); my $text = &slurp( $corpus ); my $stemmer = Lingua::Stem::Snowball->new( lang => 'en' ); # get stop words my @stopwords = (); my $stoplist = getStopWords( 'en' ); foreach my $stopword ( keys %$stoplist ) { push @stopwords, $stopword } # build list of stemmed words my $fathom = Lingua::EN::Fathom->new; $fathom->analyse_file( $corpus ); my %words = $fathom->unique_words; my %stems = (); foreach my $word ( keys %words ) { $stems{ $stemmer->stem( $word ) } += $words{ $word } } # count total stop words; seemingly around 50%, wow! my $total_stopwords = 0; foreach my $stopword ( @stopwords ) { if ( $words{ $stopword } ) { $total_stopwords += $words{ $stopword } } } # save calculations for future reference my $number_of_words = $fathom->num_words; my $fog = round( $fathom->fog ); my $kincaid = round( $fathom->kincaid ); my $flesch = round( $fathom->flesch ); # build bigrams and get counts my $bigrams = Lingua::EN::Bigram->new; $bigrams->text( $text ); my $word_count = $bigrams->word_count; my $bigram_count = $bigrams->bigram_count; # list the words according to frequency print "\n"; print "Top 50 words: Word (Count)\n"; print "--------------------------\n"; my $index = 0; foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) { next if ( length( $_ ) < 3 ); next if ( $_ =~ /[,.?!:;()\-']/ ); next if ( $$stopwords{ $_ } ); $index++; last if ( $index > 50 ); print "$_ (" , $$word_count{ $_ }, '); '; } print "\n"; print "\n"; # build bigrams and get counts my $bigrams = Lingua::EN::Bigram->new; $bigrams->text( $text ); my $word_count = $bigrams->word_count; my $bigram_count = $bigrams->bigram_count; my $tscore = $bigrams->tscore; # display, sans stop words and punctuation print "\n"; print "Top 50 bi-grams: Bigram (Count)\n"; print "-------------------------------\n"; $index = 0; foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) { # get the tokens of the bigram my ( $first_token, $second_token ) = split / /, $bigram; # skip stopwords and punctuation next if ( $$stopwords{ $first_token } ); next if ( $first_token =~ /[,.?!:;()\-']/ ); next if ( $$stopwords{ $second_token } ); next if ( $second_token =~ /[,.?!:;()\-']/ ); $index++; last if ( $index > 50 ); # output print "$bigram (" , $$bigram_count{ $bigram }, '); '; } print "\n"; print "\n"; print "\n"; # display readability my @readability = READABILITY; print "Readability: Type (Score, Index)\n"; print "--------------------------------\n"; print "Grade level ($kincaid-$fog, ", $readability[ 0 ], "); Flesch: ($flesch, ", $readability[ 1 ], ")"; print "\n"; print "\n"; print "\n"; # process and display each great idea my @ideas = (); print "Great ideas: Idea (Coefficient, Index)\n"; print "--------------------------------------\n"; while ( ) { chop; my ( $idea, $index, $d ) = split /\t/, $_; my $idea_stem = $stemmer->stem( $idea ); if ( $stems{ $idea_stem } ) { my $tfidf = round( &tfidf( $stems{ $idea_stem }, $number_of_words, D, $d ) * 1000 ); if ( $tfidf ) { print "$idea ($tfidf, $index); "; } } } print "\n"; print "\n"; # done exit; # read the CORPUS sub slurp { # open a file named by the input and return its contents my $f = shift; my $r; open F, $f or die "Can't slurp: $!\n"; $r = do { local $/; }; return $r; } # calculate relevance sub tfidf { my $n = shift; # C my $t = shift; # T my $d = shift; # D my $h = shift; # DT my $tfidf = 0; if ( $d == $h ) { $tfidf = ( $n / $t ) } else { $tfidf = ( $n / $t ) * log( $d / $h ) } return $tfidf; } # great ideas: idea, average coefficient score, total documents containing the idea __DATA__ angel 2 42 animal 8 101 aristocracy 1 7 art 3 181 astronomy 0 17 beauty 2 136 being 225 223 cause 2 200 chance 1 118 change 1 194 citizen 4 86 constitution 4 91 contingency 0 16 convention 1 47 cosmology 0 0 courage 2 90 custom 2 96 death 3 173 definition 4 69 democracy 2 17 desire 2 178 despotism 1 29 dialectic 1 43 duty 5 121 education 2 68 element 3 94 emotion 2 32 equality 2 165 eternity 2 79 evil 4 136 evolution 0 7 experience 3 115 family 2 83 fate 3 78 form 2 204 god 6 181 good 3 207 government 5 108 habit 1 110 happiness 3 135 history 2 90 honor 2 37 hypothesis 1 27 idea 7 99 imagination 2 117 immortality 1 58 induction 1 16 infinity 1 17 judgment 2 113 justice 3 129 knowledge 4 153 labor 3 30 language 2 129 law 5 170 liberty 3 76 life 2 202 logic 1 44 love 5 180 man 2 215 many 0 217 mathematics 1 36 matter 1 196 mechanics 0 33 medicine 4 68 memory 1 110 metaphysics 8 39 mind 2 195 monarchy 1 23 nature 2 208 necessity 2 116 oligarchy 1 6 one 0 222 opinion 3 153 opposition 2 123 other 0 222 pain 3 145 particular 3 143 peace 3 129 philosophy 4 86 physics 1 82 pleasure 3 149 poetry 1 49 principle 5 124 progress 2 74 prophecy 1 30 prudence 1 31 punishment 3 89 quality 3 114 quantity 4 59 reasoning 1 206 relation 5 140 religion 3 62 revolution 1 43 rhetoric 2 34 same 1 217 science 4 92 sense 47 167 sign 2 101 sin 3 82 slavery 1 31 soul 4 152 space 3 74 state 3 202 symbol 1 28 temperance 2 94 theology 1 25 time 0 222 truth 2 179 tyranny 2 37 universal 0 128 vice 2 70 virtue 3 151 war 3 141 wealth 2 86 will 1 221 wisdom 2 119 world 2 193