#!/usr/bin/perl

# evaluate.pl - do simple tabulation against a text and calcualte it Great Ideas Coefficient

# Eric Lease Morgan <emorgan@nd.edu>
# July 6, 2010 - for THATCamp @ DH2010
# July 7, 2010 - included Great Ideas comparison, but it seems incorrect


# configure
use constant READABILITY => ( '12-15', '55' );
use constant D           => 223;

# require
use Lingua::EN::Bigram;
use Lingua::EN::Fathom;
use Lingua::Stem::Snowball;
use Lingua::StopWords qw( getStopWords );
use Math::Round;
use strict;

# sanity check
my $corpus = $ARGV[ 0 ];
if ( ! $corpus ) {

	print "Usage: $0 <file>\n";
	exit;
	
}

# initialize
my $stopwords = &getStopWords( 'en' );
my $text      = &slurp( $corpus );
my $stemmer = Lingua::Stem::Snowball->new( lang => 'en' );

# get stop words
my @stopwords = ();
my $stoplist  = getStopWords( 'en' );
foreach my $stopword ( keys %$stoplist ) { push @stopwords, $stopword }

# build list of stemmed words
my $fathom = Lingua::EN::Fathom->new;
$fathom->analyse_file( $corpus );
my %words = $fathom->unique_words;
my %stems = ();
foreach my $word ( keys %words ) { $stems{ $stemmer->stem( $word ) } += $words{ $word } }

# count total stop words; seemingly around 50%, wow!
my $total_stopwords = 0;
foreach my $stopword ( @stopwords ) {

	if ( $words{ $stopword } ) { $total_stopwords += $words{ $stopword } }

}

# save calculations for future reference
my $number_of_words = $fathom->num_words;
my $fog             = round( $fathom->fog );
my $kincaid         = round( $fathom->kincaid );
my $flesch          = round( $fathom->flesch );

# build bigrams and get counts
my $bigrams = Lingua::EN::Bigram->new;
$bigrams->text( $text );
my $word_count   = $bigrams->word_count;
my $bigram_count = $bigrams->bigram_count;

# list the words according to frequency
print "\n";
print "Top 50 words: Word (Count)\n";
print "--------------------------\n";
my $index = 0;
foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) {

	next if ( length( $_ ) < 3 );
	next if ( $_ =~ /[,.?!:;()\-']/ );
	next if ( $$stopwords{ $_ } );
	
	$index++;
	last if ( $index > 50 );
	print "$_ (" , $$word_count{ $_ }, '); ';

}
print "\n";
print "\n";

# build bigrams and get counts
my $bigrams = Lingua::EN::Bigram->new;
$bigrams->text( $text );
my $word_count   = $bigrams->word_count;
my $bigram_count = $bigrams->bigram_count;
my $tscore       = $bigrams->tscore;

# display, sans stop words and punctuation
print "\n";
print "Top 50 bi-grams: Bigram (Count)\n";
print "-------------------------------\n";
$index = 0;
foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {

	# get the tokens of the bigram
	my ( $first_token, $second_token ) = split / /, $bigram;
	
	# skip stopwords and punctuation
	next if ( $$stopwords{ $first_token } );
	next if ( $first_token =~ /[,.?!:;()\-']/ );
	next if ( $$stopwords{ $second_token } );
	next if ( $second_token =~ /[,.?!:;()\-']/ );
	
	$index++;
	last if ( $index > 50 );
	# output
	print "$bigram (" , $$bigram_count{ $bigram }, '); ';

}
print "\n";
print "\n";
print "\n";

# display readability
my @readability = READABILITY;
print "Readability: Type (Score, Index)\n";
print "--------------------------------\n";
print "Grade level ($kincaid-$fog, ", $readability[ 0 ], "); Flesch: ($flesch, ", $readability[ 1 ], ")";
print "\n";
print "\n";
print "\n";

# process and display each great idea
my @ideas = ();
print "Great ideas: Idea (Coefficient, Index)\n";
print "--------------------------------------\n";
while ( <DATA> ) { 

	chop;
	my ( $idea, $index, $d ) = split /\t/, $_;
	my $idea_stem = $stemmer->stem( $idea );
	if ( $stems{ $idea_stem } ) {
	
		my $tfidf = round( &tfidf( $stems{ $idea_stem }, $number_of_words, D, $d ) * 1000 );
		if ( $tfidf ) { print "$idea ($tfidf, $index); "; }
		
	}
	
}
print "\n";
print "\n";

# done
exit;


# read the CORPUS
sub slurp {

	# open a file named by the input and return its contents
	my $f = shift;
	my $r;
	open F, $f or die "Can't slurp: $!\n";
	$r = do { local $/; <F> };
	return $r;

}


# calculate relevance
sub tfidf {

	my $n = shift;  # C
	my $t = shift;  # T
	my $d = shift;  # D
	my $h = shift;  # DT
	
	my $tfidf = 0;
	
	if ( $d == $h ) { $tfidf = ( $n / $t ) }
	else { $tfidf = ( $n / $t ) * log( $d / $h ) }
	
	return $tfidf;

}


# great ideas: idea, average coefficient score, total documents containing the idea
__DATA__
angel	2	42
animal	8	101
aristocracy	1	7
art	3	181
astronomy	0	17
beauty	2	136
being	225	223
cause	2	200
chance	1	118
change	1	194
citizen	4	86
constitution	4	91
contingency	0	16
convention	1	47
cosmology	0	0
courage	2	90
custom	2	96
death	3	173
definition	4	69
democracy	2	17
desire	2	178
despotism	1	29
dialectic	1	43
duty	5	121
education	2	68
element	3	94
emotion	2	32
equality	2	165
eternity	2	79
evil	4	136
evolution	0	7
experience	3	115
family	2	83
fate	3	78
form	2	204
god	6	181
good	3	207
government	5	108
habit	1	110
happiness	3	135
history	2	90
honor	2	37
hypothesis	1	27
idea	7	99
imagination	2	117
immortality	1	58
induction	1	16
infinity	1	17
judgment	2	113
justice	3	129
knowledge	4	153
labor	3	30
language	2	129
law	5	170
liberty	3	76
life	2	202
logic	1	44
love	5	180
man	2	215
many	0	217
mathematics	1	36
matter	1	196
mechanics	0	33
medicine	4	68
memory	1	110
metaphysics	8	39
mind	2	195
monarchy	1	23
nature	2	208
necessity	2	116
oligarchy	1	6
one	0	222
opinion	3	153
opposition	2	123
other	0	222
pain	3	145
particular	3	143
peace	3	129
philosophy	4	86
physics	1	82
pleasure	3	149
poetry	1	49
principle	5	124
progress	2	74
prophecy	1	30
prudence	1	31
punishment	3	89
quality	3	114
quantity	4	59
reasoning	1	206
relation	5	140
religion	3	62
revolution	1	43
rhetoric	2	34
same	1	217
science	4	92
sense	47	167
sign	2	101
sin	3	82
slavery	1	31
soul	4	152
space	3	74
state	3	202
symbol	1	28
temperance	2	94
theology	1	25
time	0	222
truth	2	179
tyranny	2	37
universal	0	128
vice	2	70
virtue	3	151
war	3	141
wealth	2	86
will	1	221
wisdom	2	119
world	2	193