#!/usr/bin/perl

# bigram.pl - list top words and top phrases

# Eric Lease Morgan <emorgan@nd.edu>
# June 9, 2010 - a quick hack


# configure
use constant CORPUS => './articles.txt';

# require
use Lingua::EN::Bigram;
use Lingua::StopWords qw( getStopWords );
use strict;

# initialize
my $stopwords = &getStopWords( 'en' );
my $text      = &slurp( CORPUS );

# build bigrams
my $bigrams = Lingua::EN::Bigram->new;
$bigrams->text( $text );

# get counts
my $word_count   = $bigrams->word_count;
my $bigram_count = $bigrams->bigram_count;
my $tscore       = $bigrams->tscore;

# list the words according to frequency
print "Word count\n";
print "----------\n";
foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) {

	next if ( $_ =~ /[,.?!:;()\-]/ );
	next if ( $$stopwords{ $_ } );
	print $$word_count{ $_ }, "\t$_\n";

}

# display, sans stop words and punctuation
foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {

	# get the tokens of the bigram
	my ( $first_token, $second_token ) = split / /, $bigram;
	
	# skip stopwords and punctuation
	next if ( $$stopwords{ $first_token } );
	next if ( $first_token =~ /[,.?!:;()\-]/ );
	next if ( $$stopwords{ $second_token } );
	next if ( $second_token =~ /[,.?!:;()\-]/ );
	
	# output
	print "$$tscore{ $bigram }\t"           . 
	      "$$word_count{ $first_token }\t"  . 
	      "$$word_count{ $second_token }\t" . 
	      "$$bigram_count{ $bigram }\t"     . 
	      "$bigram\t\n";

}
# done
exit;


# read the CORPUS
sub slurp {

	# open a file named by the input and return its contents
	my $f = shift;
	my $r;
	open F, $f or die "Can't slurp: $!\n";
	$r = do { local $/; <F> };
	return $r;

}