#!/usr/bin/perl

# bigram.pl - output cooccurances based on t-score

# Eric Lease Morgan <eric_morgan@infomotions.com>
# June 22, 2010 - based on previous work; designed for ALA


# require
use Lingua::EN::Bigram;
use Lingua::StopWords qw( getStopWords );
use strict;

my $corpus = $ARGV[ 0 ];
if ( ! $corpus ) {

	print "Usage: $0 <file>\n";
	exit;
	
}

# initialize
my $stopwords = &getStopWords( 'en' );
my $text      = &slurp( $corpus );

# build bigrams
my $bigrams = Lingua::EN::Bigram->new;
$bigrams->text( $text );

# get counts
my $word_count   = $bigrams->word_count;
my $bigram_count = $bigrams->bigram_count;
my $tscore       = $bigrams->tscore;

# display, sans stop words and punctuation
foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) {

	# get the tokens of the bigram
	my ( $first_token, $second_token ) = split / /, $bigram;
	
	# skip stopwords and punctuation
	next if ( $$stopwords{ $first_token } );
	next if ( $first_token =~ /[,.?!:;()\-']/ );
	next if ( $$stopwords{ $second_token } );
	next if ( $second_token =~ /[,.?!:;()\-']/ );
	
	# output
	print "$$bigram_count{ $bigram }\t$bigram\t\n";

}
# done
exit;


# read the CORPUS
sub slurp {

	# open a file named by the input and return its contents
	my $f = shift;
	my $r;
	open F, $f or die "Can't slurp: $!\n";
	$r = do { local $/; <F> };
	return $r;

}