#!/usr/bin/perl

# ngrams.pl - count and display the most frequent ngrams in a text

# Eric Lease Morgan <eric_morgan@infomotions.com>
# August 28, 2010 - first cut; for a blog posting
# August 29, 2010 - tweaked to accept command-line input


# denote the number of ngrams to display; season to taste
use constant LENGTH => 10;

# require
use Lingua::EN::Bigram;
use strict;

# sanity check; two arguments are required: the file to read and the
# ngram size, which must be a positive integer
my $file = $ARGV[ 0 ];
my $size = $ARGV[ 1 ];
if ( ! defined $file or $file eq '' or ! defined $size or $size !~ /\A[0-9]+\z/ or $size < 1 ) {

	# definedness/emptiness is tested instead of truth so a file that is
	# literally named "0" is still accepted; a usage error is a failure,
	# so it is reported on STDERR with a non-zero exit status
	print STDERR "Usage: $0 <file> <integer>\n";
	exit 1;
	
}

# slurp; the three-argument open with an explicit read mode keeps a file
# name such as ">out" or "cmd |" from being interpreted as a mode or a pipe,
# and the lexical handle replaces the global bareword F
open my $fh, '<', $file or die "Can't open input: $!\n";
my $text = do { local $/; <$fh> };    # undefined $/ reads the whole file at once
close $fh;

# create the bigram object, hand it the text, extract every ngram of the
# requested size, and then count them; $frequencies is dereferenced later
# on as a hash of phrase => count
my $bigram = Lingua::EN::Bigram->new;
$bigram->text( $text );
my @phrases     = $bigram->ngram( $size );
my $frequencies = $bigram->ngram_count( \@phrases );

# process each phrase, most frequent first; phrases with equal counts are
# ordered alphabetically so the output does not vary with hash ordering
my $index = 0;
foreach my $phrase ( sort { $$frequencies{ $b } <=> $$frequencies{ $a } || $a cmp $b } keys %$frequencies ) {
	
	# don't want single frequency phrases; the list is sorted by descending
	# frequency, so nothing of interest can follow the first one
	last if ( $$frequencies{ $phrase } == 1 );
	
	# don't want phrases containing punctuation; the pattern can not match
	# the spaces between tokens, so the whole phrase is tested at once
	next if ( $phrase =~ /[,.?!:;()\-]/ );
	
	# increment; only want LENGTH phrases displayed
	$index++;
	last if ( $index > LENGTH );
	
	# echo
	print $$frequencies{ $phrase }, "\t$phrase\n";
	
}

# done
exit;