#!/usr/bin/perl

# ngrams.pl - list single- or multi-word phrases from an irc log

# Eric Lease Morgan <eric_morgan@infomotions.com>
# February 12, 2011 - based on previous work


# configure
use constant DB => './irclog.db';

# require
use Lingua::EN::Ngram;
use Lingua::StopWords qw( getStopWords );
use strict;

# sanity check
my $size = $ARGV[ 0 ];
if ( ! $size ) {

	print "Usage: $0 <integer>\n";
	exit;
	
}

# initialize
my $corpus    = '';
my $stopwords = &getStopWords( 'en' );

# create corpus
open INPUT, ' < ' . DB or die "Can't open " . DB . ": $!\n";
while ( <INPUT> ) {

	# clean, parse, and build corpus
	chop;
	my ( $datestamp, $name, $text ) = split /\t/, $_;
	$corpus .= $text . ' ';
	
}

# clean up
close INPUT;

# initialize and count ngrams
my $ngram  = Lingua::EN::Ngram->new( text => $corpus );
my $ngrams = $ngram->ngram( $size );

# process all the ngrams
my $index = 0;
foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) {
	
	# get the tokens of the phrase
	my @tokens = split / /, $phrase;

	# process each token; filter based on it's value
	my $found = 0;
	foreach ( @tokens ) {
	
		# skip stop words for uni-grams and bi-grams
		if ( $size == 1 or $size == 2 ) {
		
			if ( $$stopwords{ $_ }) {
			
				$found = 1;
				last;
			
			}
		
		}
				
		# skip punctuation
		if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/  ) {
		
			$found = 1;
			last;
			
		}
			
		# skip single-letter words
		if ( length( $_ ) == 1 ) {
		
			$found = 1;
			last;
			
		}
			
		# skip more punctuation
		if ( $_ =~ /^'/ ) {
		
			$found = 1;
			last;
			
		}
			
	}
	
	# loop if found an unwanted token
	next if ( $found );
		
	# don't want single frequency phrases
	last if ( $$ngrams{ $phrase } == 1 );
	
	# echo
	print $$ngrams{ $phrase }, "\t$phrase\n";
	
}

# done
exit;