#!/usr/bin/perl

# configuration: DB is the path of the log file that gets read, LENGTH is
# the maximum number of phrases printed
use constant {
	DB     => './irclog.db',
	LENGTH => 10,
};

# require
use Lingua::EN::Ngram;
use strict;

# sanity check: the first command-line argument is the ngram size handed to
# the ngram counter. It must be a positive whole number; a missing, zero, or
# non-numeric value gets the usage message right away, before the log file
# is read.
my $size = $ARGV[ 0 ];
if ( ! defined $size or $size !~ /\A[0-9]+\z/ or $size == 0 ) {

	print "Usage: $0 <integer>\n";
	exit;
	
}

# initialize
my $corpus = '';

# create corpus: every line of DB is split on tabs into datestamp, name, and
# text, and the text fields are concatenated, separated by single spaces
open my $input, '<', DB or die "Can't open " . DB . ": $!\n";
while ( my $line = <$input> ) {

	# clean; chomp only strips a trailing newline, so a final line without
	# one keeps its last character
	chomp $line;
	
	# parse; the limit of 3 keeps any tabs inside the text itself instead of
	# cutting the text short at the first one
	my ( $datestamp, $name, $text ) = split /\t/, $line, 3;
	
	# build corpus; a line with no text field contributes just the separator
	$corpus .= ( defined $text ? $text : '' ) . ' ';
	
}
close $input;

# hand the whole corpus to a new ngram counter, then ask it for the tally of
# $size-word phrases; the result is used below as a hash reference mapping
# each phrase to its count
my $ngram  = Lingua::EN::Ngram->new( 'text', $corpus );
my $ngrams = $ngram->ngram( $size );

# process all the ngrams, most frequent first; phrases with the same count
# are ordered alphabetically so the report is identical from run to run
# (hash key order alone is not stable between runs)
my $index = 0;
foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } or $a cmp $b } keys %$ngrams ) {
	
	# don't want phrases containing punctuation; a space is never part of
	# the character class, so testing the whole phrase gives the same answer
	# as testing each space-separated token
	next if ( $phrase =~ /[,.?!:;()\-]/ );
	
	# increment; only want LENGTH phrases displayed
	$index++;
	last if ( $index > LENGTH );
	
	# don't want single frequency phrases; counts only fall from here on, so
	# nothing later in the list can qualify either
	last if ( $$ngrams{ $phrase } == 1 );
	
	# echo count, a tab, and the phrase
	print $$ngrams{ $phrase }, "\t$phrase\n";
	
}

# done
exit;