#!/usr/bin/perl

# List the most frequent n-grams (phrases of a given number of words) found
# in the text column of a tab-delimited log file.
#
# Usage: <script> <integer>
#
#   <integer>  number of words per phrase (passed to Lingua::EN::Ngram)
#
# Input is read from the file named by DB. Each line is split on tabs and
# the third field is treated as text; the first two fields are ignored
# (presumably a datestamp and a name -- confirm against the log writer).
#
# Output is at most LENGTH lines of "count<TAB>phrase", most frequent first.
# Phrases containing punctuation, and phrases occurring only once, are
# not reported.

# configure
use constant DB     => './irclog.db';
use constant LENGTH => 10;

# require
use Lingua::EN::Ngram;
use strict;
use warnings;

# sanity check; the size must be a positive whole number
my $size = $ARGV[ 0 ];
if ( ! defined $size or $size !~ /\A[1-9][0-9]*\z/ ) {

	print "Usage: $0 <integer>\n";
	exit;

}

# initialize
my $corpus = '';

# create corpus; three-argument open keeps the file name from being
# interpreted as a mode
open( my $input, '<', DB ) or die "Can't open " . DB . ": $!\n";
while ( my $line = <$input> ) {

	# clean; chomp removes only a trailing newline, so a final line
	# lacking one does not lose its last character
	chomp $line;

	# parse; only the third tab-delimited field is of interest
	my ( undef, undef, $text ) = split /\t/, $line;

	# skip lines lacking a text field
	next if ( ! defined $text );

	# build corpus
	$corpus .= $text . ' ';

}
close $input;

# initialize and count ngrams
my $ngram  = Lingua::EN::Ngram->new( text => $corpus );
my $ngrams = $ngram->ngram( $size );

# order phrases by descending frequency; ties are broken alphabetically
# so that output is the same from run to run
my @phrases = sort { $ngrams->{ $b } <=> $ngrams->{ $a } or $a cmp $b } keys %$ngrams;

# process all the ngrams
my $index = 0;
foreach my $phrase ( @phrases ) {

	# don't want phrases with punctuation in any of their tokens
	next if ( $phrase =~ /[,.?!:;()\-]/ );

	# increment; only want LENGTH phrases displayed
	$index++;
	last if ( $index > LENGTH );

	# don't want single frequency phrases; since the list is sorted,
	# everything that follows is single frequency too
	last if ( $ngrams->{ $phrase } == 1 );

	# echo
	print $ngrams->{ $phrase }, "\t$phrase\n";

}

# done
exit;