#!/usr/bin/perl # ngrams.pl - list single- or multi-word phrases from an irc log # Eric Lease Morgan # February 12, 2011 - based on previous work # configure use constant DB => './irclog.db'; # require use Lingua::EN::Ngram; use Lingua::StopWords qw( getStopWords ); use strict; # sanity check my $size = $ARGV[ 0 ]; if ( ! $size ) { print "Usage: $0 \n"; exit; } # initialize my $corpus = ''; my $stopwords = &getStopWords( 'en' ); # create corpus open INPUT, ' < ' . DB or die "Can't open " . DB . ": $!\n"; while ( ) { # clean, parse, and build corpus chop; my ( $datestamp, $name, $text ) = split /\t/, $_; $corpus .= $text . ' '; } # clean up close INPUT; # initialize and count ngrams my $ngram = Lingua::EN::Ngram->new( text => $corpus ); my $ngrams = $ngram->ngram( $size ); # process all the ngrams my $index = 0; foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) { # get the tokens of the phrase my @tokens = split / /, $phrase; # process each token; filter based on it's value my $found = 0; foreach ( @tokens ) { # skip stop words for uni-grams and bi-grams if ( $size == 1 or $size == 2 ) { if ( $$stopwords{ $_ }) { $found = 1; last; } } # skip punctuation if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/ ) { $found = 1; last; } # skip single-letter words if ( length( $_ ) == 1 ) { $found = 1; last; } # skip more punctuation if ( $_ =~ /^'/ ) { $found = 1; last; } } # loop if found an unwanted token next if ( $found ); # don't want single frequency phrases last if ( $$ngrams{ $phrase } == 1 ); # echo print $$ngrams{ $phrase }, "\t$phrase\n"; } # done exit;