#!/usr/bin/perl

# ngrams.pl - count and display the most frequent ngrams in a text
#
# Usage: ngrams.pl <file> <integer>
#   <file>     path of the text file to analyze
#   <integer>  size of the ngrams to count; handed to the ngram method
#              of Lingua::EN::Bigram
#
# Output: at most LENGTH lines of the form "<count>\t<phrase>", most
# frequent phrase first.  Phrases containing punctuation and phrases
# that occur only once are not displayed.

# Eric Lease Morgan
# August 28, 2010 - first cut; for a blog posting
# August 29, 2010 - tweaked to accept command-line input

use strict;
use warnings;

# denote the number of ngrams to display; season to taste
use constant LENGTH => 10;

# require
use Lingua::EN::Bigram;

# sanity check; both arguments are required and the size must be a
# positive integer
my $file = $ARGV[ 0 ];
my $size = $ARGV[ 1 ];
if ( ! $file or ! $size or $size !~ /\A[1-9][0-9]*\z/ ) {

    print "Usage: $0 <file> <integer>\n";
    exit;

}

# slurp; three-argument open with a lexical handle keeps a file name such
# as ">foo" or "cmd |" from being interpreted as an open mode
open my $handle, '<', $file or die "Can't open input: $!\n";
my $text = do { local $/; <$handle> };    # $/ undefined => read whole file
close $handle;

# initialize and count phrases
my $ngrams = Lingua::EN::Bigram->new;
$ngrams->text( $text );
my $frequencies = $ngrams->ngram_count( [ $ngrams->ngram( $size ) ] );

# order phrases by descending frequency; ties are broken alphabetically so
# the output is the same from run to run
my @phrases = sort {
    $frequencies->{ $b } <=> $frequencies->{ $a }
        or
    $a cmp $b
} keys %$frequencies;

# process each phrase
my $index = 0;
foreach my $phrase ( @phrases ) {

    # don't want phrases where any token contains punctuation; the
    # character class has no space in it, so matching the whole phrase is
    # the same as matching each space-separated token in turn
    next if $phrase =~ /[,.?!:;()\-]/;

    # increment; only want LENGTH phrases displayed
    $index++;
    last if $index > LENGTH;

    # don't want single frequency phrases; the list is sorted, so every
    # remaining phrase would be a single frequency phrase too
    last if $frequencies->{ $phrase } == 1;

    # echo
    print $frequencies->{ $phrase }, "\t$phrase\n";

}

# done
exit;