#!/usr/bin/perl # n-grams.pl - list top 10 bi-grams from a text ordered by tscore, and # list top 10 tri-grams and 4-grams ordered by number of occurances # Eric Lease Morgan # June 18, 2009 - first implementation # June 19, 2009 - tweaked # August 22, 2009 - added tri-grams and 4-grams # require use lib '../lib'; use Lingua::EN::Bigram; use Lingua::StopWords qw( getStopWords ); use strict; # initialize my $stopwords = &getStopWords( 'en' ); # sanity check my $file = $ARGV[ 0 ]; if ( ! $file ) { print "Usage: $0 \n"; exit; } # slurp open F, $file or die "Can't open input: $!\n"; my $text = do { local $/; }; close F; # build n-grams my $ngrams = Lingua::EN::Bigram->new; $ngrams->text( $text ); # get bi-gram counts my $bigram_count = $ngrams->bigram_count; my $tscore = $ngrams->tscore; # display top ten bi-grams, sans stop words and punctuation my $index = 0; print "Bi-grams (T-Score, count, bi-gram)\n"; foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) { # get the tokens of the bigram my ( $first_token, $second_token ) = split / /, $bigram; # skip stopwords and punctuation next if ( $$stopwords{ $first_token } ); next if ( $first_token =~ /[,.?!:;()\-]/ ); next if ( $$stopwords{ $second_token } ); next if ( $second_token =~ /[,.?!:;()\-]/ ); # increment $index++; last if ( $index > 10 ); # output print "$$tscore{ $bigram }\t" . "$$bigram_count{ $bigram }\t" . "$bigram\t\n"; } print "\n"; # get tri-gram counts my $trigram_count = $ngrams->trigram_count; # process the first top 10 tri-grams $index = 0; print "Tri-grams (count, tri-gram)\n"; foreach my $trigram ( sort { $$trigram_count{ $b } <=> $$trigram_count{ $a } } keys %$trigram_count ) { # get the tokens of the bigram my ( $first_token, $second_token, $third_token ) = split / /, $trigram; # skip punctuation next if ( $first_token =~ /[,.?!:;()\-]/ ); next if ( $second_token =~ /[,.?!:;()\-]/ ); next if ( $third_token =~ /[,.?!:;()\-]/ ); # skip stopwords; results are often more interesting if these are commented out #next if ( $$stopwords{ $first_token } ); #next if ( $$stopwords{ $second_token } ); #next if ( $$stopwords{ $third_token } ); # increment $index++; last if ( $index > 10 ); # echo print $$trigram_count{ $trigram }, "\t$trigram\n"; } print "\n"; # get quad-gram counts my $quadgram_count = $ngrams->quadgram_count; # process the first top 10 tri-grams $index = 0; print "Quad-grams (count, quad-gram)\n"; foreach my $quadgram ( sort { $$quadgram_count{ $b } <=> $$quadgram_count{ $a } } keys %$quadgram_count ) { # get the tokens of the bigram my ( $first_token, $second_token, $third_token, $fourth_token ) = split / /, $quadgram; # skip punctuation next if ( $first_token =~ /[,.?!:;()\-]/ ); next if ( $second_token =~ /[,.?!:;()\-]/ ); next if ( $third_token =~ /[,.?!:;()\-]/ ); next if ( $fourth_token =~ /[,.?!:;()\-]/ ); # skip stopwords; results are often more interesting if these are commented out #next if ( $$stopwords{ $first_token } ); #next if ( $$stopwords{ $second_token } ); #next if ( $$stopwords{ $third_token } ); #next if ( $$stopwords{ $fourth_token } ); # increment $index++; last if ( $index > 10 ); # echo print $$quadgram_count{ $quadgram }, "\t$quadgram\n"; } print "\n"; # done exit;