#!/usr/bin/perl # bigram.pl - list top words and top phrases # Eric Lease Morgan # June 9, 2010 - a quick hack # configure use constant CORPUS => './articles.txt'; # require use Lingua::EN::Bigram; use Lingua::StopWords qw( getStopWords ); use strict; # initialize my $stopwords = &getStopWords( 'en' ); my $text = &slurp( CORPUS ); # build bigrams my $bigrams = Lingua::EN::Bigram->new; $bigrams->text( $text ); # get counts my $word_count = $bigrams->word_count; my $bigram_count = $bigrams->bigram_count; my $tscore = $bigrams->tscore; # list the words according to frequency print "Word count\n"; print "----------\n"; foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) { next if ( $_ =~ /[,.?!:;()\-]/ ); next if ( $$stopwords{ $_ } ); print $$word_count{ $_ }, "\t$_\n"; } # display, sans stop words and punctuation foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) { # get the tokens of the bigram my ( $first_token, $second_token ) = split / /, $bigram; # skip stopwords and punctuation next if ( $$stopwords{ $first_token } ); next if ( $first_token =~ /[,.?!:;()\-]/ ); next if ( $$stopwords{ $second_token } ); next if ( $second_token =~ /[,.?!:;()\-]/ ); # output print "$$tscore{ $bigram }\t" . "$$word_count{ $first_token }\t" . "$$word_count{ $second_token }\t" . "$$bigram_count{ $bigram }\t" . "$bigram\t\n"; } # done exit; # read the CORPUS sub slurp { # open a file named by the input and return its contents my $f = shift; my $r; open F, $f or die "Can't slurp: $!\n"; $r = do { local $/; }; return $r; }