#!/usr/bin/perl

# words.pl - output most common words
#
# Reads the file named by the first command-line argument, counts its
# words with Lingua::EN::Bigram, and prints each word with its count,
# most frequent first. Short tokens, tokens containing punctuation,
# and English stop words are omitted.

# Eric Lease Morgan
# June 22, 2010 - based on previous work; designed for ALA

# require
use Lingua::EN::Bigram;
use Lingua::StopWords qw( getStopWords );
use strict;
use warnings;

# sanity check; test defined/length rather than truth so that a file
# literally named "0" is still accepted
my $corpus = $ARGV[ 0 ];
if ( ! defined $corpus or ! length $corpus ) {

    print "Usage: $0 <file>\n";
    exit;

}

# initialize
my $stopwords = getStopWords( 'en' );    # used below as a hash reference keyed by word
my $text      = slurp( $corpus );

# build the model
my $bigrams = Lingua::EN::Bigram->new;
$bigrams->text( $text );

# get counts; used below as a hash reference of word => count
my $word_count = $bigrams->word_count;

# list the words according to frequency; ties are broken alphabetically
# so the output is stable from run to run
print "Word count\n";
print "----------\n";
my @words = sort {
    $word_count->{ $b } <=> $word_count->{ $a }
        or
    $a cmp $b
} keys %$word_count;

foreach my $word ( @words ) {

    next if ( length( $word ) < 3 );              # skip very short tokens
    next if ( $word =~ /[,.?!:;()\-']/ );         # skip tokens containing punctuation
    next if ( $stopwords->{ $word } );            # skip stop words

    print $word_count->{ $word }, "\t$word\n";

}

# done
exit;


# read the CORPUS
#
# slurp( $file ) - open the file named by the input and return its
# entire contents as a single string. Dies with "Can't slurp: ..." if
# the file cannot be opened.
sub slurp {

    my $f = shift;

    # three-argument open with a lexical handle, so characters in the
    # file name are never interpreted as an open mode
    open my $fh, '<', $f or die "Can't slurp: $!\n";

    # undefine the input record separator locally so a single read
    # returns the whole file
    my $r = do { local $/; <$fh> };

    close $fh;

    return $r;

}