#!/usr/bin/perl # bigram.pl - output cooccurances based on t-score # Eric Lease Morgan # June 22, 2010 - based on previous work; designed for ALA # require use Lingua::EN::Bigram; use Lingua::StopWords qw( getStopWords ); use strict; my $corpus = $ARGV[ 0 ]; if ( ! $corpus ) { print "Usage: $0 \n"; exit; } # initialize my $stopwords = &getStopWords( 'en' ); my $text = &slurp( $corpus ); # build bigrams my $bigrams = Lingua::EN::Bigram->new; $bigrams->text( $text ); # get counts my $word_count = $bigrams->word_count; my $bigram_count = $bigrams->bigram_count; my $tscore = $bigrams->tscore; # display, sans stop words and punctuation foreach my $bigram ( sort { $$tscore{ $b } <=> $$tscore{ $a } } keys %$tscore ) { # get the tokens of the bigram my ( $first_token, $second_token ) = split / /, $bigram; # skip stopwords and punctuation next if ( $$stopwords{ $first_token } ); next if ( $first_token =~ /[,.?!:;()\-']/ ); next if ( $$stopwords{ $second_token } ); next if ( $second_token =~ /[,.?!:;()\-']/ ); # output print "$$bigram_count{ $bigram }\t$bigram\t\n"; } # done exit; # read the CORPUS sub slurp { # open a file named by the input and return its contents my $f = shift; my $r; open F, $f or die "Can't slurp: $!\n"; $r = do { local $/; }; return $r; }