#!/usr/bin/perl # cloud.pl - create a pseudo word cloud based on a given time interval # Eric Lease Morgan # February 17, 2011 - first cut; needs command line arguments # configure use constant DSN => 'dbi:mysql:c4l11'; use constant USERNAME => '---'; use constant PASSWORD => '---'; use constant STARTDATE => '2011-02-11 00:40:00'; use constant ENDDATE => '2011-02-13 17:20:00'; use constant SIZE => 2; # require use DateTime; use DateTime::Format::MySQL; use DBI; use HTML::Entities qw( decode_entities ); use Lingua::EN::Ngram; use Lingua::StopWords qw( getStopWords ); use strict; # initalize my $dbh = DBI->connect( DSN, USERNAME, PASSWORD ); my $startdate = DateTime::Format::MySQL->parse_datetime( STARTDATE ); my $enddate = DateTime::Format::MySQL->parse_datetime( ENDDATE ); my $stopwords = &getStopWords( 'en' ); # search my $start = DateTime::Format::MySQL->format_datetime( $startdate ); my $end = DateTime::Format::MySQL->format_datetime( $enddate ); my $sql = "SELECT tweet FROM tweets WHERE datetime >= '$start' AND datetime <= '$end'"; my $rows = $dbh->selectall_arrayref( $sql, { Slice => {} } ); # build corpus my $tweets = ''; foreach my $row ( @$rows ) { my $tweet = decode_entities( $row->{ tweet } ); $tweets .= $tweet } # clean up $dbh->disconnect; # initialize and count ngrams my $ngram = Lingua::EN::Ngram->new( text => $tweets ); my $ngrams = $ngram->ngram( SIZE ); # process all the ngrams my $index = 0; foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) { # get the tokens of the phrase my @tokens = split / /, $phrase; # process each token; filter based on it's value my $found = 0; foreach ( @tokens ) { # skip stop words for uni-grams and bi-grams if ( SIZE == 1 or SIZE == 2 ) { if ( $$stopwords{ $_ }) { $found = 1; last; } } # skip punctuation if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/ ) { $found = 1; last; } # skip single-letter words if ( length( $_ ) == 1 ) { $found = 1; last; } # skip more punctuation if ( $_ =~ /^'/ ) { $found = 1; last; } } # loop if found an unwanted token next if ( $found ); # don't want single frequency phrases last if ( $$ngrams{ $phrase } == 1 ); # echo print "$phrase (" . $$ngrams{ $phrase } . ') '; } # done print "\n\n"; exit;