#!/usr/bin/perl

# cloud.pl - create a pseudo word cloud based on a given time interval

# Eric Lease Morgan <eric_morgan@infomotions.com>
# February 17, 2011 - first cut; needs command line arguments


# configure
use constant DSN       => 'dbi:mysql:c4l11';
use constant USERNAME  => '---';
use constant PASSWORD  => '---';
use constant STARTDATE => '2011-02-11 00:40:00';
use constant ENDDATE   => '2011-02-13 17:20:00';
use constant SIZE      => 2;

# require
use DateTime;
use DateTime::Format::MySQL;
use DBI;
use HTML::Entities qw( decode_entities );
use Lingua::EN::Ngram;
use Lingua::StopWords qw( getStopWords );
use strict;

# initalize
my $dbh       = DBI->connect( DSN, USERNAME, PASSWORD );
my $startdate = DateTime::Format::MySQL->parse_datetime( STARTDATE );
my $enddate   = DateTime::Format::MySQL->parse_datetime( ENDDATE );
my $stopwords = &getStopWords( 'en' );

# search
my $start = DateTime::Format::MySQL->format_datetime( $startdate );
my $end   = DateTime::Format::MySQL->format_datetime( $enddate );
my $sql   = "SELECT tweet FROM tweets WHERE datetime >= '$start' AND datetime <= '$end'";
my $rows  = $dbh->selectall_arrayref( $sql, { Slice => {} } );

# build corpus
my $tweets = '';
foreach my $row ( @$rows ) {

	my $tweet = decode_entities( $row->{ tweet } );
	$tweets .= $tweet

}
  
# clean up
$dbh->disconnect;

# initialize and count ngrams
my $ngram  = Lingua::EN::Ngram->new( text => $tweets );
my $ngrams = $ngram->ngram( SIZE );

# process all the ngrams
my $index = 0;
foreach my $phrase ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) {
	
	# get the tokens of the phrase
	my @tokens = split / /, $phrase;

	# process each token; filter based on it's value
	my $found = 0;
	foreach ( @tokens ) {
	
		# skip stop words for uni-grams and bi-grams
		if ( SIZE == 1 or SIZE == 2 ) {
		
			if ( $$stopwords{ $_ }) {
			
				$found = 1;
				last;
			
			}
		
		}
				
		# skip punctuation
		if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/  ) {
		
			$found = 1;
			last;
			
		}
			
		# skip single-letter words
		if ( length( $_ ) == 1 ) {
		
			$found = 1;
			last;
			
		}
			
		# skip more punctuation
		if ( $_ =~ /^'/ ) {
		
			$found = 1;
			last;
			
		}
			
	}
	
	# loop if found an unwanted token
	next if ( $found );
		
	# don't want single frequency phrases
	last if ( $$ngrams{ $phrase } == 1 );
	
	# echo
	print "$phrase (" . $$ngrams{ $phrase } . ') ';
	
}

# done
print "\n\n";
exit;