#!/usr/bin/perl

# cloud.pl - create a tag cloud from the IRC log

# Eric Lease Morgan <eric_morgan@infomotions.com>
# February 13, 2011 - first cut


# configure
use constant DB  => './irclog.db';
use constant MAX => 300;

# require
use HTML::TagCloud::Centred;
use Lingua::EN::Ngram;
use Lingua::StopWords qw( getStopWords );
use strict;

# get and check input
my $input = $ARGV[ 0 ];
if ( ! $input ) { &usage }

# initialize
my $corpus    = '';
my $names     = '';
my $stopwords = &getStopWords( 'en' );

# generate corpus
open INPUT, ' < ' . DB or die "Can't open " . DB . ": $!\n";
while ( <INPUT> ) {

	# clean, parse, and build corpus
	chop;
	my ( $datestamp, $name, $text ) = split /\t/, $_;
	$corpus .= $text . ' ';
	$names  .= $name . ' ';

}

# initialize and count ngrams
my $ngram = '';
if    ( $input eq 'n' ) { $ngram  = Lingua::EN::Ngram->new( text => $names ) }
elsif ( $input eq 'w' ) { $ngram  = Lingua::EN::Ngram->new( text => $corpus ) }
else  { &usage }
my $ngrams = $ngram->ngram( 1 );

# process each ngram
my %tokens = ();
foreach my $token ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) {
		
	# filter
	next if ( $$stopwords{ $token });
	next if ( $token =~ /[,.?!:;()\-]/ or $token =~ /^'/ or $token =~ /'$/ );
	next if ( length( $token ) == 1 );
	next if ( $token =~ /^'/ );
	last if ( $$ngrams{ $token } == 1 );
		
	# tally
	$tokens{ $token } = $$ngrams{ $token };
	
}

# build the cloud
my $cloud = HTML::TagCloud::Centred->new( clr_max => "#FF0000", clr_min => "#3333FF" );
my $index = 0;
foreach my $token ( sort { $tokens{ $b } <=> $tokens{ $a } } keys %tokens ) {

	# increment and check
	$index++;
	last if ( $index > MAX );
	
	# update cloud
	$cloud->add( $token );

}

# output
print $cloud->html_and_css;

# done
close INPUT;
exit;


sub usage {

	print "Usage: $0 <n|w>\n";
	exit;
	
}