#!/usr/bin/perl # cloud.pl - create a tag cloud from the IRC log # Eric Lease Morgan # February 13, 2011 - first cut # configure use constant DB => './irclog.db'; use constant MAX => 300; # require use HTML::TagCloud::Centred; use Lingua::EN::Ngram; use Lingua::StopWords qw( getStopWords ); use strict; # get and check input my $input = $ARGV[ 0 ]; if ( ! $input ) { &usage } # initialize my $corpus = ''; my $names = ''; my $stopwords = &getStopWords( 'en' ); # generate corpus open INPUT, ' < ' . DB or die "Can't open " . DB . ": $!\n"; while ( ) { # clean, parse, and build corpus chop; my ( $datestamp, $name, $text ) = split /\t/, $_; $corpus .= $text . ' '; $names .= $name . ' '; } # initialize and count ngrams my $ngram = ''; if ( $input eq 'n' ) { $ngram = Lingua::EN::Ngram->new( text => $names ) } elsif ( $input eq 'w' ) { $ngram = Lingua::EN::Ngram->new( text => $corpus ) } else { &usage } my $ngrams = $ngram->ngram( 1 ); # process each ngram my %tokens = (); foreach my $token ( sort { $$ngrams{ $b } <=> $$ngrams{ $a } } keys %$ngrams ) { # filter next if ( $$stopwords{ $token }); next if ( $token =~ /[,.?!:;()\-]/ or $token =~ /^'/ or $token =~ /'$/ ); next if ( length( $token ) == 1 ); next if ( $token =~ /^'/ ); last if ( $$ngrams{ $token } == 1 ); # tally $tokens{ $token } = $$ngrams{ $token }; } # build the cloud my $cloud = HTML::TagCloud::Centred->new( clr_max => "#FF0000", clr_min => "#3333FF" ); my $index = 0; foreach my $token ( sort { $tokens{ $b } <=> $tokens{ $a } } keys %tokens ) { # increment and check $index++; last if ( $index > MAX ); # update cloud $cloud->add( $token ); } # output print $cloud->html_and_css; # done close INPUT; exit; sub usage { print "Usage: $0 \n"; exit; }