#!/usr/bin/perl

# mine-mail.pl - apply text mining techniques against mbox files

# Eric Lease Morgan <eric@morgan@infomotions.com>
# April 10, 2010 - first investigations; based on http://perl.goeszen.com/simple-mbox-email-reader-in-perl.html


# configure
use constant CACHE              => '/tmp/mbox.cache';
use constant AUTHORS            => './authors.dat';
use constant SUBJECTLINES       => './subject-lines.dat';
use constant SUBJECTWORDCOUNT   => './subject-word-count.dat';
use constant SUBJECTBIGRAMCOUNT => './subject-bigram-count.dat';
use constant BODYWORDCOUNT      => './body-word-count.dat';
use constant BODYBIGRAMCOUNT    => './body-bigram-count.dat';

# require
use Mail::Mbox::MessageParser;
use Email::MIME;
use Data::Dumper;
use Encode;
use strict;
use Lingua::EN::Bigram;
use Lingua::StopWords qw( getStopWords );

# get input and sanity check
my $mbox = $ARGV[ 0 ];
if ( ! $mbox ) {

	print "Usage: $0 <mbox>\n";
	exit;

}

# initalize 
my %subjects      = ();
my %authors       = ();
my $bigram        = Lingua::EN::Bigram->new;
my $bodies        = '';
my $subject_lines = '';
my $stopwords     = &getStopWords( 'en' );

# create a reader and sanity check
Mail::Mbox::MessageParser::SETUP_CACHE({ file_name => CACHE });
my $reader = new Mail::Mbox::MessageParser({ file_name => $mbox, enable_grep => 1 });
die $reader unless ref $reader;

# process each email message
while( ! $reader->end_of_file ){

	my $raw_email = $reader->read_next_email;
	my $email = Email::MIME->new( $raw_email );

	# parse out and normalize authors
	my $author = &encode_utf8( $email->header( 'From' ));
	$author =~ s/"//g;
	$author =~ s/ <.*>//g;
	$author = lc( $author );
	$authors{ $author }++;
	
	# parse out and normalize subjects
	my $subject = &encode_utf8( $email->header( 'Subject' ));
	$subject =~ s/^Re: //;
	$subjects{ $subject }++;
	$subject_lines .= $subject;
	
	# parse out and normalize bodies
	$bodies .= &encode_utf8( $email->body ) . "\n";
	
}


# create authors.dat file
open OUT, ' > ' . AUTHORS or die "Can't open AUTHORS: $!\n";
foreach my $author ( sort { $authors{ $b } <=> $authors{ $a }} keys( %authors )) {

  print OUT "$author:" . $authors{ $author } . "\n";
	
}
close OUT;

# create subject-lines.dat file
open OUT, ' > ' . SUBJECTLINES or die "Can't open SUBJECTLINES: $!\n";
foreach my $subject_line ( sort { $subjects{ $b } <=> $subjects{ $a }} keys( %subjects )) {

	print OUT "$subject_line:" . $subjects{ $subject_line } . "\n";
	
}
close OUT;

# create subject*.dat files
$bigram->text( $subject_lines );
&count_words( $bigram->word_count, SUBJECTWORDCOUNT );
&count_bigrams( $bigram->bigram_count, SUBJECTBIGRAMCOUNT );

# create body*.dat files
$bigram->text( $bodies );
&count_words( $bigram->word_count, BODYWORDCOUNT );
&count_bigrams( $bigram->bigram_count, BODYBIGRAMCOUNT );

# done
exit;


sub count_words {

	my $word_count = shift;
	my $file = shift;
	
	open OUT, ' > ' . $file or die "Can't open $file: $!\n";
	foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) {

		next if ( $$stopwords{ $_ } );
		next if ( $_ =~ /[,.?!:;()\-]/ );
		print OUT "$_:" . $$word_count{ $_ }. "\n";

	}

	close OUT;

}


sub count_bigrams {

	my $bigram_count = shift;
	my $file = shift;
	
	open OUT, ' > ' . $file or die "Can't open $file: $!\n";
	foreach ( sort { $$bigram_count{ $b } <=> $$bigram_count{ $a } } keys %$bigram_count ) {
	
		# get the tokens of the bigram
		my ( $first_token, $second_token ) = split / /, $_;
		
		# skip stopwords and punctuation
		next if ( $$stopwords{ $first_token } );
		next if ( $first_token =~ /[,.?!:;()\-]/ );
		next if ( $$stopwords{ $second_token } );
		next if ( $second_token =~ /[,.?!:;()\-]/ );
	
		print OUT "$_:" . $$bigram_count{ $_ }. "\n";
	
	}
	close OUT;

}