#!/usr/bin/perl # mine-mail.pl - apply text mining techniques against mbox files # Eric Lease Morgan # April 10, 2010 - first investigations; based on http://perl.goeszen.com/simple-mbox-email-reader-in-perl.html # configure use constant CACHE => '/tmp/mbox.cache'; use constant AUTHORS => './authors.dat'; use constant SUBJECTLINES => './subject-lines.dat'; use constant SUBJECTWORDCOUNT => './subject-word-count.dat'; use constant SUBJECTBIGRAMCOUNT => './subject-bigram-count.dat'; use constant BODYWORDCOUNT => './body-word-count.dat'; use constant BODYBIGRAMCOUNT => './body-bigram-count.dat'; # require use Mail::Mbox::MessageParser; use Email::MIME; use Data::Dumper; use Encode; use strict; use Lingua::EN::Bigram; use Lingua::StopWords qw( getStopWords ); # get input and sanity check my $mbox = $ARGV[ 0 ]; if ( ! $mbox ) { print "Usage: $0 \n"; exit; } # initalize my %subjects = (); my %authors = (); my $bigram = Lingua::EN::Bigram->new; my $bodies = ''; my $subject_lines = ''; my $stopwords = &getStopWords( 'en' ); # create a reader and sanity check Mail::Mbox::MessageParser::SETUP_CACHE({ file_name => CACHE }); my $reader = new Mail::Mbox::MessageParser({ file_name => $mbox, enable_grep => 1 }); die $reader unless ref $reader; # process each email message while( ! $reader->end_of_file ){ my $raw_email = $reader->read_next_email; my $email = Email::MIME->new( $raw_email ); # parse out and normalize authors my $author = &encode_utf8( $email->header( 'From' )); $author =~ s/"//g; $author =~ s/ <.*>//g; $author = lc( $author ); $authors{ $author }++; # parse out and normalize subjects my $subject = &encode_utf8( $email->header( 'Subject' )); $subject =~ s/^Re: //; $subjects{ $subject }++; $subject_lines .= $subject; # parse out and normalize bodies $bodies .= &encode_utf8( $email->body ) . "\n"; } # create authors.dat file open OUT, ' > ' . AUTHORS or die "Can't open AUTHORS: $!\n"; foreach my $author ( sort { $authors{ $b } <=> $authors{ $a }} keys( %authors )) { print OUT "$author:" . $authors{ $author } . "\n"; } close OUT; # create subject-lines.dat file open OUT, ' > ' . SUBJECTLINES or die "Can't open SUBJECTLINES: $!\n"; foreach my $subject_line ( sort { $subjects{ $b } <=> $subjects{ $a }} keys( %subjects )) { print OUT "$subject_line:" . $subjects{ $subject_line } . "\n"; } close OUT; # create subject*.dat files $bigram->text( $subject_lines ); &count_words( $bigram->word_count, SUBJECTWORDCOUNT ); &count_bigrams( $bigram->bigram_count, SUBJECTBIGRAMCOUNT ); # create body*.dat files $bigram->text( $bodies ); &count_words( $bigram->word_count, BODYWORDCOUNT ); &count_bigrams( $bigram->bigram_count, BODYBIGRAMCOUNT ); # done exit; sub count_words { my $word_count = shift; my $file = shift; open OUT, ' > ' . $file or die "Can't open $file: $!\n"; foreach ( sort { $$word_count{ $b } <=> $$word_count{ $a } } keys %$word_count ) { next if ( $$stopwords{ $_ } ); next if ( $_ =~ /[,.?!:;()\-]/ ); print OUT "$_:" . $$word_count{ $_ }. "\n"; } close OUT; } sub count_bigrams { my $bigram_count = shift; my $file = shift; open OUT, ' > ' . $file or die "Can't open $file: $!\n"; foreach ( sort { $$bigram_count{ $b } <=> $$bigram_count{ $a } } keys %$bigram_count ) { # get the tokens of the bigram my ( $first_token, $second_token ) = split / /, $_; # skip stopwords and punctuation next if ( $$stopwords{ $first_token } ); next if ( $first_token =~ /[,.?!:;()\-]/ ); next if ( $$stopwords{ $second_token } ); next if ( $second_token =~ /[,.?!:;()\-]/ ); print OUT "$_:" . $$bigram_count{ $_ }. "\n"; } close OUT; }