#!/usr/bin/perl

# mine-tweets.pl - count and output names, urls, tags, and words in tweets
#
# Reads TWEETS (one tweet per line) and writes four frequency reports --
# NAMES, URLS, TAGS, and WORDS. Each report line is "count<TAB>item", sorted
# by descending count.

# Eric Lease Morgan
# January 12, 2011 - first investigations

# require
use strict;
use warnings;
use Lingua::StopWords qw( getStopWords );

# configure
use constant TWEETS => './tweets.txt';
use constant NAMES  => './names.txt';
use constant URLS   => './urls.txt';
use constant TAGS   => './tags.txt';
use constant WORDS  => './words.txt';

# initialize
my $stopwords = getStopWords('en');    # hash reference keyed by stop word
my %names = ();
my %urls  = ();
my %tags  = ();
my %words = ();

# process every tweet
open my $in, '<', TWEETS or die "Can't open " . TWEETS . ": $!\n";
while ( my $tweet = <$in> ) {

    # chomp (not chop) so a final line lacking a newline keeps its last character
    chomp $tweet;

    # extract names; every @mention in the tweet is counted, including one at
    # the very end of the line or one followed by punctuation ("@user:"). The
    # look-behind keeps e-mail addresses (user@host) from being counted.
    $names{ $1 }++ while $tweet =~ /(?<!\w)\@(\w+)/g;

    # extract urls; both http and https, and every url in the tweet
    $urls{ $1 }++ while $tweet =~ /(https?:\S+)/g;

    # extract tags; same matching rules as names
    $tags{ $1 }++ while $tweet =~ /(?<!\w)#(\w+)/g;

    # extract words; splitting on ' ' discards leading and repeated whitespace
    foreach my $word ( split ' ', $tweet ) {

        # skip previous collections
        next if ( $word =~ /^#/ );
        next if ( $word =~ /^\@/ );
        next if ( $word =~ /^https?:/ );

        # normalize; trim leading and trailing punctuation
        $word =~ s/\W+$//;
        $word =~ s/^\W+//;

        # skip some more; the stop list is assumed to be lower-case, so the
        # lookup is lower-cased while the counted word keeps its original case
        next if ( length( $word ) < 2 );
        next if ( $stopwords->{ lc $word } );

        # count
        $words{ $word }++;

    }

}
close $in;

# output names, urls, tags, & words
write_counts( NAMES, \%names );
write_counts( URLS,  \%urls );
write_counts( TAGS,  \%tags );
write_counts( WORDS, \%words );

# done
exit;


# write_counts( $file, \%counts ) - write one "count<TAB>item" line per key of
# %counts to $file, highest count first; equal counts are ordered by item so
# the report is reproducible from run to run. Dies if $file cannot be opened
# or closed.
sub write_counts {

    my ( $file, $counts ) = @_;

    open my $out, '>', $file or die "Can't open $file: $!\n";
    foreach my $item ( sort { $counts->{ $b } <=> $counts->{ $a } or $a cmp $b } keys %{ $counts } ) {
        print {$out} $counts->{ $item }, "\t", $item, "\n";
    }

    # buffered write errors only surface at close, so check it
    close $out or die "Can't close $file: $!\n";

    return;

}