#!/usr/bin/perl

# count.pl - tally the most frequently used parts of speech in a text
#
# Reads a tab-delimited file whose lines hold three fields, assigned in
# this order: token, lemma, part-of-speech tag. Prints one of several
# frequency reports to standard output as "key<TAB>count" lines.
#
# Arguments:
#   1. report - all | simple | nouns | pronouns | verbs | adverbs |
#               adjectives | other
#   2. t or l - tally words by token (t) or by lemma (l)
#   3. file   - path of the tab-delimited input file

# Eric Lease Morgan
# February 1, 2011 -- first investigations
# February 5, 2011 -- tweaked comments

# require
use strict;
use warnings;

# simplified categories, in the order the "simple" report prints them
my @SIMPLE_CATEGORIES = qw(
    noun pronoun adjective verb adverb determiner
    preposition conjunction interjection symbol punctuation other
);

# map TreeTagger POS tag prefixes to simpler forms; each rule is
# [ pattern, simple category, optional name of the per-word report ].
# The first matching rule wins.
my @POS_RULES = (
    [ qr/^N/,    'noun',        'nouns'      ],
    [ qr/^WP/,   'pronoun',     'pronouns'   ],
    [ qr/^PP/,   'pronoun',     'pronouns'   ],
    [ qr/^J/,    'adjective',   'adjectives' ],
    [ qr/^V/,    'verb',        'verbs'      ],
    [ qr/^WRB/,  'adverb',      'adverbs'    ],
    [ qr/^RB/,   'adverb',      'adverbs'    ],
    [ qr/^,/,    'punctuation'  ],
    [ qr/^:/,    'punctuation'  ],
    [ qr/^SENT/, 'punctuation'  ],
    [ qr/^\(/,   'punctuation'  ],
    [ qr/^\)/,   'punctuation'  ],
    [ qr/^''/,   'punctuation'  ],
    [ qr/^``/,   'punctuation'  ],
    [ qr/^\$/,   'symbol'       ],
    [ qr/^SYM/,  'symbol'       ],
    [ qr/^CD/,   'symbol'       ],
    [ qr/^CC/,   'conjunction'  ],
    [ qr/^DT/,   'determiner'   ],
    [ qr/^PDT/,  'determiner'   ],
    [ qr/^WDT/,  'determiner'   ],
    [ qr/^IN/,   'preposition'  ],
    [ qr/^TO/,   'preposition'  ],
    [ qr/^UH/,   'interjection' ],
);

# initialize; the keys of %words_for double as the report names accepted
# on the command line for the per-word reports
my %words_for = (
    nouns      => {},
    pronouns   => {},
    verbs      => {},
    adverbs    => {},
    adjectives => {},
);
my %other = ();                                  # unmapped POS tags, keyed by tag
my %pos   = ();                                  # every raw POS tag seen
my %sum   = map { $_ => 0 } @SIMPLE_CATEGORIES;  # seeded so unseen categories print 0

# get input and sanity check; everything is validated once, up front,
# before any file is opened
my ( $partofspeech, $tokenLemma, $file ) = @ARGV;
my %is_report = map { $_ => 1 } 'all', 'simple', 'other', keys %words_for;
usage() unless defined $partofspeech and $is_report{$partofspeech};
usage() unless defined $tokenLemma and ( $tokenLemma eq 't' or $tokenLemma eq 'l' );
usage() unless defined $file and length $file;

# process each line from the input file
open my $fh, '<', $file or die "Can't open $file: $!\n";
LINE: while ( my $line = <$fh> ) {

    # clean; chomp only removes a trailing newline, so a final line
    # without one keeps its last character
    chomp $line;

    # parse; skip blank or short lines that carry no POS field
    my ( $token, $lemma, $tag ) = split /\t/, $line;
    next LINE unless defined $tag;

    # do the work; count
    $pos{$tag}++;

    # keep track of tokens or lemmas
    my $word = $tokenLemma eq 't' ? $token : $lemma;

    # find the first rule whose pattern matches this tag
    my $matched;
    RULE: for my $rule (@POS_RULES) {
        if ( $tag =~ $rule->[0] ) {
            $matched = $rule;
            last RULE;
        }
    }

    # count the simpler forms
    if ($matched) {
        my ( undef, $category, $report ) = @{$matched};
        $sum{$category}++;
        $words_for{$report}{$word}++ if defined $report;
    }
    else {
        # unmapped tags are tallied by tag, not by word
        $sum{other}++;
        $other{$tag}++;
    }

}
close $fh;

# output all POS, sorted by tag
if ( $partofspeech eq 'all' ) {
    print "$_\t$pos{$_}\n" for sort keys %pos;
}

# simple parts of speech, in fixed order
elsif ( $partofspeech eq 'simple' ) {
    print "$_\t$sum{$_}\n" for @SIMPLE_CATEGORIES;
}

# nouns, pronouns, verbs, adverbs, adjectives, or "other"; most frequent
# first, ties broken alphabetically so the output is deterministic
else {
    my $count_of = $partofspeech eq 'other' ? \%other : $words_for{$partofspeech};
    my @ranked   = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
        keys %{$count_of};
    print "$_\t$count_of->{$_}\n" for @ranked;
}

# done
exit;


# print a synopsis of the command-line arguments and quit
sub usage {
    print "Usage: $0 <all|simple|nouns|pronouns|verbs|adverbs|adjectives|other> <t|l> <file>\n";
    exit;
}