#!/usr/bin/perl

# count.pl - tally the most frequently used parts of speech in a text

# Eric Lease Morgan <emrogan@nd.edu>
# February 1, 2011 -- first investigations
# February 5, 2011 -- tweaked comments

# require
use strict;
use warnings;

# get input and sanity check; the three positional arguments are the report
# to print, the token/lemma switch, and the file to read
my ( $partofspeech, $tokenLemma, $file ) = @ARGV;

# every argument must be present and non-empty; test definedness and length
# rather than truth so that a value such as a file named "0" is accepted
foreach my $argument ( $partofspeech, $tokenLemma, $file ) {
	usage() unless defined $argument and length $argument;
}

# initialize the tallies; every hash maps a key to the number of times seen

# keyed by the raw columns of each input line
my %tokens     = ();   # first column (token); counted, not printed by any report
my %lemmas     = ();   # second column (lemma); counted, not printed by any report
my %pos        = ();   # third column (part-of-speech tag)

# keyed by the simplified part-of-speech name ('noun', 'verb', ...)
my %sum        = ();

# keyed by token or by lemma, whichever the second argument selects
my %adjectives = ();
my %adverbs    = ();
my %nouns      = ();
my %pronous    = ();   # spelling kept; the rest of the script uses this name
my %verbs      = ();

# keyed by the raw tag of every line that fits no simplified category
my %other      = ();

# validate the token/lemma switch once, up front, so that a bad value is
# reported even when the input has no lines to read
usage() unless $tokenLemma eq 't' or $tokenLemma eq 'l';

# open the input; "-" keeps meaning standard input, as it did under the
# two-argument form of open, while every other name is now taken literally
# instead of being scanned for mode characters
my $input;
if ( $file eq '-' ) { $input = \*STDIN }
else                { open $input, '<', $file or die "Can't open $file: $!\n" }

# process each line from the input file
while ( my $line = <$input> ) {

	# clean; chomp removes only a line ending, so a final line that lacks a
	# newline keeps the last character of its tag (chop would eat it)
	chomp $line;
	$line =~ s/\r$//;

	# parse; each line is token, lemma, and tag separated by tabs
	my ( $t, $l, $p ) = split /\t/, $line;

	# skip blank or short lines rather than tally them under empty keys
	next unless defined $p;

	# do the work; count
	$tokens{ $t }++;
	$lemmas{ $l }++;
	$pos{ $p }++;

	# keep track of tokens or lemmas
	my $v = $tokenLemma eq 't' ? $t : $l;

	# map and normalize TreeTagger POS to simpler forms; the tag is tested
	# by prefix, and the first branch that matches wins
	my $s;
	if    ( $p =~ /^N/ )                        { $s = 'noun';      $nouns{ $v }++ }
	elsif ( $p =~ /^(?:WP|PP)/ )                { $s = 'pronoun';   $pronous{ $v }++ }
	elsif ( $p =~ /^J/ )                        { $s = 'adjective'; $adjectives{ $v }++ }
	elsif ( $p =~ /^V/ )                        { $s = 'verb';      $verbs{ $v }++ }
	elsif ( $p =~ /^(?:WRB|RB)/ )               { $s = 'adverb';    $adverbs{ $v }++ }
	elsif ( $p =~ /^(?:,|:|SENT|\(|\)|''|``)/ ) { $s = 'punctuation' }
	elsif ( $p =~ /^(?:SYM|CD|\$)/ )            { $s = 'symbol' }
	elsif ( $p =~ /^CC/ )                       { $s = 'conjunction' }
	elsif ( $p =~ /^(?:DT|PDT|WDT)/ )           { $s = 'determiner' }
	elsif ( $p =~ /^(?:IN|TO)/ )                { $s = 'preposition' }
	elsif ( $p =~ /^UH/ )                       { $s = 'interjection' }
	else                                        { $s = 'other';     $other{ $p }++ }

	# count the simpler forms
	$sum{ $s }++;

}
close $input;

# output all POS; every raw tag and its count, sorted by tag
if ( $partofspeech eq 'all' ) {

	foreach my $tag ( sort keys %pos ) { print "$tag\t$pos{$tag}\n" }

}

# simple parts of speech; always the same categories in the same order
elsif ( $partofspeech eq 'simple' ) {

	my @categories = ( 'noun', 'pronoun', 'adjective', 'verb', 'adverb', 'determiner', 'preposition', 'conjunction', 'interjection', 'symbol', 'punctuation',  'other' );
	foreach my $category ( @categories ) {

		# a category that never occurred has no entry; report it as zero
		my $count = $sum{ $category } || 0;
		print "$category\t$count\n";

	}

}

# nouns, pronouns, verbs, adverbs, adjectives, or "other"; all six reports
# print one tally hash, most frequent entry first
else {

	my %tally_for = (
		'nouns'      => \%nouns,
		'pronouns'   => \%pronous,
		'verbs'      => \%verbs,
		'adverbs'    => \%adverbs,
		'adjectives' => \%adjectives,
		'other'      => \%other,
	);

	# error; anything not in the table is not a report this script knows
	my $tally = $tally_for{ $partofspeech };
	usage() unless $tally;

	# sort by descending count; ties fall back to the key itself so that
	# the output order is the same on every run
	my @keys = sort { $tally->{ $b } <=> $tally->{ $a } or $a cmp $b } keys %$tally;
	foreach my $key ( @keys ) { print "$key\t$tally->{$key}\n" }

}

# done
exit;


# usage - print the command-line synopsis and stop the program
#
# Takes no arguments and never returns. The text goes to STDERR and the exit
# status is non-zero, so that a misuse neither ends up in data piped from
# STDOUT nor looks like a successful run to the calling shell.
sub usage {

	print STDERR "Usage: $0 <all | simple | other | pronouns | nouns | verbs | adverbs | adjectives > <t|l> <filename>\n";
	exit 1;

}