#!/usr/bin/perl

# tag.pl - produce a list of words (tokens), lemmas, and parts-of-speech from a text file
# February 4, 2011 - first investigations

# require
use strict;
use Lingua::TreeTagger;

# sanity check; a filename argument is required. Definedness and length are
# tested (rather than truth) so a file literally named "0" is still accepted.
my $file = $ARGV[ 0 ];
if ( ! defined $file or $file eq '' ) {

	# usage errors go to STDERR with a non-zero exit status so calling
	# scripts can detect the failure and STDOUT carries only tagged output
	print STDERR "Usage: $0 <filename>\n";
	exit 1;

}

# fail early, and with a clear message, if the input can not be read
if ( ! -r $file ) {

	print STDERR "$0: can not read '$file'\n";
	exit 1;

}

# initialize and tag
my $tagger      = Lingua::TreeTagger->new( 'language' => 'english' );
my $tagged_text = $tagger->tag_file( $file );

# output; one tab-delimited line per token: lowercased original form, lemma, tag
foreach my $token ( @{ $tagged_text->sequence() } ) {

	print join( "\t", lc( $token->original ), $token->lemma, $token->tag ), "\n";

}

# done
exit;