package Alex::Concordance; # Concordance.pm - KWIC search engine and bigram tool for Alex content # Eric Lease Morgan # June 18, 2009 - first investigations # July 1, 2009 - add bigrams and dynamic content from Alex # July 3, 2009 - added words by letter and specialized searches # July 22, 2009 - tweaked option elements in forms; changed id to fkey # August 27, 2010 - added ngram functionality and tweaked the form; yeah for Lingua::EN::Bigram! # August 30, 2010 - added map of where queries are located in the text # September 12, 2010 - tweaked for use with Lingua::EN::Ngram # September 18, 2010 - added Twitter and Delicious links; moved ' in the beginning and ending of words # Januaray 23, 2011 - linked to network diagrams # define/configure use constant PROXY => 56234; use constant INSTANCE => 'alex3'; use constant ROOT => '/var/www/html/main'; use constant INFOMO => 'http://infomotions.com'; use constant URL => 73038; use constant MAXCOLLOCATIONS => 100; use constant RADIUS => 40; # require use Apache2::Const -compile => qw( OK ); use CGI; use Lingua::Concordance; use Lingua::EN::Ngram; use Lingua::StopWords qw( getStopWords ); use MyLibrary::Core; use strict; # main sub handler { # initalize my $r = shift; my $cgi = CGI->new; my $id = $cgi->param( 'id' ); my $html = ''; MyLibrary::Config->instance( INSTANCE ); # display default page if ( ! $id ) { # set-up output $html = &template; $html =~ s/##CONTENT##/&default/e; } # branch according to cmd else { # get the id's metadata; needs error checking my ( $title, $author, $filename, $url ) = &get_metadata( $id ); # get the cmd my $cmd = $cgi->param( 'cmd' ); # no command if ( ! $cmd ) { # display the home page $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/e; $html =~ s/##AUTHOR##/$author/e; $html =~ s/##FORM##/&form( $cgi )/e; $html =~ s/##ID##/$id/ge; $html =~ s/##SHOWMAP##//e; $html =~ s/##DIAGRAM##//e; $html =~ s/##MAP##//e; $html =~ s/##QUERY##//e; $html =~ s/##SUBFORM##//e; $html =~ s/##LINES##//e; } # display words starting with letter elsif ( $cmd eq 'letters' ) { my $i = $cgi->param( 'l' ); # set up my $collocations = Lingua::EN::Ngram->new( file => $filename ); my $words = $collocations->ngram( 1 ); my $lines = ''; foreach my $word ( sort keys %$words ) { my $l = substr $word, 0, 1; next if ( $i gt $l ); last if ( $i lt $l ); my $link = $cgi->a({ href => "./?cmd=search&id=$id&query=" . $word }, $word ); $lines .= $link . ' (' . $$words{ $word } . ');  '; } $lines = $cgi->p( $lines ); # set-up output $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/e; $html =~ s/##AUTHOR##/$author/e; $html =~ s/##FORM##/&form( $cgi )/e; $html =~ s/##ID##/$id/ge; $html =~ s/##QUERY##//e; $html =~ s/##SHOWMAP##//e; $html =~ s/##DIAGRAM##//e; $html =~ s/##MAP##//e; $html =~ s/##SUBFORM##//e; $html =~ s/##LINES##/$lines/e; } # display single words elsif ( $cmd eq 'words' ) { # get the number of words my $n = $cgi->param( 'n' ); # set up my $collocations = Lingua::EN::Ngram->new( file => $filename ); # do the work my $words = $collocations->ngram( 1 ); my $index = 0; my $lines = ''; my $stopwords = &getStopWords( 'en' ); foreach ( sort { $$words{ $b } <=> $$words{ $a } } keys %$words ) { # skip stopwords and punctuation next if ( $$stopwords{ $_ } ); next if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/ ); # limit the output $index++; last if ( $index > $n ); # gather the words my $link = $cgi->a({ href => "./?cmd=search&id=$id&query=" . $_ }, $_ ); $lines .= $link . ' (' . $$words{ $_ } . ');  '; } $lines = $cgi->p( $lines ); # set-up output $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/e; $html =~ s/##AUTHOR##/$author/e; $html =~ s/##FORM##/&form( $cgi )/e; $html =~ s/##ID##/$id/ge; $html =~ s/##QUERY##//e; $html =~ s/##SHOWMAP##//e; $html =~ s/##DIAGRAM##//e; $html =~ s/##MAP##//e; $html =~ s/##SUBFORM##//e; $html =~ s/##LINES##/$lines/e; } # display collocations elsif ( $cmd eq 'collocations' ) { # get the input: length of collocations my $n = $cgi->param( 'n' ); # initalize my $collocations = Lingua::EN::Ngram->new( file => $filename ); # count; branch according to how many my $bigrams = ''; my $count = ''; my $tscore = ''; if ( $n == 2 ) { $bigrams = $collocations->ngram( 2 ); $count = $collocations->tscore; } else { $count = $collocations->ngram( $n )} # process each count my $index = 0; my $lines = ''; my $stopwords = &getStopWords( 'en' ); foreach my $phrase ( sort { $$count{ $b } <=> $$count{ $a } } keys %$count ) { # get the tokens of the phrase my @tokens = split / /, $phrase; # process each token; filter based on it's value my $found = 0; foreach ( @tokens ) { # skip stop words for bigrams if ( $n == 2 ) { if ( $$stopwords{ $_ }) { $found = 1; last; } } # skip punctuation if ( $_ =~ /[,.?!:;()\-]/ or $_ =~ /^'/ or $_ =~ /'$/ ) { $found = 1; last; } # skip punctuation if ( $_ =~ /^'/ ) { $found = 1; last; } } # loop if found an unwanted token next if ( $found ); # limit the output $index++; last if ( $index > MAXCOLLOCATIONS ); last if ( $$count{ $phrase } == 1 ); # gather the words my $link = $cgi->a({ href => "./?cmd=search&id=$id&query=" . $phrase }, $phrase ); if ( $n == 2 ) { $lines .= $link . ' (' . $$bigrams{ $phrase } . ');  ' } else { $lines .= $link . ' (' . $$count{ $phrase } . ');  ' } } $lines = $cgi->p( $lines ); # set-up output $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/e; $html =~ s/##AUTHOR##/$author/e; $html =~ s/##FORM##/&form( $cgi )/e; $html =~ s/##QUERY##//e; $html =~ s/##SHOWMAP##//e; $html =~ s/##DIAGRAM##//e; $html =~ s/##MAP##//e; $html =~ s/##SUBFORM##//e; $html =~ s/##ID##/$id/ge; $html =~ s/##LINES##/$lines/e; } # implement concordance elsif ( $cmd eq 'search' ) { # get the query my $query = $cgi->param( 'query' ); # check for escaped urls because twitter hacks them; a reverse hack if ( $query =~ /%5C/ or $query =~ /%28/ or $query =~ /%7C/ ) { # unescape $query =~ s/%5C/\\/g; $query =~ s/%28/(/g; $query =~ s/%29/)/g; $query =~ s/%7C/\|/g; # redirect my $url = INFOMO . "/sandbox/concordance/?cmd=search&id=$id&query=$query"; print $cgi->header( -status => '303 See Other', -Location => $url ); # done return Apache2::Const::OK; } # build & configure concordance my $concordance = Lingua::Concordance->new; $concordance->text( &slurp( $filename )); $concordance->radius( RADIUS ); $concordance->query( $query ); my $radius = $cgi->param( 'radius' ) ? $cgi->param( 'radius' ) : $concordance->radius; $concordance->radius( $radius ); my $sort = $cgi->param( 'sort' ) ? $cgi->param( 'sort' ) : $concordance->radius; $concordance->sort( $sort ); # do the work my $lines = ''; my $index = 0; foreach my $line ( $concordance->lines ) { # build padding $index++; if ( $radius < 200 ) { my $spaces = ''; if ( length( $index ) == 1 ) { $spaces = ' ' } if ( length( $index ) == 2 ) { $spaces = ' ' } if ( length( $index ) == 3 ) { $spaces = ' ' } # format line $lines .= "$index.$spaces$line" . $cgi->br; } else { $lines .= $cgi->p( $index . '. ' . $line ) } } # format results, some more my $pattern = '\w+' . $query . '\w+|' . $query . '\w+|' . $query . '|\w+' . $query ; $lines =~ s|($pattern)|$1|gi; if ( $radius < 200 ) { $lines = $cgi->pre({ style => 'text-align: center' }, $lines )} # calculate and configure map $concordance->scale( 10 ); my $map = $concordance->map; my @keys = sort { $$map{ $b } <=> $$map{ $a }} keys %$map; my $greatest_value = $$map{ $keys[ 0 ]}; @keys = sort { $a <=> $b } keys %$map; my $values = ''; foreach ( @keys ) { $values .= $$map{ $_ } . ',' } $values = substr( $values, 0, -1 ); my $showmap = qq[Show map]; my $diagram = "Visualize results"; # set-up output $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/e; $html =~ s/##AUTHOR##/$author/e; $html =~ s/##FORM##/&form( $cgi )/e; $html =~ s/##SHOWMAP##/$showmap/e; $html =~ s/##DIAGRAM##/$diagram/e; $html =~ s/##ID##/$id/ge; $html =~ s/##QUERY##/$query/eg; $html =~ s/##MAP##/&map( $greatest_value, $values )/e; $html =~ s/##SUBFORM##/&subform( $radius, $sort )/e; $html =~ s/##LINES##/$lines/e; } } # done $r->content_type( 'text/html' ); $r->print( $html ); return Apache2::Const::OK; } # template sub template { return < Concordances ##CONTENT##

Author: Eric Lease Morgan <eric_morgan\@infomotions.com>
Date created: June 18, 2009
Date updated: September 12, 2010
URL: http://infomotions.com/sandbox/concordance/

EOT } # default sub default { return <Infomotions, Inc.Concordances

This is a set of concordances against content in the Alex Catalogue of Electronic Texts. You can use the concordances to analyze and evaluate texts quickly. Here are few texts that can be used as examples:

For more texts, search the Catalogue.

EOT } # home sub home { my $url = shift; return <Infomotions, Inc.Concordance for ##TITLE## / ##AUTHOR##

Use the features on this page to analyze and evaluate the text.

View full text ##FORM## ##MAP## ##LINES## EOT } # map sub map { my $g = shift; my $v = shift; my $q = shift; return < EOT } # form sub form { my $cgi = shift; my $script = $cgi->url; return < Words begining with:
Number of words:
Most frequent collocations:
Search: ##SHOWMAP## ##DIAGRAM## ##SUBFORM##

Specialized searches: colors; adverbs; gerunds; "big names"; "great ideas"

EOT } # update subform; retain selected values sub subform { my $radius = shift; my $sort = shift; my $subform = < Radius: Sort: None Left Right Match EOF # brute force dynamic updating; there's got to be a better way if ( $radius eq '40' ) { $subform =~ s/value="40" /value="40" selected="selected" / } elsif ( $radius eq '50' ) { $subform =~ s/value="50" /value="50" selected="selected" / } elsif ( $radius eq '60' ) { $subform =~ s/value="60" /value="60" selected="selected" / } elsif ( $radius eq '200' ) { $subform =~ s/value="200" /value="200" selected="selected" / } elsif ( $radius eq '500' ) { $subform =~ s/value="500" /value="500" selected="selected" / } elsif ( $radius eq '1000' ) { $subform =~ s/value="1000" /value="1000" selected="selected" / } else { $subform =~ s/value="30" /value="30" selected="selected" / } if ( $sort eq 'none' ) { $subform =~ s/value="none" /value="none" checked="checked" / } elsif ( $sort eq 'left' ) { $subform =~ s/value="left" /value="left" checked="checked" / } elsif ( $sort eq 'right' ) { $subform =~ s/value="right" /value="right" checked="checked" / } elsif ( $sort eq 'match' ) { $subform =~ s/value="match" /value="match" checked="checked" / } else { $subform =~ s/value="none" /value="none" checked="checked" / } # done return $subform; } # open a file named by the input and return its contents sub slurp { my $f = shift; open ( F, $f ) or die "Can't open $f: $!\n"; my $r = do { local $/; }; close F; return $r; } # given an id, return author, title, and filename sub get_metadata { my $root = ROOT; my $host = INFOMO; my $resource = MyLibrary::Resource->new( fkey => shift ); my @locations = $resource->resource_locations; my $filename = ''; my $url = ''; foreach my $location ( @locations ) { if ( $location->resource_location_type == PROXY ) { $filename = $location->location } if ( $location->resource_location_type == URL ) { $url = $location->location } } $filename =~ s|$host|$root|e; return ( $resource->name, $resource->creator, $filename, $url ); } # return true or die 1;