package Alex::Networkdiagram; # Networkdiagram.pm - visualize concordance using network diagram # Eric Lease Morgan # January 23, 2011 - first investigations; based on CGI script # define/configure use constant INFOMO => 'http://infomotions.com'; use constant INSTANCE => 'alex3'; use constant PROXY => 56234; use constant QUERY => 'love'; use constant RADIUS => 45; use constant ROOT => '/var/www/html/main'; use constant THRESHOLD => 5; use constant URL => 73038; use constant CONCORDANCE => '/sandbox/concordance/?cmd=search&##ID##&query=##QUERY##&radius=##RADIUS##'; # require use Apache2::Const -compile => qw( OK ); use CGI; use Lingua::Concordance; use Lingua::StopWords qw( getStopWords ); use MyLibrary::Core; use strict; # main sub handler { # initalize my $r = shift; my $cgi = CGI->new; my $id = $cgi->param( 'id' ); my $query = $cgi->param( 'q' ) ? $cgi->param( 'q' ) : QUERY; my $radius = $cgi->param( 'r' ) ? $cgi->param( 'r' ) : RADIUS; my $threshold = $cgi->param( 't' ) ? $cgi->param( 't' ) : THRESHOLD; my $html = ''; my $stopwords = &getStopWords( 'en' ); $$stopwords{ 'one' }++; my %matrix = (); MyLibrary::Config->instance( INSTANCE ); # display default page if ( ! $id ) { # set-up output $html = &template; $html =~ s/##CONTENT##/&default/e; $html =~ s/##DATA##//; $html =~ s/##JAVASCRIPT##//; } # do the work else { # get the id's metadata; needs error checking my ( $title, $author, $filename, $url ) = &get_metadata( $id ); # initalize some more my $corpus = &slurp( $filename ); # get initial words found near the query and sort them by frequency my $words = &concordance( $corpus, $query, $radius, $stopwords ); my @keys = sort { $$words{ $b } <=> $$words{ $a } } keys %$words; # process each word (key) below a particular threshold; build matrix of words for ( my $i = 0; $i < $threshold; $i++ ) { my $query = $keys[ $i ]; my $words = &concordance( $corpus, $query, $radius, $stopwords ); my @subkeys = ( sort { $$words{ $b } <=> $$words{ $a } } keys %$words ); my $coocurrances = &coocurances( $subkeys[ 0 ], $words, $threshold ); my @list = (); my $j = 0; my $key = ''; foreach ( sort { $$coocurrances{ $b } <=> $$coocurrances{ $a } } keys %$coocurrances ) { $j++; if ( $j == 1 ) { $key = $_ } push @list, $_; } $matrix{ $key } = [ @list ]; } # format matrix as table my $matrix = ''; foreach ( sort keys %matrix ) { $matrix .= ''; my $list = $matrix{ $_ }; foreach my $word ( @$list ) { $matrix .= "" } $matrix .= ''; } $matrix .= '
$word
'; # create an ordered list of the words in the matrix my %words = (); my $i = 0; foreach ( keys %matrix ) { my $list = $matrix{ $_ }; foreach my $word ( @$list ) { my $found = 0; foreach my $key ( keys %words ) { if ( $key eq $word ) { $found = 1 } } if ( ! $found ) { $words{ $word } = $i; $i++; } } } # build a list of nodes from the words for Protovis my $nodes = ''; foreach ( sort { $words{ $a } <=> $words{ $b } } keys %words ) { $nodes .= qq({nodeName:"$_"},) } chop $nodes; # build a list of links from the words for Protovis my $links = ''; foreach my $source ( keys %matrix ) { my $list = $matrix{ $source }; foreach ( my $i = 1; $i < $threshold; $i++ ) { $links .= qq({source:$words{ $$list[ $source ] },target:$words{ $$list[ $i ] }},); } } chop $links; # build the javascript and data; my $javascript = &same_breath; my $data = qq(\n); # display the page $html = &template; $html =~ s/##CONTENT##/&home( $url )/e; $html =~ s/##TITLE##/$title/eg; $html =~ s/##AUTHOR##/$author/eg; $html =~ s/##JAVASCRIPT##/$javascript/e; $html =~ s/##MATRIX##/$matrix/e; $html =~ s/##DATA##/$data/e; $html =~ s/##TITLEQUERY##/" :: $query"/eg; $html =~ s/##ID##/$id/eg; $html =~ s/##QUERY##/$query/eg; $html =~ s/##BREATH##/$radius/eg; $html =~ s/##DETAIL##/$threshold/eg; } # done $r->content_type( 'text/html' ); $r->print( $html ); return Apache2::Const::OK; } # template sub template { return < Concordance visualizations ##DATA## ##CONTENT##

Author: Eric Lease Morgan <eric_morgan\@infomotions.com>
Date created: January 23, 2011
Date updated: January 23, 2011
URL: http://infomotions.com/sandbox/diagrams/

EOT } # default sub default { return <Infomotions, LLCConcordance visualizations

This is a set of concordance visualizations against content in the Alex Catalogue of Electronic Texts. You can use them to analyze and evaluate texts quickly. Here are few texts that can be used as examples:

For more texts, search the Catalogue.
Delicious Bookmark this on Delicious

EOT } # home sub home { my $url = shift; return <

##TITLE##

Play with this page to literally see what was said in the same breath when a given word is used in the book ##TITLE##.


How to get the most out of this application:

  1. Enter or a word (or "regular expression") to locate in the book. This is your query.
  2. Change the size of the breath to increase or decrease the number of characters on either side of the query where co-occurances will be found. Values between 40 and 80 work well.
  3. Change the amount of detail to increase or decrease the number of co-occurances to identify for each query. Values between 4 and 7 work well.
  4. Adjust the breath and detail until the resulting diagram forms simple patterns with very few crossing lines.

The resulting graphic will tell you something about the text, and you will be doing "distant reading".

The visualization is based on the following matrix of terms, starting with and building upon "##QUERY##":

##MATRIX##

Zoom in and out to see detail. Drag nodes to simplify the diagram. Note enclosed polygons to "read" coherent thoughts. (Matrix)

Concordance

EOT } # given an id, return author, title, and filename sub get_metadata { my $root = ROOT; my $host = INFOMO; my $resource = MyLibrary::Resource->new( fkey => shift ); my @locations = $resource->resource_locations; my $filename = ''; my $url = ''; foreach my $location ( @locations ) { if ( $location->resource_location_type == PROXY ) { $filename = $location->location } if ( $location->resource_location_type == URL ) { $url = $location->location } } $filename =~ s|$host|$root|e; return ( $resource->name, $resource->creator, $filename, $url ); } sub concordance { my $corpus = shift; my $query = shift; my $radius = shift; my $stopwords = shift; my $subset = ''; my $concordance = Lingua::Concordance->new; $concordance->text( $corpus ); $concordance->query( $query ); $concordance->radius( $radius ); foreach ( $concordance->lines ) { $subset .= $_ . ' ' } if ( ! $subset ) { ¬found } $subset =~ tr/a-zA-Zà-ƶÀ-Ƶ'()\-,.?!;:/\n/cs; $subset =~ s/([,.?!:;()\-])/\n$1\n/g; $subset =~ s/\n+/\n/g; my @tokens = split /\n/, lc( $subset ); my %words = (); foreach ( @tokens ) { next if ( $_ =~ /[,.?!:;()\-]/ ); next if ( $$stopwords{ $_ } ); next if ( length( $_ ) < 3 ); $words{ $_ }++; } return \%words; } sub coocurances { my $query = shift; my $words = shift; my $threshold = shift; my $t = 0; my %coocurrances = (); foreach ( sort { $$words{ $b } <=> $$words{ $a } } keys %$words ) { $coocurrances{ $_ } = $$words{ $_ }; $t++; last if ( $t == $threshold ); } return \%coocurrances; } sub slurp { # open a file named by the input and return its contents my $f = @_[0]; open (F, "< $f"); my $r = do { local $/; }; close F; return $r; } sub same_breath { return < JAVASCRIPT } # return true or die 1;