package Alex; # search.cgi - search a solr/lucene index via a web browser # Eric Lease Morgan # December 20, 2008 - first investigations # December 22, 2008 - added facets, a pager, icons, and bunches o' stuff # January 31, 2009 - turned it into a mod_perl module # February 1, 2009 - added title, author, subject, and synonym queries # March 13, 2009 - put into production, for better or for worse # April 4, 2009 - added commify and expandable creator/subject facets # April 5, 2009 - sorted creator/subject facets; changed long links to words # August 16, 2009 - added browse by subject/tag # August 21, 2009 - fixed bad syntax bug; should get few, if not zero, 500 errors; Mom's birthday # September 24, 2009 - started adding snippets and sorting # October 2, 2009 - made dismax the defult; implemented filter queries; tweaked sorting # August 18, 2010 - added accordian widget to facets div; kewl eye candy # define use constant ROOT => '/var/www/html/main/alex'; use constant SOLR => 'http://localhost:8983/solr/alex'; use constant TEMPLATE => ROOT . '/etc/template.txt'; use constant RESULTS => ROOT . '/etc/results.txt'; use constant HOME => ROOT . '/etc/home.txt'; use constant ROWS => 25; use constant MINCOUNT => 2; use constant POSES => qw(n v a s r); use constant NAME_LIST => ROOT . '/etc/name_LETTER.txt'; use constant NAME_PAGE => ROOT . '/etc/name-page.txt'; use constant SUBJECT_LIST => ROOT . '/etc/subject_LETTER.txt'; use constant SUBJECT_PAGE => ROOT . '/etc/subject-page.txt'; use constant TITLE_LIST => ROOT . '/etc/titles_LETTER.txt'; use constant TITLE_PAGE => ROOT . '/etc/title-page.txt'; use constant FORM => ROOT . '/etc/form.txt'; use constant FORMSMALL => ROOT . '/etc/form-small.txt'; # require use Apache2::Const -compile => qw( OK ); use strict; use CGI; use Lingua::Wordnet; use WebService::Solr; require ( ROOT . '/lib/subroutines.pl' ); sub handler { # initalize my $r = shift; my $solr = WebService::Solr->new( SOLR ); my $cgi = CGI->new; my $html = ''; # get the command my $cmd = $cgi->param( 'cmd' ); # branch accordingly if ( ! $cmd ) { # display the home page and quit $html = &slurp( TEMPLATE ); $html =~ s/##CONTENT##/&slurp( HOME )/e; $html =~ s/##FORM##/&slurp( FORM )/ge; $html =~ s/##FORMSMALL##//ge; $html =~ s/##QUERY##//ge; } # subject/tag index elsif ( $cmd eq 'tags' ) { # get the list to display my $letter = $cgi->param( 'ltr' ); if ( ! $letter ) { $letter = chr( int( rand( 26 )) + 65 ) } my $subject_list = SUBJECT_LIST; $subject_list =~ s/LETTER/$letter/; # display the list $html = &slurp( TEMPLATE ); $html =~ s/##CONTENT##/&slurp( SUBJECT_PAGE )/e; $html =~ s/##TAGLIST##/&slurp( $subject_list )/e; $html =~ s/##FORMSMALL##/&slurp( FORMSMALL )/e; $html =~ s/##QUERY##//; $html =~ s/##GOOGLE##//ge; } # name index elsif ( $cmd eq 'names' ) { # get the list to display my $letter = $cgi->param( 'ltr' ); if ( ! $letter ) { $letter = chr( int( rand( 26 )) + 65 ) } my $name_list = NAME_LIST; $name_list =~ s/LETTER/$letter/; # display the list $html = &slurp( TEMPLATE ); $html =~ s/##CONTENT##/&slurp( NAME_PAGE )/e; $html =~ s/##NAMELIST##/&slurp( $name_list )/e; $html =~ s/##FORMSMALL##/&slurp( FORMSMALL )/e; $html =~ s/##QUERY##//; $html =~ s/##GOOGLE##//ge; } # title index elsif ( $cmd eq 'titles' ) { # get the list to display my $letter = $cgi->param( 'ltr' ); if ( ! $letter ) { $letter = chr( int( rand( 26 )) + 65 ) } my $title_list = TITLE_LIST; $title_list =~ s/LETTER/$letter/; # display the home page $html = &slurp(TEMPLATE); $html =~ s/##CONTENT##/&slurp( TITLE_PAGE )/e; $html =~ s/##FORMSMALL##/&slurp( FORMSMALL )/e; $html =~ s/##TITLELIST##/&slurp( $title_list )/e; $html =~ s/##QUERY##//; $html =~ s/##GOOGLE##//ge; } elsif ( $cmd eq 'search' ) { # get the input my $query = $cgi->param( 'query' ); my $start = $cgi->param( 'start' ); my $sort = $cgi->param( 'sort' ); my $params = $cgi->query_string; # calculate start if ( ! $start ) { $start = 0 } # calculate sort if ( ! $sort ) { $sort = 'score desc' } # build the search options my %search_options = (); $search_options{ 'start' } = $start; $search_options{ 'rows' } = ROWS; $search_options{ 'facet' } = 'true'; $search_options{ 'facet.field' } = [ 'type', 'facet_creator', 'facet_subject', 'repository' ]; $search_options{ 'facet.mincount' } = MINCOUNT; $search_options{ 'sort' } = $sort; if ( $cgi->param( 'fq') ) { $search_options{ 'fq' } = [ $cgi->param( 'fq') ] }; # do the work; search my $response = $solr->search( $query, \%search_options ); # types facet my %types = &get_facets( $response->facet_counts->{ facet_fields }->{ type } ); my $types_facet = ''; foreach ( sort( keys( %types ))) { my $link = $cgi->a({ href => "./?cmd=search&query=$query&fq=type:$_" }, $_ ); $types_facet .= '   ' . $cgi->img({ src => "./etc/$_.png" }) . " $link (" . $types{ $_ } . ')' . $cgi->br; } if ( ! $types_facet ) { $types_facet = "   (none)" } # repository facet my %repositories = &get_facets( $response->facet_counts->{ facet_fields }->{ repository } ); my $repositories_facet = ''; foreach ( sort( keys( %repositories ))) { my $link = $cgi->a({ href => "./?$params&fq=repository:" . '"' . $_ . '"' }, $_ ); my $repository_icon = ''; if ( $_ =~ /Alex/ or $_ =~ /Gallery/ ) { $repository_icon = $cgi->img({ src => "./etc/infomotions.ico", width => "16", height => "16" }) } if ( $_ =~ /DOAJ/ ) { $repository_icon = $cgi->img({ src => "./etc/doaj.png", width => "16", height => "16" }) } if ( $_ =~ /Gutenberg/ ) { $repository_icon = $cgi->img({ src => "./etc/gutenberg.ico", width => "16", height => "16" }) } if ( $_ =~ /Hathi/ ) { $repository_icon = $cgi->img({ src => "./etc/hathi.png", width => "16", height => "16" }) } if ( $_ =~ /Archive/ ) { $repository_icon = $cgi->img({ src => "./etc/archive.ico", width => "16", height => "16" }) } $repositories_facet .= "   $repository_icon $link (" . $repositories{ $_ } . ')' . $cgi->br; } if ( ! $repositories_facet ) { $repositories_facet = "   (none)" } # subjects facet my %subjects = &get_facets( $response->facet_counts->{ facet_fields }->{ facet_subject } ); my $subjects_facet = ''; my $subject_count = 0; foreach ( sort { $subjects{ $b } <=> $subjects{ $a } } keys( %subjects )) { next if ( ! $_ ); my $link = $cgi->a({ href => "./?$params&fq=facet_subject:" . '"' . $_ . '"' }, $_ ); $subject_count++; if ( $subject_count == 5 ) { $subjects_facet .= " more..." } $subjects_facet .= "$link (" . $subjects{ $_ } . ") "; } if ( $subject_count > 4 ) { $subjects_facet .= '' } if ( ! $subjects_facet ) { $subjects_facet = "(none)" } # creators facet my %creators = &get_facets( $response->facet_counts->{ facet_fields }->{ facet_creator } ); my $creators_facet = ''; my $creator_count = 0; foreach ( sort { $creators{ $b } <=> $creators{ $a } } keys( %creators )) { next if ( ! $_ ); my $link = $cgi->a({ href => "./?$params&fq=facet_creator:" . '"' . $_ . '"' }, $_ ); $creator_count++; if ( $creator_count == 5 ) { $creators_facet .= " more..." } $creators_facet .= "$link (" . $creators{ $_ } . ") "; } if ( $creator_count > 4 ) { $creators_facet .= '' } if ( ! $creators_facet ) { $creators_facet = "(none)" } # build the hitlist my @hits = $response->docs; my $hit_count = $response->content->{ response }->{ numFound }; my $hitlist = ''; my $highlights = $response->content->{ highlighting }; foreach my $doc ( @hits ) { # slurp my $id = $doc->value_for( 'id' ); my $title = $doc->value_for( 'title' ); my $publisher = $doc->value_for( 'publisher' ); my $creator = $doc->value_for( 'creator' ); #my $creator = ''; foreach ( @creator ) { $creator .= "$_; " } my $contributor = $doc->value_for( 'contributor' ); my $url = $doc->value_for( 'url' ); my @subjects = $doc->values_for('subject' ); my $subject = ''; foreach ( @subjects ) { $subject .= "$_; " } my $source = $doc->value_for( 'source' ); my $description = $doc->value_for( 'description' ); my $format = $doc->value_for( 'format' ); my $type = $doc->value_for( 'type' ); my $relation = $doc->value_for( 'relation' ); my $repository = $doc->value_for( 'repository' ); my $snippet = $highlights->{ $id }->{ fulltext }->[ 0 ]; $snippet .= ' ... ' . $highlights->{ $id }->{ fulltext }->[ 1 ]; $snippet =~ s{\s+}{ }g; # details my $sublist = ''; if ( $creator ) { $sublist .= $cgi->li( "creator(s) - $creator" ) } if ( $contributor ) { $sublist .= $cgi->li( "contributor - $contributor" ) } if ( $url ) { $sublist .= $cgi->li( "links - " . $cgi->a({ href => $url}, 'view document' )) } if ( $description ) { $sublist .= $cgi->li( "description - $description" ) } if ( $format ) { $sublist .= $cgi->li( "format - $format" ) } if ( $publisher ) { $sublist .= $cgi->li( "publisher - $publisher" ) } if ( $relation ) { if ( $relation =~ /^http:/ ) { $sublist .= $cgi->li( "relation - " . $cgi->a({ href => $relation }, 'local cache' ))} else { $sublist .= $cgi->li( "relation - $relation" )} } if ( $source ) { if ( $source =~ /^http:/ or $source =~ /^gopher:/ ) { $sublist .= $cgi->li( "source - " . $cgi->a({ href => $source }, 'original file' ))} else { $sublist .= $cgi->li( "source: $source" )} } if ( $subject ) { $sublist .= $cgi->li( "subject(s) - $subject" )} if ( $title ) { $sublist .= $cgi->li( "title - $title" )} if ( $type ) { $sublist .= $cgi->li( "type - $type" )} $sublist = $cgi->ul( {style => 'margin-bottom: 1em'}, $sublist ); $sublist = $cgi->div({ -id => $id, -style => 'display: none' }, $sublist ); # cute icons my $type_icon = $cgi->img({ src => "./etc/$type.png" }); my $repository_icon = ''; if ( $repository =~ /Alex/ or $repository =~ /Gallery/ ) { $repository_icon = $cgi->img({ src => "./etc/infomotions.ico", width => "16", height => "16" }) } if ( $repository =~ /DOAJ/ ) { $repository_icon = $cgi->img({ src => "./etc/doaj.png", width => "16", height => "16" }) } if ( $repository =~ /Gutenberg/ ) { $repository_icon = $cgi->img({ src => "./etc/gutenberg.ico", width => "16", height => "16" }) } if ( $repository =~ /Hathi/ ) { $repository_icon = $cgi->img({ src => "./etc/hathi.png", width => "16", height => "16" }) } if ( $repository =~ /Archive/ ) { $repository_icon = $cgi->img({ src => "./etc/archive.ico", width => "16", height => "16" }) } # hit list my $main_entry = $title; if ( $creator ) { $main_entry .= " / $creator" } #$hitlist .= $cgi->li({ style => 'margin-bottom: 1em' }, $type_icon . ' ' . $repository_icon . '  ' . $cgi->a({ href => $url, style => 'font-size: large' }, $main_entry ) . $cgi->br . $snippet . '  ' . $cgi->a( { -href => "javascript:expand('$id')", -style => 'color: grey; text-decoration: none' }, 'details...' ) . $sublist ); $hitlist .= $cgi->li({ style => 'margin-bottom: 1em' }, $cgi->a({ href => $url, style => 'font-size: large' }, $main_entry ) . $cgi->br . $snippet . '  ' . $cgi->a( { -href => "javascript:expand('$id')", -style => 'color: grey; text-decoration: none' }, 'details...' ) . $sublist ); } $hitlist = $cgi->ol( { start => ( $start + 1 )}, $hitlist ); my $query_suggestions = &suggest_queries( $cgi->param( 'query' ), $cgi ); my $synonyms = &suggest_synonyms( $cgi->param( 'query' ), $cgi ); # add filter query my $fq = ''; if ( $cgi->param( 'fq' )) { foreach ( $cgi->param( 'fq' )) { $fq .= "&fq=$_" } }; # sorts my $aaz = "./?cmd=search&query=$query&sort=sort_creator%20asc,sort_title%20asc$fq"; my $aza = "./?cmd=search&query=$query&sort=sort_creator%20desc,sort_title%20asc"; my $taz = "./?cmd=search&query=$query&sort=sort_title%20asc,sort_creator%20asc$fq"; my $tza = "./?cmd=search&query=$query&sort=sort_title%20desc,sort_creator%20asc$fq"; my $rnk = "./?cmd=search&query=$query$fq"; # display the results page and quit $html .= &slurp( TEMPLATE ); $html =~ s/##CONTENT##/&slurp( RESULTS )/e; $html =~ s/##FORM##/&slurp( FORM )/ge; $html =~ s/##FORMSMALL##//ge; $html =~ s/##QUERY##/$cgi->param( 'query' )/ge; $html =~ s/##NUMBEROFHITS##/&commify( $hit_count )/ge; $html =~ s/##HITLIST##/$hitlist/ge; $html =~ s/##TYPES##/$types_facet/ge; $html =~ s/##CREATORS##/$creators_facet/ge; $html =~ s/##SUBJECTS##/$subjects_facet/ge; $html =~ s/##REPOSITORIES##/$repositories_facet/ge; $html =~ s/##PAGER##/&pager( '?cmd=search&query=' . $cgi->param( 'query' ) . "&sort=$sort". '&start=', $start, $hit_count, $cgi )/e; $html =~ s/##QUERYSUGGESTIONS##/$query_suggestions/e; $html =~ s/##SYNONYMS##/$synonyms/e; $html =~ s/##AAZ##/$aaz/e; $html =~ s/##AZA##/$aza/e; $html =~ s/##TAZ##/$taz/e; $html =~ s/##TZA##/$tza/e; $html =~ s/##RNK##/$rnk/e; } # done $r->content_type( 'text/html' ); $r->print( "$html" ); return Apache2::Const::OK; } sub pager { # get the input my $link = shift; my $start = shift; my $size = shift; my $cgi = shift; # create the links my $previous = $cgi->a({ -href => $link . ( $start - ROWS ) }, 'Previous' ) . '  '; my $next = '  ' . $cgi->a({ -href => $link . ( $start + ROWS ) }, 'Next' ); # build the pager my $pager; if ( $start - ROWS >= 0 ) { $pager = $previous } if ( $start + ROWS < $size ) { $pager .= $next } # done return $pager; } =cut sub enhance_query { my $query = shift; while ( $query =~ /^ / ) { $query =~ s/^ // } while ( $query =~ / $/ ) { $query =~ s/ $// } while ( $query =~ / / ) { $query =~ s/ / / } $query =~ s/=/:/g; $query =~ s/'/"/g; $query =~ s/ and / AND /g; $query =~ s/ or / OR /g; $query =~ s/ not / NOT /g; if ($query =~ /\s/) { if (( $query =~ / AND / ) | ( $query =~ / OR / ) | ($ query =~ / NOT / )) { } elsif ( $query =~ /:/ ) { } elsif ( $query !~ /"/ ) { # try to make queries with no syntactical sugar a bit "smarter" my @terms = split / /, $query; my $enhancement = ''; for (my $i; $i <= $#terms; $i++) { if ($i < $#terms) { $enhancement .= $terms[ $i ] . ' AND ' } else { $enhancement .= $terms[ $i ] } } $query = '"' . $query . '"' . " OR ($enhancement)"; } } return $query; } =cut sub suggest_queries { my $query = shift; my $cgi = shift; # normalize the query $query =~ s/ =/=/g; $query =~ s/= /=/g; while ( $query =~ / / ) { $query =~ s/ / / } my $suggestions; if ( $query !~ / / && $query !~ /=/ ) { $suggestions = '   ' . $cgi->a({ href => './?cmd=search&query=subject=' . $query }, 'Subject tags' ) . $cgi->br; $suggestions .= '   ' . $cgi->a({ href => './?cmd=search&query=creator=' . $query }, 'Author names' ) . $cgi->br; $suggestions .= '   ' . $cgi->a({ href => './?cmd=search&query=title=' . $query }, 'Title words' ) . $cgi->br; } elsif ( $query !~ / / && $query =~ /creator/ ) { $query =~ s/creator=//; $suggestions = '   ' . $cgi->a({ href => './?cmd=search&query=subject=' . $query }, 'Subject tags' ) . $cgi->br; $suggestions .= '   ' . $cgi->a({ href => './?cmd=search&query=title=' . $query }, 'Title words' ) . $cgi->br; } elsif ( $query !~ / / && $query =~ /title/ ) { $query =~ s/title=//; $suggestions = '   ' . $cgi->a({ href => './?cmd=search&query=subject=' . $query }, 'Subject tags' ) . $cgi->br; $suggestions .= '   ' . $cgi->a({ href => './?cmd=search&query=creator=' . $query }, 'Author names' ) . $cgi->br; } elsif ( $query !~ / / && $query =~ /subject/ ) { $query =~ s/subject=//; $suggestions = '   ' . $cgi->a({ href => './?cmd=search&query=creator=' . $query }, 'Author names' ) . $cgi->br; $suggestions .= '   ' . $cgi->a({ href => './?cmd=search&query=title=' . $query }, 'Title words' ) . '
'; } elsif ( $query =~ / / && $query =~ /=/ ) { my @words = &tokenize_query( $cgi->param( 'query' )); my $field_search; if ( $cgi->param( 'query' ) =~ /subject/ ) { foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "creator=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Author names' ) . $cgi->br; $field_search = ''; foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "title=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Title words' ) . $cgi->br; } if ( $cgi->param( 'query' ) =~ /creator/ ) { foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "subject=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Subject tags' ) . $cgi->br; $field_search = ''; foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "title=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Title words' ) . $cgi->br; } if ( $cgi->param( 'query' ) =~ /title/ ) { foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "subject=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Subject tags' ) . $cgi->br; $field_search = ''; foreach ( @words ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "creator=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Author name' ) . $cgi->br; } } elsif ( $query =~ / / && $query !~ /=/ ) { # further normalization $query =~ s/"//g; $query =~ s/'//g; while ( $query =~ / / ) { $query =~ s/ / / } # get each query term my @terms = split / /, $query; # build subject search my $field_search = ''; foreach ( @terms ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "subject=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Subject tags' ) . $cgi->br; # author search $field_search = ''; foreach ( @terms ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "creator=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Author names' ) . $cgi->br; # title search $field_search = ''; foreach ( @terms ) { next if ( $_ eq 'AND' or $_ eq 'OR' or $_ eq 'NOT' );; $field_search .= "title=$_ AND "; } $field_search =~ s/ AND $//; $suggestions .= '   ' . $cgi->a({ href => "./?cmd=search&query=$field_search"}, 'Title words' ) . $cgi->br; } # done return $suggestions; } sub tokenize_query { my $words = shift; # clean it $words =~ s/=//g; $words =~ s/title//g; $words =~ s/creator//g; $words =~ s/subject//g; $words =~ s/\*//g; $words =~ s/'//g; $words =~ s/"//g; $words =~ s/"//g; while ( $words =~ / / ) { $words =~ s/ / / } # return an array of words return split / /, $words; } sub suggest_synonyms { my @words = &tokenize_query( shift ); my $cgi = shift; # initialize my @synonyms = (); my %seen = (); my $wn = new Lingua::Wordnet; foreach my $word ( @words ) { foreach my $pos ( POSES ) { my @synsets = $wn->lookup_synset( $word, $pos ); foreach my $synset (@synsets) { foreach ( $synset->words ) { s/\%\d+$//; s/\_/ /g; push ( @synonyms, $_ ) unless $seen{ $_ }++; } } } } @synonyms = sort @synonyms; my $list = ''; foreach ( @synonyms ) { if ( $_ =~ / / ) { $list .= $cgi->a({ href => qq|./?cmd=search&query="$_"| }, $_ ) . '; ' } else { $list .= $cgi->a({ href => "./?cmd=search&query=$_" }, $_ ) . '; ' } } $list =~ s/; $//; if ( ! $list ) { $list = "(none)" } return $list; } sub commify { # commify a number. Perl Cookbook, 2.17, p. 64 my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text; } # return true or die 1;