#!/usr/bin/perl

# gb2teiframework - convert text file into rudimentary tei file

# Eric Lease Morgan
# June 6, 2010 - initial investigations

# For every <book> element in the catalog (XML), read the plain-text file
# CORPUS/<id>.txt, wrap it in a TEI template, and write CORPUS/<id>.xml.

# require
use strict;
use warnings;
use XML::XPath;

# define
use constant CORPUS => '/var/www/html/main/sandbox/great-books/corpus/';
use constant XML    => '/var/www/html/main/sandbox/great-books/great-books.xml';

# initialize
my $parser = XML::XPath->new( filename => XML );
my $books  = $parser->find( '//book' );

foreach my $book ( $books->get_nodelist ) {

    # extract metadata
    my $identifier = $book->getAttribute( 'id' );
    my $title      = $book->findvalue( 'title' );
    my $creator    = $book->findvalue( 'author' );
    my $source     = $book->findvalue( 'original_url' );

    # build some metadata
    my $teiurl = CORPUS . $identifier . '.xml';

    # NOTE(review): this is a literal placeholder, not a date; confirm
    # whether a real date stamp is intended here.
    my $today = 'TODAY';

    # echo
    print "identifier: $identifier\n";

    # build the tei; placeholders are filled in the same order as before.
    # Every value is escaped first because it is copied into XML content,
    # where a bare & or < would make the result ill-formed.
    my $tei    = template();
    my @fields = (
        [ TITLE      => $title ],
        [ CREATOR    => $creator ],
        [ TEIURL     => $teiurl ],
        [ IDENTIFIER => $identifier ],
        [ SOURCE     => $source ],
        [ TODAY      => $today ],
    );
    foreach my $field ( @fields ) {
        my ( $name, $value ) = @{ $field };
        my $escaped = escape_entities( $value );
        $tei =~ s/##\Q$name\E##/$escaped/g;
    }

    # the body text goes in last, and only once, so that placeholder-like
    # strings inside a book are never expanded
    my $text = escape_entities( slurp( CORPUS . $identifier . '.txt' ) );
    $tei =~ s/##TEXT##/$text/;

    # save
    open my $out, '>', $teiurl or die "Can't open TEI: $!\n";
    print {$out} $tei;
    close $out or die "Can't close TEI: $!\n";

}

# done
exit;


# Open the file named by the input and return its entire contents as one
# string. Dies if the file cannot be opened.
sub slurp {

    my $f = shift;

    open my $fh, '<', $f or die "Can't slurp: $!\n";
    my $r = do { local $/; <$fh> };    # undefined $/ reads the whole file at once
    close $fh;

    return $r;

}


# Return a copy of the input with the five XML-reserved characters replaced
# by their predefined entities. An undefined input yields the empty string.
sub escape_entities {

    # get the input; force a plain string in case an object with
    # overloaded stringification was passed in
    my $s = shift;
    $s = defined $s ? "$s" : '';

    # escape; the ampersand must go first so the entities added by the
    # later rules are not escaped a second time
    $s =~ s/&/&amp;/g;
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s/"/&quot;/g;
    $s =~ s/'/&apos;/g;

    # done
    return $s;

}


# Return the TEI template, containing ##NAME## placeholders for the caller
# to fill in.
#
# NOTE(review): the element markup below is a reconstruction. The copy of
# this file under review had lost everything between angle brackets; only
# the text content survived, and it is reproduced here unchanged. Confirm
# the element names and the DOCTYPE against the canonical template. The
# caller also substitutes ##TEIURL## and ##SOURCE##, which do not appear in
# the surviving text; they presumably lived in markup that was lost.
sub template {

    return <<"EOF";
<?xml version="1.0"?>
<!DOCTYPE TEI.2 PUBLIC "-//TEI P4//DTD Main Document Type//EN" "http://www.tei-c.org/Guidelines/DTD/tei2.dtd" [
<!ENTITY % TEI.XML "INCLUDE">
<!ENTITY % TEI.prose "INCLUDE">
]>
<TEI.2>
<teiHeader>
<fileDesc>
<titleStmt>
<title>##TITLE##</title>
<author>##CREATOR##</author>
<respStmt><resp>converted into TEI-conformant markup by</resp> <name>Eric Lease Morgan</name></respStmt>
</titleStmt>
<publicationStmt>
<publisher>Infomotions, Inc.</publisher>
<address><addrLine>eric_morgan\@infomotions.com</addrLine></address>
<idno>##IDENTIFIER##</idno>
</publicationStmt>
</fileDesc>
<revisionDesc>
<change><date>##TODAY##</date> <respStmt><name>Eric Lease Morgan</name></respStmt> <item>initial TEI framework generated</item></change>
</revisionDesc>
</teiHeader>
<text>
<front>
<titlePage><docTitle><titlePart>##TITLE##</titlePart></docTitle> <byline>by <docAuthor>##CREATOR##</docAuthor></byline></titlePage>
</front>
<body>
<div1>
<p>
##TEXT##
</p>
</div1>
</body>
<back>
<div1 type="colophon">
<head>Colophon</head>
<p>This file was originally marked up using the Text Encoding Initiative XML markup language for use in an experiment/studuy colloquially called "How 'great' are the Great Books?" ( http://infomotions.com/sandbox/great-books/) by Eric Lease Morgan. It's Infomotions unique identifier is ##IDENTIFIER##.</p>
<p>Infomotions Man says, "Give back to the 'Net."</p>
</div1>
</back>
</text>
</TEI.2>
EOF

}