#February 5, 2001: Minor changes to layout of comments # Fixed a bug that caused duplicate typos in plaintext # Incorporated level 8T # Still named reduce8.pl; older ones have descriptive names added #January 18, 2001: Changed line numbers to use SPAN tags. #November 28, 2000: now turns xml entity "&ersand;" into html "&". #November 23, 2000: this version renders the XML entities I've changed in the # massagex #November 21, 2000: Inoutdelete now fixed. #November 15, 2000: now reduce71.pl: I've changed the paragraph entity to # "

     ", which works and # indents #November 9, 2000: now called reduce7.pl, Nathan and I are in synch on this. # All entities are Unicode where applicable. # October 30, 2000: reduce61.pl: Inoutdelete as ⊂ ...5 (display ?? in # Netscape 5). # also has paragraph entity "par" which it prints as #

#November 3, 2000: illegibleword and illegibleletter also unicoded # and also insertmarkmissing and insertmissing: should be # all of them. #June 2, 2000: Nathan sends this as newReduce5.pl, fixing italics. # I modify it from the "short" form to the "letov" form, and # change other displays. #June 5, 2000: now with code fixing dpunc too # # reduce*.pl - current revision of the reduction script # # # Some enumerated types and related chunks of data # # Names for operating modes &enum(<<'EOL'); $geneticsMode,$plaintextMode EOL # Names for the possible output states &enum(<<'EOL'); $printingState,$nonprintingState,$tnState EOL # The possible flavors of deletions &enum(<<'EOL'); $implicitDeletion,$explicitDeletion EOL # Token types &enum(<<'EOL'); $tagToken,$printableToken,$entityToken EOL # Numeric identifiers for the levels... &enum(<<'EOL'); $lev0=0,$lev1,$lev1plus,$lev2,$lev3,$lev3plus,$lev4,$lev4plus, $lev5,$lev5prime,$lev5plus,$lev5plusprime,$lev5ddag,$lev5tdag, $lev6,$lev7,$lev7plus,$lev8,$lev8T,$lev8plus,$lev8plusT,$lev9,$lev9T, $lev10,$lev10T,$lev11,$lev11T,$lev12star,$lev13 EOL # ...and the associative array for their names %levelCodes = ( "0",$lev0, "1",$lev1, "1+",$lev1plus, "2",$lev2, "3",$lev3, "3+",$lev3plus, "4",$lev4, "4+",$lev4plus, "5",$lev5, "5'",$lev5prime, "5+",$lev5plus, "5+'",$lev5plusprime, "5''",$lev5ddag, "5'''",$lev5tdag, "6",$lev6, "7",$lev7, "7+",$lev7plus, "8",$lev8, "8T",$lev8T, "8+",$lev8plus, "8+T",$lev8plusT, "9",$lev9, "9T",$lev9T, "10",$lev10, "10T",$lev10T, "11",$lev11, "11T",$lev11T, "12*",$lev12star, "13",$lev13 ); # Irritatingly, we need the inverse association as well. This is a # major workpoint. @levelNames = ( "0", "1", "1+", "2", "3", "3+", "4", "4+", "5", "5'", "5+", "5+'", "5''", "5'''", "6", "7", "7+", "8", "8T", "8+", "8+T", "9", "9T", "10", "10T", "11", "11T", "12*", "13" ); # # Variable initializations # ############################################################################## # Configurable parameters; control the operation of the program # # by changing these values. The values for $operatingMode are # # $geneticsMode and $plaintextMode. # # # # # # These are default values; they can be overriden from the # # command line on a PC or the options dialog on a Mac. # # # # Usage: newReduce.pl -[gp] -m -M [-s] [-d] [-t] [-T] # # [-l ] "input file" # # # # -g for genetics, -p for plaintext # # is the minimum level, the maximum level # # -s to run silently # # -d to show double punctuation # # -t to treat DM/IM revisions as TDs # # -T to show typos # # -l to specify the line number interval (0 means no line numbers) # # # ############################################################################## $targetMinLevel=$lev0; $targetMaxLevel=$lev13; $operatingMode=$geneticsMode; $runSilent=0; $lineNumInterval=5; $showDoublePunc=0; $treatDMAsTD=0; $showTypos=0; if ($^O eq "MacOS") { MacPerl::LoadExternals("newReduce.rsrc"); #MacPerl::LoadExternals("fw-XCMD"); $configString=&Options(); #use Mac::AppleEvents; #my $optionsOK; #$AppleEvent{'McPL','DATA'} = \&receiveOptions; #while (!$optionsOK) {sleep(1)} } else { $configString=join(' ',@ARGV); } if ($configString =~ /-p/) { if ($configString =~ /-g/) { print STDERR "Conflicting arguments -p and -g\n"; exit 1; } $operatingMode=$plaintextMode; } if ($configString =~ /-m (\S+)/) { $targetMinLevel=$levelCodes{$1}; } if ($configString =~ /-M (\S+)/) { $targetMaxLevel=$levelCodes{$1}; } if ($configString =~ /-s/) { $runSilent=1; } if ($configString =~ /-t/) { $treatDMAsTD=1; } if ($configString =~ /-T/) { if ($operatingMode==$geneticsMode) { $showTypos=1; # showTypos has no effect in plaintext } } if ($configString =~ /-d/) { $showDoublePunc=1; } if ($configString =~ /-l ([0-9]+)/) { $lineNumInterval=$1; } if ($configString =~ /"(.*)"$/) { $infileName=$1; } elsif ($configString =~ /(\S+)$/) { $infileName=$1; } else { print STDERR "Couldn't find infile name\n"; exit 1; } ################################ # End of configurable parameters ################################ $tagNestingDepth=0; $overlayDepth=0; $tokenIndex=0; $tagIndex=0; $tnIndex=1; # because TN 0 looks funny $typoIsPending=0; open(INFILE,$infileName) || die "Couldn't open input file $infileName"; if ($infileName=~/([^\.]*)\./) { $outfileName=$1.".html"; } else { $outfileName=$infileName.".html"; } #open(OUTFILE,">$outfileName") || die "Couldn't open output file $outfileName"; print "\n"; print ""; print "Output from $infileName ("; if ($operatingMode==$geneticsMode) { print "genetics, minimum level $levelNames[$targetMinLevel], "; print "maximum level $levelNames[$targetMaxLevel])"; } else { print "plaintext, level $levelNames[$targetMinLevel])"; } print "\n"; print "\n\n

\n"; $textRoot = { tag => "", parent => 0, childIndex => 0, numChildren => 0, startText => "", selfText => "", endText => "", minLevel => $lev0, maxLevel => $lev13, modifierMaxLevel => $lev13, dontPrintChildren => 0, }; $curTextObj=$textRoot; $stackDepth=0; # It doesn't make sense to recover plaintext for more than one # level. if ($operatingMode != $geneticsMode) { $targetMaxLevel=$targetMinLevel; } # main loop while (($returnValue=getNextToken())==1) { if ($currentTokenType==$printableToken) { $curTextObj->{childList}[$curTextObj->{numChildren}++] =freshTextObject($currentToken); } elsif ($currentTokenType==$entityToken) { $curTextObj->{childList}[$curTextObj->{numChildren}++] =freshTextObject(entityToText($currentToken)); } elsif ($currentTokenType==$tagToken) { if ($currentToken !~ /^\//) { # open tag $stackDepth++; $curTextObj->{childList}[$curTextObj->{numChildren}++] =newNestedTextUnit($currentToken); $curTextObj=$curTextObj->{childList} [$curTextObj->{numChildren}-1]; # down the stack } else { #close tag $stackDepth--; $curTextObj=$curTextObj->{parent}; # up the stack } } else { print STDERR "Tokenizer bug! Attempting to continue...\n"; } } printTextObj($curTextObj,0,$curTextObj->{numChildren},0); # Now print the endnotes print "

\n

Textual Notes

\n\n"; for ($i=1; $i<$tnIndex; $i++) { print "

$i$tnText[$i]

\n\n"; } print "\n\n\n"; close INFILE; #close OUTFILE; exit 0; # to recover Bill's DOS prompt # # And there ends the main flow of the program; everything from this # point forward is a subroutine. # ############################### # Options event handler for Mac ############################### sub receiveOptions { if ($^O ne "MacOS") { return; } my($event, $reply, $desc, $n) = @_; $desc = AEGetParamDesc($event, keyDirectObject()); $n = AEPrint($desc); AEPutParam($reply, keyDirectObject(), typeChar(), sprintf('%.f + %.f = %.f', $n, $n, $n+$n) ); print AEPrint($event), "\n"; print AEPrint($reply), "\n"; AEDisposeDesc($desc); $optionsOK = 1; 0; } ###################### # Print a text object. ###################### sub printTextObj { my $theObj=$_[0]; my $appendToTN=$_[1]; my $returnValue=$_[2]; my $unconditionalOnly=$_[3]; my $i,$locNumChildren,$locChild; my $locChildIndex; my $locChildText,$locFirstWord; my $locContainedTypo,$locContainedDeletion; my $locMatchLength; if ($theObj->{dontPrintChildren}==1) { return $returnValue; } $locNumChildren=$theObj->{numChildren}; if ($theObj->{tag} eq "TN") { #################################################### # The following condition determines if the interval # [$targetMinLevel,$targetMaxLevel] and the interval # [$theObj->{minLevel},$theObj->{maxLevel}] overlap. #################################################### if (($targetMaxLevel >= $theObj->{minLevel}) && ($targetMinLevel <= $theObj->{maxLevel})) { emit("TN$tnIndex"); # Append TN contents to the array for ($i=0; $i < $locNumChildren; $i++) { $locChild=$theObj->{childList}[$i]; $locNumChildren=printTextObj($locChild,1,$locNumChildren,0); } $tnIndex++; } } else { ########### # Here too. ########### if (($targetMaxLevel >= $theObj->{minLevel}) && ($targetMinLevel <= $theObj->{maxLevel})) { if ($appendToTN==1) { $tnText[$tnIndex].=$theObj->{startText}; if ($locNumChildren==0) { $tnText[$tnIndex].=$theObj->{selfText}; } else { for ($i=0; $i < $locNumChildren; $i++) { $locChild=$theObj->{childList}[$i]; $locNumChildren=printTextObj($locChild,1,$locNumChildren,0); } } $tnText[$tnIndex].=$theObj->{endText}; } else { # Fall out immediately in the peculiar case that this revision # is a typo and typos are being suppressed if (($operatingMode==$plaintextMode) || (!$showTypos) && ($theObj->{tag} eq "REVISION")) { for ($i=0; $i<$locNumChildren; $i++) { $locChild=$theObj->{childList}[$i]; if ($locChild->{tag} eq "TYPO") { return $returnValue; } } } if ((($operatingMode==$geneticsMode) && ($targetMinLevel <= $theObj->{modifierMaxLevel})) || ($theObj->{unconditionalText}==1)) { emit($theObj->{startText}); } if ($locNumChildren==0) { emit($theObj->{selfText}); } else { for ($i=0; $i < $locNumChildren; $i++) { $locChild=$theObj->{childList}[$i]; $locNumChildren=printTextObj($locChild,0,$locNumChildren,0); } } if ((($operatingMode==$geneticsMode) && ($targetMinLevel <= $theObj->{modifierMaxLevel})) || ($theObj->{unconditionalText}==1)) { emit($theObj->{endText}); } } # This "else" is counterintuitive. We print the children even if # the parent failed the level check; this is for things like line # numbers whose levels are not related to those of their parents. } else { for ($i=0; $i < $locNumChildren; $i++) { $locChild=$theObj->{childList}[$i]; $locNumChildren=printTextObj($locChild,0,$locNumChildren,1); } } } $returnValue; } sub isFirstNonspaceChild { my $locThePtr=$_[0]; my $locLoopVar; $locLoopVar=$locThePtr->{childIndex}-1; while ($locLoopVar >= 0) { $locIsSpace=&isspace($locThePtr->{parent}->{childList}[$locLoopVar]->{selfText}); if (($locThePtr->{parent}->{childList}[$locLoopVar]->{tag} ne "") || ($locIsSpace==0)){ last; } $locLoopVar--; } if ($locLoopVar < 0) { 1; } else { 0; } } ################################################################# # Climb through the hierarchy to determine the appropriate levels # to inherit from containing objects. ################################################################# sub stackClimb { my $locMinLevel=$_[0],$locMaxLevel=$_[1]; my $locTempPtr=$curTextObj; my $locNothing=0; my $locLoopVar=0; my $locIsSpace=0; # All this code is to handle the exceptional case of deletions. # It walks up the hierarchy looking for a containing deletion; # if it finds one, it either inherits its minimum level from # the deletion itself, or (if the deletion is the first element # in a revision) from the parent of the containing revision. while (($locTempPtr != 0) && ($locTempPtr->{tag} ne "DELETION")) { $locTempPtr=$locTempPtr->{parent}; } $locSavedMinLevel=$locTempPtr->{minLevel}; $locSavedMaxLevel=$locTempPtr->{maxLevel}; if ($locTempPtr->{tag} eq "DELETION") { # Some tags have no level semantics and shouldn't interfere with the # climbing process. That is, in ... the # deletion is the first nonblank child in spite of the containing . while ((($locTempPtr->{parent}->{tag} eq "TYPO") || ($locTempPtr->{parent}->{tag} eq "FIRST") || ($locTempPtr->{parent}->{tag} eq "DOUBTFUL") || ($locTempPtr->{parent}->{tag} eq "CORRECTION") || ($locTempPtr->{parent}->{tag} eq "OVERLAY")) && (&isFirstNonspaceChild($locTempPtr) != 0)) { $locTempPtr=$locTempPtr->{parent}; } if ($locTempPtr->{parent}->{tag} eq "REVISION") { if (&isFirstNonspaceChild($locTempPtr) != 0) { $locMinLevel=$locTempPtr->{parent}->{parent}->{minLevel}; } else { $locMinLevel=$locTempPtr->{parent}->{minLevel}; } } # The max and min levels of the parent deletion should be the same. $locMaxLevel=$locSavedMinLevel; if ($operatingMode==$plaintextMode) { $locMaxLevel--; } } # Revisions also behave strangely. If we're inside a revision, # the min level is inherited from the revision, and the max # level is either $lev13 or inherited from the first containing # deletion. # Here "inside" means "immediately inside" (TBR). If there's an # intervening tag (, , or whatever), we should inherit # the correct levels from that. $revPtr=$curTextObj; if ($revPtr->{tag} eq "REVISION") { $locTempPtr=$revPtr; while (($locTempPtr != 0) && ($locTempPtr->{tag} ne "DELETION")) { $locTempPtr=$locTempPtr->{parent}; } if ($locTempPtr->{tag} eq "DELETION") { $locMaxLevel=$locTempPtr->{maxLevel}; } else { $locMaxLevel=$lev13; } } { min=>$locMinLevel, max=>$locMaxLevel, }; } ############################## # Create a new textual object. ############################## sub freshTextObject { my $theText=$_[0]; my $locMinLevel; my $locMaxLevel; my $locTempPtr=$curTextObj; my $locDummyHash; # Special case: If the immediate parent is a LINENUM tag, this text is a # line number and is processed accordingly. if ($curTextObj->{tag} eq "LINENUM") { if ($lineNumInterval==0) { $theText=""; } else { if ($theText =~ /([0-9]+)$/) { if ($1 % $lineNumInterval != 0) { $theText=""; } } } } # Another special case: If the immediate parent is a DPUNC, theText # is left blank unless double punctuation is on. if ($curTextObj->{tag} eq "DPUNC") { if ($showDoublePunc==0) { $theText=""; } } $locMinLevel=$curTextObj->{minLevel}; $locMaxLevel=$curTextObj->{maxLevel}; $locDummyHash=&stackClimb($locMinLevel,$locMaxLevel); $locMinLevel=$locDummyHash->{min}; $locMaxLevel=$locDummyHash->{max}; { tag => "", parent => $curTextObj, childIndex => $curTextObj->{numChildren}, numChildren => 0, childList => [], startText => "", selfText => $theText, endText => "", unconditionalText => 0, minLevel => $locMinLevel, maxLevel => $locMaxLevel, modifierMaxLevel => $locMaxLevel, dontPrintChildren => 0, }; } ########################################################### # Create a new text unit nested down a level. The argument # is an open tag, including any attributes. ########################################################### sub newNestedTextUnit { my $theText=$_[0]; my $theTag,$theAttrs; my $locStartText="",$locEndText=""; my $locMinLevel,$locMaxLevel,$locModMaxLevel; my $locTypo=0,$locUnconditionalText=0; my $locDummyHash; my $locDontPrintChildren=0; if ($runSilent==0) { ### $runsilent is probably a command line option to disable verbose output. ### manually disable here to allow data to be printed to STDOUT instead of ### a file. # print "Tag $theText\n"; } if ($theText =~ /^(\S+)\s+(.+)$/) { # If there are attributes... $theTag=$1; $theAttrs=$2; } else { $theTag=$theText; } $locMinLevel=$curTextObj->{minLevel}; $locMaxLevel=$curTextObj->{maxLevel}; # usually right if ($theTag eq "REVISION") { if ($theAttrs =~ /LEVEL="([^"]*)"/) { $locMinLevel=$levelCodes{$1}; } else { print STDERR "Revision without a level!\n"; return; } $locStartText=" [$levelNames[$locMinLevel]"; if ($theAttrs =~ /TD="YES"/) { $locStartText .= " TD"; } if ($theAttrs =~ /IM="YES"/) { if ($treatDMAsTD==0) { $locStartText .= " IM"; } else { $locStartText .= " TD"; } } if ($theAttrs =~ /CONTINUATION="YES"/) { $locStartText .= " -"; } $locStartText .= " "; $locEndText=" "; if ($theAttrs =~ /CONTINUED="YES"/) { $locEndText .= "- "; } $locEndText .= $levelNames[$locMinLevel]."] "; if ($curTextObj->{parent}==0) { $locMaxLevel=$lev13; } $locModMaxLevel=$locMinLevel; } elsif ($theTag eq "DELETION") { # Typos need to be handled specially here if (($showTypos) && ($curTextObj->{tag} eq "TYPO")) { # Not sure what happens here } else { # For a deletion, both min and max are the min of the parent revision. # This assures that the start and end text will only appear # if the level at which the deletion took place is being # recovered. $locTempPtr=$curTextObj; while (($locTempPtr != 0) && ($locTempPtr->{tag} ne "REVISION")) { $locTempPtr=$locTempPtr->{parent}; } $locMinLevel=$locTempPtr->{minLevel}; $locMaxLevel=$locTempPtr->{minLevel}; $locModMaxLevel=$locMaxLevel; } $locEndText=""; if ($theAttrs =~ /TYPE="IMPLICIT"/) { $locStartText="<"; $locEndText .= ">"; } $locStartText .= ""; } elsif ($theTag eq "FIRST") { if ($curTextObj->{tag} ne "DELETION") { $prevLevel=$curTextObj->{minLevel}; } else { $outerContainer=$curTextObj; while (($outerContainer != 0) && ($outerContainer->{tag} ne "REVISION")) { $outerContainer=$outerContainer->{parent}; } if ($outerContainer != 0) { $prevLevel=$outerContainer->{parent}->{minLevel}; } else { $prevLevel=$lev0; } $locMaxLevel=$curTextObj->{maxLevel}; $locModMaxLevel=$locMaxLevel; } $locMinLevel=$lev0; if ($targetMinLevel==0) { $locStartText=""; $locEndText=""; } else { $locStartText=""; $locEndText=""; } } elsif ($theTag eq "INOUTDEL") { if ($operatingMode==$geneticsMode) { $locMinLevel=$curTextObj->{minLevel}; $locMaxLevel=$curTextObj->{minLevel};#was maxlevel, fixed November 21, 2000 $locModMaxLevel=$locMaxLevel; $locStartText="⊂"; #was {} $locEndText="⊃"; } else { $locMinLevel=$lev13+1; $locMaxLevel=$lev13+1; } } elsif ($theTag eq "OVERLAY") { if ($theAttrs =~ /DEPTH=([0-9]*)/) { $overlayDepth=$1; } else { print "Warning: Overlay without depth\n"; } if ($overlayDepth % 2 == 1) { $overlayStart=">"x$overlayDepth; $overlayEnd="<"x$overlayDepth; } else { $overlayStart="»"x($overlayDepth/2); $overlayEnd="«"x($overlayDepth/2); } $locStartText="". $overlayStart.""; $locEndText="". $overlayEnd.""; $locDummyHash=&stackClimb($locMinLevel,$locMaxLevel); $locMinLevel=$locDummyHash->{min}; $locMaxLevel=$locDummyHash->{max}; $locModMaxLevel=$locMaxLevel; } elsif ($theTag eq "LINENUM") { $locUnconditionalText=1; $locStartText=""; $locEndText=""; $locMinLevel=$lev0; $locMaxLevel=$lev13; $locModMaxLevel=$locMaxLevel; } elsif ($theTag eq "TYPO") { $locStartText = " TYPO"; $locEndText = " TYPO] "; if ($showTypos==0) { $locMinLevel=$lev13+1; $locMaxLevel=$lev13+1; $locDontPrintChildren=1; } } elsif ($theTag eq "CORRECTION") { # if typos are not being suppressed, set the correction to never show if (($operatingMode==$geneticsMode) && ($showTypos==1)) { $locMinLevel=$lev13+1; $locMaxLevel=$lev13+1; $locDontPrintChildren=1; } } elsif ($theTag eq "DPUNC") { # do nothing } elsif ($theTag eq "TN") { # just check for FIRSTLEVEL and LASTLEVEL attrs if ($theAttrs =~ /FIRSTLEVEL="([^"]*)"/) { $locMinLevel=$levelCodes{$1}; } if ($theAttrs =~ /LASTLEVEL="([^"]*)"/) { $locMaxLevel=$levelCodes{$1}; $locModMaxLevel=$locMaxLevel; } } elsif ($theTag eq "DOUBTFUL") { $locUnconditionalText=1; $locStartText="?"; $locEndText="?"; $locDummyHash=&stackClimb($locMinLevel,$locMaxLevel); $locMinLevel=$locDummyHash->{min}; $locMaxLevel=$locDummyHash->{max}; $locModMaxLevel=$locMaxLevel; } else { # Assume it's an HTML tag $locUnconditionalText=1; $locStartText="<".$theText.">"; $locEndText=""; $locDummyHash=&stackClimb($locMinLevel,$locMaxLevel); $locMinLevel=$locDummyHash->{min}; $locMaxLevel=$locDummyHash->{max}; $locModMaxLevel=$locMaxLevel; } { tag => $theTag, parent => $curTextObj, childIndex => $curTextObj->{numChildren}, numChildren => 0, childList => [], startText => $locStartText, selfText => "", endText => $locEndText, unconditionalText => $locUnconditionalText, minLevel => $locMinLevel, maxLevel => $locMaxLevel, modifierMaxLevel => $locModMaxLevel, dontPrintChildren => $locDontPrintChildren, }; } ####################################### # Routine to convert an entity to text. ####################################### sub entityToText { my $theEntity=$_[0]; my $locMinLevel,$locMaxLevel,$locDummyHash; $locMinLevel=$curTextObj->{minLevel}; $locMaxLevel=$curTextObj->{maxLevel}; $locDummyHash=&stackClimb($locMinLevel,$locMaxLevel); $locMinLevel=$locDummyHash->{min}; $locMaxLevel=$locDummyHash->{max}; if ($theEntity eq "missinginsertmark") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { ""; } else { ""; } } elsif ($theEntity eq "missinginsert") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "Δ"; } else { ""; } } elsif ($theEntity eq "spacereserved") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "Ø"; } else { ""; } } elsif ($theEntity eq "os1") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "A>"; } else { ""; } } elsif ($theEntity eq "os2") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "B>"; } else { ""; } } elsif ($theEntity eq "os3") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "C>"; } else { ""; } } elsif ($theEntity eq "os4") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "D>"; } else { ""; } } elsif ($theEntity eq "os5") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "E>"; } else { ""; } } elsif ($theEntity eq "os6") { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "F>"; } else { ""; } } elsif ($theEntity eq "oe1" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<A"; } else { ""; } } elsif ($theEntity eq "oe2" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<B"; } else { ""; } } elsif ($theEntity eq "oe3" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<C"; } else { ""; } } elsif ($theEntity eq "oe4" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<D"; } else { ""; } } elsif ($theEntity eq "oe5" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<E"; } else { ""; } } elsif ($theEntity eq "oe6" ) { if (($operatingMode==$geneticsMode) && (($targetMinLevel <= $locMinLevel) && ($targetMaxLevel >= $locMinLevel))) { "<F"; } else { ""; } } elsif ($theEntity eq "illegibleword" ) { "A1;"; } elsif ($theEntity eq "illegibleletter" ) { "AB;"; } elsif ($theEntity eq "par" ) { "

     "; } elsif ($theEntity eq "emdash" ) { "—"; } elsif ($theEntity eq "ddagger" ) { "‡"; } elsif ($theEntity eq "a_um" ) { "ä"; } elsif ($theEntity eq "e_acute" ) { "é"; } elsif ($theEntity eq "e_grave" ) { "è"; } elsif ($theEntity eq "e_circ" ) { "ê"; } elsif ($theEntity eq "ae_lig" ) { "æ"; } elsif ($theEntity eq "op_smquo" ) { "“"; } elsif ($theEntity eq "cl_smquo" ) { "”"; } elsif ($theEntity eq "sect" ) { "§"; } elsif ($theEntity eq "l_sqbr" ) { "["; } elsif ($theEntity eq "r_sqbr" ) { "]"; } elsif ($theEntity eq "l_T_sig" ) { "K"; } elsif ($theEntity eq "r_T_sig" ) { "J"; } elsif ($theEntity eq "d_T_sig" ) { "I"; } elsif ($theEntity eq "u_T_sig" ) { "T"; } elsif ($theEntity eq "ampersand" ) { "&"; } else { # Other non-HTML entities are unsupported for now "&".$theEntity.";"; } } ############ # Tokenizer. ############ sub getNextToken { if (length $tokenStream==0) { # if there's no text buffered... $tokenStream=; if (!$tokenStream) { return 0; # EOF } #chop $tokenStream; } # Now we have some text to process; match a token. # These are usually right $currentTokenType=$printableToken; $currentTokenAttrs=""; if ($tokenStream =~ /^\n*$/) { $currentToken="\n"; $tokenStream=; while ($tokenStream =~ /^$/) { $currentToken .= "\n"; $tokenStream=; if (!$tokenStream) { return 0; # EOF } } # Set up new stream to be used after this long elsif... $newStream=$tokenStream; } elsif ($tokenStream =~ /^\s+/) { $currentToken=" "; $newStream=$'; # Whitespace does not terminate a text token; big savings in memory usage! } elsif ($tokenStream =~ /^[^<&]+/){ $currentToken=$&; $newStream=$'; } elsif ($tokenStream =~ /^&([^;]*);/) { # entity $currentToken=$1; $currentTokenType=$entityToken; $newStream=$'; } elsif ($tokenStream =~ /^<([^>]*)>/) { # tag $currentToken=$1; $currentToken =~ tr/a-z/A-Z/; $currentTokenType=$tagToken; $newStream=$'; } elsif ($tokenStream =~ /^<([^>]*)$/) { # partial tag $currentToken="$1 "; $currentTokenType=$tagToken; while ($tokenStream=) { # chop $tokenStream; if ($tokenStream =~ /^([^>]*)>/) { $currentToken .= $1; $tokenStream = $'; last; } else { $currentToken .= "$tokenStream "; } $currentToken =~ tr/a-z/A-Z/; $newStream=$'; } return 1; } else { print STDERR "Parser failed!\n"; print STDERR "Next tokens: ($tokenStream)\n"; return 0; } $tokenStream=$newStream; return 1; } ################################################################## # Printing subroutine. ################################################################## sub emit { local $theText=$_[0]; # Handle carriage returns. If the token starts with a carriage return, and # we haven't printed anything since the last CR token, it is not printed. if (($theText =~ /^\n/) && ($outputSinceLastCR==0)){ return; } if ($theText=~/^\n\n+$/) { print "

\n\n

"; } else { print $theText; } if ($theText=~/\n$/) { $outputSinceLastCR=0; } else { $outputSinceLastCR=1; } } ################# # isspace, ‡ la C ################# sub isspace { local $theText=$_[0]; my $c; my @chars=split(//,$theText); foreach $c (@chars) { if (($c ne " ") && ($c ne "\n")) { return 0; } } 1; } ##################################### # enum subroutine from the Camel Book ##################################### sub enum { local($_)=@_; local(@specs)=split(/,/); local($val); for (@specs) { if (/=/) { $val = eval $_; } else { eval $_ . ' = ++$val'; } } }