#February 5, 2001: Minor changes to layout of comments
# Fixed a bug that caused duplicate typos in plaintext
# Incorporated level 8T
# Still named reduce8.pl; older ones have descriptive names added
#January 18, 2001: Changed line numbers to use SPAN tags.
#November 28, 2000: now turns xml entity "&ersand;" into html "&".
#November 23, 2000: this version renders the XML entities I've changed in the
# massagex
#November 21, 2000: Inoutdelete now fixed.
#November 15, 2000: now reduce71.pl: I've changed the paragraph entity to
# "
", which works and
# indents
#November 9, 2000: now called reduce7.pl, Nathan and I are in synch on this.
# All entities are Unicode where applicable.
# October 30, 2000: reduce61.pl: Inoutdelete as ⊂ ...5 (display ?? in
# Netscape 5).
# also has paragraph entity "par" which it prints as
#
#November 3, 2000: illegibleword and illegibleletter also unicoded
# and also insertmarkmissing and insertmissing: should be
# all of them.
#June 2, 2000: Nathan sends this as newReduce5.pl, fixing italics.
# I modify it from the "short" form to the "letov" form, and
# change other displays.
#June 5, 2000: now with code fixing dpunc too
#
# reduce*.pl - current revision of the reduction script
#
#
# Some enumerated types and related chunks of data
#
# Names for operating modes
&enum(<<'EOL');
$geneticsMode,$plaintextMode
EOL
# Names for the possible output states
&enum(<<'EOL');
$printingState,$nonprintingState,$tnState
EOL
# The possible flavors of deletions
&enum(<<'EOL');
$implicitDeletion,$explicitDeletion
EOL
# Token types
&enum(<<'EOL');
$tagToken,$printableToken,$entityToken
EOL
# Numeric identifiers for the levels...
&enum(<<'EOL');
$lev0=0,$lev1,$lev1plus,$lev2,$lev3,$lev3plus,$lev4,$lev4plus,
$lev5,$lev5prime,$lev5plus,$lev5plusprime,$lev5ddag,$lev5tdag,
$lev6,$lev7,$lev7plus,$lev8,$lev8T,$lev8plus,$lev8plusT,$lev9,$lev9T,
$lev10,$lev10T,$lev11,$lev11T,$lev12star,$lev13
EOL
# ...and the associative array for their names
%levelCodes = (
"0",$lev0,
"1",$lev1,
"1+",$lev1plus,
"2",$lev2,
"3",$lev3,
"3+",$lev3plus,
"4",$lev4,
"4+",$lev4plus,
"5",$lev5,
"5'",$lev5prime,
"5+",$lev5plus,
"5+'",$lev5plusprime,
"5''",$lev5ddag,
"5'''",$lev5tdag,
"6",$lev6,
"7",$lev7,
"7+",$lev7plus,
"8",$lev8,
"8T",$lev8T,
"8+",$lev8plus,
"8+T",$lev8plusT,
"9",$lev9,
"9T",$lev9T,
"10",$lev10,
"10T",$lev10T,
"11",$lev11,
"11T",$lev11T,
"12*",$lev12star,
"13",$lev13
);
# Irritatingly, we need the inverse association as well. This is a
# major workpoint.
@levelNames = (
"0",
"1",
"1+",
"2",
"3",
"3+",
"4",
"4+",
"5",
"5'",
"5+",
"5+'",
"5''",
"5'''",
"6",
"7",
"7+",
"8",
"8T",
"8+",
"8+T",
"9",
"9T",
"10",
"10T",
"11",
"11T",
"12*",
"13"
);
#
# Variable initializations
#
##############################################################################
# Configurable parameters; control the operation of the program #
# by changing these values. The values for $operatingMode are #
# $geneticsMode and $plaintextMode. #
# #
# #
# These are default values; they can be overriden from the #
# command line on a PC or the options dialog on a Mac. #
# #
# Usage: newReduce.pl -[gp] -m -M [-s] [-d] [-t] [-T] #
# [-l ] "input file" #
# #
# -g for genetics, -p for plaintext #
# is the minimum level, the maximum level #
# -s to run silently #
# -d to show double punctuation #
# -t to treat DM/IM revisions as TDs #
# -T to show typos #
# -l to specify the line number interval (0 means no line numbers) #
# #
##############################################################################
$targetMinLevel=$lev0;
$targetMaxLevel=$lev13;
$operatingMode=$geneticsMode;
$runSilent=0;
$lineNumInterval=5;
$showDoublePunc=0;
$treatDMAsTD=0;
$showTypos=0;
if ($^O eq "MacOS") {
MacPerl::LoadExternals("newReduce.rsrc");
#MacPerl::LoadExternals("fw-XCMD");
$configString=&Options();
#use Mac::AppleEvents;
#my $optionsOK;
#$AppleEvent{'McPL','DATA'} = \&receiveOptions;
#while (!$optionsOK) {sleep(1)}
} else {
$configString=join(' ',@ARGV);
}
if ($configString =~ /-p/) {
if ($configString =~ /-g/) {
print STDERR "Conflicting arguments -p and -g\n";
exit 1;
}
$operatingMode=$plaintextMode;
}
if ($configString =~ /-m (\S+)/) {
$targetMinLevel=$levelCodes{$1};
}
if ($configString =~ /-M (\S+)/) {
$targetMaxLevel=$levelCodes{$1};
}
if ($configString =~ /-s/) {
$runSilent=1;
}
if ($configString =~ /-t/) {
$treatDMAsTD=1;
}
if ($configString =~ /-T/) {
if ($operatingMode==$geneticsMode) {
$showTypos=1; # showTypos has no effect in plaintext
}
}
if ($configString =~ /-d/) {
$showDoublePunc=1;
}
if ($configString =~ /-l ([0-9]+)/) {
$lineNumInterval=$1;
}
if ($configString =~ /"(.*)"$/) {
$infileName=$1;
} elsif ($configString =~ /(\S+)$/) {
$infileName=$1;
} else {
print STDERR "Couldn't find infile name\n";
exit 1;
}
################################
# End of configurable parameters
################################
$tagNestingDepth=0;
$overlayDepth=0;
$tokenIndex=0;
$tagIndex=0;
$tnIndex=1; # because TN 0 looks funny
$typoIsPending=0;
open(INFILE,$infileName) || die "Couldn't open input file $infileName";
if ($infileName=~/([^\.]*)\./) {
$outfileName=$1.".html";
} else {
$outfileName=$infileName.".html";
}
#open(OUTFILE,">$outfileName") || die "Couldn't open output file $outfileName";
print "\n";
print "";
print "Output from $infileName (";
if ($operatingMode==$geneticsMode) {
print "genetics, minimum level $levelNames[$targetMinLevel], ";
print "maximum level $levelNames[$targetMaxLevel])";
} else {
print "plaintext, level $levelNames[$targetMinLevel])";
}
print "\n";
print "\n\n
\n";
$textRoot = {
tag => "",
parent => 0,
childIndex => 0,
numChildren => 0,
startText => "",
selfText => "",
endText => "",
minLevel => $lev0,
maxLevel => $lev13,
modifierMaxLevel => $lev13,
dontPrintChildren => 0,
};
$curTextObj=$textRoot;
$stackDepth=0;
# It doesn't make sense to recover plaintext for more than one
# level.
if ($operatingMode != $geneticsMode) {
$targetMaxLevel=$targetMinLevel;
}
# main loop
while (($returnValue=getNextToken())==1) {
if ($currentTokenType==$printableToken) {
$curTextObj->{childList}[$curTextObj->{numChildren}++]
=freshTextObject($currentToken);
} elsif ($currentTokenType==$entityToken) {
$curTextObj->{childList}[$curTextObj->{numChildren}++]
=freshTextObject(entityToText($currentToken));
} elsif ($currentTokenType==$tagToken) {
if ($currentToken !~ /^\//) { # open tag
$stackDepth++;
$curTextObj->{childList}[$curTextObj->{numChildren}++]
=newNestedTextUnit($currentToken);
$curTextObj=$curTextObj->{childList}
[$curTextObj->{numChildren}-1]; # down the stack
} else { #close tag
$stackDepth--;
$curTextObj=$curTextObj->{parent}; # up the stack
}
} else {
print STDERR "Tokenizer bug! Attempting to continue...\n";
}
}
printTextObj($curTextObj,0,$curTextObj->{numChildren},0);
# Now print the endnotes
print "
\n\n";
}
print "\n\n\n";
close INFILE;
#close OUTFILE;
exit 0; # to recover Bill's DOS prompt
#
# And there ends the main flow of the program; everything from this
# point forward is a subroutine.
#
###############################
# Options event handler for Mac
###############################
sub receiveOptions {
if ($^O ne "MacOS") {
return;
}
my($event, $reply, $desc, $n) = @_;
$desc = AEGetParamDesc($event, keyDirectObject());
$n = AEPrint($desc);
AEPutParam($reply, keyDirectObject(), typeChar(),
sprintf('%.f + %.f = %.f', $n, $n, $n+$n)
);
print AEPrint($event), "\n";
print AEPrint($reply), "\n";
AEDisposeDesc($desc);
$optionsOK = 1;
0;
}
######################
# Print a text object.
######################
sub printTextObj {
my $theObj=$_[0];
my $appendToTN=$_[1];
my $returnValue=$_[2];
my $unconditionalOnly=$_[3];
my $i,$locNumChildren,$locChild;
my $locChildIndex;
my $locChildText,$locFirstWord;
my $locContainedTypo,$locContainedDeletion;
my $locMatchLength;
if ($theObj->{dontPrintChildren}==1) {
return $returnValue;
}
$locNumChildren=$theObj->{numChildren};
if ($theObj->{tag} eq "TN") {
####################################################
# The following condition determines if the interval
# [$targetMinLevel,$targetMaxLevel] and the interval
# [$theObj->{minLevel},$theObj->{maxLevel}] overlap.
####################################################
if (($targetMaxLevel >= $theObj->{minLevel})
&& ($targetMinLevel <= $theObj->{maxLevel}))
{
emit("TN$tnIndex");
# Append TN contents to the array
for ($i=0; $i < $locNumChildren; $i++) {
$locChild=$theObj->{childList}[$i];
$locNumChildren=printTextObj($locChild,1,$locNumChildren,0);
}
$tnIndex++;
}
} else {
###########
# Here too.
###########
if (($targetMaxLevel >= $theObj->{minLevel})
&& ($targetMinLevel <= $theObj->{maxLevel}))
{
if ($appendToTN==1) {
$tnText[$tnIndex].=$theObj->{startText};
if ($locNumChildren==0) {
$tnText[$tnIndex].=$theObj->{selfText};
} else {
for ($i=0; $i < $locNumChildren; $i++) {
$locChild=$theObj->{childList}[$i];
$locNumChildren=printTextObj($locChild,1,$locNumChildren,0);
}
}
$tnText[$tnIndex].=$theObj->{endText};
} else {
# Fall out immediately in the peculiar case that this revision
# is a typo and typos are being suppressed
if (($operatingMode==$plaintextMode) || (!$showTypos)
&& ($theObj->{tag} eq "REVISION")) {
for ($i=0; $i<$locNumChildren; $i++) {
$locChild=$theObj->{childList}[$i];
if ($locChild->{tag} eq "TYPO") {
return $returnValue;
}
}
}
if ((($operatingMode==$geneticsMode) && ($targetMinLevel <=
$theObj->{modifierMaxLevel}))
|| ($theObj->{unconditionalText}==1)) {
emit($theObj->{startText});
}
if ($locNumChildren==0) {
emit($theObj->{selfText});
} else {
for ($i=0; $i < $locNumChildren; $i++) {
$locChild=$theObj->{childList}[$i];
$locNumChildren=printTextObj($locChild,0,$locNumChildren,0);
}
}
if ((($operatingMode==$geneticsMode) && ($targetMinLevel <=
$theObj->{modifierMaxLevel}))
|| ($theObj->{unconditionalText}==1)) {
emit($theObj->{endText});
}
}
# This "else" is counterintuitive. We print the children even if
# the parent failed the level check; this is for things like line
# numbers whose levels are not related to those of their parents.
} else {
for ($i=0; $i < $locNumChildren; $i++) {
$locChild=$theObj->{childList}[$i];
$locNumChildren=printTextObj($locChild,0,$locNumChildren,1);
}
}
}
$returnValue;
}
sub isFirstNonspaceChild {
my $locThePtr=$_[0];
my $locLoopVar;
$locLoopVar=$locThePtr->{childIndex}-1;
while ($locLoopVar >= 0) {
$locIsSpace=&isspace($locThePtr->{parent}->{childList}[$locLoopVar]->{selfText});
if (($locThePtr->{parent}->{childList}[$locLoopVar]->{tag} ne "")
|| ($locIsSpace==0)){
last;
}
$locLoopVar--;
}
if ($locLoopVar < 0) {
1;
} else {
0;
}
}
#################################################################
# Climb through the hierarchy to determine the appropriate levels
# to inherit from containing objects.
#################################################################
sub stackClimb {
my $locMinLevel=$_[0],$locMaxLevel=$_[1];
my $locTempPtr=$curTextObj;
my $locNothing=0;
my $locLoopVar=0;
my $locIsSpace=0;
# All this code is to handle the exceptional case of deletions.
# It walks up the hierarchy looking for a containing deletion;
# if it finds one, it either inherits its minimum level from
# the deletion itself, or (if the deletion is the first element
# in a revision) from the parent of the containing revision.
while (($locTempPtr != 0) && ($locTempPtr->{tag} ne "DELETION")) {
$locTempPtr=$locTempPtr->{parent};
}
$locSavedMinLevel=$locTempPtr->{minLevel};
$locSavedMaxLevel=$locTempPtr->{maxLevel};
if ($locTempPtr->{tag} eq "DELETION") {
# Some tags have no level semantics and shouldn't interfere with the
# climbing process. That is, in ... the
# deletion is the first nonblank child in spite of the containing .
while ((($locTempPtr->{parent}->{tag} eq "TYPO")
|| ($locTempPtr->{parent}->{tag} eq "FIRST")
|| ($locTempPtr->{parent}->{tag} eq "DOUBTFUL")
|| ($locTempPtr->{parent}->{tag} eq "CORRECTION")
|| ($locTempPtr->{parent}->{tag} eq "OVERLAY"))
&& (&isFirstNonspaceChild($locTempPtr) != 0)) {
$locTempPtr=$locTempPtr->{parent};
}
if ($locTempPtr->{parent}->{tag} eq "REVISION") {
if (&isFirstNonspaceChild($locTempPtr) != 0) {
$locMinLevel=$locTempPtr->{parent}->{parent}->{minLevel};
} else {
$locMinLevel=$locTempPtr->{parent}->{minLevel};
}
}
# The max and min levels of the parent deletion should be the same.
$locMaxLevel=$locSavedMinLevel;
if ($operatingMode==$plaintextMode) {
$locMaxLevel--;
}
}
# Revisions also behave strangely. If we're inside a revision,
# the min level is inherited from the revision, and the max
# level is either $lev13 or inherited from the first containing
# deletion.
# Here "inside" means "immediately inside" (TBR). If there's an
# intervening tag (, , or whatever), we should inherit
# the correct levels from that.
$revPtr=$curTextObj;
if ($revPtr->{tag} eq "REVISION") {
$locTempPtr=$revPtr;
while (($locTempPtr != 0)
&& ($locTempPtr->{tag} ne "DELETION")) {
$locTempPtr=$locTempPtr->{parent};
}
if ($locTempPtr->{tag} eq "DELETION") {
$locMaxLevel=$locTempPtr->{maxLevel};
} else {
$locMaxLevel=$lev13;
}
}
{
min=>$locMinLevel,
max=>$locMaxLevel,
};
}
##############################
# Create a new textual object.
##############################
sub freshTextObject {
my $theText=$_[0];
my $locMinLevel;
my $locMaxLevel;
my $locTempPtr=$curTextObj;
my $locDummyHash;
# Special case: If the immediate parent is a LINENUM tag, this text is a
# line number and is processed accordingly.
if ($curTextObj->{tag} eq "LINENUM") {
if ($lineNumInterval==0) {
$theText="";
} else {
if ($theText =~ /([0-9]+)$/) {
if ($1 % $lineNumInterval != 0) {
$theText="";
}
}
}
}
# Another special case: If the immediate parent is a DPUNC, theText
# is left blank unless double punctuation is on.
if ($curTextObj->{tag} eq "DPUNC") {
if ($showDoublePunc==0) {
$theText="";
}
}
$locMinLevel=$curTextObj->{minLevel};
$locMaxLevel=$curTextObj->{maxLevel};
$locDummyHash=&stackClimb($locMinLevel,$locMaxLevel);
$locMinLevel=$locDummyHash->{min};
$locMaxLevel=$locDummyHash->{max};
{
tag => "",
parent => $curTextObj,
childIndex => $curTextObj->{numChildren},
numChildren => 0,
childList => [],
startText => "",
selfText => $theText,
endText => "",
unconditionalText => 0,
minLevel => $locMinLevel,
maxLevel => $locMaxLevel,
modifierMaxLevel => $locMaxLevel,
dontPrintChildren => 0,
};
}
###########################################################
# Create a new text unit nested down a level. The argument
# is an open tag, including any attributes.
###########################################################
sub newNestedTextUnit {
my $theText=$_[0];
my $theTag,$theAttrs;
my $locStartText="",$locEndText="";
my $locMinLevel,$locMaxLevel,$locModMaxLevel;
my $locTypo=0,$locUnconditionalText=0;
my $locDummyHash;
my $locDontPrintChildren=0;
if ($runSilent==0) {
### $runsilent is probably a command line option to disable verbose output.
### manually disable here to allow data to be printed to STDOUT instead of
### a file.
# print "Tag $theText\n";
}
if ($theText =~ /^(\S+)\s+(.+)$/) { # If there are attributes...
$theTag=$1;
$theAttrs=$2;
} else {
$theTag=$theText;
}
$locMinLevel=$curTextObj->{minLevel};
$locMaxLevel=$curTextObj->{maxLevel}; # usually right
if ($theTag eq "REVISION") {
if ($theAttrs =~ /LEVEL="([^"]*)"/) {
$locMinLevel=$levelCodes{$1};
} else {
print STDERR "Revision without a level!\n";
return;
}
$locStartText=" [$levelNames[$locMinLevel]";
if ($theAttrs =~ /TD="YES"/) {
$locStartText .= " TD";
}
if ($theAttrs =~ /IM="YES"/) {
if ($treatDMAsTD==0) {
$locStartText .= " IM";
} else {
$locStartText .= " TD";
}
}
if ($theAttrs =~ /CONTINUATION="YES"/) {
$locStartText .= " -";
}
$locStartText .= " ";
$locEndText=" ";
if ($theAttrs =~ /CONTINUED="YES"/) {
$locEndText .= "- ";
}
$locEndText .= $levelNames[$locMinLevel]."] ";
if ($curTextObj->{parent}==0) {
$locMaxLevel=$lev13;
}
$locModMaxLevel=$locMinLevel;
} elsif ($theTag eq "DELETION") {
# Typos need to be handled specially here
if (($showTypos) && ($curTextObj->{tag} eq "TYPO")) {
# Not sure what happens here
} else {
# For a deletion, both min and max are the min of the parent revision.
# This assures that the start and end text will only appear
# if the level at which the deletion took place is being
# recovered.
$locTempPtr=$curTextObj;
while (($locTempPtr != 0) && ($locTempPtr->{tag} ne "REVISION")) {
$locTempPtr=$locTempPtr->{parent};
}
$locMinLevel=$locTempPtr->{minLevel};
$locMaxLevel=$locTempPtr->{minLevel};
$locModMaxLevel=$locMaxLevel;
}
$locEndText="";
if ($theAttrs =~ /TYPE="IMPLICIT"/) {
$locStartText="<";
$locEndText .= ">";
}
$locStartText .= "";
} elsif ($theTag eq "FIRST") {
if ($curTextObj->{tag} ne "DELETION") {
$prevLevel=$curTextObj->{minLevel};
} else {
$outerContainer=$curTextObj;
while (($outerContainer != 0)
&& ($outerContainer->{tag} ne "REVISION")) {
$outerContainer=$outerContainer->{parent};
}
if ($outerContainer != 0) {
$prevLevel=$outerContainer->{parent}->{minLevel};
} else {
$prevLevel=$lev0;
}
$locMaxLevel=$curTextObj->{maxLevel};
$locModMaxLevel=$locMaxLevel;
}
$locMinLevel=$lev0;
if ($targetMinLevel==0) {
$locStartText="";
$locEndText="";
} else {
$locStartText="";
$locEndText="";
}
} elsif ($theTag eq "INOUTDEL") {
if ($operatingMode==$geneticsMode) {
$locMinLevel=$curTextObj->{minLevel};
$locMaxLevel=$curTextObj->{minLevel};#was maxlevel, fixed November 21, 2000
$locModMaxLevel=$locMaxLevel;
$locStartText="⊂"; #was {}
$locEndText="⊃";
} else {
$locMinLevel=$lev13+1;
$locMaxLevel=$lev13+1;
}
} elsif ($theTag eq "OVERLAY") {
if ($theAttrs =~ /DEPTH=([0-9]*)/) {
$overlayDepth=$1;
} else {
print "Warning: Overlay without depth\n";
}
if ($overlayDepth % 2 == 1)
{
$overlayStart=">"x$overlayDepth;
$overlayEnd="<"x$overlayDepth;
} else {
$overlayStart="»"x($overlayDepth/2);
$overlayEnd="«"x($overlayDepth/2);
}
$locStartText="".
$overlayStart."";
$locEndText="".
$overlayEnd."";
$locDummyHash=&stackClimb($locMinLevel,$locMaxLevel);
$locMinLevel=$locDummyHash->{min};
$locMaxLevel=$locDummyHash->{max};
$locModMaxLevel=$locMaxLevel;
} elsif ($theTag eq "LINENUM") {
$locUnconditionalText=1;
$locStartText="";
$locEndText="";
$locMinLevel=$lev0;
$locMaxLevel=$lev13;
$locModMaxLevel=$locMaxLevel;
} elsif ($theTag eq "TYPO") {
$locStartText = " TYPO";
$locEndText = " TYPO] ";
if ($showTypos==0) {
$locMinLevel=$lev13+1;
$locMaxLevel=$lev13+1;
$locDontPrintChildren=1;
}
} elsif ($theTag eq "CORRECTION") {
# if typos are not being suppressed, set the correction to never show
if (($operatingMode==$geneticsMode) && ($showTypos==1)) {
$locMinLevel=$lev13+1;
$locMaxLevel=$lev13+1;
$locDontPrintChildren=1;
}
} elsif ($theTag eq "DPUNC") {
# do nothing
} elsif ($theTag eq "TN") {
# just check for FIRSTLEVEL and LASTLEVEL attrs
if ($theAttrs =~ /FIRSTLEVEL="([^"]*)"/) {
$locMinLevel=$levelCodes{$1};
}
if ($theAttrs =~ /LASTLEVEL="([^"]*)"/) {
$locMaxLevel=$levelCodes{$1};
$locModMaxLevel=$locMaxLevel;
}
} elsif ($theTag eq "DOUBTFUL") {
$locUnconditionalText=1;
$locStartText="?";
$locEndText="?";
$locDummyHash=&stackClimb($locMinLevel,$locMaxLevel);
$locMinLevel=$locDummyHash->{min};
$locMaxLevel=$locDummyHash->{max};
$locModMaxLevel=$locMaxLevel;
} else { # Assume it's an HTML tag
$locUnconditionalText=1;
$locStartText="<".$theText.">";
$locEndText="".$theTag.">";
$locDummyHash=&stackClimb($locMinLevel,$locMaxLevel);
$locMinLevel=$locDummyHash->{min};
$locMaxLevel=$locDummyHash->{max};
$locModMaxLevel=$locMaxLevel;
}
{
tag => $theTag,
parent => $curTextObj,
childIndex => $curTextObj->{numChildren},
numChildren => 0,
childList => [],
startText => $locStartText,
selfText => "",
endText => $locEndText,
unconditionalText => $locUnconditionalText,
minLevel => $locMinLevel,
maxLevel => $locMaxLevel,
modifierMaxLevel => $locModMaxLevel,
dontPrintChildren => $locDontPrintChildren,
};
}
#######################################
# Routine to convert an entity to text.
#######################################
sub entityToText {
my $theEntity=$_[0];
my $locMinLevel,$locMaxLevel,$locDummyHash;
$locMinLevel=$curTextObj->{minLevel};
$locMaxLevel=$curTextObj->{maxLevel};
$locDummyHash=&stackClimb($locMinLevel,$locMaxLevel);
$locMinLevel=$locDummyHash->{min};
$locMaxLevel=$locDummyHash->{max};
if ($theEntity eq "missinginsertmark") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"◊";
} else {
"";
}
} elsif ($theEntity eq "missinginsert") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"Δ";
} else {
"";
}
} elsif ($theEntity eq "spacereserved") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"Ø";
} else {
"";
}
} elsif ($theEntity eq "os1") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"A>";
} else {
"";
}
} elsif ($theEntity eq "os2") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"B>";
} else {
"";
}
} elsif ($theEntity eq "os3") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"C>";
} else {
"";
}
} elsif ($theEntity eq "os4") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"D>";
} else {
"";
}
} elsif ($theEntity eq "os5") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"E>";
} else {
"";
}
} elsif ($theEntity eq "os6") {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"F>";
} else {
"";
}
} elsif ($theEntity eq "oe1" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<A";
} else {
"";
}
} elsif ($theEntity eq "oe2" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<B";
} else {
"";
}
} elsif ($theEntity eq "oe3" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<C";
} else {
"";
}
} elsif ($theEntity eq "oe4" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<D";
} else {
"";
}
} elsif ($theEntity eq "oe5" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<E";
} else {
"";
}
} elsif ($theEntity eq "oe6" ) {
if (($operatingMode==$geneticsMode)
&& (($targetMinLevel <= $locMinLevel)
&& ($targetMaxLevel >= $locMinLevel))) {
"<F";
} else {
"";
}
} elsif ($theEntity eq "illegibleword" ) {
"A1;";
} elsif ($theEntity eq "illegibleletter" ) {
"AB;";
} elsif ($theEntity eq "par" ) {
"
";
} elsif ($theEntity eq "emdash" ) {
"—";
} elsif ($theEntity eq "ddagger" ) {
"";
} elsif ($theEntity eq "a_um" ) {
"ä";
} elsif ($theEntity eq "e_acute" ) {
"é";
} elsif ($theEntity eq "e_grave" ) {
"è";
} elsif ($theEntity eq "e_circ" ) {
"ê";
} elsif ($theEntity eq "ae_lig" ) {
"æ";
} elsif ($theEntity eq "op_smquo" ) {
"";
} elsif ($theEntity eq "cl_smquo" ) {
"";
} elsif ($theEntity eq "sect" ) {
"§";
} elsif ($theEntity eq "l_sqbr" ) {
"[";
} elsif ($theEntity eq "r_sqbr" ) {
"]";
} elsif ($theEntity eq "l_T_sig" ) {
"K";
} elsif ($theEntity eq "r_T_sig" ) {
"J";
} elsif ($theEntity eq "d_T_sig" ) {
"I";
} elsif ($theEntity eq "u_T_sig" ) {
"T";
} elsif ($theEntity eq "ampersand" ) {
"&";
} else {
# Other non-HTML entities are unsupported for now
"&".$theEntity.";";
}
}
############
# Tokenizer.
############
sub getNextToken {
if (length $tokenStream==0) { # if there's no text buffered...
$tokenStream=;
if (!$tokenStream) {
return 0; # EOF
}
#chop $tokenStream;
}
# Now we have some text to process; match a token.
# These are usually right
$currentTokenType=$printableToken;
$currentTokenAttrs="";
if ($tokenStream =~ /^\n*$/) {
$currentToken="\n";
$tokenStream=;
while ($tokenStream =~ /^$/) {
$currentToken .= "\n";
$tokenStream=;
if (!$tokenStream) {
return 0; # EOF
}
}
# Set up new stream to be used after this long elsif...
$newStream=$tokenStream;
} elsif ($tokenStream =~ /^\s+/) {
$currentToken=" ";
$newStream=$';
# Whitespace does not terminate a text token; big savings in memory usage!
} elsif ($tokenStream =~ /^[^<&]+/){
$currentToken=$&;
$newStream=$';
} elsif ($tokenStream =~ /^&([^;]*);/) { # entity
$currentToken=$1;
$currentTokenType=$entityToken;
$newStream=$';
} elsif ($tokenStream =~ /^<([^>]*)>/) { # tag
$currentToken=$1;
$currentToken =~ tr/a-z/A-Z/;
$currentTokenType=$tagToken;
$newStream=$';
} elsif ($tokenStream =~ /^<([^>]*)$/) { # partial tag
$currentToken="$1 ";
$currentTokenType=$tagToken;
while ($tokenStream=) {
# chop $tokenStream;
if ($tokenStream =~ /^([^>]*)>/) {
$currentToken .= $1;
$tokenStream = $';
last;
} else {
$currentToken .= "$tokenStream ";
}
$currentToken =~ tr/a-z/A-Z/;
$newStream=$';
}
return 1;
} else {
print STDERR "Parser failed!\n";
print STDERR "Next tokens: ($tokenStream)\n";
return 0;
}
$tokenStream=$newStream;
return 1;
}
##################################################################
# Printing subroutine.
##################################################################
sub emit {
local $theText=$_[0];
# Handle carriage returns. If the token starts with a carriage return, and
# we haven't printed anything since the last CR token, it is not printed.
if (($theText =~ /^\n/) && ($outputSinceLastCR==0)){
return;
}
if ($theText=~/^\n\n+$/) {
print "
\n\n
";
} else {
print $theText;
}
if ($theText=~/\n$/) {
$outputSinceLastCR=0;
} else {
$outputSinceLastCR=1;
}
}
#################
# isspace, ‡ la C
#################
sub isspace {
local $theText=$_[0];
my $c;
my @chars=split(//,$theText);
foreach $c (@chars) {
if (($c ne " ") && ($c ne "\n")) {
return 0;
}
}
1;
}
#####################################
# enum subroutine from the Camel Book
#####################################
sub enum {
local($_)=@_;
local(@specs)=split(/,/);
local($val);
for (@specs) {
if (/=/) {
$val = eval $_;
} else {
eval $_ . ' = ++$val';
}
}
}