) {
## accent handlers
## these are quite definite about their replacements and so should
## come before the more generic operations later
## all braces in the latex constructs are optional
## info on the html codes can be found at http://www.w3.org/TR/REC-html40/
## \\? is to handle latex \'\i which you do to put acute on i without dot
s/\\ \` (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
/&$1grave;/gx; # grave accent
s/ \\ \' (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
/&$1acute;/gx; # 'acute accent
s/ \\ \^ (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
/&$1circ;/gx; # circumflex
s/ \\ \" (?: \{ )? \\? ([aeiouyAEIOUY]) (?: \} )?
/&$1uml;/gx; # "umlaut
s/ \\ \~ (?: \{ )? \\? ([anoANO]) (?: \} )?
/&$1tilde;/gx; # tilde
s/ \\ c (?: \{ )? ([cC]) (?: \} )?
/&$1cedil;/gx; # cedilla
s/ \\ [v\'] (?: \{ )? ([cC]) (?: \} )?
/$1/gx; # no html code for czech hook
s/ \{? \\[s\"]s \}?
/ß/gx; # german Scharf-S
## several weird symbols
s/ \\copyright
/©/gx;
s/ \\pounds
/£/gx;
## weirder symbols
s/ \\ (ae|AE)
/&$1lig;/gx;
s/ \\ (o|O)
/&$1slash;/gx;
s/ \\ss
/ß/gx;
## greek letters, case insensitive matching, but upper case in latex and
## html have the first letter of the english word capitalized
s/ \\ (?: var )? (alpha|beta|gamma|delta|epsilon|theta|lambda|pi|rho|sigma|omega)
/&$1;/gxi;
## remove any \/ space-increasing symbols
s+ ([^\\]) \\ \/
+$1+gx;
s+ \\ \ + +gx;
## deal with \cite stuff, change it to a link to a record in the same html file
s+ ([^\\]) \\cite\{ (.*?) \}
+$1$2+xg;
## These rules are to deal with my (DFK) macros
s/\\ie/i.e./g;
s/\\eg/e.g./g;
s/\\etc/etc./g;
s+\\vs\\+vs.+g;
s/\\usec/usec/g;
s/\\mbox //g;
s/\\par / /g;
s/\\par$/
/g;
s/\\\&/\&/g; # ampersand
s/-{2,3}/-/g; # multiple dashes
# a few rules are needed to compensate for BibTeX's way of splitting
# long words over two lines by sticking a % (TeX comment character) at
# the end of the line. This works when one word (usually a URL) is
# split over more than one line.
## if we have an unescaped % at the end of the line, remove it and the newline and
## join the next line on
## example straight out of the camel book, pg 204. amazing
if ( s/ ([^\\]) \% \n$ /$1/x and $nextline = ) {
$_ .= $nextline;
redo; # back to the top
}
# hyphenation characters should be removed
s+\\-++g;
# tildes -
# tilde not preceded by \ or / is a nbsp
# \~{} is ~ (likely in a URL)
# all other tildes left alone, notably /~ (URL)
s+ ([^\\/]) ~ +$1 +xg; # normal standalone tilde - nbsp
s/ \\~ \{\} /~/xg; # \~{} to ~ - do before removing braces below
## --------------------------------------------------------
## deal with em and tt and bf text surrounded by braces
## deal with \emph{..}, \texttt{...} and \textbf{..}
## the fancy groupings around the em etc are because in html bold font is not 'bf'
## but just b and italic is i not 'it' so we have to pick out only a part of those
## latex tags
## final |\w+ is to skip over \rm or any other commands we don't handle
## also removes braces with no command
while ( /(text(b)f|text(tt)|(em)ph)?([\{\}])/ ) {
if (defined $1) {
my $cmd = "$1";
my $fmt = $2 || $3 || $4;
s/\\$cmd\{/<$fmt>/;
push ( @formatsToClose, $fmt )
} elsif ($5 eq '{') {
## beginning of format
s/ \{ (?: \\ (?: (em)|(b)f|(tt)|(i)t|\w+ ) )? \s*
/ ( $format = $1 or $2 or $3 or $4 ) ? "<$format>" : '' /ex ;
## push the format to be closed onto the stack (may be nothing)
push ( @formatsToClose, $format )
} else {
## pop format to close from stack
s+ \s*\}
+ ( $format = pop (@formatsToClose) ) ? "$format>" : '' +ex ;
}
}
## --------------------------------------------------------
##retrieve symbols escaped by backslashes
my $escapedChars = quotemeta ( '#$%&_{}' );
s/ ([^\\]) \\ ([$escapedChars])
/$1$2/gxo;
print OUT $_;
}
## end-of-run epilogue: runs once, after the line-processing loop above has finished
## emit a bare newline on the currently selected output handle
## (presumably STDOUT, to finish off earlier status output -- confirm)
print "\n";
## report the output location as recorded in $Opts{outfile}
print "\noutput is in $Opts{outfile}\n";
## delete the $tmp.aux / $tmp.bbl / $tmp.blg files (see sub cleanup below)
cleanup();
## killSuffix ( $file )
##   Return $file with its final suffix removed.
##   A suffix is the last '.' in the file name together with everything after
##   it, so only the last extension goes ("a.tar.gz" keeps "a.tar").
##   The result is the directory part reported by fileparse() followed by the
##   stripped name; for a bare file name fileparse() supplies the current
##   directory prefix (e.g. "./" on Unix).
##   All working variables are lexical, so calling this sub no longer
##   overwrites the package globals $file, $name and $path.
sub killSuffix {
    my ($file) = @_;

    # the pattern indicates what a suffix looks like
    my ( $name, $path ) = fileparse( $file, '\.[^.]*$' );

    return $path . $name;
}
## cleanup ()
##   Delete the three files named "$tmp.aux", "$tmp.bbl" and "$tmp.blg",
##   where $tmp is a variable set elsewhere in this script.
##   The names are built explicitly instead of through glob(): glob() splits
##   its argument on whitespace and expands shell metacharacters, so a $tmp
##   containing a space or a wildcard would make it target the wrong files.
##   Deletion is best effort: the result of unlink() is deliberately ignored,
##   since any of the three files may not exist.
sub cleanup {
    unlink( map { "${tmp}.$_" } qw(aux bbl blg) );
}