package LaTeX::ToUnicode::Tables; BEGIN { $LaTeX::ToUnicode::Tables::VERSION = '0.55'; } use strict; use warnings; #ABSTRACT: Character tables for LaTeX::ToUnicode use utf8; # just for the german support # Technically not all of these are ligatures, but close enough. # Order is important, so has to be a list, not a hash. # our @LIGATURES = ( "---" => '\x{2014}', # em dash "--" => '\x{2013}', # en dash "!`" => '\x{00A1}', # inverted exclam "?`" => '\x{00A1}', # inverted question "``" => '\x{201c}', # left double "''" => '\x{201d}', # right double "`" => '\x{2018}', # left single "'" => '\x{2019}', # right single ); # test text: em---dash, en--dash, exc!`am, quest?`ion, ``ld, rd'', `ls, rs'. # # Some additional ligatures supported in T1 encoding, but we won't (from # tex-text.map): # U+002C U+002C <> U+201E ; ,, -> DOUBLE LOW-9 QUOTATION MARK # U+003C U+003C <> U+00AB ; << -> LEFT POINTING GUILLEMET # U+003E U+003E <> U+00BB ; >> -> RIGHT POINTING GUILLEMET # for {\MARKUP(shape) ...} and \textMARKUP{...}; although not all # command names are defined in LaTeX for all markups, we translate them # anyway. Also, LaTeX has more font axes not included here: md, ulc, sw, # ssc, etc. See ltfntcmd.dtx and ltfssaxes.dtx if we ever want to try # for completeness. # our %MARKUPS = ( 'bf' => 'b', 'cal' => '', 'em' => 'em', 'french' => '', # polyglossia 'it' => 'i', 'rm' => '', 'sc' => '', # qqq should uppercasify 'sf' => '', 'sl' => 'i', 'small' => '', 'subscript' => 'sub', 'superscript' => 'sup', 'tt' => 'tt', ); # More commands taking arguments that we want to handle. # our %ARGUMENT_COMMANDS = ( 'emph' => ['\textem{', '}'], # \textem doesn't exist, but we handle it 'enquote' => ["`", "'"], 'natexlab' => ["", ""], # natbib 'path' => ['\texttt{', '}'], # ugh, might not be a braced argument ); # Non-alphabetic \COMMANDs, other than accents and special cases. # our %CONTROL_SYMBOLS = ( ' ' => ' ', # control space "\t" => ' ', # control space "\n" => '\x{0020}', # control space; use entity to avoid being trimmed '!' => '', # negative thin space # " umlaut '#' => '#', # sharp sign '$' => '$', # dollar sign '%' => '%', # percent sign '&' => '\x{0026}', # ampersand, entity to avoid html conflict # ' acute accent '(' => '', # start inline math ')' => '', # end inline math '*' => '', # discretionary multiplication '+' => '', # tabbing: tab stop to right ',' => '', # thin space '-' => '', # discretionary hyphenation # . overdot accent '/' => '', # italic correction # 0..9 undefined ':' => '', # medium space ';' => ' ', # thick space '<' => '', # tabbing: text to left of margin # = macron accent '>' => '', # tabbing: next tab stop # ? undefined '@' => '#', # end of sentence # A..Z control words, not symbols '[' => '', # start display math '\\' => ' ', # line break ']' => '', # end display math # ^ circumflex accent '_' => '_', # underscore # ` grave accent # a..z control words, not symbols '{' => '\x{007b}', # lbrace '|' => '\x{2225}', # parallel '}' => '\x{007d}', # rbrace # ~ tilde accent ); # Alphabetic \COMMANDs that map to nothing. This is simply # interpolated into %CONTROL_WORDS (next), not used directly, so we # redundantly specify the '' on every line. # our %CONTROL_WORDS_EMPTY = ( 'begingroup' => '', 'bgroup' => '', 'checkcomma' => '', #'cite' => '', # keep \cite undefined since it needs manual work 'clearpage' => '', 'doi' => '', 'egroup' => '', 'endgroup' => '', 'ensuremath' => '', 'hbox' => '', 'ignorespaces' => '', 'mbox' => '', 'medspace' => '', 'negmedspace' => '', 'negthickspace' => '', 'negthinspace' => '', 'newblock' => '', 'newpage' => '', 'noindent' => '', 'nolinkurl' => '', 'oldstylenums' => '', 'pagebreak' => '', 'protect' => '', 'raggedright' => '', 'relax' => '', 'thinspace' => '', 'unskip' => '', 'urlprefix' => '', ); # Alphabetic commands, that expand to nothing (above) and to # something (below). # our %CONTROL_WORDS = ( %CONTROL_WORDS_EMPTY, 'BibLaTeX' => 'BibLaTeX', 'BibTeX' => 'BibTeX', 'LaTeX' => 'LaTeX', 'LuaLaTeX' => 'LuaLaTeX', 'LuaTeX' => 'LuaTeX', 'MF' => 'Metafont', 'MP' => 'MetaPost', 'Omega' => '\x{03A9}', 'TeX' => 'TeX', 'XeLaTeX' => 'XeLaTeX', 'XeTeX' => 'XeTeX', 'bullet' => '\x{2022}', 'dag' => '\x{2020}', 'ddag' => '\x{2021}', 'dots' => '\x{2026}', 'epsilon' => '\x{03F5}', 'hookrightarrow' => '\x{2194}', 'ldots' => '\x{2026}', 'log' => 'log', 'omega' => '\x{03C9}', 'par' => "\n\n", 'qquad' => ' ', # 2em space 'quad' => ' ', # em space 'textbackslash' => '\x{005C}', # entities so \ in output indicates # untranslated TeX source 'textbraceleft' => '\x{007B}', # entities so our bare-brace removal 'textbraceright' => '\x{007D}', # skips them 'textgreater' => '\x{003E}', 'textless' => '\x{003C}', 'textquotedbl' => '"', 'thickspace' => ' ', 'varepsilon' => '\x{03B5}', ); # Control words (not symbols) that generate various non-English # letters and symbols. Lots more could be added. # our %SYMBOLS = ( # Table 3.2 in Lamport, plus more 'AA' => '\x{00C5}', # A with ring 'aa' => '\x{00E5}', 'AE' => '\x{00C6}', # AE 'ae' => '\x{00E6}', 'DH' => '\x{00D0}', # ETH 'dh' => '\x{00F0}', 'DJ' => '\x{0110}', # D with stroke 'dj' => '\x{0111}', 'i' => '\x{0131}', # small dotless i 'L' => '\x{0141}', # L with stroke 'l' => '\x{0142}', 'NG' => '\x{014A}', # ENG 'ng' => '\x{014B}', 'OE' => '\x{0152}', # OE 'oe' => '\x{0153}', 'O' => '\x{00D8}', # O with stroke 'o' => '\x{00F8}', 'SS' => 'SS', # lately also U+1E9E, but SS seems good enough 'ss' => '\x{00DF}', 'TH' => '\x{00DE}', # THORN 'textordfeminine' => '\x{00AA}', 'textordmasculine' => '\x{00BA}', 'textregistered' => '\x{00AE}', 'th' => '\x{00FE}', 'TM' => '\x{2122}', # trade mark sign ); # Accent commands that are not alphabetic. # our %ACCENT_SYMBOLS = ( "\"" => { # with diaresis A => '\x{00C4}', E => '\x{00CB}', H => '\x{1E26}', I => '\x{00CF}', O => '\x{00D6}', U => '\x{00DC}', W => '\x{1E84}', X => '\x{1E8c}', Y => '\x{0178}', "\\I" => '\x{00CF}', "\\i" => '\x{00EF}', a => '\x{00E4}', e => '\x{00EB}', h => '\x{1E27}', i => '\x{00EF}', o => '\x{00F6}', t => '\x{1E97}', u => '\x{00FC}', w => '\x{1E85}', x => '\x{1E8d}', y => '\x{00FF}', }, "'" => { # with acute A => '\x{00C1}', AE => '\x{01FC}', C => '\x{0106}', E => '\x{00C9}', G => '\x{01F4}', I => '\x{00CD}', K => '\x{1E30}', L => '\x{0139}', M => '\x{1E3E}', N => '\x{0143}', O => '\x{00D3}', P => '\x{1E54}', R => '\x{0154}', S => '\x{015A}', U => '\x{00DA}', W => '\x{1E82}', Y => '\x{00DD}', Z => '\x{0179}', "\\I" => '\x{00CD}', "\\i" => '\x{00ED}', a => '\x{00E1}', ae => '\x{01FD}', c => '\x{0107}', e => '\x{00E9}', g => '\x{01F5}', i => '\x{00ED}', k => '\x{1E31}', l => '\x{013A}', m => '\x{1E3f}', n => '\x{0144}', o => '\x{00F3}', p => '\x{1E55}', r => '\x{0155}', s => '\x{015B}', u => '\x{00FA}', w => '\x{1E83}', y => '\x{00FD}', z => '\x{017A}', }, "^" => { # with circumflex A => '\x{00C2}', C => '\x{0108}', E => '\x{00CA}', G => '\x{011C}', H => '\x{0124}', I => '\x{00CE}', J => '\x{0134}', O => '\x{00D4}', R => 'R\x{0302}', S => '\x{015C}', U => '\x{00DB}', W => '\x{0174}', Y => '\x{0176}', Z => '\x{1E90}', "\\I" => '\x{00CE}', "\\J" => '\x{0134}', "\\i" => '\x{00EE}', "\\j" => '\x{0135}', a => '\x{00E2}', c => '\x{0109}', e => '\x{00EA}', g => '\x{011D}', h => '\x{0125}', i => '\x{00EE}', j => '\x{0135}', o => '\x{00F4}', s => '\x{015D}', u => '\x{00FB}', w => '\x{0175}', y => '\x{0177}', z => '\x{1E91}', }, "`" => { # with grave A => '\x{00C0}', E => '\x{00C8}', I => '\x{00CC}', N => '\x{01F8}', O => '\x{00D2}', U => '\x{00D9}', W => '\x{1E80}', Y => '\x{1Ef2}', "\\I" => '\x{00CC}', "\\i" => '\x{00EC}', a => '\x{00E0}', e => '\x{00E8}', i => '\x{00EC}', n => '\x{01F9}', o => '\x{00F2}', u => '\x{00F9}', w => '\x{1E81}', y => '\x{1EF3}', }, "." => { # with dot above A => '\x{0226}', B => '\x{1E02}', C => '\x{010A}', D => '\x{1E0A}', E => '\x{0116}', F => '\x{1E1E}', G => '\x{0120}', H => '\x{1E22}', I => '\x{0130}', M => '\x{1E40}', N => '\x{1E44}', O => '\x{022E}', P => '\x{1E56}', R => '\x{1E58}', S => '\x{1E60}', T => '\x{1E6a}', W => '\x{1E86}', X => '\x{1E8A}', Y => '\x{1E8E}', Z => '\x{017B}', "\\I" => '\x{0130}', a => '\x{0227}', b => '\x{1E03}', c => '\x{010B}', d => '\x{1E0B}', e => '\x{0117}', f => '\x{1e1f}', g => '\x{0121}', h => '\x{1E23}', m => '\x{1E41}', n => '\x{1E45}', o => '\x{022F}', p => '\x{1E57}', r => '\x{1E59}', s => '\x{1E61}', t => '\x{1E6b}', w => '\x{1E87}', x => '\x{1E8b}', y => '\x{1E8f}', z => '\x{017C}', }, '=' => { # with macron A => '\x{0100}', AE => '\x{01E2}', E => '\x{0112}', G => '\x{1E20}', I => '\x{012A}', O => '\x{014C}', U => '\x{016A}', Y => '\x{0232}', "\\I" => '\x{012A}', "\\i" => '\x{012B}', a => '\x{0101}', ae => '\x{01E3}', e => '\x{0113}', g => '\x{1E21}', i => '\x{012B}', o => '\x{014D}', u => '\x{016B}', y => '\x{0233}', }, "~" => { # with tilde A => '\x{00C3}', E => '\x{1EBC}', I => '\x{0128}', N => '\x{00D1}', O => '\x{00D5}', U => '\x{0168}', V => '\x{1E7C}', Y => '\x{1EF8}', "\\I" => '\x{0128}', "\\i" => '\x{0129}', a => '\x{00E3}', e => '\x{1EBD}', i => '\x{0129}', n => '\x{00F1}', o => '\x{00F5}', u => '\x{0169}', v => '\x{1E7D}', y => '\x{1EF9}', }, ); # Accent commands that are alphabetic. # our %ACCENT_LETTERS = ( "H" => { # with double acute O => '\x{0150}', U => '\x{0170}', o => '\x{0151}', u => '\x{0171}', }, "c" => { # with cedilla C => '\x{00C7}', D => '\x{1E10}', E => '\x{0228}', G => '\x{0122}', H => '\x{1E28}', K => '\x{0136}', L => '\x{013B}', N => '\x{0145}', R => '\x{0156}', S => '\x{015E}', T => '\x{0162}', c => '\x{00E7}', d => '\x{1E11}', e => '\x{0229}', g => '\x{0123}', h => '\x{1E29}', k => '\x{0137}', l => '\x{013C}', n => '\x{0146}', r => '\x{0157}', s => '\x{015F}', t => '\x{0163}', }, "d" => { # with dot below A => '\x{1EA0}', B => '\x{1E04}', D => '\x{1E0C}', E => '\x{1EB8}', H => '\x{1E24}', I => '\x{1ECA}', K => '\x{1E32}', L => '\x{1E36}', M => '\x{1E42}', N => '\x{1E46}', O => '\x{1ECC}', R => '\x{1E5A}', S => '\x{1E62}', T => '\x{1E6C}', U => '\x{1EE4}', V => '\x{1E7E}', W => '\x{1E88}', Y => '\x{1Ef4}', Z => '\x{1E92}', "\\I" => '\x{1ECA}', "\\i" => '\x{1ECB}', a => '\x{1EA1}', b => '\x{1E05}', d => '\x{1E0D}', e => '\x{1EB9}', h => '\x{1E25}', i => '\x{1ECB}', k => '\x{1E33}', l => '\x{1E37}', m => '\x{1E43}', n => '\x{1E47}', o => '\x{1ECD}', r => '\x{1E5b}', s => '\x{1E63}', t => '\x{1E6D}', u => '\x{1EE5}', v => '\x{1E7F}', w => '\x{1E89}', y => '\x{1EF5}', z => '\x{1E93}', }, "h" => { # with hook above A => '\x{1EA2}', E => '\x{1EBA}', I => '\x{1EC8}', O => '\x{1ECe}', U => '\x{1EE6}', Y => '\x{1EF6}', "\\I" => '\x{1EC8}', "\\i" => '\x{1EC9}', a => '\x{1EA3}', e => '\x{1EBB}', i => '\x{1EC9}', o => '\x{1ECF}', u => '\x{1EE7}', y => '\x{1EF7}', }, "k" => { # with ogonek A => '\x{0104}', E => '\x{0118}', I => '\x{012E}', O => '\x{01EA}', U => '\x{0172}', "\\I" => '\x{012E}', "\\i" => '\x{012F}', a => '\x{0105}', e => '\x{0119}', i => '\x{012F}', o => '\x{01EB}', u => '\x{0173}', }, "r" => { # with ring above A => '\x{00C5}', U => '\x{016E}', a => '\x{00E5}', u => '\x{016F}', w => '\x{1E98}', y => '\x{1E99}', }, "u" => { # with breve A => '\x{0102}', E => '\x{0114}', G => '\x{011E}', I => '\x{012C}', O => '\x{014E}', U => '\x{016C}', "\\I" => '\x{012C}', "\\i" => '\x{012D}', a => '\x{0103}', e => '\x{0115}', g => '\x{011F}', i => '\x{012D}', o => '\x{014F}', u => '\x{016D}', }, "v" => { # with caron A => '\x{01CD}', C => '\x{010C}', D => '\x{010E}', DZ => '\x{01C4}', E => '\x{011A}', G => '\x{01E6}', H => '\x{021E}', I => '\x{01CF}', K => '\x{01E8}', L => '\x{013D}', N => '\x{0147}', O => '\x{01D1}', R => '\x{0158}', S => '\x{0160}', T => '\x{0164}', U => '\x{01D3}', Z => '\x{017D}', "\\I" => '\x{01CF}', "\\i" => '\x{01D0}', "\\j" => '\x{01F0}', a => '\x{01CE}', c => '\x{010D}', d => '\x{010F}', dz => '\x{01C6}', e => '\x{011B}', g => '\x{01E7}', h => '\x{021F}', i => '\x{01D0}', j => '\x{01F0}', k => '\x{01E9}', l => '\x{013E}', n => '\x{0148}', o => '\x{01D2}', r => '\x{0159}', s => '\x{0161}', t => '\x{0165}', u => '\x{01D4}', z => '\x{017E}', }, ); # our %GERMAN = ( # for package `german'/`ngerman' '"a' => 'ä', '"A' => 'Ä', '"e' => 'ë', '"E' => 'Ë', '"i' => 'ï', '"I' => 'Ï', '"o' => 'ö', '"O' => 'Ö', '"u' => 'ü', '"U' => 'Ü', '"s' => 'ß', '"S' => 'SS', '"z' => 'ß', '"Z' => 'SZ', '"ck' => 'ck', # old spelling: ck -> k-k '"ff' => 'ff', # old spelling: ff -> ff-f '"`' => '„', "\"'" => '“', '"<' => '«', '">' => '»', '"-' => '\x{00AD}', # soft hyphen '""' => '\x{200B}', # zero width space '"~' => '\x{2011}', # non-breaking hyphen '"=' => '-', '\glq' => '‚', # left german single quote '\grq' => '‘', # right german single quote '\flqq' => '«', '\frqq' => '»', '\dq' => '"', ); 1; __END__ =pod =encoding UTF-8 =head1 NAME LaTeX::ToUnicode::Tables - Character tables for LaTeX::ToUnicode =head1 VERSION version 0.54 =head1 CONSTANTS =head2 @LIGATURES Standard TeX character sequences (not \commands) which need to be replaced: C<---> with U+2014 (em dash), etc. Includes: em dash, en dash, inverted exclamation, inverted question, left double quote, right double quote, left single quote, right single quote. They are replaced in that order. =head2 %MARKUPS Hash where keys are the names of formatting commands like C<\tt>, without the backslash, namely: C. Values are the obvious HTML equivalent where one exists, given as the tag name without the angle brackets: C. Otherwise the value is the empty string. =head2 %ARGUMENT_COMMANDS Hash where keys are the names of TeX commands taking arguments that we handle, without the backslash, such as C. Each value is a reference to a list of two strings, the first being the text to insert before the argument, the second being the text to insert after. For example, for C the value is C<["`", "'"]>. The inserted text is subject to further replacements. Only three such commands are currently handled: C<\emph>, C<\enquote>, and C<\path>. =head2 %CONTROL_SYMBOLS A hash where the keys are non-alphabetic C<\command>s (without the backslash), other than accents and special cases. These don't take arguments. Although some of these have Unicode equivalents, such as the C<\,> thin space, it seems better to keep the output as simple as possible; small spacing tweaks in TeX aren't usually desirable in plain text or HTML. The values are single-quoted strings C<'\x{...}'>, not double-quoted literal characters <"\x{...}">, to ease future parsing of the TeX/text/HTML. This hash is necessary because TeX's parsing rules for control symbols are different from control words: no space or other token is needed to terminate control symbols. =head2 %CONTROL_WORDS Keys are names of argument-less commands, such as C<\LaTeX> (without the backslash). Values are the replacements, often the empty string. =head2 %SYMBOLS Keys are the commands for extended characters, such as C<\AA> (without the backslash.) =head2 %ACCENT_SYMBOLS Two-level hash of accented characters like C<\'{a}>. The keys of this hash are the accent symbols (without the backslash), such as C<`> and C<'>. The corresponding values are hash references where the keys are the base letters and the values are single-quoted C<'\x{....}'> strings. =head2 %ACCENT_LETTERS Same as %ACCENT_SYMBOLS, except the keys are accents that are alphabetic, such as C<\c> (without the backslash as always). As with control sequences, it's necessary to distinguish symbols and alphabetic commands because of the different parsing rules. =head2 %GERMAN Character sequences (not necessarily commands) as defined by the package `german'/`ngerman', e.g. C<"a> (a with umlaut), C<"s> (german sharp s) or C<"`"> (german left quote). Note the missing backslash. The keys of this hash are the literal character sequences. =head1 AUTHOR Gerhard Gossen , Boris Veytsman , Karl Berry L =head1 COPYRIGHT AND LICENSE Copyright 2010-2024 Gerhard Gossen, Boris Veytsman, Karl Berry This is free software; you can redistribute it and/or modify it under the same terms as the Perl5 programming language system itself. =cut