Module: Rpdf2txt::Symbol

Defined in:
lib/rpdf2txt/symbol.rb

Constant Summary

# Glyph names of the Symbol font mapped to their one-byte character codes.
# Codes are written as octal literals (leading 0). The trailing comment on
# each entry shows the glyph, or a short description where the glyph is a
# font-specific piece with no standalone appearance.
#
# The table is frozen: it is a read-only lookup and is consulted first by
# Symbol.byte.
NAMES = {
  "Alpha"          => 0101, # Α
  "Beta"           => 0102, # Β
  "Chi"            => 0103, # Χ
  "Delta"          => 0104, # Δ
  "Epsilon"        => 0105, # Ε
  "Eta"            => 0110, # Η
  "Euro"           => 0240, # €
  "Gamma"          => 0107, # Γ
  "Ifraktur"       => 0301, # ℑ
  "Iota"           => 0111, # Ι
  "Kappa"          => 0113, # Κ
  "Lambda"         => 0114, # Λ
  "Mu"             => 0115, # Μ
  "Nu"             => 0116, # Ν
  "Omega"          => 0127, # Ω
  "Omicron"        => 0117, # Ο
  "Phi"            => 0106, # Φ
  "Pi"             => 0120, # Π
  "Psi"            => 0131, # Ψ
  "Rfraktur"       => 0302, # ℜ
  "Rho"            => 0122, # Ρ
  "Sigma"          => 0123, # Σ
  "Tau"            => 0124, # Τ
  "Theta"          => 0121, # Θ
  "Upsilon"        => 0125, # Υ
  "Upsilon1"       => 0241, # ϒ
  "Xi"             => 0130, # Ξ
  "Zeta"           => 0132, # Ζ
  "aleph"          => 0300, # ℵ
  "alpha"          => 0141, # α
  "ampersand"      => 0046, # &
  "angle"          => 0320, # ∠
  "angleleft"      => 0341, # 〈
  "angleright"     => 0361, # 〉
  "approxequal"    => 0273, # ≈
  "arrowboth"      => 0253, # ↔
  "arrowdblboth"   => 0333, # ⇔
  "arrowdbldown"   => 0337, # ⇓
  "arrowdblleft"   => 0334, # ⇐
  "arrowdblright"  => 0336, # ⇒
  "arrowdblup"     => 0335, # ⇑
  "arrowdown"      => 0257, # ↓
  "arrowhorizex"   => 0276, # ⎯ (horizontal arrow extender)
  "arrowleft"      => 0254, # ←
  "arrowright"     => 0256, # →
  "arrowup"        => 0255, # ↑
  "arrowvertex"    => 0275, # ⏐ (vertical arrow extender)
  "asteriskmath"   => 0052, # ∗
  "bar"            => 0174, # |
  "beta"           => 0142, # β
  "braceleft"      => 0173, # {
  "braceright"     => 0175, # }
  "bracelefttp"    => 0354, # ⎧
  "braceleftmid"   => 0355, # ⎨
  "braceleftbt"    => 0356, # ⎩
  "bracerighttp"   => 0374, # ⎫
  "bracerightmid"  => 0375, # ⎬
  "bracerightbt"   => 0376, # ⎭
  "braceex"        => 0357, # ⎪
  "bracketleft"    => 0133, # [
  "bracketright"   => 0135, # ]
  "bracketlefttp"  => 0351, # ⎡
  "bracketleftex"  => 0352, # ⎢
  "bracketleftbt"  => 0353, # ⎣
  "bracketrighttp" => 0371, # ⎤
  "bracketrightex" => 0372, # ⎥
  "bracketrightbt" => 0373, # ⎦
  "bullet"         => 0267, # •
  "carriagereturn" => 0277, # ↵
  "chi"            => 0143, # χ
  "circlemultiply" => 0304, # ⊗
  "circleplus"     => 0305, # ⊕
  "club"           => 0247, # ♣
  "colon"          => 0072, # :
  "comma"          => 0054, # ,
  "congruent"      => 0100, # ≅
  "copyrightsans"  => 0343, # © (sans serif)
  "copyrightserif" => 0323, # © (serif)
  "degree"         => 0260, # °
  "delta"          => 0144, # δ
  "diamond"        => 0250, # ♦
  "divide"         => 0270, # ÷
  "dotmath"        => 0327, # ⋅
  "eight"          => 0070, # 8
  "element"        => 0316, # ∈
  "ellipsis"       => 0274, # …
  "emptyset"       => 0306, # ∅
  "epsilon"        => 0145, # ε
  "equal"          => 0075, # =
  "equivalence"    => 0272, # ≡
  "eta"            => 0150, # η
  "exclam"         => 0041, # !
  "existential"    => 0044, # ∃
  "five"           => 0065, # 5
  "florin"         => 0246, # ƒ
  "four"           => 0064, # 4
  "fraction"       => 0244, # ⁄
  "gamma"          => 0147, # γ
  "gradient"       => 0321, # ∇
  "greater"        => 0076, # >
  "greaterequal"   => 0263, # ≥
  "heart"          => 0251, # ♥
  "infinity"       => 0245, # ∞
  "integral"       => 0362, # ∫
  "integraltp"     => 0363, # ⌠
  "integralex"     => 0364, # ⎮
  "integralbt"     => 0365, # ⌡
  "intersection"   => 0307, # ∩
  "iota"           => 0151, # ι
  "kappa"          => 0153, # κ
  "lambda"         => 0154, # λ
  "less"           => 0074, # <
  "lessequal"      => 0243, # ≤
  "logicaland"     => 0331, # ∧
  "logicalnot"     => 0330, # ¬
  "logicalor"      => 0332, # ∨
  "lozenge"        => 0340, # ◊
  "minus"          => 0055, # −
  "minute"         => 0242, # ′
  "mu"             => 0155, # μ
  "multiply"       => 0264, # ×
  "nine"           => 0071, # 9
  "notelement"     => 0317, # ∉
  "notequal"       => 0271, # ≠
  "notsubset"      => 0313, # ⊄
  "nu"             => 0156, # ν
  "numbersign"     => 0043, # #
  "omega"          => 0167, # ω
  "omega1"         => 0166, # ϖ
  "omicron"        => 0157, # ο
  "one"            => 0061, # 1
  "parenleft"      => 0050, # (
  "parenright"     => 0051, # )
  "parenlefttp"    => 0346, # ⎛
  "parenleftex"    => 0347, # ⎜
  "parenleftbt"    => 0350, # ⎝
  "parenrighttp"   => 0366, # ⎞
  "parenrightex"   => 0367, # ⎟
  "parenrightbt"   => 0370, # ⎠
  "partialdiff"    => 0266, # ∂
  "percent"        => 0045, # %
  "period"         => 0056, # .
  "perpendicular"  => 0136, # ⊥
  "phi"            => 0146, # φ
  "phi1"           => 0152, # ϕ
  "pi"             => 0160, # π
  "plus"           => 0053, # +
  "plusminus"      => 0261, # ±
  "product"        => 0325, # ∏
  "propersubset"   => 0314, # ⊂
  "propersuperset" => 0311, # ⊃
  "proportional"   => 0265, # ∝
  "psi"            => 0171, # ψ
  "question"       => 0077, # ?
  "radical"        => 0326, # √
  "radicalex"      => 0140, # radical extender (no standard Unicode glyph)
  "reflexsubset"   => 0315, # ⊆
  "reflexsuperset" => 0312, # ⊇
  "registersans"   => 0342, # ® (sans serif)
  "registerserif"  => 0322, # ® (serif)
  "rho"            => 0162, # ρ
  "second"         => 0262, # ″
  "semicolon"      => 0073, # ;
  "seven"          => 0067, # 7
  "sigma"          => 0163, # σ
  "sigma1"         => 0126, # ς
  "similar"        => 0176, # ∼
  "six"            => 0066, # 6
  "slash"          => 0057, # /
  "space"          => 0040, # (space)
  "spade"          => 0252, # ♠
  "suchthat"       => 0047, # ∋
  "summation"      => 0345, # ∑
  "tau"            => 0164, # τ
  "therefore"      => 0134, # ∴
  "theta"          => 0161, # θ
  "theta1"         => 0112, # ϑ
  "three"          => 0063, # 3
  "trademarksans"  => 0344, # ™ (sans serif)
  "trademarkserif" => 0324, # ™ (serif)
  "two"            => 0062, # 2
  "underscore"     => 0137, # _
  "union"          => 0310, # ∪
  "universal"      => 0042, # ∀
  "upsilon"        => 0165, # υ
  "weierstrass"    => 0303, # ℘
  "xi"             => 0170, # ξ
  "zero"           => 0060, # 0
  "zeta"           => 0172, # ζ
}.freeze
# Symbol-font byte code => two-byte big-endian UTF-16 code unit, stored as a
# two-character String of raw bytes. Used by Symbol.to_utf16 and, inverted,
# as SYMBOL_MAP.
#
# The sans-serif variants of REGISTERED, COPYRIGHT and TRADE MARK (0xE2-0xE4)
# map to the same code units as their serif counterparts (0xD2-0xD4).
#
# The table is frozen because it is a read-only lookup.
UNICODE_MAP = { # based on http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/SYMBOL.TXT
  0x20 => "\x00\x20",  # SPACE
  0x21 => "\x00\x21",  # EXCLAMATION MARK
  0x22 => "\x22\x00",  # FOR ALL
  0x23 => "\x00\x23",  # NUMBER SIGN
  0x24 => "\x22\x03",  # THERE EXISTS
  0x25 => "\x00\x25",  # PERCENT SIGN
  0x26 => "\x00\x26",  # AMPERSAND
  0x27 => "\x22\x0D",  # SMALL CONTAINS AS MEMBER
  0x28 => "\x00\x28",  # LEFT PARENTHESIS
  0x29 => "\x00\x29",  # RIGHT PARENTHESIS
  0x2A => "\x22\x17",  # ASTERISK OPERATOR
  0x2B => "\x00\x2B",  # PLUS SIGN
  0x2C => "\x00\x2C",  # COMMA
  0x2D => "\x22\x12",  # MINUS SIGN
  0x2E => "\x00\x2E",  # FULL STOP
  0x2F => "\x00\x2F",  # SOLIDUS
  0x30 => "\x00\x30",  # DIGIT ZERO
  0x31 => "\x00\x31",  # DIGIT ONE
  0x32 => "\x00\x32",  # DIGIT TWO
  0x33 => "\x00\x33",  # DIGIT THREE
  0x34 => "\x00\x34",  # DIGIT FOUR
  0x35 => "\x00\x35",  # DIGIT FIVE
  0x36 => "\x00\x36",  # DIGIT SIX
  0x37 => "\x00\x37",  # DIGIT SEVEN
  0x38 => "\x00\x38",  # DIGIT EIGHT
  0x39 => "\x00\x39",  # DIGIT NINE
  0x3A => "\x00\x3A",  # COLON
  0x3B => "\x00\x3B",  # SEMICOLON
  0x3C => "\x00\x3C",  # LESS-THAN SIGN
  0x3D => "\x00\x3D",  # EQUALS SIGN
  0x3E => "\x00\x3E",  # GREATER-THAN SIGN
  0x3F => "\x00\x3F",  # QUESTION MARK
  0x40 => "\x22\x45",  # APPROXIMATELY EQUAL TO
  0x41 => "\x03\x91",  # GREEK CAPITAL LETTER ALPHA
  0x42 => "\x03\x92",  # GREEK CAPITAL LETTER BETA
  0x43 => "\x03\xA7",  # GREEK CAPITAL LETTER CHI
  0x44 => "\x03\x94",  # GREEK CAPITAL LETTER DELTA
  0x45 => "\x03\x95",  # GREEK CAPITAL LETTER EPSILON
  0x46 => "\x03\xA6",  # GREEK CAPITAL LETTER PHI
  0x47 => "\x03\x93",  # GREEK CAPITAL LETTER GAMMA
  0x48 => "\x03\x97",  # GREEK CAPITAL LETTER ETA
  0x49 => "\x03\x99",  # GREEK CAPITAL LETTER IOTA
  0x4A => "\x03\xD1",  # GREEK THETA SYMBOL
  0x4B => "\x03\x9A",  # GREEK CAPITAL LETTER KAPPA
  0x4C => "\x03\x9B",  # GREEK CAPITAL LETTER LAMDA
  0x4D => "\x03\x9C",  # GREEK CAPITAL LETTER MU
  0x4E => "\x03\x9D",  # GREEK CAPITAL LETTER NU
  0x4F => "\x03\x9F",  # GREEK CAPITAL LETTER OMICRON
  0x50 => "\x03\xA0",  # GREEK CAPITAL LETTER PI
  0x51 => "\x03\x98",  # GREEK CAPITAL LETTER THETA
  0x52 => "\x03\xA1",  # GREEK CAPITAL LETTER RHO
  0x53 => "\x03\xA3",  # GREEK CAPITAL LETTER SIGMA
  0x54 => "\x03\xA4",  # GREEK CAPITAL LETTER TAU
  0x55 => "\x03\xA5",  # GREEK CAPITAL LETTER UPSILON
  0x56 => "\x03\xC2",  # GREEK SMALL LETTER FINAL SIGMA
  0x57 => "\x03\xA9",  # GREEK CAPITAL LETTER OMEGA
  0x58 => "\x03\x9E",  # GREEK CAPITAL LETTER XI
  0x59 => "\x03\xA8",  # GREEK CAPITAL LETTER PSI
  0x5A => "\x03\x96",  # GREEK CAPITAL LETTER ZETA
  0x5B => "\x00\x5B",  # LEFT SQUARE BRACKET
  0x5C => "\x22\x34",  # THEREFORE
  0x5D => "\x00\x5D",  # RIGHT SQUARE BRACKET
  0x5E => "\x22\xA5",  # UP TACK
  0x5F => "\x00\x5F",  # LOW LINE
  0x60 => "\xF8\xE5",  # radical extender # corporate char
  0x61 => "\x03\xB1",  # GREEK SMALL LETTER ALPHA
  0x62 => "\x03\xB2",  # GREEK SMALL LETTER BETA
  0x63 => "\x03\xC7",  # GREEK SMALL LETTER CHI
  0x64 => "\x03\xB4",  # GREEK SMALL LETTER DELTA
  0x65 => "\x03\xB5",  # GREEK SMALL LETTER EPSILON
  0x66 => "\x03\xC6",  # GREEK SMALL LETTER PHI
  0x67 => "\x03\xB3",  # GREEK SMALL LETTER GAMMA
  0x68 => "\x03\xB7",  # GREEK SMALL LETTER ETA
  0x69 => "\x03\xB9",  # GREEK SMALL LETTER IOTA
  0x6A => "\x03\xD5",  # GREEK PHI SYMBOL
  0x6B => "\x03\xBA",  # GREEK SMALL LETTER KAPPA
  0x6C => "\x03\xBB",  # GREEK SMALL LETTER LAMDA
  0x6D => "\x03\xBC",  # GREEK SMALL LETTER MU
  0x6E => "\x03\xBD",  # GREEK SMALL LETTER NU
  0x6F => "\x03\xBF",  # GREEK SMALL LETTER OMICRON
  0x70 => "\x03\xC0",  # GREEK SMALL LETTER PI
  0x71 => "\x03\xB8",  # GREEK SMALL LETTER THETA
  0x72 => "\x03\xC1",  # GREEK SMALL LETTER RHO
  0x73 => "\x03\xC3",  # GREEK SMALL LETTER SIGMA
  0x74 => "\x03\xC4",  # GREEK SMALL LETTER TAU
  0x75 => "\x03\xC5",  # GREEK SMALL LETTER UPSILON
  0x76 => "\x03\xD6",  # GREEK PI SYMBOL
  0x77 => "\x03\xC9",  # GREEK SMALL LETTER OMEGA
  0x78 => "\x03\xBE",  # GREEK SMALL LETTER XI
  0x79 => "\x03\xC8",  # GREEK SMALL LETTER PSI
  0x7A => "\x03\xB6",  # GREEK SMALL LETTER ZETA
  0x7B => "\x00\x7B",  # LEFT CURLY BRACKET
  0x7C => "\x00\x7C",  # VERTICAL LINE
  0x7D => "\x00\x7D",  # RIGHT CURLY BRACKET
  0x7E => "\x22\x3C",  # TILDE OPERATOR
  0xA0 => "\x20\xAC",  # EURO SIGN
  0xA1 => "\x03\xD2",  # GREEK UPSILON WITH HOOK SYMBOL
  0xA2 => "\x20\x32",  # PRIME # minute
  0xA3 => "\x22\x64",  # LESS-THAN OR EQUAL TO
  0xA4 => "\x20\x44",  # FRACTION SLASH
  0xA5 => "\x22\x1E",  # INFINITY
  0xA6 => "\x01\x92",  # LATIN SMALL LETTER F WITH HOOK
  0xA7 => "\x26\x63",  # BLACK CLUB SUIT
  0xA8 => "\x26\x66",  # BLACK DIAMOND SUIT
  0xA9 => "\x26\x65",  # BLACK HEART SUIT
  0xAA => "\x26\x60",  # BLACK SPADE SUIT
  0xAB => "\x21\x94",  # LEFT RIGHT ARROW
  0xAC => "\x21\x90",  # LEFTWARDS ARROW
  0xAD => "\x21\x91",  # UPWARDS ARROW
  0xAE => "\x21\x92",  # RIGHTWARDS ARROW
  0xAF => "\x21\x93",  # DOWNWARDS ARROW
  0xB0 => "\x00\xB0",  # DEGREE SIGN
  0xB1 => "\x00\xB1",  # PLUS-MINUS SIGN
  0xB2 => "\x20\x33",  # DOUBLE PRIME # second
  0xB3 => "\x22\x65",  # GREATER-THAN OR EQUAL TO
  0xB4 => "\x00\xD7",  # MULTIPLICATION SIGN
  0xB5 => "\x22\x1D",  # PROPORTIONAL TO
  0xB6 => "\x22\x02",  # PARTIAL DIFFERENTIAL
  0xB7 => "\x20\x22",  # BULLET
  0xB8 => "\x00\xF7",  # DIVISION SIGN
  0xB9 => "\x22\x60",  # NOT EQUAL TO
  0xBA => "\x22\x61",  # IDENTICAL TO
  0xBB => "\x22\x48",  # ALMOST EQUAL TO
  0xBC => "\x20\x26",  # HORIZONTAL ELLIPSIS
  0xBD => "\x23\xD0",  # VERTICAL LINE EXTENSION (for arrows) # for Unicode 4.0 and later
  0xBE => "\x23\xAF",  # HORIZONTAL LINE EXTENSION (for arrows) # for Unicode 3.2 and later
  0xBF => "\x21\xB5",  # DOWNWARDS ARROW WITH CORNER LEFTWARDS
  0xC0 => "\x21\x35",  # ALEF SYMBOL
  0xC1 => "\x21\x11",  # BLACK-LETTER CAPITAL I
  0xC2 => "\x21\x1C",  # BLACK-LETTER CAPITAL R
  0xC3 => "\x21\x18",  # SCRIPT CAPITAL P
  0xC4 => "\x22\x97",  # CIRCLED TIMES
  0xC5 => "\x22\x95",  # CIRCLED PLUS
  0xC6 => "\x22\x05",  # EMPTY SET
  0xC7 => "\x22\x29",  # INTERSECTION
  0xC8 => "\x22\x2A",  # UNION
  0xC9 => "\x22\x83",  # SUPERSET OF
  0xCA => "\x22\x87",  # SUPERSET OF OR EQUAL TO
  0xCB => "\x22\x84",  # NOT A SUBSET OF
  0xCC => "\x22\x82",  # SUBSET OF
  0xCD => "\x22\x86",  # SUBSET OF OR EQUAL TO
  0xCE => "\x22\x08",  # ELEMENT OF
  0xCF => "\x22\x09",  # NOT AN ELEMENT OF
  0xD0 => "\x22\x20",  # ANGLE
  0xD1 => "\x22\x07",  # NABLA
  0xD2 => "\x00\xAE",  # REGISTERED SIGN # serif
  0xD3 => "\x00\xA9",  # COPYRIGHT SIGN # serif
  0xD4 => "\x21\x22",  # TRADE MARK SIGN # serif
  0xD5 => "\x22\x0F",  # N-ARY PRODUCT
  0xD6 => "\x22\x1A",  # SQUARE ROOT
  0xD7 => "\x22\xC5",  # DOT OPERATOR
  0xD8 => "\x00\xAC",  # NOT SIGN
  0xD9 => "\x22\x27",  # LOGICAL AND
  0xDA => "\x22\x28",  # LOGICAL OR
  0xDB => "\x21\xD4",  # LEFT RIGHT DOUBLE ARROW
  0xDC => "\x21\xD0",  # LEFTWARDS DOUBLE ARROW
  0xDD => "\x21\xD1",  # UPWARDS DOUBLE ARROW
  0xDE => "\x21\xD2",  # RIGHTWARDS DOUBLE ARROW
  0xDF => "\x21\xD3",  # DOWNWARDS DOUBLE ARROW
  0xE0 => "\x25\xCA",  # LOZENGE # previously mapped to 0x22C4 DIAMOND OPERATOR
  0xE1 => "\x30\x08",  # LEFT ANGLE BRACKET
  0xE2 => "\x00\xAE",  # REGISTERED SIGN, alternate: sans serif (0xF87F)
  0xE3 => "\x00\xA9",  # COPYRIGHT SIGN, alternate: sans serif (0xF87F)
  0xE4 => "\x21\x22",  # TRADE MARK SIGN, alternate: sans serif (0xF87F)
  0xE5 => "\x22\x11",  # N-ARY SUMMATION
  0xE6 => "\x23\x9B",  # LEFT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
  0xE7 => "\x23\x9C",  # LEFT PARENTHESIS EXTENSION # for Unicode 3.2 and later
  0xE8 => "\x23\x9D",  # LEFT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
  0xE9 => "\x23\xA1",  # LEFT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
  0xEA => "\x23\xA2",  # LEFT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
  0xEB => "\x23\xA3",  # LEFT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
  0xEC => "\x23\xA7",  # LEFT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
  0xED => "\x23\xA8",  # LEFT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
  0xEE => "\x23\xA9",  # LEFT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
  0xEF => "\x23\xAA",  # CURLY BRACKET EXTENSION # for Unicode 3.2 and later
  0xF0 => "\xF8\xFF",  # Apple logo
  0xF1 => "\x30\x09",  # RIGHT ANGLE BRACKET
  0xF2 => "\x22\x2B",  # INTEGRAL
  0xF3 => "\x23\x20",  # TOP HALF INTEGRAL
  0xF4 => "\x23\xAE",  # INTEGRAL EXTENSION # for Unicode 3.2 and later
  0xF5 => "\x23\x21",  # BOTTOM HALF INTEGRAL
  0xF6 => "\x23\x9E",  # RIGHT PARENTHESIS UPPER HOOK # for Unicode 3.2 and later
  0xF7 => "\x23\x9F",  # RIGHT PARENTHESIS EXTENSION # for Unicode 3.2 and later
  0xF8 => "\x23\xA0",  # RIGHT PARENTHESIS LOWER HOOK # for Unicode 3.2 and later
  0xF9 => "\x23\xA4",  # RIGHT SQUARE BRACKET UPPER CORNER # for Unicode 3.2 and later
  0xFA => "\x23\xA5",  # RIGHT SQUARE BRACKET EXTENSION # for Unicode 3.2 and later
  0xFB => "\x23\xA6",  # RIGHT SQUARE BRACKET LOWER CORNER # for Unicode 3.2 and later
  0xFC => "\x23\xAB",  # RIGHT CURLY BRACKET UPPER HOOK # for Unicode 3.2 and later
  0xFD => "\x23\xAC",  # RIGHT CURLY BRACKET MIDDLE PIECE # for Unicode 3.2 and later
  0xFE => "\x23\xAD",  # RIGHT CURLY BRACKET LOWER HOOK # for Unicode 3.2 and later
}.freeze
# Reverse of UNICODE_MAP: two-byte big-endian UTF-16 String => Symbol-font
# byte code. Used by Symbol.from_utf16 and as the fallback in Symbol.byte.
#
# UNICODE_MAP contains repeated values (REGISTERED, COPYRIGHT and TRADE MARK
# each appear at a serif and a sans-serif code). Hash#invert keeps the entry
# defined last, so those three code units resolve to the sans-serif codes
# 0xE2-0xE4.
SYMBOL_MAP = UNICODE_MAP.invert.freeze

Class Method Summary

Class Method Details

.byte(name) ⇒ Object



390
391
392
# File 'lib/rpdf2txt/symbol.rb', line 390

# Resolves +name+ to a Symbol-font byte code.
#
# NAMES is consulted first, so +name+ is normally a glyph name such as
# "Alpha". If NAMES has no truthy entry for it, +name+ is looked up in
# SYMBOL_MAP instead (whose keys are two-byte UTF-16 strings).
#
# Returns the code from whichever table matched, or the result of the
# SYMBOL_MAP lookup (nil for a plain Hash miss) when neither does.
def Symbol.byte(name)
  named_code = NAMES[name]
  return named_code if named_code

  SYMBOL_MAP[name]
end

.from_utf16(txt) ⇒ Object



400
401
402
403
404
405
406
# File 'lib/rpdf2txt/symbol.rb', line 400

# Converts big-endian UTF-16 text into Symbol-font bytes.
#
# +txt+ is decoded as consecutive 16-bit big-endian code units, regardless of
# the encoding tag on the String. Each unit is translated through SYMBOL_MAP;
# units with no mapping are dropped, and a trailing odd byte is ignored.
#
# Returns a new String with one byte per mapped unit. On Ruby 1.9+ it is
# tagged binary (ASCII-8BIT), so Symbol.to_utf16, which reads its input with
# each_byte, can round-trip the result.
#
# Compared with scanning the text with /../n, this
# * keeps pair alignment when a byte is 0x0A ("." does not match a newline),
# * does not raise on UTF-8-tagged input containing bytes above 0x7F, and
# * emits a single byte for codes above 0x7F instead of a UTF-8 sequence.
def Symbol.from_utf16(txt)
  # Index SYMBOL_MAP by integer code unit so the lookup does not depend on
  # the encoding tags of the map's String keys or of +txt+. Built once.
  @symbol_by_code_unit ||= SYMBOL_MAP.each_with_object({}) { |(pair, code), index|
    index[pair.unpack('n').first] = code
  }
  # String.new (no argument) is an empty, unfrozen binary buffer on 1.9+,
  # so << with an Integer appends exactly one raw byte.
  res = String.new
  txt.unpack('n*').each { |unit|
    code = @symbol_by_code_unit[unit]
    res << code if code
  }
  res
end

.to_utf16(txt) ⇒ Object



393
394
395
396
397
398
399
# File 'lib/rpdf2txt/symbol.rb', line 393

# Converts Symbol-font bytes into UTF-16 text.
#
# Every byte of +txt+ is translated through UNICODE_MAP and the resulting
# strings are appended, in order, to a fresh String. Bytes that have no
# entry in UNICODE_MAP contribute nothing to the output.
#
# Returns the accumulated String (empty when +txt+ is empty or nothing maps).
def Symbol.to_utf16(txt)
  txt.each_byte.each_with_object('') { |code, utf16|
    utf16 << UNICODE_MAP.fetch(code, '')
  }
end