Module: TypoHero

Extended by:: TypoHero

Included in:: TypoHero

Defined in:: lib/typohero.rb,
lib/typohero/latex.rb,
lib/typohero/version.rb

Constant Summary collapse

EXCLUDED_TAGS =

%w(head pre code kbd math script style textarea)

EXCLUDED_TAGS_RE =

/\A<(\/)?(?:#{EXCLUDED_TAGS.join('|')})[\p{Space}\/>]/im

TOKENIZER_RE =

%r{
  <!--(?:(?:(?!-->).)*)-->|            # comment
  <!\[CDATA\[(?:(?:(?!\]\]>).)*)\]\]>| # cdata
  <[^>]+>|                             # opening or closing tag
  \\[\(\)\[\]]|                        # latex begin/end
  \$\$|                                # dollar latex begin/end
  (?:(?:(?!\$\$|\\[\(\)\[\]])[^<])+)   # text without double dollar or latex
}xm

ESCAPE =

{
  '\\\\'  => '&#92;',
  '\"'    => '&#34;',
  "\\'"   => '&#39;',
  '\.'    => '&#46;',
  '\,'    => '&#44;',
  '\-'    => '&#45;',
  '\`'    => '&#96;',
}

# Inverse of ESCAPE: maps every numeric entity produced by ESCAPE back to the
# literal character that followed the backslash (the ESCAPE key minus its
# first character).
UNESCAPE = Hash[ESCAPE.map { |k, v| [v, k[1..-1]] }]

ESCAPE_RE =

Regexp.union(*ESCAPE.keys)

UNESCAPE_RE =

Regexp.union(*UNESCAPE.keys)

NBSP =

"\u00a0"

NBSP_THIN =

"\u202F"

MDASH =

"\u2014"

NDASH =

"\u2013"

LDQUO =

"\u201C"

RDQUO =

"\u201D"

LSQUO =

"\u2018"

RSQUO =

"\u2019"

BDQUO =

"\u201E"

ELLIPSIS =

"\u2026"

SPECIAL =

{
  # enhance!
  ' - '      => " #{NDASH} ",
  '---'      => MDASH,
  '--'       => NDASH,
  '...'      => ELLIPSIS,
  '. . .'    => ELLIPSIS,
  '``'       => LDQUO,
  "''"       => RDQUO,
  '`'        => LSQUO,
  #'\''        => RSQUO, # needs more complex treatment
  ',,'       => BDQUO,
  '(c)'      => "\u00A9",
  '(C)'      => "\u00A9",
  '(r)'      => "\u00AE",
  '(R)'      => "\u00AE",
  '(tm)'     => "\u2122",
  '(TM)'     => "\u2122",
  # normalize for further processing
  '&ldquo;'  => LDQUO,
  '&rdquo;'  => RDQUO,
  '&lsquo;'  => LSQUO,
  '&rsquo;'  => RSQUO,
  '&nbsp;'   => NBSP,
  '&ndash;'  => NDASH,
  '&mdash;'  => MDASH
}

SPECIAL_RE =

Regexp.union(*SPECIAL.keys)

LATEX_RE =

/(#{Regexp.union *LATEX.keys})(?=\p{Space}|$)/m

DASH_RE =

"[#{MDASH}#{NDASH}]"

AMP_RE =

'&(?:amp;)?'

LEFT_QUOTE_RE =

"[#{LDQUO}#{LSQUO}#{BDQUO}]"

PRIME_RE =

/(?<=\d)(''?)(?=[\p{Space}\dNEWS]|$)/m

PRIMES =

{
 "'"   => "\u2032",
 "''"  => "\u2033",
 "'''" => "\u2034",
}

ORDINAL_RE =

/(?<=\d)(st|nd|rd|th)(?=\p{Space}|$)/

MDASH_SPACE_RE =

/\p{Space}*#{MDASH}\p{Space}*/

NDASH_SPACE_RE =

/\p{Space}*#{NDASH}\p{Space}*/

MDASH_SPACE =

"#{NBSP_THIN}#{MDASH}#{NBSP_THIN}"

NDASH_SPACE =

"#{NBSP}#{NDASH}#{NBSP}"

REPLACE_AMP_RE =

/(?<=\p{Space})#{AMP_RE}(?=\p{Space})/

CAPS_BEGIN_RE =

"(^|\\p{Space}|#{LEFT_QUOTE_RE})"

CAPS_INNER_RE = # right quote for possession (e.g. JIMMY’S)

"(?:#{AMP_RE}|[A-Z\\d\\.]|#{RSQUO})*"

CAPS_RE =

/#{CAPS_BEGIN_RE}([A-Z\d]#{CAPS_INNER_RE}[A-Z]#{CAPS_INNER_RE}|[A-Z]#{CAPS_INNER_RE}[A-Z\d]#{CAPS_INNER_RE})/m

RIGHT_QUOTE_RE =

%r{
  ^['"](?=\p{Punct})\B|                       # Very first character is a closing quote followed by punctuation at a non-word-break
  (?<!^|#{DASH_RE}|\p{Space}|[\[\{\(\-])['"]| # Not after dash, space or opening parentheses
  ['"](?=\p{Space}|$)|                        # Followed by space or end of line
  's\b|                                       # Apostrophe
  (?<=#{DASH_RE})['"](?=\p{Punct})|           # Dash quote punctuation (e.g. --'!), for quotations
  '(?=(\d\d(?:s|\p{Space}|$)))                # Decade abbreviations (the '80s)
}xm

LEFT_QUOTES =

{
  "'" => LSQUO,
  '"' => LDQUO,
}

RIGHT_QUOTES =

{
  "'" => RSQUO,
  '"' => RDQUO,
}

TWO_QUOTES =

{
  '"\'' => LDQUO + LSQUO,
  '\'"' => LSQUO + LDQUO
}

PARAGRAPH_RE =

'h[1-6]|p|li|dt|dd|div'

INLINE_RE =

'a|em|span|strong|i|b'

WIDONT_PARAGRAPH_RE =

/\A<\/(?:#{PARAGRAPH_RE})>\Z/im

WIDONT_INLINE_RE =

/\A<\/?(?:#{INLINE_RE})[^>]*>\Z/im

WIDONT_NBSP_RE =

/[#{NBSP}#{NBSP_THIN}<>]/

INITIAL_QUOTE_RE =

/(?=(?:<(?:#{PARAGRAPH_RE})[^>]*>|^)(?:<(?:#{INLINE_RE})[^>]*>|\p{Space})*)#{LEFT_QUOTE_RE}/m

INITIAL_QUOTES =

{
  LSQUO => "<span class=\"quo\">#{LSQUO}</span>",
  LDQUO => "<span class=\"dquo\">#{LDQUO}</span>",
  BDQUO => "<span class=\"bdquo\">#{BDQUO}</span>",
}

# Maps LaTeX commands (including the leading backslash) to the Unicode
# character they are replaced with.
#
# Every key must be unique: a repeated key silently overrides the earlier
# entry. '\sigma' used to be listed twice, which made it render as the final
# sigma (U+03C2) instead of the regular sigma (U+03C3).
LATEX = {
  '\AA' => "\u00c5",
  '\AE' => "\u00c6",
  '\Alpha' => "\u0391",
  '\Beta' => "\u0392",
  '\Box' => "\u25a1",
  '\Bumpeq' => "\u224e",
  '\Cap' => "\u22d2",
  '\Chi' => "\u03a7",
  '\Cup' => "\u22d3",
  '\DH' => "\u00d0",
  '\DJ' => "\u0110",
  '\Delta' => "\u0394",
  '\ESH' => "\u01a9",
  '\Epsilon' => "\u0395",
  '\Eta' => "\u0397",
  '\Gamma' => "\u0393",
  '\IJ' => "\u0132",
  '\Iota' => "\u0399",
  '\Kappa' => "\u039a",
  '\L' => "\u0141",
  '\Lambda' => "\u039b",
  '\Leftrightarrow' => "\u21d4",
  '\Mu' => "\u039c",
  '\NG' => "\u014a",
  '\Nu' => "\u039d",
  '\O' => "\u00d8",
  '\OE' => "\u0152",
  '\OHORN' => "\u01a0",
  '\Omega' => "\u03a9",
  '\Omicron' => "\u039f",
  '\P' => "\u00b6",
  '\Phi' => "\u03a6",
  '\Pi' => "\u03a0",
  '\Psi' => "\u03a8",
  '\Rho' => "\u03a1",
  '\Rightarrow' => "\u21d2",
  '\S' => "\u00a7",
  '\Sigma' => "\u03a3",
  '\Subset' => "\u22d0",
  '\Supset' => "\u22d1",
  '\TH' => "\u00de",
  '\Tau' => "\u03a4",
  '\Theta' => "\u0398",
  '\Thorn' => "\u00de",
  '\UHORN' => "\u01af",
  '\Upsilon' => "\u03a5",
  '\Vdash' => "\u22a9",
  '\Vvdash' => "\u22aa",
  '\Xi' => "\u039e",
  '\Zeta' => "\u0396",
  '\aa' => "\u00e5",
  '\ae' => "\u00e6",
  '\ain' => "\u02bf",
  '\alpha' => "\u03b1",
  '\angle' => "\u2220",
  '\approx' => "\u2248",
  '\approxeq' => "\u224a",
  '\ast' => "\u2217",
  '\asymp' => "\u224d",
  '\ayn' => "\u02bf",
  '\backsim' => "\u223d",
  '\backsimeq' => "\u22cd",
  '\barwedge' => "\u22bc",
  '\because' => "\u2235",
  '\beta' => "\u03b2",
  '\between' => "\u226c",
  '\bigcap' => "\u22c2",
  '\bigcup' => "\u22c3",
  '\bigvee' => "\u22c1",
  '\bigwedge' => "\u22c0",
  '\bot' => "\u22a5",
  '\bowtie' => "\u22c8",
  '\boxdot' => "\u22a1",
  '\boxminus' => "\u229f",
  '\boxplus' => "\u229e",
  '\boxtimes' => "\u22a0",
  '\bullet' => "\u2219",
  '\bumpeq' => "\u224f",
  '\cap' => "\u2229",
  '\cdot' => "\u22c5",
  '\cdots' => "\u22ef",
  '\chi' => "\u03c7",
  '\circ' => "\u2218",
  '\circeq' => "\u2257",
  '\circledast' => "\u229b",
  '\circledcirc' => "\u229a",
  '\circleddash' => "\u229d",
  '\clubsuit' => "\u2663",
  '\complement' => "\u2201",
  '\cong' => "\u2245",
  '\coprod' => "\u2210",
  '\copyright' => "\u00a9",
  '\cup' => "\u222a",
  '\curlyeqprec' => "\u22de",
  '\curlyeqsucc' => "\u22df",
  '\curlyvee' => "\u22ce",
  '\curlywedge' => "\u22cf",
  '\dag' => "\u2020",
  '\dashv' => "\u22a3",
  '\ddag' => "\u2021",
  '\ddots' => "\u22f1",
  '\delta' => "\u03b4",
  '\dh' => "\u00f0",
  '\diamond' => "\u22c4",
  '\diamondsuit' => "\u2662",
  '\div' => "\u00f7",
  '\divideontimes' => "\u22c7",
  '\dj' => "\u0111",
  '\doteq' => "\u2250",
  '\doteqdot' => "\u2251",
  '\dotplus' => "\u2214",
  '\dots' => "\u2026",
  '\downarrow' => "\u2193",
  '\eqcirc' => "\u2256",
  '\equiv' => "\u2261",
  '\eta' => "\u03b7",
  '\euro' => "\u20ac",
  '\exists' => "\u2203",
  '\fallingdotseq' => "\u2252",
  '\flat' => "\u266d",
  '\forall' => "\u2200",
  '\gamma' => "\u03b3",
  '\geq' => "\u2265",
  '\geqq' => "\u2267",
  '\gg' => "\u226b",
  '\ggg' => "\u22d9",
  '\gneqq' => "\u2269",
  '\gnsim' => "\u22e7",
  '\gtrdot' => "\u22d7",
  '\gtreqless' => "\u22db",
  '\gtrless' => "\u2277",
  '\gtrsim' => "\u2273",
  '\guillemotleft' => "\u00ab",
  '\guillemotright' => "\u00bb",
  '\guilsinglleft' => "\u2039",
  '\guilsinglright' => "\u203a",
  '\hamza' => "\u02be",
  '\heartsuit' => "\u2661",
  '\hv' => "\u0195",
  '\i' => "\u0131",
  '\iiint' => "\u222d",
  '\iint' => "\u222c",
  '\ij' => "\u0133",
  '\in' => "\u2208",
  '\infty' => "\u221e",
  '\int' => "\u222b",
  '\intercal' => "\u22ba",
  '\iota' => "\u03b9",
  '\kappa' => "\u03ba",
  '\l' => "\u0142",
  '\lambda' => "\u03bb",
  '\langle' => "\u27e8",
  '\lceil' => "\u2308",
  '\leadsto' => "\u219d",
  '\leftarrow' => "\u2190",
  '\leftrightarrow' => "\u2194",
  '\leftthreetimes' => "\u22cb",
  '\leq' => "\u2264",
  '\leqq' => "\u2266",
  '\lessdot' => "\u22d6",
  '\lesseqgtr' => "\u22da",
  '\lessgtr' => "\u2276",
  '\lesssim' => "\u2272",
  '\lfloor' => "\u230a",
  '\lhd' => "\u22b2",
  '\ll' => "\u226a",
  '\lll' => "\u22d8",
  '\lneqq' => "\u2268",
  '\lnot' => "\u00ac",
  '\lnsim' => "\u22e6",
  '\ltimes' => "\u22c9",
  '\measuredangle' => "\u2221",
  '\mid' => "\u2223",
  '\mp' => "\u2213",
  '\mu' => "\u03bc",
  '\multimap' => "\u22b8",
  '\nVdash' => "\u22ae",
  '\nabla' => "\u2207",
  '\natural' => "\u266e",
  '\ncong' => "\u2247",
  '\neq' => "\u2260",
  '\nexists' => "\u2204",
  '\ng' => "\u014b",
  '\ngeq' => "\u2271",
  '\ngtr' => "\u226f",
  '\ni' => "\u220b",
  '\nleq' => "\u2270",
  '\nless' => "\u226e",
  '\nmid' => "\u2224",
  '\nobreakspace' => "\u00a0",
  '\notin' => "\u2209",
  '\nparallel' => "\u2226",
  '\nprec' => "\u2280",
  '\nsim' => "\u2241",
  '\nsubseteq' => "\u2288",
  '\nsucc' => "\u2281",
  '\nsupseteq' => "\u2289",
  '\ntriangleleft' => "\u22ea",
  '\ntrianglelefteq' => "\u22ec",
  '\ntriangleright' => "\u22eb",
  '\ntrianglerighteq' => "\u22ed",
  '\nu' => "\u03bd",
  '\o' => "\u00f8",
  '\odot' => "\u2299",
  '\oe' => "\u0153",
  '\ohorn' => "\u01a1",
  '\oint' => "\u222e",
  '\omega' => "\u03c9",
  '\omicron' => "\u03bf",
  '\ominus' => "\u2296",
  '\oplus' => "\u2295",
  '\oslash' => "\u2298",
  '\otimes' => "\u2297",
  '\parallel' => "\u2225",
  '\partial' => "\u2202",
  '\pi' => "\u03c0",
  '\pitchfork' => "\u22d4",
  '\pm' => "\u00b1",
  '\pounds' => "\u00a3",
  '\prec' => "\u227a",
  '\preccurlyeq' => "\u227c",
  '\precnsim' => "\u22e8",
  '\precsim' => "\u227e",
  '\prod' => "\u220f",
  '\propto' => "\u221d",
  '\psi' => "\u03c8",
  '\quotedblbase' => "\u201e",
  '\quotesinglbase' => "\u201a",
  '\rangle' => "\u27e9",
  '\rceil' => "\u2309",
  '\rfloor' => "\u230b",
  '\rhd' => "\u22b3",
  '\rightarrow' => "\u2192",
  '\rightleftharpoons' => "\u21cc",
  '\rightthreetimes' => "\u22cc",
  '\risingdotseq' => "\u2253",
  '\rtimes' => "\u22ca",
  '\set' => "\u2205",
  '\setminus' => "\u2216",
  '\sharp' => "\u266f",
  '\sigma' => "\u03c3",
  '\sim' => "\u223c",
  '\simeq' => "\u2243",
  '\spadesuit' => "\u2660",
  '\sphericalangle' => "\u2222",
  '\sqcap' => "\u2293",
  '\sqcup' => "\u2294",
  '\sqsubset' => "\u228f",
  '\sqsubseteq' => "\u2291",
  '\sqsupset' => "\u2290",
  '\sqsupseteq' => "\u2292",
  '\ss' => "\u00df",
  '\star' => "\u22c6",
  '\subset' => "\u2282",
  '\subseteq' => "\u2286",
  '\subsetneq' => "\u228a",
  '\succ' => "\u227b",
  '\succcurlyeq' => "\u227d",
  '\succnsim' => "\u22e9",
  '\succsim' => "\u227f",
  '\sum' => "\u2211",
  '\supset' => "\u2283",
  '\supseteq' => "\u2287",
  '\supsetneq' => "\u228b",
  '\surd' => "\u221a",
  '\tau' => "\u03c4",
  '\textBhook' => "\u0181",
  '\textChook' => "\u0187",
  '\textDafrican' => "\u0189",
  '\textDhook' => "\u018a",
  '\textEopen' => "\u0190",
  '\textEreversed' => "\u018e",
  '\textEsh' => "\u01a9",
  '\textEzh' => "\u01b7",
  '\textFhook' => "\u0191",
  '\textGammaafrican' => "\u0194",
  '\textHbar' => "\u0126",
  '\textIotaafrican' => "\u0196",
  '\textKhook' => "\u0198",
  '\textNhookleft' => "\u019d",
  '\textOopen' => "\u0186",
  '\textPhook' => "\u01a4",
  '\textTbar' => "\u0166",
  '\textThook' => "\u01ac",
  '\textTretroflexhook' => "\u01ae",
  '\textTstroke' => "\u0166",
  '\textVhook' => "\u01b2",
  '\textYhook' => "\u01b3",
  '\textampersand' => "\u0026",
  '\textasciiacute' => "\u00b4",
  '\textasciicedilla' => "\u00b8",
  '\textasciicircum' => "\u005e",
  '\textasciidieresis' => "\u00a8",
  '\textasciigrave' => "\u0060",
  '\textasciimacron' => "\u00af",
  '\textasciitilde' => "\u007e",
  '\textasteriskcentered' => "\u002a",
  '\textbackslash' => "\u005c",
  '\textbar' => "\u007c",
  '\textbardotlessj' => "\u025f",
  '\textbarglotstop' => "\u02a1",
  '\textbari' => "\u0268",
  '\textbarl' => "\u0142",
  '\textbaro' => "\u0275",
  '\textbarrevglotstop' => "\u02a2",
  '\textbaru' => "\u0289",
  '\textbeltl' => "\u026c",
  '\textbhook' => "\u0253",
  '\textbraceleft' => "\u007b",
  '\textbraceright' => "\u007d",
  '\textbrokenbar' => "\u00a6",
  '\textbullet' => "\u2022",
  '\textbullseye' => "\u0298",
  '\textcent' => "\u00a2",
  '\textcentereddot' => "\u00b7",
  '\textchook' => "\u0188",
  '\textcloseepsilon' => "\u029a",
  '\textcloseomega' => "\u0277",
  '\textcloserevepsilon' => "\u025e",
  '\textcolonmonetary' => "\u20a1",
  '\textcopyright' => "\u00a9",
  '\textcrb' => "\u0180",
  '\textcrd' => "\u0111",
  '\textcrh' => "\u0127",
  '\textcrlambda' => "\u019b",
  '\textctc' => "\u0255",
  '\textctesh' => "\u0286",
  '\textctj' => "\u029d",
  '\textctyogh' => "\u0293",
  '\textctz' => "\u0291",
  '\textcurrency' => "\u00a4",
  '\textdctzlig' => "\u02a5",
  '\textdegree' => "\u00b0",
  '\textdhook' => "\u0257",
  '\textdiv' => "\u00f7",
  '\textdollar' => "\u0024",
  '\textdong' => "\u20ab",
  '\textdtail' => "\u0256",
  '\textdyoghlig' => "\u02a4",
  '\textdzlig' => "\u02a3",
  '\textemdash' => "\u2014",
  '\textendash' => "\u2013",
  '\texteopen' => "\u025b",
  '\textepsilon' => "\u025b",
  '\textequals' => "\u003d",
  '\textesh' => "\u0283",
  '\texteturned' => "\u01dd",
  '\texteuro' => "\u20ac",
  '\textexclamdown' => "\u00a1",
  '\textezh' => "\u0292",
  '\textfishhookr' => "\u027e",
  '\textflorin' => "\u0192",
  '\textg' => "\u0067",
  '\textgamma' => "\u0263",
  '\textgammalatinsmall' => "\u0263",
  '\textglotstop' => "\u0294",
  '\textgreater' => "\u003e",
  '\texthash' => "\u0023",
  '\texthbar' => "\u0127",
  '\texthtb' => "\u0253",
  '\texthtbardotlessj' => "\u0284",
  '\texthtc' => "\u0188",
  '\texthtd' => "\u0257",
  '\texthtg' => "\u0260",
  '\texthth' => "\u0266",
  '\texththeng' => "\u0267",
  '\texthtk' => "\u0199",
  '\texthtp' => "\u01a5",
  '\texthtq' => "\u02a0",
  '\texthtscg' => "\u029b",
  '\texthtt' => "\u01ad",
  '\texthvlig' => "\u0195",
  '\textinterrobang' => "\u203d",
  '\textinvglotstop' => "\u0296",
  '\textinvscr' => "\u0281",
  '\textiota' => "\u0269",
  '\textiotalatin' => "\u0269",
  '\textkhook' => "\u0199",
  '\textkra' => "\u0138",
  '\textlengthmark' => "\u02d0",
  '\textless' => "\u003c",
  '\textlhti' => "\u027f",
  '\textlira' => "\u20a4",
  '\textlogicalnot' => "\u00ac",
  '\textlonglegr' => "\u027c",
  '\textlooptoprevesh' => "\u01aa",
  '\textltailm' => "\u0271",
  '\textltailn' => "\u0272",
  '\textltilde' => "\u026b",
  '\textlyoghlig' => "\u026e",
  '\textminus' => "\u2212",
  '\textmu' => "\u00b5",
  '\textnaira' => "\u20a6",
  '\textnhookleft' => "\u0272",
  '\textnumero' => "\u2116",
  '\textonehalf' => "\u00bd",
  '\textonequarter' => "\u00bc",
  '\textonesuperior' => "\u00b9",
  '\textoopen' => "\u0254",
  '\textopeno' => "\u0254",
  '\textordfeminine' => "\u00aa",
  '\textordmasculine' => "\u00ba",
  '\textoverline' => "\u203e",
  '\textpalhookbelow' => "\u01ab",
  '\textparagraph' => "\u00b6",
  '\textpercent' => "\u0025",
  '\textperiodcentered' => "\u00b7",
  '\textpertenthousand' => "\u2031",
  '\textperthousand' => "\u2030",
  '\textphi' => "\u0278",
  '\textphook' => "\u01a5",
  '\textpm' => "\u00b1",
  '\textprimstress' => "\u02c8",
  '\textquestiondown' => "\u00bf",
  '\textquotedbl' => "\u0022",
  '\textquotedblleft' => "\u201c",
  '\textquotedblright' => "\u201d",
  '\textquoteleft' => "\u2018",
  '\textquoteright' => "\u2019",
  '\textquotesingle' => "\u0027",
  '\textraisevibyi' => "\u0285",
  '\textramshorns' => "\u0264",
  '\textreferencemark' => "\u203b",
  '\textregistered' => "\u00ae",
  '\textreve' => "\u0258",
  '\textrevepsilon' => "\u025c",
  '\textrevglotstop' => "\u0295",
  '\textrhookrevepsilon' => "\u025d",
  '\textrhookschwa' => "\u025a",
  '\textrtaild' => "\u0256",
  '\textrtaill' => "\u026d",
  '\textrtailn' => "\u0273",
  '\textrtailr' => "\u027d",
  '\textrtails' => "\u0282",
  '\textrtailt' => "\u0288",
  '\textrtailz' => "\u0290",
  '\textscb' => "\u0299",
  '\textscg' => "\u0262",
  '\textsch' => "\u029c",
  '\textschwa' => "\u0259",
  '\textsci' => "\u026a",
  '\textscl' => "\u029f",
  '\textscn' => "\u0274",
  '\textscoelig' => "\u0276",
  '\textscr' => "\u0280",
  '\textscripta' => "\u0251",
  '\textscriptg' => "\u0261",
  '\textscriptv' => "\u028b",
  '\textscy' => "\u028f",
  '\textsection' => "\u00a7",
  '\textsterling' => "\u00a3",
  '\textstretchc' => "\u0297",
  '\texttbar' => "\u0167",
  '\texttctclig' => "\u02a8",
  '\texttesh' => "\u02a7",
  '\textteshlig' => "\u02a7",
  '\textthook' => "\u01ad",
  '\textthorn' => "\u00fe",
  '\textthornvari' => "\u00fe",
  '\textthornvarii' => "\u00fe",
  '\textthornvariii' => "\u00fe",
  '\textthornvariv' => "\u00fe",
  '\textthreequarters' => "\u00be",
  '\textthreesuperior' => "\u00b3",
  '\texttimes' => "\u00d7",
  '\texttrademark' => "\u2122",
  '\texttretroflexhook' => "\u0288",
  '\texttslig' => "\u02a6",
  '\texttstroke' => "\u0167",
  '\textturna' => "\u0250",
  '\textturnh' => "\u0265",
  '\textturnk' => "\u029e",
  '\textturnlonglegr' => "\u027a",
  '\textturnm' => "\u026f",
  '\textturnmrleg' => "\u0270",
  '\textturnr' => "\u0279",
  '\textturnrrtail' => "\u027b",
  '\textturnscripta' => "\u0252",
  '\textturnt' => "\u0287",
  '\textturnv' => "\u028c",
  '\textturnw' => "\u028d",
  '\textturny' => "\u028e",
  '\texttwosuperior' => "\u00b2",
  '\textunderscore' => "\u005f",
  '\textupsilon' => "\u028a",
  '\textvhook' => "\u028b",
  '\textwon' => "\u20a9",
  '\textyen' => "\u00a5",
  '\textyhook' => "\u01b4",
  '\textyogh' => "\u0292",
  '\th' => "\u00fe",
  '\therefore' => "\u2234",
  '\times' => "\u00d7",
  '\tone1' => "\u02e9",
  '\tone2' => "\u02e8",
  '\tone3' => "\u02e7",
  '\tone4' => "\u02e6",
  '\tone5' => "\u02e5",
  '\top' => "\u22a4",
  '\triangleq' => "\u225c",
  '\uhorn' => "\u01b0",
  '\unlhd' => "\u22b4",
  '\unrhd' => "\u22b5",
  '\uparrow' => "\u2191",
  '\updownarrow' => "\u2195",
  '\uplus' => "\u228e",
  '\upsilon' => "\u03c5",
  '\epsilon' => "\u03b5",
  '\varepsilon' => "\u03b5",
  '\phi' => "\u03c6",
  '\varphi' => "\u03c6",
  '\rho' => "\u03c1",
  '\varrho' => "\u03c1",
  # Final sigma only; '\sigma' keeps its own entry (U+03C3) further up.
  '\varsigma' => "\u03c2",
  '\theta' => "\u03b8",
  '\vartheta' => "\u03b8",
  '\vdash' => "\u22a2",
  '\vdots' => "\u22ee",
  '\vee' => "\u2228",
  '\veebar' => "\u22bb",
  '\wedge' => "\u2227",
  '\wr' => "\u2240",
  '\xi' => "\u03be",
  '\zeta' => "\u03b6",
  '\not\approx' => "\u2249",
  '\not\asymp' => "\u226d",
  '\not\equiv' => "\u2262",
  '\not\gtrsim' => "\u2275",
  '\not\lesssim' => "\u2274",
  '\not\ni' => "\u220c",
  '\not\preccurlyeq' => "\u22e0",
  '\not\simeq' => "\u2244",
  '\not\sqsubseteq' => "\u22e2",
  '\not\sqsupseteq' => "\u22e3",
  '\not\subset' => "\u2284",
  '\not\succcurlyeq' => "\u22e1",
  '\not\supset' => "\u2285",
}

VERSION =

'0.0.3'

Instance Method Summary collapse

Instance Method Details

#amp(s) ⇒ `Object`



341
342
343

# File 'lib/typohero.rb', line 341

# Wraps every ampersand matched by REPLACE_AMP_RE in a
# <span class="amp"> element, rewriting +s+ in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if at least one ampersand was wrapped, else +nil+
def amp(s)
  wrapped_amp = '<span class="amp">&amp;</span>'
  s.gsub!(REPLACE_AMP_RE, wrapped_amp)
end

#caps(s) ⇒ `Object`



345
346
347

# File 'lib/typohero.rb', line 345

# Wraps each run of capitals matched by CAPS_RE in a <span class="caps">
# element, keeping the leading context captured by the first group in front
# of the span. Rewrites +s+ in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was wrapped, else +nil+
def caps(s)
  s.gsub!(CAPS_RE) { "#{$1}<span class=\"caps\">#{$2}</span>" }
end

#dash_spaces(s) ⇒ `Object`

# File 'lib/typohero.rb', line 336

# Normalises the whitespace around dashes in place: whatever MDASH_SPACE_RE
# matches becomes MDASH_SPACE, then whatever NDASH_SPACE_RE matches becomes
# NDASH_SPACE.
#
# @param s [String] text token (mutated)
# @return [String, nil] result of the en-dash substitution: +s+ if an en dash
#   was rewritten, else +nil+
def dash_spaces(s)
  rules = [
    [MDASH_SPACE_RE, MDASH_SPACE],
    [NDASH_SPACE_RE, NDASH_SPACE]
  ]
  # The last element is the outcome of the en-dash pass, which is what the
  # method has always returned.
  rules.map { |pattern, spaced_dash| s.gsub!(pattern, spaced_dash) }.last
end

#decode(s) ⇒ `Object`

# File 'lib/typohero.rb', line 313

# Replaces numeric character references (&#NNN; and &#xHHH;) in +s+ with the
# character they denote, in place.
#
# * Code point 38 (the ampersand) is emitted as '&amp;' so the text stays
#   valid HTML.
# * References that do not name an encodable UTF-8 code point (surrogates
#   such as &#xD800;, or values beyond U+10FFFF) are left untouched instead
#   of raising RangeError from Integer#chr.
#
# NOTE(review): &#60; / &#62; are still decoded to literal '<' / '>' exactly
# as before; confirm that is acceptable for the markup this is fed with.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if any reference was found, else +nil+
def decode(s)
  s.gsub!(/&#x([0-9A-F]+);|&#([0-9]+);/i) do |entity|
    hex, dec = $1, $2
    codepoint = hex ? hex.to_i(16) : dec.to_i(10)
    next '&amp;' if codepoint == 38

    begin
      codepoint.chr('UTF-8')
    rescue RangeError
      # Not a valid Unicode scalar value: keep the reference as written.
      entity
    end
  end
end

#enhance(input) ⇒ `Object`

# File 'lib/typohero.rb', line 250

# Runs the full typographic pipeline over +input+.
#
# First pass (per :text token, in order): decode, escape, primes, special,
# latex, quotes, dash_spaces. +quotes+ also receives the last character the
# previous text token had *before* it was processed. Then +widont+ runs over
# all tokens, and a second pass applies initial_quotes, amp, caps, ordinals,
# nobr and unescape to every text token. All steps mutate the token strings
# in place, so the joined token list reflects every change.
#
# @param input [String] markup to enhance
# @return [String] the joined tokens, passed through +html_safe+
def enhance(input)
  all_tokens = []
  text_tokens = []
  preceding_char = nil

  tokenize(input) do |token, kind|
    # The same String object is stored and later mutated in place.
    all_tokens << token
    next unless kind == :text

    final_char = token[-1]
    decode(token)
    escape(token)
    primes(token)
    special(token)
    latex(token)
    quotes(token, preceding_char)
    dash_spaces(token)
    preceding_char = final_char
    text_tokens << token
  end

  widont(all_tokens)

  text_tokens.each do |token|
    initial_quotes(token)
    amp(token)
    caps(token)
    ordinals(token)
    nobr(token)
    unescape(token)
  end

  html_safe(input, all_tokens.join)
end

#escape(s) ⇒ `Object`



320
321
322

# File 'lib/typohero.rb', line 320

# Replaces every backslash sequence listed in ESCAPE with its numeric
# entity, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def escape(s)
  s.gsub!(ESCAPE_RE) { |sequence| ESCAPE[sequence] }
end

#html_safe(src, dst) ⇒ `Object`



309
310
311

# File 'lib/typohero.rb', line 309

# Carries the "HTML safe" flag from +src+ over to +dst+.
#
# @param src [Object] original input; consulted via +html_safe?+ when it
#   responds to it
# @param dst [Object] the processed result
# @return [Object] +dst.html_safe+ when +src+ reports itself as HTML safe,
#   otherwise +dst+ unchanged
def html_safe(src, dst)
  return dst unless src.respond_to?(:html_safe?)
  return dst unless src.html_safe?

  dst.html_safe
end

#initial_quotes(s) ⇒ `Object`



349
350
351

# File 'lib/typohero.rb', line 349

# Replaces each opening quote matched by INITIAL_QUOTE_RE with the span
# markup registered for it in INITIAL_QUOTES, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def initial_quotes(s)
  s.gsub!(INITIAL_QUOTE_RE) { |quote| INITIAL_QUOTES[quote] }
end

#latex(s) ⇒ `Object`



332
333
334

# File 'lib/typohero.rb', line 332

# Replaces every LaTeX command matched by LATEX_RE with the character
# registered for it in LATEX, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def latex(s)
  s.gsub!(LATEX_RE) { |command| LATEX[command] }
end

#nobr(s) ⇒ `Object`



353
354
355

# File 'lib/typohero.rb', line 353

# Wraps hyphenated compounds (word/digit runs joined by one or more hyphens)
# in a <span class="nobr"> element, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was wrapped, else +nil+
def nobr(s)
  s.gsub!(/[\p{Digit}\p{Word}]+(-[\p{Digit}\p{Word}]+)+/) do |compound|
    "<span class=\"nobr\">#{compound}</span>"
  end
end

#ordinals(s) ⇒ `Object`



362
363
364

# File 'lib/typohero.rb', line 362

# Wraps every ordinal suffix matched by ORDINAL_RE in a <span class="ord">
# element, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was wrapped, else +nil+
def ordinals(s)
  s.gsub!(ORDINAL_RE) { "<span class=\"ord\">#{$1}</span>" }
end

#primes(s) ⇒ `Object`

# File 'lib/typohero.rb', line 357

# Turns straight quotes that follow a digit (feet/inches, minutes/seconds)
# into the prime characters registered in PRIMES, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def primes(s)
  s.gsub!(PRIME_RE) { |marks| PRIMES[marks] }
end

#quotes(s, prev_last_char) ⇒ `Object`

# File 'lib/typohero.rb', line 366

# Converts straight quotes in +s+ into typographic quotes, in place.
#
# A token that consists of a single quote character (optionally followed by
# one trailing newline) carries no context of its own, so the last character
# of the previous text token decides: a non-space character before it means
# a closing quote, anything else (including no previous token) an opening
# quote. Only the quote character is replaced, so a trailing newline is
# preserved. Previously such a "quote + newline" token matched the guard but
# missed the lookup tables and raised TypeError from String#replace(nil).
#
# All other tokens are handled by pattern: nested quote pairs first, then
# closing quotes (RIGHT_QUOTE_RE), and whatever straight quote remains is
# treated as an opening quote.
#
# @param s [String] text token (mutated)
# @param prev_last_char [String, nil] last character of the preceding text
#   token, or +nil+ if there was none
# @return [String, nil] result of the last substitution for ordinary tokens
#   (+s+ or +nil+); +nil+ for the single-quote token case
def quotes(s, prev_last_char)
  if s =~ /\A['"]\n?\z/
    table = prev_last_char =~ /\P{Space}/ ? RIGHT_QUOTES : LEFT_QUOTES
    s[0] = table[s[0]]
    return
  end

  # Special case for double sets of quotes, e.g.
  #   <p>He said, "'Quoted' words in a larger quote."</p>
  s.gsub!(/(?:"'|'")(?=\p{Word})/, TWO_QUOTES)
  s.gsub!(RIGHT_QUOTE_RE, RIGHT_QUOTES)
  s.gsub!(/['"]/,         LEFT_QUOTES)
end

#special(s) ⇒ `Object`



328
329
330

# File 'lib/typohero.rb', line 328

# Replaces every ASCII shorthand or named entity listed in SPECIAL with its
# typographic character, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def special(s)
  s.gsub!(SPECIAL_RE) { |sequence| SPECIAL[sequence] }
end

#strip_tags(input) ⇒ `Object`

# File 'lib/typohero.rb', line 244

# Returns +input+ with everything except :text and :latex tokens removed.
#
# @param input [String] markup to strip
# @return [String] concatenation of the kept tokens, passed through
#   +html_safe+
def strip_tags(input)
  kept_kinds = %i[text latex]
  visible = ''.dup
  tokenize(input) do |token, kind|
    visible << token if kept_kinds.include?(kind)
  end
  html_safe(input, visible)
end

#tokenize(input) ⇒ `Object`

# File 'lib/typohero.rb', line 135

# Splits +input+ with TOKENIZER_RE and yields every token together with its
# classification.
#
# Classification, in order of precedence:
# * :comment / :cdata  - HTML comments and CDATA sections
# * :excluded          - tags matched by EXCLUDED_TAGS_RE and everything
#                        between such an opening tag and its closing tag
# * :tag               - any other tag seen outside LaTeX
# * :latex             - LaTeX delimiters (\( \) \[ \] and $$) and everything
#                        between an opening and a closing delimiter
# * :text              - whatever is left
#
# @param input [String] markup to tokenize
# @yieldparam token [String] the matched token
# @yieldparam kind [Symbol] one of the classifications above
# @return [String] +input+ (the result of String#scan with a block)
def tokenize(input)
  excluded_depth = 0 # nesting level of excluded tags
  latex_depth = 0    # nesting level of \( ... \) and \[ ... \]
  dollar_count = 0   # number of $$ seen; odd means "inside $$ ... $$"

  input.scan(TOKENIZER_RE) do |token|
    kind =
      case token
      when /\A<!--/ then :comment
      when /\A<!\[/ then :cdata
      end

    # Tags are only interpreted outside of LaTeX.
    if kind.nil? && latex_depth.zero? && dollar_count.even? && token =~ /\A</
      if token =~ EXCLUDED_TAGS_RE
        # $1 is the slash of a closing tag.
        excluded_depth += $1 ? -1 : 1
        excluded_depth = 0 if excluded_depth < 0
        kind = :excluded
      else
        kind = excluded_depth.zero? ? :tag : :excluded
      end
    end

    # LaTeX delimiters are only interpreted outside of excluded tags.
    if kind.nil? && excluded_depth.zero?
      if token =~ /\A\\[\(\[]\Z/
        latex_depth += 1
        kind = :latex
      elsif token =~ /\A\\[\)\]]\Z/
        latex_depth -= 1 if latex_depth > 0
        kind = :latex
      elsif token == '$$'
        dollar_count += 1
        kind = :latex
      end
    end

    if kind.nil?
      kind =
        if !excluded_depth.zero?
          :excluded
        elsif !latex_depth.zero? || dollar_count.odd?
          :latex
        else
          :text
        end
    end

    yield(token, kind)
  end
end

#tokenize_with_tags(input) ⇒ `Object`

# File 'lib/typohero.rb', line 184

# Like +tokenize+, but additionally yields the stack of currently open tag
# names.
#
# An opening :tag token pushes its name. A closing :tag token pops names
# until the matching one has been removed (or the stack is empty). The same
# Array instance is yielded every time and keeps being mutated, so callers
# that need a snapshot must copy it.
#
# @param input [String] markup to tokenize
# @yieldparam token [String] the matched token
# @yieldparam kind [Symbol] classification as produced by +tokenize+
# @yieldparam open_tags [Array<String>] names of the tags open after this
#   token, outermost first
def tokenize_with_tags(input)
  open_tags = []
  tokenize(input) do |token, kind|
    if kind == :tag && (tag = token.match(/\A<(\/)?([^\p{Space}\/>]+)/))
      closing, name = tag[1], tag[2]
      if closing
        loop { break if open_tags.empty? || open_tags.pop == name }
      else
        open_tags << name
      end
    end
    yield(token, kind, open_tags)
  end
end

#truncate(input, *max_words_or_separator) ⇒ `Object`

# File 'lib/typohero.rb', line 198

# Truncates +input+ after a number of words and/or at a separator, appends
# an ellipsis and closes every tag that is still open at the cut.
#
# @param input [String] markup to truncate
# @param max_words_or_separator [Array] in any order, optionally
#   * an Integer: maximum number of words to keep
#   * a separator: a Regexp, a String, or an Array of Strings; the text is
#     cut at the first token matching it. A separator that does not occur in
#     +input+ is ignored.
# @return [String] the (possibly) truncated markup, passed through
#   +html_safe+
def truncate(input, *max_words_or_separator)
  # Elements that never have a closing tag; emitting e.g. "</br>" for them
  # would produce invalid markup.
  void_elements = %w(area base br col embed hr img input link meta param source track wbr)

  # Integer (not Fixnum, which was removed in Ruby 3.2) matches on every Ruby.
  max_words = max_words_or_separator.find { |arg| Integer === arg }
  if separator = max_words_or_separator.find { |arg| !(Integer === arg) }
    separator = Regexp.union(*separator) unless Regexp === separator
    separator = nil unless input =~ separator
  end

  out, tail, truncated = ''.dup, ''.dup, false
  tokenize_with_tags(input) do |s, type, tags|
    if separator && (type == :comment || type == :text || type == :latex || type == :tag) && separator === s
      # Keep the text in front of the separator ($` is set by the === above).
      out << $` if type == :text
      if type == :tag
        if s =~ /\A<\//
          tail << s
        else
          # The separator itself is an opening tag: do not close it later.
          tags.pop
        end
      end
      truncated = tags
      break
    elsif max_words == 0
      # Word budget used up: stop at the next text, but keep the tags in
      # between so that elements closed right after the last word stay closed.
      if type == :text
        truncated = tags
        break
      end
      tail << s
    else
      if max_words && type == :text
        s =~ /\A(\p{Space}*)(.*)\Z/m
        ws, w = $1, $2.split(/\p{Space}+/)
        if w.size > max_words
          out << ws << w[0...max_words].join(' ')
          truncated = tags
          break
        end
        max_words -= w.size
      end
      out << s
    end
  end

  if truncated
    out.sub!(/[\p{Space}\p{Punct}]*\Z/, ELLIPSIS)
    until truncated.empty?
      name = truncated.pop
      # Void elements and declarations (<!DOCTYPE ...>, <?xml ...?>) have no
      # closing tag.
      next if void_elements.include?(name.downcase) || name =~ /\A[!?]/
      tail << "</#{name}>"
    end
  end
  html_safe(input, out << tail)
end

#unescape(s) ⇒ `Object`



324
325
326

# File 'lib/typohero.rb', line 324

# Turns the numeric entities listed in UNESCAPE back into the literal
# characters they stand for, in place.
#
# @param s [String] text token (mutated)
# @return [String, nil] +s+ if anything was replaced, else +nil+
def unescape(s)
  s.gsub!(UNESCAPE_RE) { |entity| UNESCAPE[entity] }
end

#widont(tokens) ⇒ `Object`

# File 'lib/typohero.rb', line 279

# Prevents "widows" (a lone last word on its own line) by replacing the
# whitespace in front of the last word before a closing paragraph-level tag
# with a non-breaking space. The token strings are modified in place.
#
# The tokens are walked from last to first with a small state machine:
#   0 - nothing to do until the next closing paragraph-level tag
#   1 - looking for the last word (initial state, and after a token matching
#       WIDONT_PARAGRAPH_RE)
#   2 - a token with whitespace followed by the last word, but no word in
#       front of that whitespace, was found (remembered in +widow+); an
#       earlier token ending in a word is still needed
#   3 - only the last word was found so far; the whitespace in front of it
#       is still being looked for in earlier tokens
#
# @param tokens [Array<String>] all tokens of the document, in order
# @return [nil]
def widont(tokens)
  state, i, widow = 1, tokens.size - 1, nil
  while i >= 0
    if tokens[i] =~ WIDONT_PARAGRAPH_RE
      # Closing paragraph-level tag: start looking for a widow in front of it.
      state = 1
    elsif tokens[i] !~ WIDONT_INLINE_RE
      # Tokens matching WIDONT_INLINE_RE are skipped without changing state.
      if tokens[i] =~ WIDONT_NBSP_RE
        # Token contains a (thin) non-breaking space or an angle bracket:
        # give up until the next closing paragraph-level tag.
        state = 0
      elsif state == 1 || state == 3
        # $1 = word before the gap, $2 = the whitespace gap, $3 = last word
        # (state 1) or the remainder after the gap (state 3).
        if tokens[i] =~ (state == 1 ? /(\P{Space}+)?(\p{Space}+)?(\P{Space}+\p{Space}*)\Z/m :
                                      /(\P{Space}+)?(\p{Space}+)(\P{Space}*)\Z/m)
          if $1 && $2
            # Word, gap and remainder all in this token: swap the gap for NBSP.
            tokens[i].replace "#{$`}#{$1}#{NBSP}#{$3}"
            state = 0
          elsif $2
            # Gap found but no word in front of it within this token; remember
            # the token and check the preceding ones for that word.
            state = 2
            widow = tokens[i]
          else
            # No gap captured yet; keep looking in earlier tokens.
            state = 3
          end
        end
      elsif state == 2 && tokens[i] =~ /(\P{Space}+\p{Space}*)\Z/m
        # An earlier token ends in a word: replace the leading whitespace of
        # the remembered token with NBSP.
        widow.sub!(/\A\p{Space}*/, NBSP)
        state = 0
      end
    end
    i -= 1
  end
end

Module: TypoHero

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#amp(s) ⇒ Object

#caps(s) ⇒ Object

#dash_spaces(s) ⇒ Object

#decode(s) ⇒ Object

#enhance(input) ⇒ Object

#escape(s) ⇒ Object

#html_safe(src, dst) ⇒ Object

#initial_quotes(s) ⇒ Object

#latex(s) ⇒ Object

#nobr(s) ⇒ Object

#ordinals(s) ⇒ Object

#primes(s) ⇒ Object

#quotes(s, prev_last_char) ⇒ Object

#special(s) ⇒ Object

#strip_tags(input) ⇒ Object

#tokenize(input) ⇒ Object

#tokenize_with_tags(input) ⇒ Object

#truncate(input, *max_words_or_separator) ⇒ Object

#unescape(s) ⇒ Object

#widont(tokens) ⇒ Object