Module: TextUtils::UnicodeHelper

Included in:
TextUtils
Defined in:
lib/textutils/helper/unicode_helper.rb

Constant Summary collapse

U_HYPHEN =

NB:

U_HYPHEN is the unambiguous hyphen (U+2010); by contrast, U_HYPHEN_MINUS is the standard ascii hyphen/minus e.g. -

see en.wikipedia.org/wiki/Dash
"\u2010"
U_NON_BREAKING_HYPHEN =

unambiguous non-breaking hyphen

"\u2011"
U_MINUS =

unambiguous minus sign (html => −)

"\u2212"
U_NDASH =

ndash (html => – ascii => –)

"\u2013"
U_MDASH =

mdash (html => — ascii => —)

"\u2014"

Instance Method Summary collapse

Instance Method Details

#convert_unicode_dashes_to_plain_ascii(text, opts = {}) ⇒ Object

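Replaces every unicode dash in text (hyphen, non-breaking hyphen, minus, ndash, mdash) with the plain ascii hyphen-minus (-) and prints a warning for each replacement; returns the converted text. (The generated note "mdash (html => — ascii => —)" describes the U_MDASH constant, not this method.)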



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/textutils/helper/unicode_helper.rb', line 18

# Replaces typographic unicode dashes in +text+ with the plain ascii
# hyphen-minus (-) and prints one warning line (via puts) per replacement.
#
# Handled characters: U_HYPHEN, U_NON_BREAKING_HYPHEN, U_MINUS, U_NDASH
# and U_MDASH.
#
# text - String to convert; it is not modified (gsub returns a new string).
# opts - Hash of options; when opts[:path] is set, it is named in each
#        warning so the offending file can be located.
#
# Returns the converted String.
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )

  # label used in the warning for every dash we know how to convert
  dash_labels = {
    U_HYPHEN              => 'hyphen U+2010',
    U_NON_BREAKING_HYPHEN => 'non_breaking_hyphen U+2011',
    U_MINUS               => 'minus U+2212',
    U_NDASH               => 'ndash U+2013',
    U_MDASH               => 'mdash U+2014',
  }

  text.gsub( Regexp.union( dash_labels.keys ) ) do |dash|

    # the pattern is built from the table keys, so the fallback is only a
    # safety net; it keeps the generic wording of the old "unknown" branch
    label = dash_labels.fetch( dash ) { "unknown unicode dash U+#{'%04X' % dash.ord}" }

    # build the message in one go (no in-place << on a string literal,
    #  which would fail if string literals are frozen)
    path_note = opts[:path] ? " in file >#{opts[:path]}<" : ''

    puts "*** warning: found #{label} (#{dash})#{path_note}; converting to plain ascii hyphen_minus (-)"

    '-'
  end
end