Top Level Namespace

Defined Under Namespace

Modules: TextUtils Classes: CodeReader, File, HashReader, LineReader, StringLineReader, ValuesReader

Constant Summary collapse

U_HYPHEN =

NB:

U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -

see en.wikipedia.org/wiki/Dash
"\u2010"
U_NON_BREAKING_HYPHEN =

unambiguous hyphen

"\u2011"
U_MINUS =

unambiguous non-breaking hyphen

"\u2212"
U_NDASH =

unambiguous minus sign (html => −)

"\u2013"
U_MDASH =

ndash (html => – ascii => –)

"\u2014"

Instance Method Summary collapse

Instance Method Details

#convert_unicode_dashes_to_plain_ascii(text, opts = {}) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/textutils/utils.rb', line 30

# Replaces every unicode dash-like character (U_HYPHEN, U_NON_BREAKING_HYPHEN,
# U_MINUS, U_NDASH, U_MDASH) found in +text+ with a plain ascii hyphen-minus
# and prints one warning line to stdout per replaced character.
#
# @param text [String] text to scan; a converted copy is returned
# @param opts [Hash] options; when opts[:path] is set it is named in the warning
# @return [String] the text with all known unicode dashes replaced by '-'
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
  ## label used in the warning for each known dash character
  labels = {
    U_HYPHEN              => 'hyhpen U+2010',
    U_NON_BREAKING_HYPHEN => 'non_breaking_hyhpen U+2011',
    U_MINUS               => 'minus U+2212',
    U_NDASH               => 'ndash U+2013',
    U_MDASH               => 'mdash U+2014'
  }

  text.gsub( Regexp.union( labels.keys ) ) do |dash|
    ## fallback label kept for parity; only chars listed above can match
    label   = labels.fetch( dash ) { "unknown unicode dash U+#{'%04X' % dash.ord}" }
    in_file = opts[:path] ? " in file >#{opts[:path]}<" : ''

    puts "*** warning: found #{label} (#{dash})#{in_file}; converting to plain ascii hyphen_minus (-)"

    '-'
  end
end

#title_esc_regex(title_unescaped) ⇒ Object

fix/todo: share helper for all text readers/parsers- where to put it?



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/textutils/utils.rb', line 68

# Builds a regex source string (a String, not a Regexp) for a title:
# regex special chars are escaped so they match literally, a hyphen also
# matches a space, and accented chars also match their plain ascii spelling.
#
#   title_esc_regex( 'Benfica Lis.' )    # => 'Benfica Lis\.'
#   title_esc_regex( 'Blau-Weiß Linz' )  # => 'Blau(-| )Wei(ß|ss) Linz'
#
# @param title_unescaped [String] plain title e.g. Club Atlético Colón (Santa Fe)
# @return [String] regex source matching the title and its spelling variants
def title_esc_regex( title_unescaped )

  ##  escape regex special chars e.g. . to \. and ( to \( etc.
  # e.g. Benfica Lis.
  # e.g. Club Atlético Colón (Santa Fe)

  ## NB: cannot use Regexp.escape! will escape space ' ' to '\ '
  ##   (and - to \- which would defeat the (-| ) alternative below)
  ##  escape all other metachars by hand; otherwise a title such as
  ##  "Team [B" or "A+B" yields a broken or wrong regex
  title = title_unescaped.gsub( /[\\.()\[\]{}|*+?^$]/ ) { |ch| "\\#{ch}" }

  ##  match accented char with or without accents
  ##  add (ü|ue) etc.
  ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss

  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  ##
  ##  reuse for all readers!

  alternatives = {
    '-' => '(-| )',   ## e.g. Blau-Weiß Linz
    'æ' => '(æ|ae)',  ## e.g.
    'á' => '(á|a)',   ## e.g. Bogotá, Sársfield
    'ã' => '(ã|a)',   ## e.g  São Paulo
    'ä' => '(ä|ae)',  ## e.g.
    'ç' => '(ç|c)',   ## e.g. Fenerbahçe
    'é' => '(é|e)',   ## e.g. Vélez
    'ê' => '(ê|e)',   ## e.g. Grêmio
    'ñ' => '(ñ|n)',   ## e.g. Porteño
    'ň' => '(ň|n)',   ## e.g. Plzeň
    'Ö' => '(Ö|Oe)',  ## e.g. Österreich
    'ö' => '(ö|oe)',  ## e.g. Mönchengladbach
    'ó' => '(ó|o)',   ## e.g. Colón
    'ș' => '(ș|s)',   ## e.g. Bucarești
    'ß' => '(ß|ss)',  ## e.g. Blau-Weiß Linz
    'ü' => '(ü|ue)',  ## e.g.
    'ú' => '(ú|u)'    ## e.g. Fútbol
  }

  ## single pass: every matched char is looked up in the table;
  ##  no replacement contains another table key, so the result is the
  ##  same as applying the substitutions one after another
  title.gsub( Regexp.union( alternatives.keys ), alternatives )
end