Class: NameParser

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/textutils/parser/name_parser.rb

Overview

TODO: move this class into the TextUtils namespace/module.

Instance Method Summary collapse

Instance Method Details

#parse(chunks) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/textutils/parser/name_parser.rb', line 9

## Splits raw name chunks into a flat list of name values.
##
## Every chunk is split on '|'; each resulting part is then scanned left to
## right. Text outside of square brackets is stripped and collected as one
## value; text inside square brackets is collected (without the brackets) as
## its own value. Nil and blank chunks are skipped.
##
## Malformed brackets do not raise: an empty pair ([]) yields no value and
## an unterminated bracket (e.g. "[de") yields the remaining text of the part.
##
## @param chunks [Array<String, nil>] raw name chunks
## @return [Array<String>] collected values in the order they were found
def parse( chunks )
  ## todo/fix: (re)use nameparser - for now "simple" inline version
  ##  fix!!! - note: for now lang gets ignored
  ##  fix: add handling for
  ##  Leuven[nl]|Louvain[fr] Löwen[de]
  ##  Antwerpen[nl]|Anvers[fr] [Antwerp]
  ##  Brussel[nl]•Bruxelles[fr]   -> official bi-lingual name
  ##  etc.

  ## values - split into names (name n lang pairs)
  ## note: assumes (default) lang from more_attribs unless otherwise marked e.g. [] assume en etc.

  ## split chunks into values
  values = []
  chunks.each do |chunk|
    next if chunk.nil? || chunk.blank?  ## skip nil or empty/blank chunks

    chunk.split( '|' ).each do |part|   # 1)  split |
      s = StringScanner.new( part )
      s.skip( /[ \t]+/ )   # skip leading whitespaces

      until s.eos?
        if s.check( /\[/ )
          ## scan everything until the end of bracket (e.g.])
          ##  fix!!! - note: for now lang gets ignored
          bracketed = s.scan( /\[[^\]]*\]/ )
          if bracketed
            value = bracketed[1...-1]   # strip enclosing [] e.g. [Bavaria] => Bavaria
          else
            ## no closing bracket in this part - use everything after the [
            ##   and stop scanning (otherwise scan returns nil and we would crash)
            value = s.rest[1..-1].strip
            s.terminate
          end
          ## empty brackets (e.g. []) carry no value - skip
          values << value unless value.empty?
        else
          ## scan everything until the begin of bracket (e.g.[)
          values << s.scan( /[^\[]+/ ).strip
        end

        s.skip( /[ \t]+/ )  # skip whitespaces
        logger.debug( "[NameParser] eos?: #{s.eos?}, rest: >#{s.rest}<" )
      end
    end
  end

  logger.debug( "[NameParser] values=#{values.inspect}")

  ## todo: split by bullet ? (official multilang name) e.g. Brussel • Bruxelles
  ## todo: process variants w/ () e.g. Krems (a. d. Donau) etc. ??
  ## note: for now every value is passed through as a name unchanged
  names = values.dup

  logger.debug( "[NameParser] names=#{names.inspect}")

  names
end