Class: NameTokenizer

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/textutils/parser/name_parser.rb

Overview

  • TODO: consider renaming this class to NameScanner, NameSplitter, NameSeparator, or similar.

Instance Method Summary collapse

Instance Method Details

#tokenize(value) ⇒ Object

TODO: consider renaming this method to (or replacing it with) split - why? why not?



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/textutils/parser/name_parser.rb', line 14

# Splits a raw names string into a flat list of name tokens.
#
# The value is first split on | (pipe); spaces and tabs around each pipe are
# dropped. Every part is then scanned for "inline" bracketed translations,
# e.g. "München [Munich]" yields "München" and "[Munich]" - the brackets are
# kept as part of the token.
#
# A bracket that is never closed (e.g. "Wien [Vienna") does not raise: a
# warning is logged and the rest of the part is returned as a single token.
#
# @param value [String] names separated by pipes, optionally with [..] parts
# @return [Array<String>] the tokens in the order they appear in value
def tokenize( value )   ## rename to/use split - why? why not??
  names = []

  # 1)  split by | (pipe) -- remove leading n trailing whitespaces
  parts = value.split( /[ \t]*\|[ \t]*/ )

  # 2)  split "inline" translations e.g. München [Munich]

  ## todo: add support for  Munich [en]  e.g. trailing lang tag
  ## todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??

  parts.each do |part|
    s = StringScanner.new( part )
    s.skip( /[ \t]+/ )   # skip whitespaces

    until s.eos?
      if s.check( /\[/ )
        ## scan everything until the end of bracket (e.g.]);
        ##   empty brackets (e.g. []) are accepted too - otherwise the scanner
        ##   would not advance and the loop would never end
        name = s.scan( /\[[^\]]*\]/ )

        if name.nil?
          ## starting w/ [  but no closing ] found - use the rest as is
          ##   and stop scanning this part (again: scanner MUST advance)
          logger.warn( "[NameTokenizer] no closing ] found in >#{part}< - using rest >#{s.rest}< as name" )
          name = s.rest.rstrip
          s.terminate
        end
      else
        ## scan everything until the begin of bracket (e.g.[)
        ##   and remove trailing spaces (if present)
        name = s.scan( /[^\[]+/ ).rstrip
      end
      names << name

      s.skip( /[ \t]+/ )  # skip whitespaces
      logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
    end
  end # each part

  logger.debug( "[NameTokenizer] names=#{names.inspect}")
  names
end