Class: NameTokenizer

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/textutils/parser/name_tokenizer.rb

Overview

  • TODO: consider renaming this class to NameScanner, NameSplitter, NameSeparator, or similar.

Instance Method Summary collapse

Instance Method Details

#tokenize(value) ⇒ Object

TODO: consider renaming this method to (or using) split - why? why not?



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/textutils/parser/name_tokenizer.rb', line 14

# Splits a names line into individual name tokens.
#
# The value is first split on | (pipe), ignoring spaces/tabs around the pipe.
# Each part is then split into plain names and "inline" bracketed
# translations, e.g. "München [Munich]" yields "München" and "[Munich]".
# Bracketed tokens keep their surrounding brackets; plain names have
# trailing whitespace removed.
#
# A part with an opening [ but no closing ] is reported with a warning and
# the remainder of that part is returned as one token (brackets included),
# so that malformed input can never stall the scanner.
#
# @param value [String] the raw names line
# @return [Array<String>] the tokens in order of appearance
def tokenize( value )   ## rename to/use split - why? why not??
  names = []

  # 1)  split by | (pipe) -- remove leading n trailing whitespaces
  parts = value.split( /[ \t]*\|[ \t]*/ )

  # 2)  split "inline" translations e.g. München [Munich]

  ## todo: add support for  Munich [en]  e.g. trailing lang tag
  ## todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??

  parts.each do |part|
    s = StringScanner.new( part )
    s.skip( /[ \t]+/ )   # skip whitespaces

    until s.eos?
      if s.check( /\[/ )
        ## scan everything until the end of bracket (e.g.]);
        ##  use * (not +) so that an empty [] is consumed too
        name = s.scan( /\[[^\]]*\]/ )
        if name.nil?
          ## starting w/ [ but no closing ] found:
          ##   a failed scan does not advance the scanner, so without this
          ##   branch the loop would spin forever appending nil
          logger.warn( "[NameTokenizer] missing closing bracket in >#{part}< - using rest >#{s.rest}< as is" )
          name = s.rest.rstrip
          s.terminate
        end
      else
        ## scan everything until the begin of bracket (e.g.[)
        ##   and remove trailing spaces (if present)
        name = s.scan( /[^\[]+/ ).rstrip
      end
      names << name

      s.skip( /[ \t]+/ )  # skip whitespaces
      logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
    end
  end # each part

  logger.debug( "[NameTokenizer] names=#{names.inspect}" )
  names
end