Class: ValuesReader

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging, TextUtils::ValueHelper
Defined in:
lib/textutils/reader/values_reader.rb

Constant Summary

Constants included from TextUtils::ValueHelper

TextUtils::ValueHelper::TITLE_KEY_REGEX

Class Method Summary collapse

Instance Method Summary collapse

Methods included from TextUtils::ValueHelper

#find_grade, #find_key_n_title, #is_address?, #is_taglist?, #is_website?, #is_year?, #match_abv, #match_brewery, #match_hl, #match_kcal, #match_km_squared, #match_number, #match_og, #match_website, #match_year

Constructor Details

#initialize(arg, more_attribs = {}) ⇒ ValuesReader

Returns a new instance of ValuesReader.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/textutils/reader/values_reader.rb', line 55

# Creates a reader over a block of text.
#
# arg          - either an options hash carrying the text under the :text key,
#                or (deprecated) a string: a string containing a newline is
#                taken as the text itself, any other string is treated as a
#                file path and read via File.read_utf8
# more_attribs - hash kept in @more_attribs (defaults to an empty hash)
def initialize( arg, more_attribs={} )
  @more_attribs = more_attribs

  if arg.is_a?( String )  ## old style (deprecated) - pass in filepath as string
    path = arg

    ### workaround/hack
    #  if path includes a newline assume it's a string buffer not a file name
    #  todo/fix: use from_file and from_string etc. instead of the ctor
    if path =~ /\n/m
      logger.info "ValuesReader.new - deprecated API - use ValuesReader.from_string() instead"
      @text = path.dup   # keep a private copy of the caller's buffer
    else
      logger.info "ValuesReader.new - deprecated API - use ValuesReader.from_file() instead"
      ## fix: read from the local path that was passed in
      ##  (was @path - an instance variable that is never assigned, thus, always nil)
      @text = File.read_utf8( path )
    end
  else   ## assume it's a hash
    opts  = arg
    @text = opts[:text]
  end
end

Class Method Details

.from_file(path, more_attribs = {}) ⇒ Object



42
43
44
45
46
47
48
# File 'lib/textutils/reader/values_reader.rb', line 42

# Builds a reader from the content of the file at path.
#
# path         - passed as-is to File.read_utf8
# more_attribs - passed through unchanged to from_string
#
# Returns whatever from_string returns.
def self.from_file( path, more_attribs={} )
  ## note: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
  ## - see textutils/utils.rb
  from_string( File.read_utf8( path ), more_attribs )
end

.from_string(text, more_attribs = {}) ⇒ Object



50
51
52
# File 'lib/textutils/reader/values_reader.rb', line 50

# Builds a reader over the given text.
#
# text         - wrapped into an options hash under the :text key
# more_attribs - passed through unchanged as the second constructor argument
#
# Returns a new ValuesReader.
def self.from_string( text, more_attribs={} )
  opts = { text: text }
  ValuesReader.new( opts, more_attribs )
end

.from_zip(zip_file, entry_path, more_attribs = {}) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/textutils/reader/values_reader.rb', line 13

# Builds a reader from the content of one entry of a zip archive.
#
# zip_file     - object answering find_entry( entry_path ); the returned entry
#                must answer get_input_stream, whose result must answer read
# entry_path   - passed as-is to zip_file.find_entry
# more_attribs - passed through unchanged to from_string
#
# Returns whatever from_string returns.
def self.from_zip( zip_file, entry_path, more_attribs={} )
  ## get raw content from zip
  ## todo/fix: clean/preprocess lines
  ##  e.g. CR/LF (\r\n) to LF (\n)
  raw = zip_file.find_entry( entry_path ).get_input_stream.read

  ## NOTE: the logger helper is only available in instance methods; use the root logger for now
  log = LogUtils::Logger.root
  log.debug "text.encoding.name (before): #{raw.encoding.name}"

  #####
  # NB: ASCII-8BIT == BINARY == encoding unknown; raw bytes here
  #  for now "hardcoded" to utf-8 - what else can we do?
  #  - note: force_encoding will NOT change the bytes;
  #    it only changes the assumed encoding (w/o any translation)
  utf8 = raw.force_encoding( Encoding::UTF_8 )
  log.debug "text.encoding.name (after): #{utf8.encoding.name}"

  ## todo:
  #  for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
  ##  utf8 = TextUtils.convert_unicode_dashes_to_plain_ascii( utf8, path: entry_path )

  from_string( utf8, more_attribs )
end

Instance Method Details

#each_line ⇒ Object

old style w/o meta hash – rename to each_record - why, why not???



89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/textutils/reader/values_reader.rb', line 89

# Iterates over all records (old style w/o meta hash) -- rename to each_record - why, why not???
#
# Wraps each_line_with_meta: for every attribs hash it yields, the :meta and
# :values entries are removed from the hash and the block gets called with
# the remaining attribs plus the removed :values entry as a second argument.
#
# Yields attribs (hash) and values (the former attribs[:values]).
# Returns an Enumerator if no block is given.
def each_line
  return enum_for( :each_line )  unless block_given?

  each_line_with_meta do |attribs|
    ## remove meta
    ##  note: always delete - plain Hash#delete is a no-op for a missing key;
    ##   no need for a present? check (present? is not core ruby and
    ##   would let an empty meta hash slip through)
    attribs.delete( :meta )

    ## (more) values array entry - make top level
    values = attribs.delete( :values )

    yield( attribs, values )
  end
end

#each_line_with_meta ⇒ Object

support multi line records – rename to each_record_with_ - why, why not??



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/textutils/reader/values_reader.rb', line 105

# Iterates over all records in @text (supports multi-line records)
#   -- rename to each_record_with_  - why, why not??
#
# Line handling (in this order):
#  - comment lines (starting w/ #, --, % or __) get skipped
#  - blank lines get skipped and counted; 1+ blank lines end a multi-line record
#  - a trailing end-of-line comment (# ...) gets cut off; then the line gets stripped
#  - section lines (starting w/ ==) get skipped
#  - group header lines (starting w/ a dash and space) update the last header
#  - a line of the form [...] starts a multi-line record
#  - every other line either continues the running multi-line record
#     or starts a new single-line record
#
# Yields one attribs hash per record - see build_record_attribs for the entries.
def each_line_with_meta

  inside_record  = false
  blank_counter  = 0    # count of number of blank lines (note: 1+ blank lines clear multi-line record)
  values         = []
  meta           = {}

  ###
  # meta
  #  use format or style key ??
  #   use   line|multiline   or classic|modern  or csv|csv+ etc.??
  #
  #  move header to meta (from top-level)  - why, why not ??
  #    or use context for header and sections etc.????
  #  move section to meta - why, why not ??
  #
  # might add lineno etc. in future??


  # keep track of last header
  #  e.g. lines like
  # ___________________________________
  # - Brauerei Schwechat (Brau Union)
  #
  #  last_header will be 'Brauerei Schwechat (Brau Union)'
  #  gets passed along as an attribute e.g. attribs[:header]='Brauerei Schwechat (Brau Union)'
  last_header  = nil


  @text.each_line do |line|

    ## allow alternative comment lines
    ## e.g. -- comment or
    ##      % comment
    ##  why?  # might get used by markdown for marking headers, for example


    ## NOTE: for now alternative comment lines not allowed as end of line style e.g
    ##  some data, more data   -- comment here

    ######
    ## note:
    ##   # comment MUST follow a space or end-of-line e.g.
    ##     #1 or #hello or #(hello) or #{hello}  is NOT a comment
    ##   ###### is however

    if line =~ /^\s*#+(\s|$)/  ||    # old - simple rule -- /^\s*#/
       line =~ /^\s*--/ ||
       line =~ /^\s*%/  ||
       line =~ /^\s*__/
      # skip comments and do NOT copy to result (keep comments secret!)
      logger.debug 'skipping comment line'
      next
    end

    if line =~ /^\s*$/
      # skip blank line (but keep count)
      blank_counter += 1
      logger.debug "skipping blank line (#{blank_counter})"
      next
    end

    # pass 1) remove possible trailing eol comment
    ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    ## becomes -> nyc, New York
    #
    ##  note - comment must follow a space or end-of-line
    #    #1 or #hello or #{hello} is NOT a comment !!!
    #    note ###### is a comment

    line = line.sub( /\s+#+(\s.+)?$/, '' )

    # pass 2) remove leading and trailing whitespace

    line = line.strip


    ### NOTE: skip sections lines (marked w/ at least ==) for now
    ###  e.g.  === Waldviertel ===
    if line =~ /^\s*={2,}\s+/
      logger.debug "skipping section line |»#{line}«|"
      next
    end

    if line =~ /^-\s+/   # check for group headers (MUST start w/ dash (-) e.g.  - St. James Brewery)
      if values.length > 0  # check if we already processed a record? if yes; yield last record (before reset)
        yield( build_record_attribs( values, meta, last_header ) )
        values         = []
        meta           = {}
      end
      inside_record  = false
      blank_counter  = 0

      # update last_header
      last_header = line.sub( /^-\s+/, '' )  # cut-off leading marker and space
      logger.info "  update group header >#{last_header}<"
      next
    end


    if line =~ /^\[(.+)\]$/   # note: check for multiline record; MUST start w/ [ and end w/ ]

      value = $1.strip    # note: remove (allow) possible leading n trailing spaces

      if values.length > 0  # check if we already processed a record? if yes; yield last record (before reset)
        yield( build_record_attribs( values, meta, last_header ) )
        values         = []
        meta           = {}
      end
      inside_record  = true
      blank_counter  = 0
      meta[:format]  = :multiline    # use :modern - why, why not?

      # NB: every additional line is one value e.g. city:wien, etc.
      #  allows you to use any chars
      logger.debug "   start multi-line record w/ »#{value}«"

      values         = [value]    # add as first value in ary - note: find_key_n_title will check if value is a key or not
    elsif inside_record && blank_counter == 0 && line =~ /\/{2}/ # check address line (must contain //)
      values += [line.dup]     # assume single value column (no need to escape commas)
    elsif inside_record && blank_counter == 0 && line =~ /^[a-z][a-z0-9.]*[a-z0-9]:/ # check key: value pair
      ### todo: split key n value and add to attrib hash  - why, why not???
      values += [line.dup]     # assume single value column (no need to escape commas)
    else
      if inside_record && blank_counter == 0   # continue adding more values
        values += find_values( line )
      else                                     # assume single-line (stand-alone / classic csv) record
        if values.length > 0
          yield( build_record_attribs( values, meta, last_header ) )
          values         = []
          meta           = {}
        end
        inside_record  = false
        blank_counter  = 0
        meta[:format]  = :line    # use :classic - why, why not?
        values         = find_values( line )
      end
    end

  end # each lines

  # do NOT forget to yield last line (if present/processed)
  if values.length > 0
    yield( build_record_attribs( values, meta, last_header ) )
  end

end

# Builds the attribs hash for one completed record (helper for each_line_with_meta).
#
# values      - all values collected for the record; passed to find_key_n_title,
#               which returns the attribs hash plus the remaining values
# meta        - meta hash of the record; stored under :meta
# last_header - last group header seen or nil; stored under :header unless nil
#
# Returns the attribs hash merged w/ @more_attribs plus :header (optional), :values and :meta.
def build_record_attribs( values, meta, last_header )
  attribs, more_values = find_key_n_title( values )
  attribs = attribs.merge( @more_attribs )  # e.g. merge country_id and other defaults if present
  attribs[:header] = last_header   unless last_header.nil?   # add optional header attrib
  attribs[:values] = more_values
  attribs[:meta]   = meta
  attribs
end
private :build_record_attribs

#find_values(line) ⇒ Object

todo:

 move to helper for reuse a la find_key_n_title ???  
use different/better name ?? e.g. find_values_in_line  or split_line_into_values ??


279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/textutils/reader/values_reader.rb', line 279

# Splits a line into its comma-separated values.
#
# A comma escaped w/ a backslash (\,) does NOT split; it ends up
# as a plain comma inside the value.
#
# line - the string to split
#
# Returns an array of values (strings) w/ leading and trailing whitespace removed.
def find_values( line )
  escaped_comma = '«KOMMA»'
  separator     = '« »'    # note: generic separator (allows us to configure separator)

  # 1) guard escaped commas e.g. convert \, to «KOMMA»
  # 2) turn all remaining (that is, real) commas into the generic separator
  # 3) restore the escaped commas as plain commas (before split)
  prepared = line.gsub( '\,', escaped_comma )
                 .gsub( ',', separator )
                 .gsub( escaped_comma, ',' )

  logger.debug "line: |»#{prepared}«|"

  # split and remove leading and trailing whitespace for every value
  fields = prepared.split( separator ).map( &:strip )

  logger.debug "  values: |»#{fields.join('« »')}«|"
  fields
end