Class: ValuesReader

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging, TextUtils::ValueHelper
Defined in:
lib/textutils/reader/values_reader.rb

Instance Method Summary collapse

Methods included from TextUtils::ValueHelper

#find_grade, #find_key_n_title, #is_address?, #is_region?, #is_taglist?, #is_website?, #is_year?, #match_abv, #match_brewery, #match_city, #match_country, #match_hl, #match_kcal, #match_km_squared, #match_metro, #match_metro_flag, #match_metro_pop, #match_number, #match_og, #match_region_for_country, #match_supra, #match_supra_flag, #match_website, #match_year

Constructor Details

#initialize(path, more_attribs = {}) ⇒ ValuesReader

Returns a new instance of ValuesReader.



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/textutils/reader/values_reader.rb', line 60

# Builds a reader either from a file name or from an in-memory string.
#
# path         - a file name, or (workaround) the text itself: any argument
#                that contains a newline is treated as the data buffer
# more_attribs - hash kept in @more_attribs (merged into every record's
#                attribs by each_line)
def initialize( path, more_attribs={} )
  @more_attribs = more_attribs

  # workaround/hack: a newline in path means "this is the text, not a file name"
  #  todo: offer explicit from_file / from_string constructors instead
  from_buffer = (path =~ /\n/m)

  @path = from_buffer ? 'stringio' : path
  @data = from_buffer ? path.dup : File.read_utf8( @path )
end

Instance Method Details

#each_line ⇒ Object

Iterates over the records in the data, with support for multi-line records.



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/textutils/reader/values_reader.rb', line 86

# Iterates over the records in @data and yields each one to the block.
#
# Two record layouts are recognised:
# - single-line (classic csv) records: one record per line, split by find_values
# - multi-line records: opened by a [key] line (two or more lowercase letters),
#   followed by value lines; the first blank line ends the record
#
# Lines starting with #, --, % or __ are skipped as comments, a trailing
# end-of-line comment (whitespace, then # and text) is stripped, and group
# headers (lines starting with "- ") are skipped for now.
#
# @yield [attribs, more_values] the result of find_key_n_title for the
#   record's values, with @more_attribs merged into attribs
# @return [Enumerator] when called without a block
def each_line   # support multi line records
  return enum_for( :each_line ) unless block_given?

  inside_record  = false
  blank_counter  = 0    # number of blank lines seen (note: 1+ blank lines close a multi-line record)
  values         = []

  # yields the record collected so far, if any; the caller reassigns values afterwards
  emit_pending = lambda do
    next if values.empty?
    attribs, more_values = find_key_n_title( values )
    # e.g. merge country_id and other defaults if present
    yield( attribs.merge( @more_attribs ), more_values )
  end

  @data.each_line do |line|

    ## allow alternative comment lines
    ## e.g. -- comment or
    ##      % comment
    ##  why?  # might get used by markdown for marking headers, for example
    ## NB: alternative comment markers are NOT supported in end-of-line position

    if line =~ /^\s*(?:#|--|%|__)/
      # skip comments and do NOT copy to result (keep comments secret!)
      logger.debug 'skipping comment line'
      next
    end

    if line =~ /^\s*$/
      # skip blank line (but remember that we saw one)
      blank_counter += 1
      logger.debug "skipping blank line (#{blank_counter})"
      next
    end

    # pass 1) remove possible trailing eol comment
    ##  e.g    -> nyc, New York   # Sample EOL Comment Here (with or without commas,,,,)
    ## becomes -> nyc, New York
    # pass 2) remove leading and trailing whitespace
    line = line.sub( /\s+#.+$/, '' ).strip

    if line =~ /^-\s/   # check for group headers  e.g.  - St. James Brewery
      logger.info "  skip group header #{line} for now (fix/add soon)"
      next
    end

    if line =~ /^\[([a-z][a-z]+)\]/
      ### start of multi-line record
      ##    must start with key e.g. [guiness]
      ##   for now only supports key with letter a-z (no digits/numbers or underscore or dots)
      key = Regexp.last_match( 1 )   # capture right away, before any other call

      emit_pending.call   # yield previous record (if any) before starting a new one

      inside_record  = true
      blank_counter  = 0

      # NB: every additional line is one value e.g. city:wien, etc.
      #  allows you to use any chars
      logger.debug "   multi-line record w/ key »#{key}«"

      values = [key]    # key is the first value
    elsif inside_record && blank_counter == 0
      # still inside the multi-line record: keep adding values
      if line =~ %r{//} || line =~ /^[a-z][a-z0-9.]*[a-z0-9]:/
        # address line (contains //) or key: value pair;
        #  kept as ONE value (no need to escape commas)
        values << line
      else
        values += find_values( line )
      end
    else
      # assume single-line (stand-alone / classic csv) record
      emit_pending.call
      inside_record  = false
      blank_counter  = 0
      values = find_values( line )
    end

  end # each lines

  # do NOT forget to yield the last record (if present)
  emit_pending.call
end

#find_values(line) ⇒ Object

TODO:

- Move to a helper for reuse, à la find_key_n_title?
- Use a different/better name, e.g. find_values_in_line or split_line_into_values?


187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/textutils/reader/values_reader.rb', line 187

# Splits one line into an array of values (strings).
#
# Values are separated by commas; an escaped comma (\,) stays in the value as
# a literal comma. Every value is stripped of surrounding whitespace, and
# values starting with # are treated as comment columns and dropped.
def find_values( line )
  escaped_comma_token = '«KOMMA»'
  separator_token     = '« »'

  # 1) guard escaped commas       e.g. \, becomes «KOMMA»
  # 2) swap in generic separator  (allows us to configure the separator)
  # 3) restore escaped commas as plain commas (before split)
  line = line.gsub( '\,', escaped_comma_token )
             .gsub( ',', separator_token )
             .gsub( escaped_comma_token, ',' )

  logger.debug "line: |»#{line}«|"

  columns = line.split( separator_token ).map( &:strip )

  ##### todo/fix:
  #  !!!REMOVE!!!
  # remove support of comment column? (NB: must NOT include commas)
  #  todo/fix: check if still possible ?? - add an example here how it looks like/works
  kept = columns.reject do |column|
    comment_column = column =~ /^#/   ## starts with # - treat it as a comment column
    logger.info "   removing column with value »#{column}«" if comment_column
    comment_column
  end

  logger.debug "  values: |»#{kept.join('« »')}«|"
  kept
end