Class: BlackStack::CSVIndexer::Index

Inherits:
Object
  • Object
show all
Defined in:
lib/csv-indexer.rb

Overview

define Index class

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(h) ⇒ Index

Returns a new instance of Index.



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/csv-indexer.rb', line 32

# Builds an index descriptor from a hash of settings.
#
# Recognized keys of +h+:
# * +:name+        (String, mandatory) index name; must not already be used by
#   an entry of BlackStack::CSVIndexer.indexes. It is also used to name the log file.
# * +:description+ (String, optional)
# * +:input+       (String, optional) glob of CSV files to index. Default: './*.csv'.
# * +:output+      (String, optional) folder for the index files. Default: './'.
# * +:log+         (String, optional) folder for the log file. Default: './'.
# * +:mapping+     (Hash, mandatory) read by #index as name => column position.
# * +:keys+        (Array, mandatory) read by #index as the mapping names that form the key.
#
# The hash received is only read; defaults are resolved locally and never
# written back into the caller's object.
#
# @param h [Hash] settings as described above
# @raise [RuntimeError] if +h+ is not a Hash, or listing every validation error found
def initialize(h)
    # validate: h is a hash (checked first because every other check indexes into it)
    raise "The parameter must be a hash." unless h.is_a?(Hash)

    errors = []

    # validate: :name is present
    errors << "The parameter :name is mandatory." unless h.has_key?(:name)

    # validate: :name is a string
    errors << "The parameter :name must be a string." unless h[:name].is_a?(String)

    # validate: if :description is present, it is a string
    errors << "The parameter :description must be a string." if h.has_key?(:description) && !h[:description].is_a?(String)

    # validate: if :input is present, it is a string
    errors << "The parameter :input must be a string." if h.has_key?(:input) && !h[:input].is_a?(String)

    # validate: if :output is present, it is a string
    errors << "The parameter :output must be a string." if h.has_key?(:output) && !h[:output].is_a?(String)

    # validate: if :log is present, it is a string
    errors << "The parameter :log must be a string." if h.has_key?(:log) && !h[:log].is_a?(String)

    # validate: :mapping is present
    errors << "The parameter :mapping is mandatory." unless h.has_key?(:mapping)

    # validate: :mapping is a hash
    errors << "The parameter :mapping must be a hash." unless h[:mapping].is_a?(Hash)

    # validate: :keys is present
    errors << "The parameter :keys is mandatory." unless h.has_key?(:keys)

    # validate: :keys is an array
    errors << "The parameter :keys must be an array." unless h[:keys].is_a?(Array)

    # validate: :name is unique
    errors << "The parameter :name must be unique." if BlackStack::CSVIndexer.indexes.map{|i| i.name}.include?(h[:name])

    # if errors happened, raise an exception
    raise "The following errors happened while creating the index: #{errors.join(', ')}" unless errors.empty?

    # resolve the optional settings into locals, so the caller's hash is left untouched
    input_glob = h.fetch(:input, './*.csv')
    output_dir = h.fetch(:output, './')
    log_dir = h.fetch(:log, './')

    # create the logger
    self.logger = BlackStack::LocalLogger.new("#{log_dir}/#{h[:name]}.log")

    # set the attributes
    self.name = h[:name]
    self.description = h[:description]
    self.input = input_glob
    self.output = output_dir
    self.log = log_dir
    self.mapping = h[:mapping]
    self.keys = h[:keys]
end

Instance Attribute Details

#description ⇒ Object

Returns the value of attribute description.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @description instance variable.
#
# @return [Object] whatever is currently stored in @description (nil if never assigned)
def description; @description; end

#input ⇒ Object

Returns the value of attribute input.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @input instance variable.
#
# @return [Object] whatever is currently stored in @input (nil if never assigned)
def input; @input; end

#keys ⇒ Object

Returns the value of attribute keys.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @keys instance variable.
#
# @return [Object] whatever is currently stored in @keys (nil if never assigned)
def keys; @keys; end

#log ⇒ Object

Returns the value of attribute log.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @log instance variable.
#
# @return [Object] whatever is currently stored in @log (nil if never assigned)
def log; @log; end

#logger ⇒ Object

Returns the value of attribute logger.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @logger instance variable.
#
# @return [Object] whatever is currently stored in @logger (nil if never assigned)
def logger; @logger; end

#mapping ⇒ Object

Returns the value of attribute mapping.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @mapping instance variable.
#
# @return [Object] whatever is currently stored in @mapping (nil if never assigned)
def mapping; @mapping; end

#name ⇒ Object

Returns the value of attribute name.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @name instance variable.
#
# @return [Object] whatever is currently stored in @name (nil if never assigned)
def name; @name; end

#output ⇒ Object

Returns the value of attribute output.



30
31
32
# File 'lib/csv-indexer.rb', line 30

# Reader for the @output instance variable.
#
# @return [Object] whatever is currently stored in @output (nil if never assigned)
def output; @output; end

Instance Method Details

#compare_keys(key1, key2, exact_match = true) ⇒ Object

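Compare two keys, each given as an array of strings. If exact_match is false and every element of key2 contains the element of key1 at the same position (case-insensitive), return 0. Otherwise compare the keys element by element, ignoring case: return 0 if they are equal, 1 if key1 sorts before key2, and -1 if key1 sorts after key2. key2 must not have fewer elements than key1. This method is used by the binary search; it is not meant to be called directly by the user.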

Example: compare_keys(['Century 21'], ['Century 21 LLC'], false)

=> 0

Example: compare_keys(['Century 21'], ['Century 21 LLC'], true)

=> 1


189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/csv-indexer.rb', line 189

# Three-way, case-insensitive comparison of two keys; #find calls it to steer
# its binary search.
#
# Both keys are walked with #each_with_index and indexed by position, so they
# are expected to be arrays of strings.
#
# @param key1 [Array<String>] the key being searched for
# @param key2 [Array<String>] the key to compare against; must have at least as many elements as key1
# @param exact_match [Boolean] when false, a partial match (every element of key2
#   contains the element of key1 at the same position) counts as equal
# @return [Integer] 0 when the keys match; 1 when the first differing element of
#   key1 sorts before the one in key2; -1 when it sorts after
# @raise [RuntimeError] if key2 has fewer elements than key1
def compare_keys(key1, key2, exact_match=true)
    raise 'The key2 must has more elements than key1.' if key2.size < key1.size

    # Count the positions where key2 does NOT contain the matching element of key1.
    # Positions past the end of key1 yield an empty pattern, which matches anything.
    misses = key2.each_with_index.count { |candidate, idx|
        candidate !~ /#{Regexp.escape(key1[idx].to_s)}/i
    }
    return 0 if misses.zero? && !exact_match

    # Positional comparison on upper-cased values; the first difference decides.
    key1.each_with_index do |value, idx|
        return 1 if value.upcase < key2[idx].upcase
        return -1 if value.upcase > key2[idx].upcase
    end

    # no position differed
    0
end

#find(key, exact_match = true, write_log = false) ⇒ Object

search the index. return a hash description with the matches, and a brief performance report.



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# File 'lib/csv-indexer.rb', line 217

# Searches every index file of this index for +key+ using a binary search.
#
# The files searched are those matching "<output>/*.<name>". Each line is
# expected to be in the layout written by #index: the first field is the key
# (parts joined by '|') and the last field is the length of the line itself,
# which is what allows jumping back to the start of a line after seeking into
# the middle of it.
#
# The search stops at the first matching line of each file, so at most one
# match per file is reported.
#
# @param key [String, Array<String>] the key to look for; a String is wrapped into a one-element array
# @param exact_match [Boolean] forwarded to #compare_keys
# @param write_log [Boolean] log through the index logger when true; otherwise use a BlackStack::DummyLogger
# @return [Hash] with :matches (array of parsed index rows), :lines_matched (Integer)
#   and :enlapsed_seconds (Float)
def find(key, exact_match=true, write_log=false)
    # if key is an string, convert it into an array of 1 element
    key = [key] if key.is_a?(String)

    # build the response.
    ret = {
        :matches => [],
    }

    # define the logger to use
    l = write_log ? self.logger : BlackStack::DummyLogger.new

    # define the source
    source = "#{File.expand_path(self.output)}/*.#{self.name}"

    # start time
    start_time = Time.now

    # totals
    total_matches = 0

    # searching in the indexed files
    l.log "Search term: #{key.to_s}"
    Dir.glob(source).each do |file|
        # get the name of the file from the full path
        name = File.basename(file)
        # opening log line
        l.logs "Searching into #{name}... "
        # setting boundaries for the binary search.
        # File.size avoids spawning `wc`, which broke on paths with spaces or shell metacharacters.
        i = 0
        max = File.size(file)
        # remember middle variable from the previous iteration
        prev = -1
        # open file with random access; the block form closes it even if the search raises
        File.open(file, 'r') do |f|
            # binary search
            while i<max
                # get the middle of the file
                middle = ((i + max) / 2).to_i
                # break if the middle is the same as the previous iteration
                break if middle==prev
                # remember the middle in this iteration
                prev = middle
                # opening log line
                l.logs "#{middle}... "
                # go to the middle of the file
                f.seek(middle)
                # read the rest of the line where the cursor landed
                line = f.readline
                # most probably I landed in the midle of a line, so I have to get the size of the line where I landed.
                a = line.split('","')
                while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
                    middle -= 1
                    f.seek(middle)
                    line = f.readline
                    a = line.split('","')
                end
                # the last field holds the full length of the line; subtracting what
                # was read (minus the newline) gives the offset of the cursor inside it
                line_size = a.last.gsub('"', '').to_i
                middle -= line_size-line.size+1
                # seek and readline again, to get the line from its begining
                f.seek(middle)
                line = f.readline
                # strip the line
                line.strip!
                # get the first field of the CSV line
                fields = CSV.parse_line(line)
                row_key = fields[0].split('|')
                # compare keys
                x = compare_keys(key, row_key, exact_match)
                # compare the first field with the search term
                if x == 0
                    # found
                    l.logf "found (#{row_key})"
                    ret[:matches] << fields.dup
                    total_matches += 1
                    break
                else
                    # not found
                    if x == 1
                        # search in the down half
                        max = middle
                    else #if x == -1
                        # search in the up half
                        i = middle + line.size+1
                    end
                    l.logf "not found (#{row_key})"
                end
            end
        end
        # closing the log line
        l.done
    end

    end_time = Time.now

    ret[:enlapsed_seconds] = end_time - start_time
    ret[:lines_matched] = total_matches

    l.log "Matches: #{total_matches.to_s}"
    l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"

    ret
end

#index(write_log = true) ⇒ Object

create the index file



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/csv-indexer.rb', line 97

# Creates one sorted index file for every CSV file matching the +input+ glob.
#
# The index file gets the same base name as the CSV, with a trailing ".csv"
# replaced by ".<name>", and is written into the +output+ folder. Input files
# whose index file already exists are skipped.
#
# Every index line holds, as double-quoted comma-separated fields:
# 1. the key: the columns listed in +keys+, joined by '|'
#    (any '|' inside a value is replaced by ',');
# 2. the 1-based line number of the record in the input file;
# 3. one field per entry of +mapping+, in mapping order;
# 4. the length of the index line itself, measured with String#size
#    (#find relies on it to locate the start of a line).
# Double quotes are removed from all values. Empty CSV cells are written as
# empty fields.
#
# @param write_log [Boolean] log through the index logger when true; otherwise use a BlackStack::DummyLogger
# @return [Array<String>] the input files matched by the glob
def index(write_log=true)
    # define the logger to use
    l = write_log ? self.logger : BlackStack::DummyLogger.new
    # output file extension
    ext = ".#{self.name}"
    # index the bites
    Dir.glob(input).each do |file|
        # get the name of the file from the full path
        name = File.basename(file)
        # opening log line
        l.logs "Indexing #{name}... "
        # get the output filename
        output_filename = "#{File.expand_path(self.output)}/#{name.gsub(/\.csv$/, ext)}"
        # if output file exists, skip (File.exists? no longer exists since Ruby 3.2)
        if File.exist?(output_filename)
            l.logf "skip"
            next
        end

        # read all the records before creating the output, so a failure
        # while parsing does not leave a partial index file behind
        i = 0
        a = []
        # File.foreach closes the input file when the iteration ends
        File.foreach(file) do |line|
            i += 1
            # get the array of fields
            row = CSV.parse_line(line)
            # build the key: remove '"' and replace '|' with ','.
            # to_s turns the nil of an empty cell into an empty string.
            key = self.keys.map do |k|
                row[self.mapping[k]].to_s.delete('"').tr('|', ',')
            end
            # the key is the first field, the row number is the second one
            fields = ["\"#{key.join('|')}\"", "\"#{i.to_s}\""]
            # then one field for each mapped column
            self.mapping.each_value do |v|
                fields << "\"#{row[v].to_s.delete('"')}\""
            end
            # add fields to the array
            a << fields
        end
        # sort the array
        a.sort!
        # write the array to the output file; the block form closes it even on error
        File.open(output_filename, 'w') do |output_file|
            a.each do |row|
                line = row.join(',')
                # add the size of the line as a last field of the row.
                # this value is necessary to run the search.
                size = line.size
                new_size = size + 1 + 2 + size.to_s.size # 1 comma, 2 double-quotes, and size of the size
                new_size += 1 if size.to_s.size < new_size.to_s.size # sum 1 if new_size had 1 more digit than size (e.g. 104 vs 99)
                line += ",\"#{new_size.to_s}\""
                output_file.puts line
            end
        end
        # close log
        l.done
    end
end