Module: PageRecognizer

Defined in:
lib/pagerecognizer.rb

Defined Under Namespace

Modules: Dumpable, Gridable Classes: ErrorNotEnoughNodes

Class Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Attribute Details

.loggerObject

Returns the value of attribute logger.



3
4
5
# File 'lib/pagerecognizer.rb', line 3

# Reader for the logger attribute: hands back whatever is currently stored
# in @logger (nil when nothing has been assigned yet).
def logger
  instance_variable_get(:@logger)
end

Class Method Details

.dist(h1, s1, v1, h2, s2, v2) ⇒ Object

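Squared distance between two HSV colours. Each argument is expected on the 0…255 scale (<256, <256, <256), not the <=360, <=1, <=1 range that rgb2hsv returns.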


50
51
52
53
54
55
56
57
58
# File 'lib/pagerecognizer.rb', line 50

# Squared distance between two HSV colours.
#
# All six arguments are expected on the same scale:   # [<256, <256, <256]
# the hue difference is converted from 256 units per turn to radians, and
# chroma is derived as saturation * value / 256.
# https://en.wikipedia.org/wiki/HSL_and_HSV#/media/File:Hsl-hsv_saturation-lightness_slices.svg
def self.dist h1, s1, v1, h2, s2, v2
  chroma1 = s1 * v1 / 256.0
  chroma2 = s2 * v2 / 256.0
  # height of each colour along the vertical axis
  height1 = v1 * (2 - chroma1 / 256)
  height2 = v2 * (2 - chroma2 / 256)
  # hue difference as an angle in radians, normalised to a single turn
  angle = (((h2 - h1) * 360 / 256.0) % 360) / (180 / Math::PI)
  # the first colour is placed at (0, chroma1); the second is rotated by the angle
  dx = Math.sin(angle) * chroma2
  dy = chroma1 - Math.cos(angle) * chroma2
  dz = height1 - height2
  dx * dx + dy * dy + dz * dz
end

.load(str) ⇒ Object



24
25
26
27
28
29
30
31
32
# File 'lib/pagerecognizer.rb', line 24

# Rebuilds a node list from an HTML string.
#
# Every <div> found in +str+ becomes a struct with the members
# node, top, left, width and height:
# * node is a stub object that only answers #tag_name, returning the div's text;
# * the four geometry members are read from the div's inline style
#   ("top: ..; left: ..; ...") and converted with #to_f.
#
# The resulting Array is extended with Dumpable.
def self.load str
  require "nokogiri"   # loaded lazily, only when a dump is actually parsed
  # The struct classes are built once rather than once per node.
  # %i{} is the symbol-array literal; without the leading % this line
  # would be a call to an undefined method named `i`.
  node_struct = Struct.new(*%i{ node top left width height })
  tag_struct = Struct.new(:tag_name)
  Nokogiri::HTML(str).css("div").map do |n|
    node_struct.new tag_struct.new(n.text),
      *n[:style].scan(/(\S+): ([^\;]+)/).to_h.values_at(
                    *%w{ top left width height }
      ).map(&:to_f)
  end.extend Dumpable
end

.piles(z) ⇒ Object



300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/pagerecognizer.rb', line 300

# Groups overlapping 1-D intervals into "piles".
#
# z is an Array of [start, length] pairs. The intervals are visited in sorted
# order (ties broken by original position); an interval joins the current
# pile while it starts strictly before the furthest end seen in that pile,
# otherwise it opens a new pile.
#
# Returns an Array of piles, each one an Array of indices into z.
# An empty z yields [[]].
def self.piles z
  reach = nil      # furthest right edge covered by the current pile
  groups = [[]]
  z.each_with_index.sort.each do |(start, length), index|
    finish = start + length
    if reach.nil? || reach > start
      groups.last << index
      reach = finish if reach.nil? || reach < finish
    else
      groups << [index]
      reach = finish
    end
  end
  groups
end

.rgb2hsv(r, g, b) ⇒ Object

<256, <256, <256


34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/pagerecognizer.rb', line 34

# Converts an RGB colour to HSV.
#
# r, g, b are the channel values                     # [<256, <256, <256]
# Returns [hue in degrees, saturation, value]        # [<=360, <=1, <=1]
# A colour without chroma (r == g == b) gets hue 0 and saturation 0.
def self.rgb2hsv r, g, b
  # http://stackoverflow.com/q/41926874/322020
  red, green, blue = [r, g, b].map { |channel| channel.fdiv 255 }
  darkest, brightest = [red, green, blue].minmax
  spread = brightest - darkest   # chroma
  return [0.0, 0.0, brightest] if spread.zero?

  # position on the colour wheel in sixths of a turn, chosen by the dominant channel
  sector =
    if    brightest == red   then (green - blue) / spread
    elsif brightest == green then (blue - red) / spread + 2
    elsif brightest == blue  then (red - green) / spread + 4
    else 0
    end
  [60.0 * (sector % 6), spread / brightest, brightest]
end

Instance Method Details

#cols(heuristics, try_min: nil, dump: nil, &b) ⇒ Object



296
297
298
# File 'lib/pagerecognizer.rb', line 296

# Column-wise split: delegates to #split with width as the first dimension
# and left as the first coordinate, followed by height/top.
# heuristics, try_min, dump and the block are forwarded unchanged.
def cols heuristics, try_min: nil, dump: nil, &b
  split(:width, :height, :left, :top, heuristics, try_min, dump, &b)
end

#grid(dump = nil) ⇒ Object



324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
# File 'lib/pagerecognizer.rb', line 324

# Picks, among the nodes recognized under the receiver, the subset that best
# forms a grid.
#
# Steps:
# 1. take everything #recognize returns and keep what lies fully inside the
#    receiver's bounding client rect;
# 2. drop tags that are not treated as containers and geometric duplicates;
# 3. drop nodes nested in another node that overlaps exactly the same nodes;
# 4. repeatedly drop nodes that have no well-aligned, non-overlapping
#    neighbour both horizontally and vertically;
# 5. run a time-limited search over subsets of the remaining nodes, scoring
#    each subset with PCBR, and return the best-scored one.
#
# dump - optional path prefix; when given, each intermediate node set is
#        written to "#{dump}.<stage>.htm".
#
# Returns the chosen nodes as an Array extended with Dumpable and Gridable.
# Raises ErrorNotEnoughNodes when no node lies inside the receiver's rect.
def grid dump = nil
  logger = Module.nesting.first.logger

  all = recognize
  logger.info "all nodes: #{all.size}"
  File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump

  # adding the fields for faster upcoming computations
  struct = Struct.new *all.first.members, :midx, :midy
  # midx/midy are the centre of the node: origin plus half of the extent
  all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top + i.height / 2.0 }
  all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse

  # bounding rect of the receiver, serialised through JSON so it arrives as a plain Hash
  rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
  inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
  raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
  logger.info "inside nodes: #{inside.size}"
  File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump
  good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
  logger.info "good and unique: #{good.size}"   # only those that might be containers
  File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump

  # large = good#.select{ |i| i[ww] > good.map(&ww).max / 4 }
  # logger.info "large enough: #{large.size}"

  # true when the two rectangles overlap (rectangles that only touch do not count)
  interfere = lambda do |a, b|
    a.top < b.top + b.height &&
    b.top < a.top + a.height &&
    a.left < b.left + b.width &&
    b.left < a.left + a.width
  end

  # keep a node unless some other node contains it and overlaps exactly the same set of nodes
  rest = good.select.with_index do |a, i|
    good.each_with_index.none? do |b, j|
      next if i == j
      a.top >= b.top && a.top + a.height <= b.top + b.height &&
      a.left >= b.left && a.left + a.width <= b.left + b.width &&
      good.all?{ |c| interfere[a, c] == interfere[b, c] }
    end
  end
  logger.info "not nested: #{rest.size}"
  File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump
  # iterate to a fixed point: removing one node may leave another without aligned neighbours
  begin
    prev = rest.size
    rest.select!.with_index do |a, i|
      # needs a non-overlapping neighbour sharing (almost) the same horizontal span ...
      rest.each_with_index.any? do |b, j|
        cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
        i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
      end and
      # ... and one sharing (almost) the same vertical span
      rest.each_with_index.any? do |b, j|
        ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
        i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
      end
    end
  end until prev == rest.size
  logger.info "gridable: #{rest.size}"
  File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump

  require "pcbr"
  pcbr = PCBR.new
  max, past = 0, []
  prev = nil        # time of the last improvement of the best score
  prev_max = nil    # index set that produced the current best score
  time = Time.now   # search start
  heuristics = %i{ SIZE AREA }
  # overlap score of two 1-D segments given as (start, length): product of the covered fractions
  inter = lambda do |a1, a2, b1, b2|
    c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
    [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
  end
  # recursive search step: extends the index set `is` by one higher index at a time
  lp = lambda do |is|
    # remember this set (encoded as a bitmask) so it is not expanded again
    past.push is.map{ |i| 2**i }.reduce(:+)
    rest.size.times do |ij|
      next if ij <= is.last unless is.empty?
      sorted = is + [ij]
      next if pcbr.set.include? sorted
      next if is.any?{ |j| interfere[rest[ij], rest[j]] }
      sol = rest.values_at *sorted
      # column and row piles of the candidate set
      xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
      yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
      # from four nodes on, no two nodes may share both a column pile and a row pile
      next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
      # score vector: total area, mean pairwise alignment inside column piles, same for row piles
      pcbr.store sorted, [
        *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
        xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
        yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
      ]
      # time limits, checked only after a first improvement and 3 seconds of searching
      if prev && Time.now - time > 3
        logger.debug "check"
        break logger.info "break 0" if Time.now - time > 30
        break logger.info "break 1" if Time.now - prev > 10
        m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
        break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
      end

      # continue from the best-scored set that has not been expanded yet
      break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
      logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
      if t.last > max && t.first != prev_max
        prev, max, prev_max = Time.now, t.last, t.first
        logger.debug [pcbr.table.size, max, t.first]
      end
      lp.call t.first
    end
  end
  lp.call []
  # TODO: if multiple with max score, take the max by area
  pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
  rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
end

#rows(heuristics, try_min: nil, dump: nil, &b) ⇒ Object



293
294
295
# File 'lib/pagerecognizer.rb', line 293

# Row-wise split: delegates to #split with height as the first dimension
# and top as the first coordinate, followed by width/left.
# heuristics, try_min, dump and the block are forwarded unchanged.
def rows heuristics, try_min: nil, dump: nil, &b
  split(:height, :width, :top, :left, heuristics, try_min, dump, &b)
end