324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
|
# File 'lib/pagerecognizer.rb', line 324
# Selects, from the nodes returned by +recognize+, a subset of mutually
# non-overlapping rectangles that scores best on the alignment heuristics below.
#
# Pipeline (each stage is logged, and dumped to a file when +dump+ is given):
# 1. all       -- every recognized node, augmented with +midx+/+midy+ centre coordinates
# 2. inside    -- nodes lying fully within this node's bounding client rect
# 3. good      -- non-blacklisted tags, de-duplicated by geometry
# 4. rest      -- nodes that are not redundantly nested in another node
# 5. griddable -- nodes having both a horizontally and a vertically aligned neighbour
# 6. a time-bounded search over index combinations of +rest+, scored via PCBR
#
# @param dump [String, nil] file name prefix for the intermediate dumps
#   ("<dump>.all.htm", ".inside.htm", ".good.htm", ".rest.htm", ".griddable.htm");
#   nothing is written when nil
# @return [Array] the winning nodes, extended with Dumpable and Gridable
# @raise [ErrorNotEnoughNodes] when no recognized node lies inside the bounding rect
def grid dump = nil
  logger = Module.nesting.first.logger

  all = recognize
  logger.info "all nodes: #{all.size}"
  File.write "#{dump}.all.htm", all.extend(Dumpable).dump if dump

  # Rebuild every node as a struct that additionally carries its centre point.
  struct = Struct.new *all.first.members, :midx, :midy
  # midx/midy are the horizontal/vertical centres: origin plus half the extent.
  all.map!{ |i| struct.new *i.values, i.left + i.width / 2.0, i.top + i.height / 2.0 }
  # Largest area first; ties ordered by top, then left (descending after the reverse).
  all = all.sort_by{ |_| [_.area, _.top, _.left] }.reverse

  # Bounding rectangle of this node as reported by the browser
  # (round-tripped through JSON so that it arrives as a plain Hash).
  rect = page.evaluate("( function(node) { return JSON.parse(JSON.stringify(node.getBoundingClientRect())) } )(arguments[0])", self)
  # Keep only nodes that do not stick out of that rectangle on any side.
  inside = all.reject{ |i| i.left < rect["left"] || i.left + i.width > rect["right"] || i.top < rect["top"] || i.top + i.height > rect["bottom"] }
  raise ErrorNotEnoughNodes.new "no inside nodes", all: all, inside: inside if inside.empty?
  logger.info "inside nodes: #{inside.size}"
  File.write "#{dump}.inside.htm", inside.extend(Dumpable).dump if dump

  # Drop unwanted tags, then nodes with exactly the same geometry as an earlier one.
  good = inside.reject{ |i| %w{ button script svg path a img }.include? i.node.tag_name }.uniq{ |i| [i.height, i.width, i.top, i.left] }
  logger.info "good and unique: #{good.size}"
  File.write "#{dump}.good.htm", good.extend(Dumpable).dump if dump

  # True when rectangles a and b overlap with a non-zero area.
  interfere = lambda do |a, b|
    a.top < b.top + b.height &&
    b.top < a.top + a.height &&
    a.left < b.left + b.width &&
    b.left < a.left + a.width
  end

  # Reject a node when some other node b fully contains it AND both overlap
  # exactly the same set of nodes, i.e. keeping both would add nothing.
  rest = good.select.with_index do |a, i|
    good.each_with_index.none? do |b, j|
      next if i == j
      a.top >= b.top && a.top + a.height <= b.top + b.height &&
      a.left >= b.left && a.left + a.width <= b.left + b.width &&
      good.all?{ |c| interfere[a, c] == interfere[b, c] }
    end
  end
  logger.info "not nested: #{rest.size}"
  File.write "#{dump}.rest.htm", rest.extend(Dumpable).dump if dump

  # Repeat until the set stops shrinking: keep a node only if it has one
  # non-overlapping neighbour sharing > 0.9 of its horizontal extent and one
  # sharing > 0.9 of its vertical extent. Removing a node may orphan others,
  # hence the fixed-point loop.
  begin
    prev = rest.size
    rest.select!.with_index do |a, i|
      rest.each_with_index.any? do |b, j|
        # cw: length of the common horizontal span of a and b (0 when disjoint)
        cw = [[a.left + a.width, b.left + b.width].min - [a.left, b.left].max, 0].max
        i != j && !interfere[a, b] && [cw, a.width].min.fdiv(a.width) * [cw, b.width].min.fdiv(b.width) > 0.9
      end and
      rest.each_with_index.any? do |b, j|
        # ch: length of the common vertical span of a and b (0 when disjoint)
        ch = [[a.top + a.height, b.top + b.height].min - [a.top, b.top].max, 0].max
        i != j && !interfere[a, b] && [ch, a.height].min.fdiv(a.height) * [ch, b.height].min.fdiv(b.height) > 0.9
      end
    end
  end until prev == rest.size
  logger.info "gridable: #{rest.size}"
  File.write "#{dump}.griddable.htm", rest.extend(Dumpable).dump if dump

  # Loaded lazily: only this method needs it.
  require "pcbr"
  pcbr = PCBR.new
  max, past = 0, []   # best score seen so far; bitmasks of combinations already expanded
  prev = nil          # time of the last improvement of +max+
  prev_max = nil      # combination that produced +max+
  time = Time.now     # search start, used by the time limits below
  # Enabled scoring components; only :AREA is consulted in this method.
  heuristics = %i{ SIZE AREA }

  # Overlap ratio of two 1-D segments given as (start, length): the product of
  # the fractions of each segment covered by their common part, 0.0..1.0.
  inter = lambda do |a1, a2, b1, b2|
    c = [[a1 + a2, b1 + b2].min - [a1, b1].max, 0].max
    [c, a2].min.fdiv(a2) * [c, b2].min.fdiv(b2)
  end

  # Recursive search step: +is+ is an ascending array of indexes into +rest+.
  # Tries every extension of +is+ by one larger index, scores it, then
  # recurses into the best not-yet-expanded combination stored so far.
  lp = lambda do |is|
    # Remember this combination (encoded as a bitmask) as already expanded.
    past.push is.map{ |i| 2**i }.reduce(:+)
    rest.size.times do |ij|
      # Only append indexes greater than the last one, so combinations stay sorted.
      next if ij <= is.last unless is.empty?
      sorted = is + [ij]
      next if pcbr.set.include? sorted
      # The candidate must not overlap any node already in the combination.
      next if is.any?{ |j| interfere[rest[ij], rest[j]] }
      sol = rest.values_at *sorted
      # Groups of indexes into +sol+ along each axis -- presumably columns (xn)
      # and rows (yn); see +piles+ for the exact grouping rule.
      xn = Module.nesting.first.piles sol.map{ |s| [s.left, s.width] }
      yn = Module.nesting.first.piles sol.map{ |s| [s.top, s.height] }
      # From 4 nodes on, skip when an x-group and a y-group share more than one node.
      next if xn.product(yn).any?{ |i,j| (i & j).size > 1 } if sorted.size >= 4
      # Score vector: total area (when enabled), then the mean pairwise overlap
      # ratio within the x-groups and within the y-groups (single-node groups count as 0).
      pcbr.store sorted, [
        *( sol.map(&:area).reduce(:+) if heuristics.include? :AREA ),
        xn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.left, s1.width, s2.left, s2.width] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / xn.size,
        yn.map{ |g| sosol = sol.values_at *g; next 0 if sosol.size == 1; sosol.combination(2).map{ |s1, s2| inter[s1.top, s1.height, s2.top, s2.height] }.reduce(:+) / sosol.size / (sosol.size - 1) * 2 }.reduce(:+) / yn.size,
      ]
      # Time limits, checked once at least one improvement happened and 3s have passed.
      if prev && Time.now - time > 3
        logger.debug "check"
        break logger.info "break 0" if Time.now - time > 30    # hard cap on total search time
        break logger.info "break 1" if Time.now - prev > 10    # no improvement for 10s
        m = pcbr.table.reject{ |i| i.first.size < 3 }.map(&:last).max
        # Stalled for twice as long as it took to reach the last improvement,
        # and exactly one table entry holds the top score among combinations of 3+.
        break logger.debug "break 2" if Time.now - prev > (prev - time) * 2 && 1 == pcbr.table.count{ |i| i.last == m }
      end
      # Best-scored combination that has not been expanded yet; stop when none is left.
      break logger.info "break 3" unless t = pcbr.table.reject{ |is,| past.include? is.map{ |i| 2**i }.reduce(:+) }.max_by(&:last)
      logger.debug [t.last, max, t.first == prev_max, t.first.map{ |i| 2**i }.reduce(:+)]
      if t.last > max && t.first != prev_max
        prev, max, prev_max = Time.now, t.last, t.first
        logger.debug [pcbr.table.size, max, t.first]
      end
      lp.call t.first
    end
  end
  lp.call []

  pcbr.table.max_by(20, &:last).each_with_index{ |_, i| logger.debug "##{i} #{_}" }
  # The top-scored combination of indexes, mapped back to nodes.
  rest.values_at(*pcbr.table.max_by(&:last).first).extend Dumpable, Gridable
end
|