2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# File 'lib/udise_school_report_reader/block_rectangle_combiner.rb', line 2
# Pairs every positioned text block with the smallest rectangle that
# encloses it, then appends each rectangle that encloses no text at all as
# an entry with empty text.
#
# Blocks without :x or :y are dropped, as are rectangles without :x, :y or a
# positive :width / :height. Dropped rectangles, and dropped blocks whose
# text is non-empty, are reported through +warn+.
#
# A rectangle that encloses text but is never the smallest enclosing one for
# any block does not appear in the result.
#
# @param blocks [Array<Hash>] text blocks; the keys read are :page, :text,
#   :x, :y, :font and :font_size
# @param rects [Array<Hash>] rectangles; the keys read are :page, :x, :y,
#   :width, :height, :stroke_color, :fill_color and :line_width
# @return [Array<Hash>] entries with the keys :page, :text, :text_x, :text_y,
#   :font, :font_size, :rect_x, :rect_y, :rect_width, :rect_height,
#   :stroke_color, :fill_color and :line_width, ordered by page, then by
#   descending y, then by ascending x (rectangle coordinates when the entry
#   has a rectangle, text coordinates otherwise); entries whose keys tie
#   keep the order in which they were produced
def self.combine(blocks, rects)
  valid_blocks, unpositioned = blocks.partition { |block| block[:x] && block[:y] }
  # Only unpositioned blocks that actually carry text are worth a warning.
  lost_blocks = unpositioned.reject { |block| block[:text].to_s.empty? }
  if lost_blocks.any?
    warn "Warning: Found #{lost_blocks.size} non-empty blocks with missing coordinates"
    lost_blocks.each { |block| warn " - Page #{block[:page]}: '#{block[:text]}'" }
  end

  valid_rects, bad_rects = rects.partition { |rect| usable_rect?(rect) }
  if bad_rects.any?
    warn "Warning: Found #{bad_rects.size} rectangles with invalid coordinates"
    bad_rects.each do |rect|
      warn " - Page #{rect[:page]}: x=#{rect[:x]}, y=#{rect[:y]}, w=#{rect[:width]}, h=#{rect[:height]}"
    end
  end

  # Rectangles seen to enclose at least one block, tracked by object identity
  # so that the text-free rectangles can be found without a second scan.
  occupied = {}.compare_by_identity

  entries = valid_blocks.map do |block|
    enclosing = valid_rects.select { |rect| encloses?(rect, block) }
    enclosing.each { |rect| occupied[rect] = true }
    tightest = enclosing.min_by { |rect| rect[:width] * rect[:height] }
    combined_entry(page: block[:page], text: block[:text], block: block, rect: tightest)
  end

  valid_rects.each do |rect|
    next if occupied.key?(rect)

    entries << combined_entry(page: rect[:page], text: "", rect: rect)
  end

  # sort_by gives no stability guarantee, and every block matched to the same
  # rectangle has the same (page, y, x) key. The trailing position keeps such
  # entries in the order they were produced.
  entries.sort_by.with_index do |entry, position|
    [
      entry[:page],
      -(entry[:rect_y] || entry[:text_y] || 0),
      entry[:rect_x] || entry[:text_x] || 0,
      position
    ]
  end
end

# Tells whether a rectangle has an origin and a strictly positive size.
#
# @api private
# @param rect [Hash] rectangle with :x, :y, :width and :height
# @return [Object] truthy when the rectangle can take part in matching
def self.usable_rect?(rect)
  rect[:x] && rect[:y] && rect[:width] && rect[:height] && rect[:width] > 0 && rect[:height] > 0
end

# Tells whether a block's anchor point lies on the same page as, and inside
# or on the edge of, a rectangle.
#
# @api private
# @param rect [Hash] rectangle with :page, :x, :y, :width and :height
# @param block [Hash] text block with :page, :x and :y
# @return [Boolean]
def self.encloses?(rect, block)
  rect[:page] == block[:page] &&
    block[:x] >= rect[:x] &&
    block[:x] <= (rect[:x] + rect[:width]) &&
    block[:y] >= rect[:y] &&
    block[:y] <= (rect[:y] + rect[:height])
end

# Builds one result entry; the text fields are nil when no block is given
# and the rectangle fields are nil when no rectangle is given.
#
# @api private
# @param page [Object] page value stored in the entry
# @param text [Object] text value stored in the entry
# @param block [Hash, nil] source of :x, :y, :font and :font_size
# @param rect [Hash, nil] source of the geometry and styling fields
# @return [Hash]
def self.combined_entry(page:, text:, block: nil, rect: nil)
  block ||= {}
  rect ||= {}
  {
    page: page,
    text: text,
    text_x: block[:x],
    text_y: block[:y],
    font: block[:font],
    font_size: block[:font_size],
    rect_x: rect[:x],
    rect_y: rect[:y],
    rect_width: rect[:width],
    rect_height: rect[:height],
    stroke_color: rect[:stroke_color],
    fill_color: rect[:fill_color],
    line_width: rect[:line_width]
  }
end
|