2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
# File 'lib/udise_school_report_reader/pdf_block_extractor.rb', line 2
# Collects the positioned text blocks found in the raw content of every page.
#
# A block opens on a line carrying the `BT` operator and closes on the next
# line carrying `ET`. Between the two, the first `1 0 0 1 x y Tm` line gives
# the position, the first `/F<n> <size> Tf` line gives the font, and every
# `(...) Tj` line contributes one piece of text. Blocks that end up with no
# text or no position are dropped. Only non-negative coordinates are
# recognised by the position pattern.
#
# NOTE(review): the method name was missing from the text this block was
# recovered from; `extract_blocks` is presumed - confirm against the callers.
#
# @param reader [#pages] object whose `pages` each respond to `raw_content`
#   (presumably a PDF::Reader - confirm)
# @return [Array<Hash>] one Hash per block with the keys :page (1-based),
#   :start_line, :text (pieces joined with a space), :x, :y, :end_line and,
#   when a font line was seen, :font and :font_size
def self.extract_blocks(reader)
  position_re = /1\s+0\s+0\s+1\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+Tm/
  font_re = %r{/F(\d+)\s+(\d+(\.\d+)?)\s+Tf}
  show_text_re = /\((.*?)\)\s*Tj/
  string_literal_re = /\((?:\\.|[^\\)])*\)/
  begin_text_re = /\bBT\b/
  end_text_re = /\bET\b/

  blocks = []
  reader.pages.each_with_index do |page, index|
    page_number = index + 1
    current_block = nil

    page.raw_content.each_line do |line|
      # Look for BT/ET with string literals removed, so that shown text such as
      # "(SUBTOTAL) Tj" is never mistaken for an operator.
      operators = line.gsub(string_literal_re, '')

      if operators.match?(begin_text_re)
        current_block = { page: page_number, start_line: line.strip, text: [] }
        next
      end

      # Anything outside BT..ET has no block to contribute to.
      next unless current_block

      if (position = line.match(position_re))
        # Only the first position inside a block is kept.
        unless current_block[:x] && current_block[:y]
          current_block[:x] = position[1].to_f
          current_block[:y] = position[2].to_f
        end
      elsif (font = line.match(font_re))
        # Only the first font selection inside a block is kept.
        unless current_block[:font] && current_block[:font_size]
          current_block[:font] = "F#{font[1]}"
          current_block[:font_size] = font[2].to_f
        end
      elsif (shown = line.match(show_text_re))
        # Drop the backslashes used to escape characters inside the literal.
        current_block[:text] << shown[1].gsub(/\\/, '')
      elsif operators.match?(end_text_re)
        text = current_block[:text].join(' ')
        if !text.empty? && current_block[:x] && current_block[:y]
          blocks << current_block.merge(text: text, end_line: line.strip)
        end
        # Close the block so later stray operators cannot touch an emitted one.
        current_block = nil
      end
    end
  end
  blocks
end
|