Module: PDFTDX::Parser
- Defined in:
- lib/pdftdx/parser.rb
Overview
Parser Module
Constant Summary
- LINE_REGEX =
Line Regex
/^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/
- MAX_CELL_LEN =
Maximum Cell Length (to be considered usable data)
100
- PAGE_OFF =
Page Offset
10000
- TITLE_CELL_REGEX =
Title Cell Regex
/<bbb>/
Class Method Summary
-
.contains_unusable(row_data) ⇒ Object
Contains Unusable Data (Empty / Long Strings).
-
.hfilter(s) ⇒ Object
HTML Filter.
-
.is_all_same(row_data) ⇒ Object
Is All Same Data.
-
.process_data(data) ⇒ Object
Process Data.
-
.process_page_files(page_data) ⇒ Object
Process Page Files.
-
.same_line(data, idx_a, idx_b) ⇒ Object
Check Same Line.
Class Method Details
.contains_unusable(row_data) ⇒ Object
Contains Unusable Data (Empty / Long Strings)
40 41 42 |
# File 'lib/pdftdx/parser.rb', line 40
# Contains Unusable Data (Empty / Long Strings)
#
# @param row_data [Hash] row cells; only the values are inspected and each
#   must respond to +empty?+ and +length+ (process_data passes
#   left offset => cell data)
# @return [Boolean] true when at least one cell is empty or longer than
#   MAX_CELL_LEN, false otherwise (including for an empty row)
def self.contains_unusable(row_data)
  row_data.any? { |_left, cell| cell.empty? || cell.length > MAX_CELL_LEN }
end
.hfilter(s) ⇒ Object
HTML Filter
65 66 67 |
# File 'lib/pdftdx/parser.rb', line 65
# HTML Filter
#
# Replaces HTML line-break tags with newline characters. Besides the literal
# "<br/>", the spellings "<br>" and "<br />" are accepted in any letter case.
#
# @param s [String] text that may contain line-break tags
# @return [String] a new string with every line-break tag replaced by "\n";
#   the argument itself is not modified
def self.hfilter(s)
  s.gsub(%r{<br\s*/?>}i, "\n")
end
.is_all_same(row_data) ⇒ Object
Is All Same Data
34 35 36 37 |
# File 'lib/pdftdx/parser.rb', line 34
# Is All Same Data
#
# @param row_data [Hash] row cells; only the values are compared
# @return [Boolean] true when every value equals the first value
#   (an empty or single-cell row therefore yields true)
def self.is_all_same(row_data)
  cells = row_data.values
  reference = cells.first
  cells.all? { |cell| cell == reference }
end
.process_data(data) ⇒ Object
Process Data
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/pdftdx/parser.rb', line 45
# Process Data
#
# Groups positioned text fragments into rows keyed by their :top value,
# discards rows that carry no usable table data, removes title cells and
# returns the remaining rows.
#
# @param data [Array<Hash>] fragments, each with :top, :left and :data keys
# @return [Array<Hash>] one Hash per kept row (left offset => cell data), in
#   first-seen order of :top; every returned row has at least two cells
def self.process_data(data)
  # Build Data Table: top offset => { left offset => cell data }
  table = {}
  data.each { |d| (table[d[:top]] ||= {})[d[:left]] = d[:data] }

  # Filter Table Rows (Remove Lone Elements & Footers).
  # `top % PAGE_OFF` is the offset inside a page once each page has been
  # shifted by a multiple of PAGE_OFF.
  # NOTE(review): 1110 looks like the footer threshold in px -- confirm.
  rows = table.reject do |top, row|
    row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row)
  end

  # Filter Table Cells, then drop rows left with fewer than two cells.
  # The result is the method's return value; non-bang `reject` is used
  # because `reject!` yields nil when nothing was removed.
  rows.map { |_top, row| row.reject { |_left, cell| TITLE_CELL_REGEX =~ cell } }
      .reject { |row| row.size < 2 }
end
.process_page_files(page_data) ⇒ Object
Process Page Files
70 71 72 73 74 75 76 77 78 |
# File 'lib/pdftdx/parser.rb', line 70
# Process Page Files
#
# Extracts every line matching LINE_REGEX from each page, turns it into a
# positioned fragment and hands the complete fragment list to process_data.
#
# @param page_data [Hash] page index => collection of lines for that page
#   (the index itself is ignored; pages are taken in iteration order)
# @return [Object] whatever process_data returns for the collected fragments
def self.process_page_files(page_data)
  # Build HTML Entity Decoder
  coder = HTMLEntities.new

  # Collect & Process File Data
  off = 0
  fragments = page_data.flat_map do |_idx, page|
    # Each page is shifted by a further PAGE_OFF (the first page starts at
    # PAGE_OFF, not 0), which keeps :top values of different pages apart as
    # long as a page is shorter than PAGE_OFF px.
    off += PAGE_OFF

    page.map { |line| LINE_REGEX.match(line) }.compact.map do |m|
      # Line-break tags are converted BEFORE entities are decoded, so an
      # escaped "&lt;br/&gt;" in the text stays literal instead of becoming
      # a newline.
      { top: off + m[1].to_i, left: m[2].to_i, data: coder.decode(hfilter(m[3])) }
    end
  end

  process_data(fragments)
end
.same_line(data, idx_a, idx_b) ⇒ Object
Check Same Line
29 30 31 |
# File 'lib/pdftdx/parser.rb', line 29
# Check Same Line
#
# @param data [Array<Hash>] fragments, each with a :top key
# @param idx_a [Integer] index of the first fragment
# @param idx_b [Integer] index of the second fragment
# @return [Boolean] true when both fragments exist and share the same :top
#   value; false when they differ or either index has no element
def self.same_line(data, idx_a, idx_b)
  first = data[idx_a]
  second = data[idx_b]
  # An index past the end (e.g. "next element" of the last one) has no line.
  return false unless first && second

  first[:top] == second[:top]
end