Module: PDFTDX::Parser

Defined in:
lib/pdftdx/parser.rb

Overview

Parser Module

Constant Summary

LINE_REGEX =

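Line Regex (matches a positioned `<p style=...>` line, capturing its top offset, its left offset, and its inner content)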

/^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/
MAX_CELL_LEN =

Maximum Cell Length (to be considered usable data)

100
PAGE_OFF =

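Page Offset (added once per page to each line's top coordinate; `top % PAGE_OFF` gives the position within the page)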

10000
TITLE_CELL_REGEX =

Title Cell Regex

/<bbb>/

Class Method Summary

Class Method Details

.contains_unusable(row_data) ⇒ Object

Contains Unusable Data (Empty / Long Strings)



40
41
42
# File 'lib/pdftdx/parser.rb', line 40

# Reports whether a row holds at least one cell that cannot be used as data.
#
# A cell is unusable when its text is empty or longer than MAX_CELL_LEN.
#
# @param row_data [Hash] row cells; only each entry's value (the cell text) is inspected
# @return [Boolean] true if any cell is empty or too long, false otherwise
#   (including for an empty row)
def self.contains_unusable(row_data)
  row_data.any? do |_left, text|
    text.length.zero? || text.length > MAX_CELL_LEN
  end
end

.hfilter(s) ⇒ Object

HTML Filter



65
66
67
# File 'lib/pdftdx/parser.rb', line 65

# Converts HTML line-break tags into newline characters.
#
# Only the exact tag '<br/>' is replaced; any other markup is left untouched.
#
# @param s [String] text that may contain '<br/>' tags
# @return [String] a new string with every '<br/>' replaced by "\n"
def self.hfilter(s)
  line_break_tag = '<br/>'
  s.gsub(line_break_tag) { "\n" }
end

.is_all_same(row_data) ⇒ Object

Is All Same Data



34
35
36
37
# File 'lib/pdftdx/parser.rb', line 34

# Reports whether every cell in a row carries the same value.
#
# Each value is compared against the value stored under the row's first key.
#
# @param row_data [Hash] row cells
# @return [Boolean] true when all values equal the first one (also true for an
#   empty row), false otherwise
def self.is_all_same(row_data)
  reference = row_data[row_data.keys.first]
  row_data.all? { |_key, value| value == reference }
end

.process_data(data) ⇒ Object

Process Data



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/pdftdx/parser.rb', line 45

# Builds a table from positioned text cells and filters out rows and cells
# that do not look like tabular data.
#
# Cells are grouped into rows by their :top value and keyed by :left within a
# row. A row is dropped when it has fewer than two cells, when its position
# within the page (top % PAGE_OFF) is 1110 or more (treated as a footer), when
# all of its cells are identical, or when it contains unusable cells. Cells
# matching TITLE_CELL_REGEX are then removed, and rows left with fewer than
# two cells are dropped as well.
#
# @param data [Array<Hash>] cells, each providing :top, :left and :data
# @return [Array<Hash>] the surviving rows, each mapping left => data, in the
#   order their :top value first appeared in the input
def self.process_data(data)
  # Build Data Table (top => { left => data })
  table = {}
  data.each { |cell| (table[cell[:top]] ||= {})[cell[:left]] = cell[:data] }

  # Filter Table Rows (Remove Lone Elements & Footers)
  kept = table.reject do |top, row|
    row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row)
  end

  # Filter Table Cells (Remove Title Cells)
  rows = kept.values.map { |row| row.reject { |_left, text| TITLE_CELL_REGEX =~ text } }

  # Cleanup Table: removing title cells can leave a row with fewer than two
  # cells, so the minimum-size rule is applied again.
  rows = rows.reject { |row| row.size < 2 }

  # Debug trace goes to stderr and only when Ruby runs in debug mode, so the
  # method no longer writes to stdout on every call.
  warn "=============> #{rows}" if $DEBUG

  # Return the table instead of the nil produced by a trailing print.
  rows
end

.process_page_files(page_data) ⇒ Object

Process Page Files



70
71
72
73
74
75
76
77
78
# File 'lib/pdftdx/parser.rb', line 70

# Extracts positioned text cells from every page and hands them to
# process_data.
#
# Each page is expected to be a list of lines. Lines matching LINE_REGEX are
# turned into { top:, left:, data: } cells; other lines are ignored. A running
# offset, increased by PAGE_OFF for every page (so the first page starts at
# PAGE_OFF), is added to each cell's top coordinate. Cell content is
# entity-decoded and passed through hfilter.
#
# @param page_data [Hash, Array] enumerable yielding (index, lines) pairs; the
#   index is not used
# @return [Object] whatever process_data returns for the collected cells
def self.process_page_files(page_data)
  # Build HTML Entity Decoder
  coder = HTMLEntities.new

  # Collect File Data
  off = 0
  cells = page_data.flat_map do |_idx, page|
    off += PAGE_OFF

    # Match each line once; non-matching lines yield nil and are dropped.
    page.map { |line| LINE_REGEX.match(line) }.compact.map do |m|
      { top: off + m[1].to_i, left: m[2].to_i, data: hfilter(coder.decode(m[3])) }
    end
  end

  # Process File Data (the listing called `process`, which this module does
  # not define; process_data is the method that consumes these cells)
  process_data cells
end

.same_line(data, idx_a, idx_b) ⇒ Object

Check Same Line



29
30
31
# File 'lib/pdftdx/parser.rb', line 29

# Reports whether two entries of a data list share the same :top value.
#
# @param data [Array<Hash>] indexable collection of entries exposing :top
# @param idx_a [Integer] index of the first entry
# @param idx_b [Integer] index of the second entry
# @return [Boolean] true when both entries have equal :top values
def self.same_line(data, idx_a, idx_b)
  top_a = data[idx_a][:top]
  top_b = data[idx_b][:top]
  top_a == top_b
end