Module: Tabula

Defined in:
lib/tabula.rb,
lib/tabula/version.rb,
lib/tabula/writers.rb,
lib/tabula/extraction.rb,
lib/tabula/entities/cell.rb,
lib/tabula/entities/line.rb,
lib/tabula/entities/page.rb,
lib/tabula/table_guesser.rb,
lib/tabula/entities/table.rb,
lib/tabula/entities/ruling.rb,
lib/tabula/table_extractor.rb,
lib/tabula/entities/tabular.rb,
lib/tabula/entities/has_cells.rb,
lib/tabula/entities/page_area.rb,
lib/tabula/entities/text_chunk.rb,
lib/tabula/entities/spreadsheet.rb,
lib/tabula/entities/zone_entity.rb,
lib/tabula/entities/text_element.rb,
lib/tabula/spreadsheet_extractor.rb,
lib/tabula/entities/text_element_index.rb

Defined Under Namespace

Modules: AbstractInterface, Extraction, HasCells, TableGuesser, Tabular, Writers

Classes: Cell, Line, Page, PageArea, Ruling, Spreadsheet, Table, TextChunk, TextElement, TextElementIndex, ZoneEntity

Constant Summary collapse

PDFBOX =
'pdfbox-app-2.0.0-SNAPSHOT.jar'
ONLY_SPACES_RE =
Regexp.new('^\s+$')
SAME_CHAR_RE =
Regexp.new('^(.)\1+$')
VERSION =
'0.7.6'

Class Method Summary collapse

Class Method Details

.extract_table(pdf_path, page, area, options = {}) ⇒ Object

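Extract a table from the file pdf_path, restricted to the given page and area.

page can be a single integer (1-based) or an array of integers; when an array is given, only the first page returned by the extractor is used. area can be a Tabula::ZoneEntity or an array of [top, left, bottom, right].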

Options

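:password - Password if encrypted PDF (default: empty)
:detect_ruling_lines - Try to detect vertical rulings (default: true)
:vertical_rulings - List of positions for vertical rulings. Overrides :detect_ruling_lines. (default: [])
:extraction_method - "spreadsheet", "original" or "guess"; any value other than the first two lets the page decide via is_tabular? (default: "guess")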



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/tabula/table_extractor.rb', line 27

# Extract a table from one page of the PDF at +pdf_path+, limited to +area+.
#
# pdf_path - path of the PDF, passed to Extraction::ObjectExtractor.
# page     - an Integer page number or an array of page numbers. An Integer
#            is wrapped in an array; only the first page yielded by the
#            extractor is used.
# area     - a Tabula::ZoneEntity, or an Array of [top, left, bottom, right]
#            which is converted to a ZoneEntity.
# options  - Hash of options, merged over these defaults:
#            :password            - password for an encrypted PDF (default: '')
#            :detect_ruling_lines - try to detect vertical rulings (default: true)
#            :vertical_rulings    - explicit vertical rulings; when non-empty,
#                                   detection is skipped (default: [])
#            :extraction_method   - "spreadsheet", "original", or anything else
#                                   (default: "guess") to let the page decide
#                                   via +is_tabular?+
#
# Returns the combined spreadsheets of the area (or Spreadsheet.empty) when the
# spreadsheet method is used, otherwise the result of +get_table+ on the area.
def Tabula.extract_table(pdf_path, page, area, options={})
  options = {
    :password => '',
    :detect_ruling_lines => true,
    :vertical_rulings => [],
    :extraction_method => "guess",
  }.merge(options)

  if area.instance_of?(Array)
    top, left, bottom, right = area
    # ZoneEntity takes (top, left, width, height)
    area = Tabula::ZoneEntity.new(top, left,
                                  right - left, bottom - top)
  end

  page = [page] if page.is_a?(Integer)

  extractor = Extraction::ObjectExtractor.new(pdf_path,
                                              page,
                                              options[:password])

  # Always release the extractor, even when fetching the page raises
  # (previously a failure here skipped close! entirely).
  begin
    pdf_page = extractor.extract.next
  ensure
    extractor.close!
  end

  if ["spreadsheet", "original"].include? options[:extraction_method]
    use_spreadsheet_extraction_method = options[:extraction_method] == "spreadsheet"
  else
    # "guess" (or any unrecognised value): let the page decide
    use_spreadsheet_extraction_method = pdf_page.is_tabular?
  end

  if use_spreadsheet_extraction_method
    spreadsheets = pdf_page.get_area(area).spreadsheets
    return spreadsheets.empty? ? Spreadsheet.empty(pdf_page) : spreadsheets.inject(&:+)
  end

  use_detected_lines = false
  if options[:detect_ruling_lines] && options[:vertical_rulings].empty?
    detected_vertical_rulings = Ruling.crop_rulings_to_area(pdf_page.vertical_ruling_lines,
                                                            area)

    # only use lines if there are more than two of them and at least 80%
    # of them cover more than 90% of the height of the area of interest

    # TODO this heuristic SUCKS
    # what if only a couple columns is delimited with vertical rulings?
    # ie: https://www.dropbox.com/s/lpydler5c3pn408/S2MNCEbirdisland.pdf (see 7th column)
    # idea: detect columns without considering rulings, detect vertical rulings
    # calculate ratio and try to come up with a threshold
    use_detected_lines = detected_vertical_rulings.size > 2 \
    && (detected_vertical_rulings.count { |vl|
          vl.height / area.height > 0.9
        } / detected_vertical_rulings.size.to_f) >= 0.8

  end

  # Detected rulings win only when the heuristic above accepted them;
  # otherwise fall back to the caller-supplied list (possibly empty).
  pdf_page
    .get_area(area)
    .get_table(:vertical_rulings => use_detected_lines ? detected_vertical_rulings : options[:vertical_rulings])

end

.group_by_lines(text_chunks) ⇒ Object



8
9
10
11
# File 'lib/tabula/table_extractor.rb', line 8

# Deprecated: emits a deprecation warning and forwards +text_chunks+
# unchanged to Tabula::TextChunk.group_by_lines, returning its result.
def Tabula.group_by_lines(text_chunks)
  warn 'Tabula.group_by_lines is DEPRECATED. Use Tabula::TextChunk.group_by_lines instead.'
  TextChunk.group_by_lines(text_chunks)
end

.make_table(page, area, options = {}) ⇒ Object

Returns an array of Tabula::Line



14
15
16
17
# File 'lib/tabula/table_extractor.rb', line 14

# Deprecated: emits a deprecation warning, then builds the table from the
# given +area+ of +page+, passing +options+ straight through to
# +make_table+ on that area. Returns whatever that call returns.
def Tabula.make_table(page, area, options = {})
  warn('Tabula.make_table is DEPRECATED. Use Tabula::Page#make_table instead.')
  zone = page.get_area(area)
  zone.make_table(options)
end

.merge_words(text_elements, options = {}) ⇒ Object



3
4
5
6
# File 'lib/tabula/table_extractor.rb', line 3

# Deprecated: emits a deprecation warning and forwards both arguments
# unchanged to Tabula::TextElement.merge_words, returning its result.
def Tabula.merge_words(text_elements, options = {})
  warn('Tabula.merge_words is DEPRECATED. Use Tabula::TextElement.merge_words instead')
  merged = TextElement.merge_words(text_elements, options)
  merged
end