Class: Docsplit::TextExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/docsplit/text_extractor.rb

Overview

Delegates to pdftotext and tesseract in order to extract text from PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or forbid OCR extraction, but by default the heuristic works like this:

* Check for the presence of fonts in the PDF. If no fonts are detected,
  OCR is used automatically.
* Extract the text of each page with **pdftotext**. If a page has fewer
  than 100 bytes of text (a scanned image page, or a page that just
  contains a filename and a page number), add it to the list of
  `@pages_to_ocr`.
* Re-OCR each page in the `@pages_to_ocr` list at the end.

Constant Summary collapse

NO_TEXT_DETECTED =
/---------\n\Z/
OCR_FLAGS =
'-density 400x400 -colorspace GRAY'
MEMORY_ARGS =
'-limit memory 256MiB -limit map 512MiB'
MIN_TEXT_PER_PAGE =
100 # in bytes

Instance Method Summary collapse

Constructor Details

#initialize ⇒ TextExtractor

Returns a new instance of TextExtractor.



24
25
26
# File 'lib/docsplit/text_extractor.rb', line 24

# Creates an extractor whose queue of pages awaiting OCR starts out empty.
def initialize
  # Page numbers collected here are later handed to extract_from_ocr by
  # extract, but only when the list is non-empty.
  @pages_to_ocr = []
end

Instance Method Details

#contains_text?(pdf) ⇒ Boolean

Does a PDF have any text embedded?

Returns:

  • (Boolean)


47
48
49
50
# File 'lib/docsplit/text_extractor.rb', line 47

# Reports whether the PDF appears to carry embedded text, judged by the
# output of the `pdffonts` command-line tool.
#
# @param pdf [String] path to the PDF; shell-escaped with ESCAPE before use
# @return [Boolean] false when the tool's combined stdout/stderr output
#   matches NO_TEXT_DETECTED, true otherwise
def contains_text?(pdf)
  # stderr is folded into stdout, so tool errors also end up in the listing.
  font_listing = `pdffonts #{ESCAPE[pdf]} 2>&1`
  font_listing !~ NO_TEXT_DETECTED
end

#extract(pdfs, opts) ⇒ Object

Extract text from a list of PDFs.



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/docsplit/text_extractor.rb', line 29

# Extract text from a single PDF or a list of PDFs.
#
# For every document the strategy is chosen as follows:
# * OCR the requested pages outright when OCR is forced, or when OCR is not
#   forbidden and contains_text? reports no embedded text;
# * otherwise extract the text directly, then OCR whatever page numbers
#   ended up in @pages_to_ocr (only if OCR is not forbidden and
#   DEPENDENCIES[:tesseract] is truthy).
#
# @param pdfs [String, Array<String>] one path or a (possibly nested) list
#   of paths
# @param opts options handed unchanged to extract_options, which is
#   presumably what populates @output, @pages, @force_ocr and @forbid_ocr
#   (not visible from here -- confirm)
# @return [Array<String>] the flattened list of PDFs that was processed
def extract(pdfs, opts)
  extract_options opts
  # File.exist? instead of the File.exists? alias, which Ruby 3.2 removed.
  FileUtils.mkdir_p @output unless File.exist?(@output)
  [pdfs].flatten.each do |pdf|
    # Start each document with an empty queue; otherwise page numbers queued
    # for an earlier PDF would be OCR'd again against this one.
    @pages_to_ocr = []
    @pdf_name = File.basename(pdf, File.extname(pdf))
    pages = @pages == 'all' ? 1..Docsplit.extract_length(pdf) : @pages
    if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
      extract_from_ocr(pdf, pages)
    else
      extract_from_pdf(pdf, pages)
      if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
        extract_from_ocr(pdf, @pages_to_ocr)
      end
    end
  end
end

#extract_from_ocr(pdf, pages) ⇒ Object

Extract a page range worth of text from a PDF via OCR.



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/docsplit/text_extractor.rb', line 59

# Extract a page range worth of text from a PDF via OCR.
#
# Each requested page (or the whole document when +pages+ is nil) is
# rasterised to a TIFF inside a fresh temporary directory and the TIFF is
# then passed to tesseract. GraphicsMagick (`gm convert`) is used when
# ENV["toolchain"] is 'graphicsmagick'; otherwise ImageMagick's `convert`.
# Output goes to "<@output>/<@pdf_name>_<page>" per page, or
# "<@output>/<@pdf_name>" for a whole document; clean_text is then run on
# that path plus ".txt" when @clean_ocr is set.
#
# @param pdf [String] path to the source PDF
# @param pages [Enumerable<Integer>, nil] 1-based page numbers, or nil to
#   OCR the document as a whole
def extract_from_ocr(pdf, pages)
  tempdir = Dir.mktmpdir
  base_path = File.join(@output, @pdf_name)
  escaped_pdf = ESCAPE[pdf]
  graphicsmagick = ENV["toolchain"] == 'graphicsmagick'
  if pages
    pages.each do |page|
      tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
      escaped_tiff = ESCAPE[tiff]
      file = "#{base_path}_#{page}"
      # Both toolchains address a single page as "file.pdf[index]" with a
      # 0-based index. Without the selector the whole document would be
      # rendered (and OCR'd) once per requested page.
      page_source = "#{escaped_pdf}[#{page - 1}]"
      if graphicsmagick
        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{page_source} #{escaped_tiff} 2>&1"
      else
        run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{page_source} #{escaped_tiff}"
      end
      run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
      clean_text(file + '.txt') if @clean_ocr
      # Drop each page image as soon as it has been read to keep the
      # temporary directory small.
      FileUtils.remove_entry_secure tiff
    end
  else
    tiff = "#{tempdir}/#{@pdf_name}.tif"
    escaped_tiff = ESCAPE[tiff]
    if graphicsmagick
      run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
    else
      run "convert -define quantum:polarity=min-is-white -endian MSB -units PixelsPerInch -density 204x196 -monochrome -compress Fax -sample 1728 #{escaped_pdf} #{escaped_tiff}"
    end
    # Escape the output path just like the per-page branch does, so output
    # directories or PDF names containing spaces/metacharacters still work.
    run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} 2>&1"
    clean_text(base_path + '.txt') if @clean_ocr
  end
ensure
  # tempdir is nil if Dir.mktmpdir itself raised. File.exist? replaces the
  # File.exists? alias, which Ruby 3.2 removed.
  FileUtils.remove_entry_secure tempdir if tempdir && File.exist?(tempdir)
end

#extract_from_pdf(pdf, pages) ⇒ Object

Extract a page range worth of text from a PDF, directly.



53
54
55
56
# File 'lib/docsplit/text_extractor.rb', line 53

# Pulls text straight out of the PDF, without OCR.
#
# @param pdf [String] path to the PDF
# @param pages [Enumerable, nil] pages to extract one by one through
#   extract_page; when nil (or false) the document is handed to
#   extract_full instead
# @return the result of extract_full, or +pages+ itself after iterating
def extract_from_pdf(pdf, pages)
  if pages
    pages.each do |page_number|
      extract_page(pdf, page_number)
    end
  else
    extract_full(pdf)
  end
end