Class: TocExtract

Inherits:

Object

Object
TocExtract

show all

Defined in: lib/toc_extract.rb,
lib/toc_extract/preview.rb,
lib/toc_extract/extractor.rb

Class Method Summary collapse

Class Method Details

.extract(pdf_file, template, toc_start_page, toc_end_page) ⇒ `Object`

# File 'lib/toc_extract/extractor.rb', line 14

# Runs the full extraction pipeline: reads the TOC lines, turns them into
# sections, then fills in each section's bounding box.
#
# @param pdf_file passed through to toc_lines and fill_bounding_boxes
# @param template passed through to toc_lines and sections_from_toc_lines
# @param toc_start_page passed through to toc_lines
# @param toc_end_page passed through to toc_lines and fill_bounding_boxes
# @return the object returned by sections_from_toc_lines, after
#   fill_bounding_boxes has been called on it
def self.extract(pdf_file, template, toc_start_page, toc_end_page)
  toc_entries = TocExtract.toc_lines(pdf_file, template, toc_start_page, toc_end_page)

  TocExtract.sections_from_toc_lines(toc_entries, template).tap do |parsed|
    # Called for its side effect on the sections; its return value is ignored.
    TocExtract.fill_bounding_boxes(pdf_file, parsed, toc_end_page)
  end
end

.fill_bounding_boxes(pdf_file, sections, toc_end_page) ⇒ `Object`

# File 'lib/toc_extract/extractor.rb', line 119

# Locates each section title in the text that follows the table of contents
# and stores the bounding box of the text run where the title begins.
#
# The text of every run on the pages after toc_end_page is concatenated into
# one string, and the character offset at which each run starts is mapped to
# that run's geometry. Each title is then searched for in the concatenated
# text (first occurrence wins).
#
# @param pdf_file passed to PDF::Reader.open
# @param sections collection of objects responding to #title and
#   #bounding_box=
# @param toc_end_page pages whose zero-based index is <= this value are skipped
# @return the sections collection that was passed in; each section's
#   bounding_box is set to a Hash with the keys "x", "y", "width", "endx",
#   "endy" and "page", or to nil when the title is not found
def self.fill_bounding_boxes(pdf_file, sections, toc_end_page)
  offset = 0
  bboxes = {}
  content_parts = []
  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num <= toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        content_parts << run.text
        bboxes[offset] = {
          "x" => run.x,
          "y" => run.y,
          "width" => run.width,
          "endx" => run.endx,
          "endy" => run.endy,
          "page" => page_num
        }
        offset += run.text.length
      end
    end
  end
  content = content_parts.join

  # Offsets are inserted in increasing order, so the keys are already sorted
  # and can be binary-searched.
  run_starts = bboxes.keys

  sections.each do |section|
    pos = content.index(section.title)
    if pos.nil?
      section.bounding_box = nil
      next
    end

    # A title may begin in the middle of a run (e.g. a heading emitted as a
    # single "1.1 Introduction" run). Use the run that contains the match:
    # the last run whose start offset is <= pos. The first run always starts
    # at offset 0, so a preceding run exists whenever a later one is found.
    following = run_starts.bsearch_index { |start| start > pos }
    owner = following ? run_starts[following - 1] : run_starts.last
    section.bounding_box = bboxes[owner]
  end
end

.hi(language = "english") ⇒ `Object`

# File 'lib/toc_extract.rb', line 2

# Builds a Translator for the given language and returns the result of its
# #hi method.
#
# @param language passed to Translator.new; defaults to "english"
# @return whatever Translator#hi returns
def self.hi(language = "english")
  Translator.new(language).hi
end

.preview(pdf_file, section, crop_width, crop_height) ⇒ `Object`

# File 'lib/toc_extract/preview.rb', line 2

# Renders the page a section is on and returns a PNG crop taken around the
# section's bounding box.
#
# @param pdf_file path used for both PDF::Reader.open and Magick::Image.read
# @param section object responding to #page_number (used as an index into
#   reader.pages and as the ImageMagick frame index) and #bounding_box
#   (indexed with "x" and "endy")
# @param crop_width crop width; scaled by the image/page ratio for the crop
#   and also passed unscaled to resize_to_fit!
# @param crop_height crop height; used the same way as crop_width
# @return the PNG blob produced by to_blob ("" if the reader block never
#   assigns it)
def self.preview(pdf_file, section, crop_width, crop_height)
  require "rmagick"
  require "pdf/reader"
  require "pdf/reader/find_text"

  png_blob = ""

  PDF::Reader.open(pdf_file) do |reader|
    page_index = section.page_number

    pdf_page = reader.pages[page_index]
    pdf_page_width = pdf_page.width
    pdf_page_height = pdf_page.height

    # Rasterize only the requested page, at density 150.
    rendered = Magick::Image.read("#{pdf_file}[#{page_index}]") do |info|
      info.density = 150
    end.first

    pixels_wide = rendered.columns
    pixels_high = rendered.rows

    # Ratio of rendered image size to PDF page size.
    x_ratio = pixels_wide.to_f / pdf_page_width
    y_ratio = pixels_high.to_f / pdf_page_height

    # pdf-reader: origin at bottom-left, Y increases upward.
    # RMagick: origin at top-left, Y increases downward.
    # Start 30 to the left of the box and 10 above its endy as padding.
    bbox = section.bounding_box
    left_edge = bbox["x"] - 30
    top_edge = bbox["endy"] + 10

    # Scale to pixels (flipping Y), then keep the origin non-negative and the
    # crop size no larger than the image.
    crop_left = [(left_edge * x_ratio).round, 0].max
    crop_top = [((pdf_page_height - top_edge) * y_ratio).round, 0].max
    crop_pixels_wide = [(crop_width * x_ratio).round, pixels_wide].min
    crop_pixels_high = [(crop_height * y_ratio).round, pixels_high].min

    snippet = rendered.crop(crop_left, crop_top, crop_pixels_wide, crop_pixels_high)
    snippet.resize_to_fit!(crop_width, crop_height)

    png_blob = snippet.to_blob { |info| info.format = "PNG" }
  end

  png_blob
end

.sections_from_toc_lines(lines, template) ⇒ `Object`

# File 'lib/toc_extract/extractor.rb', line 80

# Parses TOC lines of the form <NUMBER><TEXT>[...]<NUMBER> into Section
# objects.
#
# For each line:
# * the section id is the leading run of digits and dots,
# * the page is the trailing run of digits, converted to an integer and
#   decremented by one since TOC pages are 1-based (a line with no trailing
#   digits therefore yields -1),
# * the title is what lies between the two, with trailing dot leaders
#   (including leaders separated by whitespace, e.g. ". . . .") and
#   surrounding whitespace removed.
#
# @param lines collection of String TOC lines
# @param template currently unused; the line format is hard-coded
# @return [Array] one Section.new(section_id, title, page) per input line
def self.sections_from_toc_lines(lines, template)
  lines.map do |line|
    # Both patterns can match the empty string, so neither lookup is nil.
    section_id = line[/\A[\d.]*/]
    page_digits = line[/\d*\z/]

    title_start = section_id.length
    title_end = line.length - page_digits.length

    # On an all-digit line the two matches overlap; the exclusive range then
    # yields an empty title.
    title = line[title_start...title_end].sub(/[.\s]+\z/, "").lstrip

    page = page_digits.to_i - 1 # Since TOC pages are 1-based

    Section.new(section_id, title, page)
  end
end

.toc_lines(pdf_file, template, toc_start_page, toc_end_page) ⇒ `Object`

# File 'lib/toc_extract/extractor.rb', line 23

# Reads the table-of-contents pages of a PDF and returns one string per TOC
# entry.
#
# To detect the lines, we assume that all the elements on a line have the same
# y value. If this assumption changes in the future, we can instead list all
# the deltas between y values and compute breaking points where a noticeable
# jump in delta occurs.
#
# template is a placeholder for future extensions. At the moment, always
# assume that each line starts with a section number, followed by the section
# title and optionally some dots, and ends with a page number. Moreover, we
# assume that pages use Arabic numbering as opposed to Roman numerals.
#
# @param pdf_file passed to PDF::Reader.open
# @param template currently unused
# @param toc_start_page zero-based index of the first page to read
# @param toc_end_page zero-based index of the last page to read (inclusive)
# @return [Array<String>] the TOC entries; physical lines that do not start
#   with a digit are appended to the preceding entry
def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
  lines = []
  max_delta = 0
  line = ""
  last_y = nil # nil until the first run is seen, so a real y of 0 is not mistaken for "unset"
  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num < toc_start_page
      break if page_num > toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        y = run.y
        last_y ||= y
        if (last_y - y).abs > max_delta
          # The y value changed: the previous physical line is complete.
          lines << line
          line = run.text
          last_y = y
        else
          line += run.text
        end
      end
    end
  end
  # Flush the line still being accumulated when the runs ended; without this
  # the final TOC entry would be lost.
  lines << line unless line.empty?

  # Merge lines based on the template. Currently hard-coded as
  # <NUMBER><TEXT>[...]<NUMBER>. Also remove the lines that contain only a
  # page number. This should also be part of the template, assuming Arabic
  # numbering.
  real_lines = []
  lines.each do |candidate|
    next if candidate.empty? # nothing to classify (runs with empty text)

    if candidate[0].match?(/\d/)
      next unless candidate.match?(/\D/) # digits only: a bare page number

      real_lines << candidate
    elsif !real_lines.empty?
      # We could be unlucky and get a line that happens to have a number in
      # its text portion, with the line breaking at that exact point.
      # Ignoring for now.
      real_lines[-1] += candidate
    end
    # Otherwise nothing has been collected yet: this is the TOC header.
  end

  real_lines
end

Class: TocExtract

Class Method Summary collapse

Class Method Details

.extract(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object

.fill_bounding_boxes(pdf_file, sections, toc_end_page) ⇒ Object

.hi(language = "english") ⇒ Object

.preview(pdf_file, section, crop_width, crop_height) ⇒ Object

.sections_from_toc_lines(lines, template) ⇒ Object

.toc_lines(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object

.extract(pdf_file, template, toc_start_page, toc_end_page) ⇒ `Object`

.fill_bounding_boxes(pdf_file, sections, toc_end_page) ⇒ `Object`

.hi(language = "english") ⇒ `Object`

.preview(pdf_file, section, crop_width, crop_height) ⇒ `Object`

.sections_from_toc_lines(lines, template) ⇒ `Object`

.toc_lines(pdf_file, template, toc_start_page, toc_end_page) ⇒ `Object`