Class: PDF::Reader::OverlappingRunsFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/overlapping_runs_filter.rb

Overview

Removes duplicates from a collection of TextRun objects. This can be helpful when a PDF uses slightly offset, overlapping characters to achieve a fake ‘bold’ effect.

Constant Summary collapse

OVERLAPPING_THRESHOLD =

This should be between 0 and 1. If TextRun B obscures at least this fraction of TextRun A (and they have identical characters), then one of them will be discarded.

0.5

Class Method Summary collapse

Class Method Details

.detect_intersection(sweep_line_status, event_point) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/pdf/reader/overlapping_runs_filter.rb', line 41

# Reports whether the run attached to +event_point+ duplicates any run that
# is currently open on the sweep line.
#
# An open run counts as a match only when all of the following hold:
# * its text equals the text of +event_point.run+
# * +event_point.x+ lies within the open run's [x, endx] span (inclusive)
# * +intersection_area_percent+ of the open run against +event_point.run+
#   is at least OVERLAPPING_THRESHOLD
#
# @param sweep_line_status [Enumerable] the runs currently open
# @param event_point [#x, #run] the event being processed
# @return [Boolean] true if at least one open run matches, false otherwise
def self.detect_intersection(sweep_line_status, event_point)
  sweep_line_status.any? do |active_run|
    # Cheap checks first; the area computation only runs for candidates
    # that already share text and horizontal position.
    next false unless active_run.text == event_point.run.text
    next false unless event_point.x >= active_run.x
    next false unless event_point.x <= active_run.endx

    active_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
  end
end

.exclude_redundant_runs(runs) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/pdf/reader/overlapping_runs_filter.rb', line 14

# Returns +runs+ with redundant entries removed.
#
# Two events are scheduled per run, one at +run.x+ and one at +run.endx+,
# and the events are then visited in ascending +x+ order. On a start event
# the run is flagged for exclusion if detect_intersection reports a match
# against the runs that are currently open; it is then added to the open
# list regardless. On any other event the run is removed from the open list.
#
# @param runs [Array] the runs to filter; each must respond to +x+ and +endx+
# @return [Array] +runs+ minus every run that was flagged for exclusion
def self.exclude_redundant_runs(runs)
  open_runs = []
  redundant = []

  # Start event first, end event second, for every run in input order.
  schedule = runs.flat_map do |run|
    [EventPoint.new(run.x, run), EventPoint.new(run.endx, run)]
  end
  schedule.sort! { |left, right| left.x <=> right.x }

  schedule.each do |point|
    current = point.run

    unless point.start?
      open_runs.delete(current)
      next
    end

    redundant << current if detect_intersection(open_runs, point)
    # A flagged run still joins the open list, so later runs are compared
    # against it as well.
    open_runs << current
  end

  runs - redundant
end