Class: PDF::Reader::PageTextReceiver

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/pdf/reader/page_text_receiver.rb

Overview

Builds a UTF-8 string of all the text on a single page by processing all the operators in a content stream.

Constant Summary collapse

SPACE =
" "

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#optionsObject (readonly)

Returns the value of attribute options.



19
20
21
# File 'lib/pdf/reader/page_text_receiver.rb', line 19

# Read-only accessor: returns the value currently held in @options.
def options
  @options
end

#stateObject (readonly)

Returns the value of attribute state.



19
20
21
# File 'lib/pdf/reader/page_text_receiver.rb', line 19

# Read-only accessor: returns the value currently held in @state.
# (@state is replaced with a fresh PageState each time page= is called.)
def state
  @state
end

Instance Method Details

#contentObject

deprecated



83
84
85
86
# File 'lib/pdf/reader/page_text_receiver.rb', line 83

# Renders the text collected for the current page as one string.
#
# The runs (with default filtering) are handed to PageLayout together with
# the page's MediaBox rectangle, and the layout's #to_s output is returned.
#
# @deprecated
# @return [Object] whatever PageLayout#to_s produces (presumably a String)
def content
  media_box = @page.rectangles[:MediaBox]
  layout = PageLayout.new(runs, media_box)
  layout.to_s
end

#invoke_xobject(label) ⇒ Object

XObjects



122
123
124
125
126
127
128
129
# File 'lib/pdf/reader/page_text_receiver.rb', line 122

# Handles an XObject invocation by delegating the lookup to @state.
#
# The state yields the resolved XObject; only form XObjects are processed,
# by having them walk this receiver. Any other yielded object is ignored.
#
# @param label the XObject label, forwarded unchanged to @state.invoke_xobject
def invoke_xobject(label)
  @state.invoke_xobject(label) do |xobject|
    # Class#=== is the same test a `case ... when` clause performs.
    xobject.walk(self) if PDF::Reader::FormXObject === xobject
  end
end

#move_to_next_line_and_show_text(str) ⇒ Object



108
109
110
111
# File 'lib/pdf/reader/page_text_receiver.rb', line 108

# Handler for the ' (single quote) operator.
#
# First asks @state to move to the start of the next line, then shows +str+
# through show_text. The order matters: the move must happen before the
# text is recorded.
#
# @param str the text to show, passed unchanged to show_text
def move_to_next_line_and_show_text(str) # '
  @state.move_to_start_of_next_line
  show_text(str)
end

#page=(page) ⇒ Object

starting a new page



43
44
45
46
47
48
# File 'lib/pdf/reader/page_text_receiver.rb', line 43

# Called when starting a new page: resets all per-page state on the receiver.
#
# @param page the page about to be processed; it is used to build a new
#   PageState and is also kept in @page for later rectangle lookups
def page=(page)
  @state = PageState.new(page) # fresh state built from the new page
  @page = page
  @content = []
  @characters = []             # source collection read by #runs
end

#runs(opts = {}) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/pdf/reader/page_text_receiver.rb', line 50

# Returns the text runs collected for the current page after applying a
# pipeline of filters. @characters itself is only read here; each filter is
# given the previous result and its return value is carried forward.
#
# Recognised keys in +opts+ (all optional):
#   :rect            - bounding rectangle to restrict runs to; defaults to the
#                      page's CropBox entry. A nil/false value skips this step.
#   :skip_zero_width - when truthy (default true), zero-width runs are excluded
#   :skip_overlapping - when truthy (default true), redundant runs are excluded
#   :merge           - when truthy (default true), runs go through merge_runs
#   :only            - filter passed to AdvancedTextRunFilter.only (default nil)
#   :exclude         - filter passed to AdvancedTextRunFilter.exclude (default nil)
#
# Runs with empty strings are always removed via NoTextFilter.
#
# @param opts [Hash] filtering options as listed above
# @return the filtered collection of runs
def runs(opts = {})
  selected = @characters

  # The CropBox default is looked up eagerly, even when :rect is supplied.
  bounds = opts.fetch(:rect, @page.rectangles[:CropBox])
  selected = BoundingRectangleRunsFilter.runs_within_rect(selected, bounds) if bounds

  selected = ZeroWidthRunsFilter.exclude_zero_width_runs(selected) if opts.fetch(:skip_zero_width, true)
  selected = OverlappingRunsFilter.exclude_redundant_runs(selected) if opts.fetch(:skip_overlapping, true)

  selected = NoTextFilter.exclude_empty_strings(selected)

  selected = merge_runs(selected) if opts.fetch(:merge, true)

  only_filter = opts.fetch(:only, nil)
  selected = AdvancedTextRunFilter.only(selected, only_filter) if only_filter

  exclude_filter = opts.fetch(:exclude, nil)
  selected = AdvancedTextRunFilter.exclude(selected, exclude_filter) if exclude_filter

  selected
end

#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object



113
114
115
116
117
# File 'lib/pdf/reader/page_text_receiver.rb', line 113

# Handler for the " (double quote) operator.
#
# Applies the word spacing and character spacing to @state, then behaves
# like the ' operator by delegating to move_to_next_line_and_show_text.
#
# @param aw value forwarded to @state.set_word_spacing
# @param ac value forwarded to @state.set_character_spacing
# @param string the text to show on the next line
def set_spacing_next_line_show_text(aw, ac, string) # "
  @state.set_word_spacing(aw)
  @state.set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end

#show_text(string) ⇒ Object

Text Showing Operators

record text that is drawn on the page



92
93
94
# File 'lib/pdf/reader/page_text_receiver.rb', line 92

# Handler for the Tj operator: records text that is drawn on the page.
#
# This is a thin public entry point; all of the work is done by
# internal_show_text, which is defined outside this listing.
#
# @param string the text operand, forwarded unchanged
def show_text(string) # Tj (AWAY)
  internal_show_text(string)
end

#show_text_with_positioning(params) ⇒ Object

TJ [(A) 120 (WA) 20 (Y)]



96
97
98
99
100
101
102
103
104
105
106
# File 'lib/pdf/reader/page_text_receiver.rb', line 96

# Handler for the TJ operator, whose operand mixes strings and numbers,
# e.g. [(A) 120 (WA) 20 (Y)].
#
# Each element is dispatched on its type, in order:
# * String  - shown via internal_show_text
# * Numeric - passed to @state.process_glyph_displacement as the second
#             argument (with 0 and false as the first and third)
# * anything else is silently skipped
#
# @param params [Enumerable] the elements of the TJ operand
# @return the +params+ object itself (the result of #each)
def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
  params.each do |element|
    case element
    when String
      internal_show_text(element)
    when Numeric
      @state.process_glyph_displacement(0, element, false)
    end
    # elements of any other type fall through and are skipped
  end
end