Class: PDF::Reader::TextReceiver

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/text_receiver.rb

Overview

An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.

Usage:

receiver = PDF::Reader::TextReceiver.new($stdout)
PDF::Reader.file("somefile.pdf", receiver)

DEPRECATED: this class was deprecated in version 0.11.0 and will eventually be removed.

Instance Method Summary collapse

Constructor Details

#initialize(main_receiver) ⇒ TextReceiver

Initialize with the library user’s receiver



42
43
44
45
# File 'lib/pdf/reader/text_receiver.rb', line 42

# Stores the receiver that finished page text is appended to (end_page calls
# << on it) and starts with an empty stack of upper-corner records.
#
# main_receiver - the object that collects the extracted text.
def initialize(main_receiver)
  @main_receiver = main_receiver
  @upper_corners = []
end

Instance Method Details

#begin_document(root) ⇒ Object

Called when the document parsing begins



48
49
50
# File 'lib/pdf/reader/text_receiver.rb', line 48

# Resets the upper-corner stack at the start of a document.
#
# root - not used by this method.
def begin_document(root)
  @upper_corners = []
end

#begin_page(info) ⇒ Object

Called when new page parsing begins



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/pdf/reader/text_receiver.rb', line 66

# Resets every piece of per-page bookkeeping before the text operators of a new
# page are processed.
#
# info - the page dictionary. It is remembered in @page and handed to
#        media_box_check to work out the page's upper corner.
#
# Returns false (the value of the final assignment).
def begin_page(info)
  @page = info

  # The text-state stack starts with a single hash of default values.
  defaults = {
    :char_spacing  => 0,
    :word_spacing  => 0,
    :hori_scaling  => 100,
    :leading       => 0,
    :tj_adjustment => 0
  }
  @state = [defaults]

  # Record this page's upper corner; end_page removes it again.
  corners = media_box_check(info)
  @upper_corners << corners

  @output         = []
  @displacement   = {}
  @line           = 0
  @location       = 0
  # The lowest vertical position seen so far starts at the top of the page.
  @smallest_y_loc = corners[:ury]
  @written_to     = false
end

#begin_page_container(page) ⇒ Object



57
58
59
# File 'lib/pdf/reader/text_receiver.rb', line 57

# Pushes an upper-corner record for a page container onto the stack.
# media_box_check derives it from the container's :MediaBox when present and
# otherwise copies the record currently on top of the stack.
#
# page - the container's dictionary.
def begin_page_container(page)
  @upper_corners.push(media_box_check(page))
end

#begin_text_object ⇒ Object

PDF operator BT



94
95
96
# File 'lib/pdf/reader/text_receiver.rb', line 94

# Handles BT: pushes a shallow copy of the current text state onto the state
# stack, so changes made inside the text object are discarded when
# end_text_object pops it again.
def begin_text_object
  snapshot = @state.last.dup
  @state << snapshot
end

#calculate_line_and_location(new_loc) ⇒ Object



236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/pdf/reader/text_receiver.rb', line 236

# Maps a vertical text position onto an output line number and records it as
# the current location.
#
# new_loc - the vertical position the text cursor has moved to.
#
# Side effects: updates @displacement (position => line number), @location,
# @line and @smallest_y_loc.
#
# Rules, in order:
# * Before anything has been written on the page, the position is line 0.
# * A position that was seen before keeps the line it was given.
# * An unseen position below the current location, or below everything seen so
#   far, starts the line after the current one.
# * Any other unseen position is folded into the nearest known position below
#   it. If no such position exists it is registered as line 0; previously this
#   case produced a nil key and raised NoMethodError on the comparison below.
def calculate_line_and_location(new_loc)
  key = new_loc
  key.freeze

  if !@written_to
    @displacement[key] = 0
  elsif !@displacement.key?(key)
    if key < @location || key < @smallest_y_loc
      @displacement[key] = @line + 1
    else
      nearest = @displacement.keys.select { |known| known < key }.max
      if nearest
        key = nearest
      else
        @displacement[key] = 0
      end
    end
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end

#end_document ⇒ Object

Called when the document parsing ends



53
54
55
# File 'lib/pdf/reader/text_receiver.rb', line 53

# Called when document parsing ends; empties the text-state stack.
#
# @state is only created by begin_page, so a document that never began a page
# leaves it nil. The guard keeps that case from raising NoMethodError.
def end_document
  @state.clear if @state
end

#end_page ⇒ Object

Called when page parsing ends



88
89
90
91
# File 'lib/pdf/reader/text_receiver.rb', line 88

# Called when page parsing ends. Joins the collected output lines with newlines,
# appends the result to the main receiver, then drops this page's upper-corner
# record.
#
# Returns the upper-corner record that was removed.
def end_page
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end

#end_page_container ⇒ Object



61
62
63
# File 'lib/pdf/reader/text_receiver.rb', line 61

# Removes the upper-corner record that begin_page_container pushed for the
# container now being closed.
def end_page_container
  @upper_corners.pop
end

#end_text_object ⇒ Object

PDF operator ET



99
100
101
# File 'lib/pdf/reader/text_receiver.rb', line 99

# Handles ET: discards the text state that begin_text_object pushed.
#
# The bottom entry of the stack is the page's base state created by begin_page.
# It is never popped, so an ET without a matching BT cannot leave the stack
# empty and make every later operator fail on a nil state.
#
# Returns the discarded state, or nil when only the base state remains.
def end_text_object
  @state.pop if @state.size > 1
end

#media_box_check(dict) ⇒ Object



224
225
226
227
228
229
230
231
232
233
234
# File 'lib/pdf/reader/text_receiver.rb', line 224

# Works out the upper-corner record that applies to a page or page container.
#
# dict - the dictionary to inspect for a :MediaBox entry.
#
# Starts from a copy of the record on top of @upper_corners (or zeros when the
# stack is empty). When dict has a :MediaBox, :urx and :ury are replaced with the
# differences of its entries: [2] - [0] and [3] - [1].
#
# Returns a new Hash with :urx and :ury; the stack itself is not modified.
def media_box_check(dict)
  inherited = @upper_corners.last || {:urx => 0, :ury => 0}
  corners = inherited.dup
  return corners unless dict.has_key?(:MediaBox)

  box = dict[:MediaBox]
  corners[:urx] = box[2] - box[0]
  corners[:ury] = box[3] - box[1]
  corners
end

#move_text_position(tx, ty) ⇒ Object

PDF operator Td



136
137
138
139
# File 'lib/pdf/reader/text_receiver.rb', line 136

# Handles Td: shifts the current vertical location by ty and recalculates the
# output line for the new location.
#
# tx - horizontal offset; not used by this method.
# ty - vertical offset added to @location.
def move_text_position(tx, ty)
  target = @location + ty
  calculate_line_and_location(target)
end

#move_text_position_and_set_leading(tx, ty) ⇒ Object

PDF operator TD



142
143
144
145
# File 'lib/pdf/reader/text_receiver.rb', line 142

# Handles TD: records ty as the text leading, then moves the text position
# exactly as move_text_position does.
#
# tx - horizontal offset, forwarded to move_text_position.
# ty - vertical offset; also stored, without negation, as the leading.
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)# * -1)
  move_text_position(tx, ty)
end

#move_to_next_line_and_show_text(string) ⇒ Object

PDF operator '



212
213
214
215
# File 'lib/pdf/reader/text_receiver.rb', line 212

# Moves to the start of the next line and then shows the string there.
#
# string - the text to show.
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line
  show_text(string)
end

#move_to_start_of_next_line ⇒ Object

PDF operator T*



131
132
133
# File 'lib/pdf/reader/text_receiver.rb', line 131

# Handles T*: moves the text position vertically by the current text state's
# :leading value.
#
# NOTE(review): the PDF specification defines T* as "0 -Tl Td", but the leading
# is passed here without negation. That matches
# move_text_position_and_set_leading storing ty as-is, yet it looks inverted
# for a leading set through set_text_leading (TL) - confirm.
def move_to_start_of_next_line
  move_text_position(0, @state.last[:leading])
end

#set_character_spacing(n) ⇒ Object

PDF operator Tc



111
112
113
# File 'lib/pdf/reader/text_receiver.rb', line 111

# Handles Tc: stores n as :char_spacing in the current text state.
def set_character_spacing(n)
  @state.last[:char_spacing] = n
end

#set_horizontal_text_scaling(n) ⇒ Object

PDF operator Tz



121
122
123
# File 'lib/pdf/reader/text_receiver.rb', line 121

# Handles Tz: stores the horizontal scaling as a ratio in the current text
# state.
#
# n - the scaling as a percentage (100 means unscaled).
#
# Floating-point division is used on purpose: with integer division an Integer
# percentage below 100 was truncated to 0 (and 150 to 1).
#
# NOTE(review): begin_page seeds :hori_scaling with 100 (a percentage) while
# this setter stores a ratio - confirm which unit readers of the value expect.
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end

#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object

PDF operator "



218
219
220
221
222
# File 'lib/pdf/reader/text_receiver.rb', line 218

# Sets the word and character spacing, then moves to the next line and shows
# the string.
#
# aw     - word spacing, passed to set_word_spacing.
# ac     - character spacing, passed to set_character_spacing.
# string - the text to show.
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end

#set_text_leading(n) ⇒ Object

PDF operator TL



126
127
128
# File 'lib/pdf/reader/text_receiver.rb', line 126

# Handles TL: stores n as :leading in the current text state.
# move_to_start_of_next_line later uses this value as its vertical offset.
def set_text_leading(n)
  @state.last[:leading] = n
end

#set_text_matrix_and_text_line_matrix(*args) ⇒ Object

PDF operator Tm



104
105
106
107
108
# File 'lib/pdf/reader/text_receiver.rb', line 104

# Handles Tm. Only the sixth operand (f in the PDF specification's naming) is
# used: it becomes the new vertical location. The other operands are ignored.
#
# args - the operands of Tm in order (a, b, c, d, e, f).
def set_text_matrix_and_text_line_matrix(*args)
  vertical = args[5]
  calculate_line_and_location(vertical)
end

#set_word_spacing(n) ⇒ Object

PDF operator Tw



116
117
118
# File 'lib/pdf/reader/text_receiver.rb', line 116

# Handles Tw: stores n as :word_spacing in the current text state.
def set_word_spacing(n)
  @state.last[:word_spacing] = n
end

#show_text(string) ⇒ Object

PDF operator Tj



148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/pdf/reader/text_receiver.rb', line 148

# Handles Tj: appends string to the output line selected by @line, creating
# that line if it does not exist yet.
#
# When the current :tj_adjustment is below -1000, the text is preceded by one
# space per 900 units of the adjustment's magnitude.
#
# string - the text to append.
#
# Returns true, and marks the page as written to.
def show_text(string)
  buffer = (@output[@line] ||= "")

  adjustment = @state.last[:tj_adjustment]
  if adjustment < -1000
    buffer << " " * (adjustment.abs/900)
  end

  buffer << string
  @written_to = true
end

#show_text_with_positioning(params) ⇒ Object

PDF operator TJ



196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/pdf/reader/text_receiver.rb', line 196

# Handles TJ: walks an array that mixes strings with numeric adjustments.
#
# params - the operand array. Each Float or Integer becomes the current
#          :tj_adjustment; every other element is passed to show_text.
#
# The adjustment in force before the call is restored once all elements have
# been processed, and that value is returned.
def show_text_with_positioning(params)
  saved = @state.last[:tj_adjustment]

  params.each do |element|
    if element.is_a?(Float) || element.is_a?(Integer)
      @state.last[:tj_adjustment] = element
    else
      show_text(element)
    end
  end

  @state.last[:tj_adjustment] = saved
end

#super_show_text(string) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/pdf/reader/text_receiver.rb', line 160

# Alternative renderer that writes each character of string into a fixed-width
# line of @output, at a position read from the matrix @tm.
#
# string - the text to place.
#
# NOTE(review): @tm, Matrix, TS_UNITS_PER_H_CHAR and TS_UNITS_PER_V_CHAR are not
# assigned or defined in the code visible here, and none of the visible
# callbacks invokes this method - confirm whether it is still reachable.
def super_show_text(string)
  # Upper corner of the page divided by the per-character unit constants
  # (presumably giving the page size in character columns and rows).
  urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
  ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

  # Column and row come from elements [2,0] and [2,1] of @tm; the row is
  # measured down from the top of the page.
  x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
  y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i

  #puts "rendering '#{string}' to #{x}x#{y}"

  # Fetch row y, creating it as a run of urx spaces on first use.
  place = (@output[y] ||= (" " * urx.to_i))
  #puts "#{urx} #{place.size} #{string.size} #{x}"
  # Skip text that would reach or pass the right-hand edge.
  return if x+string.size >= urx

  string.split(//).each do |c|
    # Number of columns this character occupies.
    chars = 1

    case c
    when " "
      # A space is widened by the word spacing and written as blanks.
      chars += @state.last[:word_spacing].to_i
      place[x-1, chars] = (" " * chars)
    else
      # Other characters are widened by the character spacing and narrowed by
      # tj_adjustment/1000, but always occupy at least one column.
      chars += @state.last[:char_spacing].to_i
      chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
      chars = 1 if chars < 1

      # NOTE(review): when x is 0, x-1 is -1, which Ruby treats as the last
      # character of the row - confirm x is always >= 1 here.
      place[x-1] = c
      place[x, chars-1] = (" " * (chars-1)) if chars > 1
    end

    x += chars
  end

  # Adds a matrix whose third row holds x and y scaled back up by the unit
  # constants.
  # NOTE(review): x and y are absolute positions at this point, so this adds the
  # full position rather than the distance advanced - verify that is intended.
  @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
end