Class: PDF::Reader::TextReceiver

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/text_receiver.rb

Overview

An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.

Usage:

receiver = PDF::Reader::TextReceiver.new($stdout)
PDF::Reader.file("somefile.pdf", receiver)

DEPRECATED: this class was deprecated in version 0.11.0 and will eventually be removed.

Instance Method Summary collapse

Constructor Details

#initialize(main_receiver) ⇒ TextReceiver

Initialize with the library user’s receiver



40
41
42
43
# File 'lib/pdf/reader/text_receiver.rb', line 40

# Stores the caller-supplied receiver and starts with an empty stack of
# page corners.
#
# main_receiver - object that end_page appends each page's text to via <<
def initialize(main_receiver)
  @main_receiver = main_receiver
  @upper_corners = Array.new
end

Instance Method Details

#begin_document(root) ⇒ Object

Called when the document parsing begins



46
47
48
# File 'lib/pdf/reader/text_receiver.rb', line 46

# Called when the document parsing begins; resets the stack of page corners
# to empty.
#
# root - not used by this method
def begin_document(root)
  @upper_corners = Array.new
end

#begin_page(info) ⇒ Object

Called when new page parsing begins



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/pdf/reader/text_receiver.rb', line 64

# Called when new page parsing begins. Resets all per-page bookkeeping.
#
# info - the page dictionary; kept in @page and passed to media_box_check
def begin_page(info)
  @page = info

  # base text state for the page; begin_text_object pushes copies of it
  base_state = {
    :char_spacing     => 0,
    :word_spacing     => 0,
    :hori_scaling     => 100,
    :leading          => 0,
    :tj_adjustment    => 0,
  }
  @state = [base_state]

  page_corners = media_box_check(info)
  @upper_corners.push(page_corners)

  @output = []
  @displacement = {}
  @line = 0
  @location = 0
  # until text is positioned, the lowest y seen is the top of the page
  @smallest_y_loc = page_corners[:ury]
  @written_to = false
end

#begin_page_container(page) ⇒ Object



55
56
57
# File 'lib/pdf/reader/text_receiver.rb', line 55

# Pushes the corners computed by media_box_check for a page container onto
# @upper_corners; end_page_container pops them again.
#
# page - the container's dictionary
def begin_page_container(page)
  container_corners = media_box_check(page)
  @upper_corners << container_corners
end

#begin_text_object ⇒ Object

PDF operator BT



92
93
94
# File 'lib/pdf/reader/text_receiver.rb', line 92

# PDF operator BT. Pushes a shallow copy of the current text state, so that
# changes made inside the text object are dropped when end_text_object pops
# it again.
def begin_text_object
  snapshot = @state.last.dup
  @state << snapshot
end

#calculate_line_and_location(new_loc) ⇒ Object



234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/pdf/reader/text_receiver.rb', line 234

# Maps a vertical text position onto an output line and makes it current.
#
# new_loc - numeric y position (the f operand of Tm, or @location + ty for Td)
#
# Updates @location, @line, @displacement and @smallest_y_loc:
# * before anything has been written on the page, the position is line 0
# * a position already seen reuses the line recorded for it
# * an unseen position below the current one, or below the lowest seen so
#   far, is given the line after the current one
# * any other unseen position is snapped to the nearest known position
#   below it
#
# Returns the selected line index.
def calculate_line_and_location(new_loc)
  key = new_loc

  if !@written_to
    @displacement[key] = 0
  elsif !@displacement.key?(key)
    if key < @location || key < @smallest_y_loc
      @displacement[key] = @line + 1
    else
      nearest_below = @displacement.keys.select { |known| known < key }.max
      if nearest_below
        key = nearest_below
      else
        # No known position lies below this one (for example, text was shown
        # before any positioning operator ran). Without this fallback the
        # lookup yields nil and the comparison below raises NoMethodError,
        # so register the position itself on line 0.
        @displacement[key] = 0
      end
    end
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end

#end_document ⇒ Object

Called when the document parsing ends



51
52
53
# File 'lib/pdf/reader/text_receiver.rb', line 51

# Called when the document parsing ends; empties the text-state stack.
#
# @state is only created by begin_page, so it is still nil for a document
# that never started a page. In that case there is nothing to clear.
def end_document
  @state.clear if @state
end

#end_page ⇒ Object

Called when page parsing ends



86
87
88
89
# File 'lib/pdf/reader/text_receiver.rb', line 86

# Called when page parsing ends. Joins the collected output lines with
# newlines, appends the result to the main receiver, and drops this page's
# corners from the stack.
def end_page
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end

#end_page_container ⇒ Object



59
60
61
# File 'lib/pdf/reader/text_receiver.rb', line 59

# Drops the most recent entry from @upper_corners, undoing the push made by
# begin_page_container. Returns the removed entry (nil if the stack was
# empty).
def end_page_container
  @upper_corners.pop
end

#end_text_object ⇒ Object

PDF operator ET



97
98
99
# File 'lib/pdf/reader/text_receiver.rb', line 97

# PDF operator ET. Discards the text state pushed by the matching
# begin_text_object.
#
# The bottom entry is the page's base state created by begin_page. It is
# never popped, so an ET without a matching BT cannot leave @state empty
# and make later @state.last[...] lookups fail on nil.
def end_text_object
  @state.pop if @state.size > 1
end

#media_box_check(dict) ⇒ Object



222
223
224
225
226
227
228
229
230
231
232
# File 'lib/pdf/reader/text_receiver.rb', line 222

# Works out the upper-right corner that applies to +dict+.
#
# Starts from a copy of the innermost entry on @upper_corners (or 0/0 when
# the stack is empty) and, if +dict+ has a :MediaBox, replaces :urx and :ury
# with the box's width and height.
#
# dict - a page or page-container dictionary
#
# Returns a new Hash with :urx and :ury; the stack entry is not modified.
def media_box_check(dict)
  inherited = @upper_corners.last || {:urx => 0, :ury => 0}
  corners = inherited.dup
  return corners unless dict.has_key?(:MediaBox)

  box = dict[:MediaBox]
  width = box[2] - box[0]
  height = box[3] - box[1]
  corners[:urx] = width
  corners[:ury] = height
  corners
end

#move_text_position(tx, ty) ⇒ Object

PDF operator Td



134
135
136
137
# File 'lib/pdf/reader/text_receiver.rb', line 134

# PDF operator Td. Only the vertical offset is used: the new position is the
# current @location plus +ty+.
#
# tx - horizontal offset (ignored)
# ty - vertical offset
def move_text_position(tx, ty)
  target_y = @location + ty
  calculate_line_and_location(target_y)
end

#move_text_position_and_set_leading(tx, ty) ⇒ Object

PDF operator TD



140
141
142
143
# File 'lib/pdf/reader/text_receiver.rb', line 140

# PDF operator TD. Records +ty+ as the text leading, then moves exactly as
# Td does.
#
# NOTE(review): the PDF specification describes TD as setting the leading to
# -ty. Here ty is stored unnegated, and move_to_start_of_next_line adds the
# stored leading to the position. Confirm this sign convention is intended
# before changing either method on its own.
#
# tx - horizontal offset (passed through to move_text_position)
# ty - vertical offset, also stored as the leading
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)
  move_text_position(tx, ty)
end

#move_to_next_line_and_show_text(string) ⇒ Object

PDF operator ' (single quote: move to the next line and show the text)



210
211
212
213
# File 'lib/pdf/reader/text_receiver.rb', line 210

# PDF operator ' (single quote). Equivalent to T* followed by Tj: advances
# to the start of the next line, then shows +string+ there.
#
# string - the text to show
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line
  show_text(string)
end

#move_to_start_of_next_line ⇒ Object

PDF operator T*



129
130
131
# File 'lib/pdf/reader/text_receiver.rb', line 129

# PDF operator T*. Moves vertically by the leading stored in the current
# text state, with no horizontal offset.
def move_to_start_of_next_line
  leading = @state.last[:leading]
  move_text_position(0, leading)
end

#set_character_spacing(n) ⇒ Object

PDF operator Tc



109
110
111
# File 'lib/pdf/reader/text_receiver.rb', line 109

# PDF operator Tc. Stores +n+ as the character spacing of the current
# (innermost) text state.
#
# n - the character spacing operand
def set_character_spacing(n)
  text_state = @state.last
  text_state[:char_spacing] = n
end

#set_horizontal_text_scaling(n) ⇒ Object

PDF operator Tz



119
120
121
# File 'lib/pdf/reader/text_receiver.rb', line 119

# PDF operator Tz. Stores the horizontal scaling of the current text state
# as a fraction of +n+ (a percentage).
#
# Float division is used so that an Integer operand below 100 is not
# truncated to 0 (50 becomes 0.5 rather than 0).
#
# NOTE(review): begin_page seeds :hori_scaling with 100, i.e. a percentage,
# while this setter stores a fraction. Confirm which unit readers of
# :hori_scaling expect.
#
# n - scaling as a percentage, e.g. 100 for normal width
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end

#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object

PDF operator " (double quote: set word and character spacing, move to the next line and show the text)



216
217
218
219
220
# File 'lib/pdf/reader/text_receiver.rb', line 216

# PDF operator " (double quote). Applies the word spacing, then the
# character spacing, then behaves like the ' operator.
#
# aw     - word spacing, forwarded to set_word_spacing
# ac     - character spacing, forwarded to set_character_spacing
# string - the text to show on the next line
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end

#set_text_leading(n) ⇒ Object

PDF operator TL



124
125
126
# File 'lib/pdf/reader/text_receiver.rb', line 124

# PDF operator TL. Stores +n+ as the leading of the current (innermost)
# text state; move_to_start_of_next_line reads it back.
#
# n - the leading operand
def set_text_leading(n)
  text_state = @state.last
  text_state[:leading] = n
end

#set_text_matrix_and_text_line_matrix(*args) ⇒ Object

PDF operator Tm



102
103
104
105
106
# File 'lib/pdf/reader/text_receiver.rb', line 102

# PDF operator Tm. The operands are named a b c d e f in the PDF spec; only
# the sixth one (f) is used here, as the new vertical position.
#
# args - the matrix operands, in the order a, b, c, d, e, f
def set_text_matrix_and_text_line_matrix(*args)
  vertical_offset = args[5]
  calculate_line_and_location(vertical_offset)
end

#set_word_spacing(n) ⇒ Object

PDF operator Tw



114
115
116
# File 'lib/pdf/reader/text_receiver.rb', line 114

# PDF operator Tw. Stores +n+ as the word spacing of the current
# (innermost) text state.
#
# n - the word spacing operand
def set_word_spacing(n)
  text_state = @state.last
  text_state[:word_spacing] = n
end

#show_text(string) ⇒ Object

PDF operator Tj



146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/pdf/reader/text_receiver.rb', line 146

# PDF operator Tj. Appends +string+ to the output line selected by @line,
# creating that line on first use, and marks the page as written to.
#
# When the current :tj_adjustment is below -1000, one space per 900 units of
# its magnitude is inserted before the string.
#
# string - the text to append
def show_text(string)
  line_buffer = (@output[@line] ||= "")

  adjustment = @state.last[:tj_adjustment]
  if adjustment < -1000
    padding = " " * (adjustment.abs/900)
    line_buffer << padding
  end

  line_buffer << string
  @written_to = true
end

#show_text_with_positioning(params) ⇒ Object

PDF operator TJ



194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/pdf/reader/text_receiver.rb', line 194

# PDF operator TJ. Walks the operand array in order: a number becomes the
# current :tj_adjustment, and anything else is shown via show_text with
# whatever adjustment is in force at that point.
#
# Numbers are matched with Numeric rather than Float/Fixnum. The Fixnum
# constant was removed in Ruby 3.2, and before that it did not match Bignum
# values.
#
# params - Array mixing strings and numeric adjustments
#
# The adjustment that was in force before the call is restored afterwards.
def show_text_with_positioning(params)
  prev_adjustment = @state.last[:tj_adjustment]

  params.each do |p|
    case p
    when Numeric
      @state.last[:tj_adjustment] = p
    else
      show_text(p)
    end
  end

  @state.last[:tj_adjustment] = prev_adjustment
end

#super_show_text(string) ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/pdf/reader/text_receiver.rb', line 158

# Writes +string+ into @output treated as a character grid: one String per
# row, pre-filled with spaces, with the column and row derived from @tm.
#
# NOTE(review): TS_UNITS_PER_H_CHAR, TS_UNITS_PER_V_CHAR, @tm and Matrix are
# not defined in the visible part of this file, and no visible method calls
# this one. Confirm it is still reachable before relying on it.
#
# string - the text to place
def super_show_text (string)
  # page extent from @upper_corners divided by the per-character unit
  # constants (presumably giving a size in character cells)
  urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
  ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

  # column and row come from row 2 of @tm; the row is subtracted from ury
  x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
  y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i
  
  #puts "rendering '#{string}' to #{x}x#{y}"

  # a row is created on first use as urx spaces
  place = (@output[y] ||= (" " * urx.to_i))
  #puts "#{urx} #{place.size} #{string.size} #{x}"
  # nothing is written when the string would reach or pass column urx
  return if x+string.size >= urx

  string.split(//).each do |c|
    chars = 1

    case c
    when " "
      # a space takes 1 + word_spacing cells, all written as blanks
      chars += @state.last[:word_spacing].to_i
      place[x-1, chars] = (" " * chars)
    else
      # other characters take 1 + char_spacing cells, reduced by
      # tj_adjustment/1000, but never fewer than one cell
      chars += @state.last[:char_spacing].to_i
      chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
      chars = 1 if chars < 1

      # the character goes at index x-1; any remaining cells are blanked
      place[x-1] = c
      place[x, chars-1] = (" " * (chars-1)) if chars > 1
    end

    x += chars
  end

  # adds a matrix whose third row holds the final x and y multiplied back by
  # the unit constants
  # NOTE(review): x and y are absolute grid positions here, not deltas —
  # confirm that adding them to @tm is intended
  @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
end