Class: PDF::Reader::TextReceiver

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/text_receiver.rb

Overview

An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.

Usage:

receiver = PDF::Reader::TextReceiver.new($stdout)
PDF::Reader.file("somefile.pdf", receiver)

Instance Method Summary collapse

Constructor Details

#initialize(main_receiver) ⇒ TextReceiver

Initialize with the library user’s receiver



37
38
39
40
# File 'lib/pdf/reader/text_receiver.rb', line 37

# Set up a new receiver.
#
# main_receiver - the caller's sink; end_page appends each page's text to it
#                 with <<.
def initialize(main_receiver)
  @main_receiver = main_receiver
  # Stack of page-size hashes; nothing has been pushed yet.
  @upper_corners = []
end

Instance Method Details

#begin_document(root) ⇒ Object

Called when the document parsing begins



43
44
45
# File 'lib/pdf/reader/text_receiver.rb', line 43

# Callback fired when document parsing starts.
#
# root - accepted for the callback signature; not used by this method.
def begin_document(root)
  # Every document starts with an empty page-size stack.
  @upper_corners = []
end

#begin_page(info) ⇒ Object

Called when new page parsing begins



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/pdf/reader/text_receiver.rb', line 61

# Callback fired when a new page starts: resets every piece of per-page
# bookkeeping.
#
# info - the page dictionary; it is stored in @page and handed to
#        media_box_check to work out the page size.
def begin_page(info)
  @page = info

  # Text state that each page starts from; begin_text_object copies the top
  # entry of this stack.
  page_defaults = {
    :char_spacing  => 0,
    :word_spacing  => 0,
    :hori_scaling  => 100,
    :leading       => 0,
    :tj_adjustment => 0
  }
  @state = [page_defaults]

  @upper_corners.push(media_box_check(info))

  # Output lines collected so far, and the line/y-position cursor.
  @output = []
  @line = 0
  @location = 0
  @displacement = {}
  # Lowest y seen so far; initialised to the page height.
  @smallest_y_loc = @upper_corners.last[:ury]
  @written_to = false
end

#begin_page_container(page) ⇒ Object



52
53
54
# File 'lib/pdf/reader/text_receiver.rb', line 52

# Callback fired when a page container starts: records the size that applies
# to it on the @upper_corners stack.
#
# page - the container's dictionary, passed to media_box_check.
def begin_page_container(page)
  @upper_corners << media_box_check(page)
end

#begin_text_object ⇒ Object

PDF operator BT



89
90
91
# File 'lib/pdf/reader/text_receiver.rb', line 89

# PDF operator BT: pushes a copy of the current text state so changes made
# inside the text object land on the copy.
def begin_text_object
  snapshot = @state.last.dup
  @state << snapshot
end

#calculate_line_and_location(new_loc) ⇒ Object



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/pdf/reader/text_receiver.rb', line 231

# Work out which output line a vertical position belongs to and make it the
# current one.
#
# new_loc - the y position text is about to be written at.
#
# Updates @location (current y), @line (index into @output),
# @smallest_y_loc (lowest y seen so far) and @displacement (y => line index).
def calculate_line_and_location(new_loc)
  key = new_loc
  key.freeze

  if !@written_to
    # Nothing has been written on this page yet: this y is the first line.
    @displacement[key] = 0
  elsif !@displacement.has_key?(key)
    if key < @location || key < @smallest_y_loc
      # Below the current position, or lower than anything seen so far:
      # open a new line after the current one.
      @displacement[key] = @line + 1
    else
      # An unseen y that is not lower: reuse the closest recorded y beneath it.
      nearest_below = @displacement.keys.select { |seen| seen < key }.max
      # With no recorded y beneath it (text was shown before any positioning
      # happened), keep the requested y; otherwise key would become nil and
      # the comparison below would raise NoMethodError.
      key = nearest_below unless nearest_below.nil?
      @displacement[key] = 0 unless @displacement.has_key?(key)
    end
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end

#end_document ⇒ Object

Called when the document parsing ends



48
49
50
# File 'lib/pdf/reader/text_receiver.rb', line 48

# Callback fired when document parsing ends: empties the text-state stack.
def end_document
  # @state is only created by begin_page, so it is still nil for a document
  # in which no page was ever started.
  @state.clear if @state
end

#end_page ⇒ Object

Called when page parsing ends



83
84
85
86
# File 'lib/pdf/reader/text_receiver.rb', line 83

# Callback fired when a page ends: hands the page's text to the caller's
# receiver and drops the page's size entry.
def end_page
  # One output line per entry, separated by newlines.
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end

#end_page_container ⇒ Object



56
57
58
# File 'lib/pdf/reader/text_receiver.rb', line 56

# Callback fired when a page container ends: removes the top entry of the
# @upper_corners stack (begin_page_container pushes one per container).
def end_page_container
  @upper_corners.pop
end

#end_text_object ⇒ Object

PDF operator ET



94
95
96
# File 'lib/pdf/reader/text_receiver.rb', line 94

# PDF operator ET: discards the text-state copy pushed by begin_text_object.
def end_text_object
  # Keep the bottom entry: an ET with no matching BT would otherwise empty
  # the stack, and every later @state.last[...] lookup would fail on nil.
  @state.pop if @state.size > 1
end

#media_box_check(dict) ⇒ Object



219
220
221
222
223
224
225
226
227
228
229
# File 'lib/pdf/reader/text_receiver.rb', line 219

# Work out the page size that applies to +dict+.
#
# dict - a page or page-container dictionary; only :MediaBox is looked at.
#
# Returns a new hash with :urx (width) and :ury (height). Without a
# :MediaBox entry the values are copied from the top of @upper_corners, or
# are 0/0 when that stack is empty.
def media_box_check(dict)
  inherited = @upper_corners.last || {:urx => 0, :ury => 0}
  # Work on a copy so the inherited entry is never modified.
  corners = inherited.dup
  return corners unless dict.has_key?(:MediaBox)

  box = dict[:MediaBox]
  # Width and height are differences between box[2]/box[3] and box[0]/box[1].
  corners[:urx] = box[2] - box[0]
  corners[:ury] = box[3] - box[1]
  corners
end

#move_text_position(tx, ty) ⇒ Object

PDF operator Td



131
132
133
134
# File 'lib/pdf/reader/text_receiver.rb', line 131

# PDF operator Td: moves the current vertical position by +ty+.
#
# tx - horizontal offset; not used by this method.
# ty - vertical offset added to @location.
def move_text_position(tx, ty)
  target = @location + ty
  calculate_line_and_location(target)
end

#move_text_position_and_set_leading(tx, ty) ⇒ Object

PDF operator TD



137
138
139
140
# File 'lib/pdf/reader/text_receiver.rb', line 137

# PDF operator TD: stores +ty+ as the leading, then moves by (tx, ty).
#
# NOTE(review): the PDF specification defines TD as "-ty TL; tx ty Td", i.e.
# the leading becomes the negative of ty. Here ty is stored unnegated, which
# only lines up with move_to_start_of_next_line because that method also
# skips the negation. Confirm before changing either side on its own.
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)
  move_text_position(tx, ty)
end

#move_to_next_line_and_show_text(string) ⇒ Object

PDF operator '



207
208
209
210
# File 'lib/pdf/reader/text_receiver.rb', line 207

# PDF operator ' (single quote): moves to the start of the next line, then
# shows +string+.
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line
  show_text(string)
end

#move_to_start_of_next_line ⇒ Object

PDF operator T*



126
127
128
# File 'lib/pdf/reader/text_receiver.rb', line 126

# PDF operator T*: moves vertically by the current leading.
#
# NOTE(review): the PDF specification defines T* as "0 -Tl Td" (the leading
# is negated). This passes the stored leading as-is, so a leading set through
# set_text_leading (TL) moves the opposite way from the specification.
# Confirm against the TD handling before changing.
def move_to_start_of_next_line
  leading = @state.last[:leading]
  move_text_position(0, leading)
end

#set_character_spacing(n) ⇒ Object

PDF operator Tc



106
107
108
# File 'lib/pdf/reader/text_receiver.rb', line 106

# PDF operator Tc: records +n+ as the character spacing of the current text
# state.
def set_character_spacing(n)
  current = @state.last
  current[:char_spacing] = n
end

#set_horizontal_text_scaling(n) ⇒ Object

PDF operator Tz



116
117
118
# File 'lib/pdf/reader/text_receiver.rb', line 116

# PDF operator Tz: records the horizontal scaling of the current text state.
#
# n - the scaling as a percentage (100 means unscaled).
#
# The stored value is n divided by 100. Float division is used so that
# integer percentages are not truncated (50 => 0.5, not 0).
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end

#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object

PDF operator "



213
214
215
216
217
# File 'lib/pdf/reader/text_receiver.rb', line 213

# PDF operator " (double quote): sets word and character spacing, then moves
# to the next line and shows +string+.
#
# aw     - word spacing to record.
# ac     - character spacing to record.
# string - text to show.
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end

#set_text_leading(n) ⇒ Object

PDF operator TL



121
122
123
# File 'lib/pdf/reader/text_receiver.rb', line 121

# PDF operator TL: records +n+ as the leading of the current text state.
def set_text_leading(n)
  current = @state.last
  current[:leading] = n
end

#set_text_matrix_and_text_line_matrix(*args) ⇒ Object

PDF operator Tm



99
100
101
102
103
# File 'lib/pdf/reader/text_receiver.rb', line 99

# PDF operator Tm: jumps to the vertical position given by the text matrix.
#
# args - the operator's operands (a, b, c, d, e, f in the PDF spec's naming).
#        Only the sixth one, f, is used; it becomes the new y position.
def set_text_matrix_and_text_line_matrix(*args)
  vertical_position = args[5]
  calculate_line_and_location(vertical_position)
end

#set_word_spacing(n) ⇒ Object

PDF operator Tw



111
112
113
# File 'lib/pdf/reader/text_receiver.rb', line 111

# PDF operator Tw: records +n+ as the word spacing of the current text state.
def set_word_spacing(n)
  current = @state.last
  current[:word_spacing] = n
end

#show_text(string) ⇒ Object

PDF operator Tj



143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/pdf/reader/text_receiver.rb', line 143

# PDF operator Tj: appends +string+ to the current output line.
#
# The line is created on first use. When the current :tj_adjustment is below
# -1000, one space per 900 units of its magnitude is inserted first.
def show_text(string)
  @output[@line] ||= ""
  place = @output[@line]

  adjustment = @state.last[:tj_adjustment]
  if adjustment < -1000
    place << " " * (adjustment.abs / 900)
  end
  place << string

  # Tells calculate_line_and_location that the page now has content.
  @written_to = true
end

#show_text_with_positioning(params) ⇒ Object

PDF operator TJ



191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/pdf/reader/text_receiver.rb', line 191

# PDF operator TJ: shows a sequence of strings interleaved with position
# adjustments.
#
# params - a list whose numeric elements set the :tj_adjustment used for the
#          strings that follow; every other element is passed to show_text.
#
# The adjustment that was active on entry is put back afterwards.
def show_text_with_positioning(params)
  prev_adjustment = @state.last[:tj_adjustment]

  params.each do |p|
    case p
    # Numeric covers Integer and Float. The Fixnum constant is gone from
    # Ruby 3.2 onwards, so naming it here raised NameError.
    when Numeric
      @state.last[:tj_adjustment] = p
    else
      show_text(p)
    end
  end

  @state.last[:tj_adjustment] = prev_adjustment
end

#super_show_text(string) ⇒ Object



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/pdf/reader/text_receiver.rb', line 155

# Alternative renderer: writes +string+ into a space-padded line of @output
# at a column/row derived from @tm, then updates @tm.
#
# NOTE(review): @tm, Matrix, TS_UNITS_PER_H_CHAR and TS_UNITS_PER_V_CHAR are
# not defined in the visible part of this file — confirm they exist before
# relying on this method. @tm is presumably a 3x3 Matrix (it is indexed with
# [row, col] and has a 3x3 Matrix added to it below) — TODO confirm.
def super_show_text (string)
  # Page width and height (top of @upper_corners) divided by the per-character
  # unit constants.
  urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
  ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

  # Column comes from @tm[2,0]; the row is @tm[2,1] subtracted from the
  # scaled page height.
  x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
  y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i
  
  #puts "rendering '#{string}' to #{x}x#{y}"

  # Fetch the row, creating it as a run of spaces as wide as the page.
  place = (@output[y] ||= (" " * urx.to_i))
  #puts "#{urx} #{place.size} #{string.size} #{x}"
  # Text that would reach or pass the right edge is dropped entirely.
  return if x+string.size >= urx

  string.split(//).each do |c|
    # Number of columns this character occupies.
    chars = 1

    case c
    when " "
      # A space is widened by the word spacing and written as blanks.
      chars += @state.last[:word_spacing].to_i
      place[x-1, chars] = (" " * chars)
    else
      # Other characters are widened by the character spacing and narrowed by
      # the TJ adjustment (in thousands), but never below one column.
      chars += @state.last[:char_spacing].to_i
      chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
      chars = 1 if chars < 1

      place[x-1] = c
      # Pad the rest of the character's width with spaces.
      place[x, chars-1] = (" " * (chars-1)) if chars > 1
    end

    x += chars
  end

  # NOTE(review): this adds the absolute x/y (scaled back up) to @tm rather
  # than the distance moved — confirm this is intended.
  @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
end