Class: PDF::Reader::TextReceiver

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader/text_receiver.rb

Overview

An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.

Usage:

receiver = PDF::Reader::TextReceiver.new($stdout)
PDF::Reader.file("somefile.pdf", receiver)

Instance Method Summary collapse

Constructor Details

#initialize(main_receiver) ⇒ TextReceiver

Initialize with the library user’s receiver



37
38
39
40
# File 'lib/pdf/reader/text_receiver.rb', line 37

# Creates a receiver that hands extracted text to +main_receiver+.
#
# main_receiver - the caller's sink; end_page appends each page's text to it
#                 using <<.
def initialize(main_receiver)
  @main_receiver = main_receiver
  # Stack of page-size hashes (:urx / :ury); filled by begin_page and
  # begin_page_container.
  @upper_corners = Array.new
end

Instance Method Details

#begin_document(root) ⇒ Object

Called when the document parsing begins



43
44
45
# File 'lib/pdf/reader/text_receiver.rb', line 43

# Called when the document parsing begins.
#
# root - passed by the parser; not used here.
#
# Starts the document with an empty page-size stack.
def begin_document(root)
  @upper_corners = Array.new
end

#begin_page(info) ⇒ Object

Called when new page parsing begins



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/pdf/reader/text_receiver.rb', line 61

# Called when new page parsing begins.
#
# info - the page dictionary; kept in @page and passed to media_box_check.
#
# Resets every piece of per-page bookkeeping: the text-state stack, the
# collected output lines and the line/offset tracking used by
# calculate_line_and_location.
def begin_page(info)
  @page = info

  # Text state in effect before any text operator has run on this page.
  initial_text_state = {
    :char_spacing  => 0,
    :word_spacing  => 0,
    :hori_scaling  => 100,
    :leading       => 0,
    :tj_adjustment => 0
  }
  @state = [initial_text_state]

  # Page size for this page, based on the entry currently on top of the stack.
  page_corners = media_box_check(info)
  @upper_corners.push(page_corners)

  @output       = []
  @displacement = {}
  @line         = 0
  @location     = 0
  # Lowest vertical offset seen so far starts at the page height.
  @smallest_y_loc = page_corners[:ury]
  @written_to = false
end

#begin_page_container(page) ⇒ Object



52
53
54
# File 'lib/pdf/reader/text_receiver.rb', line 52

# Pushes the page size computed by media_box_check for +page+ onto the
# @upper_corners stack; end_page_container pops it again.
#
# page - dictionary handed over by the parser for the container.
def begin_page_container(page)
  @upper_corners << media_box_check(page)
end

#begin_text_object ⇒ Object

PDF operator BT



89
90
91
# File 'lib/pdf/reader/text_receiver.rb', line 89

# PDF operator BT.
#
# Pushes a shallow copy of the current text state, so that changes made
# inside the text object are dropped when end_text_object pops the stack.
def begin_text_object
  snapshot = @state.last.dup
  @state.push(snapshot)
end

#calculate_line_and_location(new_loc) ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# File 'lib/pdf/reader/text_receiver.rb', line 229

# Maps a vertical offset to an output line number and records it as the
# current position.
#
# new_loc - vertical offset of the text that is about to be shown.
#
# Updates @displacement (offset => line), @smallest_y_loc, @location and
# @line. Returns the line number now in effect.
def calculate_line_and_location(new_loc)
  key = new_loc
  key.freeze

  if !@written_to
    # Nothing has been written on this page yet: this offset is line 0.
    @displacement[key] = 0
  elsif !@displacement.has_key?(key)
    if key < @location || key < @smallest_y_loc
      # Unseen offset below the current position (or below everything seen
      # so far): it becomes the line after the current one.
      @displacement[key] = @line + 1
    else
      # Otherwise snap to the largest known offset that is still below the
      # requested one.
      # NOTE(review): if no known offset is lower, key becomes nil here and
      # the comparison further down raises -- confirm this cannot happen.
      key = @displacement.keys.select { |known| key > known }.sort.last
      @displacement[key] = 0 unless @displacement.has_key?(key)
    end
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end

#end_document ⇒ Object

Called when the document parsing ends



48
49
50
# File 'lib/pdf/reader/text_receiver.rb', line 48

# Called when the document parsing ends.
#
# Empties the text-state stack. In the code visible here @state is only
# created by begin_page, so a document without any pages arrives with
# @state still unset; the guard keeps that case from raising NoMethodError.
def end_document
  @state.clear if @state
end

#end_page ⇒ Object

Called when page parsing ends



83
84
85
86
# File 'lib/pdf/reader/text_receiver.rb', line 83

# Called when page parsing ends.
#
# Joins the collected output lines with newlines, appends the result to the
# caller's receiver and drops this page's entry from the page-size stack.
def end_page
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end

#end_page_container ⇒ Object



56
57
58
# File 'lib/pdf/reader/text_receiver.rb', line 56

# Removes the page-size entry that begin_page_container pushed.
#
# Returns the removed entry, or nil when the stack was already empty.
def end_page_container()
  @upper_corners.pop
end

#end_text_object ⇒ Object

PDF operator ET



94
95
96
# File 'lib/pdf/reader/text_receiver.rb', line 94

# PDF operator ET.
#
# Discards the text state that begin_text_object pushed, restoring the one
# that was active before the text object started.
def end_text_object()
  @state.pop
end

#media_box_check(dict) ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
# File 'lib/pdf/reader/text_receiver.rb', line 217

# Works out the page size that applies to +dict+.
#
# dict - dictionary that may carry a 'MediaBox' entry; the entry is indexed
#        at positions 0..3.
#
# Starts from a copy of the size on top of @upper_corners (or 0x0 when the
# stack is empty) and overrides it with the MediaBox width and height when
# the dictionary has one. Returns a new hash with :urx and :ury; the stack
# itself is not modified.
def media_box_check(dict)
  inherited = @upper_corners.last || {:urx => 0, :ury => 0}
  corners = inherited.dup
  return corners unless dict.has_key?('MediaBox')

  box = dict['MediaBox']
  corners[:urx] = box[2] - box[0]
  corners[:ury] = box[3] - box[1]
  corners
end

#move_text_position(tx, ty) ⇒ Object

PDF operator Td



129
130
131
132
# File 'lib/pdf/reader/text_receiver.rb', line 129

# PDF operator Td.
#
# tx - horizontal offset; not used, only vertical movement is tracked.
# ty - vertical offset, added to the current location.
#
# Returns whatever calculate_line_and_location returns for the new offset.
def move_text_position(tx, ty)
  target = @location + ty
  calculate_line_and_location(target)
end

#move_text_position_and_set_leading(tx, ty) ⇒ Object

PDF operator TD



135
136
137
138
# File 'lib/pdf/reader/text_receiver.rb', line 135

# PDF operator TD.
#
# tx - horizontal offset, forwarded to move_text_position.
# ty - vertical offset; also stored as the text leading.
#
# The leading is stored as ty itself, not negated, and
# move_to_start_of_next_line adds the stored leading to the position.
# NOTE(review): the PDF specification describes TD as "-ty TL" and T* as
# "0 -Tl Td". The two unnegated uses cancel for TD followed by T*, but a
# leading set directly through TL would move the other way -- confirm.
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)
  move_text_position(tx, ty)
end

#move_to_next_line_and_show_text(string) ⇒ Object

PDF operator '



205
206
207
208
# File 'lib/pdf/reader/text_receiver.rb', line 205

# PDF operator ' (single quote).
#
# string - text to show.
#
# Moves to the start of the next line first, then shows the text there.
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line()
  show_text(string)
end

#move_to_start_of_next_line ⇒ Object

PDF operator T*



124
125
126
# File 'lib/pdf/reader/text_receiver.rb', line 124

# PDF operator T*.
#
# Moves vertically by the leading stored in the current text state; the
# leading is passed through as-is, without negation.
# NOTE(review): the PDF specification describes T* as "0 -Tl Td" -- confirm
# the sign convention against move_text_position_and_set_leading, which
# stores ty unnegated.
def move_to_start_of_next_line()
  leading = @state.last[:leading]
  move_text_position(0, leading)
end

#set_character_spacing(n) ⇒ Object

PDF operator Tc



104
105
106
# File 'lib/pdf/reader/text_receiver.rb', line 104

# PDF operator Tc.
#
# n - character spacing to record in the current text state.
#
# Returns n.
def set_character_spacing(n)
  text_state = @state.last
  text_state[:char_spacing] = n
end

#set_horizontal_text_scaling(n) ⇒ Object

PDF operator Tz



114
115
116
# File 'lib/pdf/reader/text_receiver.rb', line 114

# PDF operator Tz.
#
# n - horizontal scaling as a percentage (100 means unscaled).
#
# Stores the scaling as a ratio in the current text state. The division is
# done in floating point: with Integer operands `n/100` truncates, so 50
# would be stored as 0 and 150 as 1.
# NOTE(review): begin_page seeds :hori_scaling with 100 (a percentage) while
# this setter stores a ratio -- confirm which unit readers of the value
# expect.
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end

#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object

PDF operator "



211
212
213
214
215
# File 'lib/pdf/reader/text_receiver.rb', line 211

# PDF operator " (double quote).
#
# aw     - word spacing to set.
# ac     - character spacing to set.
# string - text to show on the next line.
#
# Applies both spacings first, then moves to the next line and shows the
# text.
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)

  move_to_next_line_and_show_text(string)
end

#set_text_leading(n) ⇒ Object

PDF operator TL



119
120
121
# File 'lib/pdf/reader/text_receiver.rb', line 119

# PDF operator TL.
#
# n - leading to record in the current text state; it is read back by
#     move_to_start_of_next_line.
#
# Returns n.
def set_text_leading(n)
  text_state = @state.last
  text_state[:leading] = n
end

#set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) ⇒ Object

PDF operator Tm



99
100
101
# File 'lib/pdf/reader/text_receiver.rb', line 99

# PDF operator Tm.
#
# a, b, c, d, e - matrix operands; ignored by this receiver.
# f             - last matrix operand; used as the new vertical offset.
#
# Only the vertical offset is tracked, so just f is handed to
# calculate_line_and_location.
def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f)
  vertical_offset = f
  calculate_line_and_location(vertical_offset)
end

#set_word_spacing(n) ⇒ Object

PDF operator Tw



109
110
111
# File 'lib/pdf/reader/text_receiver.rb', line 109

# PDF operator Tw.
#
# n - word spacing to record in the current text state.
#
# Returns n.
def set_word_spacing(n)
  text_state = @state.last
  text_state[:word_spacing] = n
end

#show_text(string) ⇒ Object

PDF operator Tj



141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/pdf/reader/text_receiver.rb', line 141

# PDF operator Tj.
#
# string - text to append to the current output line.
#
# Appends to the buffer for @line (created on first use). When the current
# TJ adjustment is below -1000, one space per 900 units of adjustment is
# inserted before the text. Marks the page as written to.
def show_text(string)
  line_buffer = (@output[@line] ||= "")

  adjustment = @state.last[:tj_adjustment]
  if adjustment < -1000
    line_buffer << " " * (adjustment.abs / 900)
  end

  line_buffer << string
  @written_to = true
end

#show_text_with_positioning(params) ⇒ Object

PDF operator TJ



189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/pdf/reader/text_receiver.rb', line 189

# PDF operator TJ.
#
# params - array mixing strings to show with numeric position adjustments.
#
# Every numeric element becomes the TJ adjustment for the strings that
# follow it; every other element is passed to show_text. The adjustment that
# was active before the call is put back afterwards.
def show_text_with_positioning(params)
  prev_adjustment = @state.last[:tj_adjustment]

  params.each do |p|
    case p
    when Numeric
      # Match Integer as well as Float: an integer adjustment would otherwise
      # reach show_text, where String#<< appends it as a codepoint character.
      @state.last[:tj_adjustment] = p
    else
      show_text(p)
    end
  end

  @state.last[:tj_adjustment] = prev_adjustment
end

#super_show_text(string) ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/pdf/reader/text_receiver.rb', line 153

# Writes +string+ into a character grid held in @output, at a cell derived
# from the text matrix @tm, widening characters according to the word
# spacing, character spacing and TJ adjustment of the current text state.
#
# string - text to place.
#
# NOTE(review): @tm, Matrix and the TS_UNITS_PER_H_CHAR / TS_UNITS_PER_V_CHAR
# constants are not defined in the part of the file visible here; @tm is
# presumably a 3x3 Matrix -- confirm before relying on this method.
def super_show_text (string)
  # Page width and height converted from text-space units to character cells.
  urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
  ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

  # Column and row of the first character; the row is measured down from ury.
  x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
  y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i
  
  #puts "rendering '#{string}' to #{x}x#{y}"

  # Row buffer, created on first use as a run of spaces one page wide.
  place = (@output[y] ||= (" " * urx.to_i))
  #puts "#{urx} #{place.size} #{string.size} #{x}"
  # Text that would reach the right edge is dropped entirely.
  return if x+string.size >= urx

  string.split(//).each do |c|
    # Number of cells this character occupies.
    chars = 1

    case c
    when " "
      # A space is widened by the word spacing and written as blanks.
      chars += @state.last[:word_spacing].to_i
      place[x-1, chars] = (" " * chars)
    else
      # Other characters are widened by the character spacing and adjusted by
      # the TJ adjustment (in thousandths), but never take less than one cell.
      chars += @state.last[:char_spacing].to_i
      chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
      chars = 1 if chars < 1

      # NOTE(review): with x == 0 the index x-1 is -1, which addresses the last
      # cell of the row -- confirm x is always at least 1 here.
      place[x-1] = c
      place[x, chars-1] = (" " * (chars-1)) if chars > 1
    end

    x += chars
  end

  # NOTE(review): x and y are absolute cell positions, so this adds the
  # absolute end position to @tm rather than the distance advanced -- confirm
  # this is intended.
  @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
end