Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/text_run.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/cid_widths.rb,
lib/pdf/reader/filter/lzw.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/page_state.rb,
lib/pdf/reader/filter/null.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/page_layout.rb,
lib/pdf/reader/filter/flate.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/filter/ascii85.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/font_descriptor.rb,
lib/pdf/reader/filter/ascii_hex.rb,
lib/pdf/reader/filter/depredict.rb,
lib/pdf/reader/resource_methods.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/filter/run_length.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/synchronized_cache.rb,
lib/pdf/reader/orientation_detector.rb,
lib/pdf/reader/transformation_matrix.rb,
lib/pdf/reader/standard_security_handler.rb,
lib/pdf/reader/width_calculator/built_in.rb,
lib/pdf/reader/width_calculator/composite.rb,
lib/pdf/reader/width_calculator/true_type.rb,
lib/pdf/reader/width_calculator/type_zero.rb,
lib/pdf/reader/width_calculator/type_one_or_three.rb

Overview

Copyright © 2010 James Healy ([email protected])

Defined Under Namespace

Modules: Filter, ResourceMethods, WidthCalculator

Classes: AbstractStrategy, Buffer, CMap, CidWidths, Encoding, EncryptedPDFError, Error, Font, FontDescriptor, FormXObject, GlyphHash, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, OrientationDetector, Page, PageLayout, PageState, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, StandardSecurityHandler, Stream, SynchronizedCache, TextReceiver, TextRun, Token, TransformationMatrix, UnsupportedFeatureError, XRef

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil, opts = {}) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end

If the source file is encrypted you can provide a password for decrypting

reader = PDF::Reader.new("somefile.pdf", :password => "apples")


113
114
115
116
117
118
119
120
121
122
123
# File 'lib/pdf/reader.rb', line 113

# Creates a new document reader for the provided PDF.
#
# input - an IO-ish object (StringIO, File, etc) containing a PDF, or a
#         filename. It is handed unchanged to PDF::Reader::ObjectHash.new.
#         When it is nil (or false) nothing is loaded and a deprecation
#         warning is written to $stderr instead.
# opts  - options hash (e.g. :password => "apples") forwarded to
#         PDF::Reader::ObjectHash.new together with a :cache entry that
#         holds this reader's ObjectCache.
#
# The caller's opts hash is never modified, so frozen or shared option
# hashes can be passed safely.
def initialize(input = nil, opts = {})
  if input
    @cache   = PDF::Reader::ObjectCache.new
    # Non-destructive merge: build a new hash rather than writing :cache
    # into the hash the caller handed us.
    @objects = PDF::Reader::ObjectHash.new(input, opts.merge(:cache => @cache))
  else # support the deprecated Reader API
    msg  = "Calling PDF::Reader#new with no arguments is deprecated and will be removed "
    msg += "in the 2.0 release"
    $stderr.puts(msg)
  end
end

Instance Attribute Details

#objects ⇒ Object (readonly)

lowlevel hash-like access to all objects in the underlying PDF



96
97
98
# File 'lib/pdf/reader.rb', line 96

# Low-level, hash-like access to all objects in the underlying PDF.
#
# Returns whatever is currently stored in @objects. The constructor assigns
# a PDF::Reader::ObjectHash there when it is given an input; when the reader
# was built through the deprecated no-argument form, @objects is only set
# once the deprecated #object method has been called.
def objects
  @objects
end

Class Method Details

.file(name, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the file with the given name, sending events to the given receiver.



173
174
175
176
177
178
179
# File 'lib/pdf/reader.rb', line 173

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Opens the file called +name+ in binary mode, builds a reader with the
# no-argument constructor and asks it to parse the open file, passing
# +receivers+ and +opts+ through unchanged. The file is closed when the
# block finishes; the result of #parse is returned.
def self.file(name, receivers, opts = {})
  $stderr.puts("PDF::Reader#file is deprecated and will be removed in the 2.0 release")
  File.open(name, "rb") { |handle| new.parse(handle, receivers, opts) }
end

.object_file(name, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the file with the given name, returning an unmarshalled Ruby representation of the requested PDF object.



200
201
202
203
204
205
206
# File 'lib/pdf/reader.rb', line 200

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Opens the file called +name+ in binary mode and asks a reader built with
# the no-argument constructor for the object identified by +id+ and +gen+.
# Both identifiers are coerced with to_i before being passed on.
def self.object_file(name, id, gen = 0)
  $stderr.puts("PDF::Reader#object_file is deprecated and will be removed in the 2.0 release")
  File.open(name, "rb") do |handle|
    new.object(handle, id.to_i, gen.to_i)
  end
end

.object_string(str, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the given string, returning an unmarshalled Ruby representation of the requested PDF object.



214
215
216
217
218
219
220
# File 'lib/pdf/reader.rb', line 214

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Wraps +str+ in a StringIO and asks a reader built with the no-argument
# constructor for the object identified by +id+ and +gen+. Both identifiers
# are coerced with to_i before being passed on.
def self.object_string(str, id, gen = 0)
  $stderr.puts("PDF::Reader#object_string is deprecated and will be removed in the 2.0 release")
  StringIO.open(str) do |buffer|
    new.object(buffer, id.to_i, gen.to_i)
  end
end

.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object

syntactic sugar for opening a PDF file. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

or

PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
  puts reader.pdf_version
end

Yields:



163
164
165
# File 'lib/pdf/reader.rb', line 163

# Syntactic sugar for opening a PDF. Accepts the same arguments as new(),
# builds the reader first and then yields it to the given block. The value
# of the block is returned.
def self.open(input, opts = {}, &block)
  reader = PDF::Reader.new(input, opts)
  yield reader
end

.string(str, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Parse the given string, sending events to the given receiver.



186
187
188
189
190
191
192
# File 'lib/pdf/reader.rb', line 186

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Wraps +str+ in a StringIO, builds a reader with the no-argument
# constructor and asks it to parse that buffer, passing +receivers+ and
# +opts+ through unchanged. Returns the result of #parse.
def self.string(str, receivers, opts = {})
  $stderr.puts("PDF::Reader#string is deprecated and will be removed in the 2.0 release")
  StringIO.open(str) { |buffer| new.parse(buffer, receivers, opts) }
end

Instance Method Details

#info ⇒ Object



125
126
127
128
# File 'lib/pdf/reader.rb', line 125

# Looks up the :Info entry of the PDF trailer, dereferences it through the
# object hash and returns the result of passing that value through
# doc_strings_to_utf8.
def info
  info_ref = @objects.trailer[:Info]
  doc_strings_to_utf8(@objects.deref(info_ref))
end

#metadata ⇒ Object



130
131
132
133
134
135
136
137
138
139
# File 'lib/pdf/reader.rb', line 130

# Returns the document's metadata stream as a string, or nil when the
# document root has no :Metadata entry (or it dereferences to nil).
#
# The unfiltered stream data is tagged as UTF-8 in place when the string
# supports force_encoding; the bytes themselves are not transcoded.
def metadata
  stream = @objects.deref(root[:Metadata])
  return nil if stream.nil?

  xml = stream.unfiltered_data
  # Guarded so the method keeps working on strings without encoding support.
  xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
  xml
end

#object(io, id, gen) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Given an IO object that contains PDF data, return the contents of a single object



288
289
290
291
292
293
294
# File 'lib/pdf/reader.rb', line 288

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Replaces this reader's object hash with a fresh one built from +io+, then
# returns the dereferenced object identified by +id+ and +gen+.
def object(io, id, gen)
  $stderr.puts("PDF::Reader#object is deprecated and will be removed in the 2.0 release")

  @objects  = ObjectHash.new(io)
  reference = Reference.new(id, gen)
  @objects.deref(reference)
end

#page(num) ⇒ Object

returns a single PDF::Reader::Page for the specified page. Use this instead of pages method when you need to access just a single page

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page



254
255
256
257
258
259
260
# File 'lib/pdf/reader.rb', line 254

# Returns a single PDF::Reader::Page for the requested page number. Use this
# instead of #pages when only one page is needed.
#
# num - the 1-based page number; it is coerced with to_i first.
#
# Raises ArgumentError when the number is below 1 or above page_count.
def page(num)
  number   = num.to_i
  in_range = number >= 1 && number <= self.page_count
  raise ArgumentError, "valid pages are 1 .. #{self.page_count}" unless in_range

  PDF::Reader::Page.new(@objects, number, :cache => @cache)
end

#page_count ⇒ Object



141
142
143
144
# File 'lib/pdf/reader.rb', line 141

# Returns the page count recorded in the :Count entry of the document's
# root :Pages object.
#
# The value is memoized in @page_count. The :Pages lookup happens inside
# the memoized expression, so once a truthy count is cached later calls do
# no dereferencing at all.
def page_count
  @page_count ||= begin
    pages = @objects.deref(root[:Pages])
    @objects.deref(pages[:Count])
  end
end

#pages ⇒ Object

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.images
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page



236
237
238
239
240
# File 'lib/pdf/reader.rb', line 236

# Returns an array of PDF::Reader::Page objects, one for each page number
# from 1 up to page_count, all sharing this reader's object hash and cache.
def pages
  page_numbers = 1..self.page_count
  page_numbers.map do |number|
    PDF::Reader::Page.new(@objects, number, :cache => @cache)
  end
end

#parse(io, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will eventually be removed.

Given an IO object that contains PDF data, parse it.



268
269
270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/pdf/reader.rb', line 268

# DEPRECATED: prints a deprecation warning to $stderr on every call.
#
# Builds an object hash from +io+ and runs every class returned by
# +strategies+ over it: each is instantiated with the object hash, the
# receivers and the effective options, then asked to process.
#
# opts - overrides for the defaults :pages => true, :raw_text => false and
#        :metadata => true. The caller's hash is left untouched.
#
# Returns self.
def parse(io, receivers, opts = {})
  $stderr.puts("PDF::Reader#parse is deprecated and will be removed in the 2.0 release")

  ohash    = ObjectHash.new(io)
  defaults = {:pages => true, :raw_text => false, :metadata => true}
  options  = defaults.merge(opts)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end

#pdf_version ⇒ Object



146
147
148
# File 'lib/pdf/reader.rb', line 146

# Returns the PDF version of the loaded document by delegating to the
# underlying object hash's pdf_version.
#
# NOTE(review): @objects is only assigned when the reader was constructed
# with an input (or after the deprecated #object call), so this presumably
# raises NoMethodError on a reader built with no arguments — confirm.
def pdf_version
  @objects.pdf_version
end