Class: PDF::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pdf/reader.rb,
lib/pdf/reader/lzw.rb,
lib/pdf/reader/cmap.rb,
lib/pdf/reader/font.rb,
lib/pdf/reader/page.rb,
lib/pdf/reader/xref.rb,
lib/pdf/reader/error.rb,
lib/pdf/reader/token.rb,
lib/pdf/reader/buffer.rb,
lib/pdf/reader/filter.rb,
lib/pdf/reader/parser.rb,
lib/pdf/reader/stream.rb,
lib/pdf/reader/encoding.rb,
lib/pdf/reader/reference.rb,
lib/pdf/reader/glyph_hash.rb,
lib/pdf/reader/page_state.rb,
lib/pdf/reader/object_hash.rb,
lib/pdf/reader/form_xobject.rb,
lib/pdf/reader/object_cache.rb,
lib/pdf/reader/object_stream.rb,
lib/pdf/reader/text_receiver.rb,
lib/pdf/reader/pages_strategy.rb,
lib/pdf/reader/print_receiver.rb,
lib/pdf/reader/resource_methods.rb,
lib/pdf/reader/abstract_strategy.rb,
lib/pdf/reader/metadata_strategy.rb,
lib/pdf/reader/register_receiver.rb,
lib/pdf/reader/page_text_receiver.rb,
lib/pdf/reader/standard_security_handler.rb

Overview

Copyright © 2010 James Healy ([email protected])

Defined Under Namespace

Modules: ResourceMethods Classes: AbstractStrategy, Buffer, CMap, Encoding, EncryptedPDFError, Error, Filter, Font, FormXObject, GlyphHash, InvalidObjectError, LZW, MalformedPDFError, MetadataStrategy, ObjectCache, ObjectHash, ObjectStream, Page, PageState, PageTextReceiver, PagesStrategy, Parser, PrintReceiver, Reference, RegisterReceiver, StandardSecurityHandler, Stream, TextReceiver, Token, UnsupportedFeatureError, XRef

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = nil, opts = {}) ⇒ Reader

creates a new document reader for the provided PDF.

input can be an IO-ish object (StringIO, File, etc) containing a PDF or a filename

reader = PDF::Reader.new("somefile.pdf")

File.open("somefile.pdf","rb") do |file|
  reader = PDF::Reader.new(file)
end

If the source file is encrypted you can provide a password for decrypting

reader = PDF::Reader.new("somefile.pdf", :password => "apples")


114
115
116
117
118
# File 'lib/pdf/reader.rb', line 114

# Creates a new document reader for the provided PDF.
#
# input - an IO-ish object (StringIO, File, etc) containing a PDF, or a
#         filename. May be omitted.
# opts  - options hash handed unchanged to PDF::Reader::ObjectHash
#         (for example :password => "apples").
#
# When input is nil or false no object hash is created and @objects is left
# unset; this keeps the deprecated Reader API (which builds a reader with no
# arguments) working.
def initialize(input = nil, opts = {})
  @objects = PDF::Reader::ObjectHash.new(input, opts) if input
end

Instance Attribute Details

#objects ⇒ Object (readonly)

lowlevel hash-like access to all objects in the underlying PDF



97
98
99
# File 'lib/pdf/reader.rb', line 97

# Returns the value held in @objects, which gives low-level, hash-like
# access to all objects in the underlying PDF. Read-only: no writer is
# defined here.
def objects
  @objects
end

Class Method Details

.file(name, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the file with the given name, sending events to the given receiver.



168
169
170
171
172
# File 'lib/pdf/reader.rb', line 168

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Opens the file called name in binary read mode, builds a reader with no
# input and asks it to parse the open file, sending events to receivers.
# opts is passed through to #parse. Returns whatever #parse returns.
def self.file(name, receivers, opts = {})
  File.open(name, "rb") { |io| new.parse(io, receivers, opts) }
end

.object_file(name, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the file with the given name, returning an unmarshalled Ruby representation of the requested PDF object.



191
192
193
194
195
# File 'lib/pdf/reader.rb', line 191

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Opens the file called name in binary read mode and returns the result of
# #object for the requested object. id and gen are coerced with to_i before
# being handed on; gen defaults to 0.
def self.object_file(name, id, gen = 0)
  File.open(name, "rb") do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.object_string(str, id, gen = 0) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the given string, returning an unmarshalled Ruby representation of the requested PDF object.



203
204
205
206
207
# File 'lib/pdf/reader.rb', line 203

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Wraps str in a StringIO and returns the result of #object for the
# requested object. id and gen are coerced with to_i before being handed
# on; gen defaults to 0.
def self.object_string(str, id, gen = 0)
  StringIO.open(str) do |io|
    new.object(io, id.to_i, gen.to_i)
  end
end

.open(input, opts = {}) {|PDF::Reader.new(input, opts)| ... } ⇒ Object

syntactic sugar for opening a PDF file. Accepts the same arguments as new().

PDF::Reader.open("somefile.pdf") do |reader|
  puts reader.pdf_version
end

or

PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
  puts reader.pdf_version
end

Yields:



158
159
160
# File 'lib/pdf/reader.rb', line 158

# Syntactic sugar for opening a PDF. Accepts the same arguments as new(),
# builds a PDF::Reader from them and yields it to the given block.
# Returns the value of the block.
def self.open(input, opts = {}, &block)
  reader = PDF::Reader.new(input, opts)
  yield reader
end

.string(str, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Parse the given string, sending events to the given receiver.



179
180
181
182
183
# File 'lib/pdf/reader.rb', line 179

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Wraps str in a StringIO, builds a reader with no input and asks it to
# parse the data, sending events to receivers. opts is passed through to
# #parse. Returns whatever #parse returns.
def self.string(str, receivers, opts = {})
  StringIO.open(str) { |io| new.parse(io, receivers, opts) }
end

Instance Method Details

#info ⇒ Object



120
121
122
123
# File 'lib/pdf/reader.rb', line 120

# Looks up the :Info entry of the trailer, dereferences it through
# @objects and returns the result after passing it through
# doc_strings_to_utf8.
def info
  info_ref = @objects.trailer[:Info]
  doc_strings_to_utf8(@objects.deref(info_ref))
end

#metadata ⇒ Object



125
126
127
128
129
130
131
132
133
134
# File 'lib/pdf/reader.rb', line 125

# Returns the document's metadata stream as a string, or nil when the
# document root has no :Metadata entry (or it dereferences to nil).
#
# The unfiltered stream data is tagged as UTF-8 in place when the string
# supports force_encoding; the respond_to? check keeps this working on
# rubies whose String has no encoding support.
def metadata
  stream = @objects.deref(root[:Metadata])
  return nil if stream.nil?

  xml = stream.unfiltered_data
  xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
  xml
end

#object(io, id, gen) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Given an IO object that contains PDF data, return the contents of a single object



271
272
273
274
275
# File 'lib/pdf/reader.rb', line 271

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Given an IO object that contains PDF data, returns the contents of a
# single object. A fresh ObjectHash is built from io and stored in
# @objects (replacing any previous one); the object identified by id and
# gen is then dereferenced from it.
def object(io, id, gen)
  @objects = ObjectHash.new(io)
  reference = Reference.new(id, gen)
  @objects.deref(reference)
end

#page(num) ⇒ Object

returns a single PDF::Reader::Page for the specified page. Use this instead of pages method when you need to access just a single page

reader = PDF::Reader.new("somefile.pdf")
page   = reader.page(10)

puts page.text

See the docs for PDF::Reader::Page to read more about the methods available on each page

Raises:

  • (ArgumentError)


241
242
243
244
245
# File 'lib/pdf/reader.rb', line 241

# Returns a single PDF::Reader::Page for the requested page number. Use
# this instead of #pages when only one page is needed.
#
# num is coerced with to_i. Raises ArgumentError when the resulting number
# is below 1 or above page_count.
def page(num)
  index = num.to_i
  if index < 1 || index > page_count
    raise ArgumentError, "valid pages are 1 .. #{page_count}"
  end

  PDF::Reader::Page.new(@objects, index)
end

#page_count ⇒ Object



136
137
138
139
# File 'lib/pdf/reader.rb', line 136

# Returns the page count recorded in the document: the :Count entry of the
# node referenced by the root's :Pages entry, dereferenced through
# @objects.
#
# The result is memoized in @page_count. The :Pages lookup lives inside the
# memoized block so repeat calls do not dereference the pages node again
# once a truthy count has been cached.
def page_count
  @page_count ||= begin
    pages = @objects.deref(root[:Pages])
    @objects.deref(pages[:Count])
  end
end

#pages ⇒ Object

returns an array of PDF::Reader::Page objects, one for each page in the source PDF.

reader = PDF::Reader.new("somefile.pdf")

reader.pages.each do |page|
  puts page.fonts
  puts page.images
  puts page.text
end

See the docs for PDF::Reader::Page to read more about the methods available on each page



223
224
225
226
227
# File 'lib/pdf/reader.rb', line 223

# Returns an array of PDF::Reader::Page objects, one for each page number
# from 1 up to page_count, each built from @objects.
def pages
  page_numbers = 1..page_count
  page_numbers.map do |number|
    PDF::Reader::Page.new(@objects, number)
  end
end

#parse(io, receivers, opts = {}) ⇒ Object

DEPRECATED: this method was deprecated in version 1.0.0 and will

eventually be removed

Given an IO object that contains PDF data, parse it.



253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/pdf/reader.rb', line 253

# DEPRECATED: this method was deprecated in version 1.0.0 and will
# eventually be removed.
#
# Given an IO object that contains PDF data, parse it. An ObjectHash is
# built from io, then every class returned by strategies is instantiated
# with that hash, the receivers and the merged options, and its #process
# method is run.
#
# opts is merged over the defaults :pages => true, :raw_text => false,
# :metadata => true. Returns self.
def parse(io, receivers, opts = {})
  defaults = {:pages => true, :raw_text => false, :metadata => true}
  options  = defaults.merge(opts)
  ohash    = ObjectHash.new(io)

  strategies.each { |strategy| strategy.new(ohash, receivers, options).process }

  self
end

#pdf_version ⇒ Object



141
142
143
# File 'lib/pdf/reader.rb', line 141

# Returns whatever @objects reports as its pdf_version (plain delegation).
def pdf_version
  @objects.pdf_version
end