Class: QuickAndRuby::Pdf::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/quick_and_ruby/pdf/document.rb

Constant Summary collapse

DEFAULT_SCHEMES =
%w[http https].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(filepath) ⇒ Document

Returns a new instance of Document.



13
14
15
# File 'lib/quick_and_ruby/pdf/document.rb', line 13

# Stores the given location on the instance. Nothing is opened or parsed here;
# the value is only read back later through #filepath.
#
# @param filepath [Object] value later handed to PDF::Reader.new by #doc_reader
#   (presumably a path String — confirm against callers)
def initialize(filepath)
  @filepath = filepath
end

Instance Attribute Details

#filepath ⇒ Object (readonly)

Returns the value of attribute filepath.



11
12
13
# File 'lib/quick_and_ruby/pdf/document.rb', line 11

# Read-only accessor for the value supplied to the constructor.
#
# @return [Object] the stored @filepath, exactly as it was passed to #initialize
def filepath
  @filepath
end

Instance Method Details

#doc_reader ⇒ Object



37
38
39
# File 'lib/quick_and_ruby/pdf/document.rb', line 37

# Lazily builds the PDF::Reader for #filepath and caches it on the instance,
# so repeated calls hand back the same reader object.
#
# @return [PDF::Reader] the memoized reader
def doc_reader
  return @doc_reader if @doc_reader

  @doc_reader = PDF::Reader.new(filepath)
end


#extract_links(schemes: DEFAULT_SCHEMES) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/quick_and_ruby/pdf/document.rb', line 17

# Collects the URLs found in the document, taken from two places on every
# page: the extracted page text and the URI actions of the page annotations.
#
# @param schemes [Array<String>] URI schemes to keep; forwarded to URI.extract
# @return [Array<String>] the links, de-duplicated, in order of first appearance
def extract_links(schemes: DEFAULT_SCHEMES)
  links = []

  doc_reader.pages.each do |page|
    # concat appends in place instead of rebuilding the array each iteration.
    links.concat(URI.extract(page.text, schemes))

    annotation_uris(page).each do |target_uri|
      links.concat(URI.extract(target_uri, schemes))
    end
  end

  links.uniq
end

# Returns the String targets of the URI actions attached to a page's annotations.
#
# The :Annots entry, each annotation, its :A action dictionary and the :URI
# value may each be stored as an indirect reference, so every level goes
# through objects.deref before it is inspected. Entries that do not resolve
# to the expected type are skipped rather than raising.
#
# @param page [PDF::Reader::Page] page whose annotations are inspected
# @return [Array<String>] raw URI action targets; empty when there are none
def annotation_uris(page)
  objects = doc_reader.objects
  annots = objects.deref(page.attributes[:Annots])
  return [] unless annots.is_a?(Array)

  annots.each_with_object([]) do |annot_ref, uris|
    annot = objects.deref(annot_ref)
    next unless annot.is_a?(Hash)

    action = objects.deref(annot[:A])
    next unless action.is_a?(Hash)

    target_uri = objects.deref(action[:URI])
    uris << target_uri if target_uri.is_a?(String)
  end
end
private :annotation_uris