Class: Pismo::Document

Inherits:
Object
  • Object
show all
Includes:
ExternalAttributes, InternalAttributes
Defined in:
lib/pismo/document.rb

Overview

Pismo::Document represents a single HTML document within Pismo

Constant Summary collapse

ATTRIBUTE_METHODS =
InternalAttributes.instance_methods + ExternalAttributes.instance_methods

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from InternalAttributes

#author, #authors, #body, #datetime, #description, #favicon, #feed, #feeds, #html_body, #html_title, #images, #keywords, #lede, #ledes, #reader_doc, #sentences, #tags, #title, #titles

Constructor Details

#initialize(handle, options = {}) ⇒ Document

Returns a new instance of Document.



15
16
17
18
19
# File 'lib/pismo/document.rb', line 15

# Builds a document from +handle+ and immediately loads it.
#
# @param handle [Object] passed straight through to #load
# @param options [Hash] settings for this document; the :url entry, when
#   present, is pulled out and handed to #load as the document URL
# @return [Document]
def initialize(handle, options = {})
  # Work on a copy: deleting :url from the caller's own hash would silently
  # alter it (and would raise outright if that hash were frozen).
  @options = options.dup
  url = @options.delete(:url)
  load(handle, url)
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



8
9
10
# File 'lib/pismo/document.rb', line 8

# Reader for the @doc instance variable.
#
# @return [Object] the current value of @doc
def doc; @doc; end

#options ⇒ Object (readonly)

Returns the value of attribute options.



8
9
10
# File 'lib/pismo/document.rb', line 8

# Reader for the @options instance variable.
#
# @return [Object] the current value of @options
def options; @options; end

#url ⇒ Object (readonly)

Returns the value of attribute url.



8
9
10
# File 'lib/pismo/document.rb', line 8

# Reader for the @url instance variable.
#
# @return [Object] the current value of @url
def url; @url; end

Class Method Details

.clean_html(html) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/pismo/document.rb', line 47

# Normalizes typographic characters and a few escaped entities in +html+
# to plain ASCII equivalents.
#
# The string is edited in place and returned, unless it is frozen, in which
# case a cleaned copy is returned instead of raising.
#
# @param html [String] markup to normalize
# @return [String] the normalized markup
def self.clean_html(html)
  # The non-ASCII characters are spelled as escapes because several of them
  # are invisible or indistinguishable from an ordinary space in an editor.
  # TODO: Optimize this so we don't need all these sequential gsubs
  substitutions = [
    ["\u2002", ' '],   # en space
    ["\u2003", ' '],   # em space
    ["\u00A0", ' '],   # no-break space
    ["\u2013", '-'],   # en dash
    ["\u2018", "'"],   # left single quotation mark
    ["\u2019", "'"],   # right single quotation mark
    ["\u201C", '"'],   # left double quotation mark
    ["\u201D", '"'],   # right double quotation mark
    ["\u2026", '...'], # horizontal ellipsis
    ['&lt;', '<'],
    ['&gt;', '>'],
    ['&amp;', '&']     # kept last so "&amp;lt;" is unescaped one level only
  ]

  # gsub! raises on a frozen receiver, so switch to a private copy then.
  html = html.dup if html.frozen?

  # Plain String patterns (not one combined Regexp) are applied in order,
  # exactly as the individual substitutions were before.
  substitutions.each { |from, to| html.gsub!(from, to) }
  html
end

Instance Method Details

#html ⇒ Object

An HTML representation of the document



22
23
24
# File 'lib/pismo/document.rb', line 22

# Serializes the parsed document held in @doc back into a string.
#
# @return [String] the result of calling to_s on @doc
def html
  parsed = @doc
  parsed.to_s
end

#load(handle, url = nil) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/pismo/document.rb', line 26

# Reads markup from +handle+, normalizes it with clean_html and parses it.
#
# @param handle [String, #read] one of:
#   * a String starting with http:// or https:// (any case) - fetched remotely
#   * any object responding to +read+ (IO, StringIO, Tempfile, ...) - read
#   * anything else - used as the markup itself
# @param url [String, nil] URL to record in @url; when +handle+ is itself a
#   URL, the handle takes precedence
# @return [Object] the result of Nokogiri::HTML, which is also stored in @doc
def load(handle, url = nil)
  # Only a String can be a URL. The type check comes first because IO-like
  # handles do not implement =~ once Object#=~ is gone (Ruby 3.2+).
  # Requiring the "://" keeps markup that merely starts with "http" from
  # being mistaken for an address.
  remote = handle.is_a?(String) && !(handle =~ %r{\Ahttps?://}i).nil?

  @url = url if url
  @url = handle if remote

  @html = if remote
            # Loaded lazily: only the URL branch needs it.
            require 'open-uri'
            # Kernel#open no longer fetches URLs on Ruby 3.0+, so prefer
            # URI.open and fall back to Kernel#open where URI.open does not
            # exist. The block form closes the stream after reading.
            if URI.respond_to?(:open)
              URI.open(handle, &:read)
            else
              open(handle, &:read)
            end
          elsif handle.respond_to?(:read)
            handle.read
          else
            # clean_html edits its argument in place; copy so the caller's
            # string is left alone (and so frozen literals are accepted).
            handle.dup
          end

  @html = self.class.clean_html(@html)

  @doc = Nokogiri::HTML(@html)
end

#match(args = [], all = false) ⇒ Object



43
44
45
# File 'lib/pismo/document.rb', line 43

# Forwards a match query to the parsed document in @doc.
#
# @param args [Object] one query or a list of them; always handed on
#   as a fresh array
# @param all [Boolean] passed through unchanged as the second argument
# @return [Object] whatever @doc.match returns
def match(args = [], all = false)
  queries = [*args]
  @doc.match(queries, all)
end