Class: WebPageParser::BaseParser

Inherits:
Object
  • Object
show all
Defined in:
lib/web-page-parser/base_parser.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = { }) ⇒ BaseParser

Takes a hash of options. The :url option passes the page URL, the :page option passes the raw HTML page content for parsing, and the :guid option passes a pre-set guid for the page.



19
20
21
22
23
# File 'lib/web-page-parser/base_parser.rb', line 19

# Builds a parser from an options hash.
#
# @param options [Hash] recognised keys are :url, :page and :guid; each
#   value is stored in the instance variable of the same name, and a
#   missing key leaves that variable nil
def initialize(options = { })
  # values_at returns the values in the order the keys are listed
  @url, @page, @guid = options.values_at(:url, :page, :guid)
end

Class Attribute Details

.retrieve_session ⇒ Object

Returns the value of attribute retrieve_session.



12
13
14
# File 'lib/web-page-parser/base_parser.rb', line 12

# Class-level attribute reader.
#
# @return [Object, nil] the current value of @retrieve_session on the
#   receiver, or nil if it has not been assigned
def retrieve_session
  @retrieve_session
end

Instance Attribute Details

#url ⇒ Object (readonly)

Returns the value of attribute url.



15
16
17
# File 'lib/web-page-parser/base_parser.rb', line 15

# Read-only attribute reader.
#
# @return [Object, nil] the current value of @url, or nil if it has not
#   been assigned
def url
  @url
end

Instance Method Details

#content ⇒ Object



43
44
45
# File 'lib/web-page-parser/base_parser.rb', line 43

# The parsed content of the page.
#
# @return [Object] @content when it is set (truthy); otherwise a new
#   empty Array, so callers can always treat the result as a collection
def content
  return @content if @content

  []
end

#date ⇒ Object



47
48
# File 'lib/web-page-parser/base_parser.rb', line 47

# Placeholder with no behaviour at this level.
# NOTE(review): presumably subclasses override this to return the page's
# date - confirm against the concrete parsers.
#
# @return [nil] always nil here
def date
  nil
end

#guid ⇒ Object



53
54
55
56
57
# File 'lib/web-page-parser/base_parser.rb', line 53

# The guid for this page, memoized in @guid.
#
# If @guid is already set it is returned unchanged. Otherwise, when a url
# is available, the result of guid_from_url is stored in @guid first.
#
# @return [Object, nil] the stored guid; nil/false when nothing is set and
#   no url is available (or guid_from_url produced nothing)
def guid
  # Check @guid first so url is only consulted when a guid is missing.
  @guid = guid_from_url if !@guid && url
  @guid
end

#guid_from_url ⇒ Object



50
51
# File 'lib/web-page-parser/base_parser.rb', line 50

# Placeholder with no behaviour at this level.
# NOTE(review): presumably subclasses override this to derive a guid from
# the url - confirm against the concrete parsers.
#
# @return [nil] always nil here
def guid_from_url
  nil
end

#hash ⇒ Object

Returns an MD5 hex digest of the title and content, representing the textual content of this web page.



60
61
62
63
64
65
# File 'lib/web-page-parser/base_parser.rb', line 60

# Fingerprint of the page's textual content.
#
# Feeds the title (as a string) and then the content elements joined with
# no separator into a single MD5 digest.
#
# @return [String] the hexadecimal MD5 digest
def hash
  # The two inputs are fed separately rather than concatenated first.
  Digest::MD5.new.update(title.to_s).update(content.join('')).hexdigest
end

#page ⇒ Object

Returns the page contents, retrieving them from the server if necessary.



26
27
28
# File 'lib/web-page-parser/base_parser.rb', line 26

# The raw page contents, memoized in @page.
#
# @return [Object, nil] @page when it is already set; otherwise the result
#   of retrieve_page, which is stored in @page before being returned
def page
  return @page if @page

  @page = retrieve_page
end

#retrieve_page(rurl = nil) ⇒ Object

Requests the page from the server and returns the raw contents.



31
32
33
34
35
36
37
# File 'lib/web-page-parser/base_parser.rb', line 31

# Fetches a page through the class-wide retrieve_session.
#
# @param rurl [Object, nil] address to fetch; falls back to url when nil
# @return [Object, nil] whatever the session's get returns, or nil when
#   neither rurl nor url supplies an address
def retrieve_page(rurl = nil)
  target = rurl || url
  return unless target

  # filter_url is an optional hook: applied only when the receiver has it.
  target = filter_url(target) if respond_to?(:filter_url)

  # The session lives on the class and is created on first use.
  klass = self.class
  klass.retrieve_session ||= WebPageParser::HTTP::Session.new
  klass.retrieve_session.get(target)
end

#title ⇒ Object



39
40
41
# File 'lib/web-page-parser/base_parser.rb', line 39

# Reader for the page title.
#
# @return [Object, nil] the current value of @title, or nil if it has not
#   been assigned
def title
  @title
end