Class: Elibrum::Webpage

Inherits:
Object
  • Object
show all
Defined in:
lib/elibrum/webpage.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, &block) ⇒ Webpage

Returns a new instance of Webpage.



5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/elibrum/webpage.rb', line 5

# Builds a Webpage for +url+, optionally configured through a block, and
# immediately resolves its title and text.
#
# A block taking no parameters is evaluated in the context of the new
# instance; a block taking a parameter receives the instance as its argument.
# After configuration, the title and text are passed through the registered
# modify hook and the resulting text is handed to localize_images.
def initialize(url, &block)
	# A trailing "/" would make URI.join in localize_images treat this URL as a
	# directory, so a single one is dropped here.
	@url =
		if url[-1] == "/"
			url[0...-1]
		else
			url
		end
	@extractor = CommonExtractors::ARTICLE_EXTRACTOR
	# Default hook: hand the title and text back untouched.
	@modify = proc { |page_title, page_text| [page_title, page_text] }

	unless block.nil?
		if block.arity < 1
			instance_eval(&block)
		else
			block.call(self)
		end
	end

	@title, @text = @modify.call(title, text)
	@text = localize_images(@text)
end

Instance Attribute Details

#extractor ⇒ Object

Returns the value of attribute extractor.



3
4
5
# File 'lib/elibrum/webpage.rb', line 3

# Reader for the extractor that #text passes to the highlighter.
# The constructor sets it to CommonExtractors::ARTICLE_EXTRACTOR before
# running any configuration block.
def extractor
  @extractor
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



3
4
5
# File 'lib/elibrum/webpage.rb', line 3

# Reader for the page URL as stored by the constructor, which drops a single
# trailing "/" from the value it was given.
def url
  @url
end

Instance Method Details

#modify(&block) ⇒ Object

Allows post-extraction processing of the text. Useful for removing bits that the extractor accidentally included.



56
57
58
# File 'lib/elibrum/webpage.rb', line 56

# Registers a post-extraction hook, replacing any previously stored one.
#
# The constructor calls the stored hook with the title and the text and
# destructures its result back into a title and a text, so the block should
# return a two-element [title, text] array.
#
# Returns the stored block (nil when no block was given).
def modify(&post_processor)
	@modify = post_processor
end

#text ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/elibrum/webpage.rb', line 23

# Returns the extracted page text, fetching and memoizing it on first use.
#
# On any failure the error message is printed, stored as the text and
# returned, so callers always get a value back instead of an exception.
def text
	return @text if @text

	# The page is loaded both here and in Webpage#content.
	# TODO: Modify process() to accept a string as input.
	@text = HTMLHighlighter.newExtractingInstance(true, false).process(URL.new(@url), @extractor)
rescue Exception => error
	# Boilerpipe does not pass along a user agent when retrieving sites, so some return a 403.
	# NOTE(review): the broad rescue is presumably needed to catch Java-side
	# errors as well - confirm before narrowing it.
	failure = "Failed to load #{@url} (#{error})"
	puts failure
	@text = failure
end

#title ⇒ Object



19
20
21
# File 'lib/elibrum/webpage.rb', line 19

# Returns the page title, memoized after the first lookup.
#
# The title is the text of the <title> node(s) found by parsing the page
# content with Nokogiri.
def title
	return @title if @title

	document = Nokogiri::HTML(content)
	@title = document.xpath("//title").text
end