Class: Elibrum::Webpage

Inherits:

Object

Object
Elibrum::Webpage

show all

Defined in: lib/elibrum/webpage.rb

Instance Attribute Summary collapse

#extractor ⇒ Object

Returns the value of attribute extractor.
#url ⇒ Object readonly

Returns the value of attribute url.

Instance Method Summary collapse

#initialize(url, &block) ⇒ Webpage constructor

A new instance of Webpage.
#modify(&block) ⇒ Object

Allows post-extraction processing of the text.
#text ⇒ Object
#title ⇒ Object

Constructor Details

#initialize(url, &block) ⇒ `Webpage`

Returns a new instance of Webpage.

# File 'lib/elibrum/webpage.rb', line 5

# Builds a Webpage for +url+, optionally configures it through a block, and
# then eagerly computes the title and text.
#
# @param url [String] address of the page; a single trailing "/" is removed
# @yield optional configuration block. A block that declares no parameters is
#   evaluated with the new Webpage as +self+; otherwise the Webpage is passed
#   to it as the first argument.
def initialize(url, &block)
	# URI.join (used by localize_images) would treat a trailing "/" as a
	# directory marker, so drop it before storing the URL.
	@url =
		if url.end_with?("/")
			url[0...-1]
		else
			url
		end

	# Defaults: a pass-through post-processor and the article extractor.
	@modify = proc { |page_title, page_text| [page_title, page_text] }
	@extractor = CommonExtractors::ARTICLE_EXTRACTOR

	# Give the caller a chance to override the defaults above.
	if block
		if block.arity < 1
			instance_eval(&block)
		else
			block.call(self)
		end
	end

	# Run the post-processing hook over the extracted title and text, then
	# hand the resulting text to localize_images.
	@title, @text = @modify.call(title, text)
	@text = localize_images(@text)
end

Instance Attribute Details

#extractor ⇒ `Object`

Returns the value of attribute extractor.



3
4
5

# File 'lib/elibrum/webpage.rb', line 3

# Reader for the extractor attribute.
#
# @return [Object] the current value of the @extractor instance variable
def extractor
  @extractor
end

#url ⇒ `Object` (readonly)

Returns the value of attribute url.



3
4
5

# File 'lib/elibrum/webpage.rb', line 3

# Reader for the url attribute.
#
# @return [Object] the current value of the @url instance variable
def url
  @url
end

Instance Method Details

#modify(&block) ⇒ `Object`

Allows post-extraction processing of the text. Useful for removing bits that the extractor accidentally included.



56
57
58

# File 'lib/elibrum/webpage.rb', line 56

# Allows post-extraction processing of the text. Useful for removing bits
# that the extractor accidentally included.
#
# The given block replaces the stored post-processing hook. Calling this
# method without a block leaves the current hook untouched instead of
# overwriting it with nil, which would make a later +@modify.call+ fail.
#
# @yield [title, text] the hook to store
# @yieldreturn [Array] a +[title, text]+ pair
# @return [Proc, nil] the stored block, or nil when no block was given
def modify(&block)
	@modify = block if block
end

#text ⇒ `Object`

# File 'lib/elibrum/webpage.rb', line 23

# Returns the text extracted from the page at @url, memoized in @text.
#
# If anything is raised while loading or processing the page, a failure
# message is printed to standard output and that message becomes the
# (memoized) text instead.
#
# @return [Object] the result of the highlighter, or the failure message
def text
	# NOTE: the page is fetched here as well as in Webpage#content.
	# TODO: Modify process() to accept a string as input.
	@text ||= begin
		HTMLHighlighter.newExtractingInstance(true, false).process(URL.new(@url), @extractor)
	rescue Exception => load_error
		# Boilerpipe sends no user agent when retrieving sites, so some
		# servers answer with a 403.
		"Failed to load #{@url} (#{load_error})".tap { |notice| puts notice }
	end
end

#title ⇒ `Object`



19
20
21

# File 'lib/elibrum/webpage.rb', line 19

# Returns the text of the page's <title> element(s), memoized in @title.
#
# @return [Object] the text of the nodes matching "//title" in the parsed content
def title
	@title ||= begin
		document = Nokogiri::HTML(content)
		document.xpath("//title").text
	end
end

Class: Elibrum::Webpage

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, &block) ⇒ Webpage

Instance Attribute Details

#extractor ⇒ Object

#url ⇒ Object (readonly)