Class: Xapian::Indexer::Extractors::HTML

Inherits:

Object

Object
Xapian::Indexer::Extractors::HTML

show all

Defined in: lib/xapian/indexer/extractors/html.rb

Overview

Extracts indexable data (title, description, keywords, links and body text) from an HTML resource.

Constant Summary collapse

# The text Nokogiri produces when it parses the "&nbsp;" HTML entity.
NBSP =

Nokogiri::HTML("&nbsp;").text

# Matches one or more consecutive characters that are either regular
# whitespace (\s) or the NBSP text defined above.
WHITESPACE =

/(\s|#{NBSP})+/

Instance Method Summary collapse

#call(resource, status, headers, data) ⇒ Object
#initialize(options = {}) ⇒ HTML constructor

A new instance of HTML.

Constructor Details

#initialize(options = {}) ⇒ `HTML`

Returns a new instance of HTML.

# File 'lib/xapian/indexer/extractors/html.rb', line 31

# Builds a new extractor.
#
# @param options [Hash] extractor settings, kept as-is in @options. A truthy
#   value under +:logger+ is used as the logger; otherwise a Logger writing
#   to $stderr is created.
def initialize(options = {})
	@options = options

	# Prefer the caller-supplied logger, falling back to standard error.
	@logger = options[:logger]
	@logger ||= Logger.new($stderr)
end

Instance Method Details

#call(resource, status, headers, data) ⇒ `Object`

# File 'lib/xapian/indexer/extractors/html.rb', line 37

# Parses an HTML document and extracts the data used for indexing.
#
# @param resource [#name] the resource being processed; +resource.name+ is
#   parsed as a URI and used to resolve links when the document has no usable
#   <base href>.
# @param status [Object] not used by this extractor.
# @param headers [Object] not used by this extractor.
# @param data [String] the HTML source handed to Nokogiri.
# @return [Hash] a hash that may contain the keys +:description+, +:links+
#   (always present, an Array of unique URI strings without fragments),
#   +:title+, +:keywords+ and +:content+.
# @raise [URI::Error] if +resource.name+ is needed as the base URI but cannot
#   be parsed.
def call(resource, status, headers, data)
	html = Nokogiri::HTML.parse(data)
	result = {}

	# Extract description: prefer the meta description, but fall back to the
	# first paragraph when the tag is missing or has no content attribute.
	meta_description = html.css("meta[name='description']").first
	description = meta_description && meta_description['content']

	if description
		result[:description] = description
	else
		first_paragraph = html.search("p").first

		if first_paragraph
			result[:description] = first_paragraph.inner_text.gsub(WHITESPACE, " ")
		end
	end

	# Work out the base URI used to resolve links. A <base> tag without an
	# href (e.g. one that only sets a target) or with an unparsable href is
	# ignored, and a relative base href is resolved against the resource name.
	base = nil
	base_tag = html.at('html/head/base')
	base_href = base_tag && base_tag['href']

	if base_href
		begin
			base = URI.parse(base_href)
			base = URI.parse(resource.name) + base if base.relative?
		rescue URI::Error => error
			@logger.warn("Ignoring invalid base href #{base_href.inspect}: #{error.message}") if @logger
			base = nil
		end
	end

	base ||= URI.parse(resource.name)

	links = []

	html.css('a').each do |anchor|
		href = (anchor['href'] || "").to_s.gsub(/ /, '%20')

		begin
			link = base + href

			# Remove any fragment at the end of the URI.
			link.fragment = nil

			links << link.to_s
		rescue URI::Error => error
			# A single malformed href must not abort extraction of the whole page.
			@logger.warn("Could not add link #{href.inspect}: #{error.message}") if @logger
		end
	end

	result[:links] = links.uniq

	# Extract title, falling back to the first <h1> when there is no <title>.
	heading = html.at('html/head/title') || html.search('h1').first
	if heading
		result[:title] = heading.inner_text.gsub(WHITESPACE, " ")
	end

	# Extract keywords. The standard meta name is "keywords"; the singular
	# "keyword" is still accepted for backward compatibility.
	meta_keywords = html.css("meta[name='keywords'], meta[name='keyword']").first
	keywords = meta_keywords && meta_keywords['content']
	if keywords
		result[:keywords] = keywords.gsub(WHITESPACE, " ")
	end

	# Remove junk elements from the html so they do not end up in the content.
	%w[script link meta style form].each do |element_name|
		html.search(element_name).remove
	end
	html.css('.noindex').remove

	body = html.at('html/body')

	if body
		# Runs of whitespace, including NBSP characters, collapse to a single space.
		result[:content] = body.inner_text.gsub(WHITESPACE, " ")
	end

	return result
end

Class: Xapian::Indexer::Extractors::HTML

Overview

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ HTML

Instance Method Details

#call(resource, status, headers, data) ⇒ Object

#initialize(options = {}) ⇒ `HTML`

#call(resource, status, headers, data) ⇒ `Object`