Class: Xapian::Indexer::Extractors::HTML

Inherits:
Object
  • Object
show all
Defined in:
lib/xapian/indexer/extractors/html.rb

Overview

Extracts indexable data from an HTML resource: its description, title, keywords, outgoing links and whitespace-normalized body text.

Constant Summary collapse

# Non-breaking space (U+00A0). Written as an escape sequence so the literal
# cannot be mistaken for (or silently converted into) an ordinary space.
NBSP = "\u00A0".freeze
# One or more consecutive whitespace characters, treating NBSP as whitespace
# (Ruby's \s does not match U+00A0 on its own).
WHITESPACE = /(\s|#{NBSP})+/

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ HTML

Returns a new instance of HTML.



31
32
33
34
35
# File 'lib/xapian/indexer/extractors/html.rb', line 31

# Creates the extractor.
#
# options - Hash of settings, kept as-is in @options. When options[:logger]
#           is truthy it is stored in @logger; otherwise a new Logger
#           writing to $stderr is created.
def initialize(options = {})
	@options = options
	@logger = options[:logger]
	@logger ||= Logger.new($stderr)
end

Instance Method Details

#call(resource, status, headers, data) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/xapian/indexer/extractors/html.rb', line 37

# Parses an HTML document and extracts the data to be indexed.
#
# resource - object whose #name is parsed as the document's URI; relative
#            links are resolved against it unless a <base> tag overrides it.
# status   - not used by this extractor.
# headers  - not used by this extractor.
# data     - the HTML markup to parse.
#
# Returns a Hash which may contain:
#   :description - content of the description meta tag, or the text of the
#                  first paragraph when that is unavailable.
#   :links       - unique absolute link targets (as Strings) with any
#                  fragment removed.
#   :title       - text of the <title> tag, or of the first <h1>.
#   :keywords    - content of the keywords meta tag.
#   :content     - text of <body> after scripts, styles, forms, link/meta
#                  tags and '.noindex' elements have been removed.
# Keys whose source is missing from the document are omitted.
def call(resource, status, headers, data)
	html = Nokogiri::HTML.parse(data)
	result = {}

	# Description: prefer the meta tag; fall back to the first paragraph when
	# the tag is absent or carries no content attribute.
	meta_description = html.at_css("meta[name='description']")
	description = meta_description && meta_description['content']

	unless description
		first_paragraph = html.search("p").first
		description = first_paragraph.inner_text.gsub(WHITESPACE, " ") if first_paragraph
	end

	result[:description] = description if description

	# Links, resolved against the document base.
	result[:links] = collect_links(html, resolve_base_uri(html, resource))

	# Title: <title> wins over the first <h1>.
	title_source = html.at('html/head/title') || html.search('h1').first
	result[:title] = title_source.inner_text.gsub(WHITESPACE, " ") if title_source

	# Keywords: "keywords" is the standard meta name; the singular form is
	# still accepted because earlier versions of this extractor looked for it.
	meta_keywords = html.at_css("meta[name='keywords'], meta[name='keyword']")
	keywords = meta_keywords && meta_keywords['content']
	result[:keywords] = keywords.gsub(WHITESPACE, " ") if keywords

	# Remove elements that should not contribute to the indexed text. This has
	# to happen after the meta tags above have been read.
	%w[script link meta style form].each { |tag| html.search(tag).remove }
	html.css('.noindex').remove

	body = html.at('html/body')

	# NBSP characters are folded into ordinary spaces by WHITESPACE.
	result[:content] = body.inner_text.gsub(WHITESPACE, " ") if body

	result
end

# Determines the URI that relative links are resolved against: the <base>
# href when present (itself resolved against the resource URI if relative),
# otherwise the resource URI. An unusable <base> href is logged and ignored.
def resolve_base_uri(html, resource)
	base_tag = html.at('html/head/base')
	base_href = base_tag && base_tag['href']

	if base_href
		begin
			declared = URI.parse(base_href)
			return declared if declared.absolute?
			return URI.parse(resource.name) + declared
		rescue URI::Error => error
			@logger.warn("Ignoring base href #{base_href.inspect}: #{error.message}") if @logger
		end
	end

	URI.parse(resource.name)
end

# Collects the targets of all <a> elements as unique absolute URI strings
# with fragments stripped. A link that cannot be parsed or resolved is logged
# and skipped so that one bad href does not abort the whole extraction.
def collect_links(html, base)
	links = []

	html.css('a').each do |link|
		# Literal spaces are not valid in a URI; encode them before parsing.
		href = (link['href'] || "").to_s.gsub(/ /, '%20')

		begin
			target = base + href
			target.fragment = nil
			links << target.to_s
		rescue URI::Error => error
			@logger.warn("Could not add link #{href.inspect}: #{error.message}") if @logger
		end
	end

	links.uniq
end

private :resolve_base_uri, :collect_links