Class: WebCrawler::Parsers::Url

Inherits:
Object
  • Object
show all
Defined in:
lib/web_crawler/parsers/url.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(host, options = { }) ⇒ Url

Returns a new instance of Url.



5
6
7
8
9
10
11
# File 'lib/web_crawler/parsers/url.rb', line 5

# Builds a URL parser bound to a single host.
#
# @param host [#to_s] host name or URL; it is stringified, passed through
#   normalize_host and then parsed with URI.parse
# @param options [Hash] stored as-is in @options; :secure picks the
#   provisional scheme here, and #normalize later consults :same_host
# @raise [URI::InvalidURIError] if URI.parse rejects the normalized host
def initialize(host, options = { })
  # Provisional scheme, assigned before normalize_host runs.
  # NOTE(review): presumably normalize_host reads @scheme to prefix
  # scheme-less hosts; that helper is not visible here, so confirm.
  @scheme  = options[:secure] ? 'https' : 'http'
  @host    = URI.parse(normalize_host(host.to_s))
  # The scheme of the parsed URI replaces the provisional one.
  @scheme  = @host.scheme
  @options = options
  # Helper defined elsewhere in the class; its effect is not visible here.
  set_current_page
end

Instance Attribute Details

#host ⇒ Object (readonly)

Returns the value of attribute host.



3
4
5
# File 'lib/web_crawler/parsers/url.rb', line 3

# Read-only accessor for the host attribute.
#
# @return [Object] the current value of @host
def host; @host; end

#scheme ⇒ Object (readonly)

Returns the value of attribute scheme.



3
4
5
# File 'lib/web_crawler/parsers/url.rb', line 3

# Read-only accessor for the scheme attribute.
#
# @return [Object] the current value of @scheme
def scheme; @scheme; end

Instance Method Details

#normalize(url) ⇒ Object



19
20
21
22
23
24
25
26
27
# File 'lib/web_crawler/parsers/url.rb', line 19

# Resolves a raw href against the host this parser was built for.
#
# @param url [String, nil] href value of an anchor; nil when the anchor has
#   no href attribute
# @return [String, nil]
#   * the result of normalize_host(url) when the URL is absolute and starts
#     with this parser's scheme (or https, or no scheme) and host;
#   * join(url).to_s when the URL starts with "/", "?" or "#";
#   * nil for a missing href, for a bare "#", and for any other URL when
#     @options[:same_host] is set;
#   * the URL unchanged otherwise.
def normalize(url)
  # Anchors without an href and bare "#" links do not point anywhere.
  return nil if url.nil? || url == '#'

  # Scheme and host are escaped so that "." in the host matches only a
  # literal dot, the group is a genuine non-capturing (?:...) group, and \A
  # anchors at the start of the string rather than at any line start.
  own_host = %r{\A(?:#{Regexp.escape(@host.scheme.to_s)}|https|)://#{Regexp.escape(@host.host.to_s)}}

  if url =~ own_host
    normalize_host(url)
  elsif url.start_with?('/', '?', '#')
    # Relative reference: resolve it against the current page.
    join(url).to_s
  elsif @options[:same_host]
    nil
  else
    url
  end
end

#parse(response, &filter) ⇒ Object



13
14
15
16
17
# File 'lib/web_crawler/parsers/url.rb', line 13

# Extracts the links found in a response body.
#
# Every <a> element's href is passed through #normalize; nil results and
# duplicates are dropped. When a block is given, only the links for which
# the block returns a truthy value are kept.
#
# @param response [#to_s] document to scan; it is stringified before parsing
# @yieldparam url [String] a normalized link
# @yieldreturn [Boolean] whether to keep the link
# @return [Array<String>] unique normalized links, filtered when a block is
#   given
def parse(response, &filter)
  # Anchors without an href attribute yield nil and are skipped up front.
  hrefs = (Hpricot(response.to_s) / "a").map { |a| a["href"] }.compact
  links = hrefs.map { |href| normalize(href) }.compact.uniq

  # Return the filtered array itself: reassigning a block-local inside #tap
  # would leave the returned array unfiltered.
  block_given? ? links.select(&filter) : links
end