Class: SiteMapper::CrawlUrl

Inherits:
Object
  • Object
show all
Defined in:
lib/site_mapper/crawl_url.rb

Overview

Crawl URL formatter.

Constant Summary collapse

TOO_MANY_REQUEST_MSG =

Too many requests error message

"You're being challenged with a 'too many requests' captcha"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(base_url) ⇒ CrawlUrl

Returns a new instance of CrawlUrl

Parameters:

  • base_url (String)


10
11
12
13
14
15
16
# File 'lib/site_mapper/crawl_url.rb', line 10

# Builds the canonical base URL and hostname used for this crawl.
#
# The given URL is first passed through Request.resolve_url and parsed.
# Only the hostname is kept: any path, query or explicit port on the
# resolved URL is not carried over into @resolved_base_url.
#
# @param base_url [String] the URL to start from
def initialize(base_url)
  resolved = URI.parse(Request.resolve_url(base_url))

  # The scheme is chosen from the port alone: 443 means https, anything
  # else is treated as plain http.
  scheme_prefix =
    if resolved.port == 443
      'https://'
    else
      'http://'
    end

  @resolved_base_url = "#{scheme_prefix}#{resolved.hostname}"
  # Re-parse the rebuilt URL so the stored hostname matches it exactly.
  @base_hostname = URI.parse(@resolved_base_url).hostname
end

Instance Attribute Details

#base_hostnameObject (readonly)

Returns the value of attribute base_hostname



4
5
6
# File 'lib/site_mapper/crawl_url.rb', line 4

# Read-only accessor for @base_hostname, which the constructor sets to the
# hostname parsed from @resolved_base_url.
def base_hostname
  @base_hostname
end

#resolved_base_urlObject (readonly)

Returns the value of attribute resolved_base_url



4
5
6
# File 'lib/site_mapper/crawl_url.rb', line 4

# Read-only accessor for @resolved_base_url, which the constructor builds as
# a scheme prefix ('https://' or 'http://') followed by the hostname.
def resolved_base_url
  @resolved_base_url
end

Instance Method Details

#absolute_url_from(raw_url, get_url) ⇒ String

Given a link, constructs the absolute URL if the link is a valid URL and has the same domain as @resolved_base_url; returns nil otherwise.

Examples:

Construct absolute URL for '/path', example.com

cu = CrawlUrl.new('example.com')
cu.absolute_url_from('/path', 'example.com/some/path')
# => http://example.com/some/path

Parameters:

  • raw_url (String)

    url found on page

  • get_url (String)

    current page url

Returns:

  • (String)

    with absolute path to resource



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/site_mapper/crawl_url.rb', line 27

# Turns a link found on a page into an absolute URL, or nil when the link
# should not be followed.
#
# @param raw_url [String] url found on page
# @param get_url [String] current page url
# @return [String, nil] the URL to crawl; nil when the link is not
#   eligible, cannot be parsed, or is an absolute URL for which
#   same_domain? is falsy against @resolved_base_url
def absolute_url_from(raw_url, get_url)
  return unless eligible_url?(raw_url)

  # Any StandardError raised while parsing is treated as "not a usable link".
  uri =
    begin
      URI.parse(raw_url)
    rescue StandardError
      false
    end
  return unless uri

  # Relative links are resolved against the page they were found on.
  return url_from_relative(raw_url, get_url) if uri.relative?

  # Absolute links are returned unchanged, but only for the crawled domain.
  raw_url if same_domain?(raw_url, @resolved_base_url)
end