Module: RubyCrawl::UrlNormalizer

Defined in:
lib/rubycrawl/url_normalizer.rb

Overview

Normalizes URLs for deduplication.

Class Method Summary

Class Method Details

.canonical_host(host) ⇒ Object



37
38
39
# File 'lib/rubycrawl/url_normalizer.rb', line 37

# Reduces a host name to the form used for host comparison: lowercased,
# with a single leading "www." removed.
#
# @param host [String, nil] host name to canonicalize
# @return [String, nil] the canonical host, or nil when +host+ is nil
def canonical_host(host)
  return if host.nil?

  host.downcase.delete_prefix('www.')
end

.normalize(url, base_url = nil) ⇒ Object



11
12
13
14
15
16
17
18
19
# File 'lib/rubycrawl/url_normalizer.rb', line 11

# Produces the normalized string form of a URL.
#
# The URL is parsed via parse_uri (using +base_url+ when given), its
# components are rewritten in place by normalize_uri_parts, and the result
# is serialized back to a string.
#
# @param url [String] the URL to normalize
# @param base_url [String, nil] base passed through to parse_uri
# @return [String, nil] the normalized URL, or nil when parsing yields no
#   URI, the URI has no host, or URI::InvalidURIError is raised
def normalize(url, base_url = nil)
  parsed = parse_uri(url, base_url)
  return if parsed.nil? || parsed.host.nil?

  normalize_uri_parts(parsed)
  parsed.to_s
rescue URI::InvalidURIError
  nil
end

.normalize_path(path) ⇒ Object



51
52
53
54
55
56
57
# File 'lib/rubycrawl/url_normalizer.rb', line 51

# Normalizes the path component of a URL.
#
# @param path [String, nil] raw path
# @return [String] "/" for a nil or empty path; otherwise the path with one
#   trailing slash removed (a single-character path is returned unchanged)
def normalize_path(path)
  if path.nil? || path.empty?
    '/'
  elsif path.length > 1
    # Drop the trailing slash; the length check keeps the root "/" intact.
    path.chomp('/')
  else
    path
  end
end

.normalize_query(query) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/rubycrawl/url_normalizer.rb', line 59

def normalize_query(query)
  return nil if query.nil? || query.empty?

  # Remove tracking params
  tracking_params = %w[utm_source utm_medium utm_campaign utm_term utm_content fbclid gclid]
  params = URI.decode_www_form(query).reject { |k, _| tracking_params.include?(k.downcase) }
  return nil if params.empty?

  URI.encode_www_form(params.sort)
rescue ArgumentError
  query
end

.normalize_uri_parts(uri) ⇒ Object



21
22
23
24
25
26
27
# File 'lib/rubycrawl/url_normalizer.rb', line 21

# Rewrites the components of +uri+ in place: scheme and host are
# lowercased, the path goes through normalize_path, the fragment is
# removed, and the query goes through normalize_query.
#
# @param uri [URI::Generic] the URI to mutate
def normalize_uri_parts(uri)
  lowered_scheme = uri.scheme&.downcase
  lowered_host = uri.host&.downcase
  cleaned_path = normalize_path(uri.path)

  uri.scheme = lowered_scheme
  uri.host = lowered_host
  uri.path = cleaned_path
  uri.fragment = nil
  uri.query = normalize_query(uri.query)
end

.parse_uri(url, base_url) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/rubycrawl/url_normalizer.rb', line 41

# Parses +url+ into a URI, resolving it against +base_url+ when it is not
# absolute.
#
# @param url [String] absolute or relative URL
# @param base_url [String, nil] base used to resolve a relative +url+
# @return [URI::Generic, nil] the parsed URI; nil when +url+ is relative and
#   no base is given, or when either value cannot be parsed or joined
def parse_uri(url, base_url)
  uri = URI.parse(url)
  return uri if uri.absolute?
  return nil unless base_url

  URI.join(base_url, url)
rescue URI::InvalidURIError, URI::BadURIError
  # URI.join raises BadURIError ("both URI are relative") when base_url is
  # itself relative or empty; treat that as unparseable, like a malformed URL.
  nil
end

.same_host?(url, base_url) ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
32
33
34
35
# File 'lib/rubycrawl/url_normalizer.rb', line 29

# Tells whether two URLs point at the same host, comparing the hosts after
# passing each through canonical_host.
#
# NOTE(review): neither URL is resolved against the other, so a relative
# +url+ has a nil host here — confirm callers pass absolute URLs.
#
# @param url [String] URL to test
# @param base_url [String] URL to compare against
# @return [Boolean] true when the canonical hosts are equal; false when they
#   differ or when either URL raises URI::InvalidURIError
def same_host?(url, base_url)
  target_host, base_host = [url, base_url].map do |candidate|
    canonical_host(URI.parse(candidate).host)
  end
  target_host == base_host
rescue URI::InvalidURIError
  false
end