Module: RubyCrawl::UrlNormalizer

Defined in:
lib/rubycrawl/url_normalizer.rb

Overview

Normalizes URLs for deduplication.

Class Method Summary

Class Method Details

.normalize(url, base_url = nil) ⇒ Object



11
12
13
14
15
16
17
18
19
# File 'lib/rubycrawl/url_normalizer.rb', line 11

# Produces a canonical string form of +url+ for deduplication.
#
# @param url [String] the URL to normalize; may be relative when +base_url+ is given
# @param base_url [String, nil] base handed to parse_uri for resolving +url+
# @return [String, nil] the normalized URL, or nil when the URL cannot be
#   parsed, has no host, or raises URI::InvalidURIError along the way
def normalize(url, base_url = nil)
  parsed = parse_uri(url, base_url)

  # A falsy condition falls through and yields nil, same as an explicit guard.
  if parsed&.host
    normalize_uri_parts(parsed) # mutates +parsed+ in place
    parsed.to_s
  end
rescue URI::InvalidURIError
  nil
end

.normalize_path(path) ⇒ Object



47
48
49
50
51
52
53
# File 'lib/rubycrawl/url_normalizer.rb', line 47

# Canonicalizes a URL path.
#
# @param path [String, nil] the path component of a URI
# @return [String] '/' for a nil or empty path; otherwise the path with a
#   single trailing slash removed (a lone '/' is returned untouched)
def normalize_path(path)
  if path.nil? || path.empty?
    '/'
  elsif path.length > 1
    # chomp strips at most one trailing '/', so '/a//' becomes '/a/'.
    path.chomp('/')
  else
    path
  end
end

.normalize_query(query) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/rubycrawl/url_normalizer.rb', line 55

def normalize_query(query)
  return nil if query.nil? || query.empty?

  # Remove tracking params
  tracking_params = %w[utm_source utm_medium utm_campaign utm_term utm_content fbclid gclid]
  params = URI.decode_www_form(query).reject { |k, _| tracking_params.include?(k.downcase) }
  return nil if params.empty?

  URI.encode_www_form(params.sort)
rescue ArgumentError
  query
end

.normalize_uri_parts(uri) ⇒ Object



21
22
23
24
25
26
27
# File 'lib/rubycrawl/url_normalizer.rb', line 21

# Normalizes the components of +uri+ in place: lowercases scheme and host,
# cleans the path via normalize_path, drops the fragment, and rewrites the
# query via normalize_query.
#
# @param uri [URI::Generic] the URI object to mutate
# @return [String, nil] the value assigned to the query (the method's last
#   expression); callers in view use the mutated +uri+ rather than this value
def normalize_uri_parts(uri)
  scheme = uri.scheme
  uri.scheme = scheme && scheme.downcase

  host = uri.host
  uri.host = host && host.downcase

  cleaned_path = normalize_path(uri.path)
  uri.path = cleaned_path

  # Fragments are discarded entirely.
  uri.fragment = nil

  cleaned_query = normalize_query(uri.query)
  uri.query = cleaned_query
end

.parse_uri(url, base_url) ⇒ Object



37
38
39
40
41
42
43
44
45
# File 'lib/rubycrawl/url_normalizer.rb', line 37

# Parses +url+ into a URI object, resolving it against +base_url+ when it is
# relative.
#
# @param url [String] absolute or relative URL
# @param base_url [String, nil] base used to resolve a relative +url+
# @return [URI::Generic, nil] the parsed (and, if needed, resolved) URI; nil
#   when +url+ is relative and no base is given, or when parsing/joining fails
def parse_uri(url, base_url)
  uri = URI.parse(url)
  return uri if uri.absolute?
  return nil unless base_url

  URI.join(base_url, url)
rescue URI::Error
  # URI.join raises URI::BadURIError (not InvalidURIError) when both the base
  # and the URL are relative; rescuing the shared parent class keeps the
  # "nil on failure" contract for that case too.
  nil
end

.same_host?(url, base_url) ⇒ Boolean



29
30
31
32
33
34
35
# File 'lib/rubycrawl/url_normalizer.rb', line 29

# Reports whether two URLs share the same host, ignoring case.
#
# Note that two URLs which both lack a host (e.g. two relative paths) compare
# equal, because both hosts are nil.
#
# @param url [String] the URL to check
# @param base_url [String] the URL whose host is the reference
# @return [Boolean] true when the lowercased hosts are equal; false when they
#   differ or either URL raises URI::InvalidURIError while parsing
def same_host?(url, base_url)
  candidate_host, reference_host = [url, base_url].map do |address|
    URI.parse(address).host&.downcase
  end

  candidate_host == reference_host
rescue URI::InvalidURIError
  false
end