Module: RubyCrawl::UrlNormalizer
- Defined in: lib/rubycrawl/url_normalizer.rb
Overview
Normalizes URLs for deduplication.
Class Method Summary
- .normalize(url, base_url = nil) ⇒ Object
- .normalize_path(path) ⇒ Object
- .normalize_query(query) ⇒ Object
- .normalize_uri_parts(uri) ⇒ Object
- .parse_uri(url, base_url) ⇒ Object
- .same_host?(url, base_url) ⇒ Boolean
Class Method Details
.normalize(url, base_url = nil) ⇒ Object
11 12 13 14 15 16 17 18 19 |
# File 'lib/rubycrawl/url_normalizer.rb', line 11

# Produces the canonical string form of a URL for deduplication.
#
# The URL is parsed (and resolved against +base_url+ when it is relative) by
# parse_uri, its components are rewritten in place by normalize_uri_parts,
# and the result is serialized back to a string.
#
# @param url [String] absolute or relative URL
# @param base_url [String, nil] base used to resolve a relative +url+
# @return [String, nil] the normalized URL, or nil when the URL cannot be
#   parsed or resolved, or when the resulting URI has no host
def normalize(url, base_url = nil)
  uri = parse_uri(url, base_url)
  return nil unless uri&.host

  normalize_uri_parts(uri)
  uri.to_s
rescue URI::InvalidURIError, URI::BadURIError
  # URI.join raises BadURIError ("both URI are relative") when base_url is
  # itself relative. It is not a subclass of InvalidURIError, so it has to be
  # listed explicitly for the "nil on unusable input" contract to hold.
  nil
end
.normalize_path(path) ⇒ Object
47 48 49 50 51 52 53 |
# File 'lib/rubycrawl/url_normalizer.rb', line 47

# Normalizes the path component of a URL.
#
# A missing or empty path becomes the root path. Otherwise every trailing
# slash is removed, so "/a/", "/a//" and "/a" all map to "/a"; a path made
# only of slashes collapses to "/".
#
# @param path [String, nil] raw path component
# @return [String] normalized path; never empty
def normalize_path(path)
  return '/' if path.nil? || path.empty?

  # Strip the whole run of trailing slashes, not just one: removing a single
  # slash would make the result change again on a second pass, and URLs that
  # should deduplicate to the same key would end up with different ones.
  # The length guard keeps the root path "/" intact.
  path = path.chomp('/') while path.length > 1 && path.end_with?('/')
  path
end
.normalize_query(query) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/rubycrawl/url_normalizer.rb', line 55 def normalize_query(query) return nil if query.nil? || query.empty? # Remove tracking params tracking_params = %w[utm_source utm_medium utm_campaign utm_term utm_content fbclid gclid] params = URI.decode_www_form(query).reject { |k, _| tracking_params.include?(k.downcase) } return nil if params.empty? URI.encode_www_form(params.sort) rescue ArgumentError query end |
.normalize_uri_parts(uri) ⇒ Object
21 22 23 24 25 26 27 |
# File 'lib/rubycrawl/url_normalizer.rb', line 21

# Rewrites the components of +uri+ in place: scheme and host are lowercased,
# the path and query go through normalize_path and normalize_query, and the
# fragment is cleared.
#
# @param uri [URI::Generic] URI object to mutate (presumably the result of
#   parse_uri; any object with these component accessors works)
# @return [String, nil] the normalized query, i.e. the value of the final
#   assignment; callers in this file use the mutated +uri+ instead
def normalize_uri_parts(uri)
  current_scheme = uri.scheme
  uri.scheme = current_scheme&.downcase

  current_host = uri.host
  uri.host = current_host&.downcase

  cleaned_path = normalize_path(uri.path)
  uri.path = cleaned_path

  # Fragments are dropped entirely.
  uri.fragment = nil

  cleaned_query = normalize_query(uri.query)
  uri.query = cleaned_query
end
.parse_uri(url, base_url) ⇒ Object
37 38 39 40 41 42 43 44 45 |
# File 'lib/rubycrawl/url_normalizer.rb', line 37

# Parses +url+ into a URI object, resolving it against +base_url+ when it is
# relative.
#
# @param url [String] absolute or relative URL
# @param base_url [String, nil] base for resolving a relative +url+
# @return [URI::Generic, nil] the parsed (and, if needed, resolved) URI; nil
#   when +url+ is relative and no base is given, when either string is not a
#   valid URI, or when +base_url+ is itself relative so nothing can be resolved
def parse_uri(url, base_url)
  uri = URI.parse(url)
  return uri if uri.absolute?
  return nil unless base_url

  URI.join(base_url, url)
rescue URI::InvalidURIError, URI::BadURIError
  # URI.join raises BadURIError ("both URI are relative") for a relative
  # base_url. That class is a sibling of InvalidURIError, not a subclass, so
  # it must be rescued explicitly or it escapes to the caller.
  nil
end
.same_host?(url, base_url) ⇒ Boolean
29 30 31 32 33 34 35 |
# File 'lib/rubycrawl/url_normalizer.rb', line 29

# Reports whether two URLs point at the same host, ignoring letter case.
#
# Both arguments are parsed as given; a relative +url+ is not resolved
# against +base_url+.
#
# @param url [String] URL to test
# @param base_url [String] URL whose host is the reference
# @return [Boolean] true only when both URLs carry a host and the hosts match
#   case-insensitively; false otherwise, including when either string is not
#   a valid URI
def same_host?(url, base_url)
  host = URI.parse(url).host
  base_host = URI.parse(base_url).host

  # Without this guard two host-less URLs (relative paths, mailto:, ...)
  # would compare nil == nil and be reported as sharing a host.
  return false unless host && base_host

  host.downcase == base_host.downcase
rescue URI::InvalidURIError
  false
end