Class: HTML::Proofer::UrlValidator
- Inherits:
-
Object
- Object
- HTML::Proofer::UrlValidator
- Includes:
- Utils
- Defined in:
- lib/html/proofer/url_validator.rb
Constant Summary
Constants included from Utils
HTML::Proofer::Utils::STORAGE_DIR
Instance Attribute Summary collapse
-
#external_urls ⇒ Object
Returns the value of attribute external_urls.
-
#hydra ⇒ Object
Returns the value of attribute hydra.
-
#iterable_external_urls ⇒ Object
Returns the value of attribute iterable_external_urls.
-
#logger ⇒ Object
Returns the value of attribute logger.
Instance Method Summary collapse
- #add_external_issue(filenames, desc, status = nil) ⇒ Object
-
#check_hash_in_2xx_response(href, effective_url, response, filenames) ⇒ Object
Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page.
- #clean_url(href) ⇒ Object
-
#external_link_checker(external_urls) ⇒ Object
Proofer runs faster if we pull out all the external URLs and run the checks at the end.
- #extract_domain_path(uri) ⇒ Object
- #handle_failure(href, filenames, response_code) ⇒ Object
- #handle_timeout(href, filenames, response_code) ⇒ Object
- #hash?(url) ⇒ Boolean
-
#initialize(logger, external_urls, options, typhoeus_opts, hydra_opts) ⇒ UrlValidator
constructor
A new instance of UrlValidator.
-
#new_url_query_values?(uri) ⇒ Boolean
remember queries we’ve seen, ignore future ones.
- #queue_request(method, href, filenames) ⇒ Object
- #remove_query_values ⇒ Object
- #response_handler(response, filenames) ⇒ Object
- #run ⇒ Object
- #url_processor(external_urls) ⇒ Object
Methods included from Utils
clean_content, create_nokogiri, #pluralize, swap
Constructor Details
#initialize(logger, external_urls, options, typhoeus_opts, hydra_opts) ⇒ UrlValidator
Returns a new instance of UrlValidator.
13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/html/proofer/url_validator.rb', line 13

# Sets up the validator: stores the logger, the href => filenames map of
# external URLs, and the user options; prepares a Typhoeus::Hydra request
# queue plus the per-request Typhoeus options; and opens the persistent
# result cache (configured via options[:cache]).
#
# NOTE(review): the extracted source had the `options` token stripped
# (`def initialize(logger, external_urls, , ...)` and `@options =`);
# restored here to match the real signature.
def initialize(logger, external_urls, options, typhoeus_opts, hydra_opts)
  @logger = logger
  @external_urls = external_urls
  @iterable_external_urls = {}
  @failed_tests = []
  @options = options
  @hydra = Typhoeus::Hydra.new(hydra_opts)
  @typhoeus_opts = typhoeus_opts
  # remembers which domain+path combinations were already queried, so
  # later query-string variants of the same URL can be skipped
  @external_domain_paths_with_queries = {}
  @cache = Cache.new(@logger, @options[:cache])
end
Instance Attribute Details
#external_urls ⇒ Object
Returns the value of attribute external_urls.
11 12 13 |
# File 'lib/html/proofer/url_validator.rb', line 11

# Reader for the href => filenames hash of external URLs under test.
def external_urls
  @external_urls
end
#hydra ⇒ Object
Returns the value of attribute hydra.
11 12 13 |
# File 'lib/html/proofer/url_validator.rb', line 11

# Reader for the Typhoeus::Hydra instance that runs the queued requests.
def hydra
  @hydra
end
#iterable_external_urls ⇒ Object
Returns the value of attribute iterable_external_urls.
11 12 13 |
# File 'lib/html/proofer/url_validator.rb', line 11

# Reader for the de-duplicated URL set produced by #remove_query_values.
def iterable_external_urls
  @iterable_external_urls
end
#logger ⇒ Object
Returns the value of attribute logger.
11 12 13 |
# File 'lib/html/proofer/url_validator.rb', line 11

# Reader for the logger handed in at construction time.
def logger
  @logger
end
Instance Method Details
#add_external_issue(filenames, desc, status = nil) ⇒ Object
207 208 209 210 211 212 213 |
# File 'lib/html/proofer/url_validator.rb', line 207

# Records a failed external-link check. When +filenames+ is nil a single
# issue with an empty path is stored; otherwise one issue per affected file.
def add_external_issue(filenames, desc, status = nil)
  affected = filenames.nil? ? [''] : filenames
  affected.each do |file|
    @failed_tests << CheckRunner::Issue.new(file, desc, nil, status)
  end
end
#check_hash_in_2xx_response(href, effective_url, response, filenames) ⇒ Object
Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/html/proofer/url_validator.rb', line 173

# Even though the response was a success, we may have been asked to verify
# that the URL's #fragment actually exists on the fetched page. Parses the
# body and reports an issue when no element carries that name/id.
def check_hash_in_2xx_response(href, effective_url, response, filenames)
  return if @options[:only_4xx] || !@options[:check_external_hash]

  fragment = hash?(href)
  return unless fragment

  body_doc = create_nokogiri(response.body)

  xpath = %(//*[@name="#{fragment}"]|//*[@id="#{fragment}"])
  # user-content is a special addition by GitHub.
  if URI.parse(href).host.match(/github\.com/i)
    xpath << %(|//*[@name="user-content-#{fragment}"]|//*[@id="user-content-#{fragment}"])
  end

  return unless body_doc.xpath(xpath).empty?

  msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{fragment}' does not"
  add_external_issue(filenames, msg, response.code)
  @cache.add(href, filenames, response.code, msg)
end
#clean_url(href) ⇒ Object
133 134 135 |
# File 'lib/html/proofer/url_validator.rb', line 133

# Parses +href+ with Addressable and returns its normalized form
# (consistent escaping/casing for libcurl and cache keys).
def clean_url(href)
  parsed = Addressable::URI.parse(href)
  parsed.normalize
end
#external_link_checker(external_urls) ⇒ Object
Proofer runs faster if we pull out all the external URLs and run the checks at the end. Otherwise, we’re halting the consuming process for every file during the check_directory_of_files process.
In addition, sorting the list lets libcurl keep connections to the same hosts alive.
Finally, we’ll first make a HEAD request, rather than GETing all the contents. If the HEAD fails, we’ll fall back to GET, as some servers are not configured for HEAD. If we’ve decided to check for hashes, we must do a GET — HEAD is not an option.
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/html/proofer/url_validator.rb', line 99

# Queues every external URL and runs the whole batch through Hydra.
# Sorting first lets libcurl keep connections to the same hosts alive.
def external_link_checker(external_urls)
  external_urls = Hash[external_urls.sort]

  check_text = pluralize(external_urls.length, 'external link', 'external links')
  logger.log :info, :blue, "Checking #{check_text}..."

  # log from Typhoeus/Ethon
  Ethon.logger = logger

  url_processor(external_urls)

  logger.log :debug, :yellow, "Running requests for:"
  logger.log :debug, :yellow, "###\n" + external_urls.keys.join("\n") + "\n###"

  hydra.run
end
#extract_domain_path(uri) ⇒ Object
85 86 87 |
# File 'lib/html/proofer/url_validator.rb', line 85

# Joins host and path into the key used for query-string de-duplication,
# so "http://a.com/x?q=1" and "http://a.com/x?q=2" map to the same entry.
def extract_domain_path(uri)
  [uri.host, uri.path].join
end
#handle_failure(href, filenames, response_code) ⇒ Object
200 201 202 203 204 205 |
# File 'lib/html/proofer/url_validator.rb', line 200

# Records a hard failure (libcurl-level error, response code 0). The cache
# entry is written unconditionally; the issue is suppressed under :only_4xx.
def handle_failure(href, filenames, response_code)
  message = "External link #{href} failed: response code #{response_code} means something's wrong"
  @cache.add(href, filenames, 0, message)
  return if @options[:only_4xx]
  add_external_issue(filenames, message, response_code)
end
#handle_timeout(href, filenames, response_code) ⇒ Object
193 194 195 196 197 198 |
# File 'lib/html/proofer/url_validator.rb', line 193

# Records a timed-out request. The cache entry is written unconditionally;
# the issue is suppressed under :only_4xx.
def handle_timeout(href, filenames, response_code)
  message = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add(href, filenames, 0, message)
  return if @options[:only_4xx]
  add_external_issue(filenames, message, response_code)
end
#hash?(url) ⇒ Boolean
215 216 217 218 219 |
# File 'lib/html/proofer/url_validator.rb', line 215

# Returns the fragment ("#section" part) of +url+, or nil when the URL has
# no fragment or cannot be parsed at all. Used both as a predicate and to
# obtain the fragment text itself.
def hash?(url)
  parsed = URI.parse(url)
  parsed.fragment
rescue URI::InvalidURIError
  nil
end
#new_url_query_values?(uri) ⇒ Boolean
remember queries we’ve seen, ignore future ones
71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/html/proofer/url_validator.rb', line 71

# Remembers which query-key combinations we've already seen per domain+path
# and reports whether this one is new; future duplicates return false.
def new_url_query_values?(uri)
  query_signature = uri.query_values.keys.join('-')
  domain_path = extract_domain_path(uri)
  seen = @external_domain_paths_with_queries[domain_path]

  if seen.nil?
    @external_domain_paths_with_queries[domain_path] = [query_signature]
    true
  elsif seen.include?(query_signature)
    false
  else
    seen << query_signature
    true
  end
end
#queue_request(method, href, filenames) ⇒ Object
137 138 139 140 141 |
# File 'lib/html/proofer/url_validator.rb', line 137

# Builds a Typhoeus request for +href+ with the given HTTP +method+ layered
# over the shared Typhoeus options, wires the completion callback, and
# queues it on Hydra.
def queue_request(method, href, filenames)
  opts = @typhoeus_opts.merge({ :method => method })
  request = Typhoeus::Request.new(href, opts)
  request.on_complete { |response| response_handler(response, filenames) }
  hydra.queue request
end
#remove_query_values ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/html/proofer/url_validator.rb', line 54

# Returns a copy of @external_urls with repeated query-string variants of
# the same domain+path removed (the first variant of each query-key set is
# kept). Unparseable URLs are logged and left in place; returns nil when
# there are no external URLs at all.
def remove_query_values
  return nil if @external_urls.nil?

  deduped = @external_urls.dup
  @external_urls.keys.each do |url|
    parsed = begin
               Addressable::URI.parse(url)
             rescue URI::Error, Addressable::URI::InvalidURIError
               @logger.log :error, :red, "#{url} is an invalid URL"
               nil
             end
    next if parsed.nil? || parsed.query.nil?
    deduped.delete(url) unless new_url_query_values?(parsed)
  end
  deduped
end
#response_handler(response, filenames) ⇒ Object
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/html/proofer/url_validator.rb', line 143

# Dispatches on the outcome of one completed request: 2xx (optionally hash
# checked) is cached as success; timeouts and code-0 failures get dedicated
# handlers; a failed HEAD is retried as GET; anything else is an issue.
#
# NOTE(review): the extracted source had stripped tokens
# (`response.[:effective_url]`, `response.request.[:method]`,
# `#{response.}`); restored to the Typhoeus API names `options` and
# `status_message`.
def response_handler(response, filenames)
  effective_url = response.options[:effective_url]
  href = response.request.base_url.to_s
  method = response.request.options[:method]
  response_code = response.code

  debug_msg = "Received a #{response_code} for #{href}"
  debug_msg << " in #{filenames.join(' ')}" unless filenames.nil?
  logger.log :debug, :yellow, debug_msg

  if response_code.between?(200, 299)
    check_hash_in_2xx_response(href, effective_url, response, filenames)
    @cache.add(href, filenames, response_code)
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code == 0
    handle_failure(href, filenames, response_code)
  elsif method == :head
    # some servers aren't configured for HEAD; fall back to GET before failing
    queue_request(:get, href, filenames)
  else
    return if @options[:only_4xx] && !response_code.between?(400, 499)
    # Received a non-successful http response.
    msg = "External link #{href} failed: #{response_code} #{response.status_message}"
    add_external_issue(filenames, msg, response_code)
    @cache.add(href, filenames, response_code, msg)
  end
end
#run ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/html/proofer/url_validator.rb', line 25

# Entry point: de-duplicates the URL list, consults the cache when one is
# loadable (rechecking expired entries and cached failures, skipping fresh
# successes), runs the external checks, persists the cache, and returns the
# accumulated failed tests.
def run
  @iterable_external_urls = remove_query_values

  if @cache.exists && @cache.load
    cache_text = pluralize(@cache.cache_log.length, 'link', 'links')
    logger.log :info, :blue, "Found #{cache_text} in the cache..."

    urls_to_check = @cache.detect_url_changes(@iterable_external_urls)
    @cache.cache_log.each_pair do |url, cache|
      # a fresh entry with an empty message was a success — skip it;
      # everything else (failure or expired) gets rechecked
      next if @cache.within_timeframe?(cache['time']) && cache['message'].empty?
      urls_to_check[url] = cache['filenames']
    end
    external_link_checker(urls_to_check)
  else
    external_link_checker(@iterable_external_urls)
  end

  @cache.write
  @failed_tests
end
#url_processor(external_urls) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/html/proofer/url_validator.rb', line 116

# Normalizes each URL and queues a request for it. Invalid URLs become
# issues immediately. HEAD is cheaper, but hash checking needs the response
# body, so those URLs are fetched with GET up front.
def url_processor(external_urls)
  external_urls.each_pair do |href, filenames|
    begin
      href = clean_url(href)
    rescue URI::Error, Addressable::URI::InvalidURIError
      add_external_issue(filenames, "#{href} is an invalid URL")
      next
    end
    method = hash?(href) && @options[:check_external_hash] ? :get : :head
    queue_request(method, href, filenames)
  end
end