Class: HTMLProofer::UrlValidator
- Inherits: Object
- Includes: Utils
- Defined in: lib/html-proofer/url_validator.rb
Instance Attribute Summary
- #before_request ⇒ Object (writeonly)
  Sets the attribute before_request.
- #external_urls ⇒ Object (readonly)
  Returns the value of attribute external_urls.
Instance Method Summary
- #add_external_issue(filenames, desc, status = nil) ⇒ Object
- #check_hash_in_2xx_response(href, effective_url, response, filenames) ⇒ Object
  Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page.
- #clean_url(href) ⇒ Object
- #establish_queue(external_urls) ⇒ Object
- #external_link_checker(external_urls) ⇒ Object
  Proofer runs faster if we pull out all the external URLs and run the checks at the end.
- #extract_domain_path(uri) ⇒ Object
- #handle_failure(href, filenames, response_code, return_message) ⇒ Object
- #handle_timeout(href, filenames, response_code) ⇒ Object
- #hash?(url) ⇒ Boolean
  Does the URL have a hash?
- #initialize(logger, external_urls, options) ⇒ UrlValidator (constructor)
  A new instance of UrlValidator.
- #load_cache ⇒ Object
- #new_url_query_values?(uri, paths_with_queries) ⇒ Boolean
  Remembers queries we've seen; ignores future ones.
- #queue_request(method, href, filenames) ⇒ Object
- #remove_query_values ⇒ Object
- #response_handler(response, filenames) ⇒ Object
- #run ⇒ Object
Methods included from Utils
#create_nokogiri, #pluralize, #swap
Constructor Details
#initialize(logger, external_urls, options) ⇒ UrlValidator
# File 'lib/html-proofer/url_validator.rb', line 15

def initialize(logger, external_urls, options)
  @logger = logger
  @external_urls = external_urls
  @failed_tests = []
  @options = options
  @hydra = Typhoeus::Hydra.new(@options[:hydra])
  @cache = Cache.new(@logger, @options[:cache])
  @before_request = []
end
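A minimal usage sketch, not taken from the gem's own docs: the option keys shown are the ones this class reads (:typhoeus, :hydra, :cache, :check_external_hash, :http_status_ignore, :only_4xx); in practice they come from HTMLProofer's configuration defaults, and HTMLProofer::Log is assumed to be the gem's logger class.

require 'html-proofer'

logger = HTMLProofer::Log.new(:info)   # assumes the gem's own Log class
external_urls = { 'https://example.com/docs#install' => ['./out/index.html'] }

options = {
  typhoeus: { followlocation: true, timeout: 10 },  # merged into each Typhoeus::Request
  hydra: { max_concurrency: 50 },                   # passed to Typhoeus::Hydra.new
  cache: {},                                        # passed to Cache.new (left empty here)
  check_external_hash: true,
  http_status_ignore: [],
  only_4xx: false
}

validator = HTMLProofer::UrlValidator.new(logger, external_urls, options)
failed = validator.run
puts "#{failed.length} external link(s) failed"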
Instance Attribute Details
#before_request=(value) ⇒ Object (writeonly)
Sets the attribute before_request.

# File 'lib/html-proofer/url_validator.rb', line 13

def before_request=(value)
  @before_request = value
end
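Continuing the constructor sketch above: #queue_request (below) calls each stored callback with the Typhoeus::Request before queuing it, so the writer can be used to decorate every outgoing request. The Authorization header here is purely illustrative.

# Attach a header to every external request before it is queued (hypothetical token).
validator.before_request = [
  ->(request) { (request.options[:headers] ||= {})['Authorization'] = 'Bearer <token>' }
]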
#external_urls ⇒ Object (readonly)
Returns the value of attribute external_urls.
# File 'lib/html-proofer/url_validator.rb', line 12

def external_urls
  @external_urls
end
Instance Method Details
#add_external_issue(filenames, desc, status = nil) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 230

def add_external_issue(filenames, desc, status = nil)
  # possible if we're checking an array of links
  if filenames.nil?
    @failed_tests << Issue.new('', desc, status: status)
  else
    filenames.each { |f| @failed_tests << Issue.new(f, desc, status: status) }
  end
end
#check_hash_in_2xx_response(href, effective_url, response, filenames) ⇒ Object
Even though the response was a success, we may have been asked to check whether the hash on the URL exists on the page.
# File 'lib/html-proofer/url_validator.rb', line 186

def check_hash_in_2xx_response(href, effective_url, response, filenames)
  return false if @options[:only_4xx]
  return false unless @options[:check_external_hash]
  return false unless (hash = hash?(href))

  body_doc = create_nokogiri(response.body)

  unencoded_hash = Addressable::URI.unescape(hash)
  xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
  # user-content is a special addition by GitHub.
  if URI.parse(href).host =~ /github\.com/i
    xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
    # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
    # will be identified as a linkable portion
    xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
  end

  return unless body_doc.xpath(xpath.join('|')).empty?

  msg = "External link #{href} failed: #{effective_url} exists, but the hash '#{hash}' does not"
  add_external_issue(filenames, msg, response.code)
  @cache.add(href, filenames, response.code, msg)
  true
end
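To make the XPath above concrete, here is a standalone sketch, illustration only with a made-up page, of the same anchor lookup the method performs against the fetched body:

require 'nokogiri'

body = '<html><body><h2 id="installation">Install</h2></body></html>'
doc  = Nokogiri::HTML(body)

hash  = 'installation'
xpath = %(//*[@name="#{hash}"]|//*[@id="#{hash}"])

# Empty result set means the anchor is missing and the link would be reported.
puts doc.xpath(xpath).empty? ? "hash '#{hash}' not found" : "hash '#{hash}' found"
# => hash 'installation' found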
#clean_url(href) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 129

def clean_url(href)
  # catch any obvious issues, like strings in port numbers
  parsed = Addressable::URI.parse(href)
  if href !~ /^([!#{$&}-;=?-\[\]_a-z~]|%[0-9a-fA-F]{2})+$/
    parsed.normalize
  else
    href
  end
end
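A quick illustration of the normalization step (the URL is made up): Addressable lowercases the scheme and host and percent-encodes characters that fall outside the safe set checked above.

require 'addressable/uri'

puts Addressable::URI.parse('HTTP://Example.COM/docs/a b').normalize.to_s
# => "http://example.com/docs/a%20b"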
#establish_queue(external_urls) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 111

def establish_queue(external_urls)
  external_urls.each_pair do |url, filenames|
    url = begin
            clean_url(url)
          rescue URI::Error, Addressable::URI::InvalidURIError
            add_external_issue(filenames, "#{url} is an invalid URL")
            next
          end

    method = if hash?(url) && @options[:check_external_hash]
               :get
             else
               :head
             end
    queue_request(method, url, filenames)
  end
end
#external_link_checker(external_urls) ⇒ Object
Proofer runs faster if we pull out all the external URLs and run the checks at the end. Otherwise, we’re halting the consuming process for every file during process_files.
In addition, sorting the list lets libcurl keep connections to the same hosts alive.
Finally, we'll first make a HEAD request rather than GETting all the contents. If the HEAD fails, we'll fall back to GET, as some servers are not configured for HEAD. If we've decided to check for hashes, we must do a GET, since HEAD is not available as an option.
# File 'lib/html-proofer/url_validator.rb', line 96

def external_link_checker(external_urls)
  external_urls = Hash[external_urls.sort]

  count = external_urls.length
  check_text = pluralize(count, 'external link', 'external links')
  @logger.log :info, "Checking #{check_text}..."

  # Route log from Typhoeus/Ethon to our own logger
  Ethon.logger = @logger

  establish_queue(external_urls)

  @hydra.run
end
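A standalone sketch of the HEAD-then-GET fallback described above, outside the class and with a made-up URL; in the class itself this flow runs through #queue_request and #response_handler.

require 'typhoeus'

hydra = Typhoeus::Hydra.new(max_concurrency: 10)
url   = 'https://example.com/'

head = Typhoeus::Request.new(url, method: :head, followlocation: true)
head.on_complete do |response|
  if response.code.between?(200, 299)
    puts "#{url}: OK via HEAD"
  else
    # some servers reject HEAD; retry the same URL with GET
    get = Typhoeus::Request.new(url, method: :get, followlocation: true)
    get.on_complete { |r| puts "#{url}: #{r.code} via GET" }
    hydra.queue(get)
  end
end

hydra.queue(head)
hydra.run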
#extract_domain_path(uri) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 73

def extract_domain_path(uri)
  uri.host + uri.path
end
#handle_failure(href, filenames, response_code, return_message) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 219

def handle_failure(href, filenames, response_code, return_message)
  msg = "External link #{href} failed: response code #{response_code} means something's wrong. It's possible libcurl couldn't connect to the server or perhaps the request timed out. Sometimes, making too many requests at once also breaks things. Either way, the return message (if any) from the server is: #{return_message}"
  @cache.add(href, filenames, 0, msg)
  return if @options[:only_4xx]

  add_external_issue(filenames, msg, response_code)
end
#handle_timeout(href, filenames, response_code) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 211

def handle_timeout(href, filenames, response_code)
  msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add(href, filenames, 0, msg)
  return if @options[:only_4xx]

  add_external_issue(filenames, msg, response_code)
end
#hash?(url) ⇒ Boolean
Does the URL have a hash?
# File 'lib/html-proofer/url_validator.rb', line 240

def hash?(url)
  URI.parse(url).fragment
rescue URI::InvalidURIError
  false
end
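Note that this returns the fragment string itself (truthy) rather than a strict boolean, which is why callers can write hash = hash?(href). For example:

require 'uri'

URI.parse('https://example.com/docs#install').fragment  # => "install"
URI.parse('https://example.com/docs').fragment          # => nil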
#load_cache ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 77

def load_cache
  cache_count = @cache.size
  cache_text = pluralize(cache_count, 'link', 'links')

  @logger.log :info, "Found #{cache_text} in the cache..."

  @cache.retrieve_urls(@external_urls)
end
#new_url_query_values?(uri, paths_with_queries) ⇒ Boolean
Remembers queries we've seen; ignores future ones.
# File 'lib/html-proofer/url_validator.rb', line 59

def new_url_query_values?(uri, paths_with_queries)
  queries = uri.query_values.keys.join('-')
  domain_path = extract_domain_path(uri)
  if paths_with_queries[domain_path].nil?
    paths_with_queries[domain_path] = [queries]
    true
  elsif !paths_with_queries[domain_path].include?(queries)
    paths_with_queries[domain_path] << queries
    true
  else
    false
  end
end
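A sketch of the dedup rule this method implements, mirrored outside the class with made-up URLs: two URLs on the same host and path that differ only in query values share a key set, so only the first is checked; a URL introducing a new query key is checked again.

require 'addressable/uri'

paths_with_queries = {}

%w[
  https://example.com/search?page=1
  https://example.com/search?page=2
  https://example.com/search?page=1&sort=asc
].each do |url|
  uri     = Addressable::URI.parse(url)
  queries = uri.query_values.keys.join('-')   # "page" or "page-sort"
  key     = uri.host + uri.path               # same idea as #extract_domain_path

  seen = (paths_with_queries[key] ||= [])
  puts "#{url} -> #{seen.include?(queries) ? 'skip' : 'check'}"
  seen << queries unless seen.include?(queries)
end
# => check, skip, check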
#queue_request(method, href, filenames) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 139

def queue_request(method, href, filenames)
  opts = @options[:typhoeus].merge(method: method)
  request = Typhoeus::Request.new(href, opts)
  @before_request.each do |callback|
    callback.call(request)
  end
  request.on_complete { |response| response_handler(response, filenames) }
  @hydra.queue request
end
#remove_query_values ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 39

def remove_query_values
  return nil if @external_urls.nil?

  paths_with_queries = {}
  iterable_external_urls = @external_urls.dup
  @external_urls.each_key do |url|
    uri = begin
            Addressable::URI.parse(url)
          rescue URI::Error, Addressable::URI::InvalidURIError
            @logger.log :error, "#{url} is an invalid URL"
            nil
          end
    next if uri.nil? || uri.query.nil?

    iterable_external_urls.delete(url) unless new_url_query_values?(uri, paths_with_queries)
  end
  iterable_external_urls
end
#response_handler(response, filenames) ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 149

def response_handler(response, filenames)
  effective_url = response.options[:effective_url]
  href = response.request.base_url.to_s
  method = response.request.options[:method]
  response_code = response.code
  response.body.delete!("\x00")

  debug_msg = if filenames.nil?
                "Received a #{response_code} for #{href}"
              else
                "Received a #{response_code} for #{href} in #{filenames.join(' ')}"
              end

  @logger.log :debug, debug_msg

  return if @options[:http_status_ignore].include?(response_code)

  if response_code.between?(200, 299)
    @cache.add(href, filenames, response_code) unless check_hash_in_2xx_response(href, effective_url, response, filenames)
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code.zero?
    handle_failure(effective_url, filenames, response_code, response.return_message)
  elsif method == :head
    queue_request(:get, href, filenames)
  else
    return if @options[:only_4xx] && !response_code.between?(400, 499)

    # Received a non-successful http response.
    msg = "External link #{href} failed: #{response_code} #{response.return_message}"
    add_external_issue(filenames, msg, response_code)
    @cache.add(href, filenames, response_code, msg)
  end
end
#run ⇒ Object
# File 'lib/html-proofer/url_validator.rb', line 25

def run
  @external_urls = remove_query_values

  if @cache.use_cache?
    urls_to_check = load_cache
    external_link_checker(urls_to_check)
    @cache.write
  else
    external_link_checker(@external_urls)
  end

  @failed_tests
end