Class: LinkChecker

Inherits:
Object
  • Object
show all
Defined in:
lib/link-checker.rb

Overview

Checks all the links of a static website to make sure they’re all valid

Constant Summary collapse

# HTTP header names mapped to values that imitate a desktop Chrome browser.
# NOTE(review): presumably sent with every outgoing request; the code that
# builds the connection is not visible here, so confirm.
HEADERS =
{
  "User-Agent" => "Mozilla/5.0 (Windows NT 6.1) " \
                  "AppleWebKit/537.36 (KHTML, like Gecko) " \
                  "Chrome/41.0.2228.0 Safari/537.36",
  "Accept" => "text/html," \
              "application/xhtml+xml," \
              "application/xml;" \
              "q=0.9,*/*;q=0.8",
  "Accept-Language" => "en-US,en;q=0.5",
  "DNT" => "1",
  "Upgrade-Insecure-Requests" => "1",
  "Pragma" => "no-cache",
  "Cache-Control" => "no-cache"
}.freeze
# Captures the value of a double-quoted href attribute (used by #links).
HREF =
/href="([^"\n]+)"/.freeze
# Captures the value of a double-quoted id attribute; the captured ids are
# the fragments a page can be linked to with.
ID =
/id="([^"\n]+)"/.freeze
# File extensions of HTML documents.
# NOTE(review): presumably what html? tests against; confirm.
HTML =
%w[.html .htm].freeze
# URI schemes that #links keeps; links with any other scheme are skipped.
SCHEMES =
%w[https http].freeze
# Base URL used when the :baseurl option is not given.
DEFAULT_BASE_URL =
"/"
# Folder scanned for the generated site when :site_folder is not given.
DEFAULT_SITE_FOLDER =
"_site"
# Request mode used when the :mode option is not given.
DEFAULT_MODE =
"try_head"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ LinkChecker

Set default values for all the properties



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/link-checker.rb', line 40

# Build a link checker from an options hash, falling back to the DEFAULT_*
# constants for anything that is not supplied.
#
# @param opts [Hash] recognised keys: :hostname, :baseurl, :site_folder,
#   :skip_list, :ignore_fragments, :mode, :verbose, :fail_fast and
#   :abort_on_failure
# @raise [ArgumentError] when :skip_list is neither an Array nor a String
#   (raised by update_skip_list)
def initialize(opts = {})
  @hostname, @baseurl, @site_folder =
    opts.values_at(:hostname, :baseurl, :site_folder)
  @baseurl ||= DEFAULT_BASE_URL
  @site_folder ||= DEFAULT_SITE_FOLDER

  update_skip_list(opts[:skip_list] || [])

  @ignore_fragments, @mode, @verbose, @fail_fast =
    opts.values_at(:ignore_fragments, :mode, :verbose, :fail_fast)
  @mode ||= DEFAULT_MODE

  # Aborting is opt-out: only an explicit non-nil value overrides the
  # default of true.
  requested = opts[:abort_on_failure]
  @abort_on_failure = requested.nil? || requested
end

Instance Attribute Details

#abort_on_failureObject

Returns the value of attribute abort_on_failure.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :abort_on_failure option; check_links consults it to decide
# whether to abort the process when invalid links are found.
def abort_on_failure
  @abort_on_failure
end

#baseurlObject

Returns the value of attribute baseurl.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :baseurl option (DEFAULT_BASE_URL when it was not supplied
# to the constructor).
def baseurl
  @baseurl
end

#fail_fastObject

Returns the value of attribute fail_fast.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :fail_fast option; when truthy, check_links stops at the
# first invalid link.
def fail_fast
  @fail_fast
end

#filesObject

Find all files in the site folder



223
224
225
226
227
228
# File 'lib/link-checker.rb', line 223

# List every regular file found under the site folder, at any depth.
# The listing is computed once and cached in @files.
#
# @return [Array<String>] paths prefixed with the site folder
def files
  @files ||= Dir.glob(File.join(@site_folder, "**/*")).select { |path| File.file?(path) }
end

#hostnameObject

Returns the value of attribute hostname.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :hostname option; check_links treats links to this host as
# internal, and valid? requires it to be set.
def hostname
  @hostname
end

#html_filesObject

Find all HTML files



251
252
253
254
255
# File 'lib/link-checker.rb', line 251

# Subset of #files for which html? is true. Cached in @html_files after the
# first call.
#
# @return [Array<String>]
def html_files
  @html_files ||= files.select { |path| html?(path) }
end

#ignore_fragmentsObject

Returns the value of attribute ignore_fragments.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :ignore_fragments option; when truthy, #links drops the
# fragment of every link and #valid_links collects no ids.
def ignore_fragments
  @ignore_fragments
end

Find all links in html_files. The value returned by this method is formatted like so: {

uri without fragment: {
  uri's fragment: Set [
    "file containing this link"
  ]
}

}



266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# File 'lib/link-checker.rb', line 266

# Collect every link found in the site's HTML files.
#
# The result is cached in @links and shaped as:
#
#   { uri_without_fragment => { fragment_or_nil => Set[files with the link] } }
#
# Links whose scheme is not listed in SCHEMES (and scheme-only URIs such as
# emails and phone numbers) are skipped. A link that cannot be parsed is
# reported on stdout and skipped as well.
#
# @return [Hash]
def links
  return @links if @links

  @links = {}
  html_files.each do |file|
    file_path = file_url(file)

    # For each link in the file
    uniq_file_matches(file, HREF).each do |link|
      begin
        uri = Addressable::URI.parse(link)
      rescue StandardError
        # `uri` was never assigned when parsing fails, so report the raw
        # link text and move on instead of dereferencing nil below.
        puts "Link \"#{link}\" in file \"#{file}\" can't be parsed."
        next
      end

      # Skip the emails and phone numbers URIs
      next if uri.site&.end_with?(":")
      # Skip the URIs with unknown schemes
      next unless uri.scheme.nil? || SCHEMES.include?(uri.scheme)

      # A bare "#fragment" link points into the file that contains it
      uri.path = file_path if link.start_with?("#")
      # Give the URI its own copy of the path string
      uri.path = uri.path.dup

      # Move the fragment out of the URI so that links to the same page
      # share one key; an empty fragment counts as no fragment
      fragment = uri.fragment
      fragment = nil if fragment&.empty? || @ignore_fragments
      uri.fragment = nil

      # The URI must be in its final state before it is used as a hash key
      files_by_fragment = (@links[uri] ||= {})
      (files_by_fragment[fragment] ||= Set.new) << file
    end
  end
  @links
end

#modeObject

Returns the value of attribute mode.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :mode option (DEFAULT_MODE when it was not supplied to the
# constructor).
def mode
  @mode
end

#site_folderObject

Returns the value of attribute site_folder.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :site_folder option, the folder that #files scans
# (DEFAULT_SITE_FOLDER when it was not supplied to the constructor).
def site_folder
  @site_folder
end

#skip_listObject

Returns the value of attribute skip_list.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the skip list set by update_skip_list; check_links does not
# check a link whose string form appears in it.
def skip_list
  @skip_list
end

#verboseObject

Returns the value of attribute verbose.



35
36
37
# File 'lib/link-checker.rb', line 35

# Reader for the :verbose option; when truthy, check_links prints a progress
# line for each link.
def verbose
  @verbose
end

Class Method Details

.from_config(config) ⇒ Object

Initialize the link checker from a Jekyll configuration file



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/link-checker.rb', line 57

# Build a LinkChecker from a Jekyll configuration hash.
#
# The hostname comes from the "url" entry (when present), the base URL and
# site folder from "baseurl" and "destination", and the checker's own
# settings from the optional "link-checker" section.
#
# @param config [Hash] configuration with String keys
# @return [LinkChecker]
def self.from_config(config)
  site_url = config["url"]
  opts = site_url ? { hostname: Addressable::URI.parse(site_url).hostname } : {}
  opts.update(baseurl: config["baseurl"], site_folder: config["destination"])

  checker_config = config["link-checker"]
  if checker_config
    # option name => key inside the "link-checker" section
    {
      skip_list: "skip-list",
      ignore_fragments: "ignore-fragments",
      mode: "mode",
      verbose: "verbose",
      fail_fast: "fail-fast",
      abort_on_failure: "abort"
    }.each { |option, key| opts[option] = checker_config[key] }
  end

  LinkChecker.new(opts)
end

Instance Method Details

Checks all the links



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/link-checker.rb', line 110

# Check every link returned by #links and report the invalid ones on stdout.
#
# * Internal links (no hostname, or the configured hostname) are looked up
#   in #valid_links, and each of their fragments must be one of the ids
#   collected for the target file.
# * External links without fragments are requested with make_request and
#   judged by status_allowed?.
# * External links with fragments are fetched with get_request; the page
#   must answer 200 and contain an id for every fragment.
#
# Links listed in the skip list are not checked. The process is aborted
# when the configuration is invalid and, if abort_on_failure is set, when
# invalid links are found.
#
# @return [Integer, nil] the number of invalid links, or nil when fail_fast
#   stopped the run at the first invalid link without aborting
def check_links
  # Make sure the configuration is valid
  abort "Invalid configuration" unless valid?

  conn = create_connection

  # Prints the files that contain an offending link, one per line
  list_files = ->(files) { files.each { |file| puts "\t#{file}" } }

  error_count = 0
  prev_msg_size = 0
  total = links.size
  links.each.with_index(1) do |(uri, fragments), position|
    if verbose
      # Blank out the previous progress message, then overwrite it
      print " " * prev_msg_size
      msg = "#{uri} #{position}/#{total}"
      print "\r#{msg}\r"
      prev_msg_size = msg.size
    end

    # Skip the link if it's in the skip list
    next if @skip_list.include?(uri.to_s)

    # Every file containing this link, whatever the fragment. The values
    # are Sets, which flat_map does not flatten by itself.
    all_files = fragments.values.flat_map(&:to_a).uniq

    error = false

    if uri.hostname.nil? || uri.hostname == hostname
      # Internal link: look the path up without its trailing slash. The
      # URI is a key of the links hash, so work on a copy of its path.
      valid_fragments = valid_links[uri.path.chomp("/")]
      if valid_fragments
        fragments.each do |fragment, files|
          # A nil fragment is a link to the page itself, which exists
          next if fragment.nil? || valid_fragments.include?(fragment)

          error = true
          puts "Invalid fragment '#{fragment}' in link '#{uri}' " \
               "is present in:"
          list_files.call(files)
        end
      else
        error = true
        puts "Invalid internal link '#{uri}' is present in:"
        list_files.call(all_files)
      end
    elsif fragments.keys == [nil]
      # External link without fragment: only the status matters
      begin
        status = make_request(conn, uri)
        unless status_allowed?(status)
          error = true
          puts "Request to #{uri} returned #{status} present in"
          list_files.call(all_files)
        end
      rescue StandardError => e
        # NOTE(review): a failed request is reported but not counted as an
        # invalid link, as before; confirm this is intended.
        puts "Request to #{uri} produced the error #{e.class} present in"
        list_files.call(all_files)
        puts e.message, "\n"
      end
    else
      # External link with fragments: the body is needed to look for ids
      begin
        response = get_request(conn, uri)
        status = response.status
        if status == 200
          valid_fragments = uniq_string_matches(response.body, ID)
          fragments.each do |fragment, files|
            # A nil fragment is a link to the page itself, which answered
            next if fragment.nil? || valid_fragments.include?(fragment)

            error = true
            puts "Invalid link to fragment '#{fragment}' present in: "
            list_files.call(files)
          end
        else
          error = true
          puts "Request to #{uri} returned #{status} present in"
          list_files.call(all_files)
        end
      rescue StandardError => e
        # NOTE(review): same as above, reported but not counted.
        puts "Request to #{uri} produced the error #{e.class} present in"
        list_files.call(all_files)
        puts e.message, "\n"
      end
    end

    next unless error

    error_count += 1
    next unless fail_fast

    abort if abort_on_failure
    return nil
  end

  # Terminate the progress line
  puts if verbose

  unless error_count.zero?
    msg = "There were #{error_count} invalid links"
    abort msg if @abort_on_failure
    puts msg
  end

  error_count
end

#update_skip_list(skip_list) ⇒ Object

Updates skip_list with the given argument. If the argument is an array, skip_list is set to that array. If the argument is a string, it is interpreted as the path of a file in which each non-empty line is one entry of the skip list.



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/link-checker.rb', line 93

# Replace the skip list.
#
# @param skip_list [Array<String>, String] either the entries themselves, or
#   the path of a file holding one entry per line (lines are stripped and
#   empty ones dropped)
# @raise [ArgumentError] when skip_list is neither an Array nor a String
# @raise [StandardError] whatever reading the file raised, after a warning
#   has been written to stderr
def update_skip_list(skip_list)
  case skip_list
  when Array
    @skip_list = skip_list
  when String
    begin
      list_path = File.expand_path(skip_list)
      @skip_list = File.readlines(list_path).map(&:strip)
    rescue StandardError
      warn "Couldn't read the skip list"
      raise
    end
    @skip_list.reject!(&:empty?)
  else
    raise ArgumentError, "skip_list must be a String or an array of String"
  end
end

#valid?Boolean

Whether the options are valid

Returns:

  • (Boolean)


83
84
85
86
87
# File 'lib/link-checker.rb', line 83

# Tell whether the checker is usable: a hostname must have been set.
#
# @return [Boolean]
def valid?
  @hostname ? true : false
end

Find all the valid links for the site. The value returned by this method is formatted like so:

"path": [
  fragment
]



237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/link-checker.rb', line 237

# Map the URL of every file of the site to the fragments it can be linked
# with. Cached in @valid_links after the first call.
#
# Ids are only collected for HTML files, and only when fragments are not
# ignored; every other file maps to an empty list.
#
# @return [Hash] file URL => list of ids
def valid_links
  @valid_links ||= files.each_with_object({}) do |path, by_url|
    scan_ids = html?(path) && !@ignore_fragments
    ids = scan_ids ? uniq_file_matches(path, ID) : []
    by_url[file_url(path)] = ids
  end
end