Class: WebChecker

Inherits:
Object
  • Object
show all
Defined in:
lib/web-checker.rb,
lib/web-checker/version.rb

Defined Under Namespace

Classes: Error

Constant Summary collapse

# Tidy diagnostic texts (the part after "<type>: ") that check_html_tidy
# discards instead of reporting. One message per line of the literal below.
# Blank entries are dropped so that a diagnostic with empty text is never
# treated as ignorable by accident.
IgnoreErrors =
%Q{
  <table> lacks "summary" attribute
  <img> lacks "alt" attribute
  <form> proprietary attribute "novalidate"
  <input> attribute "type" has invalid value "email"
  <input> attribute "tabindex" has invalid value "-1"
  <input> proprietary attribute "border"
  trimming empty <p>
  <iframe> proprietary attribute "allowfullscreen"
}.split(/\n/).map(&:strip).reject(&:empty?).freeze
# XPath expression selecting every +href+ and +src+ attribute in a document;
# the HTML and XML checkers pass each matched attribute value to check_uri.
LinkElementsXPath = %w[//@href //@src].join(' | ')
# Location of the bundled XSD files: "web-checker/schemas" resolved against
# the directory of this source file (presumably Path#/ joins path segments --
# confirm against the Path library in use).
SchemasDir =
Path.new(__FILE__).dirname / 'web-checker' / 'schemas'
# Maps the name of an XML document's root element to the XSD file that
# check_xml validates the document against. Root names not listed here make
# check_xml raise Error.
Schemas =
{
  'feed' => SchemasDir / 'atom.xsd',
  'urlset' => SchemasDir / 'sitemap.xsd',
}
# Version string of the library.
VERSION = '0.4'

Instance Method Summary collapse

Constructor Details

#initialize(site_uri:, site_dir:) ⇒ WebChecker

Returns a new instance of WebChecker.



28
29
30
31
32
33
# File 'lib/web-checker.rb', line 28

# Sets up a checker for a single site.
#
# @param site_uri [String, Addressable::URI] root URI of the site; parsed
#   with Addressable::URI.parse and used as the crawl starting point and as
#   the reference for deciding which URIs are local
# @param site_dir [String] directory associated with the site; wrapped in a
#   Path (not read by any of the methods visible alongside this one)
def initialize(site_uri:, site_dir:)
  # Per-instance caches: compiled XML schemas and the set of fetched URIs.
  @schemas = {}
  @visited = {}
  @site_dir = Path.new(site_dir)
  @site_uri = Addressable::URI.parse(site_uri)
end

Instance Method Details

#check ⇒ Object



35
36
37
38
39
# File 'lib/web-checker.rb', line 35

# Entry point of a run: starts checking at the site's root URI. Everything
# reachable from there is visited through check_uri.
#
# @return [Object] whatever check_uri returns for the root URI
def check
  # TODO: fetch and parse robots.txt and the sitemap before crawling.
  check_uri(@site_uri)
end

#check_css(uri, css) ⇒ Object



137
138
139
140
141
# File 'lib/web-checker.rb', line 137

# Checks every resource referenced through url(...) in a stylesheet.
#
# Double-quoted, single-quoted and unquoted forms are all recognised, so
# `url(a.png)` is checked just like `url("a.png")`. Each reference is
# resolved against +uri+ with `+` and handed to check_uri.
#
# @param uri [Addressable::URI] URI the stylesheet was fetched from; used as
#   the base for resolving references
# @param css [String] stylesheet source
# @return [nil]
def check_css(uri, css)
  # One capture group per quoting style; exactly one is non-nil per match.
  url_pattern = /\burl\(\s*(?:"([^"]*)"|'([^']*)'|([^"'()\s]+))\s*\)/i
  css.scan(url_pattern) do |double_quoted, single_quoted, bare|
    check_uri(uri + (double_quoted || single_quoted || bare))
  end
  nil
end

#check_html(uri, html) ⇒ Object



75
76
77
78
# File 'lib/web-checker.rb', line 75

# Validates an HTML document with both available checkers, Tidy first and
# Nokogiri second. Either checker raises Error on failure, so the Nokogiri
# pass (which also follows links) only runs once Tidy is satisfied.
#
# @param uri [Addressable::URI] URI the document was fetched from
# @param html [String] document source
# @return [Object] the result of check_html_nokogiri
def check_html(uri, html)
  check_html_tidy(uri, html)
  check_html_nokogiri(uri, html)
end

#check_html_nokogiri(uri, html) ⇒ Object



104
105
106
107
108
109
110
111
112
# File 'lib/web-checker.rb', line 104

# Parses an HTML document with Nokogiri and follows every link in it.
#
# Documents containing an HTML5 doctype (`<!DOCTYPE html>`, matched
# case-insensitively anywhere in the source) are parsed with
# Nokogiri::HTML5, everything else with Nokogiri::HTML. Any parse error
# reported by the parser is shown and aborts the check.
#
# @param uri [Addressable::URI] URI the document was fetched from; base for
#   resolving the href/src values found in it
# @param html [String] document source
# @raise [Error] when the parser reports at least one error
def check_html_nokogiri(uri, html)
  parser =
    if html =~ /<!DOCTYPE html>/i
      Nokogiri::HTML5
    else
      Nokogiri::HTML
    end
  document = parser.parse(html) { |config| config.strict }

  problems = document.errors
  unless problems.empty?
    show_errors(problems)
    raise Error, "HTML parsing failed (via Nokogiri)"
  end

  # Each match is an href/src attribute node; its value may be relative.
  document.xpath(LinkElementsXPath).each do |attribute|
    check_uri(uri + attribute.value)
  end
end

#check_html_tidy(uri, html) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/web-checker.rb', line 80

# Validates an HTML document by running the external `tidy` program on it.
#
# The source is written to a temporary file, tidy's combined stdout/stderr
# is split into lines, and each line is parsed into a Hash with the keys
# :msg, :line, :column, :type and :error. Diagnostics whose text is listed in
# IgnoreErrors are dropped; any remaining one fails the check.
#
# @param uri [Addressable::URI] URI the document was fetched from; only used
#   in the warning message
# @param html [String] document source
# @return [nil] when no reportable diagnostics remain
# @raise [RuntimeError] when a tidy output line does not have the expected
#   "line N column M - Type: text" shape
# @raise [Error] when at least one non-ignored diagnostic remains
def check_html_tidy(uri, html)
  tmp_file = Path.tmpfile
  tmp_file.write(html)
  output = %x{tidy -utf8 -quiet -errors #{tmp_file} 2>&1}

  problems = []
  output.split("\n").each do |str|
    # Example: line 82 column 1 - Warning: <table> lacks "summary" attribute
    parsed = /^line (\d+) column (\d+) - (.*?): (.*)$/.match(str)
    raise "Can't parse error: #{str.inspect}" unless parsed
    entry = {
      msg: str,
      line: parsed[1].to_i,
      column: parsed[2].to_i,
      type: parsed[3].downcase.to_sym,
      error: parsed[4].strip,
    }
    problems << entry unless IgnoreErrors.include?(entry[:error])
  end
  return if problems.empty?

  warn "#{uri} has invalid HTML"
  show_errors(problems)
  raise Error, "HTML parsing failed (via Tidy)"
end

#check_uri(uri) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/web-checker.rb', line 41

# Fetches one URI and checks whatever it points at.
#
# The URI is parsed and normalized; URIs that are not local to the site or
# that were already fetched are skipped. Successful responses are dispatched
# on their media type (HTML, CSS, XML are checked; common image types and
# JavaScript are accepted unchecked; anything else is reported and skipped).
# Redirects are followed by resolving the Location header against +uri+.
#
# @param uri [String, Addressable::URI] URI to check
# @return [Object, nil] nil when the URI is skipped; otherwise the result of
#   the checker that handled the response
# @raise [Error] on 404, on a redirect without a Location header, and on any
#   other non-2xx/3xx status
def check_uri(uri)
  uri = Addressable::URI.parse(uri)
  uri.normalize!
  return if !local?(uri) || seen?(uri)

  response = HTTP.get(uri)
  @visited[uri] = true
  case response.code
  when 200...300
    body = response.body.to_s
    type = response.headers['Content-Type']
    # Content-Type may carry parameters ("text/html; charset=utf-8") and is
    # case-insensitive; compare on the bare, lower-cased media type only.
    media_type = type.to_s.split(';', 2).first.to_s.strip.downcase
    case media_type
    when 'text/html'
      check_html(uri, body)
    when 'text/css'
      check_css(uri, body)
    when 'application/xml', 'text/xml'
      check_xml(uri, body)
    when 'image/jpeg', 'image/png', 'image/gif', 'application/javascript'
      # Nothing to validate for these resource types.
    else
      warn "skipping unknown resource type: #{uri} (#{type})"
    end
  when 300...400
    location = response.headers['Location']
    if location.nil? || location.to_s.empty?
      raise Error, "Redirect without Location header: #{uri}"
    end
    # Location may be relative, so resolve it against the current URI.
    check_uri(uri + Addressable::URI.parse(location))
  when 404
    raise Error, "URI not found: #{uri}"
  else
    raise Error, "Bad status: #{response.inspect}"
  end
end

#check_xml(uri, xml) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/web-checker.rb', line 114

# Checks an XML document: well-formedness, schema validity, and links.
#
# The document is parsed with Nokogiri; the schema is chosen from Schemas by
# the root element's name and cached per schema file in @schemas. After
# validation every href/src attribute value is resolved against +uri+ and
# passed to check_uri.
#
# @param uri [Addressable::URI] URI the document was fetched from
# @param xml [String] document source
# @raise [Error] when parsing reports errors, when the root element has no
#   entry in Schemas, or when schema validation reports errors
def check_xml(uri, xml)
  document = Nokogiri::XML::Document.parse(xml) { |config| config.strict }
  parse_problems = document.errors
  unless parse_problems.empty?
    show_errors(parse_problems)
    raise Error, "XML parsing failed"
  end

  root_name = document.root.name
  xsd_path = Schemas[root_name]
  raise Error, "Unknown schema: #{root_name.inspect}" unless xsd_path

  # Build each schema at most once per checker instance.
  @schemas[xsd_path] ||= Nokogiri::XML::Schema(xsd_path.open)
  failures = @schemas[xsd_path].validate(document)
  unless failures.empty?
    show_errors(failures)
    raise Error, "XML validation failed"
  end

  document.xpath(LinkElementsXPath).each do |attribute|
    check_uri(uri + attribute.value)
  end
end

#local?(uri) ⇒ Boolean

Returns:

  • (Boolean)


143
144
145
146
# File 'lib/web-checker.rb', line 143

# Tells whether a URI belongs to the site being checked.
#
# A URI with neither scheme nor host (a relative reference) counts as local;
# otherwise scheme, host and port must all equal those of the site URI.
#
# @param uri [Addressable::URI] URI to classify
# @return [Boolean]
def local?(uri)
  return true unless uri.scheme || uri.host

  site = @site_uri
  [uri.scheme, uri.host, uri.port] == [site.scheme, site.host, site.port]
end

#report ⇒ Object



152
153
154
155
156
157
158
159
# File 'lib/web-checker.rb', line 152

# Prints the list of unreferenced files held in @files, sorted, one per
# line and indented under an "unreferenced files:" heading.
#
# Prints nothing when @files is empty or was never assigned, so calling
# this before any file list exists is harmless instead of raising
# NoMethodError on nil.
#
# @return [Array, nil] the sorted entries when something was printed,
#   otherwise nil
def report
  files = @files
  return if files.nil? || files.empty?

  puts "\t" + "unreferenced files:"
  files.sort.each do |path|
    puts "\t\t" + path.to_s
  end
end

#seen?(uri) ⇒ Boolean

Returns:

  • (Boolean)


148
149
150
# File 'lib/web-checker.rb', line 148

# Tells whether a URI has already been fetched during this run.
#
# @param uri [Addressable::URI] normalized URI to look up
# @return [Boolean, nil] the value stored in @visited for +uri+ (true once
#   check_uri has fetched it), or nil when there is no entry
def seen?(uri)
  @visited.fetch(uri, nil)
end

#show_errors(errors) ⇒ Object



131
132
133
134
135
# File 'lib/web-checker.rb', line 131

# Writes one warning line per error, suffixed with its source position.
#
# Two kinds of entries are accepted, because callers pass both:
# * objects exposing #line and #column (as the parser/validator error
#   objects handed in by check_html_nokogiri and check_xml are expected to
#   -- confirm against the Nokogiri version in use); their #to_s is the text
# * Hashes with :line and :column keys as built by check_html_tidy; their
#   :msg entry is the text (falling back to the Hash's #to_s if absent)
#
# @param errors [Enumerable] errors to display
# @return [Enumerable] +errors+, unchanged
def show_errors(errors)
  errors.each do |error|
    if error.respond_to?(:line) && error.respond_to?(:column)
      text = error.to_s
      line = error.line
      column = error.column
    else
      text = error[:msg] || error.to_s
      line = error[:line]
      column = error[:column]
    end
    warn "#{text} [line #{line}, column #{column}]"
  end
end