Module: ActiveRecord::Validations::ClassMethods

Defined in:
lib/http_url_validation_improved.rb

Instance Method Summary collapse

Instance Method Details

#validates_http_domain(*attr_names) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/http_url_validation_improved.rb', line 140

# Validates that each attribute holds a host name that can be resolved.
#
# The value is looked up as given and, failing that, with a "www." prefix;
# a single successful lookup is enough. When neither lookup succeeds the
# error "cannot be resolved." is added to the attribute.
#
# Blank values (nil or empty) are skipped, in line with validates_http_url;
# use validates_presence_of to require a value.
#
# @param attr_names [Array<Symbol>] attributes to validate
def validates_http_domain(*attr_names)
  validates_each(attr_names) do |record, attr_name, value|
    # A nil value used to blow up in the "www." concatenation before any
    # lookup was attempted; treat blank input as "nothing to check".
    next if value.nil? || value.empty?

    # any? stops at the first candidate that resolves.
    resolvable = [value, "www.#{value}"].any? do |host|
      begin
        Socket.gethostbyname(host)
        true
      rescue SocketError
        false
      end
    end

    record.errors.add(attr_name, "cannot be resolved.") unless resolvable
  end
end

#validates_http_url(*attr_names) ⇒ Object

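Validates that a URL attribute is well-formed (it has a scheme and a host) and that the address responds to an HTTP(S) request.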



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/http_url_validation_improved.rb', line 10

# Validates that each attribute holds a well-formed URL that answers an
# HTTP(S) request with an acceptable status.
#
# A HEAD request is sent first. Depending on the failure the request is
# retried:
# * 405 / 500 - once more as a GET
# * 403       - as a GET, then as a GET without the custom headers
# * 301       - once more with "/" appended to the value. 301 is currently in
#               the allowed list, so this path is only taken when it is
#               removed from that list or an error occurs after the response
#               has been received.
#
# Blank values are skipped; use validates_presence_of to require a value.
#
# @param attr_names [Array<Symbol, Hash>] attributes to validate, optionally
#   followed by an options Hash:
#   * :content_type - String or Regexp the response Content-Type must contain
#   * :message_not_accessible, :message_wrong_content,
#     :message_moved_permanently, :message_url_format - message overrides
#   The Hash is also handed to validates_each unchanged.
def validates_http_url(*attr_names)
  configuration = {
    :message_not_accessible => "is not accessible when we tried the link",
    :message_wrong_content => "is not of the appropriate content type",
    :message_moved_permanently => "has moved permanently",
    :message_url_format => "is not formatted correctly. (Missing 'http://'?)"
  }
  configuration.update(attr_names.pop) if attr_names.last.is_a?(Hash)

  # some domains will block requests that come in more frequently than 1 per second
  sleepy_domains = ['wikipedia.org']
  sleep_interval = 2 # 2 to be on the safe side

  # Comment out as you need to
  allowed_codes = [
    Net::HTTPMovedPermanently,
    Net::HTTPOK,
    Net::HTTPCreated,
    Net::HTTPAccepted,
    Net::HTTPNonAuthoritativeInformation,
    Net::HTTPPartialContent,
    Net::HTTPFound,
    Net::HTTPTemporaryRedirect,
    Net::HTTPSeeOther
  ]

  validates_each(attr_names, configuration) do |record, attr_name, value|
    # Ignore blank URLs, these can be validated with validates_presence_of
    next if value.nil? || value.empty?

    # Retry state lives outside the begin block so that `retry` keeps it.
    moved_retry = false
    not_allowed_retry = false
    retry_without_headers = false

    begin
      # Reset per attempt: the rescue clause uses these to tell a format
      # problem (nothing parsed) from a connection problem (parsed, but no
      # response) from a rejected response.
      response = nil
      format_ok = false
      must_sleep = sleepy_domains.any? { |domain| value.include?(domain) }

      # No escaping is applied so that unicode values are allowed.
      url = Addressable::URI.parse(value)

      # Check Formatting: a protocol and a domain name must be specified.
      raise unless url.scheme
      raise unless url.host
      format_ok = true

      use_ssl = url.scheme.to_s.downcase == 'https'
      # Honour a port given in the URL; fall back to the scheme default.
      port = url.port || (use_ssl ? 443 : 80)

      # Request the path together with its query string, so that
      # "/page?id=3" is not silently checked as "/page".
      path = url.path.to_s
      path = "/" if path.empty?
      target = url.query ? "#{path}?#{url.query}" : path

      http = Net::HTTP.new(url.host, port)
      if use_ssl
        http.use_ssl = true
        # NOTE(review): certificate verification is switched off, so any
        # certificate is accepted - presumably deliberate for a reachability
        # check; confirm before relying on this for anything else.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      user_agent = if Object.const_defined?('SITE_URL')
                     "#{SITE_URL} link checking mechanism (http://github.com/kete/http_url_validation_improved) via Ruby Net/HTTP"
                   else
                     "Ruby Net/HTTp used for link checking mechanism (http://github.com/kete/http_url_validation_improved)"
                   end
      headers = { "User-Agent" => user_agent }

      response = if not_allowed_retry
                   sleep sleep_interval if must_sleep

                   # The empty block keeps Net::HTTP from reading the body
                   # before the response is handed back.
                   if retry_without_headers
                     http.request_get(target) { |_r| }
                   else
                     http.request_get(target, headers) { |_r| }
                   end
                 else
                   http.request_head(target, headers)
                 end

      # If response is not allowed, raise an error
      raise unless allowed_codes.include?(response.class)

      # Check if the model requires a specific content type. A response
      # without a Content-Type header counts as the wrong content type.
      unless configuration[:content_type].nil?
        content_type = response['content-type']
        if content_type.nil? || content_type.index(configuration[:content_type]).nil?
          record.errors.add(attr_name, configuration[:message_wrong_content])
        end
      end
    rescue
      if response.is_a?(Net::HTTPMovedPermanently)
        # Has the page moved?
        if moved_retry
          record.errors.add(attr_name, configuration[:message_moved_permanently])
        else
          moved_retry = true
          value += "/" # In case webserver is just adding a /
          retry
        end
      elsif response.is_a?(Net::HTTPMethodNotAllowed) || response.is_a?(Net::HTTPInternalServerError)
        if !not_allowed_retry
          # Retry with a GET
          not_allowed_retry = true
          retry
        elsif response.is_a?(Net::HTTPInternalServerError)
          record.errors.add(attr_name, configuration[:message_not_accessible]+". The site link in question has had a problem. Please raise the issue with them and let them know that requests to the link break when coming from the automatic link checking mechanism on this site.")
        else
          record.errors.add(attr_name, configuration[:message_not_accessible]+" (GET method not allowed)")
        end
      elsif response.is_a?(Net::HTTPForbidden)
        # handle requests where particular variants are forbidden
        if not_allowed_retry && retry_without_headers
          record.errors.add(attr_name, configuration[:message_not_accessible] + ". The website says the URL is Forbidden.")
        elsif not_allowed_retry
          # try again but without headers (sometimes site refuse custom headers)
          # for now, at least, this does a full GET request (rather than just head)
          retry_without_headers = true
          retry
        else
          # try a full request GET first (rather than just head)
          not_allowed_retry = true
          retry
        end
      elsif response.nil?
        if format_ok
          # The URL parsed fine but no response arrived (DNS failure,
          # refused connection, timeout, ...): an access problem, not a
          # formatting one.
          record.errors.add(attr_name, configuration[:message_not_accessible])
        else
          record.errors.add(attr_name, configuration[:message_url_format])
        end
      else
        # Just Plain non-accessible
        record.errors.add(attr_name, configuration[:message_not_accessible]+". This is what the website in question returned to us: "+response.class.to_s)
      end
    end
  end
end