Module: RDig::UrlFilters

Defined in:
lib/rdig/url_filters.rb

Defined Under Namespace

Classes: DepthFilter, FilterChain, PathExclusionFilter, PathInclusionFilter, PatternFilter, UrlExclusionFilter, UrlInclusionFilter, VisitedUrlFilter

Class Method Summary collapse

Class Method Details

.fix_relative_uri(document) ⇒ Object

Expands relative links such as href=“/path/xyz.html”, href=“affe.html” and href=“../lala.html” to full URLs.



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/rdig/url_filters.rb', line 168

# Expands a relative link (href="/path/xyz.html", href="affe.html" or
# href="../lala.html") into a full URL by filling in the parts missing from
# document.uri with those of document.referring_uri.
#
# document - object responding to #uri and #referring_uri. document.uri is
#            modified in place; the referring URI is left untouched.
#
# Returns the document, or nil if resolving the URI raised an error (the
# failure is reported on stderr).
def UrlFilters.fix_relative_uri(document)
  #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
  ref = document.referring_uri
  return document unless ref
  uri = document.uri
  uri.scheme = ref.scheme unless uri.scheme
  uri.host = ref.host unless uri.host
  # only carry the referrer's port over when it is not the scheme default
  uri.port = ref.port unless uri.port || ref.port == ref.default_port
  uri.path = ref.path unless uri.path

  link_path = uri.path
  unless link_path =~ /^\//
    # Work on a local value: appending to ref.path directly would modify the
    # referring URI as a side effect.
    ref_path = ref.path
    ref_path = '/' if ref_path.nil? || ref_path.empty?
    last_slash = ref_path.rindex('/')
    # directory part of the referrer's path, including the trailing slash
    base_dir = last_slash ? ref_path[0..last_slash] : ''
    uri.path = base_dir + link_path
  end

  if link_path =~ /^\.\./
    # Collapse every "/segment/.." pair, so that links climbing several
    # levels ("../../x.html") are resolved completely. The lookaheads keep
    # ".." from being treated as an ordinary segment and require ".." to be
    # a whole segment (not the start of a name such as "..foo").
    resolved = uri.path
    while (collapsed = resolved.sub(%r{/(?!\.\.(?:/|\z))[^/]*/\.\.(?=/|\z)}, '')) != resolved
      resolved = collapsed
    end
    uri.path = resolved
  end
  document
rescue => e
  # Report the problem and drop the document instead of handing some other
  # object on in its place.
  warn "fix_relative_uri failed for #{document.inspect}: #{e.class}: #{e.message}"
  nil
end

.hostname_filter(document, include_hosts) ⇒ Object

Filters URIs by a list of hostnames. With a nil or empty list, all documents pass this filter.



193
194
195
196
# File 'lib/rdig/url_filters.rb', line 193

# Lets a document pass only if its URI's host is in include_hosts.
#
# document      - object whose #uri responds to #host
# include_hosts - collection of allowed host names; nil or an empty
#                 collection disables the check
#
# Returns the document when it passes, nil otherwise.
def UrlFilters.hostname_filter(document, include_hosts)
  #RDig.logger.debug "hostname_filter: #{include_hosts}"
  unrestricted = include_hosts.nil? || include_hosts.empty?
  return document if unrestricted
  include_hosts.include?(document.uri.host) ? document : nil
end

.maximum_redirect_filter(document, max_redirects) ⇒ Object

Checks the redirect count of the given document and takes it out of the chain if the number of redirections exceeds the max_redirects setting.



161
162
163
164
# File 'lib/rdig/url_filters.rb', line 161

# Drops documents that were redirected more often than allowed.
#
# document      - any object; the check only applies if it responds to
#                 #redirections
# max_redirects - highest redirection count that still passes
#
# Returns nil when document.redirections exceeds max_redirects, otherwise
# the document itself.
def UrlFilters.maximum_redirect_filter(document, max_redirects)
  if document.respond_to?(:redirections)
    return nil if document.redirections > max_redirects
  end
  document
end

.normalize_uri(document, cfg) ⇒ Object



198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/rdig/url_filters.rb', line 198

# Normalizes document.uri in place: the fragment is always removed, and a
# path ending in a slash is adjusted according to cfg.
#
# document - object responding to #uri
# cfg      - configuration object responding to #index_document (name to
#            append to paths ending in "/", or nil/false) and
#            #remove_trailing_slash (strip the trailing slash instead; only
#            consulted when no index document is configured)
#
# Returns the document.
def UrlFilters.normalize_uri(document, cfg)
  uri = document.uri
  uri.fragment = nil
  # document.uri.query = nil
  # trailing slash handling
  if uri.path =~ /\/$/
    if cfg.index_document
      # Append the index document from the configuration that was passed in,
      # and assign through the setter rather than changing the path string
      # behind the URI's back.
      uri.path = uri.path + cfg.index_document
    elsif cfg.remove_trailing_slash
      uri.path = uri.path.sub(/\/$/, '')
    end
  end
  document
end

.scheme_filter_file(document) ⇒ Object



213
214
215
216
# File 'lib/rdig/url_filters.rb', line 213

# Lets a document pass only if its URI has no scheme or the "file" scheme
# (compared case-insensitively).
#
# Returns the document when it passes, nil otherwise.
def UrlFilters.scheme_filter_file(document)
  scheme = document.uri.scheme
  return nil unless scheme.nil? || scheme =~ /^file$/i
  document
end

.scheme_filter_http(document) ⇒ Object



217
218
219
220
# File 'lib/rdig/url_filters.rb', line 217

# Lets a document pass only if its URI has no scheme or an "http"/"https"
# scheme (compared case-insensitively).
#
# Returns the document when it passes, nil otherwise.
def UrlFilters.scheme_filter_http(document)
  scheme = document.uri.scheme
  return nil unless scheme.nil? || scheme =~ /^https?$/i
  document
end