Class: Recluse::Profile

Inherits:
Object
  • Object
show all
Defined in:
lib/recluse/profile.rb

Overview

A profile is an atomic unit of rules for link checking.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile

Create a profile.

Raises:

  • (ProfileError) — if roots is empty



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/recluse/profile.rb', line 58

# Create a profile.
#
# Stores every argument in the instance variable of the same name and builds
# an empty HashTree for the results.
#
# @param name identifier of the profile
# @param roots collection of starting URLs; must respond to #empty?
# @param email contact string (embedded in the user-agent by #create_agent)
# @param blacklist stored as-is; defaults to an empty array
# @param whitelist stored as-is; defaults to an empty array
# @param internal_only stored as-is; defaults to false
# @param scheme_squash stored as-is; defaults to false
# @param redirect stored as-is; defaults to false
# @raise [ProfileError] if +roots+ is empty
def initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false)
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?

  @name, @email, @roots = name, email, roots
  @blacklist, @whitelist = blacklist, whitelist
  @internal_only, @scheme_squash, @redirect = internal_only, scheme_squash, redirect

  # Key comparator handed to the HashTree: two URLs are the same entry when
  # they are identical, or when the longer one is exactly the shorter one
  # plus a single trailing slash.
  @results = HashTree.new do |left, right|
    longer, shorter = right.length > left.length ? [right, left] : [left, right]
    next true if longer == shorter

    longer.length == (shorter.length + 1) &&
      longer[-1] == '/' &&
      shorter[-1] != '/' &&
      longer[0...-1] == shorter
  end
end

Instance Attribute Details

#blacklist ⇒ Object

Array of URL patterns to exclude from checking. Optional. Defaults to empty array.



34
35
36
# File 'lib/recluse/profile.rb', line 34

# Blacklist stored by the constructor (an empty array unless one was given).
# The crawl methods pass it, together with the whitelist, to Link#run?.
#
# @return the current value of @blacklist
def blacklist
  @blacklist
end

#email ⇒ Object

Used in the user-agent to identify who is running the crawler. This is so that if there’s a problem with your spidering, you will be contacted and not the author of Recluse. Required.



30
31
32
# File 'lib/recluse/profile.rb', line 30

# Contact string stored by the constructor; #create_agent appends it to the
# user-agent header.
#
# @return the current value of @email
def email
  @email
end

#internal_only ⇒ Object

Don’t check external URLs. Optional. Defaults to false.



42
43
44
# File 'lib/recluse/profile.rb', line 42

# Flag stored by the constructor (false by default). When truthy, #status
# skips links that are not internal to the roots.
#
# @return the current value of @internal_only
def internal_only
  @internal_only
end

#name ⇒ Object

Identifier of the profile. Make sure that it is filename friendly. Required.



22
23
24
# File 'lib/recluse/profile.rb', line 22

# Profile identifier stored by the constructor. #save uses it as the base of
# the YAML file name, and the crawl methods print it in their log lines.
#
# @return the current value of @name
def name
  @name
end

#redirect ⇒ Object

When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.



54
55
56
# File 'lib/recluse/profile.rb', line 54

# Flag stored by the constructor (false by default). #create_agent hands it
# to the agent as redirect_ok, and when it is truthy the crawl methods
# re-check whether the URI of the fetched page is still internal.
#
# @return the current value of @redirect
def redirect
  @redirect
end

#results ⇒ Object

HashTree representation of results.



50
51
52
# File 'lib/recluse/profile.rb', line 50

# HashTree created by the constructor and filled in by #status, #find and
# #assert.
#
# @return the current value of @results
def results
  @results
end

#roots ⇒ Object

Array of URLs to start spidering. Required.



26
27
28
# File 'lib/recluse/profile.rb', line 26

# Starting URLs stored by the constructor (which rejects an empty
# collection). Every crawl method seeds its queue from them.
#
# @return the current value of @roots
def roots
  @roots
end

#scheme_squash ⇒ Object

HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.



46
47
48
# File 'lib/recluse/profile.rb', line 46

# Flag stored by the constructor (false by default). When truthy, the crawl
# methods look up the http/https twin of each URL in the results before
# fetching it.
#
# @return the current value of @scheme_squash
def scheme_squash
  @scheme_squash
end

#whitelist ⇒ Object

Array of exceptions to the blacklist. Optional. Defaults to empty array.



38
39
40
# File 'lib/recluse/profile.rb', line 38

# Whitelist stored by the constructor (an empty array unless one was given).
# The crawl methods pass it, together with the blacklist, to Link#run?.
#
# @return the current value of @whitelist
def whitelist
  @whitelist
end

Class Method Details

.load(profile) ⇒ Object

Loads profile by name.

Raises:

  • (ProfileError) — if no saved profile with that name exists, or if the saved profile has no roots



273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# File 'lib/recluse/profile.rb', line 273

# Loads a profile by name from the '.recluse' user configuration.
#
# Reads "<profile>.yaml", forwards the optional settings that are present and
# non-nil as keyword arguments, and substitutes [] / '' for missing roots /
# email (an empty roots list is then rejected by the constructor).
#
# @param profile name of the profile; also the base of the YAML file name
# @return [Profile] the newly constructed profile
# @raise [ProfileError] if "<profile>.yaml" does not exist
def self.load(profile)
  fname = "#{profile}.yaml"
  uconf = UserConfig.new '.recluse'
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless uconf.exist?(fname)

  options = uconf[fname]
  present = ->(key) { options.key?(key) && !options[key].nil? }

  opts = %i[blacklist whitelist internal_only scheme_squash redirect].each_with_object({}) do |setting, acc|
    key = setting.to_s
    acc[setting] = options[key] if present.call(key)
  end

  roots = present.call('roots') ? options['roots'] : []
  email = present.call('email') ? options['email'] : ''
  Profile.new(profile, roots, email, **opts)
end

Instance Method Details

#==(other) ⇒ Object

Test if profiles share the same configuration options.



264
265
266
267
268
269
# File 'lib/recluse/profile.rb', line 264

# Test if profiles share the same configuration options.
#
# Two objects are equal when they have exactly the same class and every
# instance variable of the receiver except @results compares equal.
#
# @param other object to compare with
# @return [Boolean]
def ==(other)
  return false unless other.class == self.class

  (instance_variables - [:@results]).all? do |ivar|
    instance_variable_get(ivar) == other.instance_variable_get(ivar)
  end
end

#assert(selectors, quiet: false) ⇒ Object

Asserts existence of CSS selectors.

Raises:

  • (ProfileError) — if the profile has no roots to start from



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/recluse/profile.rb', line 195

# Asserts existence of CSS selectors.
#
# Crawls from the roots. Every link that is runnable, internal and not yet
# recorded is fetched; for pages that are not Mechanize::File/Image, a hash of
# selector => true/false is stored as the child value in @results and the
# page's links are enqueued.
#
# @param selectors collection of CSS selectors, each tested with page.css
# @param quiet [Boolean] when true, nothing is printed
# @raise [ProfileError] if there are no roots to start from
def assert(selectors, quiet: false)
  queue = @roots.map { |url| Link.new(url, :root) }
  addrroot = @roots.map { |url| Addressable::URI.parse url }
  raise ProfileError, 'No roots to start from' if queue.empty?
  agent = create_agent
  until queue.empty?
    element = queue.shift
    internal = element.internal?(addrroot)
    next unless element.run?(@blacklist, @whitelist) && internal && !@results.child?(element.absolute)
    if @scheme_squash
      # Work on a copy so the scheme swap cannot leak back into the link.
      # NOTE(review): Link#address is not visible here; if it already returns
      # a fresh object the dup is merely redundant.
      alt = element.address.dup
      alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
      next if @results.child?(alt.to_s)
    end
    @results.add_child element.absolute
    existence = nil
    # Placeholder code and "no error" marker until the request resolves.
    result = Result.new 'idk', false
    begin
      page = agent.get element.absolute
      result.code = page.code
      if @redirect
        # Judge internal-ness by the URI of the page actually returned.
        result_link = Link.new(page.uri.to_s, element.parent)
        next unless result_link.internal?(addrroot)
      end
      unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
        existence = {}
        selectors.each do |selector|
          existence[selector] = !page.css(selector).empty?
        end
        @results.set_child_value element.absolute, existence
        # Append in place; `queue += ...` copied the whole queue per page.
        queue.concat(page.links.map { |link| Link.new(link.uri.to_s, element.absolute) })
      end
    rescue Mechanize::ResponseCodeError => code
      result.code = code.response_code
    rescue => e
      result.error = e
    end
    unless quiet
      if result.error != false
        puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
        puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
      elsif !existence.nil?
        existence.each do |selector, exists|
          puts "[#{@name.colorize(mode: :bold)}][#{selector.colorize(mode: :bold)}][#{exists.to_s.colorize(color: (exists ? :green : :red), mode: :bold)}] #{element.absolute}"
        end
      end
    end
  end
end

#create_agent ⇒ Object

Create a Mechanize agent.



86
87
88
89
90
91
92
93
94
95
96
# File 'lib/recluse/profile.rb', line 86

# Create a Mechanize agent.
#
# The agent keeps no page history limit (max_history = nil), follows meta
# refreshes, disables keep-alive, follows redirects only when @redirect is
# set, and identifies itself with the Recluse version, the Recluse URL and
# the profile's email.
#
# The TLS version is no longer pinned: forcing 'TLSv1' restricts the
# connection to TLS 1.0, which is deprecated and refused by most current
# servers, so HTTPS links would be reported as errors. The version is left to
# normal negotiation instead.
#
# @return [Mechanize] the configured agent
def create_agent
  Mechanize.new do |a|
    # NOTE(review): certificate verification is disabled; presumably so that
    # links on hosts with bad certificates can still be checked — confirm this
    # is intended.
    a.verify_mode = OpenSSL::SSL::VERIFY_NONE
    a.max_history = nil
    a.follow_meta_refresh = true
    a.keep_alive = false
    a.redirect_ok = @redirect
    a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
  end
end

#find(glob, quiet: false) ⇒ Object

Find links matching glob patterns, starting from the roots. Overrides (but does not overwrite) internal_only behavior to true.

Raises:

  • (ProfileError) — if the profile has no roots to start from



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/recluse/profile.rb', line 148

# Find links matching glob patterns, starting from the roots.
#
# Every dequeued link is first tested with Link#match?; matches are recorded
# in @results with their referrer. Only runnable, internal links that are not
# already recorded as parents are then fetched, and the links of fetched
# pages (other than Mechanize::File/Image) are enqueued. @internal_only is
# not consulted: only internal links are ever followed.
#
# @param glob pattern(s) handed to Link#match?
# @param quiet [Boolean] when true, no progress bar is created or logged to
# @raise [ProfileError] if there are no roots to start from
def find(glob, quiet: false)
  queue = @roots.map { |url| Link.new(url, :root) }
  addrroot = @roots.map { |url| Addressable::URI.parse url }
  raise ProfileError, 'No roots to start from' if queue.empty?
  progress = ProgressBar.create(total: nil, format: '|%B|') unless quiet
  agent = create_agent
  until queue.empty?
    element = queue.shift
    match = element.match? glob
    if match
      @results.add element.absolute, element.parent
      progress.log "[#{@name.colorize(mode: :bold)}][#{'found'.colorize(color: :green, mode: :bold)}] #{element.parent} => #{element.absolute}" unless quiet
    end
    next unless element.run?(@blacklist, @whitelist)
    internal = element.internal?(addrroot)
    next unless internal
    next if @results.parent?(element.absolute)
    if @scheme_squash
      # Work on a copy so the scheme swap cannot leak back into the link.
      # NOTE(review): Link#address is not visible here; if it already returns
      # a fresh object the dup is merely redundant.
      alt = element.address.dup
      alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
      next if @results.parent?(alt.to_s)
    end
    @results.add_parent element.absolute
    # Placeholder code and "no error" marker until the request resolves.
    result = Result.new 'idk', false
    begin
      page = agent.get element.absolute
      result.code = page.code
      if @redirect
        # Judge internal-ness by the URI of the page actually returned.
        result_link = Link.new(page.uri.to_s, element.parent)
        next unless result_link.internal?(addrroot)
      end
      unless (page.class == Mechanize::File) || (page.class == Mechanize::Image)
        # Append in place; `queue += ...` copied the whole queue per page.
        queue.concat(page.links.map { |link| Link.new(link.uri.to_s, element.absolute) })
      end
    rescue Mechanize::ResponseCodeError => code
      result.code = code.response_code
    rescue => e
      result.error = e
    end
    progress.increment unless quiet
    unless quiet || (result.error == false)
      progress.log "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}] #{element.absolute}"
      progress.log "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}"
    end
  end
end

#save ⇒ Object

Saves profile to ~/.recluse/NAME.yaml.



247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/recluse/profile.rb', line 247

# Saves the profile to "<name>.yaml" in the '.recluse' user configuration.
#
# Writes the name, roots, email and every option under string keys, then
# persists the file. Results are not saved.
#
# @return whatever the configuration file's #save returns
def save
  options = UserConfig.new('.recluse')["#{@name}.yaml"]
  {
    'name' => @name,
    'roots' => @roots,
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |key, value| options[key] = value }
  options.save
end

#status(quiet: false) ⇒ Object

Starting from the roots, goes through each runnable link and records the referrer, the status code, and any errors. Results are saved in @results.

Raises:

  • (ProfileError) — if the profile has no roots to start from



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/recluse/profile.rb', line 101

# Starting from the roots, goes through each runnable link and records the
# referrer, the status code, and any errors. Results are saved in @results.
#
# A URL is fetched only the first time it is seen; later sightings just add
# another referrer. With @scheme_squash, a URL whose http/https twin is
# already recorded reuses the twin's result instead of being fetched. Links
# of a fetched page are enqueued only while the page counts as internal and
# is not a Mechanize::File/Image.
#
# @param quiet [Boolean] when true, nothing is printed
# @raise [ProfileError] if there are no roots to start from
def status(quiet: false)
  queue = @roots.map { |url| Link.new(url, :root) }
  addrroot = @roots.map { |url| Addressable::URI.parse url }
  raise ProfileError, 'No roots to start from' if queue.empty?
  agent = create_agent
  until queue.empty?
    element = queue.shift
    next unless element.run?(@blacklist, @whitelist)
    internal = element.internal?(addrroot)
    next if @internal_only && !internal
    if @results.child?(element.absolute)
      # Already checked: only record the additional referrer.
      @results.add element.absolute, element.parent
      next
    end
    @results.add element.absolute, element.parent
    if @scheme_squash
      # Work on a copy so the scheme swap cannot leak back into the link.
      # NOTE(review): Link#address is not visible here; if it already returns
      # a fresh object the dup is merely redundant.
      alt = element.address.dup
      alt.scheme = alt.scheme == 'http' ? 'https' : 'http'
      if @results.child?(alt.to_s)
        @results.set_child_value element.absolute, @results.get_child_value(alt.to_s)
        next
      end
    end
    # Placeholder code and "no error" marker until the request resolves.
    result = Result.new 'idk', false
    begin
      page = agent.get element.absolute
      result.code = page.code
      if @redirect
        # Judge internal-ness by the URI of the page actually returned.
        result_link = Link.new(page.uri.to_s, element.parent)
        internal = result_link.internal?(addrroot)
      end
      if internal && (page.class != Mechanize::File) && (page.class != Mechanize::Image)
        # Append in place; `queue += ...` copied the whole queue per page.
        queue.concat(page.links.map { |link| Link.new(link.uri.to_s, element.absolute) })
      end
    rescue Mechanize::ResponseCodeError => code
      result.code = code.response_code
    rescue => e
      result.error = e
    end
    @results.set_child_value element.absolute, result
    unless quiet
      puts "[#{@name.colorize(mode: :bold)}][#{result.code.colorize(color: result.color, mode: :bold)}][#{(internal ? 'internal' : 'external').colorize(mode: :bold)}] #{element.absolute}"
      puts "\a^ #{'Error'.colorize(mode: :bold, color: :red)}: #{result.error}" unless result.error == false
    end
  end
end