Class: Recluse::Profile
- Inherits:
-
Object
- Object
- Recluse::Profile
- Defined in:
- lib/recluse/profile.rb
Overview
A profile is an atomic unit of rules for link checking.
Instance Attribute Summary collapse
-
#blacklist ⇒ Object
Array of URL patterns to exclude from checking.
-
#email ⇒ Object
Used in the user-agent to identify who is running the crawler.
-
#internal_only ⇒ Object
Don’t check external URLs.
-
#name ⇒ Object
Identifier of the profile.
-
#redirect ⇒ Object
When enabled, will follow redirects and report only the status code for the page that is landed upon.
-
#results ⇒ Object
Hash of resulting HashTrees.
-
#roots ⇒ Object
Array of URLs to start spidering.
-
#scheme_squash ⇒ Object
HTTP and HTTPS schemed URLs are treated as equal.
-
#tasks ⇒ Object
Hash of the tests that have been run, keyed by test key.
-
#whitelist ⇒ Object
Array of exceptions to the blacklist.
Class Method Summary collapse
-
.load(profile) ⇒ Object
Loads profile by name.
Instance Method Summary collapse
-
#==(other) ⇒ Object
Test if profiles share the same configuration options.
-
#create_agent ⇒ Object
Create a Mechanize agent.
-
#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile
constructor
Create a profile.
-
#save ⇒ Object
Saves profile to ~/.recluse/NAME.yaml.
-
#test(key, options = {}) ⇒ Object
Runs test.
Constructor Details
#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile
Create a profile.
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/recluse/profile.rb', line 63

##
# Create a profile.
#
# +name+ and +email+ are stored as given. Every entry of +roots+ that is not
# already a Link is wrapped with <tt>Link.new(root, :root)</tt>. The keyword
# settings are stored unchanged, and +tasks+ and +results+ start out as empty
# hashes.
#
# Raises ProfileError when +roots+ is nil or empty.
def initialize(
  name,
  roots,
  email,
  blacklist: [],
  whitelist: [],
  internal_only: false,
  scheme_squash: false,
  redirect: false
)
  # A nil roots list is reported the same way as an empty one rather than
  # surfacing as a NoMethodError.
  raise ProfileError, 'Profile needs roots for starting point' if roots.nil? || roots.empty?

  @name = name
  @email = email
  # is_a? (instead of an exact class match) leaves Link subclasses untouched.
  @roots = roots.map { |root| root.is_a?(Link) ? root : Link.new(root, :root) }
  @blacklist = blacklist
  @whitelist = whitelist
  @internal_only = internal_only
  @scheme_squash = scheme_squash
  @redirect = redirect
  @tasks = {}
  @results = {}
end
Instance Attribute Details
#blacklist ⇒ Object
Array of URL patterns to exclude from checking. Optional. Defaults to empty array.
35 36 37 |
# File 'lib/recluse/profile.rb', line 35

##
# Returns the blacklist stored on this profile (<tt>@blacklist</tt>).
def blacklist
  @blacklist
end
#email ⇒ Object
Used in the user-agent to identify who is running the crawler. This is so that if there’s a problem with your spidering, you will be contacted and not the author of Recluse. Required.
31 32 33 |
# File 'lib/recluse/profile.rb', line 31

##
# Returns the contact email stored on this profile (<tt>@email</tt>).
def email
  @email
end
#internal_only ⇒ Object
Don’t check external URLs. Optional. Defaults to false.
43 44 45 |
# File 'lib/recluse/profile.rb', line 43

##
# Returns the internal-only flag stored on this profile (<tt>@internal_only</tt>).
def internal_only
  @internal_only
end
#name ⇒ Object
Identifier of the profile. Make sure that it is filename friendly. Required.
23 24 25 |
# File 'lib/recluse/profile.rb', line 23

##
# Returns the identifier of this profile (<tt>@name</tt>).
def name
  @name
end
#redirect ⇒ Object
When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.
51 52 53 |
# File 'lib/recluse/profile.rb', line 51

##
# Returns the redirect-following flag stored on this profile (<tt>@redirect</tt>).
def redirect
  @redirect
end
#results ⇒ Object
Hash of resulting HashTrees.
59 60 61 |
# File 'lib/recluse/profile.rb', line 59

##
# Returns the hash of results stored on this profile (<tt>@results</tt>).
def results
  @results
end
#roots ⇒ Object
Array of URLs to start spidering. Required.
27 28 29 |
# File 'lib/recluse/profile.rb', line 27

##
# Returns the starting points stored on this profile (<tt>@roots</tt>).
def roots
  @roots
end
#scheme_squash ⇒ Object
HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.
47 48 49 |
# File 'lib/recluse/profile.rb', line 47

##
# Returns the scheme-squash flag stored on this profile (<tt>@scheme_squash</tt>).
def scheme_squash
  @scheme_squash
end
#tasks ⇒ Object
Hash of the tests that have been run, keyed by test key.
55 56 57 |
# File 'lib/recluse/profile.rb', line 55

##
# Returns the hash of tasks stored on this profile (<tt>@tasks</tt>).
def tasks
  @tasks
end
#whitelist ⇒ Object
Array of exceptions to the blacklist. Optional. Defaults to empty array.
39 40 41 |
# File 'lib/recluse/profile.rb', line 39

##
# Returns the whitelist stored on this profile (<tt>@whitelist</tt>).
def whitelist
  @whitelist
end
Class Method Details
.load(profile) ⇒ Object
Loads profile by name.
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/recluse/profile.rb', line 151

##
# Loads profile by name.
#
# Looks up <tt>NAME.yaml</tt> through <tt>UserConfig.new '.recluse'</tt> and
# builds a Profile from the stored values. A missing or nil +roots+ entry
# becomes an empty array and a missing or nil +email+ becomes an empty string.
# The optional settings (blacklist, whitelist, internal_only, scheme_squash,
# redirect) are only forwarded when they are present and not nil, so the
# constructor defaults apply otherwise.
#
# Raises ProfileError when no file exists for +profile+.
def self.load(profile)
  fname = "#{profile}.yaml"
  uconf = UserConfig.new '.recluse'
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless uconf.exist?(fname)

  options = uconf[fname]
  # Yields the stored value, or nil when the key is absent.
  stored = ->(key) { options[key] if options.key?(key) }

  opts = {}
  %i[blacklist whitelist internal_only scheme_squash redirect].each do |setting|
    value = stored.call(setting.to_s)
    # Only nil is skipped; an explicit false must still be passed on.
    opts[setting] = value unless value.nil?
  end

  roots = stored.call('roots')
  email = stored.call('email')
  Profile.new(profile, roots.nil? ? [] : roots, email.nil? ? '' : email, **opts)
end
Instance Method Details
#==(other) ⇒ Object
Test if profiles share the same configuration options.
140 141 142 143 144 145 146 147 |
# File 'lib/recluse/profile.rb', line 140

##
# Test if profiles share the same configuration options.
#
# Returns false unless +other+ has exactly the same class as the receiver.
# Otherwise every instance variable of the receiver is compared with the one
# of the same name on +other+. <tt>@results</tt> is ignored, and
# <tt>@roots</tt> also counts as equal when the string forms of its entries
# match.
def ==(other)
  return false if other.class != self.class

  instance_variables.all? do |ivar|
    next true if ivar == :@results

    mine = instance_variable_get(ivar)
    theirs = other.instance_variable_get(ivar)
    # instance_variables yields symbols, so the name must be compared as a
    # symbol; comparing with the string '@roots' never matches.
    next true if ivar == :@roots && mine.map(&:to_s) == theirs.map(&:to_s)

    mine == theirs
  end
end
#create_agent ⇒ Object
Create a Mechanize agent.
94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/recluse/profile.rb', line 94

##
# Create a Mechanize agent.
#
# The agent is configured inside the <tt>Mechanize.new</tt> block and the
# result of that call is returned. Redirect handling follows the profile's
# <tt>@redirect</tt> setting and the user agent string ends with
# <tt>@email</tt>.
def create_agent
  Mechanize.new do |a|
    # NOTE(review): a fixed 'TLSv1' and VERIFY_NONE turn off certificate
    # checks and pin an old protocol version - presumably deliberate so broken
    # sites can still be crawled; confirm before changing.
    a.ssl_version = 'TLSv1'
    a.verify_mode = OpenSSL::SSL::VERIFY_NONE
    a.max_history = nil
    # NOTE(review): the attribute name on this line was lost in the extracted
    # listing ("a. = true"); follow_meta_refresh is the presumed original -
    # confirm against lib/recluse/profile.rb.
    a.follow_meta_refresh = true
    a.keep_alive = false
    a.redirect_ok = @redirect
    a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
  end
end
#save ⇒ Object
Saves profile to ~/.recluse/NAME.yaml.
123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/recluse/profile.rb', line 123

##
# Saves profile to ~/.recluse/NAME.yaml.
#
# Opens <tt>NAME.yaml</tt> through <tt>UserConfig.new '.recluse'</tt>, writes
# the profile's name, roots (as strings), email, blacklist, whitelist,
# internal_only, scheme_squash and redirect into it, then calls +save+ on the
# file object and returns that call's result. Tasks and results are not
# written.
def save
  uconf = UserConfig.new '.recluse'
  options = uconf["#{@name}.yaml"]
  {
    'name' => @name,
    # Roots are stored in their string form.
    'roots' => @roots.map(&:to_s),
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |key, value| options[key] = value }
  options.save
end
#test(key, options = {}) ⇒ Object
Runs test.
108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/recluse/profile.rb', line 108

##
# Runs test.
#
# +key+ selects the task class through <tt>Recluse::Tasks.get(key)</tt>.
# +options+ is handed to the task together with a +results+ entry that points
# at this profile's result tree for +key+. The tree is created on first use
# and reused on later calls with the same key.
#
# Returns the result tree for +key+ after the task has run.
def test(key, options = {})
  unless @results[key].is_a?(Recluse::HashTree)
    @results[key] = Recluse::HashTree.new do |url1, url2|
      # Order the pair so the comparison only has to handle "longer vs shorter".
      longer, shorter = url2.length > url1.length ? [url2, url1] : [url1, url2]
      # Detect if URL exists already, but just has a slash at end
      longer == shorter ||
        (longer.length == shorter.length + 1 &&
          longer[-1] == '/' &&
          shorter[-1] != '/' &&
          longer[0...-1] == shorter)
    end
  end

  @tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
  @tasks[key].run
  @results[key]
end