Class: KleosTest::LinksCollector
- Inherits:
-
Object
- Object
- KleosTest::LinksCollector
- Defined in:
- lib/kleos_test/links_collector.rb
Constant Summary collapse
- BASE_ADDRESS =
'https://www.kleos.ru'
- @@unverified_inside_links =
['/']
- @@unverified_outside_links =
[]
- @@verified_links =
[]
- @@inside_links =
{ valid: [], invalid: [] }
- @@outside_links =
{ valid: [], invalid: [] }
- @@inside_downloads =
0
Class Method Summary collapse
- .inside_links ⇒ Object
- .invalid_inside_links ⇒ Object
- .invalid_outside_links ⇒ Object
- .outside_links ⇒ Object
- .unverified_inside_links ⇒ Object
- .valid_inside_links ⇒ Object
- .valid_outside_links ⇒ Object
Instance Method Summary collapse
- #download_inside_webpage ⇒ Object
- #download_outside_webpage(address, counter) ⇒ Object
- #extract_links(page) ⇒ Object
- #get_new_links ⇒ Object
- #get_target_link ⇒ Object
-
#initialize ⇒ LinksCollector
constructor
A new instance of LinksCollector.
- #refill_unverified_links(links) ⇒ Object
- #verify_outside_links ⇒ Object
Constructor Details
#initialize ⇒ LinksCollector
Returns a new instance of LinksCollector.
11 12 13 |
# File 'lib/kleos_test/links_collector.rb', line 11
# Creates a collector bound to a single link to check.
#
# The link is obtained from #get_target_link and kept in @target for the
# lifetime of the instance.
def initialize
  @target = get_target_link
end
Class Method Details
.inside_links ⇒ Object
106 107 108 |
# File 'lib/kleos_test/links_collector.rb', line 106
# Every inside link that has been sorted into a bucket so far.
#
# @return [Array] union (duplicates removed) of the :valid and :invalid
#   buckets of @@inside_links
def inside_links
  valid, invalid = @@inside_links.values_at(:valid, :invalid)
  valid | invalid
end
.invalid_inside_links ⇒ Object
114 115 116 |
# File 'lib/kleos_test/links_collector.rb', line 114
# Inside links recorded as invalid.
#
# @return [Array] the :invalid bucket of @@inside_links (the stored array
#   itself, not a copy)
def invalid_inside_links
  @@inside_links[:invalid]
end
.invalid_outside_links ⇒ Object
126 127 128 |
# File 'lib/kleos_test/links_collector.rb', line 126
# Outside links recorded as invalid.
#
# @return [Array] the :invalid bucket of @@outside_links (the stored array
#   itself, not a copy)
def invalid_outside_links
  @@outside_links[:invalid]
end
.outside_links ⇒ Object
118 119 120 |
# File 'lib/kleos_test/links_collector.rb', line 118
# Every outside link that has been sorted into a bucket so far.
#
# @return [Array] union (duplicates removed) of the :valid and :invalid
#   buckets of @@outside_links
def outside_links
  valid, invalid = @@outside_links.values_at(:valid, :invalid)
  valid | invalid
end
.unverified_inside_links ⇒ Object
102 103 104 |
# File 'lib/kleos_test/links_collector.rb', line 102
# Inside links that are still waiting to be checked.
#
# @return [Array] the @@unverified_inside_links queue (the stored array
#   itself, not a copy)
def unverified_inside_links
  @@unverified_inside_links
end
.valid_inside_links ⇒ Object
110 111 112 |
# File 'lib/kleos_test/links_collector.rb', line 110
# Inside links recorded as valid.
#
# @return [Array] the :valid bucket of @@inside_links (the stored array
#   itself, not a copy)
def valid_inside_links
  @@inside_links[:valid]
end
.valid_outside_links ⇒ Object
122 123 124 |
# File 'lib/kleos_test/links_collector.rb', line 122
# Outside links recorded as valid.
#
# @return [Array] the :valid bucket of @@outside_links (the stored array
#   itself, not a copy)
def valid_outside_links
  @@outside_links[:valid]
end
Instance Method Details
#download_inside_webpage ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/kleos_test/links_collector.rb', line 43
# Downloads the page behind @target and prints progress to stdout.
#
# A target that does not start with "http" is treated as site-relative and
# is prefixed with BASE_ADDRESS. Each call increments @@inside_downloads.
#
# @return [Array] two elements: the response body and the status code.
#   When the request fails without an HTTP response (bad URI, unknown
#   error, or an exception carrying no response) the pair is
#   ['fake body', 1].
def download_inside_webpage
  # Check the prefix only: a relative link may contain "http" somewhere
  # else (for instance inside a query string) and still needs the base.
  address = @target.start_with?('http') ? @target : BASE_ADDRESS + @target
  @@inside_downloads += 1
  print "Downloading (#{@@inside_downloads}|#{@@unverified_inside_links.size})\ #{address}..."
  response = RestClient.get(address)
  puts "OK"
  [response.body, response.code]
rescue RestClient::ExceptionWithResponse => e
  puts "ERROR"
  # NOTE(review): some rest-client errors (timeouts, presumably) are raised
  # with a nil response; calling #body on it here would raise from inside
  # the rescue clause and escape this method, so fall back to the sentinel.
  e.response ? [e.response.body, e.response.code] : ['fake body', 1]
rescue URI::InvalidURIError
  puts "ERROR"
  puts "BAD URI"
  ['fake body', 1]
rescue
  puts "UNKNOWN ERROR"
  ['fake body', 1]
end
#download_outside_webpage(address, counter) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/kleos_test/links_collector.rb', line 63
# Requests an outside address and prints progress to stdout.
#
# @param address [String] the URL to request
# @param counter [Integer] position of this link in the run; printed next
#   to the number of links remaining in @@unverified_outside_links
# @return [Integer] the HTTP status code, or 1 when the request fails
#   without an HTTP response (bad URI, unknown error, or an exception
#   carrying no response)
def download_outside_webpage(address, counter)
  print "Downloading (#{counter}|#{@@unverified_outside_links.size - counter})\ #{address}..."
  response = RestClient.get(address)
  puts "OK"
  response.code
rescue RestClient::ExceptionWithResponse => e
  puts "ERROR"
  # NOTE(review): some rest-client errors (timeouts, presumably) are raised
  # with a nil response; calling #code on it here would raise from inside
  # the rescue clause and escape this method, so fall back to the sentinel.
  e.response ? e.response.code : 1
rescue URI::InvalidURIError
  puts "ERROR"
  puts "BAD URI"
  1
rescue
  puts "UNKNOWN ERROR"
  1
end
#extract_links(page) ⇒ Object
81 82 83 84 85 86 87 |
# File 'lib/kleos_test/links_collector.rb', line 81
# Collects the href values of the anchors found in an HTML document.
#
# Anchors are skipped when their href is empty or starts with
# "javascript:void(0)", "#" or "mailto".
#
# @param page [String] HTML source handed to Nokogiri::HTML
# @return [Array<String>] href values in document order
def extract_links(page)
  predicates = [
    '[@href!=""]',
    '[not(starts-with(@href, "javascript:void(0)"))]',
    '[not(starts-with(@href, "#"))]',
    '[not(starts-with(@href, "mailto"))]'
  ]
  query = "//a#{predicates.join}"
  anchors = Nokogiri::HTML(page).xpath(query)
  anchors.map { |anchor| anchor.attribute('href').value }
end
#get_new_links ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/kleos_test/links_collector.rb', line 19
# Downloads @target, files it as valid or invalid and, for a 200 response,
# queues the links found on the page.
#
# @target is added to @@verified_links in both cases; this happens before
# the page's links are queued so the target itself is not queued again.
#
# @return [Object] the result of #refill_unverified_links for a 200
#   response, otherwise @@verified_links
def get_new_links
  page, response_code = download_inside_webpage
  succeeded = response_code == 200

  @@inside_links[succeeded ? :valid : :invalid] << @target
  visited = @@verified_links << @target

  succeeded ? refill_unverified_links(extract_links(page)) : visited
end
#get_target_link ⇒ Object
15 16 17 |
# File 'lib/kleos_test/links_collector.rb', line 15
# Picks the next inside link to check.
#
# Removes and returns the first entry of @@unverified_inside_links; when
# that queue is empty the root path is returned instead.
#
# @return [String] the next link, or '/' when nothing is queued
def get_target_link
  return '/' if @@unverified_inside_links.empty?

  @@unverified_inside_links.shift
end
#refill_unverified_links(links) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/kleos_test/links_collector.rb', line 89
# Sorts freshly extracted links into the inside and outside queues.
#
# A link counts as "inside" when it starts with "/", "?" or an http(s) URL
# whose host is kleos.ru (optionally with "www."), and the matched prefix
# is not directly followed by "/forum". Everything else is "outside".
#
# Inside links are queued only if they are neither already queued nor
# already verified; outside links are appended and the queue is then
# de-duplicated in place.
#
# @param links [Array<String>] href values taken from a page
# @return [Array, nil] whatever Array#uniq! returns for the outside queue
#   (nil when it removed nothing)
def refill_unverified_links(links)
  # \A anchors at the start of the string (^ would also match after an
  # embedded newline). The (?![\w.-]) boundary stops hosts that merely begin
  # with "kleos.ru" (e.g. kleos.ru.example.com) from being crawled as inside.
  # NOTE(review): the /forum exclusion only takes effect for absolute URLs;
  # a relative "/forum..." link still matches as inside — confirm intent.
  regexp = %r{\A(/|\?|https?://(www\.)?kleos\.ru(?![\w.-]))(?!/forum)}

  inside, outside = links.partition { |link| link.match(regexp) }

  fresh = inside.reject do |link|
    @@unverified_inside_links.include?(link) || @@verified_links.include?(link)
  end
  @@unverified_inside_links.concat(fresh.uniq)

  @@unverified_outside_links.concat(outside).uniq!
end
#verify_outside_links ⇒ Object
32 33 34 35 36 37 38 39 40 41 |
# File 'lib/kleos_test/links_collector.rb', line 32
# Requests every queued outside link and files it by response code.
#
# A link goes to the :valid bucket of @@outside_links when
# #download_outside_webpage returns 200, and to :invalid otherwise. The
# counter passed along is 1-based.
#
# @return [Array] @@unverified_outside_links
def verify_outside_links
  @@unverified_outside_links.each.with_index(1) do |link, position|
    bucket = download_outside_webpage(link, position) == 200 ? :valid : :invalid
    @@outside_links[bucket] << link
  end
end