Class: Tansaku::Crawler
- Inherits:
-
Object
- Object
- Tansaku::Crawler
- Defined in:
- lib/tansaku/crawler.rb
Constant Summary collapse
- DEFAULT_USER_AGENT =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
Instance Attribute Summary collapse
-
#additional_list ⇒ Object
readonly
Returns the value of attribute additional_list.
- #base_uri ⇒ String readonly
- #body ⇒ String? readonly
- #ignore_certificate_errors ⇒ Boolean readonly
- #max_concurrent_requests ⇒ Integer readonly
- #method ⇒ String readonly
- #timeout ⇒ Float? readonly
- #type ⇒ String readonly
Instance Method Summary collapse
- #crawl ⇒ Object
-
#initialize(base_uri, additional_list: nil, headers: {}, method: "HEAD", body: nil, timeout: nil, max_concurrent_requests: nil, ignore_certificate_errors: false, type: "all") ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(base_uri, additional_list: nil, headers: {}, method: "HEAD", body: nil, timeout: nil, max_concurrent_requests: nil, ignore_certificate_errors: false, type: "all") ⇒ Crawler
Returns a new instance of Crawler.
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/tansaku/crawler.rb', line 39

# Builds a crawler for +base_uri+, validating the URI, the additional list
# and the HTTP method before storing the remaining options.
#
# @param base_uri [String] base URI to crawl
# @param additional_list [Object, nil] extra list checked by +valid_additional_path?+
# @param headers [Hash] request headers (stored as given)
# @param method [String] HTTP method name; stored upper-cased
# @param body [String, nil] request body
# @param timeout [#to_f, nil] timeout; converted with +to_f+ when given
# @param max_concurrent_requests [Integer, nil] concurrency limit; defaults to
#   +Etc.nprocessors * 8+ when nil
# @param ignore_certificate_errors [Boolean] stored as given
# @param type [String] stored as given
#
# @raise [ArgumentError] "Invalid URI", "Invalid path" or "Invalid HTTP method"
#   when the corresponding validation predicate fails
def initialize(
  base_uri,
  additional_list: nil,
  headers: {},
  method: "HEAD",
  body: nil,
  timeout: nil,
  max_concurrent_requests: nil,
  ignore_certificate_errors: false,
  type: "all"
)
  # NOTE(review): downcase is applied to the whole URI string, so any
  # upper-case characters in the path/query are lowered too — confirm intended.
  @base_uri = URI.parse(base_uri.downcase)
  raise ArgumentError, "Invalid URI" unless valid_uri?

  @additional_list = additional_list
  raise ArgumentError, "Invalid path" unless valid_additional_path?

  @method = method.upcase
  raise ArgumentError, "Invalid HTTP method" unless valid_method?

  @headers = headers
  @body = body

  # nil stays nil; anything else is coerced to Float.
  @timeout = timeout&.to_f

  @max_concurrent_requests = max_concurrent_requests || (Etc.nprocessors * 8)
  @ignore_certificate_errors = ignore_certificate_errors

  @type = type
end
Instance Attribute Details
#additional_list ⇒ Object (readonly)
Returns the value of attribute additional_list.
19 20 21 |
# File 'lib/tansaku/crawler.rb', line 19

# Returns the value of attribute additional_list.
#
# @return [Object] the value stored in +@additional_list+
def additional_list
  @additional_list
end
#base_uri ⇒ String (readonly)
17 18 19 |
# File 'lib/tansaku/crawler.rb', line 17

# Returns the base URI the crawler was built with.
#
# NOTE(review): documented as String, but the constructor assigns the result
# of URI.parse(...) to @base_uri — confirm the intended return type.
#
# @return [String]
def base_uri
  @base_uri
end
#body ⇒ String? (readonly)
31 32 33 |
# File 'lib/tansaku/crawler.rb', line 31

# Returns the request body given to the constructor.
#
# @return [String, nil]
def body
  @body
end
#ignore_certificate_errors ⇒ Boolean (readonly)
37 38 39 |
# File 'lib/tansaku/crawler.rb', line 37

# Returns the ignore_certificate_errors flag given to the constructor.
#
# @return [Boolean]
def ignore_certificate_errors
  @ignore_certificate_errors
end
#max_concurrent_requests ⇒ Integer (readonly)
22 23 24 |
# File 'lib/tansaku/crawler.rb', line 22

# Returns the concurrency limit stored by the constructor.
#
# @return [Integer]
def max_concurrent_requests
  @max_concurrent_requests
end
#method ⇒ String (readonly)
28 29 30 |
# File 'lib/tansaku/crawler.rb', line 28

# Returns the HTTP method name stored by the constructor.
#
# @return [String]
def method
  @method
end
#timeout ⇒ Float? (readonly)
34 35 36 |
# File 'lib/tansaku/crawler.rb', line 34

# Returns the timeout stored by the constructor.
#
# @return [Float, nil]
def timeout
  @timeout
end
#type ⇒ String (readonly)
25 26 27 |
# File 'lib/tansaku/crawler.rb', line 25

# Returns the type given to the constructor.
#
# @return [String]
def type
  @type
end
Instance Method Details
#crawl ⇒ Object
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/tansaku/crawler.rb', line 71

# Requests every entry of +paths+ and collects the URLs whose response
# status satisfies +online?+.
#
# Each matching request is also logged as "METHOD,URL,STATUS" via
# +Tansaku.logger+. Connection-level failures and timeouts for a single
# path are swallowed so the remaining paths are still tried.
#
# @return [Hash] URL => response status for every URL considered online
def crawl
  results = {}

  log_conditions

  Async do |task|
    # The barrier is the semaphore's parent so that a single +barrier.wait+
    # below covers every task started through the semaphore.
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
    internet = Internet.new

    paths.each do |path|
      semaphore.async do
        url = url_for(path)

        res = dispatch_http_request(task, internet, url)
        next unless online?(res.status)

        log = [method, url, res.status].join(",")
        Tansaku.logger.info(log)

        results[url] = res.status
      rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
        # Best-effort: an unreachable or failing path is simply skipped.
        next
      end
    end

    barrier.wait
  end

  results
end