Class: Tansaku::Crawler
- Inherits: Object
- Object
- Tansaku::Crawler
- Defined in:
- lib/tansaku/crawler.rb
Constant Summary collapse
- DEFAULT_USER_AGENT =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
Instance Attribute Summary collapse
-
#additional_list ⇒ Object
readonly
Returns the value of attribute additional_list.
-
#base_uri ⇒ Object
readonly
Returns the value of attribute base_uri.
-
#host ⇒ Object
readonly
Returns the value of attribute host.
-
#max_concurrent_requests ⇒ Object
readonly
Returns the value of attribute max_concurrent_requests.
-
#type ⇒ Object
readonly
Returns the value of attribute type.
-
#user_agent ⇒ Object
readonly
Returns the value of attribute user_agent.
Instance Method Summary collapse
- #crawl ⇒ Object
-
#initialize(base_uri, additional_list: nil, host: nil, max_concurrent_requests: Etc.nprocessors, type: "all", user_agent: DEFAULT_USER_AGENT) ⇒ Crawler
constructor
A new instance of Crawler.
Constructor Details
#initialize(base_uri, additional_list: nil, host: nil, max_concurrent_requests: Etc.nprocessors, type: "all", user_agent: DEFAULT_USER_AGENT) ⇒ Crawler
Returns a new instance of Crawler.
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/tansaku/crawler.rb', line 25 def initialize( base_uri, additional_list: nil, host: nil, max_concurrent_requests: Etc.nprocessors, type: "all", user_agent: DEFAULT_USER_AGENT ) @base_uri = URI.parse(base_uri) raise ArgumentError, "Invalid URI" unless valid_uri? @additional_list = additional_list unless additional_list.nil? raise ArgumentError, "Invalid path" unless valid_path? end @host = host @max_concurrent_requests = max_concurrent_requests @type = type @user_agent = user_agent end |
Instance Attribute Details
#additional_list ⇒ Object (readonly)
Returns the value of attribute additional_list.
19 20 21 |
# File 'lib/tansaku/crawler.rb', line 19 def additional_list @additional_list end |
#base_uri ⇒ Object (readonly)
Returns the value of attribute base_uri.
17 18 19 |
# File 'lib/tansaku/crawler.rb', line 17 def base_uri @base_uri end |
#host ⇒ Object (readonly)
Returns the value of attribute host.
20 21 22 |
# File 'lib/tansaku/crawler.rb', line 20 def host @host end |
#max_concurrent_requests ⇒ Object (readonly)
Returns the value of attribute max_concurrent_requests.
21 22 23 |
# File 'lib/tansaku/crawler.rb', line 21 def max_concurrent_requests @max_concurrent_requests end |
#type ⇒ Object (readonly)
Returns the value of attribute type.
22 23 24 |
# File 'lib/tansaku/crawler.rb', line 22 def type @type end |
#user_agent ⇒ Object (readonly)
Returns the value of attribute user_agent.
23 24 25 |
# File 'lib/tansaku/crawler.rb', line 23 def user_agent @user_agent end |
Instance Method Details
#crawl ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/tansaku/crawler.rb', line 47 def crawl results = {} Async do barrier = Async::Barrier.new semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier) internet = Async::HTTP::Internet.new paths.each do |path| semaphore.async do url = url_for(path) res = internet.head(url, default_request_headers) results[url] = res.status if online?(res.status) rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError next end end barrier.wait end results end |