Class: Tansaku::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/tansaku/crawler.rb

Constant Summary collapse

DEFAULT_USER_AGENT =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(base_uri, additional_list: nil, headers: {}, host: nil, max_concurrent_requests: nil, type: "all", user_agent: DEFAULT_USER_AGENT) ⇒ Crawler

Returns a new instance of Crawler.

Raises:

  • (ArgumentError)


26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/tansaku/crawler.rb', line 26

# Builds a crawler for +base_uri+.
#
# @param base_uri [String] parsed with URI.parse; must satisfy +valid_uri?+
# @param additional_list [Object, nil] optional; when not nil it must satisfy +valid_path?+
# @param headers [Hash] stored as-is
# @param host [Object, nil] stored as-is
# @param max_concurrent_requests [Integer, nil] falls back to Etc.nprocessors * 8 when nil/false
# @param type [String] stored as-is
# @param user_agent [String] stored as-is
# @raise [ArgumentError] "Invalid URI" or "Invalid path" when the respective check fails
def initialize(
  base_uri,
  additional_list: nil,
  headers: {},
  host: nil,
  max_concurrent_requests: nil,
  type: "all",
  user_agent: DEFAULT_USER_AGENT
)
  # The validators are defined elsewhere in the class; each ivar is assigned
  # before its validator runs, exactly as before.
  @base_uri = URI.parse(base_uri)
  raise(ArgumentError, "Invalid URI") unless valid_uri?

  @additional_list = additional_list
  # valid_path? is only consulted when a list was actually supplied.
  raise(ArgumentError, "Invalid path") unless additional_list.nil? || valid_path?

  @headers = headers
  @host = host
  @max_concurrent_requests = max_concurrent_requests || (Etc.nprocessors * 8)
  @type = type
  @user_agent = user_agent
end

Instance Attribute Details

#additional_list ⇒ Object (readonly)

Returns the value of attribute additional_list.



19
20
21
# File 'lib/tansaku/crawler.rb', line 19

# Reader for the @additional_list instance variable.
#
# #initialize stores its +additional_list:+ keyword argument here (nil by
# default) and raises ArgumentError ("Invalid path") when the value is not nil
# and +valid_path?+ rejects it.
# NOTE(review): presumably a path to an extra word list — confirm against valid_path?.
def additional_list
  @additional_list
end

#base_uri ⇒ Object (readonly)

Returns the value of attribute base_uri.



17
18
19
# File 'lib/tansaku/crawler.rb', line 17

# Reader for the @base_uri instance variable.
#
# #initialize sets this to the result of URI.parse on its +base_uri+ argument,
# so it is a parsed URI object rather than the original string.
def base_uri
  @base_uri
end

#headers ⇒ Object (readonly)

Returns the value of attribute headers.



20
21
22
# File 'lib/tansaku/crawler.rb', line 20

# Reader for the @headers instance variable.
#
# #initialize stores its +headers:+ keyword argument here unchanged
# (an empty Hash by default).
def headers
  @headers
end

#host ⇒ Object (readonly)

Returns the value of attribute host.



21
22
23
# File 'lib/tansaku/crawler.rb', line 21

# Reader for the @host instance variable.
#
# #initialize stores its +host:+ keyword argument here unchanged (nil by default).
# NOTE(review): presumably overrides the HTTP Host header — confirm where it is consumed.
def host
  @host
end

#max_concurrent_requests ⇒ Object (readonly)

Returns the value of attribute max_concurrent_requests.



22
23
24
# File 'lib/tansaku/crawler.rb', line 22

# Reader for the @max_concurrent_requests instance variable.
#
# #initialize sets this to its +max_concurrent_requests:+ keyword argument, or
# to Etc.nprocessors * 8 when that argument is nil. #crawl passes it to
# Async::Semaphore.new as the concurrency limit.
def max_concurrent_requests
  @max_concurrent_requests
end

#type ⇒ Object (readonly)

Returns the value of attribute type.



23
24
25
# File 'lib/tansaku/crawler.rb', line 23

# Reader for the @type instance variable.
#
# #initialize stores its +type:+ keyword argument here unchanged ("all" by default).
# NOTE(review): presumably selects which built-in path list is crawled — confirm.
def type
  @type
end

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



24
25
26
# File 'lib/tansaku/crawler.rb', line 24

# Reader for the @user_agent instance variable.
#
# #initialize stores its +user_agent:+ keyword argument here unchanged
# (DEFAULT_USER_AGENT by default).
def user_agent
  @user_agent
end

Instance Method Details

#crawl ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/tansaku/crawler.rb', line 50

# Sends an HTTP HEAD request for every entry of +paths+ and records the ones
# whose status +online?+ accepts.
#
# Requests are issued as Async tasks; an Async::Semaphore created with
# +max_concurrent_requests+ limits how many run at once, and the barrier is
# waited on before the results are returned.
#
# Connection-level failures (reset, refused, unreachable host, EOF, TLS errors,
# Async timeouts) are deliberately ignored for the affected path so one bad
# endpoint does not abort the whole crawl.
#
# @return [Hash] maps each requested URL (as built by +url_for+) to its
#   response status, only for statuses accepted by +online?+
def crawl
  results = {}
  Async do
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
    internet = Async::HTTP::Internet.new

    paths.each do |path|
      semaphore.async do
        url = url_for(path)
        res = internet.head(url, default_request_headers)

        results[url] = res.status if online?(res.status)
      rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
        # Best-effort: an unreachable path is simply left out of the results.
        next
      ensure
        # Release the response so its connection can go back to the pool.
        # +res+ is nil when the request itself failed.
        res&.close
      end
    end
    barrier.wait
  ensure
    # Close every pooled client, even if waiting on the barrier raised.
    # +internet+ is nil if setup failed before it was created.
    internet&.close
  end
  results
end