Class: Tansaku::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/tansaku/crawler.rb

Constant Summary collapse

DEFAULT_USER_AGENT =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(base_uri, additional_list: nil, host: nil, max_concurrent_requests: Etc.nprocessors, type: "all", user_agent: DEFAULT_USER_AGENT) ⇒ Crawler

Returns a new instance of Crawler.

Raises:

  • (ArgumentError)


25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/tansaku/crawler.rb', line 25

# Builds a crawler for +base_uri+.
#
# @param base_uri [String] URI to crawl; parsed with URI.parse and then checked by valid_uri?
# @param additional_list [Object, nil] optional extra list, checked by valid_path? when given
#   (presumably a file path, judging by the "Invalid path" message -- confirm against valid_path?)
# @param host [Object, nil] stored as-is
# @param max_concurrent_requests [Integer] defaults to Etc.nprocessors
# @param type [String] defaults to "all"
# @param user_agent [String] defaults to DEFAULT_USER_AGENT
# @raise [ArgumentError] "Invalid URI" when valid_uri? is falsy,
#   "Invalid path" when additional_list is given and valid_path? is falsy
def initialize(
  base_uri,
  additional_list: nil,
  host: nil,
  max_concurrent_requests: Etc.nprocessors,
  type: "all",
  user_agent: DEFAULT_USER_AGENT
)
  # The instance variable is assigned before validating so valid_uri? can see it.
  @base_uri = URI.parse(base_uri)
  raise ArgumentError, "Invalid URI" unless valid_uri?

  # Same ordering for the list: assign first, then validate only when one was supplied.
  @additional_list = additional_list
  raise ArgumentError, "Invalid path" unless additional_list.nil? || valid_path?

  # Remaining options are stored without validation.
  @user_agent = user_agent
  @type = type
  @max_concurrent_requests = max_concurrent_requests
  @host = host
end

Instance Attribute Details

#additional_list ⇒ Object (readonly)

Returns the value of attribute additional_list.



19
20
21
# File 'lib/tansaku/crawler.rb', line 19

# Reader for the additional list stored by the constructor.
#
# @return [Object, nil] the +additional_list:+ keyword argument given to #initialize
#   (nil when none was supplied)
def additional_list
  @additional_list
end

#base_uri ⇒ Object (readonly)

Returns the value of attribute base_uri.



17
18
19
# File 'lib/tansaku/crawler.rb', line 17

# Reader for the base URI of this crawler.
#
# @return [URI::Generic] the result of URI.parse on the +base_uri+ argument given to #initialize
def base_uri
  @base_uri
end

#host ⇒ Object (readonly)

Returns the value of attribute host.



20
21
22
# File 'lib/tansaku/crawler.rb', line 20

# Reader for the host option.
#
# @return [Object, nil] the +host:+ keyword argument given to #initialize (nil by default)
def host
  @host
end

#max_concurrent_requests ⇒ Object (readonly)

Returns the value of attribute max_concurrent_requests.



21
22
23
# File 'lib/tansaku/crawler.rb', line 21

# Reader for the concurrency limit; #crawl passes it to Async::Semaphore.new.
#
# @return [Integer] the +max_concurrent_requests:+ keyword argument given to #initialize
#   (Etc.nprocessors by default)
def max_concurrent_requests
  @max_concurrent_requests
end

#type ⇒ Object (readonly)

Returns the value of attribute type.



22
23
24
# File 'lib/tansaku/crawler.rb', line 22

# Reader for the type option.
#
# @return [String] the +type:+ keyword argument given to #initialize ("all" by default)
def type
  @type
end

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



23
24
25
# File 'lib/tansaku/crawler.rb', line 23

# Reader for the user agent option.
#
# @return [String] the +user_agent:+ keyword argument given to #initialize
#   (DEFAULT_USER_AGENT by default)
def user_agent
  @user_agent
end

Instance Method Details

#crawl ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/tansaku/crawler.rb', line 47

# Sends a HEAD request for every entry of +paths+ and records the status of
# each URL whose status +online?+ accepts.
#
# Requests run as async tasks gated by a semaphore sized with
# +max_concurrent_requests+; the barrier is used to wait for all of them.
#
# @return [Hash] maps each requested URL (as built by +url_for+) to its response status
def crawl
  results = {}
  Async do
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
    internet = Async::HTTP::Internet.new

    paths.each do |path|
      semaphore.async do
        url = url_for(path)
        res = internet.head(url, default_request_headers)

        results[url] = res.status if online?(res.status)
      rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
        # Best effort: a URL that cannot be reached is simply left out of the results.
        next
      end
    end
    barrier.wait
  ensure
    # Close the HTTP clients held by the Internet instance so their connections
    # are not left open after the crawl, even when a task failed.
    # `&.` covers the case where an error occurred before it was created.
    internet&.close
  end
  results
end