Class: Tansaku::Crawler

Inherits:
  Object

Defined in:
  lib/tansaku/crawler.rb

Constant Summary

DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(base_uri, additional_list: nil, headers: {}, method: "HEAD", body: nil, timeout: nil, max_concurrent_requests: nil, ignore_certificate_errors: false, type: "all") ⇒ Crawler

Returns a new instance of Crawler.

Raises:

  • (ArgumentError)


# File 'lib/tansaku/crawler.rb', line 39

def initialize(
  base_uri,
  additional_list: nil,
  headers: {},
  method: "HEAD",
  body: nil,
  timeout: nil,
  max_concurrent_requests: nil,
  ignore_certificate_errors: false,
  type: "all"
)
  @base_uri = URI.parse(base_uri.downcase)
  raise ArgumentError, "Invalid URI" unless valid_uri?

  @additional_list = additional_list
  raise ArgumentError, "Invalid path" unless valid_additional_path?

  @method = method.upcase
  raise ArgumentError, "Invalid HTTP method" unless valid_method?

  @headers = headers
  @body = body

  @timeout = timeout.nil? ? nil : timeout.to_f

  @max_concurrent_requests = max_concurrent_requests || (Etc.nprocessors * 8)

  @ignore_certificate_errors = ignore_certificate_errors

  @type = type
end
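
A minimal construction sketch, assuming the gem is required as "tansaku"; the target URL and option values below are illustrative, and an invalid URI, additional-list path, or HTTP method raises ArgumentError as shown above.

require "tansaku"

crawler = Tansaku::Crawler.new(
  "https://example.com",
  method: "GET",               # any HTTP method name; defaults to "HEAD"
  headers: { "User-Agent" => Tansaku::Crawler::DEFAULT_USER_AGENT },
  timeout: 5,                  # coerced to Float by the constructor
  max_concurrent_requests: 10  # defaults to Etc.nprocessors * 8
)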

Instance Attribute Details

#additional_list ⇒ Object (readonly)

Returns the value of attribute additional_list.



# File 'lib/tansaku/crawler.rb', line 19

def additional_list
  @additional_list
end

#base_uri ⇒ String (readonly)

Returns:

  • (String)


# File 'lib/tansaku/crawler.rb', line 17

def base_uri
  @base_uri
end

#body ⇒ String? (readonly)

Returns:

  • (String, nil)


# File 'lib/tansaku/crawler.rb', line 31

def body
  @body
end

#ignore_certificate_errors ⇒ Boolean (readonly)

Returns:

  • (Boolean)


# File 'lib/tansaku/crawler.rb', line 37

def ignore_certificate_errors
  @ignore_certificate_errors
end

#max_concurrent_requests ⇒ Integer (readonly)

Returns:

  • (Integer)


# File 'lib/tansaku/crawler.rb', line 22

def max_concurrent_requests
  @max_concurrent_requests
end

#method ⇒ String (readonly)

Returns:

  • (String)


# File 'lib/tansaku/crawler.rb', line 28

def method
  @method
end

#timeout ⇒ Float? (readonly)

Returns:

  • (Float, nil)


# File 'lib/tansaku/crawler.rb', line 34

def timeout
  @timeout
end

#type ⇒ String (readonly)

Returns:

  • (String)


# File 'lib/tansaku/crawler.rb', line 25

def type
  @type
end

Instance Method Details

#crawl ⇒ Object



# File 'lib/tansaku/crawler.rb', line 71

def crawl
  results = {}

  log_conditions

  Async do |task|
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(max_concurrent_requests, parent: barrier)
    internet = Internet.new

    paths.each do |path|
      semaphore.async do
        url = url_for(path)

        res = dispatch_http_request(task, internet, url)
        next unless online?(res.status)

        log = [method, url, res.status].join(",")
        Tansaku.logger.info(log)

        results[url] = res.status
      rescue Errno::ECONNRESET, Errno::ECONNREFUSED, Errno::EHOSTUNREACH, EOFError, OpenSSL::SSL::SSLError, Async::TimeoutError
        next
      end
    end
    barrier.wait
  end

  results
end
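
Continuing the construction sketch above: #crawl returns a Hash mapping each responsive URL to its HTTP status code, so the results can be iterated directly.

results = crawler.crawl
results.each do |url, status|
  puts "#{status} #{url}"
end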