Class: Tansaku::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/tansaku/crawler.rb

Constant Summary collapse

# User-Agent string used when the constructor is not given +user_agent:+.
# Frozen because the very same String object is handed to every crawler that
# relies on the default; an in-place mutation would otherwise leak into all
# later instances.
DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393".freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(base_uri, additional_list: nil, threads: 10, user_agent: DEFAULT_USER_AGENT) ⇒ Crawler

Returns a new instance of Crawler.

Raises:

  • (ArgumentError)


15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/tansaku/crawler.rb', line 15

# Sets up a crawler rooted at +base_uri+.
#
# @param base_uri [String] address handed to URI.parse
# @param additional_list [Object, nil] optional extra list; it is validated
#   only when non-nil (presumably a file path, judging by the "Invalid path"
#   error -- confirm against valid_path?)
# @param threads [Integer] worker count later passed to Parallel.map by #crawl
# @param user_agent [String] stored as-is; defaults to DEFAULT_USER_AGENT
# @raise [ArgumentError] when valid_uri? or valid_path? rejects the input
# @note URI.parse itself may raise URI::InvalidURIError for malformed strings.
def initialize(base_uri, additional_list: nil, threads: 10, user_agent: DEFAULT_USER_AGENT)
  # Assign before validating: the predicate takes no arguments, so it
  # presumably inspects the stored value.
  @base_uri = URI.parse(base_uri)
  raise ArgumentError, "Invalid URI" unless valid_uri?

  @additional_list = additional_list
  # A nil list means "none supplied" and is never checked.
  raise ArgumentError, "Invalid path" if !additional_list.nil? && !valid_path?

  @threads = threads
  @user_agent = user_agent
end

Instance Attribute Details

#additional_list ⇒ Object (readonly)

Returns the value of attribute additional_list.



13
14
15
# File 'lib/tansaku/crawler.rb', line 13

# Reader for the value passed to the constructor as +additional_list:+.
#
# @return [Object, nil] the stored value; nil when the constructor default was used
#   (presumably a path to an extra list file -- confirm against valid_path?)
def additional_list
  @additional_list
end

#base_uri ⇒ Object (readonly)

Returns the value of attribute base_uri.



12
13
14
# File 'lib/tansaku/crawler.rb', line 12

# Reader for the base address of the crawl.
#
# @return [URI::Generic] the result of URI.parse on the constructor's +base_uri+ argument
def base_uri
  @base_uri
end

#threads ⇒ Object (readonly)

Returns the value of attribute threads.



13
14
15
# File 'lib/tansaku/crawler.rb', line 13

# Reader for the worker count; #crawl passes it to Parallel.map as +in_threads+.
#
# @return [Integer] the +threads:+ value given to the constructor (10 by default)
def threads
  @threads
end

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



13
14
15
# File 'lib/tansaku/crawler.rb', line 13

# Reader for the configured User-Agent string.
#
# @return [String] the +user_agent:+ value given to the constructor
#   (DEFAULT_USER_AGENT when omitted); presumably sent as the User-Agent
#   header of each request -- confirm against the request helper
def user_agent
  @user_agent
end

Instance Method Details

#crawl ⇒ Object



33
34
35
36
37
38
# File 'lib/tansaku/crawler.rb', line 33

# Probes every candidate from +urls+ and keeps the ones reported as online.
#
# The checks are dispatched through Parallel.map with +threads+ worker threads.
# Candidates that fail the check map to nil and are stripped by +compact+.
#
# @return [Array] the entries of +urls+ for which online? was truthy
def crawl
  Parallel.map(urls, in_threads: threads) { |candidate| candidate if online?(candidate) }.compact
end

#online?(url) ⇒ Boolean

Returns:

  • (Boolean)


28
29
30
31
# File 'lib/tansaku/crawler.rb', line 28

# Decides whether +url+ counts as reachable, based on the status of a HEAD request.
#
# @param url [Object] target passed straight through to +head+
# @return [Boolean] true when the response code, converted with +to_i+,
#   is 200, 401 or 302; false for any other code
def online?(url)
  case head(url).code.to_i
  when 200, 401, 302 then true
  else false
  end
end