Class: Kudzu::Agent

Inherits:
Object
  • Object
show all
Defined in:
lib/kudzu/agent.rb,
lib/kudzu/agent/robots.rb,
lib/kudzu/agent/fetcher.rb,
lib/kudzu/agent/sleeper.rb,
lib/kudzu/agent/response.rb,
lib/kudzu/agent/reference.rb,
lib/kudzu/agent/robots/txt.rb,
lib/kudzu/agent/url_filterer.rb,
lib/kudzu/agent/util/matcher.rb,
lib/kudzu/agent/page_filterer.rb,
lib/kudzu/agent/robots/parser.rb,
lib/kudzu/agent/url_extractor.rb,
lib/kudzu/agent/http/connection.rb,
lib/kudzu/agent/util/title_parser.rb,
lib/kudzu/agent/http/connection_pool.rb,
lib/kudzu/agent/util/charset_detector.rb,
lib/kudzu/agent/util/mime_type_detector.rb,
lib/kudzu/agent/util/content_type_parser.rb

Defined Under Namespace

Classes: Fetcher, Http, PageFilterer, Reference, Response, Robots, Sleeper, UrlExtractor, UrlFilterer, Util

Instance Method Summary

Constructor Details

#initialize(config, &block) ⇒ Agent

Returns a new instance of Agent.


5
6
7
8
9
10
11
12
13
# File 'lib/kudzu/agent.rb', line 5

# Builds an agent and the collaborators it delegates to.
#
# A single Robots instance is created and handed to both the fetcher and
# the URL filterer. The block parameter is accepted but not used here.
#
# @param config [Object] configuration passed to every collaborator
def initialize(config, &block)
  robots = Robots.new(config)

  @config        = config
  @robots        = robots
  @fetcher       = Fetcher.new(config, robots)
  @url_extractor = UrlExtractor.new(config)
  @url_filterer  = UrlFilterer.new(config, robots)
  @page_filterer = PageFilterer.new(config)
end

Instance Method Details

#extract_refs(response) ⇒ Object


32
33
34
35
# File 'lib/kudzu/agent.rb', line 32

# Extracts references from a response and filters them against its URL.
#
# @param response [Object] response handed to the URL extractor; its +url+
#   is passed to the URL filterer as the second argument
# @return [Object] whatever the URL filterer's +filter+ returns
def extract_refs(response)
  @url_filterer.filter(@url_extractor.extract(response), response.url)
end

#fetch(url, request_header = {}) ⇒ Object


20
21
22
23
24
25
26
27
28
29
30
# File 'lib/kudzu/agent.rb', line 20

# Fetches a URL and, when the fetch succeeded, fills in derived metadata.
#
# A response whose +fetched?+ is false is returned untouched. Otherwise
# size, MD5 digest, MIME type, charset (text responses only) and title are
# assigned, in that order, before the response is returned.
#
# @param url [Object] URL handed to the fetcher
# @param request_header [Hash] forwarded as the +request_header:+ keyword
# @return [Object] the response produced by the fetcher
def fetch(url, request_header = {})
  @fetcher.fetch(url, request_header: request_header).tap do |res|
    if res.fetched?
      res.size = res.body.size
      res.digest = Digest::MD5.hexdigest(res.body)
      res.mime_type = Util::MimeTypeDetector.detect(res)
      # Charset detection is only attempted for text responses.
      if res.text?
        res.charset = Util::CharsetDetector.detect(res)
      end
      res.title = Util::TitleParser.parse(res)
    end
  end
end

#filter_response?(response) ⇒ Boolean

Returns:

  • (Boolean)

37
38
39
40
# File 'lib/kudzu/agent.rb', line 37

# Decides whether a response should be filtered out.
#
# Returns false outright when the response was redirected and the URL
# filterer rejects the final URL relative to the redirect origin; otherwise
# returns the negation of the page filterer's verdict.
#
# NOTE(review): returning false (i.e. "do not filter") for a redirect to a
# disallowed URL looks like it may be inverted - confirm against callers.
#
# @param response [Object] responds to +url+ and +redirect_from+
# @return [Boolean]
def filter_response?(response)
  if response.redirect_from
    redirect_ok = @url_filterer.allowed?(response.url, response.redirect_from)
    return false unless redirect_ok
  end

  !@page_filterer.allowed?(response)
end

#start ⇒ Object


15
16
17
18
# File 'lib/kudzu/agent.rb', line 15

# Yields to the caller's block, then closes the fetcher's pool.
#
# The close happens in an +ensure+ clause so the pool is released even when
# the block raises or exits early (e.g. via +break+); any exception still
# propagates to the caller after the pool has been closed.
#
# @yield the work to perform while the fetcher's pool is available
# @return [Object] the value of the block
def start
  yield
ensure
  @fetcher.pool.close
end