Class: SimpleCrawler::Crawler

Inherits:
Object
Defined in:
lib/simplecrawler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Crawler



37
38
39
40
41
42
43
44
45
# File 'lib/simplecrawler.rb', line 37

# Create a crawler rooted at +url+.
#
# Parses the URL into @site_uri (normalising an empty path to "/"),
# sets up the visited/queue bookkeeping structures, and queues the
# root URI as the first document to fetch.
def initialize(url)
  # Default: skip loading page data for binary files into Document#data.
  @load_binary_data = false
  @site_uri = URI.parse(url)
  @site_uri.path = "/" if @site_uri.path == ""
  @visited = {}  # request_uri => false when queued, true once fetched
  @queue = []    # request_uris waiting to be fetched
  @current_count = 0
  add_uri(@site_uri)
end

Instance Attribute Details

#current_countObject

Returns the value of attribute current_count.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: number of URIs queued so far during this crawl.
def current_count; @current_count; end

#include_patternsObject

Returns the value of attribute include_patterns.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional regexp source strings a URI must match to be crawled.
def include_patterns; @include_patterns; end

#load_binary_dataObject

Returns the value of attribute load_binary_data.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: whether page data for binary files is loaded into Document#data.
def load_binary_data; @load_binary_data; end

#maxcountObject

Returns the value of attribute maxcount.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional cap on the number of URIs to queue (nil = unlimited).
def maxcount; @maxcount; end

#queueObject

Returns the value of attribute queue.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: request URIs waiting to be fetched.
def queue; @queue; end

#site_uriObject

Returns the value of attribute site_uri.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: root URI of the site being crawled.
def site_uri; @site_uri; end

#skip_patternsObject

Returns the value of attribute skip_patterns.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional regexp source strings; matching URIs are skipped.
def skip_patterns; @skip_patterns; end

#user_agentObject

Returns the value of attribute user_agent.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: user agent string for the crawler.
def user_agent; @user_agent; end

#visitedObject

Returns the value of attribute visited.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: hash of request URIs seen this crawl (value true once fetched).
def visited; @visited; end

Instance Method Details

#add_uri(uri) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/simplecrawler.rb', line 105

# Queue +uri+ for crawling unless skip_uri? rejects it.
#
# Accepts either a URI object or a String (stripped and parsed). Only
# the request URI (path + query) is stored, since every queued URI
# shares @site_uri's scheme and host.
#
# NOTE(review): #request_uri exists only on absolute HTTP(S) URIs, so a
# relative URI raises NoMethodError here; callers (queue_local_links)
# rescue and skip such links — confirm this is the intended behaviour.
def add_uri(uri)
  uri = URI.parse(uri.strip) if uri.is_a?(String)

  return if skip_uri?(uri)

  @queue.push uri.request_uri
  @current_count += 1
  @visited[uri.request_uri] = false
  log("   Added #{uri}")
end

#crawlObject

Initiate crawling.



184
185
186
187
188
189
190
191
192
# File 'lib/simplecrawler.rb', line 184

# Initiate crawling. Fetches each queued URI in turn, yields the
# resulting document to the caller's block, queues any same-site links
# it contains, then marks the URI as visited.
def crawl()
  until @queue.empty?
    request_uri = @queue.shift
    doc = get_doc(request_uri)
    yield doc
    queue_local_links(doc)
    @visited[request_uri] = true
  end
end

#get_doc(request_uri) ⇒ Object



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/simplecrawler.rb', line 121

# Fetch the document addressed by +request_uri+ (a path + query String
# relative to @site_uri) and return it as a Document.
#
# Page data is read only for markup mime types unless @load_binary_data
# is set. HTTP error statuses raised by open-uri (message like
# "404 Not Found") are captured into doc.http_status instead of
# raising; any other error is re-raised.
def get_doc(request_uri)
  doc = Document.new
  begin
    log("   Getting #{request_uri}")
    request_uri = URI.parse(request_uri)

    # Resolve against the crawl root so scheme and host are present.
    uri = @site_uri.clone
    uri.path = request_uri.path
    uri.query = request_uri.query
    doc.uri = uri
    doc.fetched_at = Time.now

    log("Opening #{uri}")

    # Block form guarantees the underlying IO is closed even on error
    # (the original leaked the handle).
    open(uri) do |file|
      mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]

      # to_s guards against a missing Content-Type header: previously a
      # nil mime_type crashed on #downcase and the error was re-raised.
      if MARKUP_MIME_TYPES.include?(mime_type.to_s.downcase) || @load_binary_data
        log("Loading data from #{uri}")
        doc.data = file.read
      else
        log("Skipping data for #{uri}")
        doc.data = nil
      end

      doc.headers = file.meta
      doc.http_status = file.status
    end

  rescue => error
    log("Error fetching #{uri}: #{error.message}")
    # open-uri's OpenURI::HTTPError message begins with the three-digit
    # status code, e.g. "404 Not Found".
    if error.message[0..2] =~ /\d\d\d/ then
      doc.http_status = [error.message[0..2], error.message[3..-1]]
      return doc
    else
      raise error
    end
  end
  return doc
end

#log(message) ⇒ Object

Override this method for your own logging needs.



49
50
51
# File 'lib/simplecrawler.rb', line 49

# Write +message+ to standard output, one line per call.
# Override this method for your own logging needs.
def log(message)
  $stdout.puts message
end


#queue_local_links(doc) ⇒ Object

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/simplecrawler.rb', line 163

# Extract anchor hrefs from doc.data and queue each as a candidate URI.
# Documents with no data (binary/skipped fetches) are ignored;
# unparseable hrefs are silently skipped.
def queue_local_links(doc)
  return if doc.data.nil?

  log("Queuing links for #{doc.uri}")
  # Allow for asp.net bastard-sized viewstate attributes...
  Hpricot.buffer_size = 524288

  # Parse into a separate local rather than reassigning the +doc+
  # parameter, which the original shadowed.
  parsed = Hpricot(doc.data)
  parsed.search("a[@href]").each do |link|
    href = link.attributes["href"]
    next unless href.length > 0

    begin
      add_uri(URI.parse(href))
    rescue
      # Unparseable or unsupported href — skip this link.
    end
  end
  nil
end

#skip_uri?(uri) ⇒ Boolean

Check if a path should be ignored because it matches a skip pattern or is already visited.



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/simplecrawler.rb', line 54

# Check if a URI should be ignored: maxcount reached, foreign host,
# bare fragment identifier, already visited/queued, matching a skip
# pattern, or (when include patterns are set) matching none of them.
def skip_uri?(uri)
  # Stop queuing once the optional document limit is reached.
  return true if @maxcount && @current_count >= @maxcount

  # Only crawl URIs belonging to this site.
  return true unless uri.relative? || uri.host == @site_uri.host

  # Skip pure fragment identifiers (e.g. "#content"). The .to_s guard
  # fixes a NoMethodError when the fragment is nil.
  return true if uri.request_uri.length == 0 && uri.fragment.to_s.length > 0

  # Skip anything already fetched in this crawl or queued for crawling.
  return true if @visited.has_key?(uri.request_uri) || @queue.include?(uri.request_uri)

  # Skip URIs matching any skip pattern.
  if @skip_patterns
    return true if @skip_patterns.any? { |pat| Regexp.new(pat).match(uri.request_uri) }
  end

  # When include patterns are given, skip URIs matching none of them.
  if @include_patterns
    return true if @include_patterns.none? { |pat| Regexp.new(pat).match(uri.request_uri) }
  end

  false
end