Class: Arachnid2

Inherits:
Object
Defined in:
lib/arachnid2.rb,
lib/arachnid2/version.rb

Constant Summary

MAX_CRAWL_TIME = 600

META: About the origins of this crawling approach

The crawler borrows heavily from Arachnid. Original: github.com/dchuk/Arachnid. Other iterations I've borrowed liberally from:

- https://github.com/matstc/Arachnid
- https://github.com/intrigueio/Arachnid
- https://github.com/jhulme/Arachnid

And this was originally written as part of Tellurion's bot: github.com/samnissen/tellurion_bot
BASE_CRAWL_TIME = 15
MAX_URLS = 10000
BASE_URLS = 50
DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
DEFAULT_NON_HTML_EXTENSIONS = {
  3 => ['.gz'],
  4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
        '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
        '.iso', '.dmg', '.bin', '.ics', '.exe', '.wav', '.mid'],
  5 => ['.xlsx', '.docx', '.pptx', '.tiff', '.zipx'],
  8 => ['.torrent']
}
MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
DEFAULT_MAXIMUM_LOAD_RATE = 79.9
DEFAULT_TIMEOUT = 10_000
MINIMUM_TIMEOUT = 1
MAXIMUM_TIMEOUT = 999_999
VERSION = "0.1.4"

Instance Method Summary

Constructor Details

#initialize(url) ⇒ Arachnid2

Creates the object to execute the crawl

Examples:

url = "https://daringfireball.net"
spider = Arachnid2.new(url)

Parameters:

  • url (String)


# File 'lib/arachnid2.rb', line 57

def initialize(url)
  @url = url
  @domain = Adomain[@url]
end
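
The constructor stores the raw URL alongside the domain extracted by the Adomain gem, which is presumably what scopes the crawl to a single site. A hedged illustration of that extraction (the exact return values are an assumption based on Adomain's documentation):

require 'adomain'

Adomain["https://daringfireball.net"]   # => "daringfireball.net" (assumed)
Adomain["https://www.github.com/dchuk"] # => "www.github.com" (assumed)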

Instance Method Details

#crawl(opts = {}) ⇒ Object

Visits a URL, gathering links and visiting them in turn, until it runs out of time, memory, or attempts.

Examples:

url = "https://daringfireball.net"
spider = Arachnid2.new(url)

opts = {
  :followlocation => true,
  :timeout => 25000,
  :time_box => 30,
  :headers => {
    'Accept-Language' => "en-UK",
    'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
  },
  :memory_limit => 89.99,
  :proxy => {
    :ip => "1.2.3.4",
    :port => "1234",
    :username => "sam",
    :password => "coolcoolcool",
  },
  :non_html_extensions => {
    3 => [".abc", ".xyz"],
    4 => [".abcd"],
    6 => [".abcdef"],
    11 => [".abcdefghijk"]
  }
}
responses = []
spider.crawl(opts) { |response|
  responses << response
}
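
The block receives each Typhoeus::Response as the crawl proceeds, so pages can be processed inline instead of accumulated. A minimal sketch, assuming Nokogiri is available for parsing; the title-collection logic is illustrative and not part of the gem:

require 'arachnid2'
require 'nokogiri'

spider = Arachnid2.new("https://daringfireball.net")

titles = {}
spider.crawl(:time_box => 30) do |response|
  # response is a Typhoeus::Response; #effective_url and #body are
  # standard Typhoeus accessors.
  doc = Nokogiri::HTML(response.body)
  titles[response.effective_url] = doc.at_css("title")&.text
end

titles.each { |url, title| puts "#{title} -- #{url}" }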

Parameters:

  • opts (Hash) (defaults to: {})

Returns:

  • nil



# File 'lib/arachnid2.rb', line 101

def crawl(opts = {})
  preflight(opts)

  until @global_queue.empty?
    @max_concurrency.times do
      q = @global_queue.shift

      break if @global_visited.size >= @crawl_options[:max_urls]
      break if Time.now > @crawl_options[:time_limit]
      break if memory_danger?

      @global_visited.insert(q)

      request = Typhoeus::Request.new(q, request_options)

      request.on_complete do |response|
        links = process(response)
        next unless links

        yield response

        vacuum(links, response)
      end

      @hydra.queue(request)
    end # @max_concurrency.times do

    @hydra.run
  end # until @global_queue.empty?

ensure
  @cookie_file.close! if @cookie_file
end
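
Each pass of the outer loop shifts up to @max_concurrency URLs off the queue, registers a Typhoeus::Request for each, and then runs the whole batch concurrently with @hydra.run. The crawl stops early once the visited count reaches max_urls, the time limit passes, or memory_danger? reports the process is near its memory ceiling; otherwise vacuum feeds newly discovered links back into @global_queue (presumably after filtering them against the visited set and the non-HTML extension lists), which is what keeps the loop running until the queue empties.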