Class: SpeedSpider::Cli

Inherits:
Object
  • Object
show all
Defined in:
lib/speed_spider/cli.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Cli

Returns a new instance of Cli.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/speed_spider/cli.rb', line 9

# Seeds @options with the default crawler settings. Every entry can later be
# overridden from the command line by parse!.
def initialize
  @options = {
    # only URLs that start with base_url are saved locally
    base_url: '',
    # directory downloaded files are written to
    dir: 'download',
    # number of threads used to fetch pages
    threads: 4,
    # print verbose output
    verbose: true,
    # keep each page's response body after it has been scanned for links
    discard_page_bodies: false,
    # User-Agent string: SpeedSpider/<VERSION>
    user_agent: "SpeedSpider/#{SpeedSpider::VERSION}",
    # no delay between requests
    delay: 0,
    # do not obey the robots exclusion protocol
    obey_robots_txt: false,
    # crawl depth is unlimited by default
    depth_limit: false,
    # how many HTTP redirects will be followed
    redirect_limit: 5,
    # storage engine (nil = none specified here)
    storage: nil,
    # Hash of cookie name => value sent with HTTP requests
    cookies: nil,
    # whether cookies from the server are accepted and sent back
    accept_cookies: false,
    # whether links carrying a query string (e.g. http://foo.com/?u=user) are skipped
    skip_query_strings: false,
    # proxy server hostname
    proxy_host: nil,
    # proxy server port number
    proxy_port: false,
    # HTTP read timeout in seconds
    read_timeout: nil
  }
end

Instance Attribute Details

#option_parser ⇒ Object (readonly)

Returns the value of attribute option_parser.



7
8
9
# File 'lib/speed_spider/cli.rb', line 7

# Read-only accessor for @option_parser.
# Among the methods visible here only parse! assigns it, so it is nil until
# parse! has been called.
def option_parser
  @option_parser
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



7
8
9
# File 'lib/speed_spider/cli.rb', line 7

# Read-only accessor for the @options Hash: filled with defaults by the
# constructor and updated in place by parse!.
def options
  @options
end

Instance Method Details

#parse! ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/speed_spider/cli.rb', line 48

# Builds the command-line option parser, stores it in @option_parser and runs
# it against ARGV, writing every recognised switch into @options.
#
# OptionParser#parse! is destructive: recognised switches are removed from
# ARGV, so only positional arguments (the banner's start_url) remain afterwards.
# The -V/--version switch prints SpeedSpider::VERSION and exits the process.
#
# @return [self] so the call can be chained
# @raise [OptionParser::ParseError] for unknown switches or invalid argument values
def parse!
  @option_parser = OptionParser.new do |opts|
    opts.banner = "Usage: spider [options] start_url"
    opts.separator ""
    opts.separator "options:"

    # NOTE: the long flag is spelled "--slient"; kept as-is so existing
    # invocations keep working.
    opts.on('-S', '--slient', 'slient output') do
      @options[:verbose] = false
    end

    opts.on('-D', '--dir String', 'directory for download files to save to. "download" by default') do |value|
      @options[:dir] = value
    end

    opts.on('-b', '--base_url String', 'any url not starts with base_url will not be saved') do |value|
      # Normalise to a trailing slash so the stored prefix always ends in "/".
      @options[:base_url] = value.end_with?('/') ? value : "#{value}/"
    end

    opts.on('-t', '--threads Integer', Integer, 'threads to run for fetching pages, 4 by default') do |value|
      @options[:threads] = value
    end

    opts.on('-u', '--user_agent String', 'words for request header USER_AGENT') do |value|
      @options[:user_agent] = value
    end

    opts.on('-d', '--delay Integer', Integer, 'delay between requests in seconds') do |value|
      @options[:delay] = value
    end

    opts.on('-o', '--obey_robots_text', 'obey robots exclustion protocol') do
      @options[:obey_robots_txt] = true
    end

    # Fixed: this handler used to assign :delay, which clobbered the request
    # delay and left :depth_limit untouched.
    # NOTE(review): the switch takes no argument, so it can only store true;
    # confirm the crawler accepts a boolean depth limit.
    opts.on('-l', '--depth_limit', 'limit the depth of the crawl') do
      @options[:depth_limit] = true
    end

    opts.on('-r', '--redirect_limit Integer', Integer, 'number of times HTTP redirects will be followed') do |value|
      @options[:redirect_limit] = value
    end

    opts.on('-a', '--accept_cookies', 'accept cookies from the server and send them back?') do
      @options[:accept_cookies] = true
    end

    opts.on('-s', '--skip_query_strings', 'skip any link with a query string? e.g. http://foo.com/?u=user') do
      @options[:skip_query_strings] = true
    end

    opts.on('-H', '--proxy_host String', 'proxy server hostname') do |value|
      @options[:proxy_host] = value
    end

    opts.on('-P', '--proxy_port Integer', Integer, 'proxy server port number') do |value|
      @options[:proxy_port] = value
    end

    opts.on('-T', '--read_timeout Integer', Integer, 'HTTP read timeout in seconds') do |value|
      @options[:read_timeout] = value
    end

    # print the version.
    opts.on_tail("-V", "--version", "Show version") do
      puts SpeedSpider::VERSION
      exit
    end
  end

  # With no explicit argument list, OptionParser parses (and consumes) ARGV.
  @option_parser.parse!

  self
end