20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# File 'lib/web2text.rb', line 20
# Parses web2text command-line arguments into an options hash.
#
# The caller's array is never modified: OptionParser#parse works on its own
# copy, so frozen argument arrays are accepted as well.
#
# @param args [Array<String>] raw command-line arguments: any switches plus
#   exactly one positional URL
# @return [Hash] parsed settings with the keys :query, :sleep, :avoid, :focus,
#   :formatter, :ignore_robots_txt, :out and :url. :out is $stdout, a File
#   opened for writing (--lines PATH) or a Pathname (--files DIR).
# @raise [Web2Text::CommandError] if the --files target exists but is not a
#   directory, or if the number of positional arguments is not exactly one
# @note Errors raised by OptionParser itself (for example on unknown switches)
#   propagate unchanged; -h/--help prints the usage text and exits.
def self.parse_cli(args)
  options = {
    query: "body",
    sleep: 0.0,
    avoid: [],
    focus: [],
    formatter: LinePrinter,
    ignore_robots_txt: false,
    out: $stdout,
  }

  # Path handed to --lines. It is opened only after every check has passed, so
  # a failing invocation neither truncates the file nor leaks an open handle.
  lines_path = nil

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: web2text [options] http://example.com/"

    opts.on("-q", "--css", "--query=CSS_QUERY", String) do |css|
      options[:query] = css
    end

    opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float, "Delay between requests. Default 0, -s sets to 1.") do |delay|
      # A bare -s (no value) yields nil here and means a delay of 1.
      options[:sleep] = delay || 1.0
    end

    opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |paths|
      options[:avoid] = paths
    end

    opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed") do |paths|
      options[:focus] = paths
    end

    opts.on("--lines [web2.txt]", String, "One line per page. Can print to std out or a file.") do |path|
      options[:formatter] = LinePrinter
      options[:out] = $stdout
      lines_path = path
    end

    opts.on("--files out/", String, "One file per page. Following website structure, in the specified directory.") do |dir|
      target = Pathname(dir)
      if target.exist? && !target.directory?
        raise Web2Text::CommandError, 'argument to --files must be a directory'
      end
      options[:formatter] = FilePrinter
      options[:out] = target
      # The last output switch wins; forget any earlier --lines path.
      lines_path = nil
    end

    opts.on("--bad-robot", "Ignore robots.txt") do
      options[:ignore_robots_txt] = true
    end

    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
  end

  # Non-destructive parse: returns whatever is left once switches are consumed.
  positional = parser.parse(args)
  unless positional.length == 1
    raise Web2Text::CommandError, 'incorrect number of arguments!'
  end

  options[:url] = positional.first
  options[:out] = File.open(lines_path, 'w') if lines_path
  options
end
|