Class: Wraith::Spider
Constant Summary
# File extensions that #crawl joins into the pattern it hands to Anemone's
# +skip_links_like+ (media, archives, scripts, stylesheets, documents, feeds).
#
# Written as a bracketed multi-line %w list: a trailing backslash inside %w
# escapes the following whitespace and glues it onto the next word, which
# would yield entries such as " gz" that can never match a URL.
EXT = %w[
  flv swf png jpg gif asx zip rar tar 7z
  gz jar js css dtd xsd ico raw mp3 mp4 m4a
  wav wmv ape aac ac3 wma aiff mpg mpeg
  avi mov ogg mkv mka asf mp2 m1v
  m3u f4v pdf doc xls ppt pps bin exe rss xml
].freeze
Instance Attribute Summary
-
#wraith ⇒ Object
readonly
Returns the value of attribute wraith.
Instance Method Summary
- #add_path(path) ⇒ Object
- #crawl ⇒ Object
-
#initialize(config) ⇒ Spider
constructor
Returns a new instance of Spider.
- #write_file ⇒ Object
Methods included from Logging
Constructor Details
Instance Attribute Details
#wraith ⇒ Object (readonly)
Returns the value of attribute wraith.
16 17 18 |
# File 'lib/wraith/spider.rb', line 16
#
# Reader for the +@wraith+ instance variable.
#
# @return [Object] whatever is currently stored in +@wraith+
def wraith
  @wraith
end
Instance Method Details
#add_path(path) ⇒ Object
39 40 41 |
# File 'lib/wraith/spider.rb', line 39
#
# Records a crawled URL path in +@paths+ under a label derived from it.
#
# The label is the path with every "/" replaced by "__", one trailing "__"
# removed, and the result lowercased; the stored value is the lowercased
# path. The root path "/" is labelled "home". An empty path is treated as
# the root so it does not create an entry with an empty label.
#
# @param path [String] URL path, e.g. "/About/Team/"
# @return [String] the lowercased path that was stored
def add_path(path)
  path = "/" if path.empty?
  label = path == "/" ? "home" : path.gsub("/", "__").chomp("__").downcase
  @paths[label] = path.downcase
end
#crawl ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/wraith/spider.rb', line 23
#
# Crawls +wraith.base_domain+ with Anemone, passes the path of every page
# it yields to #add_path, then calls #write_file.
#
# Two skip patterns are handed to Anemone's +skip_links_like+: one built
# from the EXT extension list and the user-supplied +wraith.spider_skips+.
#
# @return [Object] the return value of #write_file
def crawl
  logger.info "Crawling #{wraith.base_domain}"

  # \z anchors at the very end of the string; `$` only anchors at end of line.
  skipped_extensions = /\.(?:#{EXT.join('|')})\z/

  Anemone.crawl(wraith.base_domain) do |anemone|
    anemone.skip_links_like(skipped_extensions)
    # Add user specified skips
    anemone.skip_links_like(wraith.spider_skips)
    anemone.on_every_page do |page|
      logger.info " #{page.url.path}"
      add_path(page.url.path)
    end
  end

  logger.info "Crawl complete."
  write_file
end
#write_file ⇒ Object
43 44 45 46 47 48 49 50 51 |
# File 'lib/wraith/spider.rb', line 43
#
# Serialises +@paths+ as YAML under a top-level "paths" key and writes it
# to the file named +wraith.imports+ inside +wraith.config_dir+, replacing
# any existing content.
#
# @return [Object] the return value of the final +logger.info+ call
def write_file
  logger.info "Writing to YML file..."
  config = { 'paths' => @paths }

  # Write-only mode: nothing is read back from the file here.
  File.open("#{wraith.config_dir}/#{wraith.imports}", "w") do |file|
    file.write(config.to_yaml)
  end

  # Logged after the block so the file is closed before success is reported.
  logger.info "Spider paths written to #{wraith.imports}"
end