Class: Wraith::Spider

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/wraith/spider.rb

Constant Summary collapse

# File extensions (without the leading dot) that #crawl joins into its
# skip-link regex. Entries are separated by plain whitespace/newlines:
# inside %w a trailing backslash would escape the newline and glue it onto
# the next word (e.g. "\ngz"), so no line-continuation backslashes are used.
EXT =
%w(flv swf png jpg gif asx zip rar tar 7z
gz jar js css dtd xsd ico raw mp3 mp4 m4a
wav wmv ape aac ac3 wma aiff mpg mpeg
avi mov ogg mkv mka asx asf mp2 m1v
m3u f4v pdf doc xls ppt pps bin exe rss xml).freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Logging

#logger, logger

Constructor Details

#initialize(config) ⇒ Spider

Returns a new instance of Spider.



18
19
20
21
# File 'lib/wraith/spider.rb', line 18

# Builds the spider for the given config.
#
# Creates the Wraith::Wraith instance exposed through #wraith (passing
# imports_must_resolve: false as its options hash) and starts with an
# empty path map that #add_path fills in.
#
# @param config passed straight through to Wraith::Wraith.new
def initialize(config)
  wraith_options = { imports_must_resolve: false }
  @wraith = Wraith::Wraith.new(config, wraith_options)
  @paths = {}
end

Instance Attribute Details

#wraith ⇒ Object (readonly)

Returns the value of attribute wraith.



16
17
18
# File 'lib/wraith/spider.rb', line 16

# Read-only accessor for the @wraith instance variable, which #initialize
# sets to the Wraith::Wraith object built from the supplied config.
def wraith
  @wraith
end

Instance Method Details

#add_path(path) ⇒ Object



39
40
41
# File 'lib/wraith/spider.rb', line 39

# Records a crawled path in @paths.
#
# The key is "home" for the root path "/"; for anything else it is the path
# with every "/" replaced by "__", one trailing "__" removed, and lowercased.
# The stored value is the lowercased path.
#
# @param path [String] URL path such as "/about/team/"
# @return [String] the lowercased path that was stored
def add_path(path)
  label =
    if path == "/"
      "home"
    else
      path.gsub("/", "__").chomp("__").downcase
    end
  @paths[label] = path.downcase
end

#crawl ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/wraith/spider.rb', line 23

# Crawls wraith.base_domain with Anemone, hands the path of every visited
# page to #add_path, and finally calls #write_file.
#
# Links whose URL ends in one of the EXT extensions are skipped, as are
# links matching wraith.spider_skips.
#
# @return whatever #write_file returns
def crawl
  logger.info "Crawling #{wraith.base_domain}"
  Anemone.crawl(wraith.base_domain) do |spider|
    # Built-in skips: static assets and other non-page files.
    spider.skip_links_like(/\.(#{EXT.join('|')})$/)
    # Skips supplied by the user's configuration.
    spider.skip_links_like(wraith.spider_skips)
    spider.on_every_page do |visited|
      visited_path = visited.url.path
      logger.info "    #{visited_path}"
      add_path(visited_path)
    end
  end

  logger.info "Crawl complete."
  write_file
end

#write_file ⇒ Object



43
44
45
46
47
48
49
50
51
# File 'lib/wraith/spider.rb', line 43

# Serialises the collected paths as YAML under a top-level 'paths' key and
# writes them to "<wraith.config_dir>/<wraith.imports>", replacing any
# existing content.
#
# @return the value of the final logger.info call made inside the
#   File.open block
def write_file
  logger.info "Writing to YML file..."
  target = "#{wraith.config_dir}/#{wraith.imports}"
  payload = { 'paths' => @paths }
  File.open(target, "w+") do |out|
    out.write(payload.to_yaml)
    logger.info "Spider paths written to #{wraith.imports}"
  end
end