Class: Crawl::Register

Inherits:
Object
  • Object
show all
Defined in:
lib/crawl/register.rb

Defined Under Namespace

Classes: Result

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Register

Returns a new instance of Register.



5
6
7
8
9
# File 'lib/crawl/register.rb', line 5

# Sets up the three bookkeeping sets used to track pages through the
# crawl: pages waiting to be crawled, pages currently being crawled, and
# pages that have been completed. Each starts out as its own empty Set.
def initialize
  @unprocessed, @processing, @processed = Array.new(3) { Set.new }
end

Instance Method Details

#add(pages) ⇒ Object



11
12
13
14
15
16
17
# File 'lib/crawl/register.rb', line 11

# Queues every page that the register is not already tracking.
#
# A page counts as tracked when it is present in any of the unprocessed,
# processing or processed sets. When the global $verbose flag is truthy,
# each newly queued page's url is printed.
#
# @param pages [#to_set] candidate pages; not modified by this method
# @return [Set] the unprocessed set after the new pages were merged in
def add(pages)
  already_known = @processed | @processing | @unprocessed
  fresh = pages.to_set - already_known
  fresh.each { |page| puts "  Adding #{page.url}" } if $verbose
  @unprocessed.merge(fresh)
end

#completed(page) ⇒ Object



34
35
36
37
# File 'lib/crawl/register.rb', line 34

# Records +page+ as processed and drops it from the in-flight set.
#
# @param page [Object] the page whose crawl has finished
# @return [Set] the processing set after removal
def completed(page)
  @processed.add(page)
  @processing.delete(page)
end

#error_pages ⇒ Object



47
48
49
# File 'lib/crawl/register.rb', line 47

# Lists the processed pages that carry an error.
#
# @return [Enumerable] processed pages whose +error+ is truthy
def error_pages
  @processed.select(&:error)
end

#errors? ⇒ Boolean

Returns:

  • (Boolean)


51
52
53
# File 'lib/crawl/register.rb', line 51

# Tells whether error_pages returned at least one page.
#
# @return [Boolean] true when the error page list is non-empty
def errors?
  error_pages.any?
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


39
40
41
# File 'lib/crawl/register.rb', line 39

# The crawl is finished once nothing is waiting and nothing is in flight.
#
# @return [Boolean] true when both the unprocessed and processing sets
#   are empty
def finished?
  @unprocessed.empty? && @processing.empty?
end

#next_page ⇒ Object



19
20
21
22
23
24
25
26
27
# File 'lib/crawl/register.rb', line 19

# Hands out the next page to crawl, moving it from the unprocessed set to
# the processing set.
#
# A warning is printed to standard output whenever the number of in-flight
# pages exceeds EM.threadpool_size.
#
# @return [Object, nil] the page to crawl, or nil when nothing is waiting
def next_page
  page = @unprocessed.first
  # Deleting unconditionally is deliberate: it is a no-op on an empty set
  # and also discards a nil/false entry rather than leaving it stuck at
  # the head of the queue.
  @unprocessed.delete(page)
  @processing << page if page

  # Read each value once so the comparison and the message always agree.
  in_flight = @processing.size
  pool_size = EM.threadpool_size
  if in_flight > pool_size
    puts "WARNING: #{in_flight} pages are being processed when EM threadpool only has #{pool_size} threads."
  end
  page
end

#no_links_found? ⇒ Boolean

Returns:

  • (Boolean)


66
67
68
# File 'lib/crawl/register.rb', line 66

# Reports whether at most one page has been processed.
# NOTE(review): presumably that single page is the crawl's starting page,
# meaning it yielded no further links — confirm against the caller.
#
# @return [Boolean] true when the processed set holds fewer than two pages
def no_links_found?
  @processed.size < 2
end

#processing_size ⇒ Object



43
44
45
# File 'lib/crawl/register.rb', line 43

# Number of pages currently held in the processing set.
#
# @return [Integer] count of in-flight pages
def processing_size
  @processing.length
end

#retry(page) ⇒ Object



29
30
31
32
# File 'lib/crawl/register.rb', line 29

# Puts +page+ back on the unprocessed set and removes it from the
# in-flight set, so that a later next_page call can hand it out again.
#
# @param page [Object] the page to re-queue
# @return [Set] the processing set after removal
def retry(page)
  @unprocessed.add(page)
  @processing.delete(page)
end

#summarize ⇒ Object



55
56
57
58
59
60
61
62
63
64
# File 'lib/crawl/register.rb', line 55

# Prints a crawl summary to standard output.
#
# When errors? is false, a single line with the number of processed pages
# is printed. Otherwise a header is printed followed by the string form of
# each page returned by error_pages, one per line.
#
# @return [Object, nil] nil on the error-free path; otherwise whatever
#   iterating error_pages returns
def summarize
  unless errors?
    puts "\n#{@processed.size} pages crawled without errors."
    return
  end

  puts "\nPages with errors:"
  error_pages.each { |page| puts page.to_s }
end