Class: Elasticrawl::Crawl

Inherits:
ActiveRecord::Base
  • Object
show all
Defined in:
lib/elasticrawl/crawl.rb

Overview

Represents a web crawl released by the Common Crawl Foundation. Each crawl is split into multiple crawl segments and is stored in the S3 public datasets bucket.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.status(show_all = false) ⇒ Object

Returns the status of all saved crawls and the current job history.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/elasticrawl/crawl.rb', line 9

# Builds a plain-text report listing the status of every saved crawl,
# followed by the job history.
#
# @param show_all [Boolean] pass exactly +true+ to list every job; any other
#   value limits the history to the first 10 jobs in descending id order.
# @return [String] the report lines joined with newlines.
def self.status(show_all = false)
  crawl_lines = Crawl.all.map { |crawl| crawl.status }

  # Only jobs that have a job flow id are listed, highest id first.
  history = Job.where('job_flow_id is not null').order(:id => :desc)

  if show_all == true
    title = 'Job History'
  else
    title = 'Job History (last 10)'
    history = history.limit(10)
  end

  job_lines = history.map { |job| job.history }

  # The empty string produces a blank line between the two sections.
  (['Crawl Status'] + crawl_lines + ['', title] + job_lines).join("\n")
end

Instance Method Details

#create_segments ⇒ Object

Creates crawl segments from the warc.paths file for this crawl.



50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/elasticrawl/crawl.rb', line 50

# Creates a crawl segment for every segment found in this crawl's WARC paths.
#
# The crawl itself is saved first, but only when at least one segment was
# parsed.
#
# NOTE(review): assumes parse_segments returns a Hash mapping segment name
# to file count - confirm against its definition.
#
# @return [Integer] the number of segments parsed.
def create_segments
  paths = warc_paths(self.crawl_name)
  segments = parse_segments(paths)
  segment_total = segments.count

  save unless segment_total.zero?

  segments.each do |segment_name, file_count|
    CrawlSegment.create_segment(self, segment_name, file_count)
  end

  segment_total
end

#has_segments? ⇒ Boolean

Checks for crawl segments in the database. If none are found then checks the S3 API and creates any segments that are found.

Returns:

  • (Boolean)


40
41
42
43
44
45
46
47
# File 'lib/elasticrawl/crawl.rb', line 40

# Reports whether this crawl has any segments.
#
# When no segments are associated with the crawl yet, create_segments is
# called and the answer depends on how many segments it reports.
#
# @return [Boolean] true if segments already exist or were just created.
def has_segments?
  return true unless self.crawl_segments.count == 0

  # Nothing stored yet: try to create the segments now.
  create_segments > 0
end

#next_segments(max_segments = nil) ⇒ Object

Returns the next segments to be parsed, up to max_segments. The maximum is 256, as this is the maximum number of steps for an Elastic MapReduce job flow.



71
72
73
74
75
76
# File 'lib/elasticrawl/crawl.rb', line 71

# Returns the segments of this crawl whose parse_time is still nil.
#
# @param max_segments [Integer, nil] upper bound on the number of segments;
#   nil or any value above Elasticrawl::MAX_SEGMENTS falls back to
#   Elasticrawl::MAX_SEGMENTS.
# @return [Object] the result of limiting the unparsed-segments query.
def next_segments(max_segments = nil)
  ceiling = Elasticrawl::MAX_SEGMENTS
  batch_size = (max_segments.nil? || max_segments > ceiling) ? ceiling : max_segments

  unparsed = self.crawl_segments.where(:parse_time => nil)
  unparsed.limit(batch_size)
end

#reset ⇒ Object

Resets parse time of all parsed segments to null so they will be parsed again. Returns the updated crawl status.



80
81
82
83
84
85
86
# File 'lib/elasticrawl/crawl.rb', line 80

# Clears parse_time on every segment of this crawl that has one set, so
# those segments count as unparsed again.
#
# @return [Object] whatever #status returns after the update.
def reset
  parsed_filter = 'crawl_id = ? and parse_time is not null'

  # Records are updated one at a time through update_attribute.
  CrawlSegment.where(parsed_filter, self.id).each do |segment|
    segment.update_attribute(:parse_time, nil)
  end

  status
end

#select_segments(segments_list) ⇒ Object

Returns the list of segments from the database.



65
66
67
# File 'lib/elasticrawl/crawl.rb', line 65

# Looks up crawl segments by name.
#
# NOTE(review): the query filters on segment_name only; it is not
# restricted to this crawl's id - confirm that is intended.
#
# @param segments_list [Object] value matched against segment_name;
#   presumably an Array of segment names - verify against callers.
# @return [Object] the result of the CrawlSegment query.
def select_segments(segments_list)
  name_filter = { :segment_name => segments_list }
  CrawlSegment.where(name_filter)
end

#status ⇒ Object

Returns the status of the current crawl.



28
29
30
31
32
33
34
35
36
# File 'lib/elasticrawl/crawl.rb', line 28

# Summarises parsing progress for this crawl on one line.
#
# The parsed count is derived as the total number of segments minus the
# number whose parse_time is nil.
#
# @return [String] the crawl name followed by the remaining, parsed and
#   total segment counts.
def status
  segment_total = self.crawl_segments.count

  unparsed_filter = { :crawl_id => self.id, :parse_time => nil }
  unparsed = CrawlSegment.where(unparsed_filter).count
  completed = segment_total - unparsed

  self.crawl_name +
    " Segments: to parse #{unparsed}, " +
    "parsed #{completed}, total #{segment_total}"
end