Class: Elasticrawl::ParseJob

Inherits:
Job
  • Object
show all
Defined in:
lib/elasticrawl/parse_job.rb

Overview

Represents an Elastic MapReduce job flow that parses segments of Common Crawl data. A job step is created per segment.

Inherits from Job which is the ActiveRecord model class.

Instance Method Summary collapse

Methods inherited from Job

#confirm_message, #history, #result_message

Instance Method Details

#log_uri ⇒ Object

Returns the S3 location for storing Elastic MapReduce job logs.



39
40
41
42
# File 'lib/elasticrawl/parse_job.rb', line 39

# Builds the S3 location under which Elastic MapReduce writes this job's logs.
#
# The path is namespaced by the job name under the "1-parse" log folder and
# is handed to build_s3_uri to be turned into the final URI.
#
# @return [Object] whatever build_s3_uri returns for the log path
def log_uri
  build_s3_uri("/logs/1-parse/#{job_name}/")
end

#run ⇒ Object

Runs the job by calling the Elastic MapReduce API. If successful, the parse time is set for each segment.



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/elasticrawl/parse_job.rb', line 20

# Launches the job flow using the 'emr_config' entry of the job configuration.
#
# When a job flow id comes back, it is stored on the job, every job step's
# crawl segment is stamped with the current time as its parse time and saved,
# and finally the job itself is saved.
#
# @return [Object, nil] the value of result_message when a job flow id was
#   returned, otherwise nil
def run
  flow_id = run_job_flow(job_config['emr_config'])
  # No job flow id means nothing was started, so leave the records untouched.
  return unless flow_id.present?

  self.job_flow_id = flow_id

  job_steps.each do |job_step|
    crawl_segment = job_step.crawl_segment
    crawl_segment.parse_time = DateTime.now
    crawl_segment.save
  end

  save
  result_message
end

#segment_list ⇒ Object

Returns the list of segment descriptions.



45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/elasticrawl/parse_job.rb', line 45

# Lists the descriptions of the crawl segments attached to this job's steps.
#
# The list starts with the heading 'Segments' and ends with an empty string;
# job steps without a crawl segment are left out.
#
# @return [Array] heading, one segment_desc per step with a segment, then ''
def segment_list
  descriptions = job_steps.map(&:crawl_segment)
                          .select(&:present?)
                          .map(&:segment_desc)

  ['Segments'] + descriptions + ['']
end

#set_segments(crawl_segments, max_files = nil) ⇒ Object

Populates the job from the list of segments to be parsed.



8
9
10
11
12
13
14
15
16
# File 'lib/elasticrawl/parse_job.rb', line 8

# Fills in the job from the crawl segments that are to be parsed.
#
# Assigns the job name and description, records the file limit and appends
# one job step per segment (built by create_job_step) to job_steps.
#
# @param crawl_segments [Enumerable] segments to create job steps for
# @param max_files [Object, nil] file limit stored on the job and passed to
#   set_job_desc; defaults to nil
# @return [Object] the value of crawl_segments.each
def set_segments(crawl_segments, max_files = nil)
  self.job_name = set_job_name
  self.job_desc = set_job_desc(crawl_segments, max_files)
  self.max_files = max_files

  crawl_segments.each do |crawl_segment|
    new_step = create_job_step(crawl_segment)
    job_steps.push(new_step)
  end
end