Class: Mortar::Command::Spark

Inherits:
Base
  • Object
show all
Includes:
Git
Defined in:
lib/mortar/command/spark.rb

Overview

run spark jobs using Spark Job Server

Instance Attribute Summary

Attributes inherited from Base

#args, #options

Instance Method Summary collapse

Methods inherited from Base

#api, #ask_public, #config_parameters, #get_error_message_context, #git, #initialize, #initialize_embedded_project, #luigi_parameters, namespace, #pig_parameters, #project, #register_api_call, #register_do, #register_project, #spark_script_arguments, #validate_project_name, #validate_project_structure

Methods included from Helpers

#action, #ask, #confirm, #copy_if_not_present_at_dest, #default_host, #deprecate, #display, #display_header, #display_object, #display_row, #display_table, #display_with_indent, #download_to_file, #ensure_dir_exists, #error, error_with_failure, error_with_failure=, extended, extended_into, #format_bytes, #format_date, #format_with_bang, #full_host, #get_terminal_environment, #home_directory, #host, #hprint, #hputs, included, included_into, #installed_with_omnibus?, #json_decode, #json_encode, #line_formatter, #longest, #output_with_bang, #pending_github_team_state_message, #quantify, #redisplay, #retry_on_exception, #running_on_a_mac?, #running_on_windows?, #set_buffer, #shell, #spinner, #status, #string_distance, #styled_array, #styled_error, #styled_hash, #styled_header, #suggestion, #test_name, #ticking, #time_ago, #truncate, #warning, #with_tty, #write_to_file

Constructor Details

This class inherits a constructor from Mortar::Command::Base

Instance Method Details

#index ⇒ Object

spark CLASS_NAME

Run a spark job on a spark jobserver.

-c, --clusterid CLUSTERID   # Run job on an existing cluster with ID of CLUSTERID (Default: runs on an existing available cluster)
-s, --clustersize NUMNODES  # Run job with NUMNODES nodes (optional; must be >= 2 if provided)
-t, --clustertags A,B,C     # Run job on an existing cluster with specified tags
-3, --spot                  # Use spot instances for this cluster (Default: true)
-B, --branch BRANCHNAME     # Used with --project to specify a non-master branch

Examples:

Run a spark job:
    $ mortar spark com.datadog.some.Job

Run a spark job from master branch:
    $ mortar spark -B master com.datadog.some.Job

Run a spark job with some arguments:
    $ mortar spark com.datadog.some.Job --env prod s3://your-bucket/input s3://your-bucket/output 100


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/mortar/command/spark.rb', line 47

def index
  class_name = shift_argument
  unless class_name
    error("Usage: mortar spark CLASS_NAME\nMust specify CLASS_NAME.")
  end

  project_name = project.name

  script_arguments = spark_script_arguments()

  if options[:branch]
    git_ref = options[:branch]
    gitlab_uri = "commits/#{git_ref}"
  else
    git_ref = sync_code_with_cloud()
    gitlab_uri = "commit/#{git_ref}"
  end

  if options[:clustertags]
    cluster_tags = options[:clustertags].split(',')
  else
    cluster_tags = []
  end

  unless options[:clusterid] || options[:clustersize]
    clusters = api.get_clusters(Mortar::API::Jobs::CLUSTER_BACKEND__EMR_SPARK_JOBSERVER).body['clusters']

    if cluster_tags.length > 0
      tagged_clusters = clusters.select{
        |c| c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING && (cluster_tags - c['tags']).empty?
      }
      if tagged_clusters.length > 1
        display(tagged_clusters)
        error "There're " + tagged_clusters.length + " clusters with tags [" + options[:clustertags] +
          "]. Please, select one cluster."
      elsif tagged_clusters.length == 0
        error "There're no clusters with tags [" + options[:clustertags] + "]"
      end

      largest_cluster = tagged_clusters.max_by{|c| c['size']}

      options[:clusterid] = largest_cluster['cluster_id']
      display("Running job on the cluster with tags [" + options[:clustertags] + "], id = " + largest_cluster['cluster_id'] +
        ", size = " + largest_cluster['size'].to_s)
    else
      largest_free_cluster = clusters.select{ |c| \
        c['running_jobs'].length == 0 && c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING }.
        max_by{|c| c['size']}

      if largest_free_cluster.nil?
        error('No running clusters with Spark Job Server detected, please, launch a SparkJobServer cluster first')
      end

      options[:clusterid] = largest_free_cluster['cluster_id']
      display("Defaulting to running job on largest existing free cluster, id = " +
              largest_free_cluster['cluster_id'] + ", size = " + largest_free_cluster['size'].to_s)
    end
  end

  response = action("Requesting job execution") do
    cluster_id = options[:clusterid]
    args = { script_arguments: script_arguments }
    args[:clustersize] = options[:clustersize] if options[:clustersize]
    api.post_spark_job_on_jobserver(project_name, class_name, git_ref, cluster_id, **args).body
  end
  
  display("job_id: #{response['job_id']}")
  display("git_ref: #{git_ref}")
  display
  display("Gitlab CI pipeline status can be viewed at:\n\n https://gitlab.ddbuild.io/DataDog/dd-analytics/#{gitlab_uri}")
  display
  display("Job status can be viewed on the web at:\n\n #{response['web_job_url']}")
  display

  response['job_id']
end