Class: Mortar::Command::Spark

Inherits:

Base

Object
Base
Mortar::Command::Spark

show all

Includes:: Git

Defined in:: lib/mortar/command/spark.rb

Overview

Run Spark jobs using Spark Job Server.

Instance Attribute Summary

Attributes inherited from Base

#args, #options

Instance Method Summary collapse

#index ⇒ Object

spark CLASS_NAME.

Methods inherited from Base

#api, #ask_public, #config_parameters, #get_error_message_context, #git, #initialize, #initialize_embedded_project, #luigi_parameters, namespace, #pig_parameters, #project, #register_api_call, #register_do, #register_project, #spark_script_arguments, #validate_project_name, #validate_project_structure

Methods included from Helpers

#action, #ask, #confirm, #copy_if_not_present_at_dest, #default_host, #deprecate, #display, #display_header, #display_object, #display_row, #display_table, #display_with_indent, #download_to_file, #ensure_dir_exists, #error, error_with_failure, error_with_failure=, extended, extended_into, #format_bytes, #format_date, #format_with_bang, #full_host, #get_terminal_environment, #home_directory, #host, #hprint, #hputs, included, included_into, #installed_with_omnibus?, #json_decode, #json_encode, #line_formatter, #longest, #output_with_bang, #pending_github_team_state_message, #quantify, #redisplay, #retry_on_exception, #running_on_a_mac?, #running_on_windows?, #set_buffer, #shell, #spinner, #status, #string_distance, #styled_array, #styled_error, #styled_hash, #styled_header, #suggestion, #test_name, #ticking, #time_ago, #truncate, #warning, #with_tty, #write_to_file

Constructor Details

This class inherits a constructor from Mortar::Command::Base

Instance Method Details

#index ⇒ `Object`

spark CLASS_NAME

Run a spark job on a spark jobserver.

    -c, --clusterid CLUSTERID   # Run job on an existing cluster with ID of CLUSTERID (Default: runs on an existing available cluster)
    -s, --clustersize NUMNODES  # Run job with NUMNODES nodes (optional; must be >= 2 if provided)
    -t, --clustertags A,B,C     # Run job on an existing cluster with specified tags
    -3, --spot                  # Use spot instances for this cluster (Default: true)
    -B, --branch BRANCHNAME     # Used with --project to specify a non-master branch

Examples:

Run a spark job:
    $ mortar spark com.datadog.some.Job

Run a spark job from master branch:
    $ mortar spark -B master com.datadog.some.Job

Run a spark job with some arguments:
    $ mortar spark com.datadog.some.Job --env prod s3://your-bucket/input s3://your-bucket/output 100

# File 'lib/mortar/command/spark.rb', line 47

def index
  # Submits the Spark job class named by the first CLI argument to a Spark
  # Job Server cluster and prints where to follow its progress.
  #
  # Cluster choice, in order:
  #   * options[:clusterid] / options[:clustersize], when given, are passed
  #     through untouched;
  #   * otherwise, with options[:clustertags], the single running cluster
  #     carrying all requested tags is used (zero or several matches are
  #     reported through `error`);
  #   * otherwise the largest running cluster with no running jobs is used.
  #
  # Returns the 'job_id' field of the job-submission response body.
  class_name = shift_argument
  unless class_name
    error("Usage: mortar spark CLASS_NAME\nMust specify CLASS_NAME.")
  end

  project_name = project.name

  script_arguments = spark_script_arguments()

  # With an explicit branch the branch name itself is the ref; otherwise the
  # ref comes from sync_code_with_cloud. The Gitlab URL path differs between
  # the two cases ("commits/<branch>" vs "commit/<ref>").
  if options[:branch]
    git_ref = options[:branch]
    gitlab_uri = "commits/#{git_ref}"
  else
    git_ref = sync_code_with_cloud()
    gitlab_uri = "commit/#{git_ref}"
  end

  cluster_tags = options[:clustertags] ? options[:clustertags].split(',') : []

  unless options[:clusterid] || options[:clustersize]
    clusters = api.get_clusters(Mortar::API::Jobs::CLUSTER_BACKEND__EMR_SPARK_JOBSERVER).body['clusters']

    if cluster_tags.empty?
      largest_free_cluster = clusters
        .select { |c| c['running_jobs'].empty? && c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING }
        .max_by { |c| c['size'] }

      if largest_free_cluster.nil?
        error('No running clusters with Spark Job Server detected, please, launch a SparkJobServer cluster first')
      end

      options[:clusterid] = largest_free_cluster['cluster_id']
      display("Defaulting to running job on largest existing free cluster, id = " \
              "#{largest_free_cluster['cluster_id']}, size = #{largest_free_cluster['size']}")
    else
      # A cluster matches when it is running and carries every requested tag.
      # Array() lets a cluster whose 'tags' field is nil count as untagged
      # instead of raising a TypeError from Array#-.
      tagged_clusters = clusters.select do |c|
        c['status_code'] == Mortar::API::Clusters::STATUS_RUNNING &&
          (cluster_tags - Array(c['tags'])).empty?
      end

      if tagged_clusters.length > 1
        display(tagged_clusters)
        # Interpolation is required here: String#+ with the Integer length
        # raises TypeError and would hide the real message from the user.
        error "There're #{tagged_clusters.length} clusters with tags [#{options[:clustertags]}]. " \
              "Please, select one cluster."
      elsif tagged_clusters.empty?
        error "There're no clusters with tags [#{options[:clustertags]}]"
      end

      # NOTE(review): presumably `error` terminates the command, leaving
      # exactly one candidate at this point - confirm against Helpers#error.
      largest_cluster = tagged_clusters.max_by { |c| c['size'] }

      options[:clusterid] = largest_cluster['cluster_id']
      display("Running job on the cluster with tags [#{options[:clustertags]}], " \
              "id = #{largest_cluster['cluster_id']}, size = #{largest_cluster['size']}")
    end
  end

  response = action("Requesting job execution") do
    cluster_id = options[:clusterid]
    args = { script_arguments: script_arguments }
    # Only forward a cluster size when the user asked for one.
    args[:clustersize] = options[:clustersize] if options[:clustersize]
    api.post_spark_job_on_jobserver(project_name, class_name, git_ref, cluster_id, **args).body
  end

  display("job_id: #{response['job_id']}")
  display("git_ref: #{git_ref}")
  display
  display("Gitlab CI pipeline status can be viewed at:\n\n https://gitlab.ddbuild.io/DataDog/dd-analytics/#{gitlab_uri}")
  display
  display("Job status can be viewed on the web at:\n\n #{response['web_job_url']}")
  display

  response['job_id']
end