Module: DatabricksHelper

Defined in:
lib/db/databricks.rb

Constant Summary

DATABRICKS_DATATYPE_NAME_MAP =
{
  # Scalar Types
  'BIGINT' => 20,      # int8
  'BINARY' => 17,      # bytea
  'BOOLEAN' => 16,     # bool
  'DATE' => 1082,      # date
  'DECIMAL' => 1700,   # numeric
  'DOUBLE' => 701,     # float8
  'FLOAT' => 701,      # float8
  'INT' => 23,         # int4
  'INTERVAL' => 1186,  # interval
  'VOID' => 2278,      # void
  'SMALLINT' => 21,    # int2
  'STRING' => 1043,    # varchar
  'TIMESTAMP' => 1114, # timestamp
  'TIMESTAMP_NTZ' => 1114, # timestamp without timezone
  'TINYINT' => 21,     # mapped to smallint (int2) as postgres has no tinyint

  # Complex Types - mapping to closest PostgreSQL equivalents
  'ARRAY' => 2277,     # anyarray
  'MAP' => 3802,       # jsonb
  'STRUCT' => 3802,    # jsonb
  'VARIANT' => 3802,   # jsonb
  'OBJECT' => 3802     # jsonb
}.freeze
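
For illustration, a few lookups against this map from inside the module; GEOMETRY stands in for any type absent from the map, and the nil fallback is handled by callers below:

DATABRICKS_DATATYPE_NAME_MAP['DECIMAL']   # => 1700 (numeric)
DATABRICKS_DATATYPE_NAME_MAP['STRUCT']    # => 3802 (jsonb)
DATABRICKS_DATATYPE_NAME_MAP['GEOMETRY']  # => nil; callers fall back to 1043 (varchar)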

Class Method Summary

Class Method Details

.connect_to_databricks(config) ⇒ Object

# File 'lib/db/databricks.rb', line 34

def connect_to_databricks(config)
  warehouse_id = config[:path].split('/').last

  {
    host: config[:host],
    warehouse_id: warehouse_id,
    token: config[:token]
  }
end
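
A minimal usage sketch; the host, warehouse path, and token are hypothetical placeholders:

config = {
  host: 'adb-1234567890123456.7.azuredatabricks.net', # hypothetical workspace host
  path: 'sql/1.0/warehouses/abc123def456',            # hypothetical HTTP path
  token: 'dapi-example-token'                         # hypothetical access token
}
client = connect_to_databricks(config)
client[:warehouse_id] # => 'abc123def456'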

.disconnect_from_databricks(client) ⇒ Object

# File 'lib/db/databricks.rb', line 44

def disconnect_from_databricks(client)
  # No-op: the stateless REST API holds no persistent connection to close
end

.format_databricks_config(connection_string) ⇒ Object

# File 'lib/db/databricks.rb', line 198

def format_databricks_config(connection_string)
  parsed = URI(connection_string)
  {
    host: parsed.host,
    path: parsed.path[1..],  # Remove leading slash
    token: parsed.password
  }
end
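
A sketch of the expected round trip, assuming the token travels in the password slot of a databricks:// URI (all values hypothetical):

conn = 'databricks://token:dapi-example-token@adb-123.azuredatabricks.net/sql/1.0/warehouses/abc123'
format_databricks_config(conn)
# => { host: 'adb-123.azuredatabricks.net',
#      path: 'sql/1.0/warehouses/abc123',
#      token: 'dapi-example-token' }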

.get_columns_by_table_databricks(client, schema_name, table_name) ⇒ Object

# File 'lib/db/databricks.rb', line 151

def get_columns_by_table_databricks(client, schema_name, table_name)
  sql = <<~SQL
    SELECT
      column_name as columnName
    FROM system.information_schema.columns
    WHERE table_schema = '#{schema_name}'
      AND table_name = '#{table_name}'
    ORDER BY ordinal_position
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['columnName'] }
end

.get_foreign_keys_databricks(client, schema_name, table_name, primary_key) ⇒ Object

# File 'lib/db/databricks.rb', line 165

def get_foreign_keys_databricks(client, schema_name, table_name, primary_key)
  # Foreign key constraints in Databricks are informational only, so none are surfaced here
  []
end

.get_schema_column_info_databricks(client, schema_name, table_names) ⇒ Object

# File 'lib/db/databricks.rb', line 170

def get_schema_column_info_databricks(client, schema_name, table_names)
  table_names.map do |table|
    sql = <<~SQL
      SELECT
        column_name as columnName,
        data_type as dataType
      FROM system.information_schema.columns
      WHERE table_schema = '#{table[:schemaName]}'
        AND table_name = '#{table[:tableName]}'
      ORDER BY ordinal_position
    SQL

    results = run_query_databricks(sql, client)
    {
      tableName: "#{table[:schemaName]}.#{table[:tableName]}",
      columns: results[:rows].map do |row|
        # Strip type parameters (e.g. ARRAY<STRING> -> ARRAY) before the lookup
        data_type = row['dataType'].split('<')[0].upcase
        {
          columnName: row['columnName'],
          displayName: row['columnName'],
          fieldType: row['dataType'],
          dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[data_type] || 1043 # default to varchar
        }
      end
    }
  end
end

.get_schemas_databricks(client) ⇒ Object

# File 'lib/db/databricks.rb', line 116

def get_schemas_databricks(client)
  sql = <<~SQL
    SELECT schema_name
    FROM system.information_schema.schemata
    WHERE schema_name != "information_schema"
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['schema_name'] }
end

.get_tables_by_schema_databricks(client, schema_names) ⇒ Object

# File 'lib/db/databricks.rb', line 127

def get_tables_by_schema_databricks(client, schema_names)
  schema_names.flat_map do |schema|
    sql = <<~SQL
      SELECT
        table_name as tableName,
        table_schema as schemaName,
        table_catalog as catalogName
      FROM system.information_schema.tables
      WHERE table_schema = '#{schema}'
    SQL

    results = run_query_databricks(sql, client)
    results[:rows].map do |row|
      {
        tableName: row['tableName'],
        schemaName: row['schemaName'],
        catalogName: row['catalogName']
      }
    end
  end
end

.run_query_databricks(sql, client) ⇒ Object

# File 'lib/db/databricks.rb', line 48

def run_query_databricks(sql, client)
  uri = URI("https://#{client[:host]}/api/2.0/sql/statements")
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(uri)
  request['Authorization'] = "Bearer #{client[:token]}"
  request['Content-Type'] = 'application/json'
  request.body = JSON.generate({
    statement: sql,
    warehouse_id: client[:warehouse_id],
    wait_timeout: '50s'
  })

  response = http.request(request)

  raise "Query failed: #{response.body}" unless response.is_a?(Net::HTTPSuccess)

  result = JSON.parse(response.body)
  
  raise "Query state: #{result['status']['state']}" unless result['status']['state'] == 'SUCCEEDED'

  columns = result['manifest']['schema']['columns']
  column_names = columns.map { |col| col['name'] }
  column_types = columns.map { |col| col['type_name'] }

  json_like_types = ['ARRAY', 'MAP', 'STRUCT', 'JSON']

  fields = columns.map do |column|
    {
      name: column['name'],
      dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[column['type_name']] || 1043 # default to varchar if type not found
    }
  end

  # Transform the data array into rows of hashes; data_array may be absent
  # when the statement returns no rows, so guard with an empty default
  rows = (result.dig('result', 'data_array') || []).map do |row_array|
    row_hash = column_names.zip(row_array).to_h
    transformed_row = {}

    column_names.each_with_index do |key, idx|
      value = row_hash[key]
      type = column_types[idx]
      # Parse JSON strings for JSON-like column types
      transformed_value =
        if value.is_a?(String) && json_like_types.include?(type.upcase)
          begin
            parsed = JSON.parse(value)
            parsed.is_a?(Array) ? parsed.map(&:to_s) : parsed
          rescue JSON::ParserError
            value
          end
        else
          value
        end

      transformed_row[key] = transformed_value
    end

    transformed_row
  end

  {
    fields: fields,
    rows: rows
  }
end
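
A minimal end-to-end sketch against a hypothetical warehouse. The method depends on net/http, uri, and json from the standard library, which lib/db/databricks.rb is assumed to require:

require 'net/http'
require 'uri'
require 'json'

client = { host: 'adb-123.azuredatabricks.net', warehouse_id: 'abc123', token: 'dapi-example' }
result = run_query_databricks('SELECT 1 AS one', client)
result[:fields] # => [{ name: 'one', dataTypeID: 23 }]
result[:rows]   # => [{ 'one' => '1' }] (the Statement Execution API serializes values as strings)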