Module: DatabricksHelper

Defined in:
lib/db/databricks.rb

Constant Summary

DATABRICKS_DATATYPE_NAME_MAP =
{
  # Scalar Types
  'BIGINT' => 20,      # int8
  'BINARY' => 17,      # bytea
  'BOOLEAN' => 16,     # bool
  'DATE' => 1082,      # date
  'DECIMAL' => 1700,   # numeric
  'DOUBLE' => 701,     # float8
  'FLOAT' => 701,      # float8
  'INT' => 23,         # int4
  'INTERVAL' => 1186,  # interval
  'VOID' => 2278,      # void
  'SMALLINT' => 21,    # int2
  'STRING' => 1043,    # varchar
  'TIMESTAMP' => 1114, # timestamp
  'TIMESTAMP_NTZ' => 1114, # timestamp without timezone
  'TINYINT' => 21,     # mapped to smallint (int2) as postgres has no tinyint

  # Complex Types - mapping to closest PostgreSQL equivalents
  'ARRAY' => 2277,     # anyarray
  'MAP' => 3802,       # jsonb
  'STRUCT' => 3802,    # jsonb
  'VARIANT' => 3802,   # jsonb
  'OBJECT' => 3802     # jsonb
}.freeze
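
For illustration, a few lookups against this map from inside the module; GEOMETRY stands in for any type absent from the map, and the nil fallback is handled by callers below:

DATABRICKS_DATATYPE_NAME_MAP['DECIMAL']   # => 1700 (numeric)
DATABRICKS_DATATYPE_NAME_MAP['STRUCT']    # => 3802 (jsonb)
DATABRICKS_DATATYPE_NAME_MAP['GEOMETRY']  # => nil; callers fall back to 1043 (varchar)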

Class Method Summary

Class Method Details

.connect_to_databricks(config) ⇒ Object

# File 'lib/db/databricks.rb', line 34

def connect_to_databricks(config)
  warehouse_id = config[:path].split('/').last

  {
    host: config[:host],
    warehouse_id: warehouse_id,
    token: config[:token]
  }
end
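
A minimal usage sketch; the host, warehouse path, and token are hypothetical placeholders:

config = {
  host: 'adb-1234567890123456.7.azuredatabricks.net', # hypothetical workspace host
  path: 'sql/1.0/warehouses/abc123def456',            # hypothetical HTTP path
  token: 'dapi-example-token'                         # hypothetical access token
}
client = connect_to_databricks(config)
client[:warehouse_id] # => 'abc123def456'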

.disconnect_from_databricks(client) ⇒ Object

# File 'lib/db/databricks.rb', line 44

def disconnect_from_databricks(client)
  # No-op: the stateless REST API holds no persistent connection to close
end

.format_databricks_config(connection_string) ⇒ Object

# File 'lib/db/databricks.rb', line 198

def format_databricks_config(connection_string)
  parsed = URI(connection_string)
  {
    host: parsed.host,
    path: parsed.path[1..],  # Remove leading slash
    token: parsed.password
  }
end
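
A sketch of the expected round trip, assuming the token travels in the password slot of a databricks:// URI (all values hypothetical):

conn = 'databricks://token:dapi-example-token@adb-123.azuredatabricks.net/sql/1.0/warehouses/abc123'
format_databricks_config(conn)
# => { host: 'adb-123.azuredatabricks.net',
#      path: 'sql/1.0/warehouses/abc123',
#      token: 'dapi-example-token' }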

.get_columns_by_table_databricks(client, schema_name, table_name) ⇒ Object

# File 'lib/db/databricks.rb', line 151

def get_columns_by_table_databricks(client, schema_name, table_name)
  sql = <<~SQL
    SELECT
      column_name as columnName
    FROM system.information_schema.columns
    WHERE table_schema = '#{schema_name}'
      AND table_name = '#{table_name}'
    ORDER BY ordinal_position
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['columnName'] }
end

.get_foreign_keys_databricks(client, schema_name, table_name, primary_key) ⇒ Object

# File 'lib/db/databricks.rb', line 165

def get_foreign_keys_databricks(client, schema_name, table_name, primary_key)
  # Foreign key constraints in Databricks are informational only, so none are surfaced here
  []
end

.get_schema_column_info_databricks(client, schema_name, table_names) ⇒ Object

# File 'lib/db/databricks.rb', line 170

def get_schema_column_info_databricks(client, schema_name, table_names)
  table_names.map do |table|
    sql = <<~SQL
      SELECT
        column_name as columnName,
        data_type as dataType
      FROM system.information_schema.columns
      WHERE table_schema = '#{table[:schemaName]}'
        AND table_name = '#{table[:tableName]}'
      ORDER BY ordinal_position
    SQL

    results = run_query_databricks(sql, client)
    {
      tableName: "#{table[:schemaName]}.#{table[:tableName]}",
      columns: results[:rows].map do |row|
        # Strip type parameters (e.g. ARRAY<STRING> -> ARRAY) before the lookup
        data_type = row['dataType'].split('<')[0].upcase
        {
          columnName: row['columnName'],
          displayName: row['columnName'],
          fieldType: row['dataType'],
          dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[data_type] || 1043 # default to varchar
        }
      end
    }
  end
end

.get_schemas_databricks(client) ⇒ Object

# File 'lib/db/databricks.rb', line 116

def get_schemas_databricks(client)
  sql = <<~SQL
    SELECT schema_name
    FROM system.information_schema.schemata
    WHERE schema_name != "information_schema"
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['schema_name'] }
end

.get_tables_by_schema_databricks(client, schema_names) ⇒ Object

# File 'lib/db/databricks.rb', line 127

def get_tables_by_schema_databricks(client, schema_names)
  schema_names.flat_map do |schema|
    sql = <<~SQL
      SELECT
        table_name as tableName,
        table_schema as schemaName,
        table_catalog as catalogName
      FROM system.information_schema.tables
      WHERE table_schema = '#{schema}'
    SQL

    results = run_query_databricks(sql, client)
    results[:rows].map do |row|
      {
        tableName: row['tableName'],
        schemaName: row['schemaName'],
        catalogName: row['catalogName']
      }
    end
  end
end

.run_query_databricks(sql, client) ⇒ Object

# File 'lib/db/databricks.rb', line 48

def run_query_databricks(sql, client)
  uri = URI("https://#{client[:host]}/api/2.0/sql/statements")
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(uri)
  request['Authorization'] = "Bearer #{client[:token]}"
  request['Content-Type'] = 'application/json'
  request.body = JSON.generate({
    statement: sql,
    warehouse_id: client[:warehouse_id],
    wait_timeout: '50s'
  })

  response = http.request(request)

  raise "Query failed: #{response.body}" unless response.is_a?(Net::HTTPSuccess)

  result = JSON.parse(response.body)
  
  raise "Query state: #{result['status']['state']}" unless result['status']['state'] == 'SUCCEEDED'

  columns = result['manifest']['schema']['columns']
  column_names = columns.map { |col| col['name'] }
  column_types = columns.map { |col| col['type_name'] }

  json_like_types = ['ARRAY', 'MAP', 'STRUCT', 'JSON']

  fields = columns.map do |column|
    {
      name: column['name'],
      dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[column['type_name']] || 1043 # default to varchar if type not found
    }
  end

  # Transform the data array into rows of hashes; data_array may be absent
  # when the statement returns no rows, so guard with an empty default
  rows = (result.dig('result', 'data_array') || []).map do |row_array|
    row_hash = column_names.zip(row_array).to_h
    transformed_row = {}

    column_names.each_with_index do |key, idx|
      value = row_hash[key]
      type = column_types[idx]
      # Parse JSON strings for JSON-like column types
      transformed_value =
        if value.is_a?(String) && json_like_types.include?(type.upcase)
          begin
            parsed = JSON.parse(value)
            parsed.is_a?(Array) ? parsed.map(&:to_s) : parsed
          rescue JSON::ParserError
            value
          end
        else
          value
        end

      transformed_row[key] = transformed_value
    end

    transformed_row
  end

  {
    fields: fields,
    rows: rows
  }
end
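
A minimal end-to-end sketch against a hypothetical warehouse. The method depends on net/http, uri, and json from the standard library, which lib/db/databricks.rb is assumed to require:

require 'net/http'
require 'uri'
require 'json'

client = { host: 'adb-123.azuredatabricks.net', warehouse_id: 'abc123', token: 'dapi-example' }
result = run_query_databricks('SELECT 1 AS one', client)
result[:fields] # => [{ name: 'one', dataTypeID: 23 }]
result[:rows]   # => [{ 'one' => '1' }] (the Statement Execution API serializes values as strings)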