Module: DatabricksHelper

Defined in:
lib/db/databricks.rb

Constant Summary collapse

# Maps Databricks SQL type names to the PostgreSQL type OID the caller
# expects as `dataTypeID` (e.g. 20 = int8, 1043 = varchar). Unknown types
# fall back to 1043 (varchar) at the lookup sites.
DATABRICKS_DATATYPE_NAME_MAP =
{
  # Scalar types (numeric, boolean, temporal, string)
  'BIGINT' => 20,      # int8
  'BINARY' => 17,      # bytea
  'BOOLEAN' => 16,     # bool
  'DATE' => 1082,      # date
  'DECIMAL' => 1700,   # numeric
  'DOUBLE' => 701,     # float8
  'FLOAT' => 701,      # float8 (promoted; pg float4 would lose precision guarantees)
  'INT' => 23,         # int4
  'INTERVAL' => 1186,  # interval
  'VOID' => 2278,      # void
  'SMALLINT' => 21,    # int2
  'STRING' => 1043,    # varchar
  'TIMESTAMP' => 1114, # timestamp
  'TIMESTAMP_NTZ' => 1114, # timestamp without timezone
  'TINYINT' => 21,     # mapped to smallint (int2) as postgres has no tinyint

  # Complex Types - mapping to closest PostgreSQL equivalents
  'ARRAY' => 2277,     # anyarray
  'MAP' => 3802,       # jsonb
  'STRUCT' => 3802,    # jsonb
  'VARIANT' => 3802,   # jsonb
  'OBJECT' => 3802     # jsonb
}.freeze

Class Method Summary collapse

Class Method Details

.connect_to_databricks(config) ⇒ Object



34
35
36
37
38
39
40
41
42
# File 'lib/db/databricks.rb', line 34

# Builds the lightweight "client" Hash used by the other helpers.
# No socket is opened here: all queries go through the REST API, so the
# client is just the credentials plus the warehouse id, which is the last
# path segment of the HTTP path (e.g. "/sql/1.0/warehouses/<id>").
#
# config - Hash with :host, :path and :token.
#
# Returns a Hash with :host, :warehouse_id and :token.
def connect_to_databricks(config)
  {
    host: config[:host],
    warehouse_id: config[:path].split('/').last,
    token: config[:token]
  }
end

.disconnect_from_databricks(client) ⇒ Object



44
45
46
# File 'lib/db/databricks.rb', line 44

# Intentionally does nothing: queries run over stateless REST calls
# (see run_query_databricks), so there is no connection to tear down.
# Kept so the adapter interface matches drivers that do hold sockets.
def disconnect_from_databricks(client)
  # no connection state to release
end

.format_databricks_config(connection_string) ⇒ Object



201
202
203
204
205
206
207
208
# File 'lib/db/databricks.rb', line 201

# Splits a connection URL into the config Hash consumed by
# connect_to_databricks.
#
# connection_string - URL of the form scheme://user:TOKEN@HOST/PATH;
#                     the token travels in the password component.
#
# Returns a Hash with :host, :path (leading "/" stripped) and :token.
def format_databricks_config(connection_string)
  uri = URI(connection_string)
  {
    host: uri.host,
    path: uri.path[1..], # drop the leading "/"
    token: uri.password
  }
end

.get_columns_by_table_databricks(client, schema_name, table_name) ⇒ Object



154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/db/databricks.rb', line 154

# Lists the column names of one table, in ordinal position order.
# The SQL was mangled into a one-line escaped string by doc extraction;
# reconstructed here as the squiggly heredoc it encodes.
#
# client      - connection Hash from connect_to_databricks.
# schema_name - schema to look in.
# table_name  - table whose columns are wanted.
#
# Returns an Array of column-name Strings.
#
# NOTE(review): schema/table names are interpolated into the SQL, matching
# the other helpers in this file — safe only for trusted identifiers.
def get_columns_by_table_databricks(client, schema_name, table_name)
  sql = <<~SQL
    SELECT 
      column_name as columnName
    FROM system.information_schema.columns
    WHERE table_schema = '#{schema_name}'
      AND table_name = '#{table_name}'
    ORDER BY ordinal_position
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['columnName'] }
end

.get_foreign_keys_databricks(client, schema_name, table_name, primary_key) ⇒ Object



168
169
170
171
# File 'lib/db/databricks.rb', line 168

# Foreign-key metadata is not available through this adapter's
# information_schema queries, so the answer is always "none".
# The signature mirrors the other adapters' FK lookups.
#
# Returns an empty Array.
def get_foreign_keys_databricks(client, schema_name, table_name, primary_key)
  []
end

.get_schema_column_info_databricks(client, schema_name, table_names) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/db/databricks.rb', line 173

# Fetches column name/type info for each requested table.
#
# Two fixes: (1) the heredoc was collapsed into an escaped one-line string
# by doc extraction and is reconstructed here; (2) the original referenced
# DATABRICKS_DATATYPE_ID_MAP, which is not defined anywhere — the constant
# defined above (and used by run_query_databricks) is
# DATABRICKS_DATATYPE_NAME_MAP.
#
# client      - connection Hash from connect_to_databricks.
# schema_name - unused here (each table Hash carries its own :schemaName).
# table_names - Array of Hashes with :schemaName and :tableName.
#
# Returns an Array of { tableName:, columns: [...] } Hashes, where each
# column carries the pg OID dataTypeID (1043/varchar when unknown).
def get_schema_column_info_databricks(client, schema_name, table_names)
  table_names.map do |table|
    sql = <<~SQL
      SELECT 
        column_name as columnName,
        data_type as dataType
      FROM system.information_schema.columns
      WHERE table_schema = '#{table[:schemaName]}'
        AND table_name = '#{table[:tableName]}'
      ORDER BY ordinal_position
    SQL

    results = run_query_databricks(sql, client)
    {
      tableName: "#{table[:schemaName]}.#{table[:tableName]}",
      columns: results[:rows].map do |row|
        # "ARRAY<INT>" etc. -> "ARRAY": strip the parameterized part
        data_type = row['dataType'].split('<')[0].upcase
        {
          columnName: row['columnName'],
          displayName: row['columnName'],
          fieldType: row['dataType'],
          dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[data_type] || 1043
        }
      end
    }
  end
end

.get_schemas_databricks(client) ⇒ Object



119
120
121
122
123
124
125
126
127
128
# File 'lib/db/databricks.rb', line 119

# Lists all schemas visible in the catalog, excluding information_schema.
# The SQL was mangled into a one-line escaped string by doc extraction;
# reconstructed here as the squiggly heredoc it encodes. (Double quotes
# around information_schema are a string literal in Databricks SQL.)
#
# client - connection Hash from connect_to_databricks.
#
# Returns an Array of schema-name Strings.
def get_schemas_databricks(client)
  sql = <<~SQL
    SELECT schema_name 
    FROM system.information_schema.schemata 
    WHERE schema_name != "information_schema"
  SQL

  results = run_query_databricks(sql, client)
  results[:rows].map { |row| row['schema_name'] }
end

.get_tables_by_schema_databricks(client, schema_names) ⇒ Object



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/db/databricks.rb', line 130

# Lists the tables of every requested schema, one query per schema.
# The SQL was mangled into a one-line escaped string by doc extraction;
# reconstructed here as the squiggly heredoc it encodes.
#
# client       - connection Hash from connect_to_databricks.
# schema_names - Array of schema-name Strings.
#
# Returns a flat Array of { tableName:, schemaName:, catalogName: } Hashes.
#
# NOTE(review): schema names are interpolated into the SQL, matching the
# other helpers in this file — safe only for trusted identifiers.
def get_tables_by_schema_databricks(client, schema_names)
  schema_names.flat_map do |schema|
    sql = <<~SQL
      SELECT 
        table_name as tableName,
        table_schema as schemaName,
        table_catalog as catalogName
      FROM system.information_schema.tables
      WHERE table_schema = '#{schema}'
    SQL

    results = run_query_databricks(sql, client)
    results[:rows].map do |row|
      {
        tableName: row['tableName'],
        schemaName: row['schemaName'],
        catalogName: row['catalogName']
      }
    end
  end
end

.run_query_databricks(sql, client) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/db/databricks.rb', line 48

# Executes a SQL statement through the Databricks SQL Statement Execution
# API (POST /api/2.0/sql/statements), waiting synchronously up to 50s,
# and normalizes the response into fields + rows.
#
# Fix: the original read result['status']['state'], which raises
# NoMethodError on a payload without "status"; dig is nil-safe, so a
# malformed payload now raises the intended "Query state:" error instead.
#
# sql    - SQL text to execute.
# client - connection Hash from connect_to_databricks
#          (:host, :warehouse_id, :token).
#
# Returns { fields: [{ name:, dataTypeID: }, ...], rows: [Hash, ...] },
# where dataTypeID is a pg OID from DATABRICKS_DATATYPE_NAME_MAP
# (1043/varchar when the type is unknown).
#
# Raises RuntimeError when the HTTP request fails or the statement does
# not finish in the SUCCEEDED state.
def run_query_databricks(sql, client)
  uri = URI("https://#{client[:host]}/api/2.0/sql/statements")
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(uri)
  request['Authorization'] = "Bearer #{client[:token]}"
  request['Content-Type'] = 'application/json'
  request.body = JSON.generate({
    statement: sql,
    warehouse_id: client[:warehouse_id],
    wait_timeout: '50s'
  })

  response = http.request(request)

  raise "Query failed: #{response.body}" unless response.is_a?(Net::HTTPSuccess)

  result = JSON.parse(response.body)

  # Nil-safe lookup: a payload with no "status" object reports state nil
  # and falls into the error below rather than crashing with NoMethodError.
  state = result.dig('status', 'state')
  raise "Query state: #{state}" unless state == 'SUCCEEDED'

  columns = result['manifest']['schema']['columns']
  column_names = columns.map { |col| col['name'] }
  column_types = columns.map { |col| col['type_name'] }

  # Types whose cell values arrive as JSON-encoded strings.
  json_like_types = %w[ARRAY MAP STRUCT JSON].freeze

  fields = columns.map do |column|
    {
      name: column['name'],
      dataTypeID: DATABRICKS_DATATYPE_NAME_MAP[column['type_name']] || 1043 # default to varchar if type not found
    }
  end

  # A statement may legitimately return no result set (e.g. DDL).
  rows = if result['result'].nil? || result['result']['data_array'].nil?
    []
  else
    # Each row arrives as a positional array; pair it with column names.
    result['result']['data_array'].map do |row_array|
      row_hash = column_names.zip(row_array).to_h
      transformed_row = {}

      column_names.each_with_index do |key, idx|
        value = row_hash[key]
        type = column_types[idx]
        # Decode JSON-typed cells; top-level arrays are normalized to
        # string elements. Malformed JSON is passed through untouched.
        transformed_value = if value.is_a?(String) && json_like_types.include?(type.upcase)
          begin
            parsed = JSON.parse(value)
            parsed.is_a?(Array) ? parsed.map(&:to_s) : parsed
          rescue JSON::ParserError
            value
          end
        else
          value
        end
        transformed_row[key] = transformed_value
      end

      transformed_row
    end
  end

  {
    fields: fields,
    rows: rows
  }
end