Class: RBHive::TCLISchemaDefinition

Inherits:
Object
  • Object
show all
Defined in:
lib/rbhive/t_c_l_i_schema_definition.rb

Constant Summary collapse

# Float "not a number" sentinel; computed by hand when Float::NAN is not defined.
NAN = defined?(Float::NAN) ? Float::NAN : 0.0 / 0.0
# Float positive infinity sentinel; computed by hand when Float::INFINITY is not defined.
INFINITY = defined?(Float::INFINITY) ? Float::INFINITY : 1.0 / 0.0
# Maps a column type (as a Symbol) to the name of the method that
# #coerce_column sends to the raw value. Types without an entry are
# passed through unchanged. Frozen because it is a shared lookup table
# that should never be modified at runtime.
TYPES = {
  # Sent :to_s
  :boolean  => :to_s,
  :string   => :to_s,
  # Sent :to_f
  :float    => :to_f,
  :double   => :to_f,
  # Sent :to_i
  :int      => :to_i,
  :bigint   => :to_i,
  :smallint => :to_i,
  :tinyint  => :to_i,
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schema, example_row) ⇒ TCLISchemaDefinition

Returns a new instance of TCLISchemaDefinition.



20
21
22
23
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 20

# Stores the schema and the column values of a sample row.
#
# @param schema the schema object; other methods read its +columns+
# @param example_row a sample row responding to +colVals+, or nil/false
#   when no row is available (an empty list is stored in that case)
def initialize(schema, example_row)
  @schema = schema
  @example_row =
    if example_row
      example_row.colVals
    else
      []
    end
end

Instance Attribute Details

#schema ⇒ Object (readonly)

Returns the value of attribute schema.



5
6
7
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 5

# Returns the schema object that was passed to the constructor.
def schema
  @schema
end

Instance Method Details

#coerce_column(column_name, value) ⇒ Object



67
68
69
70
71
72
73
74
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 67

# Converts one raw column value according to the column's type.
#
# @param column_name [Symbol] key into #column_type_map
# @param value [Object, nil] the raw value for that column
# @return [Object, nil] nil for a nil value; INFINITY / NAN for the
#   literals "Infinity" / "NaN" in non-string columns; the result of
#   #coerce_complex_value for types starting with "array"; otherwise the
#   value converted with the method listed in TYPES, or the value itself
#   when the type has no entry there
def coerce_column(column_name, value)
  # Keep nil as nil: sending to_i / to_f / to_s to it would silently
  # produce 0, 0.0 or "" and make a missing value look like real data.
  return nil if value.nil?

  type = column_type_map[column_name]

  # Only string columns may legitimately contain these words as text.
  unless type == :string
    return INFINITY if value == "Infinity"
    return NAN if value == "NaN"
  end

  return coerce_complex_value(value) if type.to_s.start_with?("array")

  conversion_method = TYPES[type]
  conversion_method ? value.send(conversion_method) : value
end

#coerce_complex_value(value) ⇒ Object



80
81
82
83
84
85
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 80

# Parses a JSON-encoded value.
#
# @param value [String, nil] the JSON text
# @return [Object, nil] the parsed JSON, or nil when the value is nil,
#   has zero length, or is the literal string 'null'
def coerce_complex_value(value)
  absent = value.nil? || value.length == 0 || value == 'null'
  absent ? nil : JSON.parse(value)
end

#coerce_row(row) ⇒ Object



60
61
62
63
64
65
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 60

# Turns a row into a Hash keyed by column name.
#
# Each entry of +row.colVals+ is unwrapped with +get_value.value+, paired
# with the column name at the same position, and passed through
# #coerce_column.
#
# @param row an object responding to +colVals+
# @return [Hash] column name => coerced value, in column order
def coerce_row(row)
  names = column_names
  raw_values = row.colVals.map { |cell| cell.get_value.value }

  names.zip(raw_values).each_with_object({}) do |(name, raw), coerced|
    coerced[name] = coerce_column(name, raw)
  end
end

#coerce_row_to_array(row) ⇒ Object



76
77
78
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 76

# Lists a row's values in column order.
#
# @param row an object indexable by column name via +[]+
# @return [Array] one entry per name in #column_names, in that order
def coerce_row_to_array(row)
  column_names.map do |name|
    row[name]
  end
end

#column_names ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 25

# Returns the column names as Symbols, computed once and cached.
#
# @return [Array<Symbol>] schema column names in schema order, followed by
#   placeholder names for any values the example row has beyond the schema
def column_names
  @column_names ||= begin
    marker = '---|---'
    names = @schema.columns.map { |column| column.columnName }

    # In rare cases Hive can return two identical column names, e.g.
    # SELECT a.foo, b.foo... yields two columns called foo with no
    # disambiguation. As a (far from ideal) workaround, duplicates are
    # numbered: a.foo => foo_1, b.foo => foo_2. Otherwise one column would
    # overwrite the other when rows are mapped to a Hash.
    occurrences = Hash.new(0)

    # Pass 1: tag the second and later occurrences with their running count.
    names = names.map do |name|
      occurrences[name] += 1
      occurrences[name] > 1 ? "#{name}#{marker}#{occurrences[name]}" : name
    end
    # Pass 2: tag the first occurrence of every name that turned out to repeat.
    names = names.map { |name| occurrences[name] > 1 ? "#{name}#{marker}1" : name }
    # Pass 3: swap the temporary marker for an underscore and symbolize.
    names = names.map { |name| name.gsub(marker, '_').to_sym }

    # Hive doesn't return schema data for partitions on SELECT * queries.
    # For now they are called :_p1, :_p2, etc. to avoid collisions.
    missing = @example_row.length - names.length
    (1..missing).each { |position| names.push(:"_p#{position}") }
    names
  end
end

#column_type_map ⇒ Object



50
51
52
53
54
55
56
57
58
# File 'lib/rbhive/t_c_l_i_schema_definition.rb', line 50

# Maps every column name to its type as a lower-case Symbol, computed once
# and cached.
#
# Definitions are matched to names by position rather than by name.
# #column_names keeps the schema order but renames duplicates (foo =>
# foo_1, foo_2), so a lookup by name would never find those columns and
# would report them as :string.
#
# @return [Hash{Symbol => Symbol}] column name => type; :string when the
#   column has no schema definition or its type cannot be resolved
def column_type_map
  @column_type_map ||= begin
    definitions = @schema.columns
    type_map = {}
    column_names.each_with_index do |name, index|
      # Names beyond the schema (eg partitions in SELECT * queries) have no
      # definition and are assumed to be strings.
      definition = definitions[index]
      # Best effort: any failure while walking the type descriptor (missing
      # entry, unknown type id) falls back to :string below.
      type = definition && (TYPE_NAMES[definition.typeDesc.types.first.primitiveEntry.type].downcase rescue nil)
      type_map[name] = type ? type.to_sym : :string
    end
    type_map
  end
end