Class: Parquet::Schema::SchemaBuilder

Inherits:
Object
  • Object
show all
Defined in:
lib/parquet/schema.rb

Overview

Internal builder class that provides the DSL methods

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ SchemaBuilder

Returns a new instance of SchemaBuilder.



45
46
47
# File 'lib/parquet/schema.rb', line 45

# Creates a builder whose list of field definitions starts out empty.
def initialize
  @fields = []
end

Instance Attribute Details

#fields ⇒ Object (readonly)

Returns the value of attribute fields.



43
44
45
# File 'lib/parquet/schema.rb', line 43

# Read-only accessor for the builder's accumulated field definitions.
#
# @return [Array] the object stored in @fields
def fields
  @fields
end

Instance Method Details

#build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block) ⇒ Object



171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/parquet/schema.rb', line 171

# Builds the hash that describes a map type: a non-nullable "key_value"
# struct whose two fields are the wrapped key and the wrapped value.
#
# @param key_type [Object] key type, handed to +wrap_subtype+
# @param value_type [Object] value type, handed to +wrap_subtype+
# @param key_nullable [Boolean] nullability passed along for the key (default: false)
# @param value_nullable [Boolean] nullability passed along for the value (default: true)
# @param nullable [Boolean] nullability recorded on the map itself (default: true)
# @param block [Proc] optional nested definition; only forwarded when
#   +value_type+ is :struct or :list
# @return [Hash] `{ type: :map, nullable:, item: { type: :struct, ... } }`
def build_map(key_type, value_type, key_nullable: false, value_nullable: true, nullable: true, &block)
  key_entry = wrap_subtype(key_type, nullable: key_nullable)

  # A block is only meaningful for :struct / :list values. For every other
  # value type it is dropped: `&nil` passes no block at all.
  nested_block = block if value_type == :struct || value_type == :list
  value_entry = wrap_subtype(value_type, nullable: value_nullable, &nested_block)

  # The key/value pair is carried as a single non-nullable struct item.
  key_value_struct = {
    type: :struct,
    nullable: false,
    name: "key_value",
    fields: [key_entry, value_entry]
  }

  { type: :map, nullable: nullable, item: key_value_struct }
end

#field(name, type, nullable: true, **kwargs, &block) ⇒ Object

Define a field in the schema. Additional keyword args:

- `item:` if type == :list
- `item_nullable:` controls nullability of list items (default: true)
- `key:, value:` if type == :map
- `key_nullable:, value_nullable:` controls nullability of map keys/values (default: true)
- `format:` if you want to store some format string
- `precision:, scale:` if type == :decimal (precision defaults to 38, scale to 0)
- `has_timezone:` if type is timestamp - true means UTC storage (default), false means local/unzoned
- `timezone:` (DEPRECATED) if type is timestamp - any value means UTC storage
- `nullable:` defaults to true if not specified

Parameters:

  • name (String, Symbol)

    field name

  • type (Symbol)

    data type (:int32, :int64, :string, :list, :map, :struct, :decimal, etc)

  • nullable (Boolean) (defaults to: true)

    whether the field can be null (default: true)

  • kwargs (Hash)

    additional options depending on type



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/parquet/schema.rb', line 65

def field(name, type, nullable: true, **kwargs, &block)
  field_hash = { name: name.to_s, type: type, nullable: !!nullable }

  # Possibly store a format if provided
  field_hash[:format] = kwargs[:format] if kwargs.key?(:format)
  
  # Handle timezone for timestamp types
  if [:timestamp_second, :timestamp_millis, :timestamp_micros, :timestamp_nanos].include?(type)
    # Support new has_timezone parameter (preferred)
    if kwargs.key?(:has_timezone)
      # If has_timezone is true, store "UTC" to indicate timezone presence
      # If explicitly false, don't store timezone (indicates local/unzoned)
      field_hash[:timezone] = "UTC" if kwargs[:has_timezone]
    elsif kwargs.key?(:timezone)
      # Legacy support: any timezone value means UTC storage
      # Store "UTC" regardless of the actual value to make it clear
      field_hash[:timezone] = "UTC"
    else
      # Default behavior when neither parameter is specified: UTC storage
      field_hash[:timezone] = "UTC"
    end
  end

  case type
  when :struct
    # We'll parse subfields from the block
    sub_builder = SchemaBuilder.new
    sub_builder.instance_eval(&block) if block
    field_hash[:fields] = sub_builder.fields
  when :list
    item_type = kwargs[:item]
    raise ArgumentError, "list field `#{name}` requires `item:` type" unless item_type
    # Pass item_nullable if provided, otherwise use true as default
    item_nullable = kwargs[:item_nullable].nil? ? true : !!kwargs[:item_nullable]

    # Pass precision and scale if type is decimal
    if item_type == :decimal
      precision = kwargs[:precision]
      scale = kwargs[:scale]
      field_hash[:item] = wrap_subtype(
        item_type,
        nullable: item_nullable,
        precision: precision,
        scale: scale,
        &block
      )
    else
      field_hash[:item] = wrap_subtype(item_type, nullable: item_nullable, &block)
    end
  when :map
    # user must specify key:, value:
    key_type = kwargs[:key]
    value_type = kwargs[:value]
    raise ArgumentError, "map field `#{name}` requires `key:` and `value:`" if key_type.nil? || value_type.nil?
    # Pass key_nullable and value_nullable if provided, otherwise use true as default
    key_nullable = kwargs[:key_nullable].nil? ? true : !!kwargs[:key_nullable]
    value_nullable = kwargs[:value_nullable].nil? ? true : !!kwargs[:value_nullable]

    field_hash[:key] = wrap_subtype(key_type, nullable: key_nullable)

    # Pass precision and scale if value type is decimal
    if value_type == :decimal
      precision = kwargs[:precision]
      scale = kwargs[:scale]
      field_hash[:value] = wrap_subtype(
        value_type,
        nullable: value_nullable,
        precision: precision,
        scale: scale,
        &block
      )
    else
      field_hash[:value] = wrap_subtype(value_type, nullable: value_nullable, &block)
    end
  when :decimal
    # Store precision and scale for decimal type according to rules:
    # 1. When neither precision nor scale is provided, use maximum precision (38)
    # 2. When only precision is provided, scale defaults to 0
    # 3. When only scale is provided, use maximum precision (38)
    # 4. When both are provided, use the provided values

    if kwargs[:precision].nil? && kwargs[:scale].nil?
      # No precision or scale provided - use maximum precision
      field_hash[:precision] = 38
      field_hash[:scale] = 0
    elsif kwargs[:precision] && kwargs[:scale].nil?
      # Precision only - scale defaults to 0
      field_hash[:precision] = kwargs[:precision]
      field_hash[:scale] = 0
    elsif kwargs[:precision].nil? && kwargs[:scale]
      # Scale only - use maximum precision
      field_hash[:precision] = 38
      field_hash[:scale] = kwargs[:scale]
    else
      # Both provided
      field_hash[:precision] = kwargs[:precision]
      field_hash[:scale] = kwargs[:scale]
    end
  else
    # primitive type: :int32, :int64, :string, etc.
    # do nothing else special
  end

  @fields << field_hash
end