Class: Fluent::Plugin::WebHDFSOutput

Inherits:
Output
  • Object
show all
Defined in:
lib/fluent/plugin/webhdfs_compressor_lzo_command.rb,
lib/fluent/plugin/webhdfs_compressor_snappy.rb,
lib/fluent/plugin/webhdfs_compressor_bzip2.rb,
lib/fluent/plugin/webhdfs_compressor_text.rb,
lib/fluent/plugin/webhdfs_compressor_gzip.rb,
lib/fluent/plugin/out_webhdfs.rb

Defined Under Namespace

Classes: Bzip2Compressor, Compressor, GzipCompressor, LZOCommandCompressor, SnappyCompressor, TextCompressor

Constant Summary collapse

# Names of the compression types; they line up with the compressor files
# listed under "Defined in" (presumably the allowed values of the `compress`
# parameter — confirm against the parameter declaration).
SUPPORTED_COMPRESS =
[:gzip, :bzip2, :snappy, :lzo_command, :text]
# Path placeholder that #generate_path replaces with the hex chunk id when
# append is disabled; #configure requires it in the path in that case.
CHUNK_ID_PLACE_HOLDER =
'${chunk_id}'
# Deprecated hostname placeholders; #verify_config_placeholders_in_path!
# replaces them in the path with the hostname and logs a warning.
HOSTNAME_PLACEHOLDERS_DEPRECATED =
['${hostname}', '%{hostname}', '__HOSTNAME__']
# Deprecated random-uuid placeholders; rewritten to '%{uuid}' with a warning.
UUID_RANDOM_PLACEHOLDERS_DEPRECATED =
['${uuid}', '${uuid:random}', '__UUID__', '__UUID_RANDOM__']
# Placeholders that are no longer supported; their presence in the path is
# reported as a configuration error.
UUID_OTHER_PLACEHOLDERS_OBSOLETED =
['${uuid:hostname}', '%{uuid:hostname}', '__UUID_HOSTNAME__', '${uuid:timestamp}', '%{uuid:timestamp}', '__UUID_TIMESTAMP__']
# Registry of compressor classes: filled through .register_compressor and
# queried in #configure with the configured compress type.
COMPRESSOR_REGISTRY =
Fluent::Registry.new(:webhdfs_compressor_type, 'fluent/plugin/webhdfs_compressor_')

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ WebHDFSOutput

Returns a new instance of WebHDFSOutput.



92
93
94
95
96
97
98
# File 'lib/fluent/plugin/out_webhdfs.rb', line 92

# Runs the superclass initializer, then resets the plugin-specific state
# (compressor, standby namenode host and the deprecated legacy formatting
# options) to nil.
def initialize
  super
  @compressor = nil
  @standby_namenode_host = nil

  # TODO: deprecated
  @output_include_time = nil
  @output_include_tag = nil

  # TODO: deprecated
  @field_separator = nil
  @header_separator = nil
end

Instance Attribute Details

#compressor ⇒ Object (readonly)

Returns the value of attribute compressor.



90
91
92
# File 'lib/fluent/plugin/out_webhdfs.rb', line 90

# Reader for the compressor instance. The value is nil after #initialize and
# is assigned in #configure from COMPRESSOR_REGISTRY.
#
# @return [Object, nil] the current value of @compressor
def compressor
  @compressor
end

#formatter ⇒ Object (readonly)

Returns the value of attribute formatter.



90
91
92
# File 'lib/fluent/plugin/out_webhdfs.rb', line 90

# Reader for the formatter instance, which #configure assigns from
# formatter_create.
#
# @return [Object, nil] the current value of @formatter
def formatter
  @formatter
end

Class Method Details

.register_compressor(name, compressor) ⇒ Object



512
513
514
# File 'lib/fluent/plugin/out_webhdfs.rb', line 512

# Registers a compressor under +name+ in COMPRESSOR_REGISTRY, the registry
# that #configure later queries with the configured compress type.
#
# @param name [Object] key to register under (presumably a String or Symbol
#   matching a compress type — confirm against the compressor files)
# @param compressor [Object] object to register; #configure calls +new+ on
#   the looked-up value, so a class is expected
# @return [Object] whatever COMPRESSOR_REGISTRY.register returns
def self.register_compressor(name, compressor)
  COMPRESSOR_REGISTRY.register(name, compressor)
end

Instance Method Details

#compat_parameters_convert_plaintextformatter(conf) ⇒ Object



413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
# File 'lib/fluent/plugin/out_webhdfs.rb', line 413

# Translates the legacy plain-text formatting parameters
# (output_data_type, field_separator, output_include_time,
# output_include_tag, time_format, utc/localtime) into a <format> section
# appended to +conf+, and records the legacy settings in instance variables
# that #format reads.
#
# When +conf+ already has a <format> section, or has no output_data_type,
# nothing is converted: @using_formatter_config is set to true and the
# method returns early.
#
# @param conf [Fluent::Config::Element] plugin configuration; a new 'format'
#   element is appended to conf.elements in legacy mode
# @return [void]
# @raise [Fluent::ConfigError] when output_data_type is not '', 'json',
#   'ltsv' or 'attr:...', or when both 'utc' and 'localtime' are given
def compat_parameters_convert_plaintextformatter(conf)
  if !conf.elements('format').empty? || !conf['output_data_type']
    @using_formatter_config = true
    @null_convert_keys = []
    return
  end

  log.warn "webhdfs output plugin is working with old configuration parameters. use <inject>/<format> sections instead for further releases."
  @using_formatter_config = false
  @null_convert_keys = []

  # Separator placed after the time and tag columns by #format.
  @header_separator = case conf['field_separator']
                      when nil     then "\t"
                      when 'SPACE' then ' '
                      when 'TAB'   then "\t"
                      when 'COMMA' then ','
                      when 'SOH'   then "\x01"
                      else conf['field_separator']
                      end

  format_section = Fluent::Config::Element.new('format', '', {}, [])
  case conf['output_data_type']
  when '', 'json' # blank value is for compatibility reason (especially in testing)
    format_section['@type'] = 'json'
  when 'ltsv'
    format_section['@type'] = 'ltsv'
  else
    unless conf['output_data_type'].start_with?('attr:')
      raise Fluent::ConfigError, "output_data_type is invalid: #{conf['output_data_type']}"
    end
    # The formatter type is selected by '@type', like the json/ltsv branches
    # above; the previous '@format' key left the tsv formatter unselected.
    format_section['@type'] = 'tsv'
    keys_part = conf['output_data_type'].sub(/\Aattr:/, '')
    @null_convert_keys = keys_part.split(',')
    format_section['keys'] = keys_part
    # These are configuration-text values, hence the single-quoted '\t'.
    format_section['delimiter'] = case conf['field_separator']
                                  when nil then '\t'
                                  when 'SPACE' then ' '
                                  when 'TAB'   then '\t'
                                  when 'COMMA' then ','
                                  when 'SOH'   then 'SOH' # fixed later
                                  else conf['field_separator']
                                  end
  end

  conf.elements << format_section

  @output_include_time = conf.has_key?('output_include_time') ? Fluent::Config.bool_value(conf['output_include_time']) : true
  @output_include_tag = conf.has_key?('output_include_tag') ? Fluent::Config.bool_value(conf['output_include_tag']) : true

  if @output_include_time
    # default timezone is UTC
    using_localtime = if !conf.has_key?('utc') && !conf.has_key?('localtime')
                        false
                      elsif conf.has_key?('localtime') && conf.has_key?('utc')
                        raise Fluent::ConfigError, "specify either 'localtime' or 'utc'"
                      elsif conf.has_key?('localtime')
                        # Parse the configured value, not the literal key name.
                        Fluent::Config.bool_value(conf['localtime'])
                      else
                        # 'utc' is the inverse of localtime; an unparsable
                        # value (nil) keeps the UTC default.
                        utc = Fluent::Config.bool_value(conf['utc'])
                        utc.nil? ? false : !utc
                      end
    @time_formatter = Fluent::TimeFormatter.new(conf['time_format'], using_localtime)
  else
    @time_formatter = nil
  end
end

#compress_context(chunk, &block) ⇒ Object



330
331
332
333
334
335
336
337
338
339
# File 'lib/fluent/plugin/out_webhdfs.rb', line 330

# Compresses +chunk+ into a temporary file, rewinds it and yields it to the
# caller's block. The temporary file is closed and unlinked afterwards
# whether or not the block raised; errors during that cleanup are ignored.
#
# @param chunk [Object] passed as-is to @compressor.compress
# @yieldparam tmp [Tempfile] rewound temp file holding the compressed data
# @return [Object] the value returned by the block
def compress_context(chunk, &block)
  scratch = Tempfile.new("webhdfs-")
  @compressor.compress(chunk, scratch)
  scratch.rewind
  yield scratch
ensure
  begin
    scratch.close(true)
  rescue
    nil
  end
end

#configure(conf) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/fluent/plugin/out_webhdfs.rb', line 100

# Normalizes and validates the plugin configuration, then builds the
# formatter, the compressor and the WebHDFS client(s).
#
# Order of work: derive a default buffer timekey from the strftime
# specifiers in conf["path"]; convert legacy buffer and plain-text format
# parameters; rewrite/validate path placeholders; call super; then set up
# the formatter, tag handling, uuid replacement, compressor, namenode
# addresses and clients.
#
# @param conf [Object] plugin configuration (a Fluent::Config::Element is
#   assumed from the calls made on it); it is modified in place
# @return [void]
# @raise [Fluent::ConfigError] on a missing/invalid namenode, a standby
#   namenode combined with httpfs, a path not starting with '/', an
#   unavailable SecureRandom, or a path without ${chunk_id} when append is
#   false
def configure(conf)
  # #compat_parameters_convert ignores the time format in conf["path"],
  # so check conf["path"] here and overwrite the default value later if needed
  # (the finest time specifier present decides the timekey, in seconds)
  timekey = case conf["path"]
            when /%S/ then 1
            when /%M/ then 60
            when /%H/ then 3600
            else 86400
            end
  # An explicitly configured timekey in <buffer> wins over the derived one.
  if buffer_config = conf.elements(name: "buffer").first
    timekey = buffer_config["timekey"] || timekey 
  end

  compat_parameters_convert(conf, :buffer, default_chunk_key: "time")

  # Make sure a <buffer time> section exists so the timekey can be stored.
  if conf.elements(name: "buffer").empty?
    e = Fluent::Config::Element.new("buffer", "time", {}, [])
    conf.elements << e
  end
  buffer_config = conf.elements(name: "buffer").first
  # explicitly set timekey
  buffer_config["timekey"] = timekey

  compat_parameters_convert_plaintextformatter(conf)
  verify_config_placeholders_in_path!(conf)

  super

  @formatter = formatter_create

  if @using_formatter_config
    @null_value = nil
  else
    # Legacy mode stores the literal text 'SOH' as the delimiter (see
    # #compat_parameters_convert_plaintextformatter); swap in the real byte.
    @formatter.delimiter = "\x01" if @formatter.respond_to?(:delimiter) && @formatter.delimiter == 'SOH'
    @null_value ||= 'NULL'
  end

  # Legacy mode with tag output needs a fallback tag for #format.
  if @default_tag.nil? && !@using_formatter_config && @output_include_tag
    @default_tag = "tag_missing"
  end
  # Precompute "<prefix>." and its length for the prefix stripping in #format.
  if @remove_prefix
    @remove_prefix_actual = @remove_prefix + "."
    @remove_prefix_actual_length = @remove_prefix_actual.length
  end

  @replace_random_uuid = @path.include?('%{uuid}') || @path.include?('%{uuid_flush}')
  if @replace_random_uuid
    # to check SecureRandom.uuid is available or not (NotImplementedError raised in such environment)
    begin
      SecureRandom.uuid
    rescue
      raise Fluent::ConfigError, "uuid feature (SecureRandom) is unavailable in this environment"
    end
  end

  @compressor = COMPRESSOR_REGISTRY.lookup(@compress.to_s).new

  # Resolve the primary namenode: host/port take precedence over the
  # combined "HOST:PORT" namenode value.
  if @host
    @namenode_host = @host
    @namenode_port = @port
  elsif @namenode
    # NOTE(review): \Z also matches before a trailing newline; \z would be stricter.
    unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
      raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_HOST:PORT"
    end
    @namenode_host = $1
    @namenode_port = $2.to_i
  else
    raise Fluent::ConfigError, "WebHDFS host or namenode missing"
  end
  if @standby_namenode
    unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @standby_namenode
      raise Fluent::ConfigError, "Invalid config value about standby namenode: '#{@standby_namenode}', needs STANDBY_NAMENODE_HOST:PORT"
    end
    if @httpfs
      raise Fluent::ConfigError, "Invalid configuration: specified to use both of standby_namenode and httpfs."
    end
    # $1/$2 still hold the standby match: no regexp match runs in between.
    @standby_namenode_host = $1
    @standby_namenode_port = $2.to_i
  end
  unless @path.index('/') == 0
    raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
  end

  @client = prepare_client(@namenode_host, @namenode_port, @username)
  if @standby_namenode_host
    @client_standby = prepare_client(@standby_namenode_host, @standby_namenode_port, @username)
  else
    @client_standby = nil
  end

  # Without append every chunk goes to its own file, so the path must
  # contain the chunk id placeholder.
  unless @append
    if @path.index(CHUNK_ID_PLACE_HOLDER).nil?
      raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false."
    end
  end
end

#format(tag, time, record) ⇒ Object



341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# File 'lib/fluent/plugin/out_webhdfs.rb', line 341

# Renders a single event as one line of output.
#
# Steps: strip the configured (deprecated) tag prefix, fill nil record
# values with @null_value when that is set, then either delegate to the
# formatter (with injected values) or build the legacy
# "time<sep>tag<sep>record" line. A trailing newline is appended when
# @end_with_newline is set and the line does not already end with one.
#
# @param tag [String] event tag
# @param time [Object] event time, passed to the time formatter/formatter
# @param record [Hash] event record; nil values may be replaced in place
# @return [String] the formatted line, or '' when any StandardError was
#   raised while formatting
def format(tag, time, record)
  if @remove_prefix # TODO: remove when it's obsoleted
    tag =
      if tag.start_with?(@remove_prefix_actual)
        tag.length > @remove_prefix_actual_length ? tag[@remove_prefix_actual_length..-1] : @default_tag
      elsif tag.start_with?(@remove_prefix)
        tag == @remove_prefix ? @default_tag : tag.sub(@remove_prefix, '')
      else
        tag
      end
  end

  if @null_value # TODO: remove when it's obsoleted
    (record.keys | @null_convert_keys).each do |field|
      record[field] = @null_value if record[field].nil?
    end
  end

  line =
    if @using_formatter_config
      record = inject_values_to_record(tag, time, record)
      @formatter.format(tag, time, record)
    else # TODO: remove when it's obsoleted
      time_part = ''
      time_part = @time_formatter.call(time) + @header_separator if @output_include_time
      tag_part = ''
      tag_part = tag + @header_separator if @output_include_tag
      time_part + tag_part + @formatter.format(tag, time, record)
    end

  if @end_with_newline
    line << "\n" unless line.end_with?("\n")
  end
  line
rescue => e # remove this clause when @suppress_log_broken_string is obsoleted
  log.info "unexpected error while formatting events, ignored", tag: tag, record: record, error: e unless @suppress_log_broken_string
  ''
end

#generate_path(chunk) ⇒ Object



316
317
318
319
320
321
322
323
324
325
326
327
328
# File 'lib/fluent/plugin/out_webhdfs.rb', line 316

# Builds the HDFS path for +chunk+ from the configured path template.
#
# When append is disabled, ${chunk_id} is first replaced with the hex form
# of the chunk's unique id. Buffer placeholders are then expanded from the
# chunk's metadata, the compressor's file extension is appended, and, when
# the path uses them, %{uuid} / %{uuid_flush} are both replaced with one
# freshly generated random UUID.
#
# @param chunk [Object] buffer chunk; must respond to #metadata and, when
#   append is disabled, #unique_id
# @return [String] the resolved HDFS path
def generate_path(chunk)
  template = if @append
               @path
             else
               @path.gsub(CHUNK_ID_PLACE_HOLDER, dump_unique_id_hex(chunk.unique_id))
             end
  # The receiver call was truncated to `chunk.` (a syntax error); the
  # placeholders are expanded from the chunk's metadata.
  hdfs_path = "#{extract_placeholders(template, chunk.metadata)}#{@compressor.ext}"
  if @replace_random_uuid
    uuid_random = SecureRandom.uuid
    hdfs_path = hdfs_path.gsub('%{uuid}', uuid_random).gsub('%{uuid_flush}', uuid_random)
  end
  hdfs_path
end

#is_standby_exception(e) ⇒ Object



258
259
260
# File 'lib/fluent/plugin/out_webhdfs.rb', line 258

# Tells whether +e+ is a WebHDFS::IOError whose message mentions Hadoop's
# StandbyException.
#
# @param e [Exception] error raised while talking to the namenode
# @return [false, MatchData, nil] false for other exception classes;
#   otherwise the result of matching the message (nil when it does not match)
def is_standby_exception(e)
  return false unless e.is_a?(WebHDFS::IOError)

  e.message.match(/org\.apache\.hadoop\.ipc\.StandbyException/)
end

#multi_workers_ready? ⇒ Boolean

Returns:

  • (Boolean)


197
198
199
# File 'lib/fluent/plugin/out_webhdfs.rb', line 197

# Always answers true. Presumably this is the Fluentd plugin hook declaring
# that the plugin may run under multiple workers — confirm against the
# Fluentd plugin API.
#
# @return [Boolean] always true
def multi_workers_ready?
  true
end

#namenode_available(client) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/fluent/plugin/out_webhdfs.rb', line 226

# Probes a namenode by listing the root directory through +client+.
#
# @param client [Object, nil] WebHDFS client to probe; nil counts as
#   unavailable
# @return [Boolean] true when the list request succeeded; false when no
#   client was given or the request raised (a warning is logged in that case)
def namenode_available(client)
  return false unless client

  client.list('/')
  true
rescue => e
  log.warn "webhdfs check request failed. (namenode: #{client.host}:#{client.port}, error: #{e.message})"
  false
end

#namenode_failover ⇒ Object



262
263
264
265
266
267
# File 'lib/fluent/plugin/out_webhdfs.rb', line 262

# Swaps the active and standby clients and logs which namenode is now in
# use. Does nothing when no standby namenode is configured.
#
# @return [Object, nil] the logger call's result, or nil when nothing was done
def namenode_failover
  return unless @standby_namenode

  previous_active = @client
  @client = @client_standby
  @client_standby = previous_active
  log.warn "Namenode failovered, now using #{@client.host}:#{@client.port}."
end

#prepare_client(host, port, username) ⇒ Object



201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/fluent/plugin/out_webhdfs.rb', line 201

# Creates a WebHDFS client for the given endpoint and applies the plugin's
# connection settings to it: httpfs mode, open/read timeouts, and the
# optional retry, SSL and Kerberos settings.
#
# @param host [String] namenode (or httpfs) host
# @param port [Integer] namenode (or httpfs) port
# @param username [String, nil] user name passed to the client
# @return [WebHDFS::Client] the configured client
def prepare_client(host, port, username)
  WebHDFS::Client.new(host, port, username).tap do |hdfs|
    hdfs.httpfs_mode = true if @httpfs
    hdfs.open_timeout = @open_timeout
    hdfs.read_timeout = @read_timeout

    # Retry tuning is only applied when retrying is switched on.
    if @retry_known_errors
      hdfs.retry_known_errors = true
      hdfs.retry_interval = @retry_interval if @retry_interval
      hdfs.retry_times = @retry_times if @retry_times
    end

    if @ssl
      hdfs.ssl = true
      hdfs.ssl_ca_file = @ssl_ca_file if @ssl_ca_file
      hdfs.ssl_verify_mode = @ssl_verify_mode if @ssl_verify_mode
    end

    if @kerberos
      hdfs.kerberos = true
      hdfs.kerberos_keytab = @kerberos_keytab if @kerberos_keytab
    end
  end
end

#send_data(path, data) ⇒ Object



269
270
271
272
273
274
275
276
277
278
279
# File 'lib/fluent/plugin/out_webhdfs.rb', line 269

# Writes +data+ to +path+ on HDFS through the active client.
#
# In append mode the data is appended, and the file is created instead when
# the client reports that it does not exist. Otherwise the file is created
# with the overwrite option set.
#
# @param path [String] destination HDFS path
# @param data [Object] payload handed to the client
# @return [Object] the result of the client call that succeeded
def send_data(path, data)
  return @client.create(path, data, {'overwrite' => 'true'}) unless @append

  begin
    @client.append(path, data)
  rescue WebHDFS::FileNotFoundError
    @client.create(path, data)
  end
end

#start ⇒ Object



241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/fluent/plugin/out_webhdfs.rb', line 241

# Starts the plugin and checks that a namenode can be reached: the primary
# first, then the standby when one is configured. The reachable address is
# logged. When neither answers, a RuntimeError is raised unless
# @ignore_start_check_error is set.
#
# @return [nil]
# @raise [RuntimeError] when no namenode is reachable and the start check
#   is not ignored
def start
  super

  reachable =
    if namenode_available(@client)
      "#{@namenode_host}:#{@namenode_port}"
    elsif @client_standby && namenode_available(@client_standby)
      "#{@standby_namenode_host}:#{@standby_namenode_port}"
    end

  if reachable
    log.info "webhdfs connection confirmed: #{reachable}"
    return
  end

  raise RuntimeError, "webhdfs is not available now." unless @ignore_start_check_error
end

#verify_config_placeholders_in_path!(conf) ⇒ Object



285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# File 'lib/fluent/plugin/out_webhdfs.rb', line 285

# Rewrites deprecated placeholders in conf['path'] and rejects obsolete ones.
#
# - Hostname placeholders are replaced, in place, with conf['hostname'] or
#   the local hostname, and a deprecation warning is logged.
# - Deprecated random-uuid placeholders are replaced, in place, with
#   '%{uuid}', and a deprecation warning is logged.
# - Obsolete uuid placeholders are each logged as an error and then rejected.
#
# The path string is modified with gsub!, so the change is visible through
# +conf+ itself.
#
# @param conf [#has_key?, #[]] plugin configuration
# @return [nil]
# @raise [Fluent::ConfigError] when the path contains an obsolete placeholder
def verify_config_placeholders_in_path!(conf)
  return unless conf.has_key?('path')

  path = conf['path']

  # check path for ${hostname}, %{hostname} and __HOSTNAME__ to warn to use #{Socket.gethostname}
  if HOSTNAME_PLACEHOLDERS_DEPRECATED.any? { |ph| path.include?(ph) }
    log.warn "hostname placeholder is now deprecated. use '\#\{Socket.gethostname\}' instead."
    hostname = conf['hostname'] || Socket.gethostname
    HOSTNAME_PLACEHOLDERS_DEPRECATED.each { |ph| path.gsub!(ph, hostname) }
  end

  if UUID_RANDOM_PLACEHOLDERS_DEPRECATED.any? { |ph| path.include?(ph) }
    log.warn "random uuid placeholders are now deprecated. use %{uuid} (or %{uuid_flush}) instead."
    UUID_RANDOM_PLACEHOLDERS_DEPRECATED.each { |ph| path.gsub!(ph, '%{uuid}') }
  end

  obsoleted = UUID_OTHER_PLACEHOLDERS_OBSOLETED.select { |ph| path.include?(ph) }
  return if obsoleted.empty?

  obsoleted.each do |ph|
    log.error "configuration placeholder #{ph} is now unsupported by webhdfs output plugin."
  end
  # Fully qualified like every other raise in this class; a bare ConfigError
  # is not guaranteed to resolve from here.
  raise Fluent::ConfigError, "there are unsupported placeholders in path."
end

#write(chunk) ⇒ Object



383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# File 'lib/fluent/plugin/out_webhdfs.rb', line 383

# Compresses +chunk+ and sends it to its HDFS path, failing over to the
# standby namenode at most once per call.
#
# On an error, a failover and retry happen only when a standby client
# exists, no failover was done yet in this call, the standby answers the
# availability probe, and either the error is a standby exception or the
# error count is about to reach @failures_before_use_standby. Otherwise the
# error is re-raised.
#
# @param chunk [Object] buffer chunk to write
# @return [String] the HDFS path that was written
# @raise [StandardError] the original error when no failover applies
def write(chunk)
  target = generate_path(chunk)

  switched = false
  begin
    compress_context(chunk) { |payload| send_data(target, payload) }
  rescue => err
    log.warn "failed to communicate hdfs cluster, path: #{target}"

    raise err unless @client_standby && !switched

    # Pick the reason for failing over, if any; the standby is probed once
    # per condition that gets that far.
    notice =
      if is_standby_exception(err) && namenode_available(@client_standby)
        "Seems the connected host status is not active (maybe due to failovers). Gonna try another namenode immediately."
      elsif @num_errors && ((@num_errors + 1) >= @failures_before_use_standby) && namenode_available(@client_standby)
        "Too many failures. Try to use the standby namenode instead."
      end
    raise err unless notice

    log.warn notice
    namenode_failover
    switched = true
    retry
  end
  target
end