Class: Fluent::WebHDFSOutput

Inherits:
TimeSlicedOutput
  • Object
show all
Includes:
Mixin::ConfigPlaceholders, Mixin::PlainTextFormatter
Defined in:
lib/fluent/plugin/webhdfs_compressor_gzip.rb,
lib/fluent/plugin/webhdfs_compressor_text.rb,
lib/fluent/plugin/webhdfs_compressor_bzip2.rb,
lib/fluent/plugin/webhdfs_compressor_snappy.rb,
lib/fluent/plugin/webhdfs_compressor_lzo_command.rb,
lib/fluent/plugin/out_webhdfs.rb

Defined Under Namespace

Classes: Bzip2Compressor, Compressor, GzipCompressor, LZOCommandCompressor, SnappyCompressor, TextCompressor

Constant Summary collapse

# Compression type names; each one corresponds to a webhdfs_compressor_*
# file this class is defined in.
# NOTE(review): where this list is enforced is not visible here — confirm.
SUPPORTED_COMPRESS =
['gzip', 'bzip2', 'snappy', 'lzo_command', 'text']
# Placeholder in the configured path that generate_path replaces with the
# chunk's hex-encoded unique id; configure requires it in the path when
# append is disabled.
CHUNK_ID_PLACE_HOLDER =
'${chunk_id}'
# Registry of compressor classes: filled through register_compressor and
# queried by configure to instantiate the selected compressor.
# NOTE(review): the second argument looks like a load-path prefix used to
# locate compressor files — confirm against Fluent::Registry.
COMPRESSOR_REGISTRY =
Fluent::Registry.new(:webhdfs_compressor_type, 'fluent/plugin/webhdfs_compressor_')

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ WebHDFSOutput

Returns a new instance of WebHDFSOutput.



101
102
103
104
105
106
107
108
# File 'lib/fluent/plugin/out_webhdfs.rb', line 101

# Builds a new output plugin instance.
#
# Runs the parent initializer first, then loads the libraries this plugin
# relies on ('net/http', 'time' and 'webhdfs') at construction time, and
# starts out with no compressor selected.
def initialize
  super
  %w[net/http time webhdfs].each { |library| require library }

  @compressor = nil
end

Instance Attribute Details

#compressor ⇒ Object (readonly)

Returns the value of attribute compressor.



99
100
101
# File 'lib/fluent/plugin/out_webhdfs.rb', line 99

# Reader for the compressor instance held in @compressor.
# The constructor sets it to nil; configure assigns the real instance.
#
# @return [Object, nil] the current compressor, or nil before configure
def compressor
  @compressor
end

Class Method Details

.register_compressor(name, compressor) ⇒ Object



354
355
356
# File 'lib/fluent/plugin/out_webhdfs.rb', line 354

# Registers a compressor class under +name+ in COMPRESSOR_REGISTRY, from
# which configure later looks it up and instantiates it with .new.
#
# @param name [Object] key to register under (passed straight to the registry)
# @param compressor [Class] compressor class to make available
# @return [Object] whatever COMPRESSOR_REGISTRY.register returns
def self.register_compressor(name, compressor)
  COMPRESSOR_REGISTRY.register(name, compressor)
end

Instance Method Details

#chunk_unique_id_to_str(unique_id) ⇒ Object



252
253
254
# File 'lib/fluent/plugin/out_webhdfs.rb', line 252

# Renders a chunk's binary unique id as a lowercase hexadecimal string,
# two digits per byte (e.g. "\x00\xff" becomes "00ff").
#
# @param unique_id [String] raw bytes of the chunk id
# @return [String] hexadecimal representation; empty for an empty id
def chunk_unique_id_to_str(unique_id)
  # 'H*' emits every byte as two hex digits, high nibble first.
  unique_id.unpack('H*').first
end

#compress_context(chunk, &block) ⇒ Object



280
281
282
283
284
285
286
287
288
289
# File 'lib/fluent/plugin/out_webhdfs.rb', line 280

# Compresses a chunk into a temporary file and hands that file to the block.
#
# The chunk is written through @compressor into a "webhdfs-" tempfile, which
# is rewound before being yielded so the block reads it from the start. The
# tempfile is always closed and unlinked afterwards, whether or not the
# compressor or the block raised.
#
# @param chunk [Object] buffer chunk, passed as-is to @compressor.compress
# @yieldparam tmp [Tempfile] rewound, binary-mode tempfile with the compressor output
# @return [Object] whatever the block returns
def compress_context(chunk, &block)
  tmp = nil
  begin
    tmp = Tempfile.new("webhdfs-")
    # Compressor output is raw bytes: binary mode keeps newline/encoding
    # translation from altering it on write or on read-back.
    tmp.binmode
    @compressor.compress(chunk, tmp)
    tmp.rewind
    yield tmp
  ensure
    # tmp is still nil when Tempfile.new itself failed.
    if tmp
      begin
        tmp.close(true)
      rescue StandardError
        # Best-effort cleanup: never mask the error that got us here.
        nil
      end
    end
  end
end

#configure(conf) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/fluent/plugin/out_webhdfs.rb', line 115

# Validates the configuration and builds the WebHDFS client(s).
#
# Before delegating to the parent class, conf['time_slice_format'] is set
# from the finest strftime placeholder found in conf['path'] (%S, then %M,
# then %H); it is left untouched when none of them is present.
#
# After the parent class has processed conf, this method:
# * instantiates the compressor registered for @compress ('text' when unset),
#   falling back to 'text' with a warning on any error other than
#   Fluent::ConfigError;
# * resolves the namenode from @host/@port, or by parsing @namenode (HOST:PORT);
# * parses the optional @standby_namenode (HOST:PORT), which cannot be
#   combined with @httpfs;
# * requires @path to start with '/';
# * creates @client and, when a standby is configured, @client_standby;
# * requires the ${chunk_id} placeholder in @path when @append is off.
#
# @param conf [Object] configuration element, read and written by string key
# @raise [Fluent::ConfigError] when the namenode settings are missing or
#   malformed, standby_namenode is combined with httpfs, the path is not
#   absolute, or the chunk id placeholder is missing in non-append mode
def configure(conf)
  # Must happen before super so the parent sees the derived slice format.
  if conf['path']
    if conf['path'].index('%S')
      conf['time_slice_format'] = '%Y%m%d%H%M%S'
    elsif conf['path'].index('%M')
      conf['time_slice_format'] = '%Y%m%d%H%M'
    elsif conf['path'].index('%H')
      conf['time_slice_format'] = '%Y%m%d%H'
    end
  end

  super

  begin
    @compressor = COMPRESSOR_REGISTRY.lookup(@compress || 'text').new
  rescue Fluent::ConfigError
    raise
  rescue
    # Report the compressor that was actually requested (was a misspelled,
    # always-nil instance variable, so the name never appeared in the log).
    $log.warn "#{@compress} not found. Use 'text' instead"
    @compressor = COMPRESSOR_REGISTRY.lookup('text').new
  end

  if @host
    @namenode_host = @host
    @namenode_port = @port
  elsif @namenode
    unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
      raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_HOST:PORT"
    end
    # $1/$2 are the host and port captured by the match above.
    @namenode_host = $1
    @namenode_port = $2.to_i
  else
    raise Fluent::ConfigError, "WebHDFS host or namenode missing"
  end
  if @standby_namenode
    unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @standby_namenode
      raise Fluent::ConfigError, "Invalid config value about standby namenode: '#{@standby_namenode}', needs STANDBY_NAMENODE_HOST:PORT"
    end
    if @httpfs
      raise Fluent::ConfigError, "Invalid configuration: specified to use both of standby_namenode and httpfs."
    end
    # No other regexp ran since the standby match, so $1/$2 still hold it.
    @standby_namenode_host = $1
    @standby_namenode_port = $2.to_i
  end
  unless @path.start_with?('/')
    raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
  end

  @client = prepare_client(@namenode_host, @namenode_port, @username)
  if @standby_namenode_host
    @client_standby = prepare_client(@standby_namenode_host, @standby_namenode_port, @username)
  else
    @client_standby = nil
  end

  # Without append every chunk is written to its own file, so the path has
  # to carry the chunk id placeholder.
  if !@append && !@path.include?(CHUNK_ID_PLACE_HOLDER)
    raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false."
  end
end

#desc(description) ⇒ Object



17
18
# File 'lib/fluent/plugin/out_webhdfs.rb', line 17

# No-op: accepts a description and does nothing with it (returns nil).
# NOTE(review): presumably a fallback so that `desc` calls are harmless when
# the host framework does not provide one — confirm against the original file.
#
# @param description [Object] ignored
# @return [nil]
def desc(description)
end

#generate_path(chunk) ⇒ Object



270
271
272
273
274
275
276
277
278
# File 'lib/fluent/plugin/out_webhdfs.rb', line 270

# Builds the HDFS destination path for a chunk.
#
# The configured path is expanded for the chunk's time key. In non-append
# mode the chunk id placeholder is additionally replaced by the chunk's
# hex-encoded unique id. The compressor's file extension is appended last.
#
# @param chunk [#key, #unique_id] buffer chunk being written
# @return [String] destination path including the compressor extension
def generate_path(chunk)
  formatted = path_format(chunk.key)
  unless @append
    chunk_id = chunk_unique_id_to_str(chunk.unique_id)
    formatted = formatted.gsub(CHUNK_ID_PLACE_HOLDER, chunk_id)
  end
  "#{formatted}#{@compressor.ext}"
end

#is_standby_exception(e) ⇒ Object



241
242
243
# File 'lib/fluent/plugin/out_webhdfs.rb', line 241

# Tells whether an error is a WebHDFS IO error whose message mentions
# Hadoop's StandbyException.
#
# @param e [Exception] error raised while talking to the cluster
# @return [MatchData, nil, false] truthy match when it is a standby error;
#   false for other error classes, nil when the message does not match
def is_standby_exception(e)
  return false unless e.is_a?(WebHDFS::IOError)

  e.message.match(/org\.apache\.hadoop\.ipc\.StandbyException/)
end

#namenode_available(client) ⇒ Object



201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/fluent/plugin/out_webhdfs.rb', line 201

# Probes a namenode by listing the root directory through the given client.
#
# @param client [#list, #host, #port, nil] WebHDFS client to probe
# @return [Boolean] true when the listing succeeded; false when no client
#   was given or the request raised (the failure is logged as a warning)
def namenode_available(client)
  return false unless client

  client.list('/')
  true
rescue => e
  log.warn "webhdfs check request failed. (namenode: #{client.host}:#{client.port}, error: #{e.message})"
  false
end

#namenode_failover ⇒ Object



245
246
247
248
249
250
# File 'lib/fluent/plugin/out_webhdfs.rb', line 245

# Swaps the active and standby clients and logs which namenode is now in
# use. Does nothing when no standby namenode is configured.
#
# @return [Object, nil] result of the log call, or nil when nothing was done
def namenode_failover
  return unless @standby_namenode

  @client, @client_standby = @client_standby, @client
  log.warn "Namenode failovered, now using #{@client.host}:#{@client.port}."
end

#path_format(chunk_key) ⇒ Object



237
238
239
# File 'lib/fluent/plugin/out_webhdfs.rb', line 237

# Expands the configured path for a chunk's time key.
#
# The key is parsed with @time_slice_format and the resulting time is
# formatted with @path as the strftime pattern.
#
# @param chunk_key [String] time slice key of the chunk
# @return [String] @path with its strftime placeholders filled in
def path_format(chunk_key)
  slice_time = Time.strptime(chunk_key, @time_slice_format)
  slice_time.strftime(@path)
end

#prepare_client(host, port, username) ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/fluent/plugin/out_webhdfs.rb', line 177

# Creates a WebHDFS client for the given endpoint and applies the plugin's
# connection settings to it.
#
# Timeouts are always applied. HttpFs mode, retry behaviour, SSL options and
# Kerberos are only touched when the corresponding setting is enabled, and
# optional sub-settings only when they have a value.
#
# @param host [String] namenode (or HttpFs) host
# @param port [Integer] namenode (or HttpFs) port
# @param username [String, nil] user name handed to the client
# @return [WebHDFS::Client] the configured client
def prepare_client(host, port, username)
  WebHDFS::Client.new(host, port, username).tap do |hdfs|
    hdfs.httpfs_mode = true if @httpfs
    hdfs.open_timeout = @open_timeout
    hdfs.read_timeout = @read_timeout

    if @retry_known_errors
      hdfs.retry_known_errors = true
      hdfs.retry_interval = @retry_interval if @retry_interval
      hdfs.retry_times = @retry_times if @retry_times
    end

    if @ssl
      hdfs.ssl = true
      hdfs.ssl_ca_file = @ssl_ca_file if @ssl_ca_file
      hdfs.ssl_verify_mode = @ssl_verify_mode if @ssl_verify_mode
    end

    hdfs.kerberos = true if @kerberos
  end
end

#send_data(path, data) ⇒ Object

TODO: check for conflicts



258
259
260
261
262
263
264
265
266
267
268
# File 'lib/fluent/plugin/out_webhdfs.rb', line 258

# Writes data to the given HDFS path through the active client.
#
# In append mode the data is appended, and the file is created instead when
# the append reports that it does not exist. Otherwise the file is created
# with overwrite enabled.
#
# @param path [String] destination path on HDFS
# @param data [Object] payload handed to the client as-is
# @return [Object] result of the client call that completed
def send_data(path, data)
  return @client.create(path, data, {'overwrite' => 'true'}) unless @append

  begin
    @client.append(path, data)
  rescue WebHDFS::FileNotFoundError
    @client.create(path, data)
  end
end

#shutdown ⇒ Object



233
234
235
# File 'lib/fluent/plugin/out_webhdfs.rb', line 233

# Shuts the plugin down. Adds nothing of its own: it only defers to the
# parent class implementation.
def shutdown
  super
end

#start ⇒ Object



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/fluent/plugin/out_webhdfs.rb', line 216

# Starts the plugin and verifies that a namenode is reachable.
#
# After the parent class has started, the active namenode is probed first
# and the standby (when configured) second; the first one that answers is
# logged. When neither answers, startup fails unless
# @ignore_start_check_error is set.
#
# @return [nil]
# @raise [RuntimeError] when no namenode is reachable and the check is not ignored
def start
  super

  if namenode_available(@client)
    log.info "webhdfs connection confirmed: #{@namenode_host}:#{@namenode_port}"
  elsif @client_standby && namenode_available(@client_standby)
    log.info "webhdfs connection confirmed: #{@standby_namenode_host}:#{@standby_namenode_port}"
  elsif !@ignore_start_check_error
    raise RuntimeError, "webhdfs is not available now."
  end

  nil
end

#write(chunk) ⇒ Object



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/fluent/plugin/out_webhdfs.rb', line 291

# Compresses a chunk and sends it to HDFS, failing over to the standby
# namenode at most once per call.
#
# On any error a warning is logged. The error is re-raised immediately when
# no standby client exists or a failover already happened during this call.
# Otherwise a failover followed by a retry is attempted when the standby
# answers a probe and either
# * the error is a standby exception, or
# * @num_errors + 1 has reached @failures_before_use_standby.
# NOTE(review): @num_errors is not assigned in this class; presumably it is
# maintained by the parent output class — confirm.
#
# @param chunk [Object] buffer chunk to write
# @return [String] the HDFS path the chunk was written to
# @raise [StandardError] the original error when no failover is possible
def write(chunk)
  hdfs_path = generate_path(chunk)
  switched = false

  begin
    compress_context(chunk) { |data| send_data(hdfs_path, data) }
  rescue => e
    log.warn "failed to communicate hdfs cluster, path: #{hdfs_path}"

    raise e unless @client_standby
    raise e if switched

    # Pick the reason for failing over; the conditions are evaluated in this
    # order and each one probes the standby only when its own trigger holds.
    reason =
      if is_standby_exception(e) && namenode_available(@client_standby)
        "Seems the connected host status is not active (maybe due to failovers). Gonna try another namenode immediately."
      elsif @num_errors && ((@num_errors + 1) >= @failures_before_use_standby) && namenode_available(@client_standby)
        "Too many failures. Try to use the standby namenode instead."
      end
    raise e unless reason

    log.warn reason
    namenode_failover
    switched = true
    retry
  end

  hdfs_path
end