Class: Fluent::WebHDFSOutput

Inherits:
TimeSlicedOutput
  • Object
show all
Includes:
Mixin::ConfigPlaceholders, Mixin::PlainTextFormatter
Defined in:
lib/fluent/plugin/out_webhdfs.rb

Constant Summary collapse

CHUNK_ID_PLACE_HOLDER =
'${chunk_id}'

Instance Method Summary collapse

Constructor Details

#initialize ⇒ WebHDFSOutput

Returns a new instance of WebHDFSOutput.



46
47
48
49
50
51
# File 'lib/fluent/plugin/out_webhdfs.rb', line 46

# Sets up the plugin instance via the parent constructor, then loads the
# libraries this plugin depends on. The requires happen at construction time
# rather than at file load time.
def initialize
  super
  # Same load order as before: net/http, time, then the webhdfs client gem.
  %w[net/http time webhdfs].each { |library| require library }
end

Instance Method Details

#chunk_unique_id_to_str(unique_id) ⇒ Object



173
174
175
# File 'lib/fluent/plugin/out_webhdfs.rb', line 173

# Renders a chunk's unique id as lowercase hexadecimal text, using exactly
# two digits for every byte of the input.
#
# @param unique_id [String] the raw bytes of the chunk id
# @return [String] hexadecimal representation of those bytes
def chunk_unique_id_to_str(unique_id)
  hex_pairs = unique_id.each_byte.map { |byte| format('%02x', byte) }
  hex_pairs.join('')
end

#configure(conf) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/fluent/plugin/out_webhdfs.rb', line 53

# Validates the plugin configuration and prepares the WebHDFS client(s).
#
# Before handing over to the parent class, conf['time_slice_format'] is set
# from the finest time placeholder (%S, then %M, then %H) found in
# conf['path'], if any. After the parent has run, the namenode address is
# taken from @host/@port or parsed from the "HOST:PORT" string in @namenode,
# the optional @standby_namenode is parsed the same way, @path is checked,
# and one client per namenode is built with prepare_client.
#
# @param conf [Object] configuration object, read and written by string key
# @raise [Fluent::ConfigError] when neither host nor namenode is given, when
#   a namenode string is not HOST:PORT, when standby_namenode is combined
#   with httpfs, when the path does not start with '/', or when append is
#   off and the path lacks the ${chunk_id} placeholder
def configure(conf)
  configured_path = conf['path']
  if configured_path
    # Ordered from finest to coarsest; the first placeholder present wins.
    slice_formats = [
      ['%S', '%Y%m%d%H%M%S'],
      ['%M', '%Y%m%d%H%M'],
      ['%H', '%Y%m%d%H']
    ]
    chosen = slice_formats.find { |placeholder, _format| configured_path.include?(placeholder) }
    conf['time_slice_format'] = chosen.last if chosen
  end

  super

  address_pattern = /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/

  if @host
    @namenode_host = @host
    @namenode_port = @port
  elsif @namenode
    primary = address_pattern.match(@namenode)
    unless primary
      raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_HOST:PORT"
    end
    @namenode_host = primary[1]
    @namenode_port = primary[2].to_i
  else
    raise Fluent::ConfigError, "WebHDFS host or namenode missing"
  end

  if @standby_namenode
    standby = address_pattern.match(@standby_namenode)
    # The format check comes first so its error takes precedence over the
    # httpfs conflict below.
    unless standby
      raise Fluent::ConfigError, "Invalid config value about standby namenode: '#{@standby_namenode}', needs STANDBY_NAMENODE_HOST:PORT"
    end
    if @httpfs
      raise Fluent::ConfigError, "Invalid configuration: specified to use both of standby_namenode and httpfs."
    end
    @standby_namenode_host = standby[1]
    @standby_namenode_port = standby[2].to_i
  end

  unless @path.start_with?('/')
    raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
  end

  @client = prepare_client(@namenode_host, @namenode_port, @username)
  @client_standby =
    if @standby_namenode_host
      prepare_client(@standby_namenode_host, @standby_namenode_port, @username)
    else
      nil
    end

  # Without append, the chunk id placeholder is mandatory in the path.
  if !@append && !@path.include?(CHUNK_ID_PLACE_HOLDER)
    raise Fluent::ConfigError, "path must contain ${chunk_id}, which is the placeholder for chunk_id, when append is set to false."
  end
end

#is_standby_exception(e) ⇒ Object



162
163
164
# File 'lib/fluent/plugin/out_webhdfs.rb', line 162

# Tells whether an error is a WebHDFS IO error whose message mentions
# Hadoop's StandbyException.
#
# @param e [Exception] the error to inspect
# @return [MatchData, nil, false] false when e is not a WebHDFS::IOError;
#   otherwise the result of matching the message (nil when it does not match)
def is_standby_exception(e)
  return false unless e.is_a?(WebHDFS::IOError)

  e.message.match(/org\.apache\.hadoop\.ipc\.StandbyException/)
end

#namenode_available(client) ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/fluent/plugin/out_webhdfs.rb', line 122

# Probes a namenode by listing the root directory through the given client.
#
# @param client [Object, nil] client to probe; must respond to list, host
#   and port
# @return [Boolean] true when the listing succeeds; false when no client is
#   given or the listing raises (a warning is logged in the latter case)
def namenode_available(client)
  return false unless client

  client.list('/')
  true
rescue => e
  $log.warn "webhdfs check request failed. (namenode: #{client.host}:#{client.port}, error: #{e.message})"
  false
end

#namenode_failover ⇒ Object



166
167
168
169
170
171
# File 'lib/fluent/plugin/out_webhdfs.rb', line 166

# Swaps the active and standby clients and logs which namenode is now in
# use. Does nothing when no standby namenode is configured.
def namenode_failover
  return unless @standby_namenode

  previously_active = @client
  @client = @client_standby
  @client_standby = previously_active
  $log.warn "Namenode failovered, now using #{@client.host}:#{@client.port}."
end

#path_format(chunk_key) ⇒ Object



158
159
160
# File 'lib/fluent/plugin/out_webhdfs.rb', line 158

# Builds the target path for a chunk: the chunk key is parsed as a time
# using @time_slice_format, and that time is then formatted with @path as
# the strftime template.
#
# @param chunk_key [String] time-slice key of the chunk
# @return [String] @path with its time placeholders filled in
def path_format(chunk_key)
  slice_time = Time.strptime(chunk_key, @time_slice_format)
  slice_time.strftime(@path)
end

#prepare_client(host, port, username) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/fluent/plugin/out_webhdfs.rb', line 106

# Creates a WebHDFS client for one namenode and applies this plugin's
# connection settings to it.
#
# httpfs mode is switched on only when @httpfs is set. The open/read
# timeouts are always assigned. Retry settings are touched only when
# @retry_known_errors is set, and the interval/count only when configured.
#
# @param host [String] namenode host
# @param port [Integer] namenode port
# @param username [String, nil] user name passed to the client
# @return [WebHDFS::Client] the configured client
def prepare_client(host, port, username)
  WebHDFS::Client.new(host, port, username).tap do |hdfs|
    hdfs.httpfs_mode = true if @httpfs
    hdfs.open_timeout = @open_timeout
    hdfs.read_timeout = @read_timeout
    if @retry_known_errors
      hdfs.retry_known_errors = true
      hdfs.retry_interval = @retry_interval if @retry_interval
      hdfs.retry_times = @retry_times if @retry_times
    end
  end
end

#send_data(path, data) ⇒ Object

TODO: check for conflicts



179
180
181
182
183
184
185
186
187
188
189
# File 'lib/fluent/plugin/out_webhdfs.rb', line 179

# Sends data to the given HDFS path through the active client.
#
# With @append off, the file is created with the overwrite option. With
# @append on, the data is appended, and if the client reports that the file
# was not found, the file is created instead.
#
# @param path [String] destination path
# @param data [String] payload to write
# @return [Object] whatever the client call that completed returns
def send_data(path, data)
  unless @append
    return @client.create(path, data, {'overwrite' => 'true'})
  end

  # The rescue covers only the append call, not the overwrite branch above.
  begin
    @client.append(path, data)
  rescue WebHDFS::FileNotFoundError
    @client.create(path, data)
  end
end

#shutdown ⇒ Object



154
155
156
# File 'lib/fluent/plugin/out_webhdfs.rb', line 154

# Shuts the plugin down. Nothing plugin-specific happens here; the call is
# simply passed on to the parent class.
def shutdown
  super
end

#start ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/fluent/plugin/out_webhdfs.rb', line 137

# Starts the plugin and checks that a namenode can be reached.
#
# After the parent's start, the primary client is probed first; the standby
# client is probed only when it exists and the primary check failed. A
# confirmation is logged for whichever one responds.
#
# @return [nil]
# @raise [RuntimeError] when no namenode responds and
#   @ignore_start_check_error is not set
def start
  super

  if namenode_available(@client)
    $log.info "webhdfs connection confirmed: #{@namenode_host}:#{@namenode_port}"
  elsif @client_standby && namenode_available(@client_standby)
    $log.info "webhdfs connection confirmed: #{@standby_namenode_host}:#{@standby_namenode_port}"
  elsif !@ignore_start_check_error
    raise RuntimeError, "webhdfs is not available now."
  end

  # Keep the return value independent of what the logger returns.
  nil
end

#write(chunk) ⇒ Object



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/fluent/plugin/out_webhdfs.rb', line 191

# Writes one buffer chunk to HDFS, switching to the standby namenode at most
# once when the write fails.
#
# The destination comes from path_format(chunk.key); when @append is off the
# ${chunk_id} placeholder in it is replaced by the chunk's unique id in hex.
# On failure a warning is logged and the error is re-raised unless a standby
# client exists and no failover has happened yet for this call. A failover
# is attempted when the error is a standby exception, or when the recorded
# error count plus this failure reaches @failures_before_use_standby — in
# both cases only if the standby namenode answers the availability probe.
#
# @param chunk [Object] buffer chunk; must respond to key, unique_id and read
# @return [String] the HDFS path that was written
# @raise [StandardError] the original write error when no failover applies
def write(chunk)
  hdfs_path = path_format(chunk.key)
  unless @append
    hdfs_path = hdfs_path.gsub(CHUNK_ID_PLACE_HOLDER, chunk_unique_id_to_str(chunk.unique_id))
  end

  # Set outside the begin block so it survives `retry` and bounds it to one.
  failovered = false
  begin
    send_data(hdfs_path, chunk.read)
  rescue => e
    $log.warn "failed to communicate hdfs cluster, path: #{hdfs_path}"

    raise e if !@client_standby || failovered

    failover_notice =
      if is_standby_exception(e) && namenode_available(@client_standby)
        "Seems the connected host status is not active (maybe due to failovers). Gonna try another namenode immediately."
      elsif ((@error_history.size + 1) >= @failures_before_use_standby) && namenode_available(@client_standby)
        "Too many failures. Try to use the standby namenode instead."
      end
    raise e unless failover_notice

    $log.warn failover_notice
    namenode_failover
    failovered = true
    retry
  end
  hdfs_path
end