Class: Keep::Manifest

Inherits:
Object
  • Object
show all
Defined in:
lib/arvados/keep.rb

Constant Summary collapse

# Stream name token: a literal "." optionally followed by one or more
# "/segment" parts, where a segment contains no "/" and no whitespace.
STRICT_STREAM_TOKEN_REGEXP =
/^(\.)(\/[^\/\s]+)*$/
# File token: "<digits>:<digits>:<path>".  The path (capture group 1) is one
# or more segments separated by single "/" characters; segments contain no
# "/" and no whitespace.
STRICT_FILE_TOKEN_REGEXP =
/^[[:digit:]]+:[[:digit:]]+:([^\s\/]+(\/[^\s\/]+)*)$/

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(manifest_text) ⇒ Manifest

Class to parse a manifest text and provide common views of that data.



108
109
110
111
# File 'lib/arvados/keep.rb', line 108

# Wrap +manifest_text+ for later parsing.  Nothing is parsed here: the text
# is only stored, and the @files cache starts out as nil.
def initialize(manifest_text)
  @files = nil
  @text = manifest_text
end

Class Method Details

.valid?(manifest) ⇒ Boolean

Returns:

  • (Boolean)


284
285
286
287
288
289
290
291
# File 'lib/arvados/keep.rb', line 284

# Non-raising counterpart of validate!: returns true when validate! accepts
# +manifest+ and false when it raises ArgumentError.  Any other exception
# raised by validate! propagates to the caller.
def self.valid? manifest
  validate! manifest
  true
rescue ArgumentError
  false
end

.validate!(manifest) ⇒ Object

Verify that a given manifest is valid according to arvados.org/projects/arvados/wiki/Keep_manifest_format



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/arvados/keep.rb', line 237

# Check +manifest+ against the Keep manifest format
# (arvados.org/projects/arvados/wiki/Keep_manifest_format).
#
# Each line is tokenized on single spaces and must contain, in order: one
# stream name, at least one block locator, and at least one file token,
# with no trailing space.  A stream name may not contain a "/." or "/.."
# path component, and a file path may not contain a "." or ".." component.
#
# Returns true for a valid manifest (the empty string is valid).
# Raises ArgumentError describing the first problem found otherwise.
def self.validate! manifest
  raise ArgumentError, "No manifest found" unless manifest
  return true if manifest.empty?
  unless manifest.end_with?("\n")
    raise ArgumentError, "Invalid manifest: does not end with newline"
  end

  manifest.each_line.with_index(1) do |line, stream_number|
    # Drop the line terminator, then split on single spaces.
    tokens = line[0..-2].split(/ /)
    if tokens.empty?
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: missing stream name"
    end

    # First token: the stream name.
    stream_token = tokens.shift
    unless stream_token =~ STRICT_STREAM_TOKEN_REGEXP && stream_token !~ /\/\.\.?(\/|$)/
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: missing or invalid stream name #{stream_token.inspect if stream_token}"
    end

    # Next: a run of one or more block locators.
    after_locators = tokens.drop_while { |tok| tok =~ Locator::LOCATOR_REGEXP }
    if after_locators.size == tokens.size
      offender = after_locators.first
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: missing or invalid locator #{offender.inspect if offender}"
    end

    # Finally: a run of one or more file tokens reaching the end of the line.
    leftovers = after_locators.drop_while do |tok|
      found = STRICT_FILE_TOKEN_REGEXP.match(tok)
      found && (found[1].split('/') & ['..', '.']).empty?
    end

    if !leftovers.empty?
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: invalid file token #{leftovers.first.inspect}"
    elsif after_locators.empty?
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: no file tokens"
    end

    # split() silently drops trailing empty tokens (which the manifest
    # format does not allow), so a trailing space has to be detected on the
    # raw line.
    if line.end_with? " \n"
      raise ArgumentError, "Manifest invalid for stream #{stream_number}: trailing space"
    end
  end
  true
end

Instance Method Details

#each_file_spec ⇒ Object



154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/arvados/keep.rb', line 154

# Yield one [stream_name, start_pos, file_size, file_name] array per file
# token in the manifest text.
#
# The first token of each line is the (unescaped) stream name.  Following
# tokens are skipped while Locator.valid? accepts them; from the first
# token it rejects onward, every token is parsed with split_file_token.
# When a file name contains "/", its directory part is appended to the
# stream name and only the final component is yielded as the file name.
#
# Returns an Enumerator when called without a block, true otherwise.
def each_file_spec
  return to_enum(__method__) unless block_given?
  @text.each_line do |manifest_line|
    stream = nil
    past_locators = false
    manifest_line.scan(/\S+/) do |tok|
      if stream.nil?
        stream = unescape tok
        next
      end
      next unless past_locators || !Locator.valid?(tok)
      past_locators = true

      start_pos, size, path = split_file_token(tok)
      # rpartition yields ["", "", path] when there is no '/', so the
      # stream name is left unchanged in that case.
      dir, slash, basename = path.rpartition('/')
      yield [stream + slash + dir, start_pos, size, basename]
    end
  end
  true
end

#each_line ⇒ Object



113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/arvados/keep.rb', line 113

# Yield [stream_name, block_tokens, file_tokens] for every non-blank line
# of the manifest text.
#
# The stream name is the unescaped first token.  block_tokens holds the
# leading run of following tokens accepted by Locator.valid? (left as-is);
# file_tokens holds every remaining token, unescaped.
#
# Returns an Enumerator when called without a block.
def each_line
  return to_enum(__method__) unless block_given?
  @text.each_line do |raw_line|
    words = raw_line.scan(/\S+/)
    # Ignore blank lines
    next if words.empty?

    stream = unescape(words.shift)
    locators = words.take_while { |word| Locator.valid?(word) }
    file_words = words.drop(locators.size).map { |word| unescape(word) }
    yield [stream, locators, file_words]
  end
end

#exact_file_count?(want_count) ⇒ Boolean

Returns:

  • (Boolean)


215
216
217
# File 'lib/arvados/keep.rb', line 215

# True when the manifest holds exactly +want_count+ files.  Counting is
# capped at want_count + 1, which is enough to tell "exactly" apart from
# "more than".
def exact_file_count?(want_count)
  counting_limit = want_count + 1
  counted = files_count(counting_limit)
  counted == want_count
end

#files ⇒ Object



180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/arvados/keep.rb', line 180

# Return an array of [stream_name, file_name, total_size] entries, one per
# distinct (stream, file) pair reported by each_file_spec, with the sizes
# of all of that file's segments added together.  The result is cached in
# @files, so the manifest is only walked on the first call.
def files
  return @files unless @files.nil?

  totals = Hash.new(0)
  each_file_spec do |stream, _start_pos, segment_size, name|
    totals[[stream, name]] += segment_size
  end
  @files = totals.map { |(stream, name), total| [stream, name, total] }
  @files
end

#files_count(stop_after = nil) ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/arvados/keep.rb', line 193

# Return the number of files represented in this manifest.
#
# With no +stop_after+, or once the #files result is already cached, this
# is simply files.size.  Otherwise the manifest is read incrementally and
# +stop_after+ is returned as soon as that many distinct files have been
# seen, which avoids parsing the whole manifest when only a small count
# needs to be checked.
def files_count(stop_after=nil)
  return files.size if stop_after.nil? || !@files.nil?

  distinct = {}
  each_file_spec do |stream, _start_pos, _size, name|
    distinct[[stream, name]] = true
    return stop_after if distinct.size >= stop_after
  end
  distinct.size
end

#files_size ⇒ Object



210
211
212
213
# File 'lib/arvados/keep.rb', line 210

# Return the total size of all files in this manifest, i.e. the sum of the
# size field of every entry returned by #files (0 when there are none).
def files_size
  sizes = files.map { |_stream, _name, size| size }
  sizes.reduce(0, :+)
end

#has_file?(want_stream, want_file = nil) ⇒ Boolean

Returns:

  • (Boolean)


223
224
225
226
227
228
229
230
231
232
233
# File 'lib/arvados/keep.rb', line 223

# Report whether the manifest contains a given file.
#
# Call with a stream name and a file name, or with a single path, which is
# divided into stream and file name by File.split.  The manifest is
# scanned with each_file_spec and the search stops at the first match.
def has_file?(want_stream, want_file=nil)
  want_stream, want_file = File.split(want_stream) if want_file.nil?
  each_file_spec do |stream, _start_pos, _size, filename|
    return true if stream == want_stream && filename == want_file
  end
  false
end

#minimum_file_count?(want_count) ⇒ Boolean

Returns:

  • (Boolean)


219
220
221
# File 'lib/arvados/keep.rb', line 219

# True when the manifest holds at least +want_count+ files.  Counting is
# capped at +want_count+, so the manifest need not be read past that point.
def minimum_file_count?(want_count)
  counted = files_count(want_count)
  counted >= want_count
end

#split_file_token(token) ⇒ Object



146
147
148
149
150
151
152
# File 'lib/arvados/keep.rb', line 146

# Break a "start:size:name" file token into
# [start_pos (Integer), file_size (Integer), unescaped file name].
# Only the first two colons separate fields, so the name may itself contain
# colons.  The numeric fields are converted with to_i.
#
# Raises ArgumentError when the token has fewer than three fields.
def split_file_token token
  fields = token.split(':', 3)
  raise ArgumentError.new "Invalid file token '#{token}'" if fields.size < 3

  position, length, raw_name = fields
  [position.to_i, length.to_i, unescape(raw_name)]
end

#unescape(s) ⇒ Object



134
135
136
137
138
139
140
141
142
143
144
# File 'lib/arvados/keep.rb', line 134

# Parse backslash escapes in a Keep manifest stream or file name.
#
# Two escape forms are decoded: a doubled backslash becomes a single
# backslash, and a backslash followed by three octal digits in the range
# 000-377 becomes the byte with that value.  Anything else is copied
# through unchanged.
#
# The octal form is limited to a leading digit of 0-3 because values above
# \377 do not fit in one byte: Integer#chr raises RangeError for them, so a
# malformed name such as "a\777b" would otherwise abort parsing with an
# unrelated exception.  Such sequences are now left as literal text.
#
# Returns a new string; +s+ is not modified.
def unescape(s)
  s.gsub(/\\(\\|[0-3][0-7]{2})/) do
    escaped = Regexp.last_match(1)
    escaped == '\\' ? '\\' : escaped.to_i(8).chr
  end
end