Class: Dap::Input::InputWARC

Inherits:
Object
  • Object
show all
Includes:
FileSource
Defined in:
lib/dap/input/warc.rb

Overview

Input source that reads records from a WARC (Web ARChive) file.

Instance Attribute Summary collapse

Attributes included from FileSource

#fd

Instance Method Summary collapse

Methods included from FileSource

#close, #open

Constructor Details

#initialize(args) ⇒ InputWARC

Returns a new instance of InputWARC.



13
14
15
16
# File 'lib/dap/input/warc.rb', line 13

# Builds the input: hands the first element of +args+ to #open (mixed in from
# FileSource) and then reads the leading WARC header record.
#
# @param args [Array] only the first element is used here
def initialize(args)
  source = args.first
  self.open(source)
  read_warc_header
end

Instance Attribute Details

#header ⇒ Object

Returns the value of attribute header.



11
12
13
# File 'lib/dap/input/warc.rb', line 11

# Returns @header. read_warc_header stores the first record returned by
# read_record through the header= writer (presumably backed by this same
# instance variable — the writer is not visible here).
def header
  @header
end

#info ⇒ Object

Returns the value of attribute info.



11
12
13
# File 'lib/dap/input/warc.rb', line 11

# Returns @info. read_warc_header fills the info hash with the "key: value"
# lines found in the header record's content (presumably backed by this same
# instance variable — the writer is not visible here).
def info
  @info
end

Instance Method Details

#read_record ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/dap/input/warc.rb', line 37

# Reads the next WARC record from the underlying file descriptor.
#
# A record is parsed as: a "WARC/x.y" version line, a run of "Name: value"
# header lines ended by a blank line, Content-Length bytes of payload, and
# two trailing lines of which the second must be blank.
#
# @return [Hash, Object] the header fields keyed by lowercased name with '-'
#   replaced by '_', plus the payload under 'content'; or Error::EOF when the
#   input is exhausted or the record is malformed (bad version line, missing
#   or negative Content-Length, non-blank trailer)
def read_record
  version = self.fd.readline
  return Error::EOF unless version && version =~ /^WARC\/\d+\.\d+/

  warc = {}

  # Header block: collect fields until the blank separator line.
  loop do
    line = self.fd.readline.strip
    break if line.empty?

    k, v = line.split(/\s*:\s*/, 2)
    warc[k.downcase.gsub('-', '_')] = v.to_s
  end

  return Error::EOF unless warc['content_length']

  # A negative length would make #read raise ArgumentError; treat it as a
  # malformed record like every other parse failure in this method.
  length = warc['content_length'].to_i
  return Error::EOF if length < 0

  warc['content'] = self.fd.read(length)

  # Two line terminators follow the payload; only the second is validated.
  self.fd.readline
  trailer = self.fd.readline
  return Error::EOF unless trailer.strip.empty?

  warc
rescue ::EOFError
  Error::EOF
end

#read_warc_header ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/dap/input/warc.rb', line 18

# Reads the first record via read_record, stores it in +header+, and checks
# that it is a warcinfo record. The record's content is then split into lines
# and every "key: value" line is stored in the +info+ hash; lines without a
# colon-separated value are skipped.
#
# @raise [RuntimeError] when read_record yields Error::EOF or the record's
#   warc_type is not "warcinfo"
def read_warc_header
  self.header = read_record
  record = self.header

  raise RuntimeError, "Invalid WARC header" if record == Error::EOF

  if record['warc_type'].to_s != "warcinfo"
    raise RuntimeError, "Invalid WARC header (missing warcinfo)"
  end

  self.info = {}
  fields = self.info
  record['content'].to_s.split("\n").each do |raw|
    name, value = raw.strip.split(/\s*:\s*/, 2)
    fields[name] = value if value
  end
end