Class: IOBlockReader::IOBlockReader

Inherits:
Object
  • Object
show all
Defined in:
lib/ioblockreader/ioblockreader.rb

Overview

Class giving a String-like interface over an IO, reading it block by block. Very useful for accessing the content of big files as if it were a String containing the whole file’s content.

Instance Method Summary collapse

Constructor Details

#initialize(io, options = {}) ⇒ IOBlockReader

Constructor

Parameters
  • io (IO): The IO object used to give the String interface

  • options (map<Symbol,Object>): Additional options:

    • :block_size (Fixnum): The block size in bytes used internally. [default = 268435456]

    • :blocks_in_memory (Fixnum): Maximal number of blocks in memory. If it is required to load more blocks than this value for a single operation, this value is ignored. [default = 2]



16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/ioblockreader/ioblockreader.rb', line 16

# Constructor
#
# Parameters::
# * *io* (_IO_): The IO object used to give the String interface
# * *options* (<em>map<Symbol,Object></em>): Additional options:
#   * *:block_size*: The block size in bytes used internally. [default = 268435456]
#   * *:blocks_in_memory*: Maximal number of blocks in memory. [default = 2]
def initialize(io, options = {})
  # Nothing is loaded yet: no blocks, and no cached block for quick [] access
  @blocks = []
  @cached_block = @cached_block_end_offset = nil
  # Tuning options; a missing (or nil) option falls back to its default
  @block_size = options[:block_size] || 268_435_456
  @blocks_in_memory = options[:blocks_in_memory] || 2
  # The underlying IO
  @io = io
end

Instance Method Details

#[](range) ⇒ Object

Get a subset of the data. DO NOT USE NEGATIVE INDEXES.

Parameters
  • range (Fixnum or Range): Range to extract

Result
  • String: The resulting data



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/ioblockreader/ioblockreader.rb', line 36

# Get a subset of the data. DO NOT USE NEGATIVE INDEXES.
#
# NOTE: only range.first and range.last are read, so an exclusive range (a...b)
# is extracted exactly like the inclusive range a..b.
#
# Parameters::
# * *range* (_Integer_ or _Range_): Single offset, or range of offsets, to extract
# Result::
# * _String_: The resulting data
def [](range)
  # Integer rather than Fixnum: the Fixnum constant no longer exists in recent Ruby versions,
  # and Integer also covers offsets too large to be a Fixnum on older ones.
  if range.is_a?(Integer)
    # Use the cache if possible
    if @cached_block && range >= @cached_block.offset && range < @cached_block_end_offset
      return @cached_block.data[range - @cached_block.offset]
    end

    # Single offset: exactly one block is involved
    single_block_index, offset_in_block = range.divmod(@block_size)
    block = @blocks[single_block_index]
    if block.nil?
      read_needed_blocks([single_block_index], single_block_index, single_block_index)
      block = @blocks[single_block_index]
    else
      block.touch
    end
    set_cache_block(block)
    return block.data[offset_in_block]
  end

  first_offset = range.first
  last_offset = range.last
  # Use the cache if the whole range lies inside the cached block
  if @cached_block && first_offset >= @cached_block.offset && last_offset < @cached_block_end_offset
    return @cached_block.data[first_offset - @cached_block.offset..last_offset - @cached_block.offset]
  end

  first_block_index, first_offset_in_block = first_offset.divmod(@block_size)
  last_block_index, last_offset_in_block = last_offset.divmod(@block_size)

  if first_block_index == last_block_index
    # The range fits in a single block
    block = @blocks[first_block_index]
    if block.nil?
      read_needed_blocks([first_block_index], first_block_index, last_block_index)
      block = @blocks[first_block_index]
    else
      block.touch
    end
    set_cache_block(block)
    return block.data[first_offset_in_block..last_offset_in_block]
  end

  # The range spans several blocks: touch the ones already loaded, collect the missing ones
  indexes_needing_loading = (first_block_index..last_block_index).select do |block_index|
    loaded_block = @blocks[block_index]
    loaded_block.touch if loaded_block
    loaded_block.nil?
  end
  unless indexes_needing_loading.empty?
    read_needed_blocks(indexes_needing_loading, first_block_index, last_block_index)
  end

  # Now read across the blocks: end of the first one, all middle ones, beginning of the last one.
  # dup so that concat never modifies the data held by the first block.
  result = @blocks[first_block_index].data[first_offset_in_block..-1].dup
  ((first_block_index + 1)..(last_block_index - 1)).each do |block_index|
    result.concat(@blocks[block_index].data)
  end
  result.concat(@blocks[last_block_index].data[0..last_offset_in_block])
  # There are more chances that the last block will be accessed again. Cache this one.
  set_cache_block(@blocks[last_block_index])
  result
end

#index(token, offset = 0, max_size_regexp = 32) ⇒ Object

Search for a token (or a list of tokens) in the IO. Warning: the token(s) to be found have to be smaller than the block size given to the constructor, otherwise they won’t be found (you’ve been warned!). If you really need to search for tokens bigger than the block size, extract the data using the [] operator first, and then use index on the result; note, however, that this makes a complete copy of the data in memory prior to searching for tokens.

Parameters
  • token (String, Regexp or list<Object>): Token to be found. Can be a list of tokens.

  • offset (Fixnum): Offset starting the search [optional = 0]

  • max_size_regexp (Fixnum): Maximal number of characters the match should take in case of a Regexp token. Ignored if token is a String. [optional = 32]

Result
  • Fixnum: Index of the token (or the first one found from the given token list), or nil if none found.

  • Fixnum: In case token was an Array, return the index of the matching token in the array, or nil if none found.



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/ioblockreader/ioblockreader.rb', line 103

# Search for a token (or for the first of a list of tokens) in the IO, starting at a given offset.
# Tokens are expected to be smaller than the block size: a match is only looked for
# inside a block or across the boundary between 2 consecutive blocks.
#
# Parameters::
# * *token* (_String_, _Regexp_ or <em>list<Object></em>): Token to be found. Can be a list of tokens.
# * *offset* (_Integer_): Offset starting the search [optional = 0]
# * *max_size_regexp* (_Integer_): Maximal number of characters a match can take for a Regexp token.
#   Ignored for String tokens. [optional = 32]
# Result::
# * _Integer_: Index of the token (or of the first one found from the token list), or nil if none found
# * _Integer_: Only returned when token is an Array: index of the matching token in the list, or nil
def index(token, offset = 0, max_size_regexp = 32)
  token_is_array = token.is_a?(Array)
  tokens = token_is_array ? token : [token]

  # Earliest match of any token in a String, searching from a start position.
  # Gives [index, position of the token in the list], or [nil, nil] if nothing matches.
  # On equal indexes, the first token of the list wins.
  find_first = lambda do |haystack, start|
    best_index = nil
    best_token = nil
    tokens.each_with_index do |candidate, candidate_idx|
      found = haystack.index(candidate, start)
      if found && (best_index.nil? || found < best_index)
        best_index = found
        best_token = candidate_idx
      end
    end
    [best_index, best_token]
  end

  # Search first in the block containing offset
  current_block_index, offset_in_current_block = offset.divmod(@block_size)
  current_block = @blocks[current_block_index]
  if current_block.nil?
    read_needed_blocks([current_block_index], current_block_index, current_block_index)
    current_block = @blocks[current_block_index]
  else
    current_block.touch
  end
  index_in_block, index_matching_token = find_first.call(current_block.data, offset_in_current_block)
  result = index_in_block && (current_block.offset + index_in_block)

  if result.nil?
    # We have to search further: across blocks.
    # A match of at most token_size characters can straddle a boundary only if it
    # begins in the last (token_size - 1) characters of a block.
    token_size = tokens.map { |candidate| candidate.is_a?(String) ? candidate.size : max_size_regexp }.max.to_i
    overlap = token_size - 1
    # In the first block, nothing before offset may take part in a match
    search_start = offset_in_current_block
    until result || current_block.last_block?
      # Check that next block is loaded
      next_block_index = current_block_index + 1
      next_block = @blocks[next_block_index]
      if next_block.nil?
        read_needed_blocks([next_block_index], next_block_index, next_block_index)
        next_block = @blocks[next_block_index]
      else
        next_block.touch
      end
      # With an overlap of 0 (single character tokens) nothing can straddle the boundary
      if overlap > 0
        current_data = current_block.data
        tail_start = [current_data.size - overlap, search_start, 0].max
        tail = current_data[tail_start..-1].to_s
        index_in_cross, index_matching_token = find_first.call(tail + next_block.data[0, overlap], 0)
        # The cross data begins tail.size characters before the next block
        result = next_block.offset - tail.size + index_in_cross if index_in_cross
      end
      next unless result.nil?

      # Nothing across the boundary: search in the next block itself
      index_in_block, index_matching_token = find_first.call(next_block.data, 0)
      if index_in_block
        result = next_block.offset + index_in_block
      else
        # Loop on the next block
        current_block_index = next_block_index
        current_block = next_block
        search_start = 0
      end
    end
  end

  token_is_array ? [result, index_matching_token] : result
end