Class: Purplelight::Partitioner

Inherits:
Object
Defined in:
lib/purplelight/partitioner.rb

Overview

Partitioner builds MongoDB range filters to split work across workers.

Given a Mongo collection and an optional base query, it returns N contiguous `_id` ranges that can be processed independently while maintaining ascending order. Optimized for ObjectId-based `_id`.
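A minimal usage sketch (assumes a connected Mongo::Client and an "events" collection; the connection details, query, and the `handle` method are illustrative placeholders, not part of the library):

  require 'mongo'
  require 'purplelight'

  client = Mongo::Client.new(['127.0.0.1:27017'], database: 'app')
  collection = client[:events]

  # Plan 8 contiguous _id ranges that together cover the matching documents.
  parts = Purplelight::Partitioner.object_id_partitions(
    collection: collection,
    query: { 'status' => 'active' },
    partitions: 8
  )

  # Each entry is a ready-to-use find spec: { filter:, sort:, hint: }.
  parts.each do |p|
    cursor = collection.find(p[:filter]).sort(p[:sort])
    cursor = cursor.hint(p[:hint]) if p[:hint]
    cursor.each { |doc| handle(doc) } # handle stands in for per-worker processing
  end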

Class Method Summary

Class Method Details

.build_range(from_id, to_id) ⇒ Object



# File 'lib/purplelight/partitioner.rb', line 141

def self.build_range(from_id, to_id)
  if from_id && to_id
    { '$gt' => from_id, '$lte' => to_id }
  elsif from_id && !to_id
    { '$gt' => from_id }
  elsif !from_id && to_id
    { '$lte' => to_id }
  else
    {}
  end
end
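
The returned Hash is meant to be used as an `_id` sub-filter. The four cases, with illustrative ObjectIds:

  require 'bson'

  a = BSON::ObjectId.new
  b = BSON::ObjectId.new

  Purplelight::Partitioner.build_range(a, b)     # => { '$gt' => a, '$lte' => b }
  Purplelight::Partitioner.build_range(a, nil)   # => { '$gt' => a }   (open upper end)
  Purplelight::Partitioner.build_range(nil, b)   # => { '$lte' => b }  (open lower end)
  Purplelight::Partitioner.build_range(nil, nil) # => {}               (no bounds)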

.cursor_sampling_partitions(collection:, query:, partitions:) ⇒ Object

Legacy cursor sampling planner



# File 'lib/purplelight/partitioner.rb', line 103

def self.cursor_sampling_partitions(collection:, query:, partitions:)
  # Ensure sort order for sampling
  base_query = collection.find(query || {}, {}.merge(sort: { _id: 1 }))

  # Fast path: if small dataset, just chunk by count
  total = collection.estimated_document_count
  return simple_ranges(collection: collection, query: query, partitions: partitions) if total <= partitions * 5_000

  # Sample boundaries: take approx quantiles by skipping
  step = [total / partitions, 1].max
  boundaries = []
  cursor = base_query.projection(_id: 1).batch_size(1_000).no_cursor_timeout
  i = 0
  cursor.each do |doc|
    boundaries << doc['_id'] if (i % step).zero?
    i += 1
    break if boundaries.size >= partitions
  end

  ranges = []
  prev = nil
  boundaries.each_with_index do |b, idx|
    if idx.zero?
      prev = nil
      next
    end
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
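
A sketch of invoking the legacy planner directly (equivalent to passing mode: :cursor to object_id_partitions; `collection` is a Mongo::Collection as in the overview sketch). The return value is a list of find specs, one per range:

  parts = Purplelight::Partitioner.cursor_sampling_partitions(
    collection: collection,
    query: nil,
    partitions: 4
  )
  # => [
  #   { filter: { '_id' => { '$lte' => <boundary 1> } }, sort: { _id: 1 }, hint: { _id: 1 } },
  #   { filter: { '_id' => { '$gt' => <boundary 1>, '$lte' => <boundary 2> } }, sort: { _id: 1 }, hint: { _id: 1 } },
  #   ...
  # ]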

.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil) ⇒ Object

Builds contiguous _id range filters for N partitions. For ObjectId _id, we sample quantiles to split into near-equal document counts.



# File 'lib/purplelight/partitioner.rb', line 14

def self.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil)
  # Choose planning mode: :timestamp (fast), :cursor (legacy)
  chosen_mode = (mode || ENV['PL_PARTITIONER_MODE'] || :timestamp).to_sym
  telemetry ||= (defined?(Telemetry) ? Telemetry::NULL : nil)

  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if chosen_mode == :cursor

  timestamp_partitions(collection: collection, query: query, partitions: partitions, telemetry: telemetry)
end
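
Mode selection comes from the mode: keyword first, then the PL_PARTITIONER_MODE environment variable, then the :timestamp default. A small sketch (`collection` as in the overview sketch):

  # An explicit keyword wins over the environment.
  Purplelight::Partitioner.object_id_partitions(
    collection: collection, query: nil, partitions: 4, mode: :cursor
  )

  # With no keyword, PL_PARTITIONER_MODE is consulted ('timestamp' or 'cursor').
  ENV['PL_PARTITIONER_MODE'] = 'cursor'
  Purplelight::Partitioner.object_id_partitions(
    collection: collection, query: nil, partitions: 4
  )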

.simple_ranges(collection:, query:, partitions:) ⇒ Object



# File 'lib/purplelight/partitioner.rb', line 24

def self.simple_ranges(collection:, query:, partitions:)
  # Split by _id: min/max endpoints confirm the range is non-empty; the leading _ids become inner boundaries
  min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?

  # Create contiguous ranges using ascending inner boundaries.
  # We intentionally skip the very first _id so the first range includes the smallest document.
  inner_boundaries = collection.find(query || {})
                               .projection(_id: 1)
                               .sort(_id: 1)
                               .skip(1)
                               .limit([partitions - 1, 0].max)
                               .to_a
                               .map { |d| d['_id'] }

  ranges = []
  prev = nil
  inner_boundaries.each do |b|
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
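
This is the fast path cursor_sampling_partitions takes when the estimated document count is at most partitions * 5_000. Calling it directly returns the same find-spec shape as the other planners (`collection` as in the overview sketch):

  parts = Purplelight::Partitioner.simple_ranges(
    collection: collection, query: nil, partitions: 4
  )
  parts.size # => up to 4; fewer if the collection has fewer documents than requested boundaries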

.timestamp_partitions(collection:, query:, partitions:, telemetry: nil) ⇒ Object

Faster planning using ObjectId timestamps: O(partitions) indexed lookups



# File 'lib/purplelight/partitioner.rb', line 56

def self.timestamp_partitions(collection:, query:, partitions:, telemetry: nil)
  t_minmax = telemetry&.start(:plan_minmax_time)
  min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
  telemetry&.finish(:plan_minmax_time, t_minmax)

  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?

  min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
  max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil

  # Fall back to cursor sampling if _id isn't an ObjectId (or the timestamp span is degenerate)
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts

  step = [(max_ts - min_ts) / partitions, 1].max
  inner_boundaries = []
  t_boundaries = telemetry&.start(:plan_boundary_queries_time)
  1.upto(partitions - 1) do |i|
    target_ts = min_ts + (step * i)
    candidate = BSON::ObjectId.from_time(Time.at(target_ts))
    f = query ? query.dup : {}
    f['_id'] = { '$gt' => candidate }
    b = collection.find(f).projection(_id: 1).sort(_id: 1).hint(_id: 1).limit(1).first&.dig('_id')
    inner_boundaries << b if b
  end
  telemetry&.finish(:plan_boundary_queries_time, t_boundaries)

  # Build ranges: first range has nil lower bound to include min_id,
  # middle ranges are (prev, current], and last is (last, +inf)
  ranges = []
  t_ranges = telemetry&.start(:plan_ranges_build_time)
  prev = nil
  inner_boundaries.each do |b|
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)
  telemetry&.finish(:plan_ranges_build_time, t_ranges)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
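
The probing relies on the fact that the leading four bytes of an ObjectId encode its creation time in whole seconds. A sketch of a single boundary lookup (`collection` as in the overview sketch): BSON::ObjectId.from_time builds a synthetic id whose non-timestamp bytes are zero, so it sorts at or before real ids generated in that second, and one indexed query snaps it to an actual document boundary.

  require 'bson'

  id = BSON::ObjectId.new
  id.generation_time        # => creation Time, second precision

  target_ts = Time.now.to_i - 3600
  candidate = BSON::ObjectId.from_time(Time.at(target_ts))

  # One indexed lookup snaps the synthetic candidate to the first real _id past it.
  boundary = collection.find('_id' => { '$gt' => candidate })
                       .projection(_id: 1).sort(_id: 1).hint(_id: 1).limit(1)
                       .first&.dig('_id')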