Class: Purplelight::Partitioner
- Inherits: Object
- Defined in: lib/purplelight/partitioner.rb
Overview
Partitioner builds MongoDB range filters to split work across workers.
Given a Mongo collection and an optional base query, it returns N contiguous `_id` ranges that can be processed independently while maintaining ascending order. Optimized for ObjectId-based `_id`.
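A minimal usage sketch (the connection settings and the users collection are illustrative; the call and the shape of the returned plans follow the methods documented below):

require 'mongo'
require 'purplelight/partitioner'

# Hypothetical connection and collection, for illustration only.
client = Mongo::Client.new(['localhost:27017'], database: 'app')
users  = client[:users]

# Plan four contiguous _id ranges; each plan is a hash with
# :filter, :sort and :hint keys ready to pass to Collection#find.
plans = Purplelight::Partitioner.object_id_partitions(
  collection: users,
  query: { active: true },
  partitions: 4
)

plans.each do |plan|
  users.find(plan[:filter]).sort(plan[:sort]).hint(plan[:hint]).each do |doc|
    # process doc ...
  end
end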
Class Method Summary
- .build_range(from_id, to_id) ⇒ Object
- .cursor_sampling_partitions(collection:, query:, partitions:) ⇒ Object
  Legacy cursor sampling planner.
- .object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil) ⇒ Object
  Builds contiguous _id range filters for N partitions.
- .simple_ranges(collection:, query:, partitions:) ⇒ Object
- .timestamp_partitions(collection:, query:, partitions:, telemetry: nil) ⇒ Object
  Faster planning using ObjectId timestamps: O(partitions) indexed lookups.
Class Method Details
.build_range(from_id, to_id) ⇒ Object
# File 'lib/purplelight/partitioner.rb', line 141

def self.build_range(from_id, to_id)
  if from_id && to_id
    { '$gt' => from_id, '$lte' => to_id }
  elsif from_id && !to_id
    { '$gt' => from_id }
  elsif !from_id && to_id
    { '$lte' => to_id }
  else
    {}
  end
end
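The four cases map directly onto the range shapes used elsewhere in this class; a quick illustration (the ObjectIds here are freshly generated, not taken from a collection):

require 'bson'
require 'purplelight/partitioner'

a = BSON::ObjectId.new
b = BSON::ObjectId.new

Purplelight::Partitioner.build_range(a, b)     # => { '$gt' => a, '$lte' => b }  interior partition
Purplelight::Partitioner.build_range(nil, b)   # => { '$lte' => b }              first partition, no lower bound
Purplelight::Partitioner.build_range(a, nil)   # => { '$gt' => a }               last partition, open-ended
Purplelight::Partitioner.build_range(nil, nil) # => {}                           no bounds at all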
.cursor_sampling_partitions(collection:, query:, partitions:) ⇒ Object
Legacy cursor sampling planner.
# File 'lib/purplelight/partitioner.rb', line 103

def self.cursor_sampling_partitions(collection:, query:, partitions:)
  # Ensure sort order for sampling
  base_query = collection.find(query || {}, {}.merge(sort: { _id: 1 }))

  # Fast path: if small dataset, just chunk by count
  total = collection.estimated_document_count
  return simple_ranges(collection: collection, query: query, partitions: partitions) if total <= partitions * 5_000

  # Sample boundaries: take approx quantiles by skipping
  step = [total / partitions, 1].max
  boundaries = []
  cursor = base_query.projection(_id: 1).batch_size(1_000).no_cursor_timeout
  i = 0
  cursor.each do |doc|
    boundaries << doc['_id'] if (i % step).zero?
    i += 1
    break if boundaries.size >= partitions
  end

  ranges = []
  prev = nil
  boundaries.each_with_index do |b, idx|
    if idx.zero?
      prev = nil
      next
    end
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
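A sketch of invoking this planner directly (the connection and the orders collection are illustrative); per object_id_partitions below, the same plan is produced by passing mode: :cursor or setting PL_PARTITIONER_MODE=cursor:

require 'mongo'
require 'purplelight/partitioner'

client = Mongo::Client.new(['localhost:27017'], database: 'app')
orders = client[:orders]

plans = Purplelight::Partitioner.cursor_sampling_partitions(
  collection: orders,
  query: nil,
  partitions: 8
)

# Small collections (estimated count <= partitions * 5_000) fall back to
# simple_ranges, so the full sampling scan only runs on larger datasets.
plans.each { |plan| puts plan[:filter].inspect }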
.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil) ⇒ Object
Builds contiguous _id range filters for N partitions. For ObjectId _id, we sample quantiles to split into near-equal document counts.
# File 'lib/purplelight/partitioner.rb', line 14

def self.object_id_partitions(collection:, query:, partitions:, mode: nil, telemetry: nil)
  # Choose planning mode: :timestamp (fast), :cursor (legacy)
  chosen_mode = (mode || ENV['PL_PARTITIONER_MODE'] || :timestamp).to_sym
  telemetry ||= (defined?(Telemetry) ? Telemetry::NULL : nil)
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if chosen_mode == :cursor

  timestamp_partitions(collection: collection, query: query, partitions: partitions, telemetry: telemetry)
end
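The precedence is: explicit mode argument, then the PL_PARTITIONER_MODE environment variable, then the :timestamp default. A tiny standalone sketch (resolve_mode is written here only to illustrate that expression; it is not part of the gem):

# Mirrors the chosen_mode expression above.
def resolve_mode(mode)
  (mode || ENV['PL_PARTITIONER_MODE'] || :timestamp).to_sym
end

resolve_mode(nil)      # => :timestamp  (default fast path)
resolve_mode(:cursor)  # => :cursor     (explicit argument wins)
ENV['PL_PARTITIONER_MODE'] = 'cursor'
resolve_mode(nil)      # => :cursor     (environment variable as fallback)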
.simple_ranges(collection:, query:, partitions:) ⇒ Object
# File 'lib/purplelight/partitioner.rb', line 24

def self.simple_ranges(collection:, query:, partitions:)
  # Split by _id quantiles using min/max endpoints
  min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?

  # Create contiguous ranges using ascending inner boundaries.
  # We intentionally skip the very first _id so the first range includes the smallest document.
  inner_boundaries = collection.find(query || {})
                               .projection(_id: 1)
                               .sort(_id: 1)
                               .skip(1)
                               .limit([partitions - 1, 0].max)
                               .to_a
                               .map { |d| d['_id'] }

  ranges = []
  prev = nil
  inner_boundaries.each do |b|
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
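A usage sketch (the events collection is illustrative). Every plan carries the sort and hint that keep each worker's scan on the _id index:

require 'mongo'
require 'purplelight/partitioner'

client = Mongo::Client.new(['localhost:27017'], database: 'app')
events = client[:events]

plans = Purplelight::Partitioner.simple_ranges(collection: events, query: nil, partitions: 3)
# Each element has the form:
#   { filter: { '_id' => { '$gt' => ..., '$lte' => ... } },
#     sort:   { _id: 1 },
#     hint:   { _id: 1 } }
# The first range has no lower bound and the last has no upper bound, so the
# ranges tile the keyspace without gaps or overlap. An empty collection
# returns a single unbounded plan.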
.timestamp_partitions(collection:, query:, partitions:, telemetry: nil) ⇒ Object
Faster planning using ObjectId timestamps: O(partitions) indexed lookups.
# File 'lib/purplelight/partitioner.rb', line 56

def self.timestamp_partitions(collection:, query:, partitions:, telemetry: nil)
  t_minmax = telemetry&.start(:plan_minmax_time)
  min_id = collection.find(query || {}).projection(_id: 1).sort(_id: 1).limit(1).first&.dig('_id')
  max_id = collection.find(query || {}).projection(_id: 1).sort(_id: -1).limit(1).first&.dig('_id')
  telemetry&.finish(:plan_minmax_time, t_minmax)
  return [{ filter: query || {}, sort: { _id: 1 } }] if min_id.nil? || max_id.nil?

  min_ts = min_id.respond_to?(:generation_time) ? min_id.generation_time.to_i : nil
  max_ts = max_id.respond_to?(:generation_time) ? max_id.generation_time.to_i : nil
  # Fallback to cursor sampling if _id isn't an ObjectId
  return cursor_sampling_partitions(collection: collection, query: query, partitions: partitions) if min_ts.nil? || max_ts.nil? || max_ts <= min_ts

  step = [(max_ts - min_ts) / partitions, 1].max
  inner_boundaries = []
  t_boundaries = telemetry&.start(:plan_boundary_queries_time)
  1.upto(partitions - 1) do |i|
    target_ts = min_ts + (step * i)
    candidate = BSON::ObjectId.from_time(Time.at(target_ts))
    f = query ? query.dup : {}
    f['_id'] = { '$gt' => candidate }
    b = collection.find(f).projection(_id: 1).sort(_id: 1).hint(_id: 1).limit(1).first&.dig('_id')
    inner_boundaries << b if b
  end
  telemetry&.finish(:plan_boundary_queries_time, t_boundaries)

  # Build ranges: first range has nil lower bound to include min_id,
  # middle ranges are (prev, current], and last is (last, +inf)
  ranges = []
  t_ranges = telemetry&.start(:plan_ranges_build_time)
  prev = nil
  inner_boundaries.each do |b|
    ranges << build_range(prev, b)
    prev = b
  end
  ranges << build_range(prev, nil)
  telemetry&.finish(:plan_ranges_build_time, t_ranges)

  ranges.map do |r|
    filter = query ? query.dup : {}
    filter['_id'] = r
    { filter: filter, sort: { _id: 1 }, hint: { _id: 1 } }
  end
end
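The core trick in isolation: evenly spaced timestamps between the oldest and newest _id are turned into synthetic ObjectIds with BSON::ObjectId.from_time, and each one costs a single indexed lookup to snap to a real boundary. A standalone sketch with illustrative dates:

require 'bson'

min_ts = Time.utc(2024, 1, 1).to_i
max_ts = Time.utc(2024, 12, 31).to_i
partitions = 4

step = [(max_ts - min_ts) / partitions, 1].max
candidates = (1...partitions).map do |i|
  # Synthetic ObjectId whose embedded timestamp is min_ts + step * i;
  # the planner then finds the first real _id greater than this candidate.
  BSON::ObjectId.from_time(Time.at(min_ts + (step * i)))
end

candidates.each { |oid| puts "#{oid} embeds #{oid.generation_time}" }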