Module: ElasticGraph::GraphQL::Filtering
- Defined in:
- lib/elastic_graph/graphql/filtering/field_path.rb,
lib/elastic_graph/graphql/filtering/range_query.rb,
lib/elastic_graph/graphql/filtering/boolean_query.rb,
lib/elastic_graph/graphql/filtering/filter_interpreter.rb,
lib/elastic_graph/graphql/filtering/filter_args_translator.rb,
lib/elastic_graph/graphql/filtering/filter_value_set_extractor.rb
Defined Under Namespace
Classes: BooleanQuery, FilterArgsTranslator, FilterValueSetExtractor, RangeQuery
Constant Summary collapse
- FieldPath =
Tracks state related to field paths as we traverse our filtering data structure in order to translate it to its Elasticsearch/OpenSearch form.
Instances of this class are immutable–callers must use the provided APIs (‘+`, `counts_path`, `nested`) to get back new instances with state changes applied.
::Data.define( # The path from the overall document root. :from_root, # The path from the current parent document. Usually `from_parent` and `from_root` are the same, # but they'll be different when we encounter a list field indexed using the `nested` mapping type. # When we're traversing a subfield of a `nested` field, `from_root` will contain the full path from # the original, overall document root, while `from_parent` will contain the path from the current # nested document's root. :from_parent ) do # @implements FieldPath # Builds an empty instance. def self.empty new([], []) end def self.of(parts) new(parts, parts) end # Used when we encounter a `nested` field to restart the `from_parent` path (while preserving the `from_root` path). def nested FieldPath.new(from_root, []) end # Creates a new instance with `sub_path` appended. def +(other) FieldPath.new(from_root + [other], from_parent + [other]) end # Converts the current paths to what they need to be to be able to query our hidden `__counts` field (which # is a map containing the counts of elements of every list field on the document). The `__counts` field # sits a the root of every document (for both an overall root document and a `nested` document). Here's an # example (which assumes `seasons` and `seasons.players` fields which are both `nested` and an `awards` field # which is a list of strings). Given a filter like this: # # filter: {seasons: {any_satisfy: {players: {any_satisfy: {results: {awards: {count: {gt: 1}}}}}}}} # # ...after processing the `awards` key, our `FieldPath` will be: # # FieldPath.new(["seasons", "players", "results", "awards"], ["results", "awards"]) # # When we then reach the `count` sub field and `counts_path` is called on it, the following will be returned: # # FieldPath.new(["seasons", "players", LIST_COUNTS_FIELD, "results|awards"], [LIST_COUNTS_FIELD, "results|awards"]) # # This gives us what we want: # - The path from the root is `seasons.players.__counts.results|awards`. # - The path from the (nested) parent is `__counts.results|awards`. # # Note that our `__counts` field is a flat map which uses `|` (the `LIST_COUNTS_FIELD_PATH_KEY_SEPARATOR` character) # to separate its parts (hence, it's `results|awards` instead of `results.awards`). def counts_path from_root_to_parent_of_counts_field = from_root[0...-from_parent.size] # : ::Array[::String] counts_sub_field = [LIST_COUNTS_FIELD, from_parent.join(LIST_COUNTS_FIELD_PATH_KEY_SEPARATOR)] FieldPath.new(from_root_to_parent_of_counts_field + counts_sub_field, counts_sub_field) end end
- FilterInterpreter =
Contains all query logic related to filtering. Not tested directly; tests drive the ‘Query` interface instead. For more info on how this works, see: www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html www.elastic.co/blog/lost-in-translation-boolean-operations-and-filters-in-the-bool-query
Support::MemoizableData.define(:runtime_metadata, :schema_names, :logger) do # @implements FilterInterpreter def initialize(runtime_metadata:, logger:) super( runtime_metadata: , schema_names: .schema_element_names, logger: logger ) end # Builds a datastore query from the given collection of filter hashes. # # Returns `nil` if there are no query clauses, to make it easy for a caller to `compact` out # `query: {}` in a larger search request body. # # https://www.elastic.co/guide/en/elasticsearch/reference/8.11/query-dsl.html def build_query(filter_hashes, from_field_path: FieldPath.empty) build_bool_hash do |bool_node| filter_hashes.each do |filter_hash| process_filter_hash(bool_node, filter_hash, from_field_path) end end end def to_s # The inspect/to_s output of `runtime_metadata` and `logger` can be quite large and noisy. We generally don't care about # those details but want to be able to tell at a glance if two `FilterInterpreter` instances are equal or not--and, if they # aren't equal, which part is responsible for the inequality. # # Using the hash of the two initialize args provides us with that. "#<data #{FilterInterpreter.name} runtime_metadata=(hash: #{.hash}) logger=(hash: #{logger.hash})>" end alias_method :inspect, :to_s private def process_filter_hash(bool_node, filter_hash, field_path) filter_hash.each do |field_or_op, expression| # `nil` filter predicates should be ignored, so we can safely `compact` them out. # It also is simpler to handle them once here instead of the different branches # below having to be aware of possible `nil` predicates. expression = expression.compact if expression.is_a?(::Hash) case identify_expression_type(field_or_op, expression) when :empty # This is an "empty" filter predicate and we can ignore it. when :not process_not_expression(bool_node, expression, field_path) when :list_any_filter process_list_any_filter_expression(bool_node, expression, field_path) when :any_of process_any_of_expression(bool_node, expression, field_path) when :all_of process_all_of_expression(bool_node, expression, field_path) when :operator process_operator_expression(bool_node, field_or_op, expression, field_path) when :list_count process_list_count_expression(bool_node, expression, field_path) when :sub_field process_sub_field_expression(bool_node, expression, field_path + field_or_op) else logger.warn("Ignoring unknown filtering operator (#{field_or_op}: #{expression.inspect}) on field `#{field_path.from_root.join(".")}`") end end end def identify_expression_type(field_or_op, expression) return :empty if expression.nil? || expression == {} return :not if field_or_op == schema_names.not return :list_any_filter if field_or_op == schema_names.any_satisfy return :all_of if field_or_op == schema_names.all_of return :any_of if field_or_op == schema_names.any_of return :operator if filter_operators.key?(field_or_op) return :list_count if field_or_op == LIST_COUNTS_FIELD return :sub_field if expression.is_a?(::Hash) :unknown end # Indicates if the given `expression` applies filtering to subfields or just applies # operators at the current field path. def filters_on_sub_fields?(expression) expression.any? do |field_or_op, sub_expression| case identify_expression_type(field_or_op, sub_expression) when :sub_field true when :not, :list_any_filter filters_on_sub_fields?(sub_expression) when :any_of, :all_of # These are the only two cases where the `sub_expression` is an array of filter sub expressions, # so we use `.any?` on it here. (Even for `all_of`--the overall `expression` filters on sub fields so # long as at least one of the sub expressions does, regardless of it being `any_of` vs `all_of`). sub_expression.any? { |expr| filters_on_sub_fields?(expr) } else # :empty, :operator, :unknown, :list_count false end end end def process_not_expression(bool_node, expression, field_path) sub_filter = build_bool_hash do |inner_node| process_filter_hash(inner_node, expression, field_path) end return unless sub_filter # Prevent any negated filters from being unnecessarily double-negated by # converting them to a positive filter (i.e., !!A == A). if sub_filter[:bool].key?(:must_not) # Pull clauses up to current bool_node to remove negation sub_filter[:bool][:must_not].each do |negated_clause| negated_clause[:bool].each { |k, v| bool_node[k].concat(v) } end end # Don't drop any other filters! Let's negate them now. other_filters = sub_filter[:bool].except(:must_not) bool_node[:must_not] << {bool: other_filters} unless other_filters.empty? end # There are two cases for `any_satisfy`, each of which is handled differently: # # - List-of-scalars # - List-of-nested-objects # # We can detect which it is by checking `filter` to see if it filters on any subfields. # If so, we know the filter is being applied to a `nested` list field. We can count on # this because we do not generate `any_satisfy` filters on `object` list fields (instead, # they get generated on their leaf fields). def process_list_any_filter_expression(bool_node, filter, field_path) if filters_on_sub_fields?(filter) process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path) else process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path) end end def process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path) sub_filter = build_bool_hash do |inner_node| process_filter_hash(inner_node, filter, field_path.nested) end bool_node[:filter] << {nested: {path: field_path.from_root.join("."), query: sub_filter}} end # On a list-of-leaf-values field, `any_satisfy` doesn't _do_ anything: it just expresses # the fact that documents with any list element values matching the predicates will match # the overall filter. def process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path) return unless (processed = build_bool_hash { |node| process_filter_hash(node, filter, field_path) }) processed_bool_query = processed.fetch(:bool) # The semantics we want for `any_satisfy` are that it matches when a value exists in the list that # satisfies all of the provided subfilter. That's the semantics the datastore provides when the bool # query only requires one clause to match, but if multiple clauses are required to match there's a subtle # issue. A document matches so long as each required clause matches *some* value, but it doesn't require # that they all match the *same* value. The list field on a document could contain N values, where # each value matches a different one of the required clauses, and the document will be a search hit. # # Rather than behaving in a surprising way here, we'd rather disallow a filter that has multiple required # clauses, so we return an error in this case. if required_matching_clause_count(processed_bool_query) > 1 formatted_filter = Support::GraphQLFormatter.serialize( {schema_names.any_satisfy => filter}, wrap_hash_with_braces: false ) raise ::GraphQL::ExecutionError, "`#{formatted_filter}` is not supported because it produces " \ "multiple filtering clauses under `#{schema_names.any_satisfy}`, which doesn't work as expected. " \ "Remove one or more of your `#{schema_names.any_satisfy}` predicates and try again." else bool_node.update(processed_bool_query) do |_, existing_clauses, any_satisfy_clauses| existing_clauses + any_satisfy_clauses end end end def process_any_of_expression(bool_node, expressions, field_path) shoulds = expressions.filter_map do |expression| build_bool_hash do |inner_bool_node| process_filter_hash(inner_bool_node, expression, field_path) end end # When our `shoulds` array is empty, the filtering semantics we want is to match no documents. # However, that's not the behavior the datastore will give us if we have an empty array in the # query under `should`. To get the behavior we want, we need to pass the datastore some filter # criteria that will evaluate to false for every document. bool_query = shoulds.empty? ? BooleanQuery::ALWAYS_FALSE_FILTER : BooleanQuery.should(*shoulds) bool_query.merge_into(bool_node) end def process_all_of_expression(bool_node, expressions, field_path) # `all_of` represents an AND. AND is the default way that `process_filter_hash` combines # filters so we just have to call it for each sub-expression. expressions.each do |sub_expression| process_filter_hash(bool_node, sub_expression, field_path) end end def process_operator_expression(bool_node, operator, expression, field_path) # `operator` is a filtering operator, and `expression` is the value the filtering # operator should be applied to. The `op_applicator` lambda, when called, will # return a Clause instance (defined in this module). bool_query = filter_operators.fetch(operator).call(field_path.from_root.join("."), expression) bool_query&.merge_into(bool_node) end def process_sub_field_expression(bool_node, expression, field_path) # `sub_field` is a field name, and `expression` is a hash of filters to apply to that field. # We want to add the field name to the field path and recursively process the hash. # # However, if the hash has `any_of` in it, then we need to process the filter hash on # a nested bool node instead of on the `bool_node` we are already operating on. # # To understand why, first consider a filter that has no `any_of` but does use field nesting: # # filter: { # weight: {lt: 2000}, # cost: { # currency: {equal_to_any_of: ["USD"]} # amount: {gt: 1000} # } # } # # While this `currency` and `amount` are expressed as sub-filters under `cost` in our GraphQL # syntax, we do not actually need to create a nested bool node structure for the datastore # query. We get a flat filter structure like this: # # {bool: {filter: [ # {range: {"weight": {lt: 2000}}}, # {terms: {"cost.currency": ["USD"]}}, # {range: {"amount": {gt: 1000}}} # ]}} # # The 3 filter conditions are ANDed together as a single list under `filter`. # The nested field structure gets flattened using a dot-separated path. # # Now consider a filter that has multiple `any_of` sub-expressions: # # filter: { # weight: {any_of: [ # {gt: 9000}, # {lt: 2000} # ]}, # cost: {any_of: [ # currency: {equal_to_any_of: ["USD"]}, # amount: {gt: 1000} # ]} # } # # If we did not make a nested structure, we would wind up with a single list of sub-expressions # that are OR'd together: # # {bool: {filter: [{bool: {should: [ # {range: {"weight": {gt: 9000}}}, # {range: {"weight": {lt: 2000}}}, # {terms: {"cost.currency": ["USD"]}}, # {range: {"amount": {gt: 1000}}} # ]}}]}} # # ...but that's clearly wrong. By creating a nested bool node based on the presence of `any_of`, # we can instead produce a structure like this: # # {bool: {filter: [ # {bool: {should: [ # {range: {"weight": {gt: 9000}}}, # {range: {"weight": {lt: 2000}}} # ]}}, # {bool: {should: [ # {terms: {"cost.currency": ["USD"]}}, # {range: {"amount": {gt: 1000}}} # ]}} # ]}} # # ...which will actually work correctly. if expression.key?(schema_names.any_of) sub_filter = build_bool_hash do |inner_node| process_filter_hash(inner_node, expression, field_path) end bool_node[:filter] << sub_filter else process_filter_hash(bool_node, expression, field_path) end end def process_list_count_expression(bool_node, expression, field_path) # Normally, we don't have to do anything special for list count expressions. # That's the case, for example, for an expression like: # # filter: {tags: {count: {gt: 2}}} # # However, if the count expression could match count of 0 (that is, if it doesn't # exclude a count of zero), such as this: # # filter: {tags: {count: {lt: 1}}} # # ...then we need some special handling here. A count of 0 is equivalent to the list field not existing. # While we index an explicit count of 0, the count field will be missing from documents indexed before # the list field was defined on the ElasticGraph schema. To properly match those documents, we need to # convert this into an OR (using `any_of`) to also match documents that lack the field entirely. unless excludes_zero?(expression) expression = {schema_names.any_of => [ expression, {schema_names.equal_to_any_of => [nil]} ]} end process_sub_field_expression(bool_node, expression, field_path.counts_path) end def build_bool_hash(&block) bool_node = Hash.new { |h, k| h[k] = [] }.tap(&block) # To ignore "empty" filter predicates we need to return `nil` here. return nil if bool_node.empty? # According to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#bool-min-should-match, # if the bool query includes at least one should clause and no must or filter clauses, the default value is 1. Otherwise, the default value is 0. # However, we want should clauses to work with musts and filters, so we need to set it explicitly to 1 when we have should clauses. bool_node[:minimum_should_match] = 1 if bool_node.key?(:should) {bool: bool_node} end # Determines if the given filter expression excludes the value `0`. def excludes_zero?(expression) expression.any? do |operator, operand| case operator when schema_names.equal_to_any_of then !operand.include?(0) when schema_names.lt then operand <= 0 when schema_names.lte then operand < 0 when schema_names.gt then operand >= 0 when schema_names.gte then operand > 0 else # :nocov: -- all operators are covered above. But simplecov complains about an implicit `else` branch being uncovered, so here we've defined it to wrap it with `:nocov:`. false # :nocov: end end end def filter_operators @filter_operators ||= build_filter_operators() end def build_filter_operators() schema_names = .schema_element_names filter_by_time_of_day_script_id = .static_script_ids_by_scoped_name .fetch("filter/by_time_of_day") { schema_names.equal_to_any_of => ->(field_name, value) { values = to_datastore_value(value.compact.uniq) # : ::Array[untyped] equality_sub_expression = if field_name == "id" # Use specialized "ids" query when querying on ID field. # See: https://www.elastic.co/guide/en/elasticsearch/reference/7.15/query-dsl-ids-query.html # # We reject empty strings because we otherwise get an error from the datastore: # "failed to create query: Ids can't be empty" {ids: {values: values - [""]}} else {terms: {field_name => values}} end exists_sub_expression = {exists: {"field" => field_name}} if !value.empty? && value.all?(&:nil?) BooleanQuery.new(:must_not, [{bool: {filter: [exists_sub_expression]}}]) elsif value.include?(nil) BooleanQuery.filter({bool: { minimum_should_match: 1, should: [ {bool: {filter: [equality_sub_expression]}}, {bool: {must_not: [{bool: {filter: [exists_sub_expression]}}]}} ] }}) else BooleanQuery.filter(equality_sub_expression) end }, schema_names.gt => ->(field_name, value) { RangeQuery.new(field_name, :gt, value) }, schema_names.gte => ->(field_name, value) { RangeQuery.new(field_name, :gte, value) }, schema_names.lt => ->(field_name, value) { RangeQuery.new(field_name, :lt, value) }, schema_names.lte => ->(field_name, value) { RangeQuery.new(field_name, :lte, value) }, schema_names.matches => ->(field_name, value) { BooleanQuery.must({match: {field_name => value}}) }, schema_names.matches_query => ->(field_name, value) do allowed_edits_per_term = value.fetch(schema_names.allowed_edits_per_term)..datastore_abbreviation BooleanQuery.must( { match: { field_name => { query: value.fetch(schema_names.query), # This is always a string field, even though the value is often an integer fuzziness: allowed_edits_per_term.to_s, operator: value[schema_names.require_all_terms] ? "AND" : "OR" } } } ) end, schema_names.matches_phrase => ->(field_name, value) { BooleanQuery.must( { match_phrase_prefix: { field_name => { query: value.fetch(schema_names.phrase) } } } ) }, # This filter operator wraps a geo distance query: # https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-geo-distance-query.html schema_names.near => ->(field_name, value) do unit_abbreviation = value.fetch(schema_names.unit)..datastore_abbreviation BooleanQuery.filter({geo_distance: { "distance" => "#{value.fetch(schema_names.max_distance)}#{unit_abbreviation}", field_name => { "lat" => value.fetch(schema_names.latitude), "lon" => value.fetch(schema_names.longitude) } }}) end, schema_names.time_of_day => ->(field_name, value) do # To filter on time of day, we use the `filter/by_time_of_day` script. We accomplish # this with a script because Elasticsearch/OpenSearch do not support this natively, and it's # incredibly hard to implement correctly with respect to time zones without using a # script. We considered indexing the `time_of_day` as a separate index field # that we could directly filter on, but since we need the time of day to be relative # to a specific time zone, there's no way to make that work with the reality of # daylight savings time. For example, the `America/Los_Angeles` time zone has a -07:00 # UTC offset for part of the year and a `America/Los_Angeles` -08:00 UTC offset for # part of the year. In a script we can use Java time zone APIs to handle this correctly. params = { field: field_name, equal_to_any_of: list_of_nanos_of_day_from(value, schema_names.equal_to_any_of), gt: nano_of_day_from(value, schema_names.gt), gte: nano_of_day_from(value, schema_names.gte), lt: nano_of_day_from(value, schema_names.lt), lte: nano_of_day_from(value, schema_names.lte), time_zone: value[schema_names.time_zone] }.compact # If there are no comparison operators, return `nil` instead of a `Clause` so that we avoid # invoking the script for no reason. Note that `field` and `time_zone` will always be in # `params` so we can't just check for an empty hash here. if (params.keys - [:field, :time_zone]).any? BooleanQuery.filter({script: {script: {id: filter_by_time_of_day_script_id, params: params}}}) end end }.freeze end def to_datastore_value(value) case value when ::Array value.map { |v| to_datastore_value(v) } when Schema::EnumValue value.name.to_s else value end end def nano_of_day_from(value, field) local_time = value[field] Support::TimeUtil.nano_of_day_from_local_time(local_time) if local_time end def list_of_nanos_of_day_from(value, field) value[field]&.map { |t| Support::TimeUtil.nano_of_day_from_local_time(t) } end # Counts how many clauses in `bool_query` are required to match for a document to be a search hit. def required_matching_clause_count(bool_query) bool_query.reduce(0) do |count, (occurrence, clauses)| case occurrence when :should # The number of required matching clauses imposed by `:should` depends on the `:minimum_should_match` value. # https://www.elastic.co/guide/en/elasticsearch/reference/8.9/query-dsl-bool-query.html#bool-min-should-match bool_query.fetch(:minimum_should_match) when :minimum_should_match 0 # doesn't have any clauses on its own, just controls how many `:should` clauses are required. else # For all other occurrences, each cluse must match. clauses.size end + count end end end