Module: ElasticGraph::GraphQL::Filtering

Defined in: lib/elastic_graph/graphql/filtering/field_path.rb,
lib/elastic_graph/graphql/filtering/range_query.rb,
lib/elastic_graph/graphql/filtering/boolean_query.rb,
lib/elastic_graph/graphql/filtering/filter_interpreter.rb,
lib/elastic_graph/graphql/filtering/filter_args_translator.rb,
lib/elastic_graph/graphql/filtering/filter_value_set_extractor.rb

Defined Under Namespace

Classes: BooleanQuery, FilterArgsTranslator, FilterValueSetExtractor, RangeQuery

Constant Summary collapse

FieldPath = Tracks state related to field paths as we traverse our filtering data structure in order to translate it to its Elasticsearch/OpenSearch form. Instances of this class are immutable -- callers must use the provided APIs (`+`, `counts_path`, `nested`) to get back new instances with state changes applied.

::Data.define(
  # The path from the overall document root.
  :from_root,
  # The path from the current parent document. Usually `from_parent` and `from_root` are the same,
  # but they'll be different when we encounter a list field indexed using the `nested` mapping type.
  # When we're traversing a subfield of a `nested` field, `from_root` will contain the full path from
  # the original, overall document root, while `from_parent` will contain the path from the current
  # nested document's root.
  :from_parent
) do
  # @implements FieldPath

  # Builds an empty instance.
  def self.empty
    # Nothing has been traversed yet, so both paths start out as (distinct) empty arrays.
    root_parts = []
    parent_parts = []
    new(root_parts, parent_parts)
  end

  def self.of(parts)
    # Builds an instance whose root path and parent path are both `parts`
    # (the very same array object is used for both members).
    from_root = from_parent = parts
    new(from_root, from_parent)
  end

  # Used when we encounter a `nested` field to restart the `from_parent` path (while preserving the `from_root` path).
  def nested
    # The root path is carried over untouched; only the parent-relative path starts over.
    restarted_parent_path = []
    FieldPath.new(from_root, restarted_parent_path)
  end

  # Creates a new instance with `other` appended to both the `from_root` and `from_parent` paths.
  def +(other)
    # `other` is appended as a single path part to each path; the receiver's arrays are not mutated.
    extended_root, extended_parent = [from_root, from_parent].map { |parts| parts + [other] }
    FieldPath.new(extended_root, extended_parent)
  end

  # Converts the current paths to what they need to be to be able to query our hidden `__counts` field (which
  # is a map containing the counts of elements of every list field on the document). The `__counts` field
  # sits at the root of every document (for both an overall root document and a `nested` document). Here's an
  # example (which assumes `seasons` and `seasons.players` fields which are both `nested` and an `awards` field
  # which is a list of strings). Given a filter like this:
  #
  # filter: {seasons: {any_satisfy: {players: {any_satisfy: {results: {awards: {count: {gt: 1}}}}}}}}
  #
  # ...after processing the `awards` key, our `FieldPath` will be:
  #
  # FieldPath.new(["seasons", "players", "results", "awards"], ["results", "awards"])
  #
  # When we then reach the `count` sub field and `counts_path` is called on it, the following will be returned:
  #
  # FieldPath.new(["seasons", "players", LIST_COUNTS_FIELD, "results|awards"], [LIST_COUNTS_FIELD, "results|awards"])
  #
  # This gives us what we want:
  # - The path from the root is `seasons.players.__counts.results|awards`.
  # - The path from the (nested) parent is `__counts.results|awards`.
  #
  # Note that our `__counts` field is a flat map which uses `|` (the `LIST_COUNTS_FIELD_PATH_KEY_SEPARATOR` character)
  # to separate its parts (hence, it's `results|awards` instead of `results.awards`).
  def counts_path
    # `from_parent` is the tail of `from_root`; whatever precedes it is the path to the (possibly nested)
    # document that holds the `__counts` map.
    #
    # The end index is computed from the lengths rather than written as `-from_parent.size`: when
    # `from_parent` is empty, `0...-0` is `0...0`, which would silently discard the entire root prefix.
    prefix_length = from_root.size - from_parent.size
    from_root_to_parent_of_counts_field = from_root[0...prefix_length] # : ::Array[::String]

    # The counts map is flat, so the parent-relative path collapses into a single key.
    counts_sub_field = [LIST_COUNTS_FIELD, from_parent.join(LIST_COUNTS_FIELD_PATH_KEY_SEPARATOR)]

    FieldPath.new(from_root_to_parent_of_counts_field + counts_sub_field, counts_sub_field)
  end
end

FilterInterpreter = Contains all query logic related to filtering. Not tested directly; tests drive the `Query` interface instead. For more info on how this works, see: https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html and https://www.elastic.co/blog/lost-in-translation-boolean-operations-and-filters-in-the-bool-query

Support::MemoizableData.define(:runtime_metadata, :schema_names, :logger) do
  # @implements FilterInterpreter

  def initialize(runtime_metadata:, logger:)
    # Callers only supply `runtime_metadata` and `logger`; the `schema_names` member is
    # derived from the runtime metadata rather than being passed in.
    element_names = runtime_metadata.schema_element_names

    super(runtime_metadata: runtime_metadata, schema_names: element_names, logger: logger)
  end

  # Builds a datastore query from the given collection of filter hashes.
  #
  # Returns `nil` if there are no query clauses, to make it easy for a caller to `compact` out
  # `query: {}` in a larger search request body.
  #
  # https://www.elastic.co/guide/en/elasticsearch/reference/8.11/query-dsl.html
  def build_query(filter_hashes, from_field_path: FieldPath.empty)
    # Every filter hash contributes its clauses to one shared bool node, so they are all ANDed together.
    # `build_bool_hash` yields `nil` when no clause was added.
    build_bool_hash do |bool_node|
      filter_hashes.each { |filter_hash| process_filter_hash(bool_node, filter_hash, from_field_path) }
    end
  end

  def to_s
    # Printing `runtime_metadata` and `logger` in full would be large and noisy. Showing only their hashes
    # is enough to see at a glance whether two `FilterInterpreter` instances are equal and, if they are not,
    # which of the two initialize args is responsible for the difference.
    runtime_metadata_hash = runtime_metadata.hash
    logger_hash = logger.hash

    "#<data #{FilterInterpreter.name} runtime_metadata=(hash: #{runtime_metadata_hash}) logger=(hash: #{logger_hash})>"
  end
  alias_method :inspect, :to_s

  private

  def process_filter_hash(bool_node, filter_hash, field_path)
    filter_hash.each do |key, raw_expression|
      # `nil` predicates are meant to be ignored. Dropping them once here keeps every
      # branch below from having to deal with `nil` values itself.
      expression = raw_expression.is_a?(::Hash) ? raw_expression.compact : raw_expression
      expression_type = identify_expression_type(key, expression)

      case expression_type
      when :empty then next # an "empty" predicate contributes nothing to the query
      when :not then process_not_expression(bool_node, expression, field_path)
      when :list_any_filter then process_list_any_filter_expression(bool_node, expression, field_path)
      when :any_of then process_any_of_expression(bool_node, expression, field_path)
      when :all_of then process_all_of_expression(bool_node, expression, field_path)
      when :operator then process_operator_expression(bool_node, key, expression, field_path)
      when :list_count then process_list_count_expression(bool_node, expression, field_path)
      when :sub_field
        # `key` is a field name here, so it extends the path before recursing.
        process_sub_field_expression(bool_node, expression, field_path + key)
      else
        logger.warn("Ignoring unknown filtering operator (#{key}: #{expression.inspect}) on field `#{field_path.from_root.join(".")}`")
      end
    end
  end

  def identify_expression_type(field_or_op, expression)
    # The checks are ordered: the first one that matches decides the type.
    if expression.nil? || expression == {}
      :empty
    elsif field_or_op == schema_names.not
      :not
    elsif field_or_op == schema_names.any_satisfy
      :list_any_filter
    elsif field_or_op == schema_names.all_of
      :all_of
    elsif field_or_op == schema_names.any_of
      :any_of
    elsif filter_operators.key?(field_or_op)
      :operator
    elsif field_or_op == LIST_COUNTS_FIELD
      :list_count
    elsif expression.is_a?(::Hash)
      # Any other key with a hash value is treated as a field name with sub-filters.
      :sub_field
    else
      :unknown
    end
  end

  # Indicates if the given `expression` applies filtering to subfields or just applies
  # operators at the current field path.
  def filters_on_sub_fields?(expression)
    expression.any? do |key, nested_expression|
      expression_type = identify_expression_type(key, nested_expression)

      # A field name at this level means the filter reaches into a sub field.
      next true if expression_type == :sub_field

      # `not` and `any_satisfy` wrap a single filter hash: look inside it.
      next filters_on_sub_fields?(nested_expression) if expression_type == :not || expression_type == :list_any_filter

      # `any_of` and `all_of` are the only cases that wrap an *array* of filter hashes. One member that
      # filters on sub fields is enough, regardless of whether the members are ORed or ANDed.
      if expression_type == :any_of || expression_type == :all_of
        next nested_expression.any? { |member| filters_on_sub_fields?(member) }
      end

      # :empty, :operator, :unknown and :list_count all apply at the current field path.
      false
    end
  end

  def process_not_expression(bool_node, expression, field_path)
    # Build the predicates under `not` on their own bool node so they can be negated (or un-negated) as a unit.
    sub_filter = build_bool_hash do |inner_node|
      process_filter_hash(inner_node, expression, field_path)
    end

    # An "empty" predicate under `not` contributes nothing.
    return unless sub_filter

    inner_bool = sub_filter.fetch(:bool)

    # Prevent any negated filters from being unnecessarily double-negated by
    # converting them to a positive filter (i.e., !!A == A).
    inner_bool.fetch(:must_not, []).each do |negated_clause|
      negated_bool = negated_clause[:bool]

      if negated_bool.nil? || negated_bool.key?(:should)
        # An OR-group (`should` + the integer `minimum_should_match`) cannot be flattened into `bool_node`:
        # `concat` would raise on the integer, and its `should` clauses would get mixed with any unrelated
        # `should` clauses already on `bool_node`. Keep the clause intact as a single required filter instead.
        bool_node[:filter] << negated_clause
      else
        # Pull clauses up to current bool_node to remove negation
        negated_bool.each { |occurrence, clauses| bool_node[occurrence].concat(clauses) }
      end
    end

    # Don't drop any other filters! Let's negate them now.
    other_filters = inner_bool.except(:must_not)
    bool_node[:must_not] << {bool: other_filters} unless other_filters.empty?
  end

  # There are two cases for `any_satisfy`, each of which is handled differently:
  #
  # - List-of-scalars
  # - List-of-nested-objects
  #
  # We can detect which it is by checking `filter` to see if it filters on any subfields.
  # If so, we know the filter is being applied to a `nested` list field. We can count on
  # this because we do not generate `any_satisfy` filters on `object` list fields (instead,
  # they get generated on their leaf fields).
  def process_list_any_filter_expression(bool_node, filter, field_path)
    # A filter that reaches into sub fields can only be targeting a `nested` list of objects.
    if filters_on_sub_fields?(filter)
      return process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
    end

    # Otherwise the predicates apply directly to the elements of a list of scalars.
    process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
  end

  def process_any_satisfy_filter_expression_on_nested_object_list(bool_node, filter, field_path)
    # The sub-filter is built relative to the nested document, so the parent-relative path restarts here
    # while the path from the overall root is preserved.
    sub_filter = build_bool_hash do |inner_node|
      process_filter_hash(inner_node, filter, field_path.nested)
    end

    # If every predicate under the nested field turned out to be "empty", there is nothing to filter on.
    # Skip the clause rather than emitting a `nested` query whose `query` is `nil`.
    return unless sub_filter

    bool_node[:filter] << {nested: {path: field_path.from_root.join("."), query: sub_filter}}
  end

  # On a list-of-leaf-values field, `any_satisfy` doesn't _do_ anything: it just expresses
  # the fact that documents with any list element values matching the predicates will match
  # the overall filter.
  def process_any_satisfy_filter_expression_on_scalar_list(bool_node, filter, field_path)
    # Nothing to do when all of the predicates were "empty".
    return unless (processed = build_bool_hash { |node| process_filter_hash(node, filter, field_path) })

    processed_bool_query = processed.fetch(:bool)

    # The semantics we want for `any_satisfy` are that it matches when a value exists in the list that
    # satisfies all of the provided subfilter. That's the semantics the datastore provides when the bool
    # query only requires one clause to match, but if multiple clauses are required to match there's a subtle
    # issue. A document matches so long as each required clause matches *some* value, but it doesn't require
    # that they all match the *same* value. The list field on a document could contain N values, where
    # each value matches a different one of the required clauses, and the document will be a search hit.
    #
    # Rather than behaving in a surprising way here, we'd rather disallow a filter that has multiple required
    # clauses, so we return an error in this case.
    if required_matching_clause_count(processed_bool_query) > 1
      formatted_filter = Support::GraphQLFormatter.serialize(
        {schema_names.any_satisfy => filter},
        wrap_hash_with_braces: false
      )

      raise ::GraphQL::ExecutionError, "`#{formatted_filter}` is not supported because it produces " \
        "multiple filtering clauses under `#{schema_names.any_satisfy}`, which doesn't work as expected. " \
        "Remove one or more of your `#{schema_names.any_satisfy}` predicates and try again."
    elsif processed_bool_query.key?(:should)
      # The sub-filter is an OR-group (`should` plus `minimum_should_match`). It must stay a self-contained
      # clause: merging its `should` list into `bool_node` would pool it with any other `should` clauses
      # there (turning "(a OR b) AND (c OR d)" into "a OR b OR c OR d") and would add up the integer
      # `minimum_should_match` values.
      bool_node[:filter] << processed
    else
      # A single required clause can safely be merged into the clauses already collected on `bool_node`.
      bool_node.update(processed_bool_query) do |_, existing_clauses, any_satisfy_clauses|
        existing_clauses + any_satisfy_clauses
      end
    end
  end

  def process_any_of_expression(bool_node, expressions, field_path)
    # Each alternative is built on its own bool node; alternatives that turn out to be "empty"
    # (`build_bool_hash` returned `nil`) are dropped.
    alternatives = expressions.filter_map do |alternative|
      build_bool_hash { |alternative_node| process_filter_hash(alternative_node, alternative, field_path) }
    end

    # With no alternatives left, the semantics we want is to match no documents. An empty array under
    # `should` would not give us that, so we hand the datastore a filter that is false for every document.
    bool_query =
      if alternatives.empty?
        BooleanQuery::ALWAYS_FALSE_FILTER
      else
        BooleanQuery.should(*alternatives)
      end

    bool_query.merge_into(bool_node)
  end

  def process_all_of_expression(bool_node, expressions, field_path)
    # `all_of` is an AND, and ANDing is what `process_filter_hash` already does when several filters
    # are added to one bool node. So each member is simply processed onto the same `bool_node`.
    expressions.each { |member| process_filter_hash(bool_node, member, field_path) }
  end

  def process_operator_expression(bool_node, operator, expression, field_path)
    # `operator` names a filtering operator and `expression` is the value it is applied with.
    # `fetch` raises `KeyError` if no applicator is registered under that name.
    op_applicator = filter_operators.fetch(operator)
    field_name = field_path.from_root.join(".")

    # An applicator may return `nil` to signal that there is nothing to add to the query.
    op_applicator.call(field_name, expression)&.merge_into(bool_node)
  end

  def process_sub_field_expression(bool_node, expression, field_path)
    # `expression` is a hash of filters to apply at `field_path` (the caller has already added the
    # field name to the path), and normally we just process that hash recursively onto `bool_node`.
    #
    # However, if the hash has `any_of` in it, then we need to process the filter hash on
    # a nested bool node instead of on the `bool_node` we are already operating on.
    #
    # To understand why, first consider a filter that has no `any_of` but does use field nesting:
    #
    # filter: {
    #   weight: {lt: 2000},
    #   cost: {
    #     currency: {equal_to_any_of: ["USD"]}
    #     amount: {gt: 1000}
    #   }
    # }
    #
    # While `currency` and `amount` are expressed as sub-filters under `cost` in our GraphQL
    # syntax, we do not actually need to create a nested bool node structure for the datastore
    # query. We get a flat filter structure like this:
    #
    # {bool: {filter: [
    #   {range: {"weight": {lt: 2000}}},
    #   {terms: {"cost.currency": ["USD"]}},
    #   {range: {"cost.amount": {gt: 1000}}}
    # ]}}
    #
    # The 3 filter conditions are ANDed together as a single list under `filter`.
    # The nested field structure gets flattened using a dot-separated path.
    #
    # Now consider a filter that has multiple `any_of` sub-expressions:
    #
    # filter: {
    #   weight: {any_of: [
    #     {gt: 9000},
    #     {lt: 2000}
    #   ]},
    #   cost: {any_of: [
    #     {currency: {equal_to_any_of: ["USD"]}},
    #     {amount: {gt: 1000}}
    #   ]}
    # }
    #
    # If we did not make a nested structure, we would wind up with a single list of sub-expressions
    # that are OR'd together:
    #
    # {bool: {filter: [{bool: {should: [
    #   {range: {"weight": {gt: 9000}}},
    #   {range: {"weight": {lt: 2000}}},
    #   {terms: {"cost.currency": ["USD"]}},
    #   {range: {"cost.amount": {gt: 1000}}}
    # ]}}]}}
    #
    # ...but that's clearly wrong. By creating a nested bool node based on the presence of `any_of`,
    # we can instead produce a structure like this:
    #
    # {bool: {filter: [
    #   {bool: {should: [
    #     {range: {"weight": {gt: 9000}}},
    #     {range: {"weight": {lt: 2000}}}
    #   ]}},
    #   {bool: {should: [
    #     {terms: {"cost.currency": ["USD"]}},
    #     {range: {"cost.amount": {gt: 1000}}}
    #   ]}}
    # ]}}
    #
    # ...which will actually work correctly.
    unless expression.key?(schema_names.any_of)
      return process_filter_hash(bool_node, expression, field_path)
    end

    isolated_bool = build_bool_hash { |inner_node| process_filter_hash(inner_node, expression, field_path) }
    bool_node[:filter] << isolated_bool
  end

  def process_list_count_expression(bool_node, expression, field_path)
    # Most list count expressions need no special treatment, e.g.:
    #
    # filter: {tags: {count: {gt: 2}}}
    #
    # The exception is an expression that could match a count of 0 (that is, one that does not
    # exclude zero), such as:
    #
    # filter: {tags: {count: {lt: 1}}}
    #
    # A count of 0 is equivalent to the list field not existing. While we index an explicit count of 0,
    # the count field will be missing from documents indexed before the list field was defined on the
    # ElasticGraph schema. To properly match those documents, the expression is turned into an OR
    # (using `any_of`) that also matches documents lacking the count field entirely.
    count_expression =
      if excludes_zero?(expression)
        expression
      else
        {schema_names.any_of => [expression, {schema_names.equal_to_any_of => [nil]}]}
      end

    process_sub_field_expression(bool_node, count_expression, field_path.counts_path)
  end

  def build_bool_hash(&block)
    # The block receives a hash that auto-creates an empty clause list for any occurrence it touches.
    clauses_by_occurrence = Hash.new { |hash, occurrence| hash[occurrence] = [] }
    yield clauses_by_occurrence

    # Returning `nil` is how "empty" filter predicates end up being ignored.
    return nil if clauses_by_occurrence.empty?

    # According to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#bool-min-should-match,
    # `minimum_should_match` defaults to 1 only if the bool query has at least one should clause and no must or
    # filter clauses; otherwise it defaults to 0. We want should clauses to be required even alongside musts
    # and filters, so it is set to 1 explicitly whenever there are should clauses.
    clauses_by_occurrence[:minimum_should_match] = 1 if clauses_by_occurrence.key?(:should)

    {bool: clauses_by_occurrence}
  end

  # Determines if the given filter expression excludes the value `0`.
  def excludes_zero?(expression)
    # The predicates are ANDed, so a single predicate that rules out 0 is enough.
    expression.any? do |operator, operand|
      case operator
      when schema_names.equal_to_any_of
        !operand.include?(0)
      when schema_names.lt
        operand <= 0
      when schema_names.lte
        operand < 0
      when schema_names.gt
        operand >= 0
      when schema_names.gte
        operand > 0
      else
        # :nocov: -- all operators are covered above. But simplecov complains about an implicit `else` branch being uncovered, so here we've defined it to wrap it with `:nocov:`.
        false
        # :nocov:
      end
    end
  end

  def filter_operators
    # The operator table is built on first use and then reused.
    return @filter_operators if @filter_operators

    @filter_operators = build_filter_operators(runtime_metadata)
  end

  # Builds the frozen table that maps each filter operator name (taken from the schema element names in
  # `runtime_metadata`) to a lambda. Each lambda takes `(field_name, value)` and returns the query object
  # that gets merged into the bool node (or `nil`, in the `time_of_day` case, when there is nothing to filter on).
  def build_filter_operators(runtime_metadata)
    schema_names = runtime_metadata.schema_element_names

    # Looked up eagerly; `fetch` raises if the script is not registered.
    filter_by_time_of_day_script_id = runtime_metadata
      .static_script_ids_by_scoped_name
      .fetch("filter/by_time_of_day")

    {
      schema_names.equal_to_any_of => ->(field_name, value) {
        # `nil` entries are handled separately below (via `exists`), so they are removed from the value list.
        values = to_datastore_value(value.compact.uniq) # : ::Array[untyped]

        equality_sub_expression =
          if field_name == "id"
            # Use specialized "ids" query when querying on ID field.
            # See: https://www.elastic.co/guide/en/elasticsearch/reference/7.15/query-dsl-ids-query.html
            #
            # We reject empty strings because we otherwise get an error from the datastore:
            # "failed to create query: Ids can't be empty"
            {ids: {values: values - [""]}}
          else
            {terms: {field_name => values}}
          end

        exists_sub_expression = {exists: {"field" => field_name}}

        if !value.empty? && value.all?(&:nil?)
          # Only `nil` was requested: match documents where the field does not exist.
          BooleanQuery.new(:must_not, [{bool: {filter: [exists_sub_expression]}}])
        elsif value.include?(nil)
          # A mix of `nil` and real values: match either one of the values or a missing field.
          BooleanQuery.filter({bool: {
            minimum_should_match: 1,
            should: [
              {bool: {filter: [equality_sub_expression]}},
              {bool: {must_not: [{bool: {filter: [exists_sub_expression]}}]}}
            ]
          }})
        else
          BooleanQuery.filter(equality_sub_expression)
        end
      },
      # The four comparison operators all map directly onto a range query.
      schema_names.gt => ->(field_name, value) { RangeQuery.new(field_name, :gt, value) },
      schema_names.gte => ->(field_name, value) { RangeQuery.new(field_name, :gte, value) },
      schema_names.lt => ->(field_name, value) { RangeQuery.new(field_name, :lt, value) },
      schema_names.lte => ->(field_name, value) { RangeQuery.new(field_name, :lte, value) },
      # The text matching operators are added under `must` (rather than `filter`).
      schema_names.matches => ->(field_name, value) { BooleanQuery.must({match: {field_name => value}}) },
      schema_names.matches_query => ->(field_name, value) do
        allowed_edits_per_term = value.fetch(schema_names.allowed_edits_per_term).runtime_metadata.datastore_abbreviation

        BooleanQuery.must(
          {
            match: {
              field_name => {
                query: value.fetch(schema_names.query),
                # This is always a string field, even though the value is often an integer
                fuzziness: allowed_edits_per_term.to_s,
                operator: value[schema_names.require_all_terms] ? "AND" : "OR"
              }
            }
          }
        )
      end,
      schema_names.matches_phrase => ->(field_name, value) {
        BooleanQuery.must(
          {
            match_phrase_prefix: {
              field_name => {
                query: value.fetch(schema_names.phrase)
              }
            }
          }
        )
      },

      # This filter operator wraps a geo distance query:
      # https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl-geo-distance-query.html
      schema_names.near => ->(field_name, value) do
        unit_abbreviation = value.fetch(schema_names.unit).runtime_metadata.datastore_abbreviation

        BooleanQuery.filter({geo_distance: {
          "distance" => "#{value.fetch(schema_names.max_distance)}#{unit_abbreviation}",
          field_name => {
            "lat" => value.fetch(schema_names.latitude),
            "lon" => value.fetch(schema_names.longitude)
          }
        }})
      end,

      schema_names.time_of_day => ->(field_name, value) do
        # To filter on time of day, we use the `filter/by_time_of_day` script. We accomplish
        # this with a script because Elasticsearch/OpenSearch do not support this natively, and it's
        # incredibly hard to implement correctly with respect to time zones without using a
        # script. We considered indexing the `time_of_day` as a separate index field
        # that we could directly filter on, but since we need the time of day to be relative
        # to a specific time zone, there's no way to make that work with the reality of
        # daylight savings time. For example, the `America/Los_Angeles` time zone has a -07:00
        # UTC offset for part of the year and a -08:00 UTC offset for the rest of the year.
        # In a script we can use Java time zone APIs to handle this correctly.
        params = {
          field: field_name,
          equal_to_any_of: list_of_nanos_of_day_from(value, schema_names.equal_to_any_of),
          gt: nano_of_day_from(value, schema_names.gt),
          gte: nano_of_day_from(value, schema_names.gte),
          lt: nano_of_day_from(value, schema_names.lt),
          lte: nano_of_day_from(value, schema_names.lte),
          time_zone: value[schema_names.time_zone]
        }.compact

        # If there are no comparison operators, return `nil` instead of a `BooleanQuery` so that we avoid
        # invoking the script for no reason. Note that `field` and `time_zone` will always be in
        # `params` so we can't just check for an empty hash here.
        if (params.keys - [:field, :time_zone]).any?
          BooleanQuery.filter({script: {script: {id: filter_by_time_of_day_script_id, params: params}}})
        end
      end
    }.freeze
  end

  def to_datastore_value(value)
    # Arrays are converted element by element (recursively); enum values are converted to the
    # string form of their name; anything else is passed through unchanged.
    case value
    in ::Array then value.map { |element| to_datastore_value(element) }
    in Schema::EnumValue then value.name.to_s
    else value
    end
  end

  def nano_of_day_from(value, field)
    # Returns `nil` when nothing was provided for `field`.
    return unless (local_time = value[field])

    Support::TimeUtil.nano_of_day_from_local_time(local_time)
  end

  def list_of_nanos_of_day_from(value, field)
    # Returns `nil` when no list was provided for `field`; otherwise converts every entry.
    local_times = value[field]
    return if local_times.nil?

    local_times.map { |local_time| Support::TimeUtil.nano_of_day_from_local_time(local_time) }
  end

  # Counts how many clauses in `bool_query` are required to match for a document to be a search hit.
  def required_matching_clause_count(bool_query)
    bool_query.sum do |occurrence, clauses|
      case occurrence
      when :should
        # The number of required matching clauses imposed by `:should` depends on the `:minimum_should_match` value.
        # https://www.elastic.co/guide/en/elasticsearch/reference/8.9/query-dsl-bool-query.html#bool-min-should-match
        bool_query.fetch(:minimum_should_match)
      when :minimum_should_match
        # Has no clauses of its own; it only controls how many `:should` clauses are required.
        0
      else
        # For all other occurrences, each clause must match.
        clauses.size
      end
    end
  end
end