Method: Informers::TokenClassificationPipeline#group_entities

Defined in:
lib/informers/pipelines.rb

#group_entities(entities) ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/informers/pipelines.rb', line 192

# Partitions a sequence of token-level entities into runs of consecutive
# entities that belong together, and aggregates each run with
# +group_sub_entities+.
#
# An entity continues the current run when its tag (as returned by
# +get_tag+) equals the tag of the entity immediately before it and its
# own prefix is not "B". Otherwise the current run is closed and a new one
# starts with this entity.
#
# @param entities [#each] items that each expose a label under +:entity+
# @return [Array] one +group_sub_entities+ result per run, in input order
def group_entities(entities)
  grouped = []
  pending = []

  entities.each do |current|
    unless pending.empty?
      # Compare against the most recent member of the open run; the
      # previous entity's prefix is irrelevant, so only its tag is kept.
      prefix, label = get_tag(current[:entity])
      _prev_prefix, prev_label = get_tag(pending.last[:entity])

      # A "B" prefix always opens a new run, even when the tag matches.
      continues_run = label == prev_label && prefix != "B"
      unless continues_run
        grouped << group_sub_entities(pending)
        pending = []
      end
    end

    pending << current
  end

  # Flush the run that was still open when the input ended.
  grouped << group_sub_entities(pending) if pending.any?

  grouped
end