Class: Informers::TokenClassificationPipeline

Inherits:

Pipeline

Object
Pipeline
Informers::TokenClassificationPipeline

show all

Defined in: lib/informers/pipelines.rb

Instance Method Summary collapse

Methods inherited from Pipeline

#initialize

Constructor Details

This class inherits a constructor from Informers::Pipeline

Instance Method Details

#call(texts, ignore_labels: ["O"], aggregation_strategy: "simple") ⇒ `Object`

# File 'lib/informers/pipelines.rb', line 55

# Classifies every token of the given input(s) and returns the predicted
# labels, optionally merged into entity groups.
#
# @param texts a single input, or an Array of inputs (an Array is treated as a batch)
# @param ignore_labels [Array<String>] tokens whose predicted label is in this list are dropped
# @param aggregation_strategy [String] "simple" merges tokens with group_entities;
#   "none" returns the per-token hashes unchanged
# @return [Array] for batched input, one Array of result hashes per sequence;
#   otherwise the result Array for the single input
# @raise [ArgumentError] if aggregation_strategy is neither "simple" nor "none"
def call(
  texts,
  ignore_labels: ["O"],
  aggregation_strategy: "simple"
)
  # Fail fast: reject an unsupported strategy before paying for tokenization
  # and model inference (and regardless of how many sequences come back).
  unless %w[simple none].include?(aggregation_strategy)
    raise ArgumentError, "Invalid aggregation_strategy"
  end

  is_batched = texts.is_a?(Array)

  # Run tokenization (always on an Array so the model sees a batch)
  model_inputs = @tokenizer.(is_batched ? texts : [texts],
    padding: true,
    truncation: true,
    return_offsets: true
  )

  # Run model
  outputs = @model.(model_inputs)

  logits = outputs.logits
  id2label = @model.config.id2label

  to_return = []
  logits.length.times do |i|
    ids = model_inputs[:input_ids][i]
    batch = logits[i]
    offsets = model_inputs[:offsets][i]

    # List of tokens that aren't ignored
    tokens = []
    batch.length.times do |j|
      token_data = batch[j]
      # Utils.max is used here as returning a pair whose second element is the index
      top_score_index = Utils.max(token_data)[1]

      # Fall back to a synthetic label when the model config has no id2label mapping
      entity = id2label ? id2label[top_score_index.to_s] : "LABEL_#{top_score_index}"

      # We predicted a token that should be ignored. So, we skip it.
      next if ignore_labels.include?(entity)

      # TODO add option to keep special tokens?
      word = @tokenizer.decode([ids[j]], skip_special_tokens: true)

      # Was a special token. So, we skip it.
      next if word == ""

      # Softmax is only computed for tokens that are actually kept
      scores = Utils.softmax(token_data)

      tokens << {
        entity: entity,
        score: scores[top_score_index],
        index: j,
        word: word,
        start: offsets[j][0],
        end: offsets[j][1]
      }
    end

    # The strategy was validated above, so only "simple" needs any work here.
    tokens = group_entities(tokens) if aggregation_strategy == "simple"

    to_return << tokens
  end
  is_batched ? to_return : to_return[0]
end

#get_tag(entity_name) ⇒ `Object`

# File 'lib/informers/pipelines.rb', line 142

# Splits a label into its B/I marker and the bare tag.
#
# @param entity_name [String] a label such as "B-PER", "I-LOC" or "MISC"
# @return [Array(String, String)] the marker ("B" or "I") followed by the tag;
#   labels without a "B-"/"I-" prefix are reported as "I" (continuation)
#   with the whole label as the tag
def get_tag(entity_name)
  marker = %w[B I].find { |candidate| entity_name.start_with?("#{candidate}-") }

  # Recognised prefix: drop the two-character "X-" lead-in to get the tag.
  return [marker, entity_name[2..]] if marker

  # It's not in B-, I- format. Default to I- for continuation.
  ["I", entity_name]
end

#group_entities(entities) ⇒ `Object`

# File 'lib/informers/pipelines.rb', line 158

# Merges consecutive token entities that belong to the same tag into groups.
#
# A token continues the current group when its tag (as reported by get_tag)
# equals the tag of the previous token and it is not a "B" (begin) token;
# otherwise a new group is started. Each finished run is collapsed with
# group_sub_entities.
#
# @param entities [Array<Hash>] token hashes carrying at least an :entity key
# @return [Array] one group_sub_entities result per run, in input order
def group_entities(entities)
  runs = entities.slice_when do |previous, current|
    marker, tag = get_tag(current[:entity])
    _previous_marker, previous_tag = get_tag(previous[:entity])

    # Break the run unless the tag is unchanged and the token is not B-type
    # (two B-type entities must never be merged).
    !(tag == previous_tag && marker != "B")
  end

  runs.map { |run| group_sub_entities(run) }
end

#group_sub_entities(entities) ⇒ `Object`

# File 'lib/informers/pipelines.rb', line 126

# Collapses a run of token entities into a single entity-group hash.
#
# @param entities [Array<Hash>] non-empty run of token hashes with
#   :entity, :score, :word, :start and :end keys
# @return [Hash] with :entity_group (the first token's label without its
#   leading "X-" part), :score (mean of the token scores), :word (tokens
#   joined by the tokenizer), :start (of the first token) and :end (of the
#   last token)
def group_sub_entities(entities)
  first_token = entities[0]
  last_token = entities[-1]

  # The group is named after the first token; strip everything up to the first dash.
  label = first_token[:entity].split("-", 2).last

  token_scores = entities.map { |token| token[:score] }
  words = entities.map { |token| token[:word] }
  mean_score = token_scores.sum / token_scores.length.to_f

  {
    entity_group: label,
    score: mean_score,
    word: @tokenizer.convert_tokens_to_string(words),
    start: first_token[:start],
    end: last_token[:end]
  }
end

Class: Informers::TokenClassificationPipeline

Instance Method Summary collapse

Methods inherited from Pipeline

Constructor Details

Instance Method Details

#call(texts, ignore_labels: ["O"], aggregation_strategy: "simple") ⇒ Object

#get_tag(entity_name) ⇒ Object

#group_entities(entities) ⇒ Object

#group_sub_entities(entities) ⇒ Object

#call(texts, ignore_labels: ["O"], aggregation_strategy: "simple") ⇒ `Object`

#get_tag(entity_name) ⇒ `Object`

#group_entities(entities) ⇒ `Object`

#group_sub_entities(entities) ⇒ `Object`