Module: Informers

Defined in:
lib/informers.rb,
lib/informers/env.rb,
lib/informers/model.rb,
lib/informers/models.rb,
lib/informers/configs.rb,
lib/informers/version.rb,
lib/informers/pipelines.rb,
lib/informers/utils/hub.rb,
lib/informers/tokenizers.rb,
lib/informers/utils/core.rb,
lib/informers/utils/math.rb,
lib/informers/utils/tensor.rb

Defined Under Namespace

Modules: Utils
Classes: AutoConfig, AutoModel, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer, BertForSequenceClassification, BertForTokenClassification, BertModel, BertPreTrainedModel, BertTokenizer, DebertaV2Model, DebertaV2PreTrainedModel, DebertaV2Tokenizer, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertModel, DistilBertPreTrainedModel, DistilBertTokenizer, EmbeddingPipeline, Error, FeatureExtractionPipeline, MPNetModel, MPNetPreTrainedModel, MPNetTokenizer, Model, ModelOutput, NomicBertModel, NomicBertPreTrainedModel, Pipeline, PreTrainedModel, PreTrainedTokenizer, PretrainedConfig, PretrainedMixin, QuestionAnsweringModelOutput, QuestionAnsweringPipeline, RerankingPipeline, RobertaTokenizer, SequenceClassifierOutput, TextClassificationPipeline, Todo, TokenClassificationPipeline, TokenClassifierOutput, XLMRobertaForSequenceClassification, XLMRobertaModel, XLMRobertaPreTrainedModel, XLMRobertaTokenizer

Constant Summary collapse

CACHE_HOME =
ENV.fetch("XDG_CACHE_HOME", File.join(ENV.fetch("HOME"), ".cache"))
DEFAULT_CACHE_DIR =
File.expand_path(File.join(CACHE_HOME, "informers"))
MODEL_TYPES =
{
  EncoderOnly: 0,
  EncoderDecoder: 1,
  Seq2Seq: 2,
  Vision2Seq: 3,
  DecoderOnly: 4,
  MaskGeneration: 5
}
MODEL_TYPE_MAPPING =
# NOTE: These will be populated fully later
{}
MODEL_NAME_TO_CLASS_MAPPING =
{}
MODEL_CLASS_TO_NAME_MAPPING =
{}
MODEL_MAPPING_NAMES_ENCODER_ONLY =
{
  "bert" => ["BertModel", BertModel],
  "nomic_bert" => ["NomicBertModel", NomicBertModel],
  "deberta-v2" => ["DebertaV2Model", DebertaV2Model],
  "mpnet" => ["MPNetModel", MPNetModel],
  "distilbert" => ["DistilBertModel", DistilBertModel],
  "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel]
}
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES =
{
  "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
  "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
  "xlm-roberta" => ["XLMRobertaForSequenceClassification", XLMRobertaForSequenceClassification]
}
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES =
{
  "bert" => ["BertForTokenClassification", BertForTokenClassification]
}
MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES =
{
  "distilbert" => ["DistilBertForQuestionAnswering", DistilBertForQuestionAnswering]
}
MODEL_CLASS_TYPE_MAPPING =
[
  [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
]
VERSION =
"1.0.3"
SUPPORTED_TASKS =
{
  "text-classification" => {
    tokenizer: AutoTokenizer,
    pipeline: TextClassificationPipeline,
    model: AutoModelForSequenceClassification,
    default: {
      model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english"
    },
    type: "text"
  },
  "token-classification" => {
    tokenizer: AutoTokenizer,
    pipeline: TokenClassificationPipeline,
    model: AutoModelForTokenClassification,
    default: {
      model: "Xenova/bert-base-multilingual-cased-ner-hrl"
    },
    type: "text"
  },
  "question-answering" => {
    tokenizer: AutoTokenizer,
    pipeline: QuestionAnsweringPipeline,
    model: AutoModelForQuestionAnswering,
    default: {
      model: "Xenova/distilbert-base-cased-distilled-squad"
    },
    type: "text"
  },
  "feature-extraction" => {
    tokenizer: AutoTokenizer,
    pipeline: FeatureExtractionPipeline,
    model: AutoModel,
    default: {
      model: "Xenova/all-MiniLM-L6-v2"
    },
    type: "text"
  },
  "embedding" => {
    tokenizer: AutoTokenizer,
    pipeline: EmbeddingPipeline,
    model: AutoModel,
    default: {
      model: "sentence-transformers/all-MiniLM-L6-v2"
    },
    type: "text"
  },
  "reranking" => {
    tokenizer: AutoTokenizer,
    pipeline: RerankingPipeline,
    model: AutoModel,
    default: {
      model: "mixedbread-ai/mxbai-rerank-base-v1"
    },
    type: "text"
  }
}
TASK_ALIASES =
{
  "sentiment-analysis" => "text-classification",
  "ner" => "token-classification"
}
DEFAULT_PROGRESS_CALLBACK =
# Default reporter for download progress messages: redraws an in-place
# progress line on a TTY, and prints a single completion line otherwise.
# Messages that are neither "progress" nor an uncached "done" are ignored.
->(msg) do
  out = $stderr
  interactive = out.tty?
  # Fall back to 80 columns when stderr is not a terminal.
  columns = interactive ? out.winsize[1] : 80

  case msg[:status]
  when "progress"
    # Overwrite the current line with the updated progress bar (TTY only).
    out.print "\r#{Utils::Hub.display_progress(msg[:file], columns, msg[:size], msg[:total_size])}" if interactive
  when "done"
    # Cache hits produce no output at all.
    unless msg[:cache_hit]
      if interactive
        out.puts
      else
        out.puts Utils::Hub.display_progress(msg[:file], columns, 1, 1)
      end
    end
  end
end
NO_DEFAULT =
Object.new

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.allow_remote_models ⇒ Object

Returns the value of attribute allow_remote_models.



6
7
8
# File 'lib/informers/env.rb', line 6

# Whether models may be fetched from the remote host (reader).
def allow_remote_models = @allow_remote_models

.cache_dir ⇒ Object

Returns the value of attribute cache_dir.



6
7
8
# File 'lib/informers/env.rb', line 6

# Directory where downloaded model files are cached (reader).
def cache_dir = @cache_dir

.remote_host ⇒ Object

Returns the value of attribute remote_host.



6
7
8
# File 'lib/informers/env.rb', line 6

# Base URL of the host models are downloaded from (reader).
def remote_host = @remote_host

.remote_path_template ⇒ Object

Returns the value of attribute remote_path_template.



6
7
8
# File 'lib/informers/env.rb', line 6

# URL path template used when resolving remote model files (reader).
def remote_path_template = @remote_path_template

Class Method Details

.pipeline(task, model = nil, quantized: NO_DEFAULT, progress_callback: DEFAULT_PROGRESS_CALLBACK, config: nil, cache_dir: nil, local_files_only: false, revision: "main", model_file_name: nil) ⇒ Object



431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
# File 'lib/informers/pipelines.rb', line 431

# Builds a ready-to-use pipeline object for the given task.
#
# @param task [String] task name or alias (see TASK_ALIASES), e.g. "embedding"
# @param model [String, nil] model id; when nil, the task's default model is
#   used and a warning is emitted
# @param quantized [Boolean] whether to load quantized weights; defaults to
#   true for all tasks except "embedding" and "reranking"
# @param progress_callback [Proc] invoked with status hashes during download
# @param config [Object, nil] optional model configuration override
# @param cache_dir [String, nil] where downloaded files are cached
# @param local_files_only [Boolean] when true, never hit the network
# @param revision [String] model revision (git ref) to fetch
# @param model_file_name [String, nil] override for the model file name
# @return [Pipeline] an instance of the task's pipeline class
# @raise [Error] if the (resolved) task is not in SUPPORTED_TASKS
def pipeline(
  task,
  model = nil,
  quantized: NO_DEFAULT,
  progress_callback: DEFAULT_PROGRESS_CALLBACK,
  config: nil,
  cache_dir: nil,
  local_files_only: false,
  revision: "main",
  model_file_name: nil
)
  if quantized == NO_DEFAULT
    # TODO move default to task class
    quantized = !["embedding", "reranking"].include?(task)
  end

  # Apply aliases (e.g. "sentiment-analysis" => "text-classification")
  task = TASK_ALIASES[task] || task

  # Get pipeline info, stripping any subtask suffix before the lookup
  # (e.g. "translation_en_to_fr" -> "translation"). NOTE(fix): the previous
  # split("_", 1)[0] was a no-op in Ruby — a limit of 1 returns the whole
  # string — ported from JavaScript where split("_", 1)[0] keeps only the
  # first segment.
  pipeline_info = SUPPORTED_TASKS[task.split("_")[0]]
  if !pipeline_info
    raise Error, "Unsupported pipeline: #{task}. Must be one of #{SUPPORTED_TASKS.keys}"
  end

  # Use model if specified, otherwise, use default
  if !model
    model = pipeline_info[:default][:model]
    warn "No model specified. Using default model: #{model.inspect}."
  end

  pretrained_options = {
    quantized:,
    progress_callback:,
    config:,
    cache_dir:,
    local_files_only:,
    revision:,
    model_file_name:
  }

  classes = {
    tokenizer: pipeline_info[:tokenizer],
    model: pipeline_info[:model],
    processor: pipeline_info[:processor]
  }

  # Load model, tokenizer, and processor (if they exist)
  results = load_items(classes, model, pretrained_options)
  results[:task] = task

  # Special case: this model exports multiple outputs; select the
  # token-level embeddings for the embedding/feature-extraction pipelines.
  if model == "sentence-transformers/all-MiniLM-L6-v2"
    results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
  end

  Utils.dispatch_callback(progress_callback, {
    status: "ready",
    task: task,
    model: model
  })

  pipeline_class = pipeline_info.fetch(:pipeline)
  pipeline_class.new(**results)
end