Module: Informers

Defined in:
lib/informers.rb,
lib/informers/env.rb,
lib/informers/model.rb,
lib/informers/models.rb,
lib/informers/configs.rb,
lib/informers/version.rb,
lib/informers/pipelines.rb,
lib/informers/utils/hub.rb,
lib/informers/tokenizers.rb,
lib/informers/utils/core.rb,
lib/informers/utils/math.rb,
lib/informers/utils/tensor.rb

Defined Under Namespace

Modules: Utils
Classes: AutoConfig, AutoModel, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer, BertForSequenceClassification, BertForTokenClassification, BertModel, BertPreTrainedModel, BertTokenizer, DebertaV2Model, DebertaV2PreTrainedModel, DebertaV2Tokenizer, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DistilBertModel, DistilBertPreTrainedModel, DistilBertTokenizer, EmbeddingPipeline, Error, FeatureExtractionPipeline, MPNetModel, MPNetPreTrainedModel, MPNetTokenizer, Model, ModelOutput, NomicBertModel, NomicBertPreTrainedModel, Pipeline, PreTrainedModel, PreTrainedTokenizer, PretrainedConfig, PretrainedMixin, QuestionAnsweringModelOutput, QuestionAnsweringPipeline, RerankingPipeline, RobertaTokenizer, SequenceClassifierOutput, TextClassificationPipeline, Todo, TokenClassificationPipeline, TokenClassifierOutput, XLMRobertaForSequenceClassification, XLMRobertaModel, XLMRobertaPreTrainedModel, XLMRobertaTokenizer

Constant Summary collapse

CACHE_HOME =
ENV.fetch("XDG_CACHE_HOME", File.join(ENV.fetch("HOME"), ".cache"))
DEFAULT_CACHE_DIR =
File.expand_path(File.join(CACHE_HOME, "informers"))
MODEL_TYPES =
{
  EncoderOnly: 0,
  EncoderDecoder: 1,
  Seq2Seq: 2,
  Vision2Seq: 3,
  DecoderOnly: 4,
  MaskGeneration: 5
}
MODEL_TYPE_MAPPING =
# NOTE: These will be populated fully later
{}
MODEL_NAME_TO_CLASS_MAPPING =
{}
MODEL_CLASS_TO_NAME_MAPPING =
{}
MODEL_MAPPING_NAMES_ENCODER_ONLY =
{
  "bert" => ["BertModel", BertModel],
  "nomic_bert" => ["NomicBertModel", NomicBertModel],
  "deberta-v2" => ["DebertaV2Model", DebertaV2Model],
  "mpnet" => ["MPNetModel", MPNetModel],
  "distilbert" => ["DistilBertModel", DistilBertModel],
  "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel]
}
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES =
{
  "bert" => ["BertForSequenceClassification", BertForSequenceClassification],
  "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification],
  "xlm-roberta" => ["XLMRobertaForSequenceClassification", XLMRobertaForSequenceClassification]
}
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES =
{
  "bert" => ["BertForTokenClassification", BertForTokenClassification]
}
MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES =
{
  "distilbert" => ["DistilBertForQuestionAnswering", DistilBertForQuestionAnswering]
}
MODEL_CLASS_TYPE_MAPPING =
[
  [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]],
  [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]]
]
VERSION =
"1.0.3"
SUPPORTED_TASKS =
{
  "text-classification" => {
    tokenizer: AutoTokenizer,
    pipeline: TextClassificationPipeline,
    model: AutoModelForSequenceClassification,
    default: {
      model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english"
    },
    type: "text"
  },
  "token-classification" => {
    tokenizer: AutoTokenizer,
    pipeline: TokenClassificationPipeline,
    model: AutoModelForTokenClassification,
    default: {
      model: "Xenova/bert-base-multilingual-cased-ner-hrl"
    },
    type: "text"
  },
  "question-answering" => {
    tokenizer: AutoTokenizer,
    pipeline: QuestionAnsweringPipeline,
    model: AutoModelForQuestionAnswering,
    default: {
      model: "Xenova/distilbert-base-cased-distilled-squad"
    },
    type: "text"
  },
  "feature-extraction" => {
    tokenizer: AutoTokenizer,
    pipeline: FeatureExtractionPipeline,
    model: AutoModel,
    default: {
      model: "Xenova/all-MiniLM-L6-v2"
    },
    type: "text"
  },
  "embedding" => {
    tokenizer: AutoTokenizer,
    pipeline: EmbeddingPipeline,
    model: AutoModel,
    default: {
      model: "sentence-transformers/all-MiniLM-L6-v2"
    },
    type: "text"
  },
  "reranking" => {
    tokenizer: AutoTokenizer,
    pipeline: RerankingPipeline,
    model: AutoModel,
    default: {
      model: "mixedbread-ai/mxbai-rerank-base-v1"
    },
    type: "text"
  }
}
TASK_ALIASES =
{
  "sentiment-analysis" => "text-classification",
  "ner" => "token-classification"
}
DEFAULT_PROGRESS_CALLBACK =
# Default reporter for download progress messages: redraws an in-place
# progress line on a TTY, and prints a single completion line otherwise.
# Messages that are neither "progress" nor an uncached "done" are ignored.
->(msg) do
  out = $stderr
  interactive = out.tty?
  # Fall back to 80 columns when stderr is not a terminal.
  columns = interactive ? out.winsize[1] : 80

  case msg[:status]
  when "progress"
    # Overwrite the current line with the updated progress bar (TTY only).
    out.print "\r#{Utils::Hub.display_progress(msg[:file], columns, msg[:size], msg[:total_size])}" if interactive
  when "done"
    # Cache hits produce no output at all.
    unless msg[:cache_hit]
      if interactive
        out.puts
      else
        out.puts Utils::Hub.display_progress(msg[:file], columns, 1, 1)
      end
    end
  end
end
NO_DEFAULT =
Object.new

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.allow_remote_models ⇒ Object

Returns the value of attribute allow_remote_models.



6
7
8
# File 'lib/informers/env.rb', line 6

# Whether models may be fetched from the remote host (reader).
def allow_remote_models = @allow_remote_models

.cache_dir ⇒ Object

Returns the value of attribute cache_dir.



6
7
8
# File 'lib/informers/env.rb', line 6

# Directory where downloaded model files are cached (reader).
def cache_dir = @cache_dir

.remote_host ⇒ Object

Returns the value of attribute remote_host.



6
7
8
# File 'lib/informers/env.rb', line 6

# Base URL of the host models are downloaded from (reader).
def remote_host = @remote_host

.remote_path_template ⇒ Object

Returns the value of attribute remote_path_template.



6
7
8
# File 'lib/informers/env.rb', line 6

# URL path template used when resolving remote model files (reader).
def remote_path_template = @remote_path_template

Class Method Details

.pipeline(task, model = nil, quantized: NO_DEFAULT, progress_callback: DEFAULT_PROGRESS_CALLBACK, config: nil, cache_dir: nil, local_files_only: false, revision: "main", model_file_name: nil) ⇒ Object



431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
# File 'lib/informers/pipelines.rb', line 431

# Builds a ready-to-use pipeline object for the given task.
#
# @param task [String] task name or alias (see TASK_ALIASES), e.g. "embedding"
# @param model [String, nil] model id; when nil, the task's default model is
#   used and a warning is emitted
# @param quantized [Boolean] whether to load quantized weights; defaults to
#   true for all tasks except "embedding" and "reranking"
# @param progress_callback [Proc] invoked with status hashes during download
# @param config [Object, nil] optional model configuration override
# @param cache_dir [String, nil] where downloaded files are cached
# @param local_files_only [Boolean] when true, never hit the network
# @param revision [String] model revision (git ref) to fetch
# @param model_file_name [String, nil] override for the model file name
# @return [Pipeline] an instance of the task's pipeline class
# @raise [Error] if the (resolved) task is not in SUPPORTED_TASKS
def pipeline(
  task,
  model = nil,
  quantized: NO_DEFAULT,
  progress_callback: DEFAULT_PROGRESS_CALLBACK,
  config: nil,
  cache_dir: nil,
  local_files_only: false,
  revision: "main",
  model_file_name: nil
)
  if quantized == NO_DEFAULT
    # TODO move default to task class
    quantized = !["embedding", "reranking"].include?(task)
  end

  # Apply aliases (e.g. "sentiment-analysis" => "text-classification")
  task = TASK_ALIASES[task] || task

  # Get pipeline info, stripping any subtask suffix before the lookup
  # (e.g. "translation_en_to_fr" -> "translation"). NOTE(fix): the previous
  # split("_", 1)[0] was a no-op in Ruby — a limit of 1 returns the whole
  # string — ported from JavaScript where split("_", 1)[0] keeps only the
  # first segment.
  pipeline_info = SUPPORTED_TASKS[task.split("_")[0]]
  if !pipeline_info
    raise Error, "Unsupported pipeline: #{task}. Must be one of #{SUPPORTED_TASKS.keys}"
  end

  # Use model if specified, otherwise, use default
  if !model
    model = pipeline_info[:default][:model]
    warn "No model specified. Using default model: #{model.inspect}."
  end

  pretrained_options = {
    quantized:,
    progress_callback:,
    config:,
    cache_dir:,
    local_files_only:,
    revision:,
    model_file_name:
  }

  classes = {
    tokenizer: pipeline_info[:tokenizer],
    model: pipeline_info[:model],
    processor: pipeline_info[:processor]
  }

  # Load model, tokenizer, and processor (if they exist)
  results = load_items(classes, model, pretrained_options)
  results[:task] = task

  # Special case: this model exports multiple outputs; select the
  # token-level embeddings for the embedding/feature-extraction pipelines.
  if model == "sentence-transformers/all-MiniLM-L6-v2"
    results[:model].instance_variable_set(:@output_names, ["token_embeddings"])
  end

  Utils.dispatch_callback(progress_callback, {
    status: "ready",
    task: task,
    model: model
  })

  pipeline_class = pipeline_info.fetch(:pipeline)
  pipeline_class.new(**results)
end