Class: Informers::ImageFeatureExtractor

Inherits:

Object
FeatureExtractor
Informers::ImageFeatureExtractor

show all

Defined in: lib/informers/processors.rb

Direct Known Subclasses

CLIPFeatureExtractor, DPTFeatureExtractor, DetrFeatureExtractor, DonutFeatureExtractor, OwlViTFeatureExtractor, Swin2SRImageProcessor, ViTFeatureExtractor

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ `ImageFeatureExtractor`

Returns a new instance of ImageFeatureExtractor.

# File 'lib/informers/processors.rb', line 10

# Builds the extractor from a preprocessor configuration.
#
# After delegating to the superclass constructor (which presumably stores
# `config` in `@config` — confirm against FeatureExtractor), every
# preprocessing switch is read out of `@config` and cached in an instance
# variable.
#
# @param config [Hash] preprocessor settings, keyed by string
def initialize(config)
  super(config)

  # Normalization statistics; the legacy "mean"/"std" keys act as fallbacks.
  @do_normalize = @config["do_normalize"]
  @image_mean = @config["image_mean"] || @config["mean"]
  @image_std = @config["image_std"] || @config["std"]

  # Rescaling is on unless the config has a "do_rescale" entry saying otherwise.
  @do_rescale = @config.fetch("do_rescale", true)
  @rescale_factor = @config["rescale_factor"] || (1 / 255.0)

  # Resizing / thumbnailing.
  @do_resize = @config["do_resize"]
  @do_thumbnail = @config["do_thumbnail"]
  @size = @config["size"]
  @size_divisibility = @config["size_divisibility"] || @config["size_divisor"]
  @resample = @config["resample"] || 2 # 2 => bilinear

  # Cropping and colour handling.
  @do_center_crop = @config["do_center_crop"]
  @crop_size = @config["crop_size"]
  @do_crop_margin = @config["do_crop_margin"]
  @do_convert_rgb = @config.fetch("do_convert_rgb", true)
  @do_flip_channel_order = @config["do_flip_channel_order"] || false

  # Padding.
  @do_pad = @config["do_pad"]
  @pad_size = @config["pad_size"]

  # Padding was requested without an explicit pad size: fall back to the
  # resize size, provided it carries both a width and a height.
  infer_pad_size = @do_pad && !@pad_size && @size && !(@size["width"].nil? || @size["height"].nil?)
  @pad_size = @size if infer_pad_size
end

Instance Method Details

#call(images, *args) ⇒ `Object`

# File 'lib/informers/processors.rb', line 330

# Preprocesses one image or a batch of images.
#
# A single (non-Array) image is treated as a batch of one. Each image goes
# through `preprocess`; the resulting pixel values are combined with
# `Utils.stack` along axis 0.
#
# @param images [Object, Array] one image or an array of images
# @param args [Array] accepted for interface compatibility; unused here
# @return [Hash] with keys `:pixel_values`, `:original_sizes` (one entry per
#   image) and `:reshaped_input_sizes` (sizes before padding or cropping)
def call(images, *args)
  batch = images.is_a?(Array) ? images : [images]
  processed = batch.map { |img| preprocess(img) }

  per_image_pixels = processed.map { |entry| entry[:pixel_values] }
  stacked = Utils.stack(per_image_pixels, 0)

  original_sizes = processed.map { |entry| entry[:original_size] }
  reshaped_sizes = processed.map { |entry| entry[:reshaped_input_size] }

  {
    pixel_values: stacked,
    original_sizes: original_sizes,
    reshaped_input_sizes: reshaped_sizes
  }
end

#get_resize_output_image_size(image, size) ⇒ `Object`

# File 'lib/informers/processors.rb', line 151

# Computes the `[width, height]` an image should be resized to.
#
# `size` may be a number (shortest edge, with `@config["max_size"]` as the
# optional longest edge), a hash with "shortest_edge"/"longest_edge", or a
# hash with explicit "width"/"height". When `@do_thumbnail` is set, the
# smaller of `size["height"]` and `size["width"]` is used as the shortest edge.
#
# @param image [#size] object whose `size` returns `[width, height]`
# @param size [Numeric, Hash, nil] requested size specification
# @return [Array<Integer>] `[width, height]`
# @raise [Todo] for configurations that are not implemented yet
def get_resize_output_image_size(image, size)
  source_width, source_height = image.size

  short_side = nil
  long_side = nil
  if @do_thumbnail
    # NOTE: custom logic for `Donut` models
    short_side = [size["height"], size["width"]].min
  elsif size.is_a?(Numeric)
    short_side = size
    long_side = @config["max_size"] || short_side
  elsif !size.nil?
    short_side = size["shortest_edge"]
    long_side = size["longest_edge"]
  end

  if short_side.nil? && long_side.nil?
    # No edge constraints: only an explicit width/height pair is supported.
    raise Todo if size.nil? || size["width"].nil? || size["height"].nil?

    explicit_width = size["width"]
    explicit_height = size["height"]
    raise Todo if @config["keep_aspect_ratio"] && @config["ensure_multiple_of"]

    return [explicit_width, explicit_height]
  end

  # Aspect-ratio preserving resize. First scale so the shortest edge hits
  # its target (no scaling when there is no target) ...
  upscale = 1
  unless short_side.nil?
    upscale = [short_side / source_width.to_f, short_side / source_height.to_f].max
  end
  scaled_width = source_width * upscale
  scaled_height = source_height * upscale

  # ... then shrink again if that pushed a dimension past the longest edge.
  downscale = 1
  unless long_side.nil?
    downscale = [long_side / scaled_width.to_f, long_side / scaled_height.to_f].min
  end

  # Rounding to 2 decimals before flooring sidesteps float precision noise.
  result = [scaled_width, scaled_height].map { |edge| (edge * downscale).round(2).floor }

  raise Todo unless @size_divisibility.nil?

  result
end

#pad_image(pixel_data, img_dims, pad_size, mode: "constant", center: false, constant_values: 0) ⇒ `Object`

# File 'lib/informers/processors.rb', line 65

# Pads an image stored as a flat array in HWC (height, width, channels) order.
#
# The padded buffer is always fully initialized: with the default
# `constant_values: 0` the new area is filled with zeros (previously it was
# left as `nil`, which leaked `nil` pixels into the output).
#
# @param pixel_data [Array<Numeric>] flat pixel values with interleaved channels
# @param img_dims [Array<Integer>] `[height, width, channels]` of `pixel_data`
# @param pad_size [Numeric, Hash] target size: one number for a square canvas,
#   or a hash with width/height under symbol or string keys
# @param mode [String] "constant" fills the new area with `constant_values`;
#   "symmetric" fills it by reflecting the image
# @param center [Boolean] center the image on the canvas instead of anchoring
#   it at the top-left corner
# @param constant_values [Numeric, Array<Numeric>] fill value, or one fill
#   value per channel
# @return [Array] `[pixel_data, img_dims]`; both inputs are returned unchanged
#   when the image already has the requested size
# @raise [Error] when `center` is combined with `mode: "symmetric"`
def pad_image(
  pixel_data,
  img_dims,
  pad_size,
  mode: "constant",
  center: false,
  constant_values: 0
)
  image_height, image_width, image_channels = img_dims

  if pad_size.is_a?(Numeric)
    padded_image_width = pad_size
    padded_image_height = pad_size
  else
    padded_image_width = pad_size[:width] || pad_size["width"]
    padded_image_height = pad_size[:height] || pad_size["height"]
  end

  # Only add padding if there is a difference in size
  if padded_image_width == image_width && padded_image_height == image_height
    return [pixel_data, img_dims]
  end

  if mode == "symmetric" && center
    raise Error, "`center` padding is not supported when `mode` is set to `symmetric`."
  end

  padded_length = padded_image_width * padded_image_height * image_channels
  padded_pixel_data =
    if constant_values.is_a?(Array)
      # Fill with constant values, cycling through the per-channel array
      Array.new(padded_length) { |i| constant_values[i % image_channels] }
    else
      # Scalar fill; this also covers 0 so no element is ever left as nil
      Array.new(padded_length, constant_values)
    end

  left, top =
    if center
      [((padded_image_width - image_width) / 2.0).floor, ((padded_image_height - image_height) / 2.0).floor]
    else
      [0, 0]
    end

  # Copy the original image into the padded image
  image_height.times do |i|
    a = (i + top) * padded_image_width
    b = i * image_width
    image_width.times do |j|
      c = (a + j + left) * image_channels
      d = (b + j) * image_channels
      image_channels.times do |k|
        padded_pixel_data[c + k] = pixel_data[d + k]
      end
    end
  end

  if mode == "symmetric"
    h1 = image_height - 1
    w1 = image_width - 1
    padded_image_height.times do |i|
      a = i * padded_image_width
      b = Utils.calculate_reflect_offset(i, h1) * image_width

      padded_image_width.times do |j|
        next if i < image_height && j < image_width # Do not overwrite original image
        c = (a + j) * image_channels
        d = (b + Utils.calculate_reflect_offset(j, w1)) * image_channels

        # Copy channel-wise
        image_channels.times do |k|
          padded_pixel_data[c + k] = pixel_data[d + k]
        end
      end
    end
  end

  [padded_pixel_data, [padded_image_height, padded_image_width, image_channels]]
end

#preprocess(image, do_normalize: nil, do_pad: nil, do_convert_rgb: nil, do_convert_grayscale: nil, do_flip_channel_order: nil) ⇒ `Object`

# File 'lib/informers/processors.rb', line 217

# Runs the full preprocessing pipeline on a single image.
#
# Steps, in order: optional margin crop, colour conversion, resize,
# thumbnail, center crop, rescale, normalize, pad, and finally conversion of
# the flat HWC pixel buffer into nested CHW arrays. Each keyword argument
# overrides the matching instance setting when it is not `nil`.
#
# Scalar `image_mean` / `image_std` settings are expanded to one value per
# channel (this previously used JavaScript-style `new Array(...)`, which
# raised NoMethodError in Ruby).
#
# @param image [Object] image exposing `size`, `width`, `height`, `channels`
#   and `data` (plus `rgb`, `grayscale`, `center_crop` when those steps run)
# @param do_normalize [Boolean, nil] override for the normalize setting
# @param do_pad [Boolean, nil] override for the pad setting
# @param do_convert_rgb [Boolean, nil] override for the RGB conversion setting
# @param do_convert_grayscale [Boolean, nil] convert to grayscale; only
#   consulted when RGB conversion is off
# @param do_flip_channel_order [Boolean, nil] override for the channel flip
#   setting (not implemented)
# @return [Hash] `:original_size` and `:reshaped_input_size` as
#   `[height, width]`, and `:pixel_values` as nested `[channel][row][column]` arrays
# @raise [Error] when mean/std lengths do not match the channel count
# @raise [Todo] for steps that are not implemented yet
def preprocess(
  image,
  do_normalize: nil,
  do_pad: nil,
  do_convert_rgb: nil,
  do_convert_grayscale: nil,
  do_flip_channel_order: nil
)
  # NOTE: Specific to nougat processors. This is done before resizing,
  # and can be interpreted as a pre-preprocessing step.
  image = crop_margin(image) if @do_crop_margin

  src_width, src_height = image.size # original image size

  # Convert image to RGB if specified in config.
  if do_convert_rgb.nil? ? @do_convert_rgb : do_convert_rgb
    image = image.rgb
  elsif do_convert_grayscale
    image = image.grayscale
  end

  # Resize all images
  image = resize(image) if @do_resize

  # Resize the image using thumbnail method.
  image = thumbnail(image, @size, @resample) if @do_thumbnail

  if @do_center_crop
    if @crop_size.is_a?(Integer)
      crop_width = @crop_size
      crop_height = @crop_size
    else
      crop_width = @crop_size["width"]
      crop_height = @crop_size["height"]
    end
    image = image.center_crop(crop_width, crop_height)
  end

  reshaped_input_size = [image.height, image.width]

  # NOTE: All pixel-level manipulation (i.e., modifying `pixel_data`)
  # occurs with data in the hwc format (height, width, channels),
  # to emulate the behavior of the original Python code (w/ numpy).
  # The buffer is modified in place.
  pixel_data = image.data
  channels = image.channels
  img_dims = [image.height, image.width, channels]

  rescale(pixel_data) if @do_rescale

  if do_normalize.nil? ? @do_normalize : do_normalize
    # Expand scalar statistics to one entry per channel.
    image_mean = @image_mean.is_a?(Array) ? @image_mean : Array.new(channels, @image_mean)
    image_std = @image_std.is_a?(Array) ? @image_std : Array.new(channels, @image_std)

    if image_mean.length != channels || image_std.length != channels
      raise Error, "When set to arrays, the length of `image_mean` (#{image_mean.length}) and `image_std` (#{image_std.length}) must match the number of channels in the image (#{channels})."
    end

    # Walk the buffer one pixel (= `channels` consecutive values) at a time.
    (0...pixel_data.length).step(channels) do |offset|
      channels.times do |j|
        pixel_data[offset + j] = (pixel_data[offset + j] - image_mean[j]) / image_std[j]
      end
    end
  end

  # do padding after rescaling/normalizing
  if do_pad.nil? ? @do_pad : do_pad
    if @pad_size
      # Update pixel data and image dimensions
      pixel_data, img_dims = pad_image(pixel_data, img_dims, @pad_size)
    elsif @size_divisibility
      raise Todo
    end
  end

  if do_flip_channel_order.nil? ? @do_flip_channel_order : do_flip_channel_order
    raise Todo
  end

  # convert to channel dimension format (hwc -> chw)
  h, w, c = img_dims
  pixel_values =
    c.times.map do |ci|
      h.times.map do |hi|
        w.times.map do |wi|
          pixel_data[(hi * w * c) + (wi * c) + ci]
        end
      end
    end

  {
    original_size: [src_height, src_width],
    reshaped_input_size: reshaped_input_size,
    pixel_values: pixel_values
  }
end

#rescale(pixel_data) ⇒ `Object`

# File 'lib/informers/processors.rb', line 145

# Multiplies every entry of `pixel_data` by `@rescale_factor`, in place.
#
# @param pixel_data [Array<Numeric>] flat pixel buffer; mutated
# @return [Integer] the number of entries in `pixel_data`
def rescale(pixel_data)
  count = pixel_data.length
  (0...count).each do |idx|
    pixel_data[idx] = pixel_data[idx] * @rescale_factor
  end
  count
end

#resize(image) ⇒ `Object`

# File 'lib/informers/processors.rb', line 212

# Resizes `image` to the dimensions derived from the configured `@size`.
#
# The target width and height come from `get_resize_output_image_size`; the
# resampling filter is `@resample`.
#
# @param image [Object] image responding to `resize(width, height, resample:)`
# @return [Object] whatever `image.resize` returns
def resize(image)
  target_dims = get_resize_output_image_size(image, @size)
  target_width, target_height = target_dims

  image.resize(target_width, target_height, resample: @resample)
end

#thumbnail(image, size, resample = 2) ⇒ `Object`

# File 'lib/informers/processors.rb', line 43

# Shrinks `image` so it fits within `size` while keeping its aspect ratio.
#
# Each target dimension is the smaller of the input and requested value, so
# the image is never enlarged; an image that already fits is returned as is.
#
# @param image [Object] image exposing `height`, `width` and
#   `resize(width, height, resample:)`
# @param size [Hash] requested size with "height" and "width" keys
# @param resample [Integer] resampling filter passed through to `image.resize`
# @return [Object] `image` itself, or the result of `image.resize`
def thumbnail(image, size, resample = 2)
  source_height = image.height
  source_width = image.width

  # We always resize to the smallest of either the input or output size.
  target_height = [source_height, size["height"]].min
  target_width = [source_width, size["width"]].min

  already_fits = target_height == source_height && target_width == source_width
  return image if already_fits

  # Recompute the shorter side from the longer one to keep the aspect ratio;
  # square inputs keep both clamped values.
  if source_width > source_height
    target_height = (source_height * target_width / source_width).floor
  elsif source_height > source_width
    target_width = (source_width * target_height / source_height).floor
  end

  image.resize(target_width, target_height, resample: resample)
end

Class: Informers::ImageFeatureExtractor

Direct Known Subclasses

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ ImageFeatureExtractor

Instance Method Details

#call(images, *args) ⇒ Object

#get_resize_output_image_size(image, size) ⇒ Object

#pad_image(pixel_data, img_dims, pad_size, mode: "constant", center: false, constant_values: 0) ⇒ Object

#preprocess(image, do_normalize: nil, do_pad: nil, do_convert_rgb: nil, do_convert_grayscale: nil, do_flip_channel_order: nil) ⇒ Object

#rescale(pixel_data) ⇒ Object

#resize(image) ⇒ Object

#thumbnail(image, size, resample = 2) ⇒ Object

#initialize(config) ⇒ `ImageFeatureExtractor`

#call(images, *args) ⇒ `Object`

#get_resize_output_image_size(image, size) ⇒ `Object`

#pad_image(pixel_data, img_dims, pad_size, mode: "constant", center: false, constant_values: 0) ⇒ `Object`

#preprocess(image, do_normalize: nil, do_pad: nil, do_convert_rgb: nil, do_convert_grayscale: nil, do_flip_channel_order: nil) ⇒ `Object`

#rescale(pixel_data) ⇒ `Object`

#resize(image) ⇒ `Object`

#thumbnail(image, size, resample = 2) ⇒ `Object`