tensorrt-rb

Minimal TensorRT bindings for Ruby, built with Rice (a C++ extension interface for Ruby).

Requirements

  • Linux (x86_64 or aarch64)
  • Ruby >= 3.0
  • TensorRT (with headers and libraries)
  • CUDA runtime
  • Rice gem (gem install rice)

Installation

cd tensorrt-rb

# Set paths if not in standard locations
export TENSORRT_INCLUDE=/path/to/tensorrt/include
export TENSORRT_LIB=/path/to/tensorrt/lib
export CUDA_INCLUDE=/usr/local/cuda/include
export CUDA_LIB=/usr/local/cuda/lib64

# Build
rake compile

# Or install as gem
gem build tensorrt.gemspec
gem install tensorrt-*.gem
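
After installing, a quick load check confirms the extension built and links against TensorRT and CUDA. If the shared libraries are not on the default search path, the TENSORRT_LIB and CUDA_LIB variables from above can be added to LD_LIBRARY_PATH first:

# Sanity check: should exit silently if the extension loads
export LD_LIBRARY_PATH="$TENSORRT_LIB:$CUDA_LIB:$LD_LIBRARY_PATH"
ruby -e 'require "tensorrt"'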

Default Library Paths

x86_64:

  • TensorRT: /usr/include/x86_64-linux-gnu, /usr/lib/x86_64-linux-gnu
  • CUDA: /usr/local/cuda/include, /usr/local/cuda/lib64

aarch64:

  • TensorRT: /usr/include/aarch64-linux-gnu, /usr/lib/aarch64-linux-gnu
  • CUDA: /usr/local/cuda/include, /usr/local/cuda/lib64

API

TensorRT::Engine

engine = TensorRT::Engine.new(path, verbose: false)

# Tensor info
engine.num_io_tensors                  # Number of input/output tensors
engine.get_tensor_name(index)          # Tensor name by index
engine.is_input?(name)                 # Check if tensor is input
engine.get_tensor_shape(name)          # Shape as array [1, 3, 640, 640]
engine.get_tensor_bytes(name)          # Size in bytes

# Memory binding
engine.set_tensor_address(name, device_ptr)

# Inference
engine.execute                         # Synchronous (blocking)
engine.enqueue                         # Asynchronous (non-blocking)

# Stream management
engine.get_stream                      # CUDA stream handle (uint64)
engine.stream_synchronize              # Wait for stream completion
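
As a quick sketch using only the calls listed above, you can enumerate an engine's I/O tensors and print their shapes and sizes (the "model.engine" path is a placeholder):

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")   # placeholder path

engine.num_io_tensors.times do |i|
  name  = engine.get_tensor_name(i)
  kind  = engine.is_input?(name) ? "input" : "output"
  shape = engine.get_tensor_shape(name)
  bytes = engine.get_tensor_bytes(name)
  puts "#{kind}: #{name} shape=#{shape.inspect} (#{bytes} bytes)"
end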

TensorRT::CUDA

# Memory allocation
ptr = TensorRT::CUDA.malloc(bytes)
TensorRT::CUDA.free(ptr)

# Synchronous copy
TensorRT::CUDA.memcpy_htod(device_ptr, host_ptr, bytes)  # Host → Device
TensorRT::CUDA.memcpy_dtoh(host_ptr, device_ptr, bytes)  # Device → Host

# Asynchronous copy
TensorRT::CUDA.memcpy_htod_async(device_ptr, host_ptr, bytes, stream)
TensorRT::CUDA.memcpy_dtoh_async(host_ptr, device_ptr, bytes, stream)

# Synchronization
TensorRT::CUDA.synchronize                    # All operations
TensorRT::CUDA.stream_synchronize(stream)     # Specific stream
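
A minimal sketch of the memory API, assuming the ffi gem for host-side buffers: allocate device memory, copy a small float buffer to the GPU and back, then free it.

require "tensorrt"
require "ffi"

floats = [1.0, 2.0, 3.0, 4.0]
bytes  = floats.size * 4                          # 4 bytes per float32

host = FFI::MemoryPointer.new(:float, floats.size)
host.write_array_of_float(floats)

device = TensorRT::CUDA.malloc(bytes)
TensorRT::CUDA.memcpy_htod(device, host, bytes)   # Host → Device
TensorRT::CUDA.memcpy_dtoh(host, device, bytes)   # Device → Host
TensorRT::CUDA.free(device)

p host.read_array_of_float(floats.size)           # => [1.0, 2.0, 3.0, 4.0]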

Examples

Synchronous Inference

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")

# Allocate GPU memory
input_bytes = engine.get_tensor_bytes("input")
output_bytes = engine.get_tensor_bytes("output")
output_size = engine.get_tensor_shape("output").reduce(1, :*)

input_device = TensorRT::CUDA.malloc(input_bytes)
output_device = TensorRT::CUDA.malloc(output_bytes)
engine.set_tensor_address("input", input_device)
engine.set_tensor_address("output", output_device)

# Prepare input data
input_data = preprocess_image(image_path)  # image_path is a path string; returns Numo::SFloat
input_host = FFI::MemoryPointer.new(:float, input_data.size)
input_host.write_bytes(input_data.to_binary)

# Copy input to GPU
TensorRT::CUDA.memcpy_htod(input_device, input_host, input_bytes)

# Run inference
engine.execute

# Copy output from GPU
output_host = FFI::MemoryPointer.new(:float, output_size)
TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
output_data = output_host.read_array_of_float(output_size)

# Cleanup
TensorRT::CUDA.free(input_device)
TensorRT::CUDA.free(output_device)
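
The flat output array can be reshaped to the engine's reported tensor shape before postprocessing; a small sketch assuming the numo-narray gem:

require "numo/narray"

shape  = engine.get_tensor_shape("output")              # shape reported by the engine
output = Numo::SFloat.cast(output_data).reshape(*shape)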

Pipelined Async Inference

Overlap CPU preprocessing with GPU inference for maximum throughput:

require "tensorrt"

engine = TensorRT::Engine.new("model.engine")
stream = engine.get_stream

# Allocate GPU memory
input_bytes = engine.get_tensor_bytes("input")
output_bytes = engine.get_tensor_bytes("output")
output_size = engine.get_tensor_shape("output").reduce(1, :*)

input_device = TensorRT::CUDA.malloc(input_bytes)
output_device = TensorRT::CUDA.malloc(output_bytes)
engine.set_tensor_address("input", input_device)
engine.set_tensor_address("output", output_device)

# Allocate host buffers
input_host = FFI::MemoryPointer.new(:float, input_bytes / 4)  # 4 bytes per float32
output_host = FFI::MemoryPointer.new(:float, output_size)

# Preload first image (images is an array of image paths)
current_image = preprocess_image(images[0])

images.each_with_index do |_image_path, i|
  # Copy current image to GPU (async)
  input_host.write_bytes(current_image.to_binary)
  TensorRT::CUDA.memcpy_htod_async(input_device, input_host, input_bytes, stream)

  # Start async inference
  engine.enqueue

  # Preprocess next image on CPU while GPU is busy
  next_image = preprocess_image(images[i + 1]) if i < images.size - 1

  # Wait for GPU inference to complete
  engine.stream_synchronize

  # Copy output from GPU
  TensorRT::CUDA.memcpy_dtoh(output_host, output_device, output_bytes)
  output_data = output_host.read_array_of_float(output_size)

  # Process results
  process_detections(output_data)

  current_image = next_image
end

# Cleanup
TensorRT::CUDA.free(input_device)
TensorRT::CUDA.free(output_device)
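
Because device memory is freed explicitly, it can be worth wrapping the hot path in begin/ensure so allocations are released even if preprocessing or postprocessing raises. A sketch of the pattern:

input_device  = TensorRT::CUDA.malloc(input_bytes)
output_device = TensorRT::CUDA.malloc(output_bytes)

begin
  # ... bind tensor addresses, copy inputs, run inference, read back outputs ...
ensure
  TensorRT::CUDA.free(input_device)
  TensorRT::CUDA.free(output_device)
end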