Class: Sqed::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/sqed/extractor.rb

Overview

An Extractor takes a Boundaries object and a metadata_map and returns a Sqed::Result.

Extract assumes a successful preprocessing (e.g. finding boundaries, cropping images)!

Only Tesseract-based errors should be raised at this point.

Defined Under Namespace

Classes: Error

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(**opts) ⇒ Extractor

Returns a new instance of Extractor.

Raises:

  • (Sqed::Error)



25
26
27
28
29
30
31
32
33
34
# File 'lib/sqed/extractor.rb', line 25

# Stores the three inputs needed for extraction and validates each of them.
#
# @param opts [Hash] keyword options
# @option opts [Hash] :metadata_map section index => section type
# @option opts [Sqed::Boundaries] :boundaries
# @option opts [Magick::Image] :image
# @raise [Sqed::Error] when any of the three is missing or is not an
#   instance of the expected class
def initialize(**opts)
  @metadata_map = opts[:metadata_map]
  @boundaries = opts[:boundaries]
  @image = opts[:image]

  # TODO: `.extractable?` catches the nil? case
  # nil is never kind_of? the expected class, so a single type check covers
  # both the "not provided" and the "wrong type" case.
  raise Sqed::Error, 'boundaries not provided or provided boundary is not a Sqed::Boundaries' unless boundaries.kind_of?(Sqed::Boundaries)
  raise Sqed::Error, 'metadata_map not provided or metadata_map not a Hash' unless metadata_map.kind_of?(Hash)
  raise Sqed::Error, 'image not provided' unless image.kind_of?(Magick::Image)
end

Instance Attribute Details

#boundaries ⇒ Object

a Sqed::Boundaries instance



16
17
18
# File 'lib/sqed/extractor.rb', line 16

# Reader for the boundaries object held by this extractor.
#
# @return [Object] whatever is currently stored in @boundaries
def boundaries
  @boundaries
end

#image ⇒ Magick::Image file

Returns:

  • (Magick::Image file)


23
24
25
# File 'lib/sqed/extractor.rb', line 23

# Reader for the image held by this extractor.
#
# @return [Object] whatever is currently stored in @image
def image
  @image
end

#metadata_map ⇒ Hash

a metadata_map hash from EXTRACTION_PATTERNS like:

Returns:

  • (Hash)

    like `{0 => :annotated_specimen, 1 => :identifier, 2 => :image_registration}`



20
21
22
# File 'lib/sqed/extractor.rb', line 20

# Reader for the section-index => section-type map held by this extractor.
#
# @return [Hash] whatever is currently stored in @metadata_map
def metadata_map
  @metadata_map
end

Instance Method Details

#extract_image(coords) ⇒ Object

crop takes x, y, width, height



69
70
71
# File 'lib/sqed/extractor.rb', line 69

# Crops the stored image to the given section.
#
# @param coords [Array] the leading arguments for `crop` (the page describes
#   them as x, y, width, height)
# @return [Object] whatever `@image.crop` returns
def extract_image(coords)
  # The trailing `true` is forwarded to crop unchanged; presumably RMagick's
  # reset-page flag -- confirm against the RMagick docs.
  crop_arguments = [*coords, true]
  @image.crop(*crop_arguments)
end

#result ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/sqed/extractor.rb', line 36

# Builds a Sqed::Result from the boundaries, the metadata map and the image.
#
# Steps:
# 1. record the section types, ordered by their section index;
# 2. for every boundary, store the cropped section image and its coordinates;
# 3. for every section type that has parsers configured in
#    SqedConfig::SECTION_PARSERS, run each parser on the section image and
#    store any non-empty output under the parser's TYPE.
#
# @return [Sqed::Result]
def result
  r = Sqed::Result.new

  # section types ordered by section index
  r.sections = metadata_map.keys.sort.map { |k| metadata_map[k] }

  # assign the images to the result
  boundaries.each do |section_index, coords|
    section_type = metadata_map[section_index]

    r.send("#{section_type}_image=", extract_image(coords))
    r.boundary_coordinates[section_type] = coords
  end

  # assign the metadata to the result
  metadata_map.each do |_section_index, section_type|
    # only extract data if a parser exists
    parsers = SqedConfig::SECTION_PARSERS[section_type]
    next unless parsers

    section_image = r.send("#{section_type}_image")
    updated = r.send(section_type)

    parsers.each do |parser|
      parsed_result = parser.new(section_image).get_text(section_type: section_type)
      # keep only non-empty parser output
      updated[parser::TYPE] = parsed_result if parsed_result && parsed_result.length > 0
    end

    r.send("#{section_type}=", updated)
  end

  r
end