Class: ChupaText::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/chupa-text/data.rb

Direct Known Subclasses

InputData, TextData, VirtualFileData

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Data

Returns a new instance of Data.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/chupa-text/data.rb', line 82

# Creates a data object with every field reset to its default.
#
# Screenshots are requested by default (+@need_screenshot+ is true) with an
# expected size of 200x200; all other fields start as nil, except
# +@attributes+, which starts as a fresh Attributes object.
#
# @param options [Hash, nil] The options. nil is treated as an empty Hash.
#   When +options[:source_data]+ is given, its metadata is merged into the
#   new data via #merge! and it is remembered as the source.
def initialize(options={})
  @uri = @body = @size = @path = @mime_type = nil
  @attributes = Attributes.new
  @source = @screenshot = nil
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]
  @max_body_size = @timeout = @limit_cpu = @limit_as = nil
  @options = options || {}

  source_data = @options[:source_data]
  return unless source_data

  # Inherit metadata first, then record where this data came from.
  merge!(source_data)
  @source = source_data
end

Instance Attribute Details

#attributes ⇒ Attributes (readonly)

Returns The attributes of the data.

Returns:



50
51
52
# File 'lib/chupa-text/data.rb', line 50

# @return [Attributes] The attributes of the data.
def attributes
  @attributes
end

#body ⇒ String?

Returns The content of the data, nil if the data doesn't have any content.

Returns:

  • (String, nil)

    The content of the data, nil if the data doesn't have any content.



32
33
34
# File 'lib/chupa-text/data.rb', line 32

# @return [String, nil] The content of the data, nil if the data
#   doesn't have any content.
def body
  @body
end

#expected_screenshot_size ⇒ Array<Integer, Integer>

Returns the expected screenshot size.

Returns:

  • (Array<Integer, Integer>)

    the expected screenshot size.



66
67
68
# File 'lib/chupa-text/data.rb', line 66

# @return [Array<Integer, Integer>] the expected screenshot size.
def expected_screenshot_size
  @expected_screenshot_size
end

#limit_as ⇒ Numeric, ...

Returns the max memory on extraction by external command.

Returns:

  • (Numeric, String, nil)

    the max memory on extraction by external command.



80
81
82
# File 'lib/chupa-text/data.rb', line 80

# @return [Numeric, String, nil] the max memory on extraction by
#   external command.
def limit_as
  @limit_as
end

#limit_cpu ⇒ Numeric, ...

Returns the max CPU time on extraction by external command.

Returns:

  • (Numeric, String, nil)

    the max CPU time on extraction by external command.



76
77
78
# File 'lib/chupa-text/data.rb', line 76

# @return [Numeric, String, nil] the max CPU time on extraction by
#   external command.
def limit_cpu
  @limit_cpu
end

#max_body_size ⇒ Integer?

Returns the max body size in bytes.

Returns:

  • (Integer, nil)

    the max body size in bytes.



69
70
71
# File 'lib/chupa-text/data.rb', line 69

# @return [Integer, nil] the max body size in bytes.
def max_body_size
  @max_body_size
end

#need_screenshot=(value) ⇒ Bool (writeonly)

Returns the specified value.

Parameters:

  • value (Bool)

    true when screenshot is needed.

Returns:

  • (Bool)

    the specified value



63
64
65
# File 'lib/chupa-text/data.rb', line 63

# @param value [Bool] true when screenshot is needed.
# @return [Bool] the specified value
def need_screenshot=(value)
  @need_screenshot = value
end

#path ⇒ String?

Returns The path associated with the content of the data, nil if the data isn't associated with any file.

The path may not be related with the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

This value is useful to use an external command to extract text and meta-data.

Returns:

  • (String, nil)

    The path associated with the content of the data, nil if the data isn't associated with any file.

    The path may not be related with the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

    This value is useful to use an external command to extract text and meta-data.



47
48
49
# File 'lib/chupa-text/data.rb', line 47

# @return [String, nil] The path associated with the content of the
#   data, nil if the data isn't associated with any file.
#
#   The path may not be related with the original content. For
#   example, "/tmp/XXX.txt" may be returned for the data of
#   "http://example.com/XXX.txt".
#
#   This value is useful to use an external command to extract text
#   and meta-data.
def path
  @path
end

#screenshot ⇒ Screenshot?

Returns The screenshot of the data. For example, the first page image for a PDF file.

Returns:

  • (Screenshot, nil)

    The screenshot of the data. For example, the first page image for a PDF file.



59
60
61
# File 'lib/chupa-text/data.rb', line 59

# @return [Screenshot, nil] The screenshot of the data. For example,
#   the first page image for a PDF file.
def screenshot
  @screenshot
end

#size ⇒ Integer?

Returns The byte size of the data, nil if the data doesn't have any content.

Returns:

  • (Integer, nil)

    The byte size of the data, nil if the data doesn't have any content.



36
37
38
# File 'lib/chupa-text/data.rb', line 36

# @return [Integer, nil] The byte size of the data, nil if the data
#   doesn't have any content.
def size
  @size
end

#source ⇒ Data?

Returns The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.

Returns:

  • (Data, nil)

    The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.



55
56
57
# File 'lib/chupa-text/data.rb', line 55

# @return [Data, nil] The source of the data. For example, text data
#   (hello.txt) in archive data (hello.tar) has the archive data in
#   #source.
def source
  @source
end

#timeout ⇒ Numeric, ...

Returns the timeout on extraction.

Returns:

  • (Numeric, String, nil)

    the timeout on extraction.



72
73
74
# File 'lib/chupa-text/data.rb', line 72

# @return [Numeric, String, nil] the timeout on extraction.
def timeout
  @timeout
end

#uri ⇒ URI?

Returns The URI of the data if the data is for remote or local file, nil if the data isn't associated with any URIs.

Returns:

  • (URI, nil)

    The URI of the data if the data is for remote or local file, nil if the data isn't associated with any URIs.



28
29
30
# File 'lib/chupa-text/data.rb', line 28

# @return [URI, nil] The URI of the data if the data is for remote or
#   local file, nil if the data isn't associated with any URIs.
def uri
  @uri
end

Instance Method Details

#[](name) ⇒ Object



174
175
176
# File 'lib/chupa-text/data.rb', line 174

# Reads an attribute of the data; shortcut for +attributes[name]+.
#
# @param name The attribute name, passed through to the attributes
#   object unchanged.
# @return [Object] whatever the attributes object returns for +name+.
def [](name)
  @attributes[name]
end

#[]=(name, value) ⇒ Object



178
179
180
# File 'lib/chupa-text/data.rb', line 178

# Writes an attribute of the data; shortcut for
# +attributes[name] = value+.
#
# @param name The attribute name, passed through to the attributes
#   object unchanged.
# @param value The value to store for +name+.
def []=(name, value)
  @attributes[name] = value
end

#extension ⇒ String?

Returns Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.

Returns:

  • (String, nil)

    Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.



200
201
202
203
204
205
206
207
# File 'lib/chupa-text/data.rb', line 200

# Derives the normalized extension (lower case, no leading dot, like
# "pdf" not "PDF") from the path of #uri.
#
# An HTTP URI whose path ends with "/" is reported as "html".
#
# @return [String, nil] the normalized extension ("" when the path has
#   no extension) if #uri is not nil, nil otherwise.
def extension
  return nil if @uri.nil?

  uri_path = @uri.path
  return "html" if @uri.is_a?(URI::HTTP) && uri_path.end_with?("/")

  # File.extname keeps the leading dot; strip it after lower-casing.
  File.extname(uri_path).downcase.sub(/\A\./, "")
end

#initialize_copy(object) ⇒ Object



105
106
107
108
109
# File 'lib/chupa-text/data.rb', line 105

# Copy hook. After the default copy done by +super+, replaces
# +@attributes+ with a +dup+ of itself so that the copy holds its own
# attributes object instead of the one it was copied with.
#
# @param object [Data] The object being copied from.
# @return [Data] self
def initialize_copy(object)
  super
  @attributes = @attributes.dup
  self
end

#merge!(data) ⇒ void

This method returns an undefined value.

Merges metadata from data.

Parameters:

  • data (Data)

    The data to be merged.



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/chupa-text/data.rb', line 116

# Merges metadata from +data+ into this data.
#
# Copies the URI, the path, every attribute and the extraction settings
# (screenshot flag, expected screenshot size, max body size, timeout,
# CPU limit and memory limit) of +data+. When +data+ has a MIME type, it
# is put at the head of this data's "source-mime-types" attribute.
#
# +data+ itself is not modified.
#
# @param data [Data] The data to be merged.
# @return [void]
def merge!(data)
  self.uri = data.uri
  self.path = data.path
  data.attributes.each do |name, value|
    self[name] = value
  end

  # Ask once: the MIME type may be guessed on each call when it isn't set.
  source_mime_type = data.mime_type
  if source_mime_type
    # The loop above copies attribute values by reference, so the
    # "source-mime-types" array here can be the very array +data+ holds.
    # Build a new array instead of unshifting in place; otherwise +data+
    # would end up listing its own MIME type among its sources.
    inherited_mime_types = self["source-mime-types"] || []
    self["source-mime-types"] = [source_mime_type, *inherited_mime_types]
  end

  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
  self.max_body_size = data.max_body_size
  self.timeout = data.timeout
  self.limit_cpu = data.limit_cpu
  self.limit_as = data.limit_as
end

#mime_type ⇒ String?

Returns:

  • (String)

    The MIME type of the data. If MIME type isn't set, guesses MIME type from path and body.

  • (nil)

    If MIME type isn't set and it can't guess MIME type from path and body.



186
187
188
# File 'lib/chupa-text/data.rb', line 186

# @return [String] The MIME type of the data. If MIME type isn't set,
#   guesses MIME type from path and body.
# @return [nil] If MIME type isn't set and it can't guess MIME type
#   from path and body.
def mime_type
  # An explicitly set MIME type always wins over guessing.
  return @mime_type if @mime_type

  guess_mime_type
end

#mime_type=(type) ⇒ Object

Parameters:

  • type (String, nil)

    The MIME type of the data. You can unset MIME type by nil. If you unset MIME type, MIME type is guessed from path and body of the data.



193
194
195
# File 'lib/chupa-text/data.rb', line 193

# @param type [String, nil] The MIME type of the data. You can unset
#   MIME type by nil. If you unset MIME type, MIME type is guessed from
#   path and body of the data.
def mime_type=(type)
  @mime_type = type
end

#need_screenshot? ⇒ Bool

Returns true when screenshot is needed if available.

Returns:

  • (Bool)

    true when screenshot is needed if available.



222
223
224
# File 'lib/chupa-text/data.rb', line 222

# @return [Bool] true when screenshot is needed if available.
def need_screenshot?
  @need_screenshot
end

#open {|StringIO.new(body)| ... } ⇒ Object

Yields:

  • (StringIO.new(body))


161
162
163
# File 'lib/chupa-text/data.rb', line 161

# Exposes the body as a readable stream.
#
# @yield [input] Gives the block a stream over the body.
# @yieldparam input [StringIO] A new StringIO wrapping #body.
# @return [Object] the value returned by the block.
def open
  input = StringIO.new(body)
  yield(input)
end

#peek_body(size) ⇒ Object



168
169
170
171
172
# File 'lib/chupa-text/data.rb', line 168

# Returns the head of the body without consuming anything.
#
# @param size [Integer] The length passed to +body[0, size]+.
# @return [String, nil] the leading part of #body, or nil when #body
#   is nil.
def peek_body(size)
  content = body
  content.nil? ? nil : content[0, size]
end

#release ⇒ Object



165
166
# File 'lib/chupa-text/data.rb', line 165

# Does nothing in this class: the body is intentionally empty.
# NOTE(review): presumably a hook for subclasses that hold resources
# to override — confirm against the subclasses.
def release
end

#text? ⇒ Bool

Returns true if MIME type is "text/XXX", false otherwise.

Returns:

  • (Bool)

    true if MIME type is "text/XXX", false otherwise.



211
212
213
# File 'lib/chupa-text/data.rb', line 211

# @return [Bool] true if MIME type is "text/XXX", false otherwise
#   (including when no MIME type is available).
def text?
  type = mime_type
  return false unless type

  type.start_with?("text/")
end

#text_plain? ⇒ Bool

Returns true if MIME type is "text/plain", false otherwise.

Returns:

  • (Bool)

    true if MIME type is "text/plain", false otherwise.



217
218
219
# File 'lib/chupa-text/data.rb', line 217

# @return [Bool] true if MIME type is "text/plain", false otherwise.
def text_plain?
  mime_type == "text/plain"
end

#to_utf8_body_data ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/chupa-text/data.rb', line 226

# Returns data whose body has gone through UTF8Converter.
#
# When +@max_body_size+ is set, only that many bytes are read through
# #open; otherwise the whole #body is used.
#
# @return [Data] self when there is no body, or when no size limit is
#   set and the converter handed back the very same body object;
#   otherwise a new TextData holding the converted body with self as its
#   +source_data+.
def to_utf8_body_data
  raw_body =
    if @max_body_size
      limited = nil
      open { |input| limited = input.read(@max_body_size) }
      limited
    else
      body
    end
  return self if raw_body.nil?

  utf8_body = UTF8Converter.new(raw_body).convert
  # Identity check: reuse self only if the converter returned the same
  # object and nothing was cut off by a size limit.
  return self if @max_body_size.nil? && raw_body.equal?(utf8_body)

  TextData.new(utf8_body, source_data: self)
end