Class: ChupaText::Data

Inherits:
Object
  • Object
show all
Defined in:
lib/chupa-text/data.rb

Direct Known Subclasses

InputData, TextData, VirtualFileData

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Data

Returns a new instance of Data



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/chupa-text/data.rb', line 68

# Builds an empty data object and, when +options[:source_data]+ is given,
# inherits metadata from that source.
#
# @param options [Hash, nil] Construction options; nil is treated as an
#   empty Hash. The only key read here is +:source_data+.
def initialize(options={})
  @uri = nil
  @body = nil
  @size = nil
  @path = nil
  @mime_type = nil
  @attributes = Attributes.new
  @source = nil
  @screenshot = nil
  # Defaults: a screenshot is requested, with an expected size of [200, 200].
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]
  @options = options || {}
  source_data = @options[:source_data]
  if source_data
    # Copy the source's metadata first, then remember the source itself.
    merge!(source_data)
    @source = source_data
  end
end

Instance Attribute Details

#attributes ⇒ Attributes (readonly)

Returns The attributes of the data.

Returns:



50
51
52
# File 'lib/chupa-text/data.rb', line 50

# @return [Attributes] The attributes of the data.
def attributes
  @attributes
end

#body ⇒ String?

Returns The content of the data, nil if the data doesn't have any content.

Returns:

  • (String, nil)

    The content of the data, nil if the data doesn't have any content.



32
33
34
# File 'lib/chupa-text/data.rb', line 32

# @return [String, nil] The content of the data, nil if the data
#   doesn't have any content.
def body
  @body
end

#expected_screenshot_size ⇒ Array<Integer, Integer>

Returns the expected screenshot size.

Returns:

  • (Array<Integer, Integer>)

    the expected screenshot size.



66
67
68
# File 'lib/chupa-text/data.rb', line 66

# @return [Array<Integer, Integer>] the expected screenshot size.
def expected_screenshot_size
  @expected_screenshot_size
end

#need_screenshot=(value) ⇒ Bool (writeonly)

Returns the specified value

Parameters:

  • value (Bool)

    true when screenshot is needed.

Returns:

  • (Bool)

    the specified value



63
64
65
# File 'lib/chupa-text/data.rb', line 63

# @param value [Bool] true when screenshot is needed.
# @return [Bool] the specified value
def need_screenshot=(value)
  @need_screenshot = value
end

#path ⇒ String?

Returns The path associated with the content of the data, nil if the data isn't associated with any file.

The path may not be related to the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

This value is useful to use an external command to extract text and meta-data.

Returns:

  • (String, nil)

    The path associated with the content of the data, nil if the data isn't associated with any file.

    The path may not be related to the original content. For example, "/tmp/XXX.txt" may be returned for the data of "http://example.com/XXX.txt".

    This value is useful to use an external command to extract text and meta-data.



47
48
49
# File 'lib/chupa-text/data.rb', line 47

# @return [String, nil] The path associated with the content of the
#   data, nil if the data isn't associated with any file.
def path
  @path
end

#screenshot ⇒ Screenshot?

Returns The screenshot of the data. For example, the first page image of a PDF file.

Returns:

  • (Screenshot, nil)

    The screenshot of the data. For example, the first page image of a PDF file.



59
60
61
# File 'lib/chupa-text/data.rb', line 59

# @return [Screenshot, nil] The screenshot of the data.
def screenshot
  @screenshot
end

#size ⇒ Integer?

Returns The byte size of the data, nil if the data doesn't have any content.

Returns:

  • (Integer, nil)

    The byte size of the data, nil if the data doesn't have any content.



36
37
38
# File 'lib/chupa-text/data.rb', line 36

# @return [Integer, nil] The byte size of the data, nil if the data
#   doesn't have any content.
def size
  @size
end

#source ⇒ Data?

Returns The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.

Returns:

  • (Data, nil)

    The source of the data. For example, text data (hello.txt) in archive data (hello.tar) has the archive data in #source.



55
56
57
# File 'lib/chupa-text/data.rb', line 55

# @return [Data, nil] The source of the data, i.e. the data this one
#   was derived from.
def source
  @source
end

#uri ⇒ URI?

Returns The URI of the data if the data is for remote or local file, nil if the data isn't associated with any URIs.

Returns:

  • (URI, nil)

    The URI of the data if the data is for remote or local file, nil if the data isn't associated with any URIs.



28
29
30
# File 'lib/chupa-text/data.rb', line 28

# @return [URI, nil] The URI of the data, nil if the data isn't
#   associated with any URIs.
def uri
  @uri
end

Instance Method Details

#[](name) ⇒ Object



149
150
151
# File 'lib/chupa-text/data.rb', line 149

# Looks up the attribute +name+ in the attribute container.
#
# @param name [Object] Attribute key, passed through unchanged.
# @return [Object] whatever the attribute container returns for +name+.
def [](name)
  @attributes[name]
end

#[]=(name, value) ⇒ Object



153
154
155
# File 'lib/chupa-text/data.rb', line 153

# Stores +value+ under the attribute +name+ in the attribute container.
#
# @param name [Object] Attribute key, passed through unchanged.
# @param value [Object] Value to store.
def []=(name, value)
  @attributes[name] = value
end

#extension ⇒ String?

Returns Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.

Returns:

  • (String, nil)

    Normalized extension as String if #uri is not nil, nil otherwise. The normalized extension uses lower case like pdf not PDF.



175
176
177
178
179
180
181
182
# File 'lib/chupa-text/data.rb', line 175

# Returns the normalized (lower-case, no leading dot) extension of #uri.
#
# An HTTP(S) URI whose path ends with "/" is reported as "html". A URI
# whose #path is nil (e.g. an opaque URI such as "mailto:...") yields ""
# instead of raising TypeError from File.extname.
#
# @return [String] the extension, "" when the path has none.
# @return [nil] if #uri is nil.
def extension
  return nil if @uri.nil?

  # Coerce once so that a missing path behaves like an empty one.
  path = @uri.path.to_s
  if @uri.is_a?(URI::HTTP) && path.end_with?("/")
    "html"
  else
    # File.extname returns at most one leading dot; strip just that one.
    File.extname(path).downcase.sub(/\A\./, "")
  end
end

#initialize_copy(object) ⇒ Object



87
88
89
90
91
# File 'lib/chupa-text/data.rb', line 87

# Copy hook: after the default copy performed by +super+, gives this
# object its own duplicate of the attribute container (via #dup) instead
# of keeping the reference copied from +object+.
#
# @param object [Data] The object being copied from.
# @return [Data] self
def initialize_copy(object)
  super
  @attributes = @attributes.dup
  self
end

#merge!(data) ⇒ void

This method returns an undefined value.

Merges metadata from data.

Parameters:

  • data (Data)

    The data to be merged.



98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/chupa-text/data.rb', line 98

# Merges metadata from +data+ into this object.
#
# Copies the URI, the path, every attribute, the screenshot request flag
# and the expected screenshot size. When +data+ has a MIME type, that type
# is placed at the front of this object's "source-mime-types" attribute.
#
# @param data [Data] The data to be merged.
# @return [void]
def merge!(data)
  self.uri = data.uri
  self.path = data.path
  data.attributes.each do |name, value|
    self[name] = value
  end
  # Read the MIME type once; the reader may do more work than returning a
  # stored value.
  source_mime_type = data.mime_type
  if source_mime_type
    # Build a new list rather than unshifting in place: the loop above
    # stores attribute values by reference, so the existing list may be the
    # very Array object +data+ holds, and mutating it would alter +data+.
    self["source-mime-types"] = [source_mime_type, *self["source-mime-types"]]
  end
  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
end

#mime_type ⇒ String?

Returns:

  • (String)

    The MIME type of the data. If MIME type isn't set, guesses MIME type from path and body.

  • (nil)

    If MIME type isn't set and it can't guess MIME type from path and body.



161
162
163
# File 'lib/chupa-text/data.rb', line 161

# @return [String] The MIME type of the data. If MIME type isn't set,
#   the result of guess_mime_type is returned instead.
# @return [nil] If MIME type isn't set and guess_mime_type returns nil.
def mime_type
  @mime_type || guess_mime_type
end

#mime_type=(type) ⇒ Object

Parameters:

  • type (String, nil)

    The MIME type of the data. You can unset MIME type by nil. If you unset MIME type, MIME type is guessed from path and body of the data.



168
169
170
# File 'lib/chupa-text/data.rb', line 168

# @param type [String, nil] The MIME type of the data. Passing nil unsets
#   it, after which #mime_type falls back to guessing.
def mime_type=(type)
  @mime_type = type
end

#need_screenshot? ⇒ Bool

Returns true when screenshot is needed if available.

Returns:

  • (Bool)

    true when screenshot is needed if available.



197
198
199
# File 'lib/chupa-text/data.rb', line 197

# @return [Bool] true when screenshot is needed if available.
def need_screenshot?
  @need_screenshot
end

#open {|StringIO.new(body)| ... } ⇒ Object

Yields:

  • (StringIO.new(body))


139
140
141
# File 'lib/chupa-text/data.rb', line 139

# Wraps #body in a StringIO and hands it to the given block.
#
# @yield [StringIO.new(body)]
# @return [Object] the value of the block.
def open
  input = StringIO.new(body)
  yield(input)
end

#peek_body(size) ⇒ Object



143
144
145
146
147
# File 'lib/chupa-text/data.rb', line 143

# Returns the leading part of #body.
#
# @param size [Integer] Length of the slice taken from offset 0.
# @return [String, nil] the slice, nil when #body is nil.
def peek_body(size)
  content = body
  content.nil? ? nil : content[0, size]
end

#text? ⇒ Bool

Returns true if MIME type is "text/XXX", false otherwise.

Returns:

  • (Bool)

    true if MIME type is "text/XXX", false otherwise.



186
187
188
# File 'lib/chupa-text/data.rb', line 186

# @return [Bool] true if MIME type is "text/XXX", false otherwise
#   (including when no MIME type is available).
def text?
  type = mime_type
  return false unless type

  type.start_with?("text/")
end

#text_plain? ⇒ Bool

Returns true if MIME type is "text/plain", false otherwise.

Returns:

  • (Bool)

    true if MIME type is "text/plain", false otherwise.



192
193
194
# File 'lib/chupa-text/data.rb', line 192

# @return [Bool] true if MIME type is "text/plain", false otherwise.
def text_plain?
  mime_type == "text/plain"
end

#to_utf8_body_data(max_body_size: nil) ⇒ Object



201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/chupa-text/data.rb', line 201

# Returns data whose body has been passed through UTF8Converter.
#
# Without +max_body_size+ the whole #body is converted. With it, only what
# +read(max_body_size)+ returns from the stream given by #open is converted.
#
# @param max_body_size [Integer, nil] Read limit handed to the input
#   stream; nil means no limit.
# @return [Data] self when there is nothing to convert, or when no limit
#   was given and the converter handed back the very same String object;
#   otherwise a new TextData whose +source_data+ is self.
def to_utf8_body_data(max_body_size: nil)
  raw = nil
  if max_body_size
    # Assign inside the block: the return value of #open is not relied on.
    open { |input| raw = input.read(max_body_size) }
  else
    raw = body
  end
  return self if raw.nil?

  converted = UTF8Converter.new(raw).convert
  # Identity (not equality) check: same object means no conversion happened.
  untouched = max_body_size.nil? && raw.equal?(converted)
  return self if untouched

  TextData.new(converted, source_data: self)
end