Class: Stevedore::StevedoreBlob

Inherits:
Object
  • Object
show all
Defined in:
lib/parsers/stevedore_blob.rb

Direct Known Subclasses

StevedoreCsvRow, StevedoreEmail, StevedoreHTML

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(title, text, download_url = nil, extra = {}) ⇒ StevedoreBlob

Returns a new instance of StevedoreBlob.

Raises:

  • (ArgumentError)


7
8
9
10
11
12
13
# File 'lib/parsers/stevedore_blob.rb', line 7

# Populates a new blob through its attribute writers.
#
# @param title [Object, nil] title to store; when it is nil or false the
#   download_url is stored as the title instead
# @param text [Object] stored as-is in the text attribute
# @param download_url [Object, nil] stored as-is in the download_url attribute
# @param extra [Hash] stored as-is in the extra attribute; must have no keys
# @raise [ArgumentError] when +extra+ has at least one key (raised after the
#   attributes have been assigned)
def initialize(title, text, download_url=nil, extra={})
  self.title        = title ? title : download_url
  self.text         = text
  self.download_url = download_url
  self.extra        = extra

  # Nothing further to do when no extra data was supplied.
  return if extra.keys.empty?

  raise ArgumentError, "StevedoreBlob extra support not yet implemented"
end

Instance Attribute Details

#download_url ⇒ Object

Returns the value of attribute download_url.



6
7
8
# File 'lib/parsers/stevedore_blob.rb', line 6

# Reader for the download_url attribute.
#
# @return [Object, nil] the current value of @download_url (nil if it has
#   never been assigned)
def download_url
  @download_url
end

#extra ⇒ Object

Returns the value of attribute extra.



6
7
8
# File 'lib/parsers/stevedore_blob.rb', line 6

# Reader for the extra attribute.
#
# @return [Object, nil] the current value of @extra (nil if it has never
#   been assigned)
def extra
  @extra
end

#text ⇒ Object

Returns the value of attribute text.



6
7
8
# File 'lib/parsers/stevedore_blob.rb', line 6

# Reader for the text attribute.
#
# @return [Object, nil] the current value of @text (nil if it has never
#   been assigned)
def text
  @text
end

#title ⇒ Object

Returns the value of attribute title.



6
7
8
# File 'lib/parsers/stevedore_blob.rb', line 6

# Reader for the title attribute.
#
# @return [Object, nil] the current value of @title (nil if it has never
#   been assigned)
def title
  @title
end

Class Method Details

.new_from_tika(content, metadata, download_url, filename) ⇒ Object



19
20
21
# File 'lib/parsers/stevedore_blob.rb', line 19

# Builds an instance from extracted content plus its metadata.
#
# The title comes from +metadata["title"]+ unless that entry is missing or
# is the placeholder "Untitled"; in those cases the base name of +filename+
# is used instead.
#
# @param content [Object] passed through to +new+ as the text
# @param metadata [Hash] only the "title" key is read
# @param download_url [Object] passed through to +new+ as the download_url
# @param filename [String] path whose basename is the fallback title
# @return [Object] whatever +new+ returns for the receiver
def self.new_from_tika(content, metadata, download_url, filename)
  extracted_title = metadata["title"]
  usable = extracted_title && extracted_title != "Untitled"
  title = usable ? extracted_title : File.basename(filename)
  new(title, content, download_url)
end

Instance Method Details

#analyze! ⇒ Object



23
24
25
26
# File 'lib/parsers/stevedore_blob.rb', line 23

# Analysis hook; currently does nothing and returns nil.
#
# Per the original author's note, plain blobs probably need no analysis,
# while HTML documents should get boilerplate extraction here.
# NOTE(review): presumably that is meant to happen in an override -- confirm.
#
# @return [nil]
def analyze!
  nil
end

#clean_text ⇒ Object



15
16
17
# File 'lib/parsers/stevedore_blob.rb', line 15

# Returns +text+ with everything that looks like a tag removed.
#
# The result is computed once and cached in @clean_text; later calls return
# the cached value even if +text+ has changed in the meantime.
#
# @return [String] the text with each "<...>" / "</...>" span deleted
def clean_text
  return @clean_text if @clean_text

  tag_pattern = /<\/?[^>]+>/ # "<", optional "/", one or more non-">" chars, ">"
  @clean_text = text.gsub(tag_pattern, '')
end

#to_hash ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/parsers/stevedore_blob.rb', line 28

# Serializes the blob into a nested Hash with string keys.
#
# The SHA1 hex digest of +download_url+ is used as the value of "sha1",
# "id" and "_id". When the title is nil or empty, a generated
# "Untitled Document: ..." title derived from that digest is used instead.
#
# @return [Hash] keys: "sha1", "id", "_id", "title", "source_url",
#   "file" (title + cleaned text), "analyzed" (body + metadata) and
#   "_updatedAt" (the Time at which this method ran)
def to_hash
  sha = Digest::SHA1.hexdigest(download_url)

  # `to_s` never returns nil, so `title.to_s || fallback` could never reach
  # the fallback; check for an empty title explicitly.
  display_title = title.to_s
  if display_title.empty?
    display_title = "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}"
  end

  body = clean_text.to_s

  # TODO should merge in or something?
  {
    "sha1" => sha,
    "id" => sha,
    "_id" => sha,
    "title" => display_title,
    "source_url" => download_url.to_s,
    "file" => {
      "title" => display_title,
      "file" => body
    },
    "analyzed" => {
      "body" => body,
      "metadata" => {
        "Content-Type" => extra["Content-Type"] || "text/plain"
      }
    },
    "_updatedAt" => Time.now
  }
end