Class: Yomu
- Inherits:
-
Object
- Object
- Yomu
- Defined in:
- lib/yomu.rb,
lib/yomu/version.rb
Constant Summary collapse
- GEMPATH =
File.dirname(File.dirname(__FILE__))
- JARPATH =
File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.11.jar')
- DEFAULT_SERVER_PORT =
an arbitrary, but perfectly cromulent, port
9293
- VERSION =
'0.2.4'
- @@server_port =
nil
- @@server_pid =
nil
Class Method Summary collapse
- ._client_read(type, data) ⇒ Object
- ._server_read(_, data) ⇒ Object
-
.kill_server! ⇒ Object
Kills server started by Yomu.server.
-
.read(type, data) ⇒ Object
Read text or metadata from a data buffer.
-
.server(type, custom_port = nil) ⇒ Object
Returns pid of Tika server, started as a new spawned process.
Instance Method Summary collapse
- #creation_date ⇒ Object
-
#data ⇒ Object
Returns the raw/unparsed content of the Yomu document.
-
#html ⇒ Object
Returns the text content of the Yomu document in HTML.
-
#initialize(input) ⇒ Yomu
constructor
Create a new instance of Yomu with a given document.
-
#metadata ⇒ Object
Returns the metadata hash of the Yomu document.
-
#mimetype ⇒ Object
Returns the mimetype object of the Yomu document.
- #path? ⇒ Boolean
-
#stream? ⇒ Boolean
Returns
true
if the Yomu document was specified from a stream or an object which responds to read
. -
#text ⇒ Object
Returns the text content of the Yomu document.
-
#uri? ⇒ Boolean
Returns
true
if the Yomu document was specified using a URI.
Constructor Details
#initialize(input) ⇒ Yomu
Create a new instance of Yomu with a given document.
Using a file path:
Yomu.new 'sample.pages'
Using a URL:
Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
From a stream or an object which responds to read:
Yomu.new File.open('sample.pages')
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/yomu.rb', line 95 def initialize(input) if input.is_a? String if File.exists? input @path = input elsif input =~ URI::regexp @uri = URI.parse input else raise Errno::ENOENT.new "missing file or invalid URI - #{input}" end elsif input.respond_to? :read @stream = input else raise TypeError.new "can't read from #{input.class.name}" end end |
Class Method Details
._client_read(type, data) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/yomu.rb', line 39
#
# Runs the Tika jar as a one-off child process, writes +data+ to its standard
# input, closes the write side, and returns everything the process prints.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype; any other value
#   results in no output switch being passed
# @param data [String] the raw document content
# @return [String] the output read back from the child process
def self._client_read(type, data)
  flags = { text: '-t', html: '-h', metadata: '-m -j', mimetype: '-m -j' }[type]
  command = "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{flags}"

  IO.popen(command, 'r+') do |tika|
    tika.write data
    # Closing the write side signals end of input to the child process.
    tika.close_write
    tika.read
  end
end
._server_read(_, data) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/yomu.rb', line 59
#
# Sends +data+ to the server listening on localhost at the stored server port
# and returns the complete response.
#
# The first argument is ignored.
#
# @param _ [Object] unused
# @param data [String] the raw document content
# @return [String] everything received until the peer closes the connection
def self._server_read(_, data)
  socket = TCPSocket.new('localhost', @@server_port)

  begin
    socket.write(data)

    # tell Tika that we're done sending data
    socket.shutdown(Socket::SHUT_WR)

    response = ''
    loop do
      chunk = socket.recv(65536)
      # Check for nil before calling a String method on the chunk.
      break if chunk.nil? || chunk.empty?
      response << chunk
    end
    response
  ensure
    # Always release the socket, even when the exchange raises.
    socket.close
  end
end
.kill_server! ⇒ Object
Kills server started by Yomu.server
Always run this when you're done, or else Tika might run until you kill it manually.
You might try putting your extraction in a begin...rescue...ensure...end block and
putting this method in the ensure block.
Yomu.server(:text)
reports = ["report1.docx", "report2.doc", "report3.pdf"]
begin
my_texts = reports.map{|report_path| Yomu.new(report_path).text }
rescue
ensure
Yomu.kill_server!
end
256 257 258 259 260 261 262 |
# File 'lib/yomu.rb', line 256
#
# Kills the server started by Yomu.server, if one is recorded, and clears the
# stored pid and port so later reads stop using the server.
#
# Does nothing when no server pid is stored. A server process that has
# already exited is treated as stopped rather than raising.
#
# @return [nil]
def self.kill_server!
  return unless @@server_pid

  begin
    Process.kill('INT', @@server_pid)
    # Reap the child in the background so it does not linger as a zombie.
    Process.detach(@@server_pid)
  rescue Errno::ESRCH
    # The process is already gone; only the bookkeeping below is left to do.
  ensure
    @@server_pid = nil
    @@server_port = nil
  end

  nil
end
.read(type, data) ⇒ Object
Read text or metadata from a data buffer.
data = File.read 'sample.pages'
text = Yomu.read :text, data
metadata = Yomu.read :metadata, data
24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/yomu.rb', line 24
#
# Read text or metadata from a data buffer.
#
# Uses the running server when a server pid is stored, otherwise a one-off
# client process.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype
# @param data [String] the raw document content
# @return [String] the raw result for :text and :html
# @return [Object] the parsed JSON result for :metadata
# @return [Object, nil] the first MIME::Types match for :mimetype
# @return [nil] for any other +type+
def self.read(type, data)
  result = @@server_pid ? _server_read(type, data) : _client_read(type, data)

  case type
  when :text, :html
    result
  when :metadata
    JSON.parse(result)
  when :mimetype
    content_type = JSON.parse(result)['Content-Type']
    # Content-Type may be an Array; use its first entry, as #mimetype does.
    content_type = content_type.first if content_type.is_a?(Array)
    MIME::Types[content_type].first
  end
end
.server(type, custom_port = nil) ⇒ Object
Returns pid of Tika server, started as a new spawned process.
type :html, :text, :metadata or :mimetype
custom_port e.g. 9293
Yomu.server(:text, 9294)
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
# File 'lib/yomu.rb', line 223
#
# Returns pid of Tika server, started as a new spawned process.
#
# If a server started earlier is still recorded, it is stopped first so that
# its pid is not overwritten and the process left running.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype
# @param custom_port [Integer, nil] port to listen on; DEFAULT_SERVER_PORT
#   is used when nil
# @return [Integer] pid of the spawned process
def self.server(type, custom_port=nil)
  # Without this, a second call would orphan the earlier server process.
  kill_server! if @@server_pid

  switch = case type
           when :text then '-t'
           when :html then '-h'
           when :metadata, :mimetype then '-m -j'
           end

  @@server_port = custom_port || DEFAULT_SERVER_PORT

  @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
  sleep(2) # Give the server 2 seconds to spin up.
  @@server_pid
end
Instance Method Details
#creation_date ⇒ Object
164 165 166 167 168 169 170 171 172 |
# File 'lib/yomu.rb', line 164
#
# Returns the document's creation date, parsed from the 'Creation-Date'
# metadata entry.
#
# A parsed value is memoized; a missing entry is looked up again on the
# next call.
#
# @return [Time, nil] the parsed time, or nil when the entry is absent
def creation_date
  return @creation_date if defined? @creation_date

  raw = metadata['Creation-Date']
  @creation_date = Time.parse(raw) if raw
end
#data ⇒ Object
Returns the raw/unparsed content of the Yomu document.
yomu = Yomu.new 'sample.pages'
yomu.data
202 203 204 205 206 207 208 209 210 211 212 213 214 |
# File 'lib/yomu.rb', line 202
#
# Returns the raw/unparsed content of the Yomu document.
#
# The content is read from the file path, fetched from the URI, or read
# from the stream, depending on how the document was specified, and is
# memoized once loaded.
#
# @return [String, nil] the raw content, or nil when no source is set
def data
  return @data if defined? @data

  if path?
    # Documents are binary; a text-mode read can alter or truncate them on
    # platforms that translate line endings.
    @data = File.binread @path
  elsif uri?
    @data = Net::HTTP.get @uri
  elsif stream?
    @data = @stream.read
  end

  @data
end
#html ⇒ Object
Returns the text content of the Yomu document in HTML.
yomu = Yomu.new 'sample.pages'
yomu.html
127 128 129 130 131 |
# File 'lib/yomu.rb', line 127
#
# Returns the text content of the Yomu document in HTML.
#
# The result of the first extraction is memoized.
#
# @return [Object] whatever Yomu.read returns for :html
def html
  @html = Yomu.read(:html, data) unless defined? @html
  @html
end
#metadata ⇒ Object
Returns the metadata hash of the Yomu document.
yomu = Yomu.new 'sample.pages'
yomu.metadata['Content-Type']
138 139 140 141 142 |
# File 'lib/yomu.rb', line 138
#
# Returns the metadata hash of the Yomu document.
#
# The result of the first extraction is memoized.
#
# @return [Object] whatever Yomu.read returns for :metadata
def metadata
  return @metadata if defined? @metadata

  @metadata = Yomu.read :metadata, data
end
#mimetype ⇒ Object
Returns the mimetype object of the Yomu document.
yomu = Yomu.new 'sample.docx'
yomu.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
yomu.mimetype.extensions #=> ['docx']
150 151 152 153 154 155 156 |
# File 'lib/yomu.rb', line 150
#
# Returns the mimetype object of the Yomu document.
#
# The type is looked up from the 'Content-Type' metadata entry; when that
# entry is an Array, its first element is used. The result is memoized.
#
# @return [Object, nil] the first MIME::Types match for the content type
def mimetype
  return @mimetype if defined? @mimetype

  type = metadata['Content-Type']
  type = type.first if type.is_a?(Array)
  @mimetype = MIME::Types[type].first
end
#path? ⇒ Boolean
174 175 176 |
# File 'lib/yomu.rb', line 174
#
# Returns true if the @path instance variable has been set on this Yomu
# document.
#
# @return [Boolean]
def path?
  # A bare `defined?` yields a String or nil; return a real Boolean instead.
  instance_variable_defined?(:@path)
end
#stream? ⇒ Boolean
Returns true
if the Yomu document was specified from a stream
or an object which responds to read
.
file = File.open('sample.pages')
yomu = Yomu.new file
yomu.stream? #=> true
193 194 195 |
# File 'lib/yomu.rb', line 193
#
# Returns true if the Yomu document was specified from a stream or an
# object which responds to read.
#
# @return [Boolean]
def stream?
  # A bare `defined?` yields a String or nil; return a real Boolean instead.
  instance_variable_defined?(:@stream)
end
#text ⇒ Object
Returns the text content of the Yomu document.
yomu = Yomu.new 'sample.pages'
yomu.text
116 117 118 119 120 |
# File 'lib/yomu.rb', line 116
#
# Returns the text content of the Yomu document.
#
# The result of the first extraction is memoized.
#
# @return [Object] whatever Yomu.read returns for :text
def text
  @text = Yomu.read(:text, data) unless defined? @text
  @text
end
#uri? ⇒ Boolean
Returns true
if the Yomu document was specified using a URI.
yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
yomu.uri? #=> true
183 184 185 |
# File 'lib/yomu.rb', line 183
#
# Returns true if the Yomu document was specified using a URI.
#
# @return [Boolean]
def uri?
  # A bare `defined?` yields a String or nil; return a real Boolean instead.
  instance_variable_defined?(:@uri)
end