Class: Slaw::Extract::Extractor

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/slaw/extract/extractor.rb

Overview

Routines for extracting and cleaning up content from other formats, such as HTML.

Instance Method Summary collapse

Methods included from Logging

#logger

Instance Method Details

#extract_from_file(filename) ⇒ String

Extract text from a file.

Parameters:

  • filename (String)

    filename to extract from

Returns:

  • (String)

    extracted text



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/slaw/extract/extractor.rb', line 15

# Extract text from a file, picking the extraction routine from the
# mimetype reported by #get_mimetype.
#
# @param filename [String] filename to extract from
# @return [String] extracted text
# @raise [ArgumentError] if the detected type is neither 'text/html' nor
#   'text/plain' (a missing mimetype is treated as plain text)
def extract_from_file(filename)
  mimetype = get_mimetype(filename)
  kind = mimetype ? mimetype.type : mimetype

  return extract_from_html(filename) if 'text/html' == kind
  # No detected type at all falls back to plain-text handling.
  return extract_from_text(filename) if kind.nil? || 'text/plain' == kind

  raise ArgumentError, "Unsupported file type #{mimetype || 'unknown'}"
end

#extract_from_html(filename) ⇒ Object



32
33
34
# File 'lib/slaw/extract/extractor.rb', line 32

# Read the named file and convert its contents with #html_to_text.
#
# @param filename [String] path of the HTML file to read
# @return [Object] whatever #html_to_text returns for the file's contents
def extract_from_html(filename)
  markup = File.read(filename)
  html_to_text(markup)
end

#extract_from_text(filename) ⇒ Object



28
29
30
# File 'lib/slaw/extract/extractor.rb', line 28

# Return the full contents of the named file, unmodified.
#
# @param filename [String] path of the text file to read
# @return [String] the file's contents
def extract_from_text(filename)
  File.open(filename) { |io| io.read }
end

#get_mimetype(filename) ⇒ Object



45
46
47
48
# File 'lib/slaw/extract/extractor.rb', line 45

# Determine the mimetype of a file: first by handing the opened file to
# MimeMagic.by_magic, and, if that yields nothing, by handing the filename
# to MimeMagic.by_path.
#
# @param filename [String] path of the file to inspect
# @return [Object] the first truthy result of the two MimeMagic lookups,
#   otherwise the (falsy) result of MimeMagic.by_path
def get_mimetype(filename)
  sniffed = File.open(filename) { |io| MimeMagic.by_magic(io) }
  sniffed || MimeMagic.by_path(filename)
end

#html_to_text(html) ⇒ Object



36
37
38
39
40
41
42
43
# File 'lib/slaw/extract/extractor.rb', line 36

# Convert an HTML string to text by running it through the
# html_to_akn_text.xsl stylesheet that sits next to this source file.
#
# @param html [String] HTML markup to convert
# @return [String] the serialized transform result with a leading XML
#   declaration (if any) removed
def html_to_text(html)
  stylesheet_path = File.join(File.dirname(__FILE__), 'html_to_akn_text.xsl')

  # Block form closes the stylesheet handle as soon as it has been parsed;
  # the IO (rather than its contents) is still what gets passed to Nokogiri.
  xslt = File.open(stylesheet_path) { |io| Nokogiri::XSLT(io) }

  text = xslt.transform(Nokogiri::HTML(html)).to_s

  # Remove the XML declaration at the top. \A anchors to the very start of
  # the string so a later line that merely looks like a declaration is kept.
  text.sub(/\A<\?xml [^>]*>/, '')
end