Class: Slaw::Extract::Extractor

Inherits:

Object

Object
Slaw::Extract::Extractor

show all

Includes:: Logging

Defined in:: lib/slaw/extract/extractor.rb

Overview

Routines for extracting and cleaning up content from other formats, such as HTML.

Instance Method Summary collapse

Methods included from Logging

#logger

Instance Method Details

#extract_from_file(filename) ⇒ `String`

Extract text from a file.

Parameters:

filename (String) —

filename to extract from

Returns:

(String) —

extracted text

# File 'lib/slaw/extract/extractor.rb', line 15

# Extract text from a file, picking the extraction routine from the MIME
# type reported by #get_mimetype.
#
# @param filename [String] filename to extract from
# @return [String] extracted text
# @raise [ArgumentError] when the detected type is neither HTML nor plain text
def extract_from_file(filename)
  detected = get_mimetype(filename)
  # Stays nil when no type could be detected at all.
  kind = detected ? detected.type : detected

  case kind
  when 'text/html' then extract_from_html(filename)
  # An undetected type is handled the same way as plain text.
  when 'text/plain', nil then extract_from_text(filename)
  else raise ArgumentError, "Unsupported file type #{detected || 'unknown'}"
  end
end

#extract_from_html(filename) ⇒ `Object`



32
33
34

# File 'lib/slaw/extract/extractor.rb', line 32

# Read an HTML file and convert its contents with #html_to_text.
#
# @param filename [String] path of the HTML file to read
# @return [Object] whatever #html_to_text returns for the file's contents
def extract_from_html(filename)
  markup = File.read(filename)
  html_to_text(markup)
end

#extract_from_text(filename) ⇒ `Object`



28
29
30

# File 'lib/slaw/extract/extractor.rb', line 28

# Return the contents of a text file unchanged.
#
# @param filename [String] path of the file to read
# @return [String] the file's full contents
def extract_from_text(filename)
  File.open(filename, &:read)
end

#get_mimetype(filename) ⇒ `Object`

# File 'lib/slaw/extract/extractor.rb', line 45

# Determine the MIME type of a file.
#
# MimeMagic.by_magic is tried first on the opened file; when it yields
# nothing truthy, MimeMagic.by_path is consulted with the filename instead.
#
# @param filename [String] path of the file to inspect
# @return [Object, nil] the result of the first lookup that produced a
#   value, or the (possibly nil) result of the path-based lookup
def get_mimetype(filename)
  sniffed = File.open(filename) { |io| MimeMagic.by_magic(io) }
  return sniffed if sniffed

  MimeMagic.by_path(filename)
end

#html_to_text(html) ⇒ `Object`

# File 'lib/slaw/extract/extractor.rb', line 36

# Convert an HTML string to text by applying the html_to_akn_text.xsl
# stylesheet that sits next to this source file.
#
# @param html [String] HTML markup to convert
# @return [String] the transformed output with a leading XML declaration
#   removed
def html_to_text(html)
  stylesheet_path = File.join(File.dirname(__FILE__), 'html_to_akn_text.xsl')
  # Block form closes the stylesheet handle once it has been parsed,
  # instead of leaving an open File behind on every call.
  xslt = File.open(stylesheet_path) { |f| Nokogiri::XSLT(f) }

  text = xslt.transform(Nokogiri::HTML(html)).to_s
  # Remove the XML declaration at the very top only; \A (unlike ^) cannot
  # match a later line of the body that happens to start with "<?xml ".
  text.sub(/\A<\?xml [^>]*>/, '')
end

Class: Slaw::Extract::Extractor

Overview

Instance Method Summary collapse

Methods included from Logging

Instance Method Details

#extract_from_file(filename) ⇒ String

#extract_from_html(filename) ⇒ Object

#extract_from_text(filename) ⇒ Object

#get_mimetype(filename) ⇒ Object

#html_to_text(html) ⇒ Object

#extract_from_file(filename) ⇒ `String`

#extract_from_html(filename) ⇒ `Object`

#extract_from_text(filename) ⇒ `Object`

#get_mimetype(filename) ⇒ `Object`

#html_to_text(html) ⇒ `Object`