Class: Slaw::Extract::Extractor

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/slaw/extract/extractor.rb

Overview

Routines for extracting and cleaning up content from other formats, such as HTML.

Instance Method Summary collapse

Methods included from Logging

#logger

Instance Method Details

#extract_from_file(filename) ⇒ String

Extract text from a file.

Parameters:

  • filename (String)

    filename to extract from

Returns:

  • (String)

    extracted text



13
14
15
16
17
18
19
# File 'lib/slaw/extract/extractor.rb', line 13

# Extract text from a file, choosing the extraction routine by file extension.
#
# Files whose name ends in +.html+ or +.htm+ (compared case-insensitively, so
# +PAGE.HTML+ is treated as HTML too) go through #extract_from_html; everything
# else is handed to #extract_from_text.
#
# @param filename [String] filename to extract from
# @return [String] extracted text
def extract_from_file(filename)
  # downcase first so upper- and mixed-case extensions are recognised as HTML
  if filename.downcase.end_with?('.html', '.htm')
    extract_from_html(filename)
  else
    extract_from_text(filename)
  end
end

#extract_from_html(filename) ⇒ Object



25
26
27
# File 'lib/slaw/extract/extractor.rb', line 25

# Read an HTML file from disk and convert its markup to text.
#
# @param filename [String] path of the HTML file to read
# @return [Object] whatever #html_to_text returns for the file's contents
def extract_from_html(filename)
  markup = File.read(filename)
  html_to_text(markup)
end

#extract_from_text(filename) ⇒ Object



21
22
23
# File 'lib/slaw/extract/extractor.rb', line 21

# Return the contents of a plain-text file unchanged.
#
# @param filename [String] path of the file to read
# @return [String] the file's full contents
def extract_from_text(filename)
  File.open(filename, &:read)
end

#get_mimetype(filename) ⇒ Object



38
39
40
41
# File 'lib/slaw/extract/extractor.rb', line 38

# Determine the mimetype of a file.
#
# The file is opened and passed to MimeMagic.by_magic first; when that yields
# nothing truthy, MimeMagic.by_path is consulted with the filename instead.
#
# @param filename [String] path of the file to inspect
# @return [Object] the first truthy MimeMagic result, otherwise the by_path
#   result (presumably a MimeMagic instance or nil -- confirm against MimeMagic)
def get_mimetype(filename)
  sniffed = File.open(filename) { |io| MimeMagic.by_magic(io) }
  return sniffed if sniffed

  MimeMagic.by_path(filename)
end

#html_to_text(html) ⇒ Object



29
30
31
32
33
34
35
36
# File 'lib/slaw/extract/extractor.rb', line 29

# Convert an HTML string to text by running it through the
# +html_to_akn_text.xsl+ stylesheet that sits next to this source file.
#
# @param html [String] HTML markup to convert
# @return [String] the transform output with a leading XML declaration removed
def html_to_text(html)
  here = File.dirname(__FILE__)
  # Block form closes the stylesheet handle as soon as it has been parsed,
  # instead of leaving it open until garbage collection.
  xslt = File.open(File.join(here, 'html_to_akn_text.xsl')) { |f| Nokogiri::XSLT(f) }

  text = xslt.transform(Nokogiri::HTML(html)).to_s
  # Remove the XML declaration at the top. \A anchors to the very start of the
  # string; ^ would also match at the start of any later line.
  text.sub(/\A<\?xml [^>]*>/, '')
end