Class: Keynote::Extractor::TextExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/keynote/extractor/text_extractor.rb

Class Method Summary collapse

Class Method Details

.extract_iwa(file) ⇒ Object



36
37
38
39
40
41
42
43
44
45
# File 'lib/keynote/extractor/text_extractor.rb', line 36

# Extracts candidate words from raw IWA contents.
#
# The input is treated as UTF-8 with invalid byte sequences replaced, every
# character other than ASCII letters, digits and spaces is removed, and the
# remainder is split on whitespace. A word is discarded when it is longer
# than 20 characters, is 2 characters or shorter, or contains "Transition".
#
# The argument is never modified: transcoding happens on a copy, so frozen
# strings are accepted and the caller keeps its original bytes and encoding.
#
# @param file [String] raw contents to scan
# @return [Array<String>] surviving words in order of appearance
#   (duplicates are kept)
def self.extract_iwa(file)
  # Non-destructive `encode` (instead of `encode!`) leaves the caller's string alone.
  cleaned = file.encode('UTF-8', 'UTF-8', invalid: :replace)
                .gsub(/[^0-9a-z ]/i, '')

  cleaned.split(' ').reject do |word|
    word.length > 20 || word.length <= 2 || word.include?('Transition')
  end
end

.get_from_bodies(body_text) ⇒ Object



32
33
34
# File 'lib/keynote/extractor/text_extractor.rb', line 32

# Returns the content of the first child of +body_text+.
#
# A node without any child yields an empty string instead of raising
# NoMethodError on nil.
#
# @param body_text [#child] node to read; presumably a Nokogiri node for an
#   sf:text-body element -- confirm against callers
# @return [Object] the child's +content+ (a String for Nokogiri nodes), or
#   '' when there is no child
def self.get_from_bodies(body_text)
  first_child = body_text.child
  first_child ? first_child.content : ''
end

.get_text(content) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/keynote/extractor/text_extractor.rb', line 7

# Collects the unique pieces of text found in +content+.
#
# * When +content+ is an Array, each element is passed to +extract_iwa+ and
#   all resulting words are gathered.
# * Otherwise +content+ is handed to +search_for_bodies+, each node found is
#   read with +get_from_bodies+, and empty results are skipped.
#
# @param content [Array, Object] an array of items for +extract_iwa+, or a
#   single document for +search_for_bodies+
# @return [Array] the collected text with duplicates removed, first
#   occurrence order preserved
def self.get_text(content)
  collected =
    if content.is_a?(Array)
      content.flat_map { |chunk| extract_iwa(chunk) }
    else
      search_for_bodies(content)
        .map { |node| get_from_bodies(node) }
        .reject(&:empty?)
    end

  collected.uniq
end

.search_for_bodies(content) ⇒ Object



27
28
29
30
# File 'lib/keynote/extractor/text_extractor.rb', line 27

# Parses +content+ with Nokogiri::XML and returns the result of the XPath
# query "//sf:text-body" on the parsed document.
#
# @param content [Object] XML source accepted by Nokogiri::XML
# @return [Object] whatever +xpath+ returns for the query
def self.search_for_bodies(content)
  Nokogiri::XML(content).xpath('//sf:text-body')
end