Module: PlainTextExtractorDSL

Included in:
PlainTextExtractor
Defined in:
lib/picolena/templates/lib/plain_text_extractor_DSL.rb

Overview

Defines plain text extractors with a DSL. For example, to convert a “Microsoft Office Word document” to plain text:

# The block is instance_eval'ed on the new extractor, so each line below is a
# call to one of the DSL methods of this module.
PlainTextExtractor.new {
  # File extensions handled by this extractor (stored in exts).
  every :doc, :dot
  # MIME name (stored in mime_name).
  as "application/msword"
  # Human-readable description (stored in description).
  aka "Microsoft Office Word document"
  # Command to run, selected according to the current platform.
  with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
  # Example content/file pairs, used by rspec to test the extractor.
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#command ⇒ Object (readonly)

Returns the value of attribute command.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 13

# Reader for @command, which #with sets to either a command String or the
# block it was given.
def command
  @command
end

#content_and_file_examples ⇒ Object (readonly)

Returns the value of attribute content_and_file_examples.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 13

# Reader for @content_and_file_examples: an Array that #initialize creates
# empty and #which_should_for_example_extract fills with
# [content, file[:from]] pairs.
def content_and_file_examples
  @content_and_file_examples
end

#description ⇒ Object (readonly)

Returns the value of attribute description.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 13

# Reader for @description, the value given to #aka.
def description
  @description
end

#exts ⇒ Object (readonly)

Returns the value of attribute exts.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 13

# Reader for @exts, the Array of arguments given to #every.
def exts
  @exts
end

#mime_name ⇒ Object (readonly)

Returns the value of attribute mime_name.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 13

# Reader for @mime_name, the value given to #as.
def mime_name
  @mime_name
end

Instance Method Details

#aka(description) ⇒ Object



30
31
32
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 30

# DSL keyword: records the human-readable description of the file type,
# e.g. aka "Microsoft Office Word document".
# Returns the description that was stored.
def aka(description)
  @description = description
end

#as(mime_name) ⇒ Object



26
27
28
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 26

# DSL keyword: records the MIME name of the file type,
# e.g. as "application/msword".
# Returns the MIME name that was stored.
def as(mime_name)
  @mime_name = mime_name
end

#every(*exts) ⇒ Object



22
23
24
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 22

# DSL keyword: records the file extensions handled by the extractor,
# e.g. every :doc, :dot.
# All arguments are collected into one Array, which is stored and returned.
def every(*exts)
  @exts = exts
end

#initialize(&block) ⇒ Object



15
16
17
18
19
20
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 15

# Builds an extractor from a DSL block.
#
# The block is evaluated in the context of the new instance, so the DSL
# keywords called inside it set this instance's attributes. Once the block has
# run, the instance is handed to PlainTextExtractor.add and its extensions
# and MIME name are handed to MimeType.add.
def initialize(&block)
  # Must exist before the block runs, since the example keywords append to it.
  @content_and_file_examples = []
  instance_eval(&block)
  PlainTextExtractor.add(self)
  MimeType.add(exts, mime_name)
end

#which_requires(*dependencies) ⇒ Object



34
35
36
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 34

# DSL keyword: records the dependencies of the extractor.
# All arguments are collected into one Array, which is stored in
# @dependencies and returned.
def which_requires(*dependencies)
  @dependencies = dependencies
end

#which_should_for_example_extract(content, file) ⇒ Object Also known as: or_extract

Used by RSpec to test extractors:

which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'

This spec will pass if ‘basic.pdf’ and ‘yet_another.pdf’ are included in an indexed directory, if every dependency is installed, and if the plain text output of the extractor applied to ‘basic.pdf’ and ‘yet_another.pdf’ includes ‘in a pdf file’ and ‘some other stuff inside another pdf file’, respectively.



44
45
46
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 44

# DSL keyword: records one example for the extractor, e.g.
#     which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
#
# content is the text expected in the extractor's output; file is a Hash
# whose :from entry names the example file. The pair [content, file[:from]]
# is appended to @content_and_file_examples, and that Array is returned.
def which_should_for_example_extract(content, file)
  example = [content, file[:from]]
  @content_and_file_examples.push(example)
end

#with(command_as_hash_or_string = nil, &block) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/picolena/templates/lib/plain_text_extractor_DSL.rb', line 53

# DSL keyword: defines how a source file is converted to plain text.
#
# command_as_hash_or_string may be:
# * a String, used as the command whatever the platform;
# * a Hash mapping each command to a platform selector (see the comment in the
#   Hash branch below), from which the command for the current platform is
#   picked;
# * nil, in which case the given block is stored instead of a command.
#
# The result is stored in @command. On linux and mac_os, ' 2>/dev/null' is
# appended to String commands that do not contain a pipe.
#
# Raises RuntimeError if neither a command nor a block is given, or if a Hash
# holds no command for the current platform.
def with(command_as_hash_or_string=nil,&block)
  #TODO: Find a better way to manage platforms, and include Vista, BSD...
  platform=case RUBY_PLATFORM
  when /linux/
    :linux
  when /darwin/
    # Must be tested before /win/: "darwin" itself contains "win".
    :mac_os
  when /win|mingw/
    # "mingw" covers Windows builds reported as e.g. x64-mingw32 / x64-mingw-ucrt.
    :windows
  end
  @command=case command_as_hash_or_string
  when String
    # Work on a copy: the redirection appended below must not modify the
    # caller's string (and must not fail if that string is frozen).
    command_as_hash_or_string.dup
  when Hash
    # Allows to write
    #     with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
    #          "some other command" => :on_windows
    #
    # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
    # on windows, it returns "some other command"
    #
    # If commands for linux & mac os were different :
    #     with "some command"        => :on_linux,
    #          "another command"     => :on_mac_os,
    #          "yet another command" => :on_windows
    #
    #TODO: Make it clearer and more robust.
    platforms_and_command=command_as_hash_or_string.invert.find{|platforms,_command|
      platforms.to_s.split(/_?and_?/i).map{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
    }
    # Fail with an explicit message instead of a NoMethodError on nil.
    raise("No command defined on platform #{RUBY_PLATFORM} for this extractor: #{description}") unless platforms_and_command
    platforms_and_command.last.dup
  else
    block || raise("No command defined for this extractor: #{description}")
  end
  # TODO, replace it with Open3 or something.
  @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
end