Class: PlainTextExtractor

Inherits:
Object show all
Includes:
PlainTextExtractorDSL
Defined in:
lib/picolena/templates/app/models/plain_text_extractor.rb

Overview

PlainTextExtractor is the class responsible for extracting plain text content from different document filetypes (.doc, .html, .pdf, .od?), as defined in

lib/plain_text_extractors/*.rb

Instance Attribute Summary collapse

Attributes included from PlainTextExtractorDSL

#command, #content_and_file_examples, #description, #exts, #mime_name, #thumbnail_command

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PlainTextExtractorDSL

#aka, #as, #every, #extract_content_from_archive_with, #extract_content_with, #extract_thumbnail_with, #initialize, #which_requires, #which_should_for_example_extract

Instance Attribute Details

#source ⇒ Object

Returns the value of attribute source.



67
68
69
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 67

# Reader for the @source instance variable.
# The class-level find_by_filename assigns it (through source=) to the
# filename it was given.
def source
  @source
end

Class Method Details

.add(extractor) ⇒ Object

Add an extractor to the extractors list



15
16
17
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 15

# Appends the given extractor to the list returned by +all+.
# The value of the << call is returned as is.
def add(extractor)
  registry = all
  registry << extractor
end

.all ⇒ Object

Returns every defined extractor



10
11
12
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 10

# Returns the Picolena::Extractors constant, i.e. the collection that
# +add+ appends to and that the finder methods search through.
def all
  Picolena::Extractors
end

.dependencies ⇒ Object

Returns every required dependency for every defined extractor



20
21
22
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 20

# Collects the dependencies of every extractor in +all+ into one flat,
# nil-free, duplicate-free, sorted list.
# The list is memoized in @@dependencies, so it is computed only once.
def dependencies
  @@dependencies ||= begin
    per_extractor = all.map { |extractor| extractor.dependencies }
    per_extractor.flatten.compact.uniq.sort
  end
end

.extract_content_from(source) ⇒ Object

Launches extractor on given file and outputs plain text result



45
46
47
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 45

# Looks up the extractor matching +source+ (see find_by_filename) and
# returns whatever its extract_content returns.
# find_by_filename raises when no extractor handles the file.
def extract_content_from(source)
  extractor = find_by_filename(source)
  extractor.extract_content
end

.extract_information_from(source) ⇒ Object

Launches extractor on given file and outputs plain text result and language (if found)



50
51
52
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 50

# Looks up the extractor matching +source+ (see find_by_filename) and
# returns whatever its extract_information returns.
# find_by_filename raises when no extractor handles the file.
def extract_information_from(source)
  extractor = find_by_filename(source)
  extractor.extract_information
end

.extract_thumbnail_from(source) ⇒ Object

Tries to extract a thumbnail from source. Doesn’t do anything if thumbnail_command isn’t defined for the corresponding filetype.



56
57
58
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 56

# Looks up the extractor matching +source+ (see find_by_filename) and
# returns whatever its extract_thumbnail returns.
# find_by_filename raises when no extractor handles the file.
def extract_thumbnail_from(source)
  extractor = find_by_filename(source)
  extractor.extract_thumbnail
end

.find_by_extension(ext) ⇒ Object

Finds which extractor should be used for a given file, according to its extension. Raises if the file is unsupported.



40
41
42
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 40

# Returns the first extractor in +all+ whose exts include +ext+.
# Raises ArgumentError when no extractor lists that extension.
def find_by_extension(ext)
  match = all.find { |candidate| candidate.exts.include?(ext) }
  raise ArgumentError, "no convertor for .#{ext}" unless match
  match
end

.find_by_filename(filename) ⇒ Object

Finds which extractor should be used for a given file. Raises if the file is unsupported.



31
32
33
34
35
36
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 31

# Picks the extractor registered for the extension of +filename+
# (as computed by File.ext_as_sym), stores +filename+ in its source
# attribute and returns that extractor.
# find_by_extension raises when the extension is not supported.
def find_by_filename(filename)
  extractor = find_by_extension(File.ext_as_sym(filename))
  # The found extractor object itself is updated, not a copy of it.
  extractor.source = filename
  extractor
end

.language_guesser ⇒ Object

Returns which language guesser should be used by the system. Returns nil if none is found.



62
63
64
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 62

# Returns the 'mguesser -n1' command line when 'mguesser'.installed? is
# true, nil otherwise.
# Only a truthy result is memoized by ||=, so the installed? check is
# repeated on every call for as long as it fails.
def language_guesser
  @@language_guesser ||= 'mguesser'.installed? ? 'mguesser -n1' : nil
end

.supported_extensions ⇒ Object

Returns every supported file extension



25
26
27
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 25

# Collects the exts of every extractor in +all+ into one flat,
# nil-free, duplicate-free list.
# The list is memoized in @@supported_exts, so it is computed only once.
def supported_extensions
  @@supported_exts ||= begin
    per_extractor = all.map { |extractor| extractor.exts }
    per_extractor.flatten.compact.uniq
  end
end

Instance Method Details

#dependencies ⇒ Object

Parses commands in order to know which programs are needed. rspec will then check that every dependency is installed on the system.



71
72
73
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 71

# Returns one flat array made of @dependencies, the dependencies of
# +command+ and the dependencies of +thumbnail_command+, in that order.
# nil entries are kept here (the class-level list compacts them).
def dependencies
  declared = @dependencies
  from_command = command.dependencies
  from_thumbnail = thumbnail_command.dependencies
  [declared, from_command, from_thumbnail].flatten
end

#extract_content ⇒ Object

Conversion part. Returns the plain text content of the source file.



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 77

# Runs the extraction described by +command+ and returns its result.
#
# * command is not a String: it is treated as a block and the result of
#   command.call(source) is returned.
# * command is a String containing the 'DESTINATION' keyword:
#   specific_command is run through silently_execute, then the content
#   of the +destination+ file is returned via File.read_and_remove.
# * any other String: the value of silently_execute(specific_command)
#   is returned (the command's STDOUT, according to the original notes).
def extract_content
  return command.call(source) unless command.is_a?(String)

  # Decide where the result will come from before running anything.
  writes_to_destination = command.include?('DESTINATION')
  stdout = silently_execute(specific_command)
  writes_to_destination ? File.read_and_remove(destination) : stdout
end

#extract_information ⇒ Object

Returns plain text content and language of source file, using mguesser to guess used language. This method only returns probable language if the content is bigger than 500 chars and if probability score is higher than 90%.



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 102

# Returns a Hash holding the plain text :content of the source file and,
# when language recognition ran, a :language entry (nil if no language
# could be guessed reliably).
#
# Only {:content => content} is returned unless all of these hold:
# * Picolena::UseLanguageRecognition is on (cf config/custom/picolena.rb),
# * PlainTextExtractor.language_guesser returns a command,
# * the content is bigger than 500 chars.
#
# The conditions are checked lazily and in that order, so the guesser is
# not looked up and the content size is not measured when language
# recognition is turned off.
def extract_information
  content = extract_content

  guesser = Picolena::UseLanguageRecognition && PlainTextExtractor.language_guesser
  # Language recognition is too unreliable for small files.
  return {:content => content} unless guesser && content.size > 500

  language = IO.popen(guesser, 'w+') { |lang_guesser|
    lang_guesser.write content
    # Signal end of input so that the guesser can answer.
    lang_guesser.close_write
    # Expected answer: score <TAB> language <TAB> encoding
    if lang_guesser.read =~ /^([01]\.\d+)\t(\w+)\t(\w+)/ then
      score, lang = $1.to_f, $2
      # Language recognition isn't reliable if score is too low.
      lang unless score < 0.9
    end
  }

  {:content => content, :language => language}
end

#extract_thumbnail ⇒ Object



126
127
128
# File 'lib/picolena/templates/app/models/plain_text_extractor.rb', line 126

# Runs specific_thumbnail_command through silently_execute and returns
# its result. Does nothing and returns nil when thumbnail_command is not
# set.
def extract_thumbnail
  return unless thumbnail_command

  silently_execute(specific_thumbnail_command)
end