Module: PdfExtract::Language

Defined in:
lib/language.rb

Class Method Summary

Class Method Details

.cap_ratio(s) ⇒ Object

TODO: Ignore capital letters in the middle of words.



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/language.rb', line 30

# Ratio of "unexpected" capital letters to words in +s+.
#
# A capital letter (A-Z) is counted unless it is the first non-whitespace
# character of the string or the first non-whitespace character after a
# full stop. The count is divided by the number of whitespace-separated
# words, so the result can exceed 1.0 (e.g. for all-caps words).
#
# TODO Ignore caps in middle of words
#
# @param s [String] text to inspect
# @return [Float] counted capitals per word; 0.0 when +s+ contains no words
def self.cap_ratio s
  word_total = s.split.length
  # Without this guard the division below is 0 / 0.0, which yields NaN.
  return 0.0 if word_total.zero?

  at_sentence_start = true
  cap_count = 0

  s.each_char do |c|
    case c
    when "."
      at_sentence_start = true
    when /[A-Z]/
      cap_count += 1 unless at_sentence_start
      at_sentence_start = false
    when /\S/
      # Any other visible character means we are past the sentence opener;
      # whitespace deliberately leaves the flag untouched.
      at_sentence_start = false
    end
  end

  cap_count / word_total.to_f
end

.letter_ratio(s) ⇒ Object



25
26
27
# File 'lib/language.rb', line 25

# Fraction of the characters in +s+ that belong to a fixed character set.
#
# The counted set is uppercase A-Z, digits 0-9 and the punctuation
# characters - [ ] , . " ' ( ). Lowercase letters are not in the set.
#
# @param s [String] text to inspect
# @return [Float] counted characters divided by total length; 0.0 for an
#   empty string
def self.letter_ratio s
  # Without this guard an empty string gives 0 / 0.0, which yields NaN.
  return 0.0 if s.empty?

  s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
end

.name_ratio(content) ⇒ Object



58
59
60
# File 'lib/language.rb', line 58

# Looks up the +:name_frequency+ entry of the result returned by
# PdfExtract::Names.detect_names for +content+.
#
# @param content [Object] passed through unchanged to the name detector
# @return [Object] whatever the detector stored under +:name_frequency+
def self.name_ratio(content)
  detection = PdfExtract::Names.detect_names(content)
  detection[:name_frequency]
end

.transliterate(s) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/language.rb', line 5

# Replaces typographic characters commonly found in extracted PDF text with
# plain ASCII stand-ins, then collapses every run of whitespace to a single
# space.
#
# Handled characters: the Latin ligatures U+FB00-U+FB06, curly single and
# double quotes, en and em dashes, and a few symbols (U+25AF, U+00B4,
# U+00B1) that are mapped as listed in the table below.
#
# @param s [String] text to normalise; it is not modified
# @return [String] a new, normalised string
def self.transliterate s
  replacements = {
    "\ufb00" => "ff",
    "\ufb01" => "fi",
    "\ufb02" => "fl",
    "\ufb03" => "ffi",
    "\ufb04" => "ffl",
    "\ufb05" => "st",
    "\ufb06" => "st",
    "\u2018" => "'",
    "\u2019" => "'",
    "\u2013" => "-",
    "\u2014" => "-",
    "\u201c" => "\"",
    "\u201d" => "\"",
    "\u25af" => "(",
    "\u00b4" => "",
    "\u00b1" => "-"
  }

  # Every key is a single character and no replacement text contains a key,
  # so one table-driven pass gives the same result as chained substitutions.
  plain = s.gsub(Regexp.union(replacements.keys), replacements)

  plain.gsub(/\s+/, " ")
end

.word_count(s) ⇒ Object



62
63
64
# File 'lib/language.rb', line 62

# Number of whitespace-separated words in +s+.
#
# @param s [String] text to measure
# @return [Integer] how many pieces +String#split+ produces for +s+
def self.word_count(s)
  words = s.split
  words.length
end

.year_ratio(s) ⇒ Object



48
49
50
51
52
53
54
55
56
# File 'lib/language.rb', line 48

# Fraction of the whitespace-separated words in +s+ that contain a run of
# exactly four digits (a year-like token such as "1999", "(2005)" or
# "1999.").
#
# A four-digit run only counts when it is not part of a longer digit run,
# so "12345" is not treated as a year.
#
# @param s [String] text to inspect
# @return [Float] year-like words divided by total words; 0.0 when +s+
#   contains no words
def self.year_ratio s
  words = s.split
  # Without this guard the division below is 0 / 0.0, which yields NaN.
  return 0.0 if words.empty?

  # Lookarounds reject neighbouring digits without requiring a neighbouring
  # character to exist, so a bare "1999" is recognised too.
  year_total = words.count { |word| word =~ /(?<!\d)\d{4}(?!\d)/ }

  year_total / words.length.to_f
end