Module: PdfExtract::Language
- Defined in:
- lib/language.rb
Class Method Summary collapse
- .cap_ratio(s) ⇒ Object
  TODO: Ignore caps in middle of words.
- .letter_ratio(s) ⇒ Object
- .name_ratio(content) ⇒ Object
- .transliterate(s) ⇒ Object
- .word_count(s) ⇒ Object
- .year_ratio(s) ⇒ Object
Class Method Details
.cap_ratio(s) ⇒ Object
TODO Ignore caps in middle of words
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/language.rb', line 30
# Ratio of capital letters that do NOT open a sentence to the number of
# whitespace-separated words in +s+.
#
# A capital (A-Z) is treated as sentence-opening when it is the first
# non-whitespace character of the string or the first non-whitespace
# character after a ".". Every other capital is counted, including
# capitals in the middle of a word, so the result can exceed 1.0.
#
# TODO: Ignore caps in middle of words.
#
# @param s [String] text to inspect
# @return [Float] counted capitals divided by word count; 0.0 when +s+
#   contains no words (previously NaN)
def self.cap_ratio(s)
  word_total = s.split.length
  # Avoid 0 / 0.0 => NaN for empty or whitespace-only input.
  return 0.0 if word_total.zero?

  at_sentence_start = true
  cap_count = 0

  s.each_char do |c|
    case c
    when "."
      at_sentence_start = true
    when /[A-Z]/
      cap_count += 1 unless at_sentence_start
      at_sentence_start = false
    when /\S/
      # Any other visible character means we are now inside a sentence;
      # whitespace leaves the flag untouched.
      at_sentence_start = false
    end
  end

  cap_count / word_total.to_f
end
.letter_ratio(s) ⇒ Object
25 26 27 |
# File 'lib/language.rb', line 25
# Fraction of the characters in +s+ that belong to a fixed set:
# uppercase A-Z, digits 0-9, and the punctuation - [ ] , . " ' ( ).
#
# NOTE: despite the name, lowercase letters are not part of the counted
# set; that selection is kept exactly as it was.
#
# @param s [String] text to inspect
# @return [Float] matching characters divided by total length; 0.0 for
#   the empty string (previously NaN)
def self.letter_ratio(s)
  # Avoid 0 / 0.0 => NaN when there is nothing to measure.
  return 0.0 if s.empty?

  s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
end
.name_ratio(content) ⇒ Object
58 59 60 |
# File 'lib/language.rb', line 58
# Runs name detection over +content+ and returns the :name_frequency
# entry of the detection result.
#
# @param content [Object] passed unchanged to
#   PdfExtract::Names.detect_names
# @return [Object] the value stored under :name_frequency
def self.name_ratio(content)
  detection = PdfExtract::Names.detect_names(content)
  detection[:name_frequency]
end
.transliterate(s) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 |
# File 'lib/language.rb', line 5
# Replaces typographic characters with plain ASCII stand-ins and
# collapses every run of whitespace into a single space.
#
# Handled: Latin ligatures (ff, fi, fl, ffi, ffl, st), curly single and
# double quotes, en/em dashes, plus a few symbol substitutions kept from
# the original table (U+25AF => "(", U+00B4 removed, U+00B1 => "-").
#
# The "ff" ligature (U+FB00) was missing from the original table even
# though the other f-ligatures were expanded; it is now included.
#
# @param s [String] text to normalise
# @return [String] a new string; +s+ itself is not modified
def self.transliterate(s)
  replacements = {
    "\ufb00" => "ff",
    "\ufb01" => "fi",
    "\ufb02" => "fl",
    "\ufb03" => "ffi",
    "\ufb04" => "ffl",
    "\ufb06" => "st",
    "\u2018" => "'",
    "\u2019" => "'",
    "\u2013" => "-",
    "\u2014" => "-",
    "\u201c" => "\"",
    "\u201d" => "\"",
    "\u25af" => "(",
    "\u00b4" => "",
    "\u00b1" => "-"
  }

  # No replacement produces a character that is itself a key, so one
  # table-driven pass is equivalent to applying the substitutions in turn.
  mapped = s.gsub(Regexp.union(replacements.keys), replacements)
  mapped.gsub(/\s+/, " ")
end
.word_count(s) ⇒ Object
62 63 64 |
# File 'lib/language.rb', line 62
# Number of whitespace-separated tokens in +s+.
#
# @param s [String] text to measure
# @return [Integer] token count (0 for empty or whitespace-only text)
def self.word_count(s)
  tokens = s.split
  tokens.size
end
.year_ratio(s) ⇒ Object
48 49 50 51 52 53 54 55 56 |
# File 'lib/language.rb', line 48
# Fraction of whitespace-separated words in +s+ that contain a run of
# exactly four digits (a digit run of any other length does not count).
# No check is made that the number is a plausible year.
#
# The previous pattern required a non-digit character on BOTH sides of
# the four digits inside the word, so tokens such as "2004", "2004," or
# "(2004" were never counted. Lookarounds are used instead so the digits
# may sit at either edge of the word.
#
# @param s [String] text to inspect
# @return [Float] matching words divided by total words; 0.0 when +s+
#   contains no words (previously NaN)
def self.year_ratio(s)
  words = s.split
  # Avoid 0 / 0.0 => NaN for empty or whitespace-only input.
  return 0.0 if words.empty?

  year_words = words.count { |word| word =~ /(?<!\d)\d{4}(?!\d)/ }
  year_words / words.length.to_f
end