Module: Ebooks::NLP
- Defined in: lib/twitter_ebooks/nlp.rb
Constant Summary
- PUNCTUATION =
Deliberately limit our punctuation handling to stuff we can handle consistently. It'll just remain part of a token if we don't split it out, and that's fine.
".?!,"
Class Method Summary
- .adjectives ⇒ Object
- .gingerice ⇒ Object
- .htmlentities ⇒ Object
- .normalize(text) ⇒ Object
- .nouns ⇒ Object
- .punctuation?(token) ⇒ Boolean
- .reconstruct(tokens) ⇒ Object
- .sentences(text) ⇒ Object
Utility functions which wrap the above.
- .space_between?(token1, token2) ⇒ Boolean
- .stemmer ⇒ Object
- .stopwords ⇒ Object
We don't necessarily want to use all of this stuff all the time. Only load it when it is needed.
- .tactful ⇒ Object
- .tagger ⇒ Object
- .tokenize(sentence) ⇒ Object
- .tokenizer ⇒ Object
- .tokenset(sentence) ⇒ Object
- .unmatched_enclosers?(text) ⇒ Boolean
Class Method Details
.adjectives ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 19

def self.adjectives
  @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
.gingerice ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 45

def self.gingerice
  require 'gingerice'
  Gingerice::Parser.new # No caching for this one
end
.htmlentities ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 50

def self.htmlentities
  require 'htmlentities'
  @htmlentities ||= HTMLEntities.new
end
.normalize(text) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 61

def self.normalize(text)
  htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
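A small usage sketch based on the code above: curly quotes and the ellipsis are replaced first, then HTML entities are decoded.

Ebooks::NLP.normalize("“Hello” &amp; goodbye…")
# => "\"Hello\" & goodbye..."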
.nouns ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 15

def self.nouns
  @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
.punctuation?(token) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 116

def self.punctuation?(token)
  (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
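A token qualifies only when every one of its characters belongs to PUNCTUATION. For example:

Ebooks::NLP.punctuation?("?!")   # => true  (all characters are in PUNCTUATION)
Ebooks::NLP.punctuation?("foo.") # => false (contains letters)
Ebooks::NLP.punctuation?("--")   # => false ('-' is not in PUNCTUATION)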
.reconstruct(tokens) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 100

def self.reconstruct(tokens)
  # Put tokens back together into a nice looking sentence
  text = ""
  last_token = nil
  tokens.each do |token|
    text += ' ' if last_token && space_between?(last_token, token)
    text += token
    last_token = token
  end
  text
end
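Combined with space_between?, this produces natural spacing: no space before punctuation tokens, a single space everywhere else. A minimal sketch:

Ebooks::NLP.reconstruct(["Hello", ",", "world", "!"])
# => "Hello, world!"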
.sentences(text) ⇒ Object
Utility functions which wrap the above.

# File 'lib/twitter_ebooks/nlp.rb', line 57

def self.sentences(text)
  tactful.tokenize_text(text)
end
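Sentence splitting is delegated to the tactful_tokenizer model, which returns an array of sentence strings. A hedged sketch of typical output:

Ebooks::NLP.sentences("I made a bot. It tweets nonsense!")
# => ["I made a bot.", "It tweets nonsense!"]  (typical tactful_tokenizer output)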
.space_between?(token1, token2) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 86

def self.space_between?(token1, token2)
  p1 = self.punctuation?(token1)
  p2 = self.punctuation?(token2)

  if p1 && p2       # "foo?!"
    false
  elsif !p1 && p2   # "foo."
    false
  elsif p1 && !p2   # "foo. rah"
    true
  else              # "foo rah"
    true
  end
end
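The branch comments spell out the rule: insert a space except directly before a punctuation token. For instance:

Ebooks::NLP.space_between?("foo", "rah") # => true  ("foo rah")
Ebooks::NLP.space_between?("foo", ".")   # => false ("foo.")
Ebooks::NLP.space_between?(".", "rah")   # => true  (". rah")
Ebooks::NLP.space_between?("?", "!")     # => false ("?!")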
.stemmer ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 40

def self.stemmer
  require 'lingua/stemmer'
  @stemmer ||= Lingua::Stemmer.new
end
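A hedged usage sketch; the #stem call is the ruby-stemmer (lingua/stemmer) API, and the exact output depends on its Snowball rules:

# Assumes ruby-stemmer's Lingua::Stemmer#stem; shown output is the expected English stem.
Ebooks::NLP.stemmer.stem("running") # => "run"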
.stopwords ⇒ Object
We don't necessarily want to use all of this stuff all the time. Only load it when it is needed.
# File 'lib/twitter_ebooks/nlp.rb', line 11

def self.stopwords
  @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
.tactful ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 30

def self.tactful
  require 'tactful_tokenizer'
  @tactful ||= TactfulTokenizer::Model.new
end
.tagger ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 35

def self.tagger
  require 'engtagger'
  @tagger ||= EngTagger.new
end
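A hedged sketch assuming engtagger's usual add_tags/get_nouns interface; results depend on its bundled part-of-speech model:

tagged = Ebooks::NLP.tagger.add_tags("The quick brown fox jumps")
Ebooks::NLP.tagger.get_nouns(tagged) # => e.g. {"fox" => 1}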
.tokenize(sentence) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 65

def self.tokenize(sentence)
  # This is hacky, but an ad hoc approach seems to be
  # most reliable for now. Tokenization libraries have oddities
  # that are hard to correct.
  sentence.split(/\s/).map do |token|
    exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
    if exceptions.find { |r| r.match(token) }
      token
    else
      token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
    end
  end.flatten
end
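A minimal sketch: split on whitespace, peel leading/trailing PUNCTUATION off ordinary words, and pass the exception patterns (mentions, hashtags, URLs, emoticons) through untouched:

Ebooks::NLP.tokenize("So good. Thanks @friend!")
# => ["So", "good", ".", "Thanks", "@friend!"]
# "@friend!" matches the /^@/ exception, so its "!" is never split off.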
.tokenizer ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 23

def self.tokenizer
  # This tokenizer is used for dividing sentences into words
  # It's too slow for finding sentences in paragraphs, hence tactful
  require 'tokenizer'
  @tokenizer ||= Tokenizer::Tokenizer.new(:en)
end
.tokenset(sentence) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 79

def self.tokenset(sentence)
  tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
  tokens.map(&:downcase)
        .reject { |token| stopwords.include?(token) }
        .to_set
end
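A hedged sketch; whether a word is dropped depends on the bundled stopwords.txt, so treating "the" and "on" as stopwords here is an assumption:

Ebooks::NLP.tokenset("The cat sat on the mat")
# => #<Set: {"cat", "sat", "mat"}>  (assuming "the" and "on" appear in stopwords.txt)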
.unmatched_enclosers?(text) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 120

def self.unmatched_enclosers?(text)
  # Weird quotes are an instant giveaway. Let's do paren-matching.
  enclosers = ['**', '""', '()', '[]', '``']
  enclosers.each do |pair|
    starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
    ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

    opened = 0

    tokenize(text).each do |token|
      opened += 1 if token.match(starter)
      opened -= 1 if token.match(ender)

      return true if opened < 0 # Too many ends!
    end

    return true if opened != 0 # Mismatch somewhere.
  end

  false
end
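A quick sketch of the balance check over the enclosers pairs:

Ebooks::NLP.unmatched_enclosers?("this (is fine)") # => false
Ebooks::NLP.unmatched_enclosers?("this (is not")   # => true  (opener never closed)
Ebooks::NLP.unmatched_enclosers?('he said "hi')    # => true  (unmatched quote)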