Module: Ebooks::NLP

Defined in:
lib/twitter_ebooks/nlp.rb

Constant Summary

PUNCTUATION =

Deliberately limit our punctuation handling to stuff we can do consistently. It'll just be a part of a token if we don't split it out, and that's fine.

".?!,"

Class Method Summary

Class Method Details

.adjectives ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 19

def self.adjectives
  @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end

.gingerice ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 45

def self.gingerice
  require 'gingerice'
  Gingerice::Parser.new # No caching for this one
end
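
Gingerice wraps the Ginger proofreading service, so calling it hits the network. A usage sketch following the gingerice gem's README; treat the exact return shape as an assumption:

parser = Ebooks::NLP.gingerice
parser.parse("this is a sentense")
# => a Hash with the original text, a corrected "result" string, and a list of corrections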

.htmlentities ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 50

def self.htmlentities
  require 'htmlentities'
  @htmlentities ||= HTMLEntities.new
end
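
A brief usage sketch; the decoded output assumes the htmlentities gem's standard handling of named entities:

Ebooks::NLP.htmlentities.decode("cats &amp; dogs")
# => "cats & dogs"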

.normalize(text) ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 61

def self.normalize(text)
  htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
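
Curly quotes and ellipses are flattened to ASCII before entity decoding. A sketch with a made-up input string:

Ebooks::NLP.normalize("She said “hi” &amp; left…")
# => "She said \"hi\" & left..."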

.nouns ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 15

def self.nouns
  @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end

.punctuation?(token) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/twitter_ebooks/nlp.rb', line 116

def self.punctuation?(token)
  (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
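
A token counts as punctuation only when every character appears in PUNCTUATION, for example:

Ebooks::NLP.punctuation?("?!")    # => true
Ebooks::NLP.punctuation?("foo.")  # => false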

.reconstruct(tokens) ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 100

def self.reconstruct(tokens)
  # Put tokens back together into a nice looking sentence
  text = ""
  last_token = nil
  tokens.each do |token|
    text += ' ' if last_token && space_between?(last_token, token)
    text += token
    last_token = token
  end
  text
end
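
A small sketch of how tokens are rejoined; spacing follows space_between?, so punctuation attaches to the preceding token:

Ebooks::NLP.reconstruct(["hello", ",", "@friend", "!"])
# => "hello, @friend!"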

.sentences(text) ⇒ Object

Utility functions which wrap the above



# File 'lib/twitter_ebooks/nlp.rb', line 57

def self.sentences(text)
  tactful.tokenize_text(text)
end
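
A usage sketch; the exact split depends on the tactful_tokenizer model, but straightforward text comes back as one string per sentence:

Ebooks::NLP.sentences("I like cats. Dogs are fine too!")
# => ["I like cats.", "Dogs are fine too!"]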

.space_between?(token1, token2) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/twitter_ebooks/nlp.rb', line 86

def self.space_between?(token1, token2)
  p1 = self.punctuation?(token1)
  p2 = self.punctuation?(token2)
  if p1 && p2 # "foo?!"
    false
  elsif !p1 && p2 # "foo."
    false
  elsif p1 && !p2 # "foo. rah"
    true
  else # "foo rah"
    true
  end
end
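
In short, a space is inserted unless the second token is pure punctuation:

Ebooks::NLP.space_between?("foo", "bar")  # => true
Ebooks::NLP.space_between?("foo", "!")    # => false
Ebooks::NLP.space_between?("!", "foo")    # => true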

.stemmer ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 40

def self.stemmer
  require 'lingua/stemmer'
  @stemmer ||= Lingua::Stemmer.new
end
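
A sketch assuming the default English stemmer provided by the ruby-stemmer (lingua) gem:

Ebooks::NLP.stemmer.stem("running")
# => "run"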

.stopwords ⇒ Object

We don't necessarily want to use all of this stuff all the time. Only load it when it is needed.



# File 'lib/twitter_ebooks/nlp.rb', line 11

def self.stopwords
  @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end

.tactful ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 30

def self.tactful
  require 'tactful_tokenizer'
  @tactful ||= TactfulTokenizer::Model.new
end

.tagger ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 35

def self.tagger
  require 'engtagger'
  @tagger ||= EngTagger.new
end
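
EngTagger does part-of-speech tagging; a sketch using its add_tags method, with the tag output shown only as an assumption of the engtagger gem's usual format:

Ebooks::NLP.tagger.add_tags("the dog runs")
# => e.g. "<det>the</det> <nn>dog</nn> <vbz>runs</vbz>"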

.tokenize(sentence) ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 65

def self.tokenize(sentence)
  # This is hacky, but an ad hoc approach seems to be
  # most reliable for now. Tokenization libraries have oddities
  # that are hard to correct.
  sentence.split(/\s/).map do |token|
    exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
    if exceptions.find { |r| r.match(token) }
      token
    else
      token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
    end
  end.flatten
end
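
Trailing sentence punctuation is split off, while mentions, hashtags and URLs are kept whole. A sketch with a made-up input:

Ebooks::NLP.tokenize("Hey! Read this, @friend: http://example.com")
# => ["Hey", "!", "Read", "this", ",", "@friend:", "http://example.com"]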

.tokenizer ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 23

def self.tokenizer
  # This tokenizer is used for dividing sentences into words
  # It's too slow for finding sentences in paragraphs, hence tactful
  require 'tokenizer'
  @tokenizer ||= Tokenizer::Tokenizer.new(:en)
end

.tokenset(sentence) ⇒ Object



# File 'lib/twitter_ebooks/nlp.rb', line 79

def self.tokenset(sentence)
  tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
  tokens.map(&:downcase)
        .reject { |token| stopwords.include?(token) }
        .to_set
end
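
The result depends on the contents of data/stopwords.txt; the sketch below assumes common words like "the" and "on" are listed there:

Ebooks::NLP.tokenset("The cat sat on the mat")
# => #<Set: {"cat", "sat", "mat"}>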

.unmatched_enclosers?(text) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/twitter_ebooks/nlp.rb', line 120

def self.unmatched_enclosers?(text)
  # Weird quotes are an instant giveaway. Let's do paren-matching.
  enclosers = ['**', '""', '()', '[]', '``']
  enclosers.each do |pair|
    starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
    ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

    opened = 0

    tokenize(text).each do |token|
      opened += 1 if token.match(starter)
      opened -= 1 if token.match(ender)

      return true if opened < 0 # Too many ends!
    end

    return true if opened != 0 # Mismatch somewhere.
  end

  false
end
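
For instance, a quote opened but never closed trips the check, while a balanced pair does not:

Ebooks::NLP.unmatched_enclosers?('She said "wait')     # => true
Ebooks::NLP.unmatched_enclosers?('She said "wait up"') # => false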