Module: Ebooks::NLP
- Defined in: lib/twitter_ebooks/nlp.rb
Constant Summary
- PUNCTUATION =
Deliberately limit our punctuation handling to stuff we can handle consistently. It'll just remain part of a token if we don't split it out, and that's fine.
".?!,"
Class Method Summary
- .adjectives ⇒ Object
- .gingerice ⇒ Object
- .htmlentities ⇒ Object
- .normalize(text) ⇒ Object
- .nouns ⇒ Object
- .punctuation?(token) ⇒ Boolean
- .reconstruct(tokens) ⇒ Object
- .sentences(text) ⇒ Object
Utility functions which wrap the above.
- .space_between?(token1, token2) ⇒ Boolean
- .stemmer ⇒ Object
- .stopwords ⇒ Object
We don't necessarily want to use all of this stuff all the time. Only load it when it is needed.
- .tactful ⇒ Object
- .tagger ⇒ Object
- .tokenize(sentence) ⇒ Object
- .tokenizer ⇒ Object
- .tokenset(sentence) ⇒ Object
- .unmatched_enclosers?(text) ⇒ Boolean
Class Method Details
.adjectives ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 19

def self.adjectives
  @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
.gingerice ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 45

def self.gingerice
  require 'gingerice'
  Gingerice::Parser.new # No caching for this one
end
.htmlentities ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 50

def self.htmlentities
  require 'htmlentities'
  @htmlentities ||= HTMLEntities.new
end
.normalize(text) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 61

def self.normalize(text)
  htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
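A small usage sketch based on the code above: curly quotes and the ellipsis are replaced first, then HTML entities are decoded.

Ebooks::NLP.normalize("“Hello” &amp; goodbye…")
# => "\"Hello\" & goodbye..."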
.nouns ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 15

def self.nouns
  @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
.punctuation?(token) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 116

def self.punctuation?(token)
  (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
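A token qualifies only when every one of its characters belongs to PUNCTUATION. For example:

Ebooks::NLP.punctuation?("?!")   # => true  (all characters are in PUNCTUATION)
Ebooks::NLP.punctuation?("foo.") # => false (contains letters)
Ebooks::NLP.punctuation?("--")   # => false ('-' is not in PUNCTUATION)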
.reconstruct(tokens) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 100

def self.reconstruct(tokens)
  # Put tokens back together into a nice looking sentence
  text = ""
  last_token = nil
  tokens.each do |token|
    text += ' ' if last_token && space_between?(last_token, token)
    text += token
    last_token = token
  end
  text
end
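Combined with space_between?, this produces natural spacing: no space before punctuation tokens, a single space everywhere else. A minimal sketch:

Ebooks::NLP.reconstruct(["Hello", ",", "world", "!"])
# => "Hello, world!"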
.sentences(text) ⇒ Object
Utility functions which wrap the above.

# File 'lib/twitter_ebooks/nlp.rb', line 57

def self.sentences(text)
  tactful.tokenize_text(text)
end
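Sentence splitting is delegated to the tactful_tokenizer model, which returns an array of sentence strings. A hedged sketch of typical output:

Ebooks::NLP.sentences("I made a bot. It tweets nonsense!")
# => ["I made a bot.", "It tweets nonsense!"]  (typical tactful_tokenizer output)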
.space_between?(token1, token2) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 86

def self.space_between?(token1, token2)
  p1 = self.punctuation?(token1)
  p2 = self.punctuation?(token2)

  if p1 && p2       # "foo?!"
    false
  elsif !p1 && p2   # "foo."
    false
  elsif p1 && !p2   # "foo. rah"
    true
  else              # "foo rah"
    true
  end
end
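The branch comments spell out the rule: insert a space except directly before a punctuation token. For instance:

Ebooks::NLP.space_between?("foo", "rah") # => true  ("foo rah")
Ebooks::NLP.space_between?("foo", ".")   # => false ("foo.")
Ebooks::NLP.space_between?(".", "rah")   # => true  (". rah")
Ebooks::NLP.space_between?("?", "!")     # => false ("?!")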
.stemmer ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 40

def self.stemmer
  require 'lingua/stemmer'
  @stemmer ||= Lingua::Stemmer.new
end
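A hedged usage sketch; the #stem call is the ruby-stemmer (lingua/stemmer) API, and the exact output depends on its Snowball rules:

# Assumes ruby-stemmer's Lingua::Stemmer#stem; shown output is the expected English stem.
Ebooks::NLP.stemmer.stem("running") # => "run"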
.stopwords ⇒ Object
We don't necessarily want to use all of this stuff all the time. Only load it when it is needed.
# File 'lib/twitter_ebooks/nlp.rb', line 11

def self.stopwords
  @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
.tactful ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 30

def self.tactful
  require 'tactful_tokenizer'
  @tactful ||= TactfulTokenizer::Model.new
end
.tagger ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 35

def self.tagger
  require 'engtagger'
  @tagger ||= EngTagger.new
end
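A hedged sketch assuming engtagger's usual add_tags/get_nouns interface; results depend on its bundled part-of-speech model:

tagged = Ebooks::NLP.tagger.add_tags("The quick brown fox jumps")
Ebooks::NLP.tagger.get_nouns(tagged) # => e.g. {"fox" => 1}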
.tokenize(sentence) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 65

def self.tokenize(sentence)
  # This is hacky, but an ad hoc approach seems to be
  # most reliable for now. Tokenization libraries have oddities
  # that are hard to correct.
  sentence.split(/\s/).map do |token|
    exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^:\w$/, /^http/]
    if exceptions.find { |r| r.match(token) }
      token
    else
      token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
    end
  end.flatten
end
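A minimal sketch: split on whitespace, peel leading/trailing PUNCTUATION off ordinary words, and pass the exception patterns (mentions, hashtags, URLs, emoticons) through untouched:

Ebooks::NLP.tokenize("So good. Thanks @friend!")
# => ["So", "good", ".", "Thanks", "@friend!"]
# "@friend!" matches the /^@/ exception, so its "!" is never split off.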
.tokenizer ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 23

def self.tokenizer
  # This tokenizer is used for dividing sentences into words
  # It's too slow for finding sentences in paragraphs, hence tactful
  require 'tokenizer'
  @tokenizer ||= Tokenizer::Tokenizer.new(:en)
end
.tokenset(sentence) ⇒ Object
# File 'lib/twitter_ebooks/nlp.rb', line 79

def self.tokenset(sentence)
  tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
  tokens.map(&:downcase)
        .reject { |token| stopwords.include?(token) }
        .to_set
end
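A hedged sketch; whether a word is dropped depends on the bundled stopwords.txt, so treating "the" and "on" as stopwords here is an assumption:

Ebooks::NLP.tokenset("The cat sat on the mat")
# => #<Set: {"cat", "sat", "mat"}>  (assuming "the" and "on" appear in stopwords.txt)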
.unmatched_enclosers?(text) ⇒ Boolean
# File 'lib/twitter_ebooks/nlp.rb', line 120

def self.unmatched_enclosers?(text)
  # Weird quotes are an instant giveaway. Let's do paren-matching.
  enclosers = ['**', '""', '()', '[]', '``']
  enclosers.each do |pair|
    starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
    ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

    opened = 0

    tokenize(text).each do |token|
      opened += 1 if token.match(starter)
      opened -= 1 if token.match(ender)

      return true if opened < 0 # Too many ends!
    end

    return true if opened != 0 # Mismatch somewhere.
  end

  false
end
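A quick sketch of the balance check over the enclosers pairs:

Ebooks::NLP.unmatched_enclosers?("this (is fine)") # => false
Ebooks::NLP.unmatched_enclosers?("this (is not")   # => true  (opener never closed)
Ebooks::NLP.unmatched_enclosers?('he said "hi')    # => true  (unmatched quote)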