Class: Ebooks::Model

Inherits: Object
Defined in:
lib/foxdear_ebooks/model.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize ⇒ Model

Returns a new instance of Model.



# File 'lib/foxdear_ebooks/model.rb', line 101

def initialize
  @tokens = []
  @banned_words_file ||= 'banned_words.txt'
  @banned_words ||= File.exist?(@banned_words_file) ? File.read(@banned_words_file).split : []
  # Reverse lookup tiki by token, for faster generation
  @tikis = {}
end

Instance Attribute Details

#keywords ⇒ Array<String>

The top 200 most important keywords, in descending order

Returns:

  • (Array<String>)


# File 'lib/foxdear_ebooks/model.rb', line 27

def keywords
  @keywords
end

#mentions ⇒ Array<Array<Integer>>

Sentences derived from Twitter mentions

Returns:

  • (Array<Array<Integer>>)


# File 'lib/foxdear_ebooks/model.rb', line 23

def mentions
  @mentions
end

#sentences ⇒ Array<Array<Integer>>

Sentences represented by arrays of tikis

Returns:

  • (Array<Array<Integer>>)


# File 'lib/foxdear_ebooks/model.rb', line 19

def sentences
  @sentences
end

#tokens ⇒ Array<String>

An array of unique tokens. This is the main source of actual strings in the model. A token is manipulated via its index in this array, which we call a “tiki”

Returns:

  • (Array<String>)


# File 'lib/foxdear_ebooks/model.rb', line 15

def tokens
  @tokens
end
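
To make the “tiki” concept concrete, a minimal illustration with invented values (not real model contents):

# Hypothetical values for illustration only
tokens   = ["the", "cat", "sat"]          # unique strings, as in Model#tokens
sentence = [0, 1, 2]                      # the same sentence encoded as tikis
sentence.map { |tiki| tokens[tiki] }      # => ["the", "cat", "sat"]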

Class Method Details

.consume(path) ⇒ Ebooks::Model

Generate a new model from a corpus file

Parameters:

  • path (String)

Returns:

  • (Ebooks::Model)

# File 'lib/foxdear_ebooks/model.rb', line 32

def self.consume(path)
  Model.new.consume(path)
end
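
A minimal usage sketch (the corpus path is hypothetical):

model = Ebooks::Model.consume('corpus/tweets.json')
puts model.make_statement(140)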

.consume_all(paths) ⇒ Ebooks::Model

Generate a new model from multiple corpus files

Parameters:

  • paths (Array<String>)

Returns:

  • (Ebooks::Model)

# File 'lib/foxdear_ebooks/model.rb', line 39

def self.consume_all(paths)
  Model.new.consume_all(paths)
end
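
As above, but merging several corpora into one model (paths hypothetical):

model = Ebooks::Model.consume_all(['corpus/2019.json', 'corpus/2020.csv'])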

.load(path) ⇒ Ebooks::Model

Load a saved model

Parameters:

  • path (String)

Returns:

  • (Ebooks::Model)

# File 'lib/foxdear_ebooks/model.rb', line 46

def self.load(path)
  model = Model.new
  model.instance_eval do
    props = Marshal.load(File.open(path, 'rb') { |f| f.read })
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]
  end
  model
end
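
A typical save/load round trip (paths hypothetical):

Ebooks::Model.consume('corpus/tweets.json').save('model/tweets.model')
model = Ebooks::Model.load('model/tweets.model')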

Instance Method Details

#append(path) ⇒ Object

Append a generated model to an existing model file instead of overwriting it

Parameters:

  • path (String)


# File 'lib/foxdear_ebooks/model.rb', line 74

def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model
  props = Marshal.load(File.open(path, 'rb') { |old| old.read })
  old_tokens = props[:tokens]
  old_sentences = props[:sentences]
  old_mentions = props[:mentions]
  old_keywords = props[:keywords]

  # Old tikis index into old_tokens; offset them so they still point at the
  # right strings once old_tokens is appended after this model's tokens
  offset = @tokens.length
  shift = lambda { |sents| sents.map { |sent| sent.map { |tiki| tiki + offset } } }

  # Append the existing properties to the new ones and overwrite the file
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens.concat(old_tokens),
      sentences: @sentences.concat(shift.call(old_sentences)),
      mentions: @mentions.concat(shift.call(old_mentions)),
      keywords: @keywords.concat(old_keywords)
    }))
  end
  self
end
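
A sketch of topping up a previously saved model file with freshly consumed tweets (paths hypothetical):

Ebooks::Model.consume('corpus/new_tweets.json').append('model/tweets.model')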

#consume(path) ⇒ Object

Consume a corpus into this model

Parameters:

  • path (String)


# File 'lib/foxdear_ebooks/model.rb', line 154

def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  if path.split('.')[-1] == "json"
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content).map do |tweet|
      tweet['text'] || tweet['full_text']
    end
  elsif path.split('.')[-1] == "csv"
    log "Reading CSV corpus from #{path}"
    content = CSV.parse(content)
    header = content.shift
    text_col = header.index('text')
    lines = content.map do |tweet|
      tweet[text_col]
    end
  else
    log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
    lines = content.split("\n")
  end

  consume_lines(lines)
end

#consume_all(paths) ⇒ Object

Consume multiple corpora into this model

Parameters:

  • paths (Array<String>)


# File 'lib/foxdear_ebooks/model.rb', line 215

def consume_all(paths)
  lines = []
  paths.each do |path|
    content = File.read(path, :encoding => 'utf-8')

    if path.split('.')[-1] == "json"
      log "Reading json corpus from #{path}"
      l = JSON.parse(content).map do |tweet|
        tweet['text'] || tweet['full_text']
      end
      lines.concat(l)
    elsif path.split('.')[-1] == "csv"
      log "Reading CSV corpus from #{path}"
      content = CSV.parse(content)
      header = content.shift
      text_col = header.index('text')
      l = content.map do |tweet|
        tweet[text_col]
      end
      lines.concat(l)
    else
      log "Reading plaintext corpus from #{path}"
      l = content.split("\n")
      lines.concat(l)
    end
  end
  consume_lines(lines)
end

#consume_lines(lines) ⇒ Object

Consume a sequence of lines

Parameters:

  • lines (Array<String>)


# File 'lib/foxdear_ebooks/model.rb', line 180

def consume_lines(lines)
  log "Removing commented lines and sorting mentions"

  statements = []
  mentions = []
  lines.each do |l|
    next if l.start_with?('#') # Remove commented lines
    next if l.include?('RT') || l.include?('MT') # Remove soft retweets

    if l.include?('@')
      mentions << NLP.normalize(l)
    else
      statements << NLP.normalize(l)
    end
  end

  text = statements.join("\n").encode('UTF-8', :invalid => :replace)
  mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

  lines = nil; statements = nil; mentions = nil # Allow garbage collection

  log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)
  log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

  self
end
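
For illustration, a sketch of how individual lines are routed (line contents invented):

model = Ebooks::Model.new
model.consume_lines([
  "# a commented line",     # skipped entirely
  "RT @someone: hello",     # skipped as a soft retweet
  "@friend how are you?",   # contains '@', so sorted into mentions
  "an ordinary tweet"       # everything else becomes a statement
])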

#find_relevant(sentences, input) ⇒ Array<Array<Array<Integer>>, Array<Array<Integer>>>

Finds tokenized sentences that are relevant (sharing a non-stopword token) or slightly relevant (sharing any token) to the input

Parameters:

  • sentences (Array<Array<Integer>>)
  • input (String)

Returns:

  • (Array<Array<Array<Integer>>, Array<Array<Integer>>>)


# File 'lib/foxdear_ebooks/model.rb', line 320

def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)

  sentences.each do |sent|
    tokenized.each do |token|
      if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
        relevant << sent unless NLP.stopword?(token)
        slightly_relevant << sent
      end
    end
  end

  [relevant, slightly_relevant]
end
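
A usage sketch against a consumed model (input invented):

relevant, slightly_relevant = model.find_relevant(model.mentions, "I like cats")
# relevant:          sentences sharing a non-stopword token with the input
# slightly_relevant: sentences sharing any input token, stopwords included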

#fix(text) ⇒ String

Correct encoding issues in generated text

Parameters:

  • text (String)

Returns:

  • (String)


# File 'lib/foxdear_ebooks/model.rb', line 247

def fix(text)
  NLP.htmlentities.decode text
end

#make_response(input, limit = 280, sentences = @mentions) ⇒ String

Generates a response by looking for related sentences in the corpus and building a smaller generator from these

Parameters:

  • input (String)
  • limit (Integer) (defaults to: 280)

    characters available for response

  • sentences (Array<Array<Integer>>) (defaults to: @mentions)

Returns:

  • (String)


# File 'lib/foxdear_ebooks/model.rb', line 344

def make_response(input, limit=280, sentences=@mentions)
  # Prefer mentions
  relevant, slightly_relevant = find_relevant(sentences, input)

  if relevant.length >= 3
    generator = SuffixGenerator.build(relevant)
    make_statement(limit, generator)
  elsif slightly_relevant.length >= 5
    generator = SuffixGenerator.build(slightly_relevant)
    make_statement(limit, generator)
  elsif sentences.equal?(@mentions)
    make_response(input, limit, @sentences)
  else
    make_statement(limit)
  end
end
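
A usage sketch (input invented):

reply = model.make_response("@bot what do you think of cats?", 140)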

#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ String

Generate some text

Parameters:

  • limit (Integer) (defaults to: 280)

    available characters

  • generator (SuffixGenerator, nil) (defaults to: nil)
  • retry_limit (Integer) (defaults to: 10)

how many times to retry on an invalid tweet

Returns:

  • (String)


# File 'lib/foxdear_ebooks/model.rb', line 268

def make_statement(limit=280, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  retries = 0
  tweet = ""

  while (tikis = generator.generate(3, :bigrams)) do
    #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
    break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

    retries += 1
    break if retries >= retry_limit
  end

  if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
    #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
    while (tikis = generator.generate(3, :unigrams)) do
      break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

      retries += 1
      break if retries >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tikis, @tokens)

  if retries >= retry_limit
    log "Unable to produce valid non-verbatim tweet; result was \"#{tweet}\""
    if valid_tweet?(tikis, limit)
      log "Tweet contains no banned words; sending anyways"
    else
      log "Tweet contains banned words or is invalid; replacing with dummy message"
      tweet = "Sorry, try again."
    end
  end

  fix tweet
end
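
A minimal usage sketch:

tweet = model.make_statement(140)   # a generated statement of at most 140 characters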

#mass_tikify(text) ⇒ Array<Array<Integer>>

Convert a body of text into arrays of tikis

Parameters:

  • text (String)

Returns:

  • (Array<Array<Integer>>)


# File 'lib/foxdear_ebooks/model.rb', line 139

def mass_tikify(text)
  sentences = NLP.sentences(text)

  sentences.map do |s|
    tokens = NLP.tokenize(s).reject do |t|
      # Don't include usernames/urls as tokens
      t.include?('@') || t.include?('http')
    end

    tokens.map { |t| tikify(t) }
  end
end
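
For illustration, the shape of the result (tiki values invented):

sentences = model.mass_tikify("The cat sat. The cat slept.")
# => e.g. [[0, 1, 2], [0, 1, 3]] -- one array of tikis per sentence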

#save(path) ⇒ Object

Save model to a file

Parameters:

  • path (String)


# File 'lib/foxdear_ebooks/model.rb', line 60

def save(path)
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences,
      mentions: @mentions,
      keywords: @keywords
    }))
  end
  self
end

#set_banned_words(path = 'banned_words.txt') ⇒ Object

Set the banned words list for the model

Parameters:

  • path (String) (defaults to: 'banned_words.txt')


# File 'lib/foxdear_ebooks/model.rb', line 124

def set_banned_words(path = 'banned_words.txt')
  return if @banned_words_file == path
  @banned_words_file = path
  if File.exist?(@banned_words_file)
    @banned_words = File.read(@banned_words_file).split
    log "Successfully loaded banned words list #{path}"
  else
    log "Error: Banned words list #{path} does not exist"
  end
end
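
A usage sketch (the path is hypothetical); the loaded list is later consulted by #valid_tweet?:

model.set_banned_words('config/banned_words.txt')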

#tikify(token) ⇒ Integer

Reverse-lookup a token's index (tiki), adding the token to the model if it is new

Parameters:

  • token (String)

Returns:

  • (Integer)


# File 'lib/foxdear_ebooks/model.rb', line 112

def tikify(token)
  if @tikis.has_key?(token)
    @tikis[token]
  else
    # Report progress every thousandth token
    puts "#{@tokens.length+1} tokens" if (@tokens.length+1) % 1000 == 0
    @tokens << token
    @tikis[token] = @tokens.length-1
  end
end
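
A minimal illustration on a fresh model:

model = Ebooks::Model.new
model.tikify("cat")   # => 0 (new token appended)
model.tikify("dog")   # => 1
model.tikify("cat")   # => 0 (existing tiki reused)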

#valid_tweet?(tikis, limit) ⇒ Boolean

Check if an array of tikis comprises a valid tweet

Parameters:

  • tikis (Array<Integer>)
  • limit (Integer)

    how many characters we have left

Returns:

  • (Boolean)


# File 'lib/foxdear_ebooks/model.rb', line 254

def valid_tweet?(tikis, limit)
  tweet = NLP.reconstruct(tikis, @tokens)
  found_banned = @banned_words.any? do |word|
    re = Regexp.new("\\b#{word}\\b", "i")
    re.match tweet
  end
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) && !found_banned
end
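
A usage sketch (input invented; assumes NLP here is the gem's Ebooks::NLP module):

tikis = Ebooks::NLP.tokenize("hello world").map { |t| model.tikify(t) }
model.valid_tweet?(tikis, 140)
# => true when the reconstructed text fits the limit, has matched
#    enclosers, and contains no banned words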

#verbatim?(tikis) ⇒ Boolean

Test if a sentence has been copied verbatim from the original corpus

Parameters:

  • tikis (Array<Integer>)

Returns:

  • (Boolean)


# File 'lib/foxdear_ebooks/model.rb', line 311

def verbatim?(tikis)
  @sentences.include?(tikis) || @mentions.include?(tikis)
end