Class: Ebooks::Model

Inherits:
Object
  • Object
show all
Defined in:
lib/twitter_ebooks/model.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Model

Returns a new instance of Model.



101
102
103
104
105
106
# File 'lib/twitter_ebooks/model.rb', line 101

# Set up an empty model: no tokens yet, and an empty reverse-lookup table.
def initialize
  @tokens = []

  # Maps token string -> its index in @tokens (its "tiki"),
  # so generation can look up indices without scanning the array
  @tikis = {}
end

Instance Attribute Details

#keywords ⇒ Array<String>

The top 200 most important keywords, in descending order

Returns:

  • (Array<String>)


27
28
29
# File 'lib/twitter_ebooks/model.rb', line 27

# The top 200 most important keywords, in descending order.
# @return [Array<String>]
def keywords
  @keywords
end

#mentions ⇒ Array<Array<Integer>>

Sentences derived from Twitter mentions

Returns:

  • (Array<Array<Integer>>)


23
24
25
# File 'lib/twitter_ebooks/model.rb', line 23

# Sentences derived from Twitter mentions, as arrays of tikis.
# @return [Array<Array<Integer>>]
def mentions
  @mentions
end

#sentences ⇒ Array<Array<Integer>>

Sentences represented by arrays of tikis

Returns:

  • (Array<Array<Integer>>)


19
20
21
# File 'lib/twitter_ebooks/model.rb', line 19

# Sentences represented by arrays of tikis.
# @return [Array<Array<Integer>>]
def sentences
  @sentences
end

#tokens ⇒ Array<String>

An array of unique tokens. This is the main source of actual strings in the model. Manipulation of a token is done using its index in this array, which we call a “tiki”

Returns:

  • (Array<String>)


15
16
17
# File 'lib/twitter_ebooks/model.rb', line 15

# An array of unique tokens — the main source of actual strings in the
# model. A token is referenced by its index in this array (its "tiki").
# @return [Array<String>]
def tokens
  @tokens
end

Class Method Details

.consume(path) ⇒ Ebooks::Model

Generate a new model from a corpus file

Parameters:

  • path (String)

Returns:



32
33
34
# File 'lib/twitter_ebooks/model.rb', line 32

# Generate a new model from a corpus file.
# @param path [String]
# @return [Ebooks::Model]
def self.consume(path)
  model = Model.new
  model.consume(path)
end

.consume_all(paths) ⇒ Ebooks::Model

Generate a new model from multiple corpus files

Parameters:

  • paths (Array<String>)

Returns:



39
40
41
# File 'lib/twitter_ebooks/model.rb', line 39

# Generate a new model from multiple corpus files.
# @param paths [Array<String>]
# @return [Ebooks::Model]
def self.consume_all(paths)
  model = Model.new
  model.consume_all(paths)
end

.load(path) ⇒ Ebooks::Model

Load a saved model

Parameters:

  • path (String)

Returns:



46
47
48
49
50
51
52
53
54
55
56
# File 'lib/twitter_ebooks/model.rb', line 46

# Load a saved model from disk.
# @param path [String] path to a file previously written by #save
# @return [Ebooks::Model]
def self.load(path)
  model = Model.new
  model.instance_eval do
    # NOTE(review): Marshal.load executes arbitrary object construction and
    # must never be fed untrusted data — only load model files you created.
    props = Marshal.load(File.binread(path))
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]
  end
  model
end

Instance Method Details

#append(path) ⇒ Object

Append a generated model to existing model file instead of overwriting it

Parameters:

  • path (String)


74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/twitter_ebooks/model.rb', line 74

# Append this model's contents onto an existing model file, overwriting
# the file with the merged result. Mutates this model's arrays in place
# (the old file's data is concatenated onto them).
# @param path [String]
# @return [Ebooks::Model, nil] self on success; nil if no file exists at path
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model
  props = Marshal.load(File.binread(path))

  # Concatenate the old model's properties onto the new ones,
  # then overwrite the file with the combined model
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens.concat(props[:tokens]),
      sentences: @sentences.concat(props[:sentences]),
      mentions: @mentions.concat(props[:mentions]),
      keywords: @keywords.concat(props[:keywords])
    }))
  end
  self
end

#consume(path) ⇒ Object

Consume a corpus into this model

Parameters:

  • path (String)


139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/twitter_ebooks/model.rb', line 139

# Consume a corpus file into this model. The format is chosen by the
# file extension: json (tweet archive), csv (with a 'text' column),
# or plaintext (one statement per line).
# @param path [String]
def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  case path.split('.')[-1]
  when "json"
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content).map do |tweet|
      tweet['text'] || tweet['full_text']
    end
  when "csv"
    log "Reading CSV corpus from #{path}"
    rows = CSV.parse(content)
    header = rows.shift
    text_col = header.index('text')
    lines = rows.map { |row| row[text_col] }
  else
    log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
    lines = content.split("\n")
  end

  consume_lines(lines)
end

#consume_all(paths) ⇒ Object

Consume multiple corpuses into this model

Parameters:

  • paths (Array<String>)


200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/twitter_ebooks/model.rb', line 200

def consume_all(paths)
  lines = []
  paths.each do |path|
    content = File.read(path, :encoding => 'utf-8')

    if path.split('.')[-1] == "json"
      log "Reading json corpus from #{path}"
      l = JSON.parse(content).map do |tweet|
        tweet['text'] || tweet['full_text']
      end
      lines.concat(l)
    elsif path.split('.')[-1] == "csv"
      log "Reading CSV corpus from #{path}"
      content = CSV.parse(content)
      header = content.shift
      text_col = header.index('text')
      l = content.map do |tweet|
        tweet[text_col]
      end
      lines.concat(l)
    else
      log "Reading plaintext corpus from #{path}"
      l = content.split("\n")
      lines.concat(l)
    end
  end
  consume_lines(lines)
end

#consume_lines(lines) ⇒ Object

Consume a sequence of lines

Parameters:

  • lines (Array<String>)


165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/twitter_ebooks/model.rb', line 165

# Consume a sequence of raw corpus lines: filter out comments and soft
# retweets, split mentions from plain statements, then tokenize both
# and rank keywords.
# @param lines [Array<String>]
def consume_lines(lines)
  log "Removing commented lines and sorting mentions"

  # Drop commented lines ('#'-prefixed) and soft retweets (containing RT/MT),
  # then split the remainder into mentions (containing '@') vs. statements.
  kept = lines.reject do |line|
    line.start_with?('#') || line.include?('RT') || line.include?('MT')
  end
  mention_lines, statement_lines = kept.partition { |line| line.include?('@') }

  text = statement_lines.map { |line| NLP.normalize(line) }
                        .join("\n").encode('UTF-8', :invalid => :replace)
  mention_text = mention_lines.map { |line| NLP.normalize(line) }
                              .join("\n").encode('UTF-8', :invalid => :replace)

  lines = kept = statement_lines = mention_lines = nil # Allow garbage collection

  log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)
  log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

  self
end

#find_relevant(sentences, input) ⇒ Array<Array<Array<Integer>>, Array<Array<Integer>>>

Finds relevant and slightly relevant tokenized sentences to input comparing non-stopword token overlaps

Parameters:

  • sentences (Array<Array<Integer>>)
  • input (String)

Returns:

  • (Array<Array<Array<Integer>>, Array<Array<Integer>>>)


295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/twitter_ebooks/model.rb', line 295

# Find relevant and slightly-relevant tokenized sentences by comparing
# token overlap with the input. A sentence sharing a non-stopword token
# with the input is "relevant"; sharing any token makes it "slightly
# relevant". Sentences may appear multiple times if several input
# tokens match (preserved behavior — callers feed these to a generator).
# @param sentences [Array<Array<Integer>>]
# @param input [String]
# @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)

  sentences.each do |sent|
    # Hoisted out of the token loop: downcasing the sentence's tokens is
    # invariant per sentence, so compute it once instead of once per input token
    sent_tokens = sent.map { |tiki| @tokens[tiki].downcase }

    tokenized.each do |token|
      if sent_tokens.include?(token)
        relevant << sent unless NLP.stopword?(token)
        slightly_relevant << sent
      end
    end
  end

  [relevant, slightly_relevant]
end

#fix(text) ⇒ String

Correct encoding issues in generated text

Parameters:

  • text (String)

Returns:

  • (String)


232
233
234
# File 'lib/twitter_ebooks/model.rb', line 232

# Correct encoding issues in generated text by decoding HTML entities.
# @param text [String]
# @return [String]
def fix(text)
  NLP.htmlentities.decode(text)
end

#make_response(input, limit = 280, sentences = @mentions) ⇒ String

Generates a response by looking for related sentences in the corpus and building a smaller generator from these

Parameters:

  • input (String)
  • limit (Integer) (defaults to: 280)

    characters available for response

  • sentences (Array<Array<Integer>>) (defaults to: @mentions)

Returns:

  • (String)


319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# File 'lib/twitter_ebooks/model.rb', line 319

# Generate a response by finding sentences related to the input and
# building a smaller generator from them. Tries mentions first; falls
# back to the full sentence corpus, then to an unrelated statement.
# @param input [String]
# @param limit [Integer] characters available for the response
# @param sentences [Array<Array<Integer>>] corpus to search (defaults to mentions)
# @return [String]
def make_response(input, limit=280, sentences=@mentions)
  # Prefer mentions
  relevant, slightly_relevant = find_relevant(sentences, input)

  if relevant.size >= 3
    make_statement(limit, SuffixGenerator.build(relevant))
  elsif slightly_relevant.size >= 5
    make_statement(limit, SuffixGenerator.build(slightly_relevant))
  elsif sentences.equal?(@mentions)
    # Nothing related among mentions — retry against the full corpus
    make_response(input, limit, @sentences)
  else
    make_statement(limit)
  end
end

#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ String

Generate some text

Parameters:

  • limit (Integer) (defaults to: 280)

    available characters

  • generator (SuffixGenerator, nil) (defaults to: nil)
  • retry_limit (Integer) (defaults to: 10)

    how many times to retry on invalid tweet

Returns:

  • (String)


249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
# File 'lib/twitter_ebooks/model.rb', line 249

# Generate some text from the model.
# @param limit [Integer] available characters
# @param generator [SuffixGenerator, nil] prebuilt generator (used when responding)
# @param retry_limit [Integer] how many times to retry on an invalid tweet
# @return [String]
def make_statement(limit=280, generator=nil, retry_limit=10)
  # A caller-supplied generator means we're responding, which relaxes
  # the minimum-length requirement below
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  retries = 0

  # Generate bigram-based candidates until one is valid (and long enough,
  # unless responding) or we exhaust our retries. Removed a dead
  # `tweet = ""` initialization — tweet is always assigned below.
  while (tikis = generator.generate(3, :bigrams)) do
    break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

    retries += 1
    break if retries >= retry_limit
  end

  if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
    # Retry with unigrams, which recombine more freely, to escape verbatim output
    while (tikis = generator.generate(3, :unigrams)) do
      break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

      retries += 1
      break if retries >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tikis, @tokens)

  if retries >= retry_limit
    log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
  end

  fix tweet
end

#mass_tikify(text) ⇒ Array<Array<Integer>>

Convert a body of text into arrays of tikis

Parameters:

  • text (String)

Returns:

  • (Array<Array<Integer>>)


124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/twitter_ebooks/model.rb', line 124

# Convert a body of text into arrays of tikis, one array per sentence.
# @param text [String]
# @return [Array<Array<Integer>>]
def mass_tikify(text)
  NLP.sentences(text).map do |sentence|
    # Usernames and URLs are never useful as tokens, so drop them
    usable = NLP.tokenize(sentence).reject do |tok|
      tok.include?('@') || tok.include?('http')
    end

    usable.map { |tok| tikify(tok) }
  end
end

#save(path) ⇒ Object

Save model to a file

Parameters:

  • path (String)


60
61
62
63
64
65
66
67
68
69
70
# File 'lib/twitter_ebooks/model.rb', line 60

# Serialize this model to a file (Marshal format, read back by .load).
# @param path [String]
# @return [Ebooks::Model] self
def save(path)
  payload = Marshal.dump({
    tokens: @tokens,
    sentences: @sentences,
    mentions: @mentions,
    keywords: @keywords
  })
  File.open(path, 'wb') { |f| f.write(payload) }
  self
end

#tikify(token) ⇒ Integer

Reverse lookup a token index from a token

Parameters:

  • token (String)

Returns:

  • (Integer)


111
112
113
114
115
116
117
118
119
# File 'lib/twitter_ebooks/model.rb', line 111

# Look up (or assign) the tiki — index into @tokens — for a token.
# New tokens are appended to @tokens and recorded in the @tikis
# reverse-lookup hash.
# @param token [String]
# @return [Integer] the token's index in @tokens
def tikify(token)
  # Fast path: token already interned
  return @tikis[token] if @tikis.has_key?(token)

  # Progress output every 1000 unique tokens (idiomatic modifier-if
  # replaces the old `expr and puts` control-flow trick)
  puts "#{@tokens.length + 1} tokens" if (@tokens.length + 1) % 1000 == 0
  @tokens << token
  @tikis[token] = @tokens.length - 1
end

#valid_tweet?(tikis, limit) ⇒ Boolean

Check if an array of tikis comprises a valid tweet

Parameters:

  • tikis (Array<Integer>)
  • limit (Integer)

    how many chars we have left

Returns:

  • (Boolean)


239
240
241
242
# File 'lib/twitter_ebooks/model.rb', line 239

# Check whether an array of tikis reconstructs into a valid tweet:
# within the character limit and with no unmatched enclosing characters.
# @param tikis [Array<Integer>]
# @param limit [Integer] how many chars we have left
# @return [Boolean]
def valid_tweet?(tikis, limit)
  candidate = NLP.reconstruct(tikis, @tokens)
  candidate.length <= limit && !NLP.unmatched_enclosers?(candidate)
end

#verbatim?(tikis) ⇒ Boolean

Test if a sentence has been copied verbatim from original

Parameters:

  • tikis (Array<Integer>)

Returns:

  • (Boolean)


286
287
288
# File 'lib/twitter_ebooks/model.rb', line 286

# Test whether a tiki sequence copies an original sentence or mention
# verbatim from the corpus.
# @param tikis [Array<Integer>]
# @return [Boolean]
def verbatim?(tikis)
  [@sentences, @mentions].any? { |corpus| corpus.include?(tikis) }
end