Class: Ebooks::Model

Inherits: Object

Defined in: lib/twitter_ebooks/model.rb


Constructor Details

#initialize ⇒ Model

Returns a new instance of Model.



# File 'lib/twitter_ebooks/model.rb', line 72

def initialize
  @tokens = []

  # Reverse lookup tiki by token, for faster generation
  @tikis = {}
end

Instance Attribute Details

#keywords ⇒ Array<String>

The top 200 most important keywords, in descending order of importance

Returns:

  • (Array<String>)


# File 'lib/twitter_ebooks/model.rb', line 27

def keywords
  @keywords
end

#mentions ⇒ Array<Array<Integer>>

Sentences derived from Twitter mentions

Returns:

  • (Array<Array<Integer>>)


# File 'lib/twitter_ebooks/model.rb', line 23

def mentions
  @mentions
end

#sentences ⇒ Array<Array<Integer>>

Sentences represented by arrays of tikis

Returns:

  • (Array<Array<Integer>>)


# File 'lib/twitter_ebooks/model.rb', line 19

def sentences
  @sentences
end

#tokens ⇒ Array<String>

An array of unique tokens. This is the main source of actual strings in the model. A token is manipulated by its index in this array, which we call a “tiki”.

Returns:

  • (Array<String>)


# File 'lib/twitter_ebooks/model.rb', line 15

def tokens
  @tokens
end
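
An illustrative sketch of the token/tiki relationship (the corpus path and all values are hypothetical):

model = Ebooks::Model.consume('corpus.txt')
model.sentences[0]  # => e.g. [0, 1, 2]
model.sentences[0].map { |tiki| model.tokens[tiki] }  # => e.g. ["Hello", "there", "!"]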

Class Method Details

.consume(path) ⇒ Ebooks::Model

Generate a new model from a corpus file

Parameters:

  • path (String)

Returns:

  • (Ebooks::Model)

# File 'lib/twitter_ebooks/model.rb', line 32

def self.consume(path)
  Model.new.consume(path)
end
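
A minimal usage sketch (the path is hypothetical; the corpus format is detected from the file extension, as shown in #consume below):

model = Ebooks::Model.consume('corpus/tweets.json')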

.consume_all(paths) ⇒ Ebooks::Model

Generate a new model from multiple corpus files

Parameters:

  • paths (Array<String>)

Returns:

  • (Ebooks::Model)

# File 'lib/twitter_ebooks/model.rb', line 39

def self.consume_all(paths)
  Model.new.consume_all(paths)
end
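
For example, to build one model from several corpora at once (paths hypothetical):

model = Ebooks::Model.consume_all(['corpus/tweets.json', 'corpus/blog.txt'])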

.load(path) ⇒ Ebooks::Model

Load a saved model

Parameters:

  • path (String)

Returns:

  • (Ebooks::Model)

# File 'lib/twitter_ebooks/model.rb', line 46

def self.load(path)
  model = Model.new
  model.instance_eval do
    props = Marshal.load(File.open(path, 'rb') { |f| f.read })
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]
  end
  model
end
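
Usage sketch, assuming a file previously written by #save (path hypothetical):

model = Ebooks::Model.load('model/username.model')
model.keywords.first(5)  # top keywords ranked when the corpus was consumed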

Instance Method Details

#consume(path) ⇒ Object

Consume a corpus into this model

Parameters:

  • path (String)


# File 'lib/twitter_ebooks/model.rb', line 104

def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  if path.split('.')[-1] == "json"
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content).map do |tweet|
      tweet['text']
    end
  elsif path.split('.')[-1] == "csv"
    log "Reading CSV corpus from #{path}"
    content = CSV.parse(content)
    header = content.shift
    text_col = header.index('text')
    lines = content.map do |tweet|
      tweet[text_col]
    end
  else
    log "Reading plaintext corpus from #{path}"
    lines = content.split("\n")
  end

  consume_lines(lines)
end
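
A sketch of consuming a CSV export (hypothetical path; per the code above, the CSV needs a header row with a 'text' column):

model = Ebooks::Model.new
model.consume('corpus/archive.csv')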

#consume_all(paths) ⇒ Object

Consume multiple corpora into this model

Parameters:

  • paths (Array<String>)


# File 'lib/twitter_ebooks/model.rb', line 164

def consume_all(paths)
  lines = []
  paths.each do |path|
    content = File.read(path, :encoding => 'utf-8')

    if path.split('.')[-1] == "json"
      log "Reading json corpus from #{path}"
      l = JSON.parse(content).map do |tweet|
        tweet['text']
      end
      lines.concat(l)
    elsif path.split('.')[-1] == "csv"
      log "Reading CSV corpus from #{path}"
      content = CSV.parse(content)
      header = content.shift
      text_col = header.index('text')
      l = content.map do |tweet|
        tweet[text_col]
      end
      lines.concat(l)
    else
      log "Reading plaintext corpus from #{path}"
      l = content.split("\n")
      lines.concat(l)
    end
  end
  consume_lines(lines)
end

#consume_lines(lines) ⇒ Object

Consume a sequence of lines

Parameters:

  • lines (Array<String>)


# File 'lib/twitter_ebooks/model.rb', line 130

def consume_lines(lines)
  log "Removing commented lines and sorting mentions"

  statements = []
  mentions = []
  lines.each do |l|
    next if l.start_with?('#') # Remove commented lines
    next if l.include?('RT') || l.include?('MT') # Remove soft retweets

    if l.include?('@')
      mentions << NLP.normalize(l)
    else
      statements << NLP.normalize(l)
    end
  end

  text = statements.join("\n")
  mention_text = mentions.join("\n")

  lines = nil; statements = nil; mentions = nil # Allow garbage collection

  log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)

  self
end
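
A sketch of how individual lines are routed by the filtering rules above (inputs are illustrative):

model = Ebooks::Model.new
model.consume_lines([
  '# commented out',            # skipped: starts with '#'
  'RT @someone: hello world',   # skipped: contains 'RT' (soft retweet)
  '@friend thanks for the tip', # normalized into the mention corpus
  'An ordinary statement.'      # normalized into the sentence corpus
])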

#find_relevant(sentences, input) ⇒ Array<Array<Array<Integer>>, Array<Array<Integer>>>

Finds tokenized sentences that are relevant or slightly relevant to the input by comparing token overlaps; only non-stopword overlaps count as fully relevant

Parameters:

  • sentences (Array<Array<Integer>>)
  • input (String)

Returns:

  • (Array<Array<Array<Integer>>, Array<Array<Integer>>>)


258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/twitter_ebooks/model.rb', line 258

def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)

  sentences.each do |sent|
    tokenized.each do |token|
      if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
        relevant << sent unless NLP.stopword?(token)
        slightly_relevant << sent
      end
    end
  end

  [relevant, slightly_relevant]
end
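
An illustrative call (only the output shape is certain; actual matches depend on the corpus):

relevant, slightly_relevant = model.find_relevant(model.mentions, "ruby gems")
# relevant:          sentences sharing a non-stopword token ("ruby", "gems")
# slightly_relevant: a superset that also counts stopword overlaps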

#fix(text) ⇒ String

Correct encoding issues in generated text

Parameters:

  • text (String)

Returns:

  • (String)


# File 'lib/twitter_ebooks/model.rb', line 196

def fix(text)
  NLP.htmlentities.decode text
end
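
For example, decoding an HTML entity left over from the Twitter API:

model.fix("fish &amp; chips")  # => "fish & chips"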

#make_response(input, limit = 140, sentences = @mentions) ⇒ String

Generates a response by looking for related sentences in the corpus and building a smaller generator from these

Parameters:

  • input (String)
  • limit (Integer) (defaults to: 140)

    characters available for response

  • sentences (Array<Array<Integer>>) (defaults to: @mentions)

Returns:

  • (String)


# File 'lib/twitter_ebooks/model.rb', line 282

def make_response(input, limit=140, sentences=@mentions)
  # Prefer mentions
  relevant, slightly_relevant = find_relevant(sentences, input)

  if relevant.length >= 3
    generator = SuffixGenerator.build(relevant)
    make_statement(limit, generator)
  elsif slightly_relevant.length >= 5
    generator = SuffixGenerator.build(slightly_relevant)
    make_statement(limit, generator)
  elsif sentences.equal?(@mentions)
    make_response(input, limit, @sentences)
  else
    make_statement(limit)
  end
end
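
Usage sketch; per the code above, if fewer than 3 relevant and 5 slightly relevant mention sentences are found, the method falls back to @sentences and finally to an unseeded #make_statement:

reply = model.make_response("what do you think of ruby?", 130)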

#make_statement(limit = 140, generator = nil, retry_limit = 10) ⇒ String

Generate some text

Parameters:

  • limit (Integer) (defaults to: 140)

    available characters

  • generator (SuffixGenerator, nil) (defaults to: nil)
  • retry_limit (Integer) (defaults to: 10)

    how many times to retry on invalid tweet

Returns:

  • (String)


# File 'lib/twitter_ebooks/model.rb', line 213

def make_statement(limit=140, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  retries = 0
  tweet = ""

  while (tikis = generator.generate(3, :bigrams)) do
    next if tikis.length <= 3 && !responding
    break if valid_tweet?(tikis, limit)

    retries += 1
    break if retries >= retry_limit
  end

  if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
    while (tikis = generator.generate(3, :unigrams)) do
      break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

      retries += 1
      break if retries >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tikis, @tokens)

  if retries >= retry_limit
    log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
  end

  fix tweet
end
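
A minimal usage sketch:

tweet = model.make_statement(140)  # build from the full sentence corpus
short = model.make_statement(80)   # e.g. leaving room for an @reply prefix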

#mass_tikify(text) ⇒ Array<Array<Integer>>

Convert a body of text into arrays of tikis

Parameters:

  • text (String)

Returns:

  • (Array<Array<Integer>>)


# File 'lib/twitter_ebooks/model.rb', line 89

def mass_tikify(text)
  sentences = NLP.sentences(text)

  sentences.map do |s|
    tokens = NLP.tokenize(s).reject do |t|
      # Don't include usernames/urls as tokens
      t.include?('@') || t.include?('http')
    end

    tokens.map { |t| tikify(t) }
  end
end
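
An illustrative run (the exact tokenization depends on NLP.sentences and NLP.tokenize, so the output shown is hypothetical):

model = Ebooks::Model.new
model.mass_tikify("Hello world. Goodbye world.")
# => e.g. [[0, 1, 2], [3, 1, 2]] -- "world" and "." reuse their tikis via #tikify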

#save(path) ⇒ Object

Save model to a file

Parameters:

  • path (String)


# File 'lib/twitter_ebooks/model.rb', line 60

def save(path)
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences,
      mentions: @mentions,
      keywords: @keywords
    }))
  end
  self
end
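
A round-trip sketch with .load (paths hypothetical):

model = Ebooks::Model.consume('corpus.txt')
model.save('corpus.model')
reloaded = Ebooks::Model.load('corpus.model')
reloaded.tokens == model.tokens  # => true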

#tikify(token) ⇒ Integer

Reverse-lookup the tiki for a token, adding the token to the model if it is not yet present

Parameters:

  • token (String)

Returns:

  • (Integer)


# File 'lib/twitter_ebooks/model.rb', line 82

def tikify(token)
  @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
end
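
For example, on a fresh model (these indices follow directly from the code above):

model = Ebooks::Model.new
model.tikify('hello')  # => 0 (new token appended to @tokens)
model.tikify('world')  # => 1
model.tikify('hello')  # => 0 (hit in the @tikis reverse lookup)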

#valid_tweet?(tikis, limit) ⇒ Boolean

Check if an array of tikis comprises a valid tweet

Parameters:

  • tikis (Array<Integer>)
  • limit (Integer)

    how many chars we have left

Returns:

  • (Boolean)


# File 'lib/twitter_ebooks/model.rb', line 203

def valid_tweet?(tikis, limit)
  tweet = NLP.reconstruct(tikis, @tokens)
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end

#verbatim?(tikis) ⇒ Boolean

Test whether a sentence has been copied verbatim from the original corpus

Parameters:

  • tikis (Array<Integer>)

Returns:

  • (Boolean)


# File 'lib/twitter_ebooks/model.rb', line 249

def verbatim?(tikis)
  @sentences.include?(tikis) || @mentions.include?(tikis)
end