Class: Ebooks::Model

Inherits:

Object

Object
Ebooks::Model

show all

Defined in: lib/twitter_ebooks/model.rb

Instance Attribute Summary collapse

#keywords ⇒ Array<String>

The top 200 most important keywords, in descending order.
#mentions ⇒ Array<Array<Integer>>

Sentences derived from Twitter mentions.
#sentences ⇒ Array<Array<Integer>>

Sentences represented by arrays of tikis.
#tokens ⇒ Array<String>

An array of unique tokens.

Class Method Summary collapse

.consume(path) ⇒ Ebooks::Model

Generate a new model from a corpus file.
.consume_all(paths) ⇒ Ebooks::Model

Generate a new model from multiple corpus files.
.load(path) ⇒ Ebooks::Model

Load a saved model.

Instance Method Summary collapse

#append(path) ⇒ Object

Append a generated model to existing model file instead of overwriting it.
#consume(path) ⇒ Object

Consume a corpus into this model.
#consume_all(paths) ⇒ Object

Consume multiple corpuses into this model.
#consume_lines(lines) ⇒ Object

Consume a sequence of lines.
#find_relevant(sentences, input) ⇒ Array<Array<Array<Integer>>, Array<Array<Integer>>>

Finds sentences relevant to the input by comparing token overlaps: a sentence sharing a non-stopword token with the input is relevant, and a sentence sharing any token is slightly relevant.
#fix(text) ⇒ String

Correct encoding issues in generated text.
#initialize ⇒ Model constructor

A new instance of Model.
#make_response(input, limit = 280, sentences = @mentions) ⇒ String

Generates a response by looking for related sentences in the corpus and building a smaller generator from these.
#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ String

Generate some text.
#mass_tikify(text) ⇒ Array<Array<Integer>>

Convert a body of text into arrays of tikis.
#save(path) ⇒ Object

Save model to a file.
#tikify(token) ⇒ Integer

Reverse lookup a token index from a token.
#valid_tweet?(tikis, limit) ⇒ Boolean

Check if an array of tikis comprises a valid tweet.
#verbatim?(tikis) ⇒ Boolean

Test if a sentence has been copied verbatim from original.

Constructor Details

#initialize ⇒ `Model`

Returns a new instance of Model.

# File 'lib/twitter_ebooks/model.rb', line 101

# Create an empty model.
#
# Every collection starts out empty so that the attribute readers and the
# methods that search them (such as #verbatim? and #append) can be used
# before a corpus has been consumed or a saved model has been loaded.
def initialize
  # Unique token strings; a token's index in this array is its "tiki"
  @tokens = []

  # Reverse lookup tiki by token, for faster generation
  @tikis = {}

  # Replaced wholesale by #consume_lines and Model.load
  @sentences = []
  @mentions = []
  @keywords = []
end

Instance Attribute Details

#keywords ⇒ `Array<String>`

The top 200 most important keywords, in descending order

Returns:

(Array<String>)



27
28
29

# File 'lib/twitter_ebooks/model.rb', line 27

# Reader for the keyword list.
#
# @return [Array<String>] the value of @keywords; #consume_lines sets it to
#   the stringified result of NLP.keywords(text).top(200)
def keywords
  @keywords
end

#mentions ⇒ `Array<Array<Integer>>`

Sentences derived from Twitter mentions

Returns:

(Array<Array<Integer>>)



23
24
25

# File 'lib/twitter_ebooks/model.rb', line 23

# Reader for the mention sentences.
#
# @return [Array<Array<Integer>>] the value of @mentions; #consume_lines
#   builds it with #mass_tikify from the corpus lines that contain '@'
def mentions
  @mentions
end

#sentences ⇒ `Array<Array<Integer>>`

Sentences represented by arrays of tikis

Returns:

(Array<Array<Integer>>)



19
20
21

# File 'lib/twitter_ebooks/model.rb', line 19

# Reader for the statement sentences.
#
# @return [Array<Array<Integer>>] the value of @sentences; #consume_lines
#   builds it with #mass_tikify from the corpus lines that contain no '@'
def sentences
  @sentences
end

#tokens ⇒ `Array<String>`

An array of unique tokens. This is the main source of actual strings in the model. Manipulation of a token is done using its index in this array, which we call a “tiki”

Returns:

(Array<String>)



15
16
17

# File 'lib/twitter_ebooks/model.rb', line 15

# Reader for the token table.
#
# @return [Array<String>] the value of @tokens; #tikify appends each
#   previously unseen token, and a token's index here is its "tiki"
def tokens
  @tokens
end

Class Method Details

.consume(path) ⇒ `Ebooks::Model`

Generate a new model from a corpus file

Parameters:

path (String)

Returns:

(Ebooks::Model)



32
33
34

# File 'lib/twitter_ebooks/model.rb', line 32

# Build a brand-new model and feed it a single corpus file.
#
# @param path [String] location of the corpus file
# @return [Ebooks::Model] the result of calling #consume on the new model
def self.consume(path)
  fresh_model = Model.new
  fresh_model.consume(path)
end

.consume_all(paths) ⇒ `Ebooks::Model`

Generate a new model from multiple corpus files

Parameters:

paths (Array<String>)

Returns:

(Ebooks::Model)



39
40
41

# File 'lib/twitter_ebooks/model.rb', line 39

# Build a brand-new model and feed it several corpus files at once.
#
# @param paths [Array<String>] locations of the corpus files
# @return [Ebooks::Model] the result of calling #consume_all on the new model
def self.consume_all(paths)
  fresh_model = Model.new
  fresh_model.consume_all(paths)
end

.load(path) ⇒ `Ebooks::Model`

Load a saved model

Parameters:

path (String)

Returns:

(Ebooks::Model)

# File 'lib/twitter_ebooks/model.rb', line 46

# Load a saved model.
#
# Reads the Marshal dump stored at +path+ (the hash layout written by #save)
# and restores the token table, sentences, mentions and keywords. The
# token => tiki reverse index is rebuilt as well: it is not part of the dump,
# and without it #tikify would append a duplicate of every already-known
# token to a loaded model.
#
# NOTE(security): Marshal.load can instantiate arbitrary objects; only load
# model files that come from a trusted source.
#
# @param path [String] path to a saved model file
# @return [Ebooks::Model] a new model populated from the file
def self.load(path)
  props = Marshal.load(File.binread(path))

  model = Model.new
  model.instance_eval do
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]

    # Rebuild the reverse lookup; the first occurrence wins should the
    # stored token table contain the same token twice
    @tikis = {}
    @tokens.each_with_index { |token, tiki| @tikis[token] ||= tiki }
  end
  model
end

Instance Method Details

#append(path) ⇒ `Object`

Append a generated model to existing model file instead of overwriting it

Parameters:

path (String)

# File 'lib/twitter_ebooks/model.rb', line 74

# Append a generated model to an existing model file instead of
# overwriting it.
#
# The stored sentences and mentions are arrays of tikis, i.e. indexes into
# the *stored* token table. They are therefore translated into this model's
# token table before being merged; simply concatenating both tables would
# make every stored sentence point at the wrong words. Tokens present in
# both models are shared rather than duplicated, and duplicate keywords are
# dropped. The merged state is kept in this model and written back to +path+.
#
# NOTE(security): Marshal.load can instantiate arbitrary objects; only
# append to model files that come from a trusted source.
#
# @param path [String] path to an existing saved model
# @return [self, nil] self after merging, or nil when +path+ is not a file
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the existing model
  props = Marshal.load(File.binread(path))

  # Tolerate a model that has not consumed anything yet
  @sentences ||= []
  @mentions ||= []
  @keywords ||= []

  # Index the current tokens from scratch instead of trusting @tikis, which
  # may be empty or stale depending on how this model was populated
  index = {}
  @tokens.each_with_index { |token, tiki| index[token] ||= tiki }

  # remap[old_tiki] is the tiki of the same token in this model; tokens this
  # model has not seen yet are added to the end of its table
  remap = props[:tokens].map do |token|
    index.fetch(token) do
      @tokens << token
      index[token] = @tokens.length - 1
    end
  end
  @tikis = index

  translate = lambda do |sents|
    sents.map { |sent| sent.map { |old_tiki| remap[old_tiki] } }
  end

  # New material first, existing material after it (same order as before)
  @sentences.concat(translate.call(props[:sentences]))
  @mentions.concat(translate.call(props[:mentions]))
  @keywords |= props[:keywords]

  # Overwrite the file with the merged model
  File.binwrite(path, Marshal.dump({
    tokens: @tokens,
    sentences: @sentences,
    mentions: @mentions,
    keywords: @keywords
  }))
  self
end

#consume(path) ⇒ `Object`

Consume a corpus into this model

Parameters:

path (String)

# File 'lib/twitter_ebooks/model.rb', line 139

# Consume a corpus into this model.
#
# The corpus format is chosen from the file extension, case-insensitively:
# ".json" is parsed as an array of tweet objects (using each tweet's 'text',
# falling back to 'full_text'), ".csv" is parsed as CSV with a header row
# containing a 'text' column, and anything else is read as plaintext with one
# entry per line. Entries without any text are dropped, because
# #consume_lines expects every line to be a String.
#
# @param path [String] path to the corpus file
# @return [Object] the result of #consume_lines
# @raise [ArgumentError] if a CSV corpus has no 'text' header column
def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  lines =
    case File.extname(path).downcase
    when '.json'
      log "Reading json corpus from #{path}"
      JSON.parse(content).map { |tweet| tweet['text'] || tweet['full_text'] }
    when '.csv'
      log "Reading CSV corpus from #{path}"
      rows = CSV.parse(content)
      header = rows.shift
      text_col = header ? header.index('text') : nil
      raise ArgumentError, "CSV corpus #{path} has no 'text' column" if text_col.nil?

      rows.map { |row| row[text_col] }
    else
      log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
      content.split("\n")
    end

  # Tweets lacking both text keys and empty CSV cells come through as nil
  consume_lines(lines.compact)
end

#consume_all(paths) ⇒ `Object`

Consume multiple corpuses into this model

Parameters:

paths (Array<String>)

# File 'lib/twitter_ebooks/model.rb', line 200

# Consume multiple corpus files into this model.
#
# Each file is read with the same format rules as #consume, chosen from the
# file extension case-insensitively: ".json" (array of tweet objects, using
# 'text' or else 'full_text'), ".csv" (header row with a 'text' column), or
# plaintext with one entry per line. The lines of all files are gathered in
# the order given and handed to #consume_lines in a single call. Entries
# without any text are dropped, because #consume_lines expects Strings.
#
# @param paths [Array<String>] paths to the corpus files
# @return [Object] the result of #consume_lines
# @raise [ArgumentError] if a CSV corpus has no 'text' header column
def consume_all(paths)
  lines = paths.flat_map do |path|
    content = File.read(path, :encoding => 'utf-8')

    case File.extname(path).downcase
    when '.json'
      log "Reading json corpus from #{path}"
      JSON.parse(content).map { |tweet| tweet['text'] || tweet['full_text'] }
    when '.csv'
      log "Reading CSV corpus from #{path}"
      rows = CSV.parse(content)
      header = rows.shift
      text_col = header ? header.index('text') : nil
      raise ArgumentError, "CSV corpus #{path} has no 'text' column" if text_col.nil?

      rows.map { |row| row[text_col] }
    else
      log "Reading plaintext corpus from #{path}"
      content.split("\n")
    end
  end

  # Tweets lacking both text keys and empty CSV cells come through as nil
  consume_lines(lines.compact)
end

#consume_lines(lines) ⇒ `Object`

Consume a sequence of lines

Parameters:

lines (Array<String>)

# File 'lib/twitter_ebooks/model.rb', line 165

# Consume a sequence of lines.
#
# Lines starting with '#' are treated as comments and skipped, as are soft
# retweets (lines containing "RT" or "MT" as a standalone word) and nil
# entries. Each remaining line is normalized with NLP.normalize and sorted
# into mentions (contains '@') or plain statements. Both groups are then
# tokenized with #mass_tikify, replacing @sentences and @mentions, and
# @keywords is recomputed from the statements.
#
# @param lines [Array<String>] raw corpus lines
# @return [self]
def consume_lines(lines)
  log "Removing commented lines and sorting mentions"

  statements = []
  mentions = []
  lines.each do |l|
    next if l.nil? # Nothing to learn from an entry without text
    next if l.start_with?('#') # Remove commented lines
    # Remove soft retweets. Match RT/MT as whole words only, so ordinary
    # text that merely contains those letters (e.g. "SMART") is kept.
    next if l =~ /\b(?:RT|MT)\b/

    if l.include?('@')
      mentions << NLP.normalize(l)
    else
      statements << NLP.normalize(l)
    end
  end

  text = statements.join("\n").encode('UTF-8', :invalid => :replace)
  mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

  # Count entries rather than newlines in the joined text, which would
  # under-report by one
  statement_count = statements.length
  mention_count = mentions.length

  lines = nil; statements = nil; mentions = nil # Allow garbage collection

  log "Tokenizing #{statement_count} statements and #{mention_count} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)
  log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"

  self
end

#find_relevant(sentences, input) ⇒ `Array<Array<Array<Integer>>, Array<Array<Integer>>>`

Finds sentences relevant to the input by comparing token overlaps: a sentence sharing a non-stopword token with the input is relevant, and a sentence sharing any token is slightly relevant

Parameters:

sentences (Array<Array<Integer>>)
input (String)

Returns:

(Array<Array<Array<Integer>>, Array<Array<Integer>>>)

# File 'lib/twitter_ebooks/model.rb', line 295

# Find the sentences that share tokens with +input+.
#
# Comparison is case-insensitive. A sentence is added to the "slightly
# relevant" list once for every input token it contains, and also to the
# "relevant" list when that token is not a stopword; a sentence that
# matches several input tokens therefore appears several times.
#
# @param sentences [Array<Array<Integer>>] tokenized sentences (tikis)
# @param input [String] text to compare against
# @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
#   the pair [relevant, slightly_relevant]
def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)
  return [relevant, slightly_relevant] if tokenized.empty?

  sentences.each do |sent|
    # Downcase the sentence once, not once per input token
    words = sent.map { |tiki| @tokens[tiki].downcase }

    tokenized.each do |token|
      next unless words.include?(token)

      relevant << sent unless NLP.stopword?(token)
      slightly_relevant << sent
    end
  end

  [relevant, slightly_relevant]
end

#fix(text) ⇒ `String`

Correct encoding issues in generated text

Parameters:

text (String)

Returns:

(String)



232
233
234

# File 'lib/twitter_ebooks/model.rb', line 232

# Run generated text through the decoder exposed by NLP.htmlentities.
#
# @param text [String] generated text
# @return [String] the result of the decoder's #decode
def fix(text)
  decoder = NLP.htmlentities
  decoder.decode(text)
end

#make_response(input, limit = 280, sentences = @mentions) ⇒ `String`

Generates a response by looking for related sentences in the corpus and building a smaller generator from these

Parameters:

input (String)
limit (Integer) (defaults to: 280) —

characters available for response
sentences (Array<Array<Integer>>) (defaults to: @mentions)

Returns:

(String)

# File 'lib/twitter_ebooks/model.rb', line 319

# Generate a reply to +input+ from sentences related to it.
#
# The candidate sentences are searched with #find_relevant. With at least 3
# relevant hits a generator is built from those; otherwise, with at least 5
# slightly relevant hits, from those. When neither threshold is met and the
# search ran over @mentions, the search is repeated over @sentences; failing
# that, an unrelated statement is generated.
#
# @param input [String] text being responded to
# @param limit [Integer] characters available for response
# @param sentences [Array<Array<Integer>>] sentences to search (mentions are preferred)
# @return [String]
def make_response(input, limit=280, sentences=@mentions)
  strong, weak = find_relevant(sentences, input)

  pool =
    if strong.length >= 3
      strong
    elsif weak.length >= 5
      weak
    end

  return make_statement(limit, SuffixGenerator.build(pool)) if pool

  # Nothing usable among the mentions: widen the search to all statements
  return make_response(input, limit, @sentences) if sentences.equal?(@mentions)

  make_statement(limit)
end

#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ `String`

Generate some text

Parameters:

limit (Integer) (defaults to: 280) —

available characters
generator (SuffixGenerator, nil) (defaults to: nil)
retry_limit (Integer) (defaults to: 10) —

how many times to retry on invalid tweet

Returns:

(String)

# File 'lib/twitter_ebooks/model.rb', line 249

# Generate some text.
#
# Bigram generation is attempted first, until a candidate passes
# #valid_tweet? (and, unless a generator was supplied, is longer than 3
# tikis) or the retry budget is spent. If the chosen candidate is longer
# than 3 tikis and reproduces a corpus sentence verbatim, unigram generation
# is attempted with whatever remains of the same retry budget. The last
# candidate is used even when no attempt succeeded; that case is logged.
#
# @param limit [Integer] available characters
# @param generator [SuffixGenerator, nil] generator to use; built from @sentences when nil
# @param retry_limit [Integer] how many times to retry on invalid tweet
# @return [String]
def make_statement(limit=280, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  attempts = 0
  tikis = nil

  loop do
    tikis = generator.generate(3, :bigrams)
    break unless tikis
    # Very short results are only acceptable when answering someone
    break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

    attempts += 1
    break if attempts >= retry_limit
  end

  # We made a verbatim tweet by accident: fall back to unigrams
  if verbatim?(tikis) && tikis.length > 3
    loop do
      tikis = generator.generate(3, :unigrams)
      break unless tikis
      break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

      attempts += 1
      break if attempts >= retry_limit
    end
  end

  statement = NLP.reconstruct(tikis, @tokens)

  if attempts >= retry_limit
    log "Unable to produce valid non-verbatim tweet; using \"#{statement}\""
  end

  fix statement
end

#mass_tikify(text) ⇒ `Array<Array<Integer>>`

Convert a body of text into arrays of tikis

Parameters:

text (String)

Returns:

(Array<Array<Integer>>)

# File 'lib/twitter_ebooks/model.rb', line 124

# Convert a body of text into arrays of tikis.
#
# The text is split with NLP.sentences and each sentence with NLP.tokenize.
# Tokens containing '@' or 'http' are discarded; every remaining token is
# converted to its tiki via #tikify.
#
# @param text [String]
# @return [Array<Array<Integer>>] one array of tikis per sentence
def mass_tikify(text)
  NLP.sentences(text).map do |sentence|
    words = NLP.tokenize(sentence).select do |word|
      # Usernames and urls never become tokens
      !word.include?('@') && !word.include?('http')
    end

    words.map { |word| tikify(word) }
  end
end

#save(path) ⇒ `Object`

Save model to a file

Parameters:

path (String)

# File 'lib/twitter_ebooks/model.rb', line 60

# Save model to a file.
#
# The token table, sentences, mentions and keywords are serialized together
# as one Marshal-dumped hash and written to +path+ in binary mode,
# replacing any previous content.
#
# @param path [String] destination file
# @return [self]
def save(path)
  snapshot = {
    tokens: @tokens,
    sentences: @sentences,
    mentions: @mentions,
    keywords: @keywords
  }
  File.binwrite(path, Marshal.dump(snapshot))
  self
end

#tikify(token) ⇒ `Integer`

Reverse lookup a token index from a token

Parameters:

token (String)

Returns:

(Integer)

# File 'lib/twitter_ebooks/model.rb', line 111

# Reverse lookup a token index from a token.
#
# A token that has not been seen before is appended to @tokens and recorded
# in the @tikis lookup. A progress line is printed whenever the token count
# reaches a multiple of 1000.
#
# @param token [String]
# @return [Integer] the token's tiki (its index in @tokens)
def tikify(token)
  @tikis.fetch(token) do
    upcoming_count = @tokens.length + 1
    puts "#{upcoming_count} tokens" if (upcoming_count % 1000).zero?

    @tokens << token
    @tikis[token] = @tokens.length - 1
  end
end

#valid_tweet?(tikis, limit) ⇒ `Boolean`

Check if an array of tikis comprises a valid tweet

Parameters:

tikis (Array<Integer>)
limit (Integer) —

how many chars we have left

Returns:

(Boolean)

# File 'lib/twitter_ebooks/model.rb', line 239

# Check if an array of tikis comprises a valid tweet.
#
# The tikis are turned back into text with NLP.reconstruct; the text must
# fit within +limit+ characters and must not be flagged by
# NLP.unmatched_enclosers?.
#
# @param tikis [Array<Integer>]
# @param limit [Integer] how many chars we have left
# @return [Boolean]
def valid_tweet?(tikis, limit)
  text = NLP.reconstruct(tikis, @tokens)
  return false if text.length > limit

  !NLP.unmatched_enclosers?(text)
end

#verbatim?(tikis) ⇒ `Boolean`

Test if a sentence has been copied verbatim from original

Parameters:

tikis (Array<Integer>)

Returns:

(Boolean)



286
287
288

# File 'lib/twitter_ebooks/model.rb', line 286

# Test if a sentence has been copied verbatim from original.
#
# @param tikis [Array<Integer>]
# @return [Boolean] true when the exact tiki sequence is present in
#   @sentences or in @mentions
def verbatim?(tikis)
  [@sentences, @mentions].any? { |corpus| corpus.include?(tikis) }
end

Class: Ebooks::Model

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Model

Instance Attribute Details

#keywords ⇒ Array<String>

#mentions ⇒ Array<Array<Integer>>

#sentences ⇒ Array<Array<Integer>>

#tokens ⇒ Array<String>

Class Method Details

.consume(path) ⇒ Ebooks::Model

.consume_all(paths) ⇒ Ebooks::Model

.load(path) ⇒ Ebooks::Model

Instance Method Details

#append(path) ⇒ Object

#consume(path) ⇒ Object

#consume_all(paths) ⇒ Object

#consume_lines(lines) ⇒ Object

#find_relevant(sentences, input) ⇒ Array<Array<Array<Integer>>, Array<Array<Integer>>>

#fix(text) ⇒ String

#make_response(input, limit = 280, sentences = @mentions) ⇒ String

#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ String

#mass_tikify(text) ⇒ Array<Array<Integer>>

#save(path) ⇒ Object

#tikify(token) ⇒ Integer

#valid_tweet?(tikis, limit) ⇒ Boolean

#verbatim?(tikis) ⇒ Boolean

#initialize ⇒ `Model`

#keywords ⇒ `Array<String>`

#mentions ⇒ `Array<Array<Integer>>`

#sentences ⇒ `Array<Array<Integer>>`

#tokens ⇒ `Array<String>`

.consume(path) ⇒ `Ebooks::Model`

.consume_all(paths) ⇒ `Ebooks::Model`

.load(path) ⇒ `Ebooks::Model`

#append(path) ⇒ `Object`

#consume(path) ⇒ `Object`

#consume_all(paths) ⇒ `Object`

#consume_lines(lines) ⇒ `Object`

#find_relevant(sentences, input) ⇒ `Array<Array<Array<Integer>>, Array<Array<Integer>>>`

#fix(text) ⇒ `String`

#make_response(input, limit = 280, sentences = @mentions) ⇒ `String`

#make_statement(limit = 280, generator = nil, retry_limit = 10) ⇒ `String`

#mass_tikify(text) ⇒ `Array<Array<Integer>>`

#save(path) ⇒ `Object`

#tikify(token) ⇒ `Integer`

#valid_tweet?(tikis, limit) ⇒ `Boolean`

#verbatim?(tikis) ⇒ `Boolean`