Class: Raingrams::Model

Inherits:
Object show all
Defined in:
lib/raingrams/model.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}, &block) ⇒ Model

Creates a new NgramModel with the specified options.

options must contain the following keys:

:ngram_size

The size of each gram.

options may contain the following keys:

:ignore_case

Defaults to false.

:ignore_punctuation

Defaults to true.

:ignore_urls

Defaults to true.

:ignore_phone_numbers

Defaults to false.



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/raingrams/model.rb', line 52

# Builds a model from the options Hash and hands it to the optional block.
#
# options[:ngram_size] is read unconditionally and sizes the starting and
# stopping ngrams. Each :ignore_* key replaces its built-in default only
# when the key is present in options:
#   :ignore_case          (default false)
#   :ignore_punctuation   (default true)
#   :ignore_urls          (default true)
#   :ignore_phone_numbers (default false)
#   :ignore_references    (default false)
def initialize(options={},&block)
  @ngram_size = options[:ngram_size]
  @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
  @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)

  # fetch falls back to the default only when the key is absent, so an
  # explicit nil or false supplied by the caller is still honoured
  @ignore_case = options.fetch(:ignore_case,false)
  @ignore_punctuation = options.fetch(:ignore_punctuation,true)
  @ignore_urls = options.fetch(:ignore_urls,true)
  @ignore_phone_numbers = options.fetch(:ignore_phone_numbers,false)
  @ignore_references = options.fetch(:ignore_references,false)

  # prefix => table; starts out empty until the model is trained
  @prefixes = {}

  block.call(self) unless block.nil?
end

Instance Attribute Details

#ignore_caseObject (readonly)

Ignore case of parsed text



23
24
25
# File 'lib/raingrams/model.rb', line 23

# Reader for @ignore_case. When the flag is truthy, parse_sentence
# downcases the sentence before splitting it into tokens.
def ignore_case
  @ignore_case
end

#ignore_phone_numbersObject (readonly)

Ignore Phone numbers



32
33
34
# File 'lib/raingrams/model.rb', line 32

# Reader for @ignore_phone_numbers. When the flag is truthy, parse_sentence
# replaces phone-number-like digit groups with a single space.
def ignore_phone_numbers
  @ignore_phone_numbers
end

#ignore_punctuationObject (readonly)

Ignore the punctuation of parsed text



26
27
28
# File 'lib/raingrams/model.rb', line 26

# Reader for @ignore_punctuation. When the flag is truthy, parse_sentence
# strips trailing '.', '?' and '!' and tokenises on word characters only,
# and random_sentence appends a '.' to the generated sentence.
def ignore_punctuation
  @ignore_punctuation
end

#ignore_referencesObject (readonly)

Ignore References



35
36
37
# File 'lib/raingrams/model.rb', line 35

# Reader for @ignore_references. When the flag is truthy, parse_sentence
# removes bracketed numeric references such as "[12]", "(3)" or "{4}".
def ignore_references
  @ignore_references
end

#ignore_urlsObject (readonly)

Ignore URLs



29
30
31
# File 'lib/raingrams/model.rb', line 29

# Reader for @ignore_urls. When the flag is truthy, parse_sentence replaces
# anything of the form "scheme://..." with a single space.
def ignore_urls
  @ignore_urls
end

#ngram_sizeObject (readonly)

Size of ngrams to use



14
15
16
# File 'lib/raingrams/model.rb', line 14

# Reader for @ngram_size, the number of grams per ngram. It is taken from
# options[:ngram_size] in initialize and used as the window width in
# ngrams_from_words.
def ngram_size
  @ngram_size
end

#prefixesObject (readonly)

Probabilities of all (n-1) grams



38
39
40
# File 'lib/raingrams/model.rb', line 38

# Reader for @prefixes, the Hash that maps each ngram prefix to the table
# of grams observed after it. The Hash itself is returned, not a copy.
def prefixes
  @prefixes
end

#starting_ngramObject (readonly)

The sentence starting ngram



17
18
19
# File 'lib/raingrams/model.rb', line 17

# Reader for @starting_ngram, built in initialize from Tokens.start repeated
# @ngram_size times. random_gram_sentence begins its walk from this ngram.
def starting_ngram
  @starting_ngram
end

#stoping_ngramObject (readonly)

The sentence stopping ngram



20
21
22
# File 'lib/raingrams/model.rb', line 20

# Reader for @stoping_ngram, built in initialize from Tokens.stop repeated
# @ngram_size times. (The "stoping" spelling is part of the public name.)
def stoping_ngram
  @stoping_ngram
end

Class Method Details

.build(options = {}, &block) ⇒ Object

Creates a new model object with the given options. If a block is given, it will be passed the newly created model. After the block has been called the model will be built.



93
94
95
96
97
# File 'lib/raingrams/model.rb', line 93

# Instantiates a model with the given options and, from inside the
# constructor block, runs the instance-level build with the caller's block.
# Returns whatever new returns.
def self.build(options={},&block)
  self.new(options) { |fresh_model| fresh_model.build(&block) }
end

.open(path) ⇒ Object

Marshals a model from the contents of the file at the specified path.



143
144
145
146
147
148
149
150
151
# File 'lib/raingrams/model.rb', line 143

# Loads and returns the object that was marshalled into the file at path.
#
# The file is opened in binary mode ('rb') because Marshal data is a byte
# stream; text mode can corrupt it on platforms that translate newlines.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects. Only use
# this on files from a trusted source.
def self.open(path)
  File.open(path,'rb') do |file|
    Marshal.load(file)
  end
end

.train_with_file(path, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the contents of the specified path.



123
124
125
126
127
# File 'lib/raingrams/model.rb', line 123

# Builds a model with the given options, training it inside the build
# block with the contents of the file at path. Returns the result of build.
def self.train_with_file(path,options={})
  self.build(options) { |new_model| new_model.train_with_file(path) }
end

.train_with_paragraph(paragraph, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the specified paragraph.



103
104
105
106
107
# File 'lib/raingrams/model.rb', line 103

# Builds a model with the given options, training it inside the build
# block with the supplied paragraph. Returns the result of build.
def self.train_with_paragraph(paragraph,options={})
  self.build(options) { |new_model| new_model.train_with_paragraph(paragraph) }
end

.train_with_text(text, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the specified text.



113
114
115
116
117
# File 'lib/raingrams/model.rb', line 113

# Builds a model with the given options, training it inside the build
# block with the supplied text. Returns the result of build.
def self.train_with_text(text,options={})
  self.build(options) { |new_model| new_model.train_with_text(text) }
end

.train_with_url(url, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the inner text of the paragraphs tags at the specified url.



133
134
135
136
137
# File 'lib/raingrams/model.rb', line 133

# Builds a model with the given options, training it inside the build
# block from the document at url. Returns the result of build.
def self.train_with_url(url,options={})
  self.build(options) { |new_model| new_model.train_with_url(url) }
end

Instance Method Details

#build(&block) ⇒ Object

Clears and rebuilds the model.



830
831
832
833
834
835
836
# File 'lib/raingrams/model.rb', line 830

# Empties the model, lets the optional block retrain it, and then has
# refresh rebuild the tables. Returns whatever refresh returns.
def build(&block)
  # runs inside refresh, i.e. before the tables are rebuilt
  retrain = Proc.new do
    clear
    block.call(self) unless block.nil?
  end

  refresh(&retrain)
end

#clearObject

Clears the model of any training data.



841
842
843
844
# File 'lib/raingrams/model.rb', line 841

# Removes every entry from @prefixes in place and returns the model itself
# so calls can be chained.
def clear
  tap { @prefixes.clear }
end

#common_ngrams_from_fragment(fragment) ⇒ Object

Returns the ngrams which occur within the specified fragment and within the model.



475
476
477
# File 'lib/raingrams/model.rb', line 475

# Extracts the ngrams of the fragment and keeps only those for which
# has_ngram? is truthy.
def common_ngrams_from_fragment(fragment)
  candidates = ngrams_from_fragment(fragment)

  candidates.select do |candidate|
    has_ngram?(candidate)
  end
end

#common_ngrams_from_sentence(sentence) ⇒ Object

Returns the ngrams which occur within the specified sentence and within the model.



483
484
485
# File 'lib/raingrams/model.rb', line 483

# Extracts the ngrams of the sentence and keeps only those for which
# has_ngram? is truthy.
def common_ngrams_from_sentence(sentence)
  candidates = ngrams_from_sentence(sentence)

  candidates.select do |candidate|
    has_ngram?(candidate)
  end
end

#common_ngrams_from_text(text) ⇒ Object

Returns the ngrams which occur within the specified text and within the model.



491
492
493
# File 'lib/raingrams/model.rb', line 491

# Extracts the ngrams of the text and keeps only those for which
# has_ngram? is truthy.
def common_ngrams_from_text(text)
  candidates = ngrams_from_text(text)

  candidates.select do |candidate|
    has_ngram?(candidate)
  end
end

#common_ngrams_from_words(words) ⇒ Object

Returns the ngrams which occur within the specified words and within the model.



467
468
469
# File 'lib/raingrams/model.rb', line 467

# Extracts the ngrams of the word list and keeps only those for which
# has_ngram? is truthy.
def common_ngrams_from_words(words)
  candidates = ngrams_from_words(words)

  candidates.select do |candidate|
    has_ngram?(candidate)
  end
end

#each_ngram(&block) ⇒ Object

Iterates over the ngrams that compose the model, passing each one to the given block.



231
232
233
234
235
236
237
238
239
# File 'lib/raingrams/model.rb', line 231

# Walks every prefix table in @prefixes and passes prefix + gram to the
# block for each gram the table yields. Without a block the tables are
# still walked but nothing is yielded. Returns the model itself.
def each_ngram(&block)
  @prefixes.each_pair do |prefix,table|
    table.each_gram { |gram| block.call(prefix + gram) unless block.nil? }
  end

  self
end

#fragment_commonality(fragment) ⇒ Object

Returns the joint probability of the common ngrams between the specified fragment and the model.



660
661
662
# File 'lib/raingrams/model.rb', line 660

# Feeds the fragment's ngrams that are also in the model to
# probability_of_ngrams and returns its result.
def fragment_commonality(fragment)
  shared = common_ngrams_from_fragment(fragment)
  probability_of_ngrams(shared)
end

#fragment_probability(fragment) ⇒ Object

Returns the probability of the specified fragment occurring within arbitrary text.



636
637
638
# File 'lib/raingrams/model.rb', line 636

# Feeds every ngram extracted from the fragment to probability_of_ngrams
# and returns its result.
def fragment_probability(fragment)
  extracted = ngrams_from_fragment(fragment)
  probability_of_ngrams(extracted)
end

#fragment_similarity(fragment, other_model) ⇒ Object

Returns the conditional probability of the commonality of the specified fragment against the other_model, given the commonality of the fragment against the model.



685
686
687
# File 'lib/raingrams/model.rb', line 685

# Divides other_model's commonality for the fragment by this model's
# commonality for the same fragment.
# NOTE(review): no guard for a zero or nil denominator -- confirm callers
# only pass fragments that share ngrams with this model.
def fragment_similarity(fragment,other_model)
  theirs = other_model.fragment_commonality(fragment)
  ours = fragment_commonality(fragment)

  theirs / ours
end

#frequencies_for(ngrams) ⇒ Object

Returns the observed frequency of the specified ngrams occurring within the training text.



588
589
590
591
592
593
594
595
596
# File 'lib/raingrams/model.rb', line 588

# Builds a Hash that maps each given ngram to frequency_of_ngram(ngram).
def frequencies_for(ngrams)
  counts = {}
  ngrams.each { |ngram| counts[ngram] = frequency_of_ngram(ngram) }
  counts
end

#frequency_of_ngram(ngram) ⇒ Object

Returns the observed frequency of the specified ngram within the training text.



560
561
562
563
564
565
566
567
568
# File 'lib/raingrams/model.rb', line 560

# Looks up the ngram's prefix in @prefixes and asks that table for the
# frequency of the ngram's last gram. Returns 0 when the prefix is unknown.
def frequency_of_ngram(ngram)
  prefix = ngram.prefix
  return 0 unless @prefixes.has_key?(prefix)

  @prefixes[prefix].frequency_of(ngram.last)
end

#frequency_of_ngrams(ngrams) ⇒ Object

Returns the total observed frequency of the specified ngrams occurring within the training text.



616
617
618
619
620
# File 'lib/raingrams/model.rb', line 616

# Sums the frequencies that frequencies_for reports for the given ngrams.
#
# The fold is seeded with 0 so that an empty list yields 0 rather than the
# nil an unseeded inject returns.
def frequency_of_ngrams(ngrams)
  frequencies_for(ngrams).values.inject(0) do |total,freq|
    total + freq
  end
end

#gramsObject

Returns all grams within the model.



421
422
423
424
425
# File 'lib/raingrams/model.rb', line 421

# Collects the elements of every key in @prefixes into a single Set.
# Only the prefix keys are examined here; the tables are not consulted.
def grams
  collected = Set.new

  @prefixes.each_key do |prefix|
    collected = collected + prefix
  end

  collected
end

#grams_following(gram) ⇒ Object

Returns all grams which occur directly after the specified gram.



453
454
455
456
457
458
459
460
461
# File 'lib/raingrams/model.rb', line 453

# Returns a Set holding the second element (index 1) of every ngram that
# ngrams_starting_with reports for the given gram.
#
# The lookup goes through ngrams_starting_with; the previous spelling
# "ngram_starting_with" named a method that does not exist.
def grams_following(gram)
  gram_set = Set.new

  ngrams_starting_with(gram).each do |ngram|
    gram_set << ngram[1]
  end

  return gram_set
end

#grams_preceeding(gram) ⇒ Object

Returns all grams which precede the specified gram.



440
441
442
443
444
445
446
447
448
# File 'lib/raingrams/model.rb', line 440

# Returns a Set holding the second-to-last element (index -2) of every
# ngram that ngrams_ending_with reports for the given gram.
def grams_preceeding(gram)
  ngrams_ending_with(gram).inject(Set.new) do |before,ngram|
    before << ngram[-2]
  end
end

#has_gram?(gram) ⇒ Boolean

Returns true if the model contains the specified gram, returns false otherwise.

Returns:

  • (Boolean)


431
432
433
434
435
# File 'lib/raingrams/model.rb', line 431

# True when at least one key of @prefixes includes the gram. Only the
# prefix keys are searched; the tables are not consulted.
def has_gram?(gram)
  @prefixes.each_key do |prefix|
    return true if prefix.include?(gram)
  end

  false
end

#has_ngram?(ngram) ⇒ Boolean

Returns true if the model contains the specified ngram, returns false otherwise.

Returns:

  • (Boolean)


219
220
221
222
223
224
225
# File 'lib/raingrams/model.rb', line 219

# False when the ngram's prefix is not a key of @prefixes; otherwise the
# answer of that prefix table's has_gram? for the ngram's last gram.
def has_ngram?(ngram)
  return false unless @prefixes.has_key?(ngram.prefix)

  @prefixes[ngram.prefix].has_gram?(ngram.last)
end

#ngramsObject

Returns the ngrams that compose the model.



203
204
205
206
207
208
209
210
211
212
213
# File 'lib/raingrams/model.rb', line 203

# Returns an NgramSet holding prefix + gram for every gram of every table
# in @prefixes.
def ngrams
  @prefixes.inject(NgramSet.new) do |collected,(prefix,table)|
    table.each_gram { |gram| collected << (prefix + gram) }
    collected
  end
end

#ngrams_ending_with(gram) ⇒ Object

Returns the ngrams which end with the specified gram.



306
307
308
309
310
311
312
313
314
315
316
# File 'lib/raingrams/model.rb', line 306

# Returns an NgramSet holding prefix + gram for every prefix whose table
# reports has_gram?(gram).
def ngrams_ending_with(gram)
  @prefixes.inject(NgramSet.new) do |matches,(prefix,table)|
    matches << (prefix + gram) if table.has_gram?(gram)
    matches
  end
end

#ngrams_following(gram) ⇒ Object

Returns all ngrams which occur directly after the specified gram.



406
407
408
409
410
411
412
413
414
415
416
# File 'lib/raingrams/model.rb', line 406

# For every ngram starting with the gram, takes its postfix and gathers
# all ngrams prefixed by that postfix into one NgramSet.
def ngrams_following(gram)
  followers = NgramSet.new

  ngrams_starting_with(gram).each do |leading|
    ngrams_prefixed_by(leading.postfix).each { |ngram| followers << ngram }
  end

  followers
end

#ngrams_from_fragment(fragment) ⇒ Object

Returns the ngrams extracted from the specified fragment of text.



366
367
368
# File 'lib/raingrams/model.rb', line 366

# Tokenises the fragment with parse_sentence and turns the tokens into
# ngrams. Unlike ngrams_from_sentence, the tokens are not wrapped first.
def ngrams_from_fragment(fragment)
  tokens = parse_sentence(fragment)
  ngrams_from_words(tokens)
end

#ngrams_from_sentence(sentence) ⇒ Object

Returns the ngrams extracted from the specified sentence.



373
374
375
# File 'lib/raingrams/model.rb', line 373

# Tokenises the sentence with parse_sentence, passes the tokens through
# wrap_sentence, and turns the wrapped tokens into ngrams.
def ngrams_from_sentence(sentence)
  tokens = parse_sentence(sentence)
  wrapped = wrap_sentence(tokens)

  ngrams_from_words(wrapped)
end

#ngrams_from_text(text) ⇒ Object Also known as: ngrams_from_paragraph

Returns the ngrams extracted from the specified text.



380
381
382
383
384
# File 'lib/raingrams/model.rb', line 380

# Splits the text into sentences with parse_text and concatenates the
# ngrams of each sentence, in order, into one Array.
def ngrams_from_text(text)
  collected = []

  parse_text(text).each do |sentence|
    collected += ngrams_from_sentence(sentence)
  end

  collected
end

#ngrams_from_words(words) ⇒ Object

Returns the ngrams extracted from the specified words.



357
358
359
360
361
# File 'lib/raingrams/model.rb', line 357

# Slides a window of @ngram_size over words and wraps each window in an
# Ngram. Yields an empty Array when there are fewer words than the window.
def ngrams_from_words(words)
  last_start = words.length - @ngram_size

  (0..last_start).map do |offset|
    Ngram.new(words[offset,@ngram_size])
  end
end

#ngrams_including_all(*grams) ⇒ Object

Returns the ngrams including all of the specified grams.



344
345
346
347
348
349
350
351
352
# File 'lib/raingrams/model.rb', line 344

# Returns an NgramSet of the model's ngrams whose includes_all? accepts
# the given grams.
def ngrams_including_all(*grams)
  matches = NgramSet.new
  each_ngram { |ngram| matches << ngram if ngram.includes_all?(*grams) }
  matches
end

#ngrams_including_any(*grams) ⇒ Object

Returns the ngrams including any of the specified grams.



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/raingrams/model.rb', line 321

# Returns an NgramSet of every prefix + gram where either the prefix
# includes_any? of the given grams or the trailing gram is itself one of
# them.
def ngrams_including_any(*grams)
  matches = NgramSet.new

  @prefixes.each_pair do |prefix,table|
    # checked once per prefix; when true every gram of the table qualifies
    prefix_hit = prefix.includes_any?(*grams)

    table.each_gram do |gram|
      matches << (prefix + gram) if prefix_hit || grams.include?(gram)
    end
  end

  matches
end

#ngrams_postfixed_by(postfix) ⇒ Object

Returns the ngrams postfixed by the specified postfix.



272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/raingrams/model.rb', line 272

# Returns an NgramSet of prefix + postfix.last for every prefix whose tail
# (all but its first element) equals the head of postfix (all but its last
# element) and whose table has postfix.last.
def ngrams_postfixed_by(postfix)
  matches = NgramSet.new

  @prefixes.each do |prefix,table|
    next unless prefix[1..-1] == postfix[0..-2]
    next unless table.has_gram?(postfix.last)

    matches << (prefix + postfix.last)
  end

  matches
end

#ngrams_preceeding(gram) ⇒ Object

Returns all ngrams which precede the specified gram.



391
392
393
394
395
396
397
398
399
400
401
# File 'lib/raingrams/model.rb', line 391

# For every ngram ending with the gram, takes its prefix and gathers all
# ngrams postfixed by that prefix into one NgramSet.
def ngrams_preceeding(gram)
  predecessors = NgramSet.new

  ngrams_ending_with(gram).each do |trailing|
    ngrams_postfixed_by(trailing.prefix).each { |ngram| predecessors << ngram }
  end

  predecessors
end

#ngrams_prefixed_by(prefix) ⇒ Object

Returns the ngrams prefixed by the specified prefix.



257
258
259
260
261
262
263
264
265
266
267
# File 'lib/raingrams/model.rb', line 257

# Returns an NgramSet of prefix + gram for every gram in the table stored
# under prefix, or an empty NgramSet when the prefix is unknown.
def ngrams_prefixed_by(prefix)
  return NgramSet.new unless @prefixes.has_key?(prefix)

  extended = @prefixes[prefix].grams.map { |gram| prefix + gram }

  NgramSet.new + extended
end

#ngrams_starting_with(gram) ⇒ Object

Returns the ngrams starting with the specified gram.



289
290
291
292
293
294
295
296
297
298
299
300
301
# File 'lib/raingrams/model.rb', line 289

# Returns an NgramSet of prefix + gram for every table whose prefix has
# the given gram as its first element.
def ngrams_starting_with(gram)
  matches = NgramSet.new

  @prefixes.each do |prefix,table|
    next unless prefix.first == gram

    table.each_gram { |following| matches << (prefix + following) }
  end

  matches
end

#ngrams_with(&block) ⇒ Object

Selects the ngrams that match the given block.



244
245
246
247
248
249
250
251
252
# File 'lib/raingrams/model.rb', line 244

# Returns an NgramSet of the model's ngrams for which the given block
# returns a truthy value. The block is required.
def ngrams_with(&block)
  chosen = NgramSet.new
  each_ngram { |candidate| chosen << candidate if block.call(candidate) }
  chosen
end

#parse_sentence(sentence) ⇒ Object

Parses the specified sentence and returns an Array of tokens.



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/raingrams/model.rb', line 156

# Splits a sentence into an Array of token Strings, applying the model's
# ignore_* settings first.
#
# The argument is converted with to_s and then duplicated, so the in-place
# edits below never touch the caller's String and frozen input is accepted.
def parse_sentence(sentence)
  # to_s returns the receiver itself for a String, hence the dup
  sentence = sentence.to_s.dup

  if @ignore_punctuation
    # eat tailing punctuation
    sentence.gsub!(/[\.\?!]*$/,'')
  end

  if @ignore_urls
    # remove URLs
    sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
  end

  if @ignore_phone_numbers
    # remove phone numbers
    sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
  end

  if @ignore_references
    # remove RFC style references
    sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
  end

  if @ignore_case
    # downcase the sentence
    sentence.downcase!
  end

  if @ignore_punctuation
    # split and ignore punctuation characters
    return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
  else
    # split and accept punctuation characters
    return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
  end
end

#parse_text(text) ⇒ Object

Parses the specified text and returns an Array of sentences.



196
197
198
# File 'lib/raingrams/model.rb', line 196

# Splits text (converted with to_s) into sentence Strings. A sentence
# starts at a character that is neither whitespace nor '.', '?' or '!' and
# runs through the next '.', '?' or '!'; trailing text without a
# terminator is not returned.
def parse_text(text)
  sentence_pattern = /[^\s\.\?!][^\.\?!]*[\.\?\!]/

  text.to_s.scan(sentence_pattern)
end

#probabilities_for(ngrams) ⇒ Object

Returns the probability of the specified ngrams occurring within arbitrary text.



602
603
604
605
606
607
608
609
610
# File 'lib/raingrams/model.rb', line 602

# Builds a Hash that maps each given ngram to probability_of_ngram(ngram).
def probabilities_for(ngrams)
  odds = {}
  ngrams.each { |ngram| odds[ngram] = probability_of_ngram(ngram) }
  odds
end

#probability_of_ngram(ngram) ⇒ Object

Returns the probability of the specified ngram occurring within arbitrary text.



574
575
576
577
578
579
580
581
582
# File 'lib/raingrams/model.rb', line 574

# Looks up the ngram's prefix in @prefixes and asks that table for the
# probability of the ngram's last gram. Returns 0.0 for an unknown prefix.
def probability_of_ngram(ngram)
  prefix = ngram.prefix
  return 0.0 unless @prefixes.has_key?(prefix)

  @prefixes[prefix].probability_of(ngram.last)
end

#probability_of_ngrams(ngrams) ⇒ Object

Returns the joint probability of the specified ngrams occurring within arbitrary text.



626
627
628
629
630
# File 'lib/raingrams/model.rb', line 626

# Multiplies together the probabilities that probabilities_for reports for
# the given ngrams. The fold has no seed, so an empty list yields nil.
def probability_of_ngrams(ngrams)
  per_ngram = probabilities_for(ngrams).values

  per_ngram.inject(:*)
end

#random_gramObject

Returns a random gram from the model.



710
711
712
713
714
# File 'lib/raingrams/model.rb', line 710

# Picks a random key of @prefixes and returns a random element of it.
#
# Returns nil when the model holds no prefixes. Without the guard, rand(0)
# yields a Float, the key lookup comes back nil, and calling length on it
# raises NoMethodError.
def random_gram
  return nil if @prefixes.empty?

  prefix = @prefixes.keys[rand(@prefixes.length)]

  return prefix[rand(prefix.length)]
end

#random_gram_sentence(options = {}) ⇒ Object

Returns a randomly generated sentence of grams using the given options.



734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
# File 'lib/raingrams/model.rb', line 734

# Random walk from @starting_ngram: at each step one of the ngrams
# prefixed by the current ngram's postfix is chosen at random and its last
# gram is collected, until that gram equals Tokens.stop.
#
# Returns the collected grams, or an empty Array when the walk reaches an
# ngram with no successors. The options argument is not read here.
def random_gram_sentence(options={})
  collected = []
  current = @starting_ngram

  loop do
    candidates = ngrams_prefixed_by(current.postfix).to_a
    current = candidates[rand(candidates.length)]

    # dead end: nothing follows, so the partial sentence is discarded
    return [] if current.nil?

    gram = current.last
    break if gram == Tokens.stop

    collected << gram
  end

  collected
end

#random_ngramObject

Returns a random ngram from the model.



719
720
721
722
723
724
725
726
727
728
# File 'lib/raingrams/model.rb', line 719

# Picks a random prefix/table pair from @prefixes and returns the prefix
# extended by a random gram of that table.
#
# Returns nil when the model has no prefixes or the chosen table has no
# grams. In both cases rand(0) would produce a Float index and the lookup
# would come back nil, which previously raised further down.
def random_ngram
  return nil if @prefixes.empty?

  prefix, table = @prefixes.to_a[rand(@prefixes.length)]

  grams = table.grams
  return nil if grams.empty?

  return (prefix + grams[rand(grams.length)])
end

#random_paragraph(options = {}) ⇒ Object

Returns a randomly generated paragraph of text using the given options.

options may contain the following keys:

:min_sentences

Minimum number of sentences in the paragraph. Defaults to 3.

:max_sentences

Maximum number of sentences in the paragraph. Defaults to 6.



780
781
782
783
784
785
786
787
788
789
790
# File 'lib/raingrams/model.rb', line 780

# Returns a paragraph made of randomly generated sentences joined by a
# single space.
#
# options may contain:
#   :min_sentences  minimum number of sentences (default 3)
#   :max_sentences  maximum number of sentences, inclusive (default 6)
# options is also passed through to random_sentence.
def random_paragraph(options={})
  min_sentences = (options[:min_sentences] || 3)
  max_sentences = (options[:max_sentences] || 6)

  # never let the upper bound fall below the lower bound
  max_sentences = min_sentences if max_sentences < min_sentences

  # the +1 makes the maximum reachable and keeps rand's argument positive;
  # rand(0) returns a Float, which has no #times
  count = min_sentences + rand(max_sentences - min_sentences + 1)

  sentences = []
  count.times { sentences << random_sentence(options) }

  return sentences.join(' ')
end

#random_sentence(options = {}) ⇒ Object

Returns a randomly generated sentence of text using the given options.



760
761
762
763
764
765
766
767
768
# File 'lib/raingrams/model.rb', line 760

# Generates a random gram sentence, drops any gram equal to Tokens.start or
# Tokens.stop, and joins the rest with single spaces. A '.' is appended
# when @ignore_punctuation is truthy.
def random_sentence(options={})
  words = random_gram_sentence(options)

  words.delete_if do |word|
    word == Tokens.start || word == Tokens.stop
  end

  text = words.join(' ')
  text << '.' if @ignore_punctuation
  text
end

#random_text(options = {}) ⇒ Object

Returns randomly generated text using the given options.

options may contain the following keys:

:min_sentences

Minimum number of sentences in the paragraph. Defaults to 3.

:max_sentences

Maximum number of sentences in the paragraph. Defaults to 6.

:min_paragraphs

Minimum number of paragraphs in the text. Defaults to 3.

:max_paragraphs

Maximum number of paragraphs in the text. Defaults to 5.



805
806
807
808
809
810
811
812
813
814
815
# File 'lib/raingrams/model.rb', line 805

# Returns text made of randomly generated paragraphs separated by blank
# lines.
#
# options may contain:
#   :min_paragraphs  minimum number of paragraphs (default 3)
#   :max_paragraphs  maximum number of paragraphs, inclusive (default 5)
# options is also passed through to random_paragraph.
def random_text(options={})
  min_paragraphs = (options[:min_paragraphs] || 3)
  # with an inclusive bound, 5 yields the same 3..5 spread the earlier
  # exclusive default of 6 produced
  max_paragraphs = (options[:max_paragraphs] || 5)

  # never let the upper bound fall below the lower bound
  max_paragraphs = min_paragraphs if max_paragraphs < min_paragraphs

  # the +1 makes the maximum reachable and keeps rand's argument positive;
  # rand(0) returns a Float, which has no #times
  count = min_paragraphs + rand(max_paragraphs - min_paragraphs + 1)

  paragraphs = []
  count.times { paragraphs << random_paragraph(options) }

  return paragraphs.join("\n\n")
end

#refresh(&block) ⇒ Object

Refreshes the probability tables of the model.



820
821
822
823
824
825
# File 'lib/raingrams/model.rb', line 820

# Calls the optional block with the model, then calls build on every table
# in @prefixes. Returns the model itself.
def refresh(&block)
  block.call(self) unless block.nil?

  @prefixes.each_value do |table|
    table.build
  end

  self
end

#save(path) ⇒ Object

Saves the model to the file at the specified path.



849
850
851
852
853
854
855
# File 'lib/raingrams/model.rb', line 849

# Marshals the model into the file at path and returns the model itself.
#
# The file is opened in binary mode ('wb') because Marshal output is a
# byte stream; text mode can alter it on platforms that translate newlines.
def save(path)
  File.open(path,'wb') do |file|
    Marshal.dump(self,file)
  end

  return self
end

#sentence_commonality(sentence) ⇒ Object

Returns the joint probability of the common ngrams between the specified sentence and the model.



668
669
670
# File 'lib/raingrams/model.rb', line 668

# Feeds the sentence's ngrams that are also in the model to
# probability_of_ngrams and returns its result.
def sentence_commonality(sentence)
  shared = common_ngrams_from_sentence(sentence)
  probability_of_ngrams(shared)
end

#sentence_probability(sentence) ⇒ Object

Returns the probability of the specified sentence occurring within arbitrary text.



644
645
646
# File 'lib/raingrams/model.rb', line 644

# Feeds every ngram extracted from the sentence to probability_of_ngrams
# and returns its result.
def sentence_probability(sentence)
  extracted = ngrams_from_sentence(sentence)
  probability_of_ngrams(extracted)
end

#sentence_similarity(sentence, other_model) ⇒ Object

Returns the conditional probability of the commonality of the specified sentence against the other_model, given the commonality of the sentence against the model.



694
695
696
# File 'lib/raingrams/model.rb', line 694

# Divides other_model's commonality for the sentence by this model's
# commonality for the same sentence.
# NOTE(review): no guard for a zero or nil denominator -- confirm callers
# only pass sentences that share ngrams with this model.
def sentence_similarity(sentence,other_model)
  theirs = other_model.sentence_commonality(sentence)
  ours = sentence_commonality(sentence)

  theirs / ours
end

#set_ngram_frequency(ngram, value) ⇒ Object

Sets the frequency of the specified ngram to the specified value.



498
499
500
# File 'lib/raingrams/model.rb', line 498

# Fetches the table for the ngram via probability_table and sets the count
# of the ngram's last gram to value. Returns whatever set_count returns.
def set_ngram_frequency(ngram,value)
  table = probability_table(ngram)
  table.set_count(ngram.last,value)
end

#text_commonality(text) ⇒ Object

Returns the joint probability of the common ngrams between the specified text and the model.



676
677
678
# File 'lib/raingrams/model.rb', line 676

# Feeds the text's ngrams that are also in the model to
# probability_of_ngrams and returns its result.
def text_commonality(text)
  shared = common_ngrams_from_text(text)
  probability_of_ngrams(shared)
end

#text_probability(text) ⇒ Object

Returns the probability of the specified text occurring within arbitrary text.



652
653
654
# File 'lib/raingrams/model.rb', line 652

# Feeds every ngram extracted from the text to probability_of_ngrams and
# returns its result.
def text_probability(text)
  extracted = ngrams_from_text(text)
  probability_of_ngrams(extracted)
end

#text_similarity(text, other_model) ⇒ Object

Returns the conditional probability of the commonality of the specified text against the other_model, given the commonality of the text against the model.



703
704
705
# File 'lib/raingrams/model.rb', line 703

# Divides other_model's commonality for the text by this model's
# commonality for the same text.
# NOTE(review): no guard for a zero or nil denominator -- confirm callers
# only pass text that shares ngrams with this model.
def text_similarity(text,other_model)
  theirs = other_model.text_commonality(text)
  ours = text_commonality(text)

  theirs / ours
end

#train_with_file(path) ⇒ Object

Train the model with the contents of the specified path.



540
541
542
# File 'lib/raingrams/model.rb', line 540

# Reads the whole file at path and trains the model with its contents as
# text.
def train_with_file(path)
  contents = File.read(path)
  train_with_text(contents)
end

#train_with_ngram(ngram) ⇒ Object

Train the model with the specified ngram.



505
506
507
# File 'lib/raingrams/model.rb', line 505

# Fetches the table for the ngram via probability_table and calls count
# on it with the ngram's last gram.
def train_with_ngram(ngram)
  table = probability_table(ngram)
  table.count(ngram.last)
end

#train_with_ngrams(ngrams) ⇒ Object

Train the model with the specified ngrams.



512
513
514
# File 'lib/raingrams/model.rb', line 512

# Trains the model with each of the given ngrams in turn. Returns the
# result of each on the given collection.
def train_with_ngrams(ngrams)
  ngrams.each do |observed|
    train_with_ngram(observed)
  end
end

#train_with_paragraph(paragraph) ⇒ Object

Train the model with the specified paragraph.



526
527
528
# File 'lib/raingrams/model.rb', line 526

# Extracts the ngrams of the paragraph and trains the model with them.
#
# The paragraph parameter is what gets passed on; the previous body
# referenced an undefined name, "paragraphs", and raised NameError.
def train_with_paragraph(paragraph)
  train_with_ngrams(ngrams_from_paragraph(paragraph))
end

#train_with_sentence(sentence) ⇒ Object

Train the model with the specified sentence.



519
520
521
# File 'lib/raingrams/model.rb', line 519

# Extracts the ngrams of the sentence and trains the model with them.
def train_with_sentence(sentence)
  extracted = ngrams_from_sentence(sentence)
  train_with_ngrams(extracted)
end

#train_with_text(text) ⇒ Object

Train the model with the specified text.



533
534
535
# File 'lib/raingrams/model.rb', line 533

# Extracts the ngrams of the text and trains the model with them.
def train_with_text(text)
  extracted = ngrams_from_text(text)
  train_with_ngrams(extracted)
end

#train_with_url(url) ⇒ Object

Train the model with the inner text of the paragraph tags at the specified url.



548
549
550
551
552
553
554
# File 'lib/raingrams/model.rb', line 548

# Fetches the document at url, parses it with Hpricot, and trains the model
# with the inner text of every 'p' element. Returns the Array of the
# per-paragraph training results.
#
# NOTE(review): open(url) is resolved through Kernel#open here; confirm the
# url never comes from untrusted input, since Kernel#open does more than
# fetch URLs.
def train_with_url(url)
  page = Hpricot(open(url))
  paragraphs = page.search('p')

  paragraphs.map { |element| train_with_paragraph(element.inner_text) }
end