Class: WCC::TextAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb

Constant Summary collapse

TEXT_BLACKLIST =
/[…]/
VERSION =
'0.0.1'.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, stop_words: STOPWORDS) ⇒ TextAnalysis

Returns a new instance of TextAnalysis.



15
16
17
18
# File 'lib/wcc/text_analysis.rb', line 15

# Stores the text to analyse and the stop-word collection to use.
#
# Nothing is computed here; both arguments are kept as given for the other
# instance methods to use.
#
# @param string the text to analyse (stored as @original)
# @param stop_words the stop-word collection (stored as @stop_words);
#   defaults to the STOPWORDS constant
def initialize(string, stop_words: STOPWORDS)
  @stop_words = stop_words
  @original = string
end

Instance Attribute Details

#normalized ⇒ Object (readonly)

Returns the value of attribute normalized.



9
10
11
# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the cached normalized text.
#
# In the code shown, @normalized is only assigned lazily (with ||=) inside
# #evaluate_length and #terms_by_frequency; the constructor does not set it,
# so this returns nil until one of those methods has been called.
#
# @return [Object, nil] the cached normalized text, or nil if not yet computed
def normalized
  @normalized
end

#stripped ⇒ Object (readonly)

Returns the value of attribute stripped.



9
10
11
# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the cached token list left after ignored tokens are removed.
#
# In the code shown, @stripped is only assigned (lazily, with ||=) inside
# #evaluate_length; the constructor does not set it, so this returns nil
# until that method has been called.
#
# @return [Object, nil] the cached tokens (presumably an Array, since
#   #evaluate_length calls join/uniq/length on it), or nil if not yet computed
def stripped
  @stripped
end

Class Method Details

.extract_terms(file) ⇒ Object



11
12
13
# File 'lib/wcc/text_analysis.rb', line 11

# Reads a term list from "db/<file>.txt" and returns its lines.
#
# The path is relative to the current working directory. The content is
# split on "\n", so trailing empty lines are dropped and any "\r" characters
# are left in place.
#
# @param file the base name of the file, without directory or ".txt" suffix
# @return [Array<String>] the lines of the file
# @raise [SystemCallError] from File.read, e.g. Errno::ENOENT if the file is missing
def self.extract_terms(file)
  path = "db/#{file}.txt"
  contents = File.read(path)
  contents.split("\n")
end

Instance Method Details

#evaluate_length ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/wcc/text_analysis.rb', line 29

# Builds a plain-text report comparing the size of the text at three stages:
# the original input, the normalized text, and the token list that remains
# after ignored tokens are removed.
#
# @normalized is shared with #terms_by_frequency, which memoizes the
# case-preserving result of normalize. It is cached here in that same form,
# and lower-casing is applied only while building @stripped. Caching the
# lower-cased text in @normalized made the value seen by #normalized,
# #terms_by_frequency and the stripping step depend on which method happened
# to be called first.
#
# @return [String] the multi-line report; detail lines are tab-indented
def evaluate_length
  @normalized ||= normalize(@original)
  # Tokens are lower-cased before ignored tokens are removed, as before.
  @stripped ||= remove_ignored_tokens(tokenize(@normalized.downcase))

  <<-OUTPUT.strip_heredoc
    Original
    \tLength: #{@original.length}
    \tWordcount: #{tokenize(@original).length}

    Normalized (removed markdown chars & whitespace)
    \tLength: #{@normalized.length}
    \tWordcount: #{tokenize(@normalized).length}

    Processed (removed above & stopwords)
    \tLength: #{@stripped.join(' ').length}
    \tWordcount: #{@stripped.length}
    \t   Unique: #{@stripped.uniq.length}
    \t   Unique Length: #{@stripped.uniq.join(' ').length}
  OUTPUT
end

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object



20
21
22
23
24
25
26
27
# File 'lib/wcc/text_analysis.rb', line 20

# Extracts phrases from the normalized text and returns those of the requested length.
#
# The normalized text is computed once via normalize and cached in
# @normalized. Each entry returned by Phrasie is kept when its last element
# equals phrase_length; the entry's first element is what gets collected.
# The collected terms are then passed through remove_ignored_tokens.
#
# @param phrase_length the value an entry's last element must equal to be kept
# @param min_occurrences forwarded to Phrasie as the `occur:` option
# @return [Object] whatever remove_ignored_tokens returns for the collected terms
def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
  @normalized ||= normalize(@original)

  extractor = ::Phrasie::Extractor.new
  candidates = extractor.phrases(@normalized, occur: min_occurrences)

  matching_terms = candidates.each_with_object([]) do |entry, kept|
    kept << entry.first if entry.last == phrase_length
  end

  remove_ignored_tokens(matching_terms)
end