Class: WCC::TextAnalysis

Inherits:

Object

Object
WCC::TextAnalysis

show all

Defined in: lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb

Constant Summary collapse

TEXT_BLACKLIST =

/[…]/

VERSION =

'0.0.1'.freeze

Instance Attribute Summary collapse

#normalized ⇒ Object readonly

Returns the value of attribute normalized.
#stripped ⇒ Object readonly

Returns the value of attribute stripped.

Class Method Summary collapse

.extract_terms(file) ⇒ Object

Instance Method Summary collapse

#evaluate_length ⇒ Object
#initialize(string, stop_words: STOPWORDS) ⇒ TextAnalysis constructor

A new instance of TextAnalysis.
#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object

Constructor Details

#initialize(string, stop_words: STOPWORDS) ⇒ `TextAnalysis`

Returns a new instance of TextAnalysis.

# File 'lib/wcc/text_analysis.rb', line 15

# Stores the inputs for later analysis; no processing happens here.
#
# @param string [Object] the text to analyse, kept unmodified in @original
#   (presumably a String, since evaluate_length calls #length on it; confirm)
# @param stop_words [Object] kept in @stop_words; defaults to the STOPWORDS
#   constant, which is defined outside this excerpt
def initialize(string, stop_words: STOPWORDS)
  @original = string
  @stop_words = stop_words
end

Instance Attribute Details

#normalized ⇒ `Object` (readonly)

Returns the value of attribute normalized.



9
10
11

# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the cached normalized text.
#
# The constructor does not assign @normalized, so this returns nil until an
# analysis method has populated it. Of the methods shown alongside this
# reader, evaluate_length caches a downcased value and terms_by_frequency
# caches the value exactly as normalize returns it; both use ||=, so whichever
# runs first decides what is returned here.
#
# @return [Object, nil] presumably a String (normalize is defined outside
#   this excerpt; TODO confirm)
def normalized
  @normalized
end

#stripped ⇒ `Object` (readonly)

Returns the value of attribute stripped.



9
10
11

# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the cached token list left after ignored tokens are removed.
#
# The constructor does not assign @stripped; in the code shown here it is only
# populated (via ||=) by evaluate_length, so this returns nil before that
# method has been called.
#
# @return [Object, nil] presumably an Array of String tokens, since
#   evaluate_length calls #join, #uniq and #length on it (TODO confirm)
def stripped
  @stripped
end

Class Method Details

.extract_terms(file) ⇒ `Object`



11
12
13

# File 'lib/wcc/text_analysis.rb', line 11

# Reads db/<file>.txt and returns its lines as an array of strings.
#
# The path is relative, so it is resolved against the process's current
# working directory.
#
# Lines are split on LF or CRLF so that a file saved with Windows line endings
# does not leave a trailing "\r" on every term. As with any String#split,
# trailing empty lines are dropped while interior empty lines are kept as "".
#
# @param file [#to_s] base name of the file under db/, without the .txt suffix
# @return [Array<String>] one entry per line, without line terminators
# @raise [Errno::ENOENT] if db/<file>.txt does not exist
def self.extract_terms(file)
  File.read("db/#{file}.txt").split(/\r?\n/)
end

Instance Method Details

#evaluate_length ⇒ `Object`

# File 'lib/wcc/text_analysis.rb', line 29

# Builds a plain-text report comparing the size of the text at three stages:
# the original string, the normalized string, and the token list left after
# ignored tokens are removed.
#
# Populates @normalized and @stripped on first use. Both are memoized with
# ||=, so a value that is already cached is reused as-is.
#
# @return [String] the report, de-indented via strip_heredoc (not core Ruby;
#   presumably ActiveSupport's String extension, confirm it is loaded)
def evaluate_length
  @normalized ||= normalize(@original).downcase
  # @normalized may already have been cached by terms_by_frequency, which does
  # not downcase it. Downcasing here (which leaves already-lowercase text
  # unchanged) keeps the processed tokens, and the case-sensitive uniq below,
  # independent of which method ran first.
  @stripped ||= remove_ignored_tokens(tokenize(@normalized.downcase))

  unique_tokens = @stripped.uniq

  <<-OUTPUT.strip_heredoc
    Original
    \tLength: #{@original.length}
    \tWordcount: #{tokenize(@original).length}

    Normalized (removed markdown chars & whitespace)
    \tLength: #{@normalized.length}
    \tWordcount: #{tokenize(@normalized).length}

    Processed (removed above & stopwords)
    \tLength: #{@stripped.join(' ').length}
    \tWordcount: #{@stripped.length}
    \t   Unique: #{unique_tokens.length}
    \t   Unique Length: #{unique_tokens.join(' ').length}
  OUTPUT
end

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ `Object`

# File 'lib/wcc/text_analysis.rb', line 20

# Extracts phrases from the normalized text and returns those of the requested
# length, with ignored tokens removed.
#
# Each entry returned by the extractor is filtered on its last element and
# reduced to its first element (presumably [phrase, occurrences, word count];
# confirm against Phrasie).
#
# NOTE(review): @normalized is shared with evaluate_length, which caches a
# downcased value; if that method ran first, the extractor receives lowercase
# text. Confirm whether that is intended.
#
# @param phrase_length [Integer] value each entry's last element must equal
# @param min_occurrences [Integer] passed to the extractor as occur:
# @return [Object] whatever remove_ignored_tokens returns for the kept phrases
def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
  @normalized ||= normalize(@original)

  extractor = ::Phrasie::Extractor.new
  candidates = extractor.phrases(@normalized, occur: min_occurrences)
  matching = candidates.select { |entry| entry.last == phrase_length }

  remove_ignored_tokens(matching.map(&:first))
end

Class: WCC::TextAnalysis

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, stop_words: STOPWORDS) ⇒ TextAnalysis

Instance Attribute Details

#normalized ⇒ Object (readonly)

#stripped ⇒ Object (readonly)

Class Method Details

.extract_terms(file) ⇒ Object

Instance Method Details

#evaluate_length ⇒ Object

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object

#initialize(string, stop_words: STOPWORDS) ⇒ `TextAnalysis`

#normalized ⇒ `Object` (readonly)

#stripped ⇒ `Object` (readonly)

.extract_terms(file) ⇒ `Object`

#evaluate_length ⇒ `Object`

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ `Object`