Class: WCC::TextAnalysis

Inherits: Object

Inheritance hierarchy: Object → WCC::TextAnalysis

show all

Defined in: lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb

Constant Summary collapse

TEXT_BLACKLIST =

/[…]/

VERSION =

'0.0.2'.freeze

Instance Attribute Summary collapse

#normalized ⇒ Object readonly

Returns the value of attribute normalized.
#stripped ⇒ Object readonly

Returns the value of attribute stripped.

Class Method Summary collapse

.default_exclusions ⇒ Object
.default_stopwords ⇒ Object

Stopwords from www.ranks.nl/stopwords.
.extract_terms(db_file) ⇒ Object

Instance Method Summary collapse

#evaluate_length ⇒ Object
#initialize(string, stop_words: self.class.default_stopwords) ⇒ TextAnalysis constructor

A new instance of TextAnalysis.
#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object

Constructor Details

#initialize(string, stop_words: self.class.default_stopwords) ⇒ `TextAnalysis`

Returns a new instance of TextAnalysis.

# File 'lib/wcc/text_analysis.rb', line 26

# Captures the text to analyse together with the stop-word list.
#
# Nothing is computed here; the arguments are only stored in instance
# variables for the analysis methods to use later.
#
# @param string [String] the raw text (presumably a String; it is stored
#   untouched in @original)
# @param stop_words [Array<String>] stored in @stop_words; when omitted, the
#   class-level default_stopwords list is used
def initialize(string, stop_words: self.class.default_stopwords)
  @stop_words = stop_words
  @original = string
end

Instance Attribute Details

#normalized ⇒ `Object` (readonly)

Returns the value of attribute normalized.



9
10
11

# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the memoized normalised text.
#
# @normalized is filled in lazily: #evaluate_length memoizes a lower-cased
# value, #terms_by_frequency memoizes one with the original casing, and
# whichever of the two runs first wins. The constructor shown on this page
# does not assign it, so this returns nil until one of those methods has
# been called.
def normalized
  @normalized
end

#stripped ⇒ `Object` (readonly)

Returns the value of attribute stripped.



9
10
11

# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the memoized token list left after ignored tokens are removed.
#
# In the code visible on this page @stripped is only assigned by
# #evaluate_length (from the result of remove_ignored_tokens); before that
# call this returns nil.
def stripped
  @stripped
end

Class Method Details

.default_exclusions ⇒ `Object`



22
23
24

# File 'lib/wcc/text_analysis.rb', line 22

# Terms read from +transcript_exclusions.txt+.
#
# The list is loaded through extract_terms on the first call and cached in a
# class-level instance variable, so later calls return the same object.
#
# @return [Array<String>] whatever extract_terms produced for the file
def self.default_exclusions
  return @default_exclusions if @default_exclusions

  @default_exclusions = extract_terms('transcript_exclusions.txt')
end

.default_stopwords ⇒ `Object`

Stopwords from www.ranks.nl/stopwords



18
19
20

# File 'lib/wcc/text_analysis.rb', line 18

# Stop words read from +stop_words.txt+ (list sourced from
# www.ranks.nl/stopwords).
#
# The list is loaded through extract_terms on the first call and cached in a
# class-level instance variable, so later calls return the same object.
#
# @return [Array<String>] whatever extract_terms produced for the file
def self.default_stopwords
  return @default_stopwords if @default_stopwords

  @default_stopwords = extract_terms('stop_words.txt')
end

.extract_terms(db_file) ⇒ `Object`

# File 'lib/wcc/text_analysis.rb', line 11

# Loads a line-per-term list from the +db+ directory located two levels above
# this source file.
#
# Each non-blank line becomes one term. Surrounding whitespace is stripped,
# which also removes the "\r" left behind by CRLF line endings, and blank
# lines are skipped. Without this a Windows-style file or a stray empty line
# would yield terms such as "the\r" or "" that can never equal a token.
#
# @param db_file [String] file name relative to the +db+ directory
# @return [Array<String>] the terms, in file order
# @raise [Errno::ENOENT] if the file does not exist
def self.extract_terms(db_file)
  path = File.join(File.dirname(__FILE__), '../../db', db_file)
  File.read(path).each_line.map(&:strip).reject(&:empty?)
end

Instance Method Details

#evaluate_length ⇒ `Object`

# File 'lib/wcc/text_analysis.rb', line 40

# Builds a plain-text report comparing the size of the text at three stages:
# as given, after normalisation, and after ignored tokens are removed.
#
# Side effects: memoizes @normalized and @stripped (exposed through the
# +normalized+ and +stripped+ readers) if they are not set yet. The helpers
# normalize, tokenize and remove_ignored_tokens are defined elsewhere in the
# class.
#
# @return [String] the report; detail lines start with a tab and the text
#   ends with a newline
def evaluate_length
  @normalized ||= normalize(@original).downcase
  # @normalized may already hold mixed-case text when #terms_by_frequency ran
  # first (it memoizes without downcasing). Lower-case at the point of use so
  # the token list does not depend on call order; downcasing text that is
  # already lower-case is a no-op.
  @stripped ||= remove_ignored_tokens(tokenize(@normalized.downcase))
  unique_tokens = @stripped.uniq

  # Squiggly heredoc strips the common literal indentation without needing
  # ActiveSupport's String#strip_heredoc; the escaped "\t" is not treated as
  # indentation, so the output is unchanged.
  <<~OUTPUT
    Original
    \tLength: #{@original.length}
    \tWordcount: #{tokenize(@original).length}

    Normalized (removed markdown chars & whitespace)
    \tLength: #{@normalized.length}
    \tWordcount: #{tokenize(@normalized).length}

    Processed (removed above & stopwords)
    \tLength: #{@stripped.join(' ').length}
    \tWordcount: #{@stripped.length}
    \t   Unique: #{unique_tokens.length}
    \t   Unique Length: #{unique_tokens.join(' ').length}
  OUTPUT
end

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ `Object`

# File 'lib/wcc/text_analysis.rb', line 31

# Extracts recurring terms from the normalised text using Phrasie.
#
# Each entry returned by the extractor is treated as an array whose first
# element is the term and whose last element is compared with phrase_length
# (presumably the number of words in the phrase; confirm against Phrasie).
# Matching terms are then passed through remove_ignored_tokens, which is
# defined elsewhere in the class.
#
# The extractor always receives the case-preserving normalised text.
# #evaluate_length memoizes a lower-cased @normalized, so reading that shared
# variable here would make the result depend on which method ran first. A
# separate memo is kept for extraction, and @normalized is still filled in
# (first writer wins) so the +normalized+ reader behaves as before.
#
# @param phrase_length [Integer] keep only entries whose last element equals
#   this value
# @param min_occurrences [Integer] forwarded to Phrasie as +occur+
# @return [Object] whatever remove_ignored_tokens returns for the term list
def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
  @phrase_text ||= normalize(@original)
  @normalized ||= @phrase_text

  candidates = ::Phrasie::Extractor.new.phrases(@phrase_text, occur: min_occurrences)
  matching = candidates.select { |entry| entry.last == phrase_length }
  remove_ignored_tokens(matching.map(&:first))
end

Class: WCC::TextAnalysis

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, stop_words: self.class.default_stopwords) ⇒ TextAnalysis

Instance Attribute Details

#normalized ⇒ Object (readonly)

#stripped ⇒ Object (readonly)

Class Method Details

.default_exclusions ⇒ Object

.default_stopwords ⇒ Object

.extract_terms(db_file) ⇒ Object

Instance Method Details

#evaluate_length ⇒ Object

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object

#initialize(string, stop_words: self.class.default_stopwords) ⇒ `TextAnalysis`

#normalized ⇒ `Object` (readonly)

#stripped ⇒ `Object` (readonly)

.default_exclusions ⇒ `Object`

.default_stopwords ⇒ `Object`

.extract_terms(db_file) ⇒ `Object`

#evaluate_length ⇒ `Object`

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ `Object`