Class: WCC::TextAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb

Constant Summary collapse

TEXT_BLACKLIST =
/[…]/
VERSION =
'0.0.2'.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string, stop_words: self.class.default_stopwords) ⇒ TextAnalysis

Returns a new instance of TextAnalysis.



26
27
28
29
# File 'lib/wcc/text_analysis.rb', line 26

# Stores the text to analyse together with the stop-word list.
#
# No processing happens here; the text is kept exactly as given in @original.
#
# @param string the raw text to analyse
# @param stop_words the stop-word list to keep on the instance; when omitted,
#   the receiver's class is asked for its default_stopwords
def initialize(string, stop_words: self.class.default_stopwords)
  @stop_words = stop_words
  @original = string
end

Instance Attribute Details

#normalized ⇒ Object (readonly)

Returns the value of attribute normalized.



9
10
11
# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the memoized normalized text.
#
# The constructor does not assign @normalized, so this returns nil until
# #evaluate_length or #terms_by_frequency has run. Both memoize into the same
# variable, but #evaluate_length stores a downcased string while
# #terms_by_frequency stores the text as normalize returned it, so the case
# of the value depends on which of the two was called first.
def normalized
  @normalized
end

#stripped ⇒ Object (readonly)

Returns the value of attribute stripped.



9
10
11
# File 'lib/wcc/text_analysis.rb', line 9

# Reader for the memoized result of remove_ignored_tokens over the tokenized,
# normalized text.
#
# In the code visible here only #evaluate_length assigns @stripped, so this
# returns nil until that method has been called.
def stripped
  @stripped
end

Class Method Details

.default_exclusions ⇒ Object



22
23
24
# File 'lib/wcc/text_analysis.rb', line 22

# Returns the terms listed in 'transcript_exclusions.txt'.
#
# The file is read through extract_terms on the first call only; the result
# is cached on the receiver and the same object is handed back afterwards.
def self.default_exclusions
  return @default_exclusions if @default_exclusions

  @default_exclusions = extract_terms('transcript_exclusions.txt')
end

.default_stopwords ⇒ Object

Stopwords from www.ranks.nl/stopwords



18
19
20
# File 'lib/wcc/text_analysis.rb', line 18

# Returns the terms listed in 'stop_words.txt'.
#
# The file is read through extract_terms on the first call only; the result
# is cached on the receiver and the same object is handed back afterwards.
def self.default_stopwords
  return @default_stopwords if @default_stopwords

  @default_stopwords = extract_terms('stop_words.txt')
end

.extract_terms(db_file) ⇒ Object



11
12
13
14
15
# File 'lib/wcc/text_analysis.rb', line 11

# Loads a newline-separated term list from the db directory two levels above
# this file.
#
# @param db_file [String] file name (or relative path) inside the db directory
# @return [Array<String>] one entry per non-blank line, with surrounding
#   whitespace removed
# @raise [Errno::ENOENT] if the file does not exist
def self.extract_terms(db_file)
  # Resolve against __dir__ (always absolute) rather than File.dirname(__FILE__),
  # which can be relative: the default_* lists are loaded lazily, so a
  # Dir.chdir between load time and first use must not change the lookup.
  path = File.expand_path(File.join('../../db', db_file), __dir__)

  # strip drops the "\r" left behind by CRLF line endings as well as stray
  # padding; blank lines would otherwise become empty-string terms.
  File.readlines(path).map(&:strip).reject(&:empty?)
end

Instance Method Details

#evaluate_length ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/wcc/text_analysis.rb', line 40

# Builds a plain-text report comparing the size of the text at three stages:
# as given, after normalize, and after tokenizing and removing ignored tokens.
#
# The "Normalized" and "Processed" figures are always computed on lowercase
# text. @normalized is shared with #terms_by_frequency, which memoizes it
# without downcasing; lowering it again here keeps the report independent of
# which method ran first (Array#uniq is case-sensitive, so "Cat" and "cat"
# would otherwise count as two unique words).
#
# Side effects: memoizes @normalized (when unset) and @stripped.
#
# @return [String] the multi-line report, one tab-indented figure per line
def evaluate_length
  # Memoized exactly as before so the `normalized` reader keeps its value.
  @normalized ||= normalize(@original).downcase
  # No-op when this method populated @normalized; fixes the case otherwise.
  lowered = @normalized.downcase
  @stripped ||= remove_ignored_tokens(tokenize(lowered))
  unique_terms = @stripped.uniq

  # The squiggly heredoc removes the common indentation itself (no
  # ActiveSupport strip_heredoc needed); the \t escapes are content, not
  # indentation, so they survive.
  <<~OUTPUT
    Original
    \tLength: #{@original.length}
    \tWordcount: #{tokenize(@original).length}

    Normalized (removed markdown chars & whitespace)
    \tLength: #{lowered.length}
    \tWordcount: #{tokenize(lowered).length}

    Processed (removed above & stopwords)
    \tLength: #{@stripped.join(' ').length}
    \tWordcount: #{@stripped.length}
    \t   Unique: #{unique_terms.length}
    \t   Unique Length: #{unique_terms.join(' ').length}
  OUTPUT
end

#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object



31
32
33
34
35
36
37
38
# File 'lib/wcc/text_analysis.rb', line 31

# Extracts recurring phrases of a given length from the normalized text.
#
# Phrasie is asked for phrases that occur at least min_occurrences times;
# entries whose last element equals phrase_length are kept, reduced to their
# first element, and passed through remove_ignored_tokens.
#
# NOTE: @normalized is shared with #evaluate_length, which memoizes a
# downcased string. If that method ran first on this instance, the text
# handed to Phrasie here is already lowercase.
#
# @param phrase_length value the last element of a Phrasie entry must equal
#   (presumably the number of words in the phrase; confirm against Phrasie)
# @param min_occurrences forwarded to Phrasie as its :occur option
# @return whatever remove_ignored_tokens returns for the selected terms
def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
  @normalized ||= normalize(@original)

  extractor = ::Phrasie::Extractor.new
  candidates = extractor.phrases(@normalized, occur: min_occurrences)
  of_requested_length = candidates.select { |entry| entry.last == phrase_length }
  selected_terms = of_requested_length.map { |entry| entry.first }

  remove_ignored_tokens(selected_terms)
end