Class: WCC::TextAnalysis
- Inherits:
-
Object
- Object
- WCC::TextAnalysis
- Defined in:
- lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb
Constant Summary collapse
- TEXT_BLACKLIST =
/[…]/
- VERSION =
'0.0.1'.freeze
Instance Attribute Summary collapse
-
#normalized ⇒ Object
readonly
Returns the value of attribute normalized.
-
#stripped ⇒ Object
readonly
Returns the value of attribute stripped.
Class Method Summary collapse
- .extract_terms(file) ⇒ Object
Instance Method Summary collapse
- #evaluate_length ⇒ Object
-
#initialize(string, stop_words: STOPWORDS) ⇒ TextAnalysis
constructor
A new instance of TextAnalysis.
- #terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object
Constructor Details
#initialize(string, stop_words: STOPWORDS) ⇒ TextAnalysis
Returns a new instance of TextAnalysis.
15 16 17 18 |
# File 'lib/wcc/text_analysis.rb', line 15
#
# Stores the raw input and the stop-word collection on the instance.
# Nothing is analysed here; only the two instance variables are assigned.
#
# @param string the text to analyse, kept unchanged in @original
# @param stop_words the stop-word collection kept in @stop_words
#   (defaults to the STOPWORDS constant)
def initialize(string, stop_words: STOPWORDS)
  @stop_words = stop_words
  @original = string
end
Instance Attribute Details
#normalized ⇒ Object (readonly)
Returns the value of attribute normalized.
9 10 11 |
# File 'lib/wcc/text_analysis.rb', line 9
#
# Read-only accessor for the memoized normalized text.
#
# The constructor does not assign @normalized; in the code shown it is
# filled in lazily by #evaluate_length and #terms_by_frequency, so this
# returns nil until one of them has run.
#
# @return the current value of @normalized
def normalized
  @normalized
end
#stripped ⇒ Object (readonly)
Returns the value of attribute stripped.
9 10 11 |
# File 'lib/wcc/text_analysis.rb', line 9
#
# Read-only accessor for the memoized token list with ignored tokens removed.
#
# The constructor does not assign @stripped; in the code shown it is
# filled in by #evaluate_length, so this returns nil until that has run.
#
# @return the current value of @stripped
def stripped
  @stripped
end
Class Method Details
.extract_terms(file) ⇒ Object
11 12 13 |
# File 'lib/wcc/text_analysis.rb', line 11
#
# Reads db/<file>.txt and returns its contents split on newlines.
#
# The path is relative, so it is resolved against the process's current
# working directory.
#
# @param file base name (without the ".txt" extension) of a file under db/
# @return [Array<String>] the file's lines; String#split drops trailing
#   empty entries
def self.extract_terms(file)
  path = "db/#{file}.txt"
  contents = File.read(path)
  contents.split("\n")
end
Instance Method Details
#evaluate_length ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# Builds a plain-text report comparing three views of the input text:
#   * Original   - @original as given: character length and token count.
#   * Normalized - normalize(@original).downcase, memoized in @normalized:
#                  character length and token count.
#   * Processed  - remove_ignored_tokens(tokenize(@normalized)), memoized in
#                  @stripped: joined length, token count, unique token count
#                  and the joined length of the unique tokens.
#
# Returns the report as a String built from a heredoc.
#
# NOTE(review): #terms_by_frequency memoizes @normalized WITHOUT .downcase.
#   Both methods use ||=, so whichever runs first decides the value the other
#   one (and the #normalized reader) sees - confirm this is intended.
# NOTE(review): String#strip_heredoc is not core Ruby; presumably it comes
#   from ActiveSupport - confirm it is loaded wherever this runs.
# NOTE(review): normalize, tokenize and remove_ignored_tokens are not shown
#   on this page; their exact behaviour should be checked in the source file.
# NOTE: this rendering flattened the method onto one line, so the heredoc's
#   original line breaks and indentation are not recoverable from here.
# File 'lib/wcc/text_analysis.rb', line 29 def evaluate_length @normalized ||= normalize(@original).downcase @stripped ||= remove_ignored_tokens(tokenize(@normalized)) <<-OUTPUT.strip_heredoc Original \tLength: #{@original.length} \tWordcount: #{tokenize(@original).length} Normalized (removed markdown chars & whitespace) \tLength: #{@normalized.length} \tWordcount: #{tokenize(@normalized).length} Processed (removed above & stopwords) \tLength: #{@stripped.join(' ').length} \tWordcount: #{@stripped.length} \t Unique: #{@stripped.uniq.length} \t Unique Length: #{@stripped.uniq.join(' ').length} OUTPUT end |
#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object
20 21 22 23 24 25 26 27 |
# File 'lib/wcc/text_analysis.rb', line 20
#
# Extracts phrases from the normalized text with Phrasie, keeps those whose
# last tuple element equals +phrase_length+, and returns their first
# elements after passing them through remove_ignored_tokens.
#
# @normalized is memoized here as normalize(@original) if it is not
# already set.
#
# @param phrase_length value compared against the last element of each
#   extracted phrase tuple (presumably the number of words in the phrase -
#   confirm against Phrasie's documentation)
# @param min_occurrences forwarded to Phrasie as its +occur:+ option
# @return whatever remove_ignored_tokens returns for the selected terms
def terms_by_frequency(phrase_length: 1, min_occurrences: 3)
  @normalized ||= normalize(@original)

  extractor = ::Phrasie::Extractor.new
  phrases = extractor.phrases(@normalized, occur: min_occurrences)
  matching = phrases.select { |phrase| phrase.last == phrase_length }

  remove_ignored_tokens(matching.map(&:first))
end