Class: WCC::TextAnalysis
- Inherits:
-
Object
- Object
- WCC::TextAnalysis
- Defined in:
- lib/wcc/text_analysis.rb,
lib/wcc/text_analysis/version.rb
Constant Summary collapse
- TEXT_BLACKLIST =
/[…]/
- VERSION =
'0.0.2'.freeze
Instance Attribute Summary collapse
-
#normalized ⇒ Object
readonly
Returns the value of attribute normalized.
-
#stripped ⇒ Object
readonly
Returns the value of attribute stripped.
Class Method Summary collapse
- .default_exclusions ⇒ Object
-
.default_stopwords ⇒ Object
Stopwords from www.ranks.nl/stopwords.
- .extract_terms(db_file) ⇒ Object
Instance Method Summary collapse
- #evaluate_length ⇒ Object
-
#initialize(string, stop_words: self.class.default_stopwords) ⇒ TextAnalysis
constructor
A new instance of TextAnalysis.
- #terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object
Constructor Details
#initialize(string, stop_words: self.class.default_stopwords) ⇒ TextAnalysis
Returns a new instance of TextAnalysis.
26 27 28 29 |
# File 'lib/wcc/text_analysis.rb', line 26 def initialize(string, stop_words: self.class.default_stopwords) @original = string @stop_words = stop_words end |
Instance Attribute Details
#normalized ⇒ Object (readonly)
Returns the value of attribute normalized.
9 10 11 |
# File 'lib/wcc/text_analysis.rb', line 9 def normalized @normalized end |
#stripped ⇒ Object (readonly)
Returns the value of attribute stripped.
9 10 11 |
# File 'lib/wcc/text_analysis.rb', line 9 def stripped @stripped end |
Class Method Details
.default_exclusions ⇒ Object
22 23 24 |
# File 'lib/wcc/text_analysis.rb', line 22 def self.default_exclusions @default_exclusions ||= extract_terms('transcript_exclusions.txt') end |
.default_stopwords ⇒ Object
Stopwords from www.ranks.nl/stopwords.
18 19 20 |
# File 'lib/wcc/text_analysis.rb', line 18 def self.default_stopwords @default_stopwords ||= extract_terms('stop_words.txt') end |
.extract_terms(db_file) ⇒ Object
11 12 13 14 15 |
# File 'lib/wcc/text_analysis.rb', line 11 def self.extract_terms(db_file) File.read( File.join(File.dirname(__FILE__), '../../db', db_file), ).split("\n") end |
Instance Method Details
#evaluate_length ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/wcc/text_analysis.rb', line 40 def evaluate_length @normalized ||= normalize(@original).downcase @stripped ||= remove_ignored_tokens(tokenize(@normalized)) <<-OUTPUT.strip_heredoc Original \tLength: #{@original.length} \tWordcount: #{tokenize(@original).length} Normalized (removed markdown chars & whitespace) \tLength: #{@normalized.length} \tWordcount: #{tokenize(@normalized).length} Processed (removed above & stopwords) \tLength: #{@stripped.join(' ').length} \tWordcount: #{@stripped.length} \t Unique: #{@stripped.uniq.length} \t Unique Length: #{@stripped.uniq.join(' ').length} OUTPUT end |
#terms_by_frequency(phrase_length: 1, min_occurrences: 3) ⇒ Object
31 32 33 34 35 36 37 38 |
# File 'lib/wcc/text_analysis.rb', line 31 def terms_by_frequency(phrase_length: 1, min_occurrences: 3) @normalized ||= normalize(@original) terms = ::Phrasie::Extractor.new .phrases(@normalized, occur: min_occurrences) .select { |t| t.last == phrase_length } .map(&:first) remove_ignored_tokens terms end |