Module: Rakit::WordCount

Defined in:
lib/rakit/word_count.rb

Overview

Schema-driven token frequency counting over JSON content (keys or values). CLI: `rakit word-count` (`--json-keys`). See specs/004-word-count/contracts/ruby-api.md.

Class Method Summary collapse

Class Method Details

.count(request) ⇒ Rakit::Generated::WordCountResult



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/rakit/word_count.rb', line 14

# Counts token frequencies over the keys of a JSON document.
#
# The JSON text comes from resolve_json_input(request). If that helper
# returns a WordCountResult instead of text, it is returned as-is.
# Otherwise the text is parsed, every key is collected and tokenized, and
# the tokens are filtered, tallied and ranked according to request.config
# (a fresh WordCountConfig is used when the request carries none).
#
# total_tokens is the number of tokens before filtering; unique_tokens is
# the number of ranked entries that remain after filter_count_sort.
#
# JSON parse errors and ENOENT/EACCES errors are not raised to the caller;
# they are reported as a failed result with exit_code 1.
#
# @param request [#config] word-count request
# @return [Rakit::Generated::WordCountResult]
def count(request)
  source = resolve_json_input(request)
  return source if source.is_a?(Rakit::Generated::WordCountResult)

  document = JSON.parse(source)
  settings = request.config || Rakit::Generated::WordCountConfig.new
  tokens = extract_keys(document).flat_map { |key| normalize_and_split(key, settings) }
  ranked = filter_count_sort(tokens, settings)
  entries = ranked.map do |token, occurrences|
    Rakit::Generated::TokenCount.new(token: token, count: occurrences)
  end

  Rakit::Generated::WordCountResult.new(
    success: true,
    message: "",
    counts: entries,
    exit_code: 0,
    stderr: "",
    total_tokens: tokens.size,
    unique_tokens: ranked.size
  )
rescue JSON::ParserError, Errno::ENOENT, Errno::EACCES => e
  # Malformed JSON and unreadable input share one failure shape.
  Rakit::Generated::WordCountResult.new(
    success: false,
    message: e.message,
    exit_code: 1,
    stderr: e.message
  )
end

.extract_keys(obj) ⇒ Object

T004: Recursive key extraction from JSON structure. Returns array of key strings (duplicates preserved).



52
53
54
55
56
57
58
59
60
61
# File 'lib/rakit/word_count.rb', line 52

# Collects every hash key found anywhere inside a parsed JSON structure.
#
# For a Hash, its own keys (converted with to_s) come first, followed by the
# keys found inside its values. Arrays are walked element by element.
# Anything else contributes nothing. Duplicates are kept.
#
# @param obj [Object] parsed JSON value (Hash, Array or scalar)
# @return [Array<String>] key names, duplicates preserved
def extract_keys(obj)
  return obj.flat_map { |element| extract_keys(element) } if obj.is_a?(Array)
  return [] unless obj.is_a?(Hash)

  own_keys = obj.keys.map(&:to_s)
  nested_keys = obj.values.flat_map { |child| extract_keys(child) }
  own_keys + nested_keys
end

.filter_count_sort(tokens, config) ⇒ Object

T006: Filter (min_length, stopwords), count, sort (count desc, token asc), top_n.



76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/rakit/word_count.rb', line 76

# Filters tokens, tallies them, and ranks the tallies.
#
# A token is dropped when it is shorter than config.min_token_length or when
# it appears in config.stopwords (both sides are downcased for the comparison
# when config.case_insensitive is set). Survivors are counted and ordered by
# count descending, then token ascending. A positive config.top_n truncates
# the ranking; nil or a non-positive value keeps every entry.
#
# @param tokens [Array<String>] tokens to rank
# @param config [#min_token_length, #stopwords, #case_insensitive, #top_n]
# @return [Array<Array(String, Integer)>] [token, count] pairs, ranked
def filter_count_sort(tokens, config)
  fold_case = config.case_insensitive
  shortest = config.min_token_length || 0
  blocked = (config.stopwords || []).map { |word| fold_case ? word.downcase : word }.to_set

  kept = tokens.reject do |token|
    token.length < shortest || blocked.include?(fold_case ? token.downcase : token)
  end

  ranked = kept.tally.sort_by { |token, occurrences| [-occurrences, token] }
  limit = config.top_n || 0
  return ranked unless limit.positive?

  ranked.take(limit)
end

.normalize_and_split(token_string, config) ⇒ Object

T005: Normalize and split one key into tokens. Pipeline: case -> snake/kebab -> camelCase.



64
65
66
67
68
69
70
71
72
73
# File 'lib/rakit/word_count.rb', line 64

# Splits one JSON key into lowercase word tokens.
#
# Steps, in order: optional split on "_" / "-" (config.split_snake_kebab),
# optional split before each uppercase letter (config.split_camel_case),
# then lowercasing and removal of empty pieces.
#
# Lowercasing is deliberately the last step. Folding case before the
# camelCase split would erase the uppercase boundaries that split relies on,
# which made split_camel_case a no-op whenever case_insensitive was enabled.
# Since the result is always lowercased, config.case_insensitive does not
# need to be consulted here.
#
# @param token_string [String, nil] raw key; any non-String yields []
# @param config [#split_snake_kebab, #split_camel_case] splitting switches
# @return [Array<String>] lowercase, non-empty tokens in source order
def normalize_and_split(token_string, config)
  return [] unless token_string.is_a?(String)

  parts = [token_string]
  parts = parts.flat_map { |part| part.split(/_|-/) } if config.split_snake_kebab
  # Zero-width lookahead: cut in front of every uppercase letter, keeping it.
  parts = parts.flat_map { |part| part.split(/(?=[A-Z])/) } if config.split_camel_case
  parts.map(&:downcase).reject(&:empty?)
end