Module: Dejunk

Extended by:
Dejunk
Included in:
Dejunk
Defined in:
lib/dejunk.rb,
lib/dejunk/version.rb

Constant Summary collapse

MASH_CHARS =

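Characters typed on the middle (home) row of a QWERTY keyboard (the letters in both upper and lower case, plus “;” and “:”), together with the space character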

'ASDFGHJKLasdfghjkl;: '
MASH_BIGRAMS =

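Neighboring key pairs on a QWERTY keyboard, each in both orders, except “er” and “re” which each make up >1% of bigrams in our “good” sample, plus each letter repeated or next to a space (before or after). The literal list below is authoritative: it also leaves out “io”/“oi” and “xc”/“cx”.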

(
  # For every lowercase ASCII letter: the letter followed by a space, and the
  # letter doubled.
  ("abcdefghijklmnopqrstuvwxyz".chars.flat_map { |l| ["#{l} ", "#{l}#{l}"] }) +
  # Hand-listed adjacent-key pairs (row neighbors first, then pairs that span
  # two keyboard rows).
  %w( qw we rt ty yu ui op as sd df fg gh hj jk kl zx xd cv vb bn nm qa az ws sx ed dc rf fv tg gb yh hn uj jm ik ol )
  # Add the reverse of every bigram, then collapse duplicates into a frozen Set.
).flat_map { |bigram| [bigram, bigram.reverse] }.to_set.freeze
VERSION =
"0.5.0"

Instance Method Summary collapse

Instance Method Details

#bigram_similarity_to_corpus(string) ⇒ Object

Cosine similarity between vector of frequencies of bigrams within string, and vector of frequencies of all bigrams within corpus



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/dejunk.rb', line 73

# Cosine similarity between the vector of bigram frequencies within +string+
# and the vector of bigram frequencies of the corpus.
#
# @param string [String, nil] text to score; it is handed to #bigrams as-is
# @return [Numeric] the similarity, or 0.0 when +string+ yields no bigrams
def bigram_similarity_to_corpus(string)
  bigrams = bigrams(string)

  # With no bigrams the frequency vector is empty: the sums below would be nil
  # and the arithmetic would raise, so report "no similarity" instead.
  return 0.0 if bigrams.empty?

  counts = bigrams.each_with_object(Hash.new(0)) { |bigram, tally| tally[bigram] += 1 }

  # Relative frequency of each distinct bigram within the string.
  freqs = counts.each_with_object({}) do |(bigram, count), result|
    result[bigram] = count.to_f / bigrams.length
  end

  # Dot product with the corpus vector; bigrams missing from the corpus
  # contribute 0 (nil.to_f).
  numerator = freqs.
    map { |bigram, freq| corpus_bigram_frequencies[bigram].to_f * freq }.inject(&:+)
  denominator = corpus_bigram_magnitude * (freqs.values.map { |v| v**2 }.inject(&:+) ** 0.5)

  numerator / denominator
end

#bigram_similarity_to_mashing(string) ⇒ Object

Cosine similarity between vector of frequencies of bigrams within string, and vector which assumes all bigrams made of neighboring pairs on the keyboard are equally likely, and no others appear



92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/dejunk.rb', line 92

# Cosine similarity between the vector of bigram frequencies within +string+
# and the keyboard-mashing bigram frequency vector.
#
# @param string [String, nil] text to score; it is handed to #bigrams as-is
# @return [Numeric] the similarity, or 0.0 when +string+ yields no bigrams
def bigram_similarity_to_mashing(string)
  bigrams = bigrams(string)

  # With no bigrams the frequency vector is empty: the sums below would be nil
  # and the arithmetic would raise, so report "no similarity" instead.
  return 0.0 if bigrams.empty?

  counts = bigrams.each_with_object(Hash.new(0)) { |bigram, tally| tally[bigram] += 1 }

  # Relative frequency of each distinct bigram within the string.
  freqs = counts.each_with_object({}) do |(bigram, count), result|
    result[bigram] = count.to_f / bigrams.length
  end

  # Dot product with the mashing vector; bigrams absent from it contribute 0
  # (nil.to_f).
  numerator = freqs.map { |bigram, freq| freq * mashing_bigram_frequencies[bigram].to_f }.inject(&:+)
  denominator = mashing_bigram_magnitude * (freqs.values.map { |v| v**2 }.inject(&:+) ** 0.5)

  numerator / denominator
end

#bigrams(string) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/dejunk.rb', line 107

# Splits +string+ into its overlapping two-character sequences.
#
# The string is stripped first; each character is downcased, every ASCII
# digit is replaced by "0" and every whitespace character by a plain space.
#
# @param string [String, nil] text to split
# @return [Array<String>] the normalized bigrams, in order of appearance;
#   empty when +string+ is nil or has fewer than two characters once stripped
def bigrams(string)
  return [] if string.nil?

  trimmed = string.strip
  return [] if trimmed.length < 2

  trimmed.chars.each_cons(2).map do |first, second|
    pair = "#{first.mb_chars.downcase}#{second.mb_chars.downcase}"
    pair.gsub(/[0-9]/, '0'.freeze).gsub(/[[:space:]]/, ' '.freeze)
  end
end

#is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: []) ⇒ Boolean

Returns:

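  • (false, Symbol) — false when the string is not considered junk; otherwise a Symbol naming the check that flagged it (e.g. :no_alpha, :too_short, :mashing_bigrams)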


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/dejunk.rb', line 19

# Decides whether +string+ looks like junk input by running a fixed sequence
# of checks and stopping at the first one that fires.
#
# Despite the predicate name, the result is not strictly boolean: a junk
# string yields a (truthy) Symbol naming the check that fired.
#
# @param string [String, nil] the text to examine
# @param min_alnum_chars [Integer] forwarded to too_few_alphanumeric_chars?
#   (presumably the minimum number of alphanumeric characters required;
#   confirm against that helper)
# @param whitelist_regexes [Array<Regexp>] a string matching any of these is
#   never reported as junk
# @param whitelist_strings [Array<String>] a string included here is never
#   reported as junk
# @return [false, Symbol] false when the string is whitelisted or no check
#   fires; otherwise the reason Symbol (:no_alpha, :too_short,
#   :one_char_repeat, :starts_with_punct, :too_many_short_words,
#   :three_chars_repeat_twice, :fuck, :missing_vowels, :asdf_row,
#   :mashing_bigrams or :unlikely_bigrams)
def is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: [])
  # Whitelisting takes precedence over every check below.
  if string && (whitelist_strings.include?(string) || whitelist_regexes.any? { |re| string =~ re })
    return false
  end

  # nil, or no alphabetic character anywhere in the string
  return :no_alpha if string.nil? || string !~ /[[:alpha:]]/

  normed = normalize_for_comparison(string)

  # Rule-based checks, evaluated in this order; the first hit wins.
  return :too_short if too_few_alphanumeric_chars?(normed, min_alnum_chars)
  return :one_char_repeat if excessive_single_character_repeats?(string, normed)
  return :starts_with_punct if starts_with_disallowed_punctuation?(string)
  return :too_many_short_words if too_many_short_words?(string)
  return :three_chars_repeat_twice if three_plus_chars_repeat_twice?(string)
  return :fuck if string =~ /\bfuck/i
  return :missing_vowels if missing_vowels?(string, normed)
  return :asdf_row if asdf_row_and_suspicious?(string)

  # Fraction of characters whose code point is below 128; the bigram-based
  # checks below only run when it exceeds 0.8.
  ascii_proportion = string.chars.count { |c| c.ord < 128 }.to_f / string.length

  # The bigrams look like the ones you'd get from keyboard mashing
  # (the probability shouldn't be taken too literally, > 0.25 is almost all
  # mashing in practice on our corpus)
  if string.length > 1 && ascii_proportion > 0.8
    if probability_of_keyboard_mashing(string) > 0.25
      return :mashing_bigrams
    end
  end

  # The bigrams don't look like the bigrams in legitimate strings
  if string.length > 6 && ascii_proportion > 0.8
    corpus_similarity = bigram_similarity_to_corpus(string)

    # The similarity is more accurate for longer strings, and with more ASCII,
    # so increase the value (= lower the threshold) for shorter strings and
    # strings with less ASCII.
    score = corpus_similarity * (1.0/ascii_proportion**2) * (1.0/(1 - Math.exp(-0.1*string.length)))

    if score < 0.03
      return :unlikely_bigrams
    elsif score < 0.08 && string !~ /\A([[:upper:]][[:lower:]]+ )*[[:upper:]][[:lower:]]+\z/
      # The similarity ignores casing, so instead use a higher threshold if
      # the casing looks wrong
      return :unlikely_bigrams
    elsif score < bigram_similarity_to_mashing(string)
      # The adjusted corpus score is lower than the string's similarity to
      # the keyboard-mashing bigram vector.
      return :mashing_bigrams
    end
  end

  false
end

#normalize_for_comparison(string) ⇒ Object



146
147
148
149
150
151
152
153
# File 'lib/dejunk.rb', line 146

# Reduces +string+ to a bare lowercase alphanumeric form for comparisons.
#
# Steps: NFKD-normalize, drop nonspacing marks (\p{Mn}), drop every
# non-alphanumeric character, then downcase.
#
# @param string [String] text to normalize
# @return [Object] the normalized text, as produced by the mb_chars call chain
def normalize_for_comparison(string)
  decomposed    = string.mb_chars.unicode_normalize(:nfkd)
  without_marks = decomposed.gsub(/\p{Mn}+/, ''.freeze)
  alnum_only    = without_marks.gsub(/[^[:alnum:]]+/, ''.freeze)

  alnum_only.downcase
end

#probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1) ⇒ Object

The Bayesian probability of a string being keyboard mashing, given the probability of each bigram if drawn either from the legit corpus or from mashing, and an a priori probability of mashing.

The probability shouldn’t be taken too literally, but it’s a useful indicator.



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/dejunk.rb', line 128

# Bayesian estimate that +string+ is keyboard mashing.
#
# The likelihood of the string's bigrams under each hypothesis is the product
# of the per-bigram probabilities from mashing_probability and
# corpus_probability; those are combined with the prior using Bayes' rule.
# Products are taken in BigDecimal. The value is an indicator and should not
# be read too literally.
#
# @param string [String, nil] text to score
# @param apriori_probability_of_mashing [Numeric] prior probability that any
#   given string is mashing
# @return [Numeric] 0 when the string has no bigrams, otherwise the posterior
#   probability of mashing
def probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1)
  pairs = bigrams(string)
  return 0 unless pairs.present?

  likelihood_if_mashing = pairs.map { |pair| BigDecimal(mashing_probability(pair).to_s) }.inject(&:*)
  likelihood_if_corpus  = pairs.map { |pair| BigDecimal(corpus_probability(pair).to_s) }.inject(&:*)

  # P(bigrams | mashing) * P(mashing)
  mashing_term = likelihood_if_mashing * apriori_probability_of_mashing
  # P(bigrams | legit) * P(legit)
  corpus_term = likelihood_if_corpus * (1 - apriori_probability_of_mashing)

  mashing_term / (mashing_term + corpus_term)
end