Class: WordFilter::Filter

Inherits:
Object
  • Object
show all
Defined in:
lib/word_filter.rb

Constant Summary collapse

# Filter levels; filterString compares @filterLevel against these to choose
# which vowel-variation check to run.
NONE =
0
REPEATED_VOWELS =
1
SWAPPABLE_VOWELS =
2
SWAPPABLE_AND_REPEATED_VOWELS =
3
SWAPPABLE_AND_REPEATED_VOWELS_INCLUDING_NONE =
4
# Email address: local part, "@", host, a literal dot, then 2-4 letters.
# The dot is escaped with one backslash; the earlier "\\." required a literal
# backslash in the input, so ordinary addresses never matched.
@@emailRegex =
/[a-zA-Z0-9._%+-]+@[a-z0-9.-]+\.[a-zA-Z]{2,4}/
# A single digit, or a number from zero to twenty spelled out.
@@alphaNumericDigit =
/(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|\d)/
# The three patterns below are assembled from single-quoted fragments so that
# \b, \s, \d, \W and \. reach the regex engine unchanged. In double-quoted
# strings Ruby turns "\b" into a backspace and "\s" into a space, and drops
# the backslash from "\d", "\W" and "\.".
# One or more digits/number words between word boundaries.
@@digitsRegex =
Regexp.new('\b(\s*' + @@alphaNumericDigit.source + ')+\b')
# House number, one to five name tokens, then a street-type word.
@@streetNameRegex =
Regexp.new('\b(\s*' + @@alphaNumericDigit.source + ')+\s([a-z\d]+\.?\s*){1,5}\b(avenue|ave|street|st|court|ct|circle|boulevard|blvd|lane|ln|trail|tr|loop|lp|route|rt|drive|dr|road|rd|terrace|tr|way|wy|highway|hiway|hw)\b')
# Seven digits/number words (3 + 4), optionally separated by non-word characters.
@@phoneNumber =
Regexp.new('((' + @@alphaNumericDigit.source + ')\W*?){3}((' + @@alphaNumericDigit.source + ')\W*?){4}\b')
# http/https URL with optional port and path.
@@urlRegex =
/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Filter

Returns a new instance of Filter.



21
22
23
# File 'lib/word_filter.rb', line 21

# Builds a filter whose level starts at NONE. Word lists are not loaded
# here; that happens in filterInit.
def initialize
  @filterLevel = NONE
end

Instance Attribute Details

#badWordsRegex ⇒ Object

Returns the value of attribute badWordsRegex.



25
26
27
# File 'lib/word_filter.rb', line 25

# Returns @badWordsRegex, the pattern filterInit obtains from loadBadwords.
# Reads as nil until something has assigned it.
def badWordsRegex
  @badWordsRegex
end

#filterLevel ⇒ Object

Returns the value of attribute filterLevel.



25
26
27
# File 'lib/word_filter.rb', line 25

# Returns @filterLevel. initialize sets it to NONE, and filterString compares
# it with the level constants to pick the vowel-variation check.
def filterLevel
  @filterLevel
end

#goodWords ⇒ Object

Returns the value of attribute goodWords.



25
26
27
# File 'lib/word_filter.rb', line 25

# Returns @goodWords, the word list filterInit loads through loadDictionary
# and stripGoodWords checks each token against. Reads as nil until assigned.
def goodWords
  @goodWords
end

Instance Method Details

#filterInit(dictionaryFile, badwordslist) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/word_filter.rb', line 27

# Loads the word lists and precomputes the vowel-variation patterns.
#
# dictionaryFile - path passed to loadDictionary; the result becomes @goodWords.
# badwordslist   - path passed to loadBadwords; the result becomes @badWordsRegex.
#
# Four further regexes are derived from the source of @badWordsRegex by
# rewriting every vowel in it:
#   [aeiou]+  any vowels, one or more   -> @vowelSwappedAndRepeatedRegex
#   [aeiou]*  any vowels, zero or more  -> @vowelSwappedAndRepeatedRegexIncludingEmpty
#   \1+       the same vowel, repeated  -> @vowelRepeatedRegex
#   [aeiou]   exactly one, any vowel    -> @vowelSwappedRegex
def filterInit(dictionaryFile, badwordslist)
  @goodWords = loadDictionary(dictionaryFile)
  # The original Java class requires three other word lists; fixed patterns
  # stand in for the dating and deviant lists here.
  @datingWordsRegex = /dating/
  @deviantWordsRegex = /deviant/
  @badWordsRegex = loadBadwords(badwordslist)

  bad_source = @badWordsRegex.source
  any_vowel = /([aeiou])/
  rewrite_vowels = lambda do |replacement|
    Regexp.new(bad_source.gsub(any_vowel, replacement))
  end

  @vowelSwappedAndRepeatedRegex = rewrite_vowels.call("[aeiou]+")
  @vowelSwappedAndRepeatedRegexIncludingEmpty = rewrite_vowels.call("[aeiou]*")
  @vowelRepeatedRegex = rewrite_vowels.call("\\1+")
  @vowelSwappedRegex = rewrite_vowels.call("[aeiou]")
end

#filterString(input) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/word_filter.rb', line 62

# Classifies +input+ and reports the first reason, if any, to reject it.
#
# The input is stripped and downcased, then checked in the order below; the
# first check that hits decides the result:
#
#   0: string is safe to post
#   1: string contains an email address
#   2: string contains a URL
#   3: string contains a street address
#   4: string contains a phone number
#   5: string contains a dating word
#   6: string contains a deviant word
#   7: string contains a bad word exactly as written
#   9: string contains a bad word once obfuscation is undone
#
# Always returns one of the Integers above. Nothing is rescued in this
# method, so errors propagate to the caller rather than being reported as -1.
def filterString(input)
  text = input.strip.downcase
  return 0 if text.empty?

  return 1 if @@emailRegex.match(text)
  return 2 if @@urlRegex.match(text)
  return 3 if @@streetNameRegex.match(text)
  return 4 if @@phoneNumber.match(text)

  # Collapse whitespace runs. This has to be a Regexp: a String pattern is
  # matched literally, and the non-bang gsub result must not be thrown away.
  working = text.gsub(/\s+/, " ")
  working.gsub!(/["',.;:?-]/, " ")
  working.gsub!(/!+\s/, " ")
  working.gsub!(/!+\z/, " ")
  working.gsub!(/\br\su/, " ")

  clean = stripGoodWords(working)
  return 0 if clean.nil? || clean.empty?

  return 5 if @datingWordsRegex.match(clean)
  return 6 if @deviantWordsRegex.match(clean)
  return 7 if @badWordsRegex.match(clean)

  # From here on, try to undo common ways of disguising a bad word.

  # Squeeze out all whitespace ("b a d" -> "bad").
  return 9 if @badWordsRegex.match(clean.gsub(/[ \t\n\f\r]/, ""))

  # Drop everything that is not a lowercase letter ("b.a.d" -> "bad").
  return 9 if @badWordsRegex.match(clean.gsub(/[^a-z]/, ""))

  # Map look-alike characters to letters. The vertical ones (! 1 |) are
  # ambiguous, so they are tried once as "i" and once as "l".
  with_i = clean.tr("@683!1|0$+", "abbeiiiost")
  return 9 if @badWordsRegex.match(with_i)

  with_l = clean.tr("@683!1|0$+", "abbelllost")
  return 9 if @badWordsRegex.match(with_l)

  level_regex =
    case @filterLevel
    when REPEATED_VOWELS
      @vowelRepeatedRegex
    when SWAPPABLE_VOWELS
      @vowelSwappedRegex
    when SWAPPABLE_AND_REPEATED_VOWELS
      @vowelSwappedAndRepeatedRegex
    when SWAPPABLE_AND_REPEATED_VOWELS_INCLUDING_NONE
      @vowelSwappedAndRepeatedRegexIncludingEmpty
    end

  # NONE (or an unrecognised level) adds no further check.
  return 0 if level_regex.nil?

  # A miss must still yield 0; previously the method fell off the end of the
  # case statement and returned nil.
  if level_regex.match(with_i) || level_regex.match(with_l)
    9
  else
    0
  end
end

#loadBadwords(path) ⇒ Object



54
55
56
57
58
59
60
# File 'lib/word_filter.rb', line 54

# Builds the bad-word regex from a file holding one entry per line.
#
# path - file to read; errors raised by File.read propagate.
#
# Carriage returns are removed, each line is stripped, and blank lines are
# skipped. Entries are joined unescaped into a single alternation group, so a
# line is used as regex source as-is.
#
# Returns a Regexp. When the file holds no entries the result matches nothing.
def loadBadwords(path)
  words = File.read(path).delete("\r").split("\n").map(&:strip).reject(&:empty?)

  # An empty alternative ("(foo||bar)" or "()") matches the empty string and
  # therefore every input, so blank entries must never reach the pattern.
  return Regexp.new("(?!)") if words.empty?

  Regexp.new("(" + words.join("|") + ")")
end

#loadDictionary(path) ⇒ Object



43
44
45
46
47
48
49
50
51
52
# File 'lib/word_filter.rb', line 43

# Reads the good-word dictionary at +path+.
#
# path - file to read; errors raised while opening or reading it propagate.
#
# Every whitespace-separated token on every line becomes one entry, in file
# order. Returns an Array of Strings (empty for an empty file).
def loadDictionary(path)
  # File.foreach closes the file once iteration ends; the previous
  # File.open(...).each_line left the handle open.
  File.foreach(path).flat_map { |line| line.split(" ") }
end

#stripGoodWords(input) ⇒ Object



172
173
174
175
176
177
178
179
180
181
# File 'lib/word_filter.rb', line 172

# Removes every token of +input+ that appears in @goodWords.
#
# input - String; it is split on whitespace into tokens.
#
# Returns the remaining tokens joined by single spaces ("" when none remain).
def stripGoodWords(input)
  remaining = input.split(" ").reject { |token| @goodWords.include?(token) }
  remaining.join(" ")
end