Module: SubtitleProfanityFinder

Defined in:
lib/subtitle_profanity_finder.rb

Class Method Summary collapse

Class Method Details

.convert_to_regexps(profanity_hash) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/subtitle_profanity_finder.rb', line 17

# Expands a profanity table into an ordered list of [Regexp, replacement] pairs.
#
# profanity_hash maps a profanity pattern (a String that is compiled as a
# case-insensitive regular expression, so regexp metacharacters are honoured)
# to either:
#   * a String  - the sanitized replacement; the pattern matches anywhere,
#                 including inside longer words, or
#   * [String, :full_word] - the sanitized replacement; the pattern only
#                 matches when delimited by spaces and/or line boundaries.
#
# For every pattern, look-alike variants are generated as well (every "l"
# replaced by "i", and every "i" replaced by "l"), presumably to cope with
# OCR'd subtitles that confuse the two letters.
#
# Entries are processed in reverse-sorted key order. Replacements are wrapped
# in square brackets, e.g. 'lo..' becomes '[lo..]'.
#
# Returns an Array of [Regexp, String] pairs.
# Raises RuntimeError when an Array value is not exactly [String, :full_word].
def self.convert_to_regexps profanity_hash
  all_profanity_combinations = []
  profanity_hash.to_a.sort.reverse.each { |profanity, sanitized|
    full_word_only = false
    if sanitized.is_a? Array
      raise "second element for #{profanity.inspect} must be :full_word" unless sanitized[1] == :full_word
      raise "expected [replacement, :full_word] for #{profanity.inspect}" unless sanitized.length == 2
      full_word_only = true
      sanitized = sanitized[0]
    end

    permutations = [profanity]
    permutations << profanity.gsub(/l/i, 'i') if profanity =~ /l/
    permutations << profanity.gsub(/i/i, 'l') if profanity =~ /i/

    bracketized = '[' + sanitized + ']'

    permutations.each { |permutation|
      if full_word_only
        # Four variants cover every combination of "preceded by a space or at
        # line start" and "followed by a space or at line end". The delimiter is
        # a literal space: the "\s" escape in a double-quoted Ruby string is a
        # space character, not the regexp whitespace class.
        [
          [' ' + permutation + ' ', ' ' + bracketized + ' '],
          ['^' + permutation + ' ', bracketized + ' '],
          [' ' + permutation + '$', ' ' + bracketized],
          ['^' + permutation + '$', bracketized]
        ].each { |pattern, replacement|
          all_profanity_combinations << [Regexp.new(pattern, Regexp::IGNORECASE), replacement]
        }
      else
        # Compile each permutation itself; reusing the regexp of the original
        # spelling here would make the l/i variants dead entries.
        all_profanity_combinations << [Regexp.new(permutation, Regexp::IGNORECASE), bracketized]
      end
    }
  }
  all_profanity_combinations
end

.edl_output(incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00") ⇒ Object



60
61
62
# File 'lib/subtitle_profanity_finder.rb', line 60

# Convenience entry point: reads the subtitle file at incoming_filename and
# hands its contents, together with every other argument unchanged, to
# edl_output_from_string. Returns whatever that method returns.
def self.edl_output incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00"
  subtitle_text = File.read(incoming_filename)
  edl_output_from_string(
    subtitle_text,
    extra_profanity_hash,
    subtract_from_each_beginning_ts,
    add_to_end_each_ts,
    beginning_srt,
    beginning_actual_movie,
    ending_srt,
    ending_actual
  )
end

.edl_output_from_string(subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/subtitle_profanity_finder.rb', line 64

# Finds profanities in SRT-style subtitle text and renders them as EDL-style
# entries, one line per offending subtitle block.
#
# Each emitted line has the form
#   "<begin>" , "<end>", "profanity", "<sanitized word>", "<sanitized subtitle text>",
# The result holds two sections, each introduced by a newline: first the
# matches from the built-in "bad" list (merged with extra_profanity_hash), then
# the matches from the milder list. A block is skipped when its begin timestamp
# string already occurs anywhere in the output built so far.
#
# Timestamps are first shifted (begin - subtract_from_each_beginning_ts,
# end + add_to_end_each_ts) and then rescaled linearly so that the subtitle
# interval [starting_timestamp_given_srt, ending_srt] maps onto the movie
# interval [starting_timestamp_actual, ending_actual].
#
# subtitles            - String with the subtitle text; it is not modified.
# extra_profanity_hash - Hash merged over the built-in "bad" list; same shape
#                        as the argument of convert_to_regexps.
# subtract_from_each_beginning_ts, add_to_end_each_ts - non-negative numbers.
# starting_timestamp_given_srt, starting_timestamp_actual, ending_srt,
# ending_actual        - timestamps handed to EdlParser.translate_string_to_seconds.
#
# NOTE(review): EdlParser is defined outside this file; this code assumes
# translate_string_to_seconds returns a number of seconds and
# translate_time_to_human_readable returns a String - confirm.
#
# Returns the EDL text as a String.
# Raises RuntimeError when either shift argument is negative.
def self.edl_output_from_string subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual
  # Normalize line endings on a copy: mutating the argument in place would
  # change the caller's string and fail outright on a frozen one.
  subtitles = subtitles.gsub("\r\n", "\n")
  # these have to be positive...in my twisted paradigm
  raise "subtract_from_each_beginning_ts must not be negative" if subtract_from_each_beginning_ts < 0
  raise "add_to_end_each_ts must not be negative" if add_to_end_each_ts < 0

  starting_timestamp_given_srt = EdlParser.translate_string_to_seconds(starting_timestamp_given_srt)
  starting_timestamp_actual = EdlParser.translate_string_to_seconds(starting_timestamp_actual)
  ending_srt = EdlParser.translate_string_to_seconds(ending_srt)
  ending_actual = EdlParser.translate_string_to_seconds ending_actual

  # accomodate for both styles of rewrite, except it messes up the math, so just leave it separate:
  # difference = starting_timestamp_given_srt - starting_timestamp_actual
  # subtract_from_each_beginning_ts += difference
  # add_to_end_each_ts -= difference

  # you minus the initial srt time... (given)
  # ratio = (end actual - init actual/ end given - init given)*(how far you are past the initial srt) plus initial actual
  multiply_by_this_factor = (ending_actual - starting_timestamp_actual)/(ending_srt - starting_timestamp_given_srt)

  multiply_proc = proc {|you|
    ((you - starting_timestamp_given_srt) * multiply_by_this_factor) + starting_timestamp_actual
  }

  # The keys are deliberately assembled from fragments so that the words do
  # not appear verbatim in this source file.
  bad_profanities = {'hell' => ['he..', :full_word],
    'g' +
    'o' + 100.chr => 'goodness', 'g' +
    111.chr + 
    100.chr +
    's' => 'deitys',
    'meu deus' => 'lo..',
    'lord' => 'lo..', 'da' +
    'mn' => 'da..', 
    'f' +
    117.chr +
    99.chr +
    107.chr =>
    'f...',
    'allah' => 'allah',
    'bi' +
    'tc' + 104.chr => 'b....',
    'bas' +
    'ta' + 'r' + 100.chr => 'ba.....',
    ((arse = 'a' +
    's'*2)) => ['a..', :full_word],
    arse + 'h' +
    'ole' => 'a..h...',
    'dieu' => ['deity', :full_word], # TODO fails...
    arse + 'wipe' => 'a..w....',
    'jes' +
    'u' + 's' => 'j....',
    'chri' +
    'st'=> ['chr...', :full_word], # allow for christian[ity] [good idea or not?]
    'sh' +
     'i' + 't' => 'sh..',
    'a realllly bad word' => ['test edited bad word', :full_word]
  }
  
  bad_profanities.merge! extra_profanity_hash # LODO make easier to use...

  semi_bad_profanities = {}
  ['bloody', 'moron', 'breast', 'idiot', 
    'sex', 'genital', 'boob', 'make love', 
    'making love', 'love mak', 
    'dumb', 'suck', 
    'piss'
 ].each{|name|
    # butter?
    semi_bad_profanities[name] = name
  }
  semi_bad_profanities['crap'] = ['crap', :full_word]
  semi_bad_profanities['butt'] = ['butt', :full_word]

  all_profanity_combinationss = [convert_to_regexps(bad_profanities), convert_to_regexps(semi_bad_profanities)]

  # The subtitle blocks do not depend on the profanity list, so split once.
  glops = split_to_glops(subtitles)

  output = ''
  all_profanity_combinationss.each do |all_profanity_combinations|
    output += "\n"
    glops.each do |glop|
      all_profanity_combinations.each do |profanity, sanitized|
        # dunno if we should force words to just start with this or contain it anywhere...
        # what about 'g..ly' for example?
        # or 'un...ly' ? I think we're ok there...
        next unless glop =~ profanity

        # create english-ified version
        # drop the first line of the block (the timing line)
        sanitized_glop = glop.lines.to_a[1..-1].join(' ')
        sanitized_glop.gsub!(/[\r\n]/, '') # flatten 3 lines to 1
        sanitized_glop.gsub!(/<(.|)(\/|)i>/i, '') # kill <i> 
        sanitized_glop.gsub!(/[^a-zA-Z0-9'""]/, ' ') # kill weird stuff like ellipses
        sanitized_glop.gsub!(/\W\W+/, ' ') # remove duplicate "  " 's

        # sanitize the subtitles themselves, using every list (not only the
        # one currently being reported)
        all_profanity_combinationss.each do |all_profanity_combinations2|
          all_profanity_combinations2.each do |prof2, sanitized2|
            sanitized_glop.gsub!(prof2, sanitized2)
          end
        end

        # repeated replacements can nest brackets, so collapse [[[profanity]]]
        sanitized_glop.gsub!(/\[+/, '[')
        sanitized_glop.gsub!(/\]+/, ']')

        # extract timing info
        timing_line = glop.split("\n").first.strip
        timing_line =~ /((\d\d:\d\d:\d\d),(\d\d\d) --> (\d\d:\d\d:\d\d),(\d\d\d))/
        # "00:03:00.0" , "00:04:00.0", "violence", "of some sort",
        begin_stamp = "#{$2}.#{$3}"
        end_stamp = "#{$4}.#{$5}"

        ts_begin = EdlParser.translate_string_to_seconds begin_stamp
        ts_begin -= subtract_from_each_beginning_ts
        ts_begin = multiply_proc.call(ts_begin)
        ts_begin = EdlParser.translate_time_to_human_readable ts_begin, true

        ts_end = EdlParser.translate_string_to_seconds end_stamp
        ts_end += add_to_end_each_ts
        ts_end = multiply_proc.call(ts_end)
        ts_end = EdlParser.translate_time_to_human_readable ts_end, true

        # report each begin timestamp only once across all sections
        next if output.include? ts_begin

        output += %!"#{ts_begin}" , "#{ts_end}", "profanity", "#{sanitized.gsub(/[\[\]]/, '').strip}", "#{sanitized_glop.strip}",\n!
      end
    end
  end
  output

end

.split_to_glops(subtitles) ⇒ Object

Splits the subtitle text into blocks, each consisting of a timestamp line followed by its content lines.



13
14
15
# File 'lib/subtitle_profanity_finder.rb', line 13

# Splits subtitle text into "glops": one String per subtitle block, starting
# at the block's first HH:MM:SS timestamp and running up to the next empty
# line or, for the final block, the end of the text.
#
# The end-of-text alternative matters because `^` does not match after a
# trailing newline at the very end of a string, so a bare `^$` terminator
# silently drops the last block of any text that does not end in a blank line.
#
# Returns an Array of Strings (empty when no timestamp is present).
def self.split_to_glops subtitles
  subtitles.scan(/\d\d:\d\d:\d\d.*?(?:^$|\z)/m)
end