Module: SubtitleProfanityFinder

Defined in: lib/subtitle_profanity_finder.rb

Class Method Summary collapse

Class Method Details

.convert_to_regexps(profanity_hash) ⇒ `Object`

# File 'lib/subtitle_profanity_finder.rb', line 17

# Expands a profanity => replacement hash into an ordered list of
# [Regexp, replacement String] pairs.
#
# profanity_hash maps a regexp source string to either
#   * a String replacement -- the pattern may match anywhere, even inside
#     a longer word, or
#   * [replacement, :full_word] -- the pattern must be framed by a literal
#     space or by a line edge (^ / $) on each side.
#
# Keys are walked in reverse-sorted order, so a key that extends another
# key (same prefix, more characters) is emitted before the shorter one.
# For every key a look-alike variant is added as well: one with each "l"
# replaced by "i", and one with each "i" replaced by "l".
#
# Returns an Array of [Regexp, String]; all regexps are case-insensitive and
# every replacement is wrapped in square brackets.
# Raises RuntimeError when an Array value is not exactly
# [replacement, :full_word].
def self.convert_to_regexps profanity_hash
  # [regexp prefix, regexp suffix, replacement prefix, replacement suffix]
  # The replacement re-inserts whichever literal spaces the regexp consumed.
  full_word_frames = [
    [' ', ' ', ' ', ' '],
    ['^', ' ', '',  ' '],
    [' ', '$', ' ', ''],
    ['^', '$', '',  '']
  ]

  all_profanity_combinations = []
  profanity_hash.to_a.sort.reverse_each { |profanity, sanitized|
    is_single_word_profanity = false
    if sanitized.is_a? Array
      raise "expected [replacement, :full_word] for #{profanity.inspect}" unless sanitized[1] == :full_word
      raise "expected exactly two elements for #{profanity.inspect}" unless sanitized.length == 2
      is_single_word_profanity = true
      sanitized = sanitized[0]
    end

    # the original spelling plus its l <-> i look-alikes
    permutations = [profanity]
    permutations << profanity.gsub(/l/i, 'i') if profanity =~ /l/
    permutations << profanity.gsub(/i/i, 'l') if profanity =~ /i/

    bracketized = '[' + sanitized + ']'

    permutations.each { |permutation|
      if is_single_word_profanity
        full_word_frames.each { |before, after, pad_before, pad_after|
          as_regexp = Regexp.new(before + permutation + after, Regexp::IGNORECASE)
          all_profanity_combinations << [as_regexp, pad_before + bracketized + pad_after]
        }
      else
        # compile the permutation itself; reusing the regexp of the original
        # spelling here would make the look-alike variants dead code
        all_profanity_combinations << [Regexp.new(permutation, Regexp::IGNORECASE), bracketized]
      end
    }
  }
  all_profanity_combinations
end

.edl_output(incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00") ⇒ `Object`



60
61
62

# File 'lib/subtitle_profanity_finder.rb', line 60

# Reads the subtitle file at incoming_filename and hands its contents,
# together with every other argument unchanged, to edl_output_from_string.
# Returns whatever edl_output_from_string returns.
def self.edl_output incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00"
  subtitle_text = File.read(incoming_filename)
  edl_output_from_string(
    subtitle_text,
    extra_profanity_hash,
    subtract_from_each_beginning_ts,
    add_to_end_each_ts,
    beginning_srt,
    beginning_actual_movie,
    ending_srt,
    ending_actual
  )
end

.edl_output_from_string(subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual) ⇒ `Object`

# File 'lib/subtitle_profanity_finder.rb', line 64

# Scans subtitle text for profanities and renders one EDL-style line for each
# subtitle block that contains one.
#
# subtitles                       - String with the subtitle text. CRLF line
#                                   endings are normalised on a private copy;
#                                   the caller's string is left untouched.
# extra_profanity_hash            - Hash merged over the built-in "bad" list
#                                   (same value format convert_to_regexps takes).
# subtract_from_each_beginning_ts - non-negative amount subtracted from each
#                                   block's start time before rescaling
#                                   (presumably seconds -- confirm in EdlParser).
# add_to_end_each_ts              - non-negative amount added to each block's
#                                   end time before rescaling.
# starting_timestamp_given_srt, starting_timestamp_actual,
# ending_srt, ending_actual       - each is converted with
#                                   EdlParser.translate_string_to_seconds; together
#                                   they define the linear mapping from subtitle
#                                   time to actual movie time.
#
# Returns a String. For each of the two word lists ("bad", then "semi-bad") it
# holds a "\n" followed by lines of the form
#   "begin" , "end", "profanity", "replacement", "sanitized subtitle text",
# A block whose begin timestamp already occurs in the output is not repeated.
# Raises RuntimeError when either padding argument is negative.
def self.edl_output_from_string subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual
  # non-destructive: do not rewrite (or choke on a frozen) caller-owned string
  subtitles = subtitles.gsub("\r\n", "\n")
  # both paddings have to be positive; they widen the muted range
  raise 'subtract_from_each_beginning_ts must not be negative' if subtract_from_each_beginning_ts < 0
  raise 'add_to_end_each_ts must not be negative' if add_to_end_each_ts < 0

  starting_timestamp_given_srt = EdlParser.translate_string_to_seconds(starting_timestamp_given_srt)
  starting_timestamp_actual = EdlParser.translate_string_to_seconds(starting_timestamp_actual)
  ending_srt = EdlParser.translate_string_to_seconds(ending_srt)
  ending_actual = EdlParser.translate_string_to_seconds(ending_actual)

  # The start offset is deliberately NOT folded into the two padding values:
  # doing so was found to mess up the math, so the mapping is kept separate.
  #
  # actual = (srt - srt_start) * (actual_span / srt_span) + actual_start
  # NOTE(review): if translate_string_to_seconds ever returns Integers this is
  # an integer division -- confirm it returns Floats.
  multiply_by_this_factor = (ending_actual - starting_timestamp_actual)/(ending_srt - starting_timestamp_given_srt)

  multiply_proc = proc {|you|
    ((you - starting_timestamp_given_srt) * multiply_by_this_factor) + starting_timestamp_actual
  }

  # Keys are assembled from fragments so the words never appear verbatim in
  # the source; values are the replacement (optionally flagged :full_word).
  bad_profanities = {'hell' => ['he..', :full_word],
    'g' +
    'o' + 100.chr => 'goodness', 'g' +
    111.chr +
    100.chr +
    's' => 'deitys',
    'meu deus' => 'lo..',
    'lord' => 'lo..', 'da' +
    'mn' => 'da..',
    'f' +
    117.chr +
    99.chr +
    107.chr =>
    'f...',
    'allah' => 'allah',
    'bi' +
    'tc' + 104.chr => 'b....',
    'bas' +
    'ta' + 'r' + 100.chr => 'ba.....',
    ((arse = 'a' +
    's'*2)) => ['a..', :full_word],
    arse + 'h' +
    'ole' => 'a..h...',
    'dieu' => ['deity', :full_word], # TODO fails...
    arse + 'wipe' => 'a..w....',
    'jes' +
    'u' + 's' => 'j....',
    'chri' +
    'st'=> ['chr...', :full_word], # allow for christian[ity] [good idea or not?]
    'sh' +
     'i' + 't' => 'sh..',
    'a realllly bad word' => ['test edited bad word', :full_word]
  }

  bad_profanities.merge! extra_profanity_hash # LODO make easier to use...

  semi_bad_profanities = {}
  ['bloody', 'moron', 'breast', 'idiot',
    'sex', 'genital', 'boob', 'make love',
    'making love', 'love mak',
    'dumb', 'suck',
    'piss'
  ].each{|name|
    # butter?
    semi_bad_profanities[name] = name
  }
  semi_bad_profanities['crap'] = ['crap', :full_word]
  semi_bad_profanities['butt'] = ['butt', :full_word]

  all_profanity_combinationss = [convert_to_regexps(bad_profanities), convert_to_regexps(semi_bad_profanities)]

  # the subtitle text does not change below, so split it only once
  glops = split_to_glops(subtitles)

  output = ''
  all_profanity_combinationss.each { |all_profanity_combinations|
    output += "\n" # blank separator in front of each category
    glops.each { |glop|
      all_profanity_combinations.each { |profanity, sanitized|
        # dunno if we should force words to just start with this or contain it anywhere...
        # what about 'g..ly' for example?
        # or 'un...ly' ? I think we're ok there...
        next unless glop =~ profanity

        # create english-ified version: drop the timing line, keep the text
        sanitized_glop = glop.lines.to_a[1..-1].join(' ')
        sanitized_glop.gsub!(/[\r\n]/, '') # flatten 3 lines to 1
        sanitized_glop.gsub!(/<(.|)(\/|)i>/i, '') # kill <i>
        sanitized_glop.gsub!(/[^a-zA-Z0-9'""]/, ' ') # kill weird stuff like ellipses
        sanitized_glop.gsub!(/\W\W+/, ' ') # remove duplicate "  " 's

        # sanitize the subtitle text itself, using every list, not only the
        # one that triggered this entry
        all_profanity_combinationss.each { |all_profanity_combinations2|
          all_profanity_combinations2.each { |prof2, sanitized2|
            sanitized_glop.gsub!(prof2, sanitized2)
          }
        }

        # several l/i variants may each have wrapped the same word:
        # collapse [[[word]]] back to [word]
        sanitized_glop.gsub!(/\[+/, '[')
        sanitized_glop.gsub!(/\]+/, ']')

        # extract timing info; read all captures right away so later regexp
        # work cannot disturb them
        timing_line = glop.split("\n").first.strip
        timing_line =~ /((\d\d:\d\d:\d\d),(\d\d\d) --> (\d\d:\d\d:\d\d),(\d\d\d))/
        ts_begin = "#{$2}.#{$3}"
        ts_end = "#{$4}.#{$5}"

        # "00:03:00.0" , "00:04:00.0", "violence", "of some sort",
        ts_begin = EdlParser.translate_string_to_seconds ts_begin
        ts_begin -= subtract_from_each_beginning_ts
        ts_begin = multiply_proc.call(ts_begin)
        ts_begin = EdlParser.translate_time_to_human_readable ts_begin, true

        ts_end = EdlParser.translate_string_to_seconds ts_end
        ts_end += add_to_end_each_ts
        ts_end = multiply_proc.call(ts_end)
        ts_end = EdlParser.translate_time_to_human_readable ts_end, true

        # String#contain? is not core Ruby -- presumably supplied by an
        # extension loaded elsewhere in the project.
        unless output.contain? ts_begin
          output += %!"#{ts_begin}" , "#{ts_end}", "profanity", "#{sanitized.gsub(/[\[\]]/, '').strip}", "#{sanitized_glop.strip}",\n!
        end
      }
    }
  }
  output
end

.split_to_glops(subtitles) ⇒ `Object`

Splits the subtitle text into blocks, each starting at a timestamp and running through the content lines that follow it.



13
14
15

# File 'lib/subtitle_profanity_finder.rb', line 13

# Splits subtitle text into "glops": substrings that start at the first
# hh:mm:ss timestamp of an entry and run up to the next empty line.
#
# The final entry is terminated by end-of-string as well, so it is not lost
# when the text does not finish with a blank line.
#
# subtitles - String with "\n" line endings.
# Returns an Array of Strings (empty when no timestamp is found).
def self.split_to_glops subtitles
  subtitles.scan(/\d\d:\d\d:\d\d.*?(?:^$|\z)/m)
end

Module: SubtitleProfanityFinder

Class Method Summary collapse

Class Method Details

.convert_to_regexps(profanity_hash) ⇒ Object

.edl_output(incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00") ⇒ Object

.edl_output_from_string(subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual) ⇒ Object

.split_to_glops(subtitles) ⇒ Object

.convert_to_regexps(profanity_hash) ⇒ `Object`

.edl_output(incoming_filename, extra_profanity_hash = {}, subtract_from_each_beginning_ts = 0, add_to_end_each_ts = 0, beginning_srt = "00:00", beginning_actual_movie = "00:00", ending_srt = "10:00:00", ending_actual = "10:00:00") ⇒ `Object`

.edl_output_from_string(subtitles, extra_profanity_hash, subtract_from_each_beginning_ts, add_to_end_each_ts, starting_timestamp_given_srt, starting_timestamp_actual, ending_srt, ending_actual) ⇒ `Object`

.split_to_glops(subtitles) ⇒ `Object`