Class: TTML

Inherits:
Object
  • Object
show all
Includes:
AllFather, CommonUtils
Defined in:
lib/ttml.rb

Overview

Library to handle TTML Files

Uses the configured translator to perform the language operations (language detection and translation) defined by AllFather.

Direct Known Subclasses

DFXP

Constant Summary collapse

# Caption type constants returned by #supported_transformations.
# NOTE(review): presumably the output formats #transform_to accepts — confirm against AllFather.
SUPPORTED_TRANSFORMATIONS =
[TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_DFXP]

Constants included from CommonUtils

CommonUtils::CREDITS, CommonUtils::SCC_DEFAULT_FRAME_RATE

Constants included from AllFather

AllFather::TYPE_DFXP, AllFather::TYPE_SCC, AllFather::TYPE_SRT, AllFather::TYPE_TTML, AllFather::TYPE_VTT, AllFather::VALID_FILES

Instance Method Summary collapse

Methods included from CommonUtils

#create_file, #extension_from_type, #new_cue, #scc_encode, #time_details, #write_cue

Constructor Details

#initialize(cc_file, opts = nil) ⇒ TTML

Returns a new instance of TTML.



21
22
23
24
25
# File 'lib/ttml.rb', line 21

# Builds a TTML handler around the given caption file.
#
# @param cc_file [String] path of the caption file; checked with #is_valid?
# @param opts [Hash, nil] optional settings; only :force_detect is read here
# @raise [RuntimeError] when #is_valid? rejects the file
def initialize(cc_file, opts=nil)
  @cc_file = cc_file
  # A missing opts hash and a missing/nil :force_detect both mean "off".
  @force_detect = (opts && opts[:force_detect]) || false
  return if is_valid?

  raise "Invalid TTML file provided"
end

Instance Method Details

#callsign ⇒ Object



27
28
29
# File 'lib/ttml.rb', line 27

# Identifies the caption format handled by this class.
#
# @return [Object] the TYPE_TTML constant
def callsign
  TYPE_TTML
end

#infer_languages ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/ttml.rb', line 45

# Collects one language per <div> found under /tt/body in the caption file.
#
# A div's own lang attribute is used as-is unless force detection is enabled
# or the attribute is missing; in those cases the translator is asked to infer
# the language from a text sample of the div.
#
# Any StandardError raised along the way (including a detection failure) is
# reported on stdout and stops the scan; languages gathered so far are kept.
#
# @return [Array, nil] languages in document order, or nil when none were found
def infer_languages
  detected = []
  begin
    xml_file = File.open(@cc_file)
    sections = Nokogiri::XML(xml_file).css("/tt/body/div")
    sections.each_with_index do |section, position|
      declared = section.attributes['lang'].value rescue nil
      # Trust the declared language only when detection is neither forced
      # nor needed because the attribute is absent.
      if !@force_detect && !declared.nil?
        detected << declared
        next
      end

      sample = get_text(section, 100)
      guessed = @translator.infer_language(sample) rescue nil
      if guessed.nil?
        failure = "Failed to detect lang for div block number #{position + 1}"
        failure += "; Detected languages before failure are #{detected}" unless detected.empty?
        raise AllFather::LangDetectionFailureException.new(failure)
      end
      detected << guessed
    end
  rescue StandardError => e
    puts "Error while detecting the language due to #{e.message}"
  ensure
    xml_file.close rescue nil
  end
  detected.empty? ? nil : detected
end

#is_valid? ⇒ Boolean

Returns:

  • (Boolean)


31
32
33
34
35
36
37
38
39
# File 'lib/ttml.rb', line 31

# Reports whether the configured caption file looks like a TTML file.
#
# Only the file name is inspected (it must end in ".ttml", case-sensitive);
# the file contents are not read.
#
# @return [Boolean] true when @cc_file ends with the ".ttml" extension
def is_valid?
  # Do any TTML specific validations here
  # TODO: Check if it's required to do a File read to see if this
  # a well-formed XML. Another is to see if lang is available in each div
  #
  # end_with? looks at the very end of the string. The previous ^...$ regex
  # anchored per line, so a name such as "a.ttml\nb.exe" was accepted.
  # to_s makes a nil path simply invalid instead of an error.
  @cc_file.to_s.end_with?('.ttml')
end

#set_translator(translator) ⇒ Object



41
42
43
# File 'lib/ttml.rb', line 41

# Stores the translator used later for language inference and translation.
#
# @param translator [Object] object queried via #infer_language / #translate elsewhere in this class
# @return [Object] the translator that was passed in
def set_translator(translator)
  @translator = translator
end

#supported_transformations ⇒ Object



129
130
131
# File 'lib/ttml.rb', line 129

# Lists the caption types this class declares in SUPPORTED_TRANSFORMATIONS.
#
# @return [Array] the SUPPORTED_TRANSFORMATIONS constant itself (not a copy)
def supported_transformations
  SUPPORTED_TRANSFORMATIONS
end

#transform_to(types, src_lang, target_lang, output_dir) ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/ttml.rb', line 133

# Converts the TTML caption file into one output file per requested type.
#
# Section selection:
# * src_lang given: the first <div> matching src_lang is used; its text is
#   translated when a different, non-empty target_lang is also given.
# * only target_lang given: the first <div> matching target_lang is used
#   untranslated; if none matches, the first <div> in the file is translated
#   into target_lang.
# * neither given: every <div> gets its own set of output files, suffixed
#   with its 1-based position when there is more than one.
#
# @param types [Enumerable] output caption types; each is passed to
#   #extension_from_type and #create_file
# @param src_lang [String, nil] language of the section to read
# @param target_lang [String, nil] language the output should be in
# @param output_dir [String] directory the output files are written to
# @raise [InvalidInputException] when src_lang is declared by more than one
#   section, when no section matches src_lang, or when a section is needed
#   but the file has none
# @raise [StandardError] when #create_file reports failure for a type
def transform_to(types, src_lang, target_lang, output_dir)
  # Let's start off with some validations
  super(types, src_lang, target_lang, output_dir)

  # Suffix output dir with File separator
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
  # The base name is the same for every output file, so compute it once.
  base_name = File.basename(@cc_file, File.extname(@cc_file))
  src_given = src_lang && !src_lang.empty?
  target_given = target_lang && !target_lang.empty?

  File.open(@cc_file, 'r') do |xml_file|
    div_objects = Nokogiri::XML(xml_file).css("/tt/body/div")
    langs = div_objects.map { |div| div.attributes['lang'].value rescue nil }
    translate = false
    inferred_src_lang = nil

    if src_given
      # Find the matching lang div and create the outputs
      if langs.count { |lang| lang.eql?(src_lang) } > 1
        raise InvalidInputException, "More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported"
      end
      source_div = div_objects.find { |div| matching_lang?(div, src_lang) }
      if source_div.nil?
        raise InvalidInputException, "Given Caption file #{@cc_file} doesn't contain #{src_lang} lang. Available langs are #{langs}"
      end
      matching_divs = [source_div]
      translate = true if target_given && !src_lang.eql?(target_lang)
    elsif target_given
      # Find if any of our div matches this. Else pick first and translate to target lang
      target_div = div_objects.find { |div| matching_lang?(div, target_lang) }
      if target_div
        matching_divs = [target_div]
      else
        selected_div = div_objects.first
        # Without this guard a file with no <div> fails with NoMethodError on nil.
        if selected_div.nil?
          raise InvalidInputException, "Given Caption file #{@cc_file} doesn't contain any caption sections"
        end
        inferred_src_lang = selected_div.lang
        matching_divs = [selected_div]
        translate = true
      end
    else
      # Then we will have to create output files for each lang
      matching_divs = div_objects
    end

    multiple_outputs = matching_divs.size > 1
    matching_divs.each_with_index do |div, position|
      div_lang = div.attributes['lang'].value rescue nil
      # Override div lang if translate is required
      div_lang = target_lang if translate
      file_map = {}
      # Prepare the output files for each type and for each lang in the file
      types.each do |type|
        output_file = base_name.dup
        # Suffix div index when multiple outputs are created
        output_file << "_#{position + 1}" if multiple_outputs
        # NOTE(review): an empty-string src_lang still appends a bare "_" here,
        # and an empty-string target_lang suppresses the src_lang suffix.
        # Kept as-is so existing output file names do not change — confirm intent.
        output_file << "_#{src_lang}" if target_lang.nil? && !src_lang.nil?
        # Suffix lang to filename if provided
        output_file << "_#{target_lang}" if target_given
        output_file << extension_from_type(type)
        out_file = "#{output_dir}#{output_file}"
        unless create_file(TYPE_TTML, type, out_file, div_lang)
          raise StandardError, "Failed to create output file for type #{type}"
        end
        file_map[type] = out_file
      end

      blocks = div.css("p")
      last_index = blocks.size - 1
      blocks.each_with_index do |block, index|
        start_time = block.attributes['begin'].value
        end_time = block.attributes['end'].value
        text = block.inner_html.strip.gsub(/(\s){2,}/, '')
        # Keep only the text fragments; fragments starting with '<' are markup.
        message = get_block_text(text).reject { |part| part.start_with?('<') || part.empty? }.join
        cue_info = CueInfo.new(callsign)
        cue_info.index = index + 1
        cue_info.message = translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
        cue_info.start = start_time
        cue_info.end = end_time
        cue_info.start_time_units = time_details(start_time, callsign)
        cue_info.end_time_units = time_details(end_time, callsign)
        # The final argument tells write_cue whether this is the last cue.
        write_cue(cue_info, file_map, index == last_index)
      end
    end
  end
end

#translate(src_lang, dest_lang, out_file) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/ttml.rb', line 84

# Translates one language section of the TTML file and writes the resulting
# document to out_file.
#
# The section is chosen by inferring each <div>'s language with the
# translator (the div's own lang attribute is ignored) and taking the first
# one whose inferred language equals src_lang. Markup fragments are copied
# through untouched; only text fragments are sent to the translator.
#
# @param src_lang [String] language of the section to translate
# @param dest_lang [String] language to translate into; also set as the
#   section's lang
# @param out_file [String] path the translated document is written to
# @return [String] out_file
# @raise [AllFather::InvalidInputException] when no section is inferred to be
#   in src_lang; out_file is removed first if it exists
def translate(src_lang, dest_lang, out_file)
  super(src_lang, dest_lang, out_file)
  # Read the whole document up front; no file handle is left to close.
  xml_doc = Nokogiri::XML(File.read(@cc_file, encoding: 'UTF-8'))
  div_objects = xml_doc.css("/tt/body/div")
  # Irrespective of what lang the div xml:lang says, infer the lang and then
  # check to see if it matches src_lang
  matched_div = div_objects.find do |div|
    sample_text = get_text(div, 100)
    inferred_lang = @translator.infer_language(sample_text) rescue nil
    !inferred_lang.nil? && inferred_lang.eql?(src_lang)
  end
  if matched_div.nil?
    # force = true: a missing out_file must not mask the real error below.
    FileUtils.remove_file(out_file, true)
    raise AllFather::InvalidInputException, "Unable to find #{src_lang} language section in TTML"
  end
  # Update the Lang in the Div
  matched_div.lang = dest_lang

  matched_div.css("p").each do |block|
    # Multiple spaces being stripped off
    text = block.inner_html.strip.gsub(/(\s){2,}/, '')
    translated_text = ""
    get_block_text(text).each do |text_block|
      if text_block.start_with?('<') || text_block.empty?
        # Markup (or nothing) — pass through unchanged.
        translated_text << text_block
      else
        translated_text << @translator.translate(text_block, src_lang, dest_lang)
      end
    end
    block.inner_html = translated_text
  end
  File.write(out_file, xml_doc)
  out_file
end