Class: TTML
Overview
Library to handle TTML Files
Uses the configured translator to perform the language operations defined by AllFather
Direct Known Subclasses
Constant Summary
- SUPPORTED_TRANSFORMATIONS =
[TYPE_SCC, TYPE_SRT, TYPE_VTT, TYPE_DFXP]
Constants included from CommonUtils
CommonUtils::CREDITS, CommonUtils::SCC_DEFAULT_FRAME_RATE
Constants included from AllFather
AllFather::TYPE_DFXP, AllFather::TYPE_SCC, AllFather::TYPE_SRT, AllFather::TYPE_TTML, AllFather::TYPE_VTT, AllFather::VALID_FILES
Instance Method Summary
- #callsign ⇒ Object
- #infer_languages ⇒ Object
- #initialize(cc_file, opts = nil) ⇒ TTML (constructor): A new instance of TTML.
- #is_valid? ⇒ Boolean
- #set_translator(translator) ⇒ Object
- #supported_transformations ⇒ Object
- #transform_to(types, src_lang, target_lang, output_dir) ⇒ Object
- #translate(src_lang, dest_lang, out_file) ⇒ Object
Methods included from CommonUtils
#create_file, #extension_from_type, #new_cue, #scc_encode, #time_details, #write_cue
Constructor Details
#initialize(cc_file, opts = nil) ⇒ TTML
Returns a new instance of TTML.
21 22 23 24 25 |
# File 'lib/ttml.rb', line 21
#
# Builds a handler around the given caption file and rejects it right away
# when #is_valid? does not accept it.
#
# @param cc_file [String] path of the caption file (other methods open it with File.open)
# @param opts [Hash, nil] optional settings; only the :force_detect key is read here
# @raise [RuntimeError] when #is_valid? returns a falsy value
def initialize(cc_file, opts = nil)
  @cc_file = cc_file
  # Missing opts, a missing key and a nil/false value all collapse to false.
  @force_detect = (opts && opts[:force_detect]) || false
  raise "Invalid TTML file provided" unless is_valid?
end
Instance Method Details
#callsign ⇒ Object
27 28 29 |
# File 'lib/ttml.rb', line 27
#
# Identifies the caption format this handler works with.
#
# @return [Object] the TYPE_TTML constant
def callsign
  TYPE_TTML
end
#infer_languages ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/ttml.rb', line 45
#
# Collects one language per /tt/body/div section of the caption file.
#
# The lang attribute of a div is used as-is unless it is missing or
# @force_detect is set; in those cases the translator is asked to infer the
# language from a text sample of the div.
#
# Any StandardError raised while processing (including the detection failure
# raised below) is printed to stdout and not propagated; the languages
# gathered before the failure are still returned.
#
# @return [Array, nil] the languages in div order, or nil when none were found
def infer_languages
  detected = []
  begin
    xml_file = File.open(@cc_file)
    sections = Nokogiri::XML(xml_file).css("/tt/body/div")
    sections.each_with_index do |div, index|
      # Language declared on the div itself, nil when the attribute is absent.
      declared = begin
        div.attributes['lang'].value
      rescue StandardError
        nil
      end
      language = declared

      # Detection runs when forced, or when the div declares no language.
      if @force_detect || declared.nil?
        sample_text = get_text(div, 100)
        language = begin
          @translator.infer_language(sample_text)
        rescue StandardError
          nil
        end
        if language.nil?
          failure = "Failed to detect lang for div block number #{index + 1}"
          failure += "; Detected languages before failure are #{detected}" unless detected.empty?
          raise AllFather::LangDetectionFailureException.new(failure)
        end
      end

      detected << language
    end
  rescue StandardError => e
    puts "Error while detecting the language due to #{e.message}"
  ensure
    # xml_file is nil when File.open itself failed; closing is best effort.
    begin
      xml_file.close
    rescue StandardError
      nil
    end
  end
  detected.empty? ? nil : detected
end
#is_valid? ⇒ Boolean
31 32 33 34 35 36 37 38 39 |
# File 'lib/ttml.rb', line 31
#
# Checks whether the configured caption file looks like a TTML file.
# Only the file name is inspected; the file content is never read.
#
# @return [Boolean] true when @cc_file ends with ".ttml", false otherwise
def is_valid?
  # Do any TTML specific validations here.
  # An exact suffix check is used instead of a ^...$ regexp: those anchors
  # match per line, so a name such as "x.ttml\nrest" would be accepted.
  # to_s keeps a nil @cc_file on the "invalid" path instead of raising.
  # TODO: Check if it's required to do a File read to see if this is
  # a well-formed XML. Another is to see if lang is available in each div
  @cc_file.to_s.end_with?('.ttml')
end
#set_translator(translator) ⇒ Object
41 42 43 |
# File 'lib/ttml.rb', line 41
#
# Stores the translator that the language inference and translation
# methods call through @translator.
#
# @param translator [Object] object answering infer_language and translate
# @return [Object] the translator that was assigned
def set_translator(translator)
  @translator = translator
end
#supported_transformations ⇒ Object
129 130 131 |
# File 'lib/ttml.rb', line 129
#
# Lists the caption types this handler can be transformed into.
#
# @return [Array] the SUPPORTED_TRANSFORMATIONS constant
def supported_transformations
  SUPPORTED_TRANSFORMATIONS
end
#transform_to(types, src_lang, target_lang, output_dir) ⇒ Object
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
# File 'lib/ttml.rb', line 133
#
# Writes the captions of the selected div section(s) into one output file per
# requested type, translating the cue text when a different target language
# is requested.
#
# Section selection:
# * src_lang blank, target_lang given: use the div already in target_lang,
#   otherwise translate the first div into target_lang.
# * src_lang blank, target_lang blank: emit every div.
# * src_lang given: use the single div in src_lang, translating it when
#   target_lang is given and differs from src_lang.
#
# @param types [Array] output caption types, each passed to create_file
# @param src_lang [String, nil] language of the section to read
# @param target_lang [String, nil] language the output should be in
# @param output_dir [String] directory the output files are written to
# @raise [InvalidInputException] when src_lang is missing from the file or
#   declared by more than one div, or when there is no div to translate
# @raise [StandardError] when create_file reports failure for a type
def transform_to(types, src_lang, target_lang, output_dir)
  # Let's start off with some validations
  super(types, src_lang, target_lang, output_dir)

  # Suffix output dir with File separator
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)
  target_given = target_lang && !target_lang.empty?

  # nil when the div carries no lang attribute.
  lang_of = lambda do |div|
    lang_attr = div.attributes['lang']
    lang_attr && lang_attr.value
  end

  # Block form closes the file even when an exception is raised below.
  File.open(@cc_file, 'r') do |xml_file|
    div_objects = Nokogiri::XML(xml_file).css("/tt/body/div")
    langs = div_objects.map { |div| lang_of.call(div) }
    translate = false
    inferred_src_lang = nil

    if src_lang.nil? || src_lang.empty?
      if target_given
        # Find if any of our div matches this. Else pick first and translate to target lang
        existing = div_objects.find { |div| matching_lang?(div, target_lang) }
        if existing
          matching_divs = [existing]
        else
          selected_div = div_objects.first
          # A file without any div has nothing to translate; fail with a
          # clear error instead of a NoMethodError on nil.
          if selected_div.nil?
            raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain any section to translate")
          end
          inferred_src_lang = selected_div.lang
          matching_divs = [selected_div]
          translate = true
        end
      else
        # Then we will have to create output files for each lang
        matching_divs = div_objects
      end
    else
      # Find the matching lang div and create the outputs
      if langs.count { |lang| lang.eql?(src_lang) } > 1
        raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
      end
      source_div = div_objects.find { |div| matching_lang?(div, src_lang) }
      if source_div.nil?
        raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain #{src_lang} lang. Available langs are #{langs}")
      end
      matching_divs = [source_div]
      translate = true if target_given && !src_lang.eql?(target_lang)
    end

    multiple_outputs = matching_divs.size > 1
    matching_divs.each_with_index do |div, position|
      div_index = position + 1
      # Override div lang if translate is required
      div_lang = translate ? target_lang : lang_of.call(div)

      # Prepare the output files for each type and for each lang in the file
      file_map = {}
      types.each do |type|
        output_file = File.basename(@cc_file, File.extname(@cc_file))
        # Suffix div index when multiple outputs are created
        output_file << "_#{div_index}" if multiple_outputs
        output_file << "_#{src_lang}" if target_lang.nil? && !src_lang.nil?
        # Suffix lang to filename if provided
        output_file << "_#{target_lang}" if target_given
        output_file << extension_from_type(type)
        out_file = "#{output_dir}#{output_file}"
        unless create_file(TYPE_TTML, type, out_file, div_lang)
          raise StandardError.new("Failed to create output file for type #{type}")
        end
        file_map[type] = out_file
      end

      blocks = div.css("p")
      last_index = blocks.size - 1
      blocks.each_with_index do |block, index|
        start_time = block.attributes['begin'].value
        end_time = block.attributes['end'].value
        text = block.inner_html.strip.gsub(/(\s){2,}/, '')
        # Keep only the text fragments; markup fragments start with '<'.
        # NOTE(review): the local `message` and the `cue_info.message=`
        # writer are assumed names — confirm against CueInfo.
        message = get_block_text(text).reject { |text_block| text_block.start_with?('<') || text_block.empty? }.join

        cue_info = CueInfo.new(callsign)
        cue_info.index = index + 1
        cue_info.message = translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
        cue_info.start = start_time
        cue_info.end = end_time
        cue_info.start_time_units = time_details(start_time, callsign)
        cue_info.end_time_units = time_details(end_time, callsign)
        # The last argument tells write_cue this is the final cue of the div.
        write_cue(cue_info, file_map, index == last_index)
      end
    end
  end
end
#translate(src_lang, dest_lang, out_file) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/ttml.rb', line 84
#
# Translates the div section whose inferred language equals src_lang into
# dest_lang and writes the resulting document to out_file.
#
# The lang attribute declared on each div is ignored for matching; the
# translator infers the language from a text sample of the div instead.
# Markup fragments inside each <p> are copied through untouched and only
# the text fragments are sent to the translator.
#
# @param src_lang [String] language of the section to translate
# @param dest_lang [String] language to translate into
# @param out_file [String] path the translated document is written to
# @return [String] out_file
# @raise [AllFather::InvalidInputException] when no div is inferred as src_lang
def translate(src_lang, dest_lang, out_file)
  super(src_lang, dest_lang, out_file)

  # File.read returns the content and closes the handle itself, so there is
  # nothing left to close afterwards.
  xml_content = File.read(@cc_file, encoding: 'UTF-8')
  xml_doc = Nokogiri::XML(xml_content)
  div_objects = xml_doc.css("/tt/body/div")

  # Irrespective of what lang the div xml:lang says, infer the lang and then
  # check to see if it matches src_lang
  matched_div = div_objects.find do |div|
    sample_text = get_text(div, 100)
    inferred_lang = begin
      @translator.infer_language(sample_text)
    rescue StandardError
      nil # a div whose language cannot be inferred is simply skipped
    end
    !inferred_lang.nil? && inferred_lang.eql?(src_lang)
  end

  if matched_div.nil?
    # rm_f ignores a missing file, so the error below is the one callers see.
    FileUtils.rm_f(out_file)
    raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
  end

  # Update the Lang in the Div
  matched_div.lang = dest_lang
  matched_div.css("p").each do |block|
    # Multiple spaces being stripped off
    text = block.inner_html.strip.gsub(/(\s){2,}/, '')
    translated_text = ""
    get_block_text(text).each do |text_block|
      if text_block.start_with?('<') || text_block.empty?
        translated_text << text_block
      else
        translated_text << @translator.translate(text_block, src_lang, dest_lang)
      end
    end
    block.inner_html = translated_text
  end

  File.write(out_file, xml_doc)
  out_file
end