Class: Tml::Tokenizers::Dom
Inherits: Object
Defined in: lib/tml/tokenizers/dom.rb
Instance Attribute Summary
- #context ⇒ Object
  Returns the value of attribute context.
- #options ⇒ Object
  Returns the value of attribute options.
- #tokens ⇒ Object
  Returns the value of attribute tokens.
Instance Method Summary
- #adjust_name(node) ⇒ Object
- #between_separators?(node) ⇒ Boolean
- #container_node?(node) ⇒ Boolean
- #contextualize(name, context) ⇒ Object
- #debug(doc) ⇒ Object
- #debug_translation(translation) ⇒ Object
- #debug_tree(node, depth) ⇒ Object
- #empty_string?(tml) ⇒ Boolean
- #generate_data_tokens(text) ⇒ Object
- #generate_html_token(node, value = nil) ⇒ Object
- #generate_tml_tags(node) ⇒ Object
- #has_child_nodes?(node) ⇒ Boolean
- #has_inline_or_text_siblings?(node) ⇒ Boolean
- #ignored_node?(node) ⇒ Boolean
- #initialize(context = {}, options = {}) ⇒ Dom (constructor)
  A new instance of Dom.
- #inline_node?(node) ⇒ Boolean
- #no_translate_node?(node) ⇒ Boolean
- #node_info(node) ⇒ Object
- #non_translatable_node?(node) ⇒ Boolean
- #only_child?(node) ⇒ Boolean
- #option(name) ⇒ Object
- #reset_context ⇒ Object
- #sanitize_value(value) ⇒ Object
- #self_closing_node?(node) ⇒ Boolean
- #separator_node?(node) ⇒ Boolean
- #short_token?(token, value) ⇒ Boolean
- #translate(doc) ⇒ Object
- #translate_tml(tml) ⇒ Object
- #translate_tree(node) ⇒ Object
- #valid_text_node?(node) ⇒ Boolean
Constructor Details
#initialize(context = {}, options = {}) ⇒ Dom
Returns a new instance of Dom.
# File 'lib/tml/tokenizers/dom.rb', line 41

def initialize(context = {}, options = {})
  self.context = context
  self.options = options
  reset_context
end
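Illustrative usage (not generated from the source): the context hash seeds the token map via #reset_context, and the options hash is read through #option.

require 'tml'

# context pre-registers token values; options configure the tokenizer
tokenizer = Tml::Tokenizers::Dom.new({ 'user' => 'Michael' }, { 'debug' => true })
tokenizer.tokens  # => { 'user' => 'Michael' }, re-seeded on every reset_context call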
Instance Attribute Details
#context ⇒ Object
Returns the value of attribute context.
# File 'lib/tml/tokenizers/dom.rb', line 39

def context
  @context
end
#options ⇒ Object
Returns the value of attribute options.
# File 'lib/tml/tokenizers/dom.rb', line 39

def options
  @options
end
#tokens ⇒ Object
Returns the value of attribute tokens.
# File 'lib/tml/tokenizers/dom.rb', line 39

def tokens
  @tokens
end
Instance Method Details
#adjust_name(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 313

def adjust_name(node)
  name = node.name.downcase
  map = option('name_mapping')
  map[name.to_sym] ? map[name.to_sym] : name
end
#between_separators?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 130

def between_separators?(node)
  (separator_node?(node.previous_sibling) and !valid_text_node?(node.next_sibling)) or
    (separator_node?(node.next_sibling) and !valid_text_node?(node.previous_sibling))
end
#container_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 202

def container_node?(node)
  node.type == 1 && !inline_node?(node)
end
#contextualize(name, context) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 319

def contextualize(name, context)
  if self.tokens[name] and self.tokens[name] != context
    index = 0
    matches = name.match(/\d+$/)
    if matches and matches.length > 0
      index = matches[matches.length - 1].to_i
      name = name.gsub(index.to_s, '')
    end
    name += (index + 1).to_s
    return contextualize(name, context)
  end

  self.tokens[name] = context
  name
end
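As the listing shows, reusing a token name with a different context appends a numeric suffix. An illustrative sketch:

require 'tml'

t = Tml::Tokenizers::Dom.new
t.contextualize('link', "<a href='/home'></a>")   # => "link"
t.contextualize('link', "<a href='/about'></a>")  # => "link1" (same name, different context)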
#debug(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 335

def debug(doc)
  self.doc = doc
  debug_tree(self.doc, 0)
end
#debug_translation(translation) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 160

def debug_translation(translation)
  option('debug_format').gsub('{$0}', translation)
end
#debug_tree(node, depth) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 340

def debug_tree(node, depth)
  padding = ('=' * (depth+1))

  Tml.logger.log(padding + '=> ' + node.name + ': ' + node_info(node))

  (node.children || []).each do |child|
    debug_tree(child, depth+1)
  end
end
#empty_string?(tml) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 164

def empty_string?(tml)
  tml = tml.gsub(/[\s\n\r\t]/, '')
  tml == ''
end
#generate_data_tokens(text) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 229

def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    matches = text.scan(option('data_tokens.special.regex'))
    matches.each do |match|
      token = match[1, match.length - 2]
      self.context[token] = match
      text = text.gsub(match, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    formats = option('data_tokens.date.formats')

    formats.each do |format|
      regex = format[0]
      # date_format = format[1]

      matches = text.scan(regex)
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          date = match.first
          token = self.contextualize(token_name, date)
          replacement = "{#{token}}"
          text = text.gsub(date, replacement)
        end
      end
    end
  end

  rules = option('data_tokens.rules')
  if rules
    rules.each do |rule|
      next unless rule[:enabled]
      matches = text.scan(rule[:regex])
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          value = match.first.strip

          unless value == ''
            token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
            text = text.gsub(value, value.gsub(value, "{#{token}}"))
          end
        end
      end
    end
  end

  text
end
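Which substitutions are applied is controlled entirely by the 'data_tokens.*' translator options. An illustrative sketch, assuming the default date rules are enabled and their regex matches the text:

require 'tml'

t = Tml::Tokenizers::Dom.new
t.generate_data_tokens('Published on January 5, 2015')
# => "Published on {date}"  (assumed default token name; the matched value stays in t.tokens)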
#generate_html_token(node, value = nil) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 282

def generate_html_token(node, value = nil)
  name = node.name.downcase
  attributes = node.attributes
  attributes_hash = {}
  value = (!value ? '{$0}' : value)

  if attributes.length == 0
    if self_closing_node?(node)
      return '<' + name + '/>' if %w(br hr).index(name)
      return '<' + name + '>' + '</' + name + '>'
    end
    return '<' + name + '>' + value + '</' + name + '>'
  end

  attributes.each do |name, attribute|
    attributes_hash[name] = attribute.value
  end

  keys = attributes_hash.keys.sort

  attr = []
  keys.each do |key|
    quote = attributes_hash[key].index("'") ? '"' : "'"
    attr << (key + '=' + quote + attributes_hash[key] + quote)
  end
  attr = attr.join(' ')

  return '<' + name + ' ' + attr + '>' + '</' + name + '>' if self_closing_node?(node)

  '<' + name + ' ' + attr + '>' + value + '</' + name + '>'
end
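An illustrative sketch derived from the listing above: attributes are sorted by name and single-quoted unless the value itself contains a single quote, and '{$0}' marks where the translated content is substituted.

require 'tml'
require 'nokogiri'

node = Nokogiri::HTML.fragment("<a href='/home' class='nav'>Home</a>").children.first
Tml::Tokenizers::Dom.new.generate_html_token(node)
# => "<a class='nav' href='/home'>{$0}</a>"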
#generate_tml_tags(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 135

def generate_tml_tags(node)
  buffer = ''
  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    else
      buffer += generate_tml_tags(child)
    end
  end

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return '{' + token + '}' if self_closing_node?(node)
  return '[' + token + ': ' + value + ']' if short_token?(token, value)

  '[' + token + ']' + value + '[/' + token + ']'
end
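An illustrative sketch, assuming the gem's default translator options (name mapping and short-node list) are in place:

require 'tml'
require 'nokogiri'

node = Nokogiri::HTML.fragment('<strong>Hello</strong>').children.first
Tml::Tokenizers::Dom.new.generate_tml_tags(node)
# => "[strong: Hello]"  (short token form, since the value is under 20 characters)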
#has_child_nodes?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 126

def has_child_nodes?(node)
  node.children and node.children.length > 0
end
#has_inline_or_text_siblings?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 182

def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.each do |child|
    unless child == node
      return true if inline_node?(child) || valid_text_node?(child)
    end
  end

  false
end
#ignored_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 210

def ignored_node?(node)
  return true if (node.type != 1)
  (option('nodes.ignored') || []).index(node.name.downcase)
end
#inline_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 194

def inline_node?(node)
  (
    node.type == 1 and
    (option('nodes.inline') || []).index(node.name.downcase) and
    !only_child?(node)
  )
end
#no_translate_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 88

def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes
  node.attributes.each do |name, attribute|
    return true if name == 'notranslate' or attribute.value.index('notranslate')
  end
  false
end
#node_info(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 350

def node_info(node)
  info = []
  info << node.type

  info << node.tagName if node.type == 1

  if inline_node?(node)
    info << 'inline'
    if has_inline_or_text_siblings?(node)
      info << 'sentence'
    else
      info << 'only translatable'
    end
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  return "[#{info.join(', ')}]: " + node.inner_text if node.type == 3

  "[#{info.join(', ')}]"
end
#non_translatable_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 96

def non_translatable_node?(node)
  return false unless node
  return true if node.type == 1 && (option('nodes.scripts') || []).index(node.name.downcase)
  return true if node.type == 1 && node.children.length === 0 && node.inner_text == ''
  return true if no_translate_node?(node)
  false
end
#only_child?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 177

def only_child?(node)
  return false unless node.parent
  node.parent.children.count == 1
end
#option(name) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 155

def option(name)
  value = Tml::Utils.hash_value(self.options, name)
  value || Tml.config.translator_option(name)
end
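An illustrative sketch: a value present in the tokenizer's own options wins, otherwise the configured translator option is used.

require 'tml'

t = Tml::Tokenizers::Dom.new({}, { 'debug' => true })
t.option('debug')         # => true (from the tokenizer's options)
t.option('debug_format')  # falls back to Tml.config.translator_option('debug_format')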
#reset_context ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 169

def reset_context
  self.tokens = {}.merge(self.context)
end
#sanitize_value(value) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 225

def sanitize_value(value)
  value.gsub(/^\s+/, '')
end
#self_closing_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 206

def self_closing_node?(node)
  !node.children || !node.children.first
end
#separator_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 220

def separator_node?(node)
  return false unless node
  node.type == 1 && (option('nodes.splitters') || []).index(node.name.downcase)
end
#short_token?(token, value) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 173

def short_token?(token, value)
  option('nodes.short').index(token.downcase) || value.length < 20
end
#translate(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 47

def translate(doc)
  translate_tree(doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc)
end
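translate accepts either a raw HTML string (parsed with Nokogiri::HTML.fragment, as shown above) or an already parsed node. An illustrative sketch, assuming a configured Tml environment; with 'debug' enabled, sentences are wrapped using the 'debug_format' option instead of being sent to the session for translation:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new({}, { 'debug' => true })
tokenizer.translate('<p>Hello <b>World</b></p>')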
#translate_tml(tml) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 104

def translate_tml(tml)
  return tml if empty_string?(tml)

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    sentences = Tml::Utils.split_sentences(tml)
    translation = tml
    sentences.each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      translation = translation.gsub(sentence, sentence_translation)
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end
#translate_tree(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 51

def translate_tree(node)
  if non_translatable_node?(node)
    return node.inner_html
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    elsif inline_node?(child) and has_inline_or_text_siblings?(child) and !between_separators?(child)
      buffer += generate_tml_tags(child)
    elsif separator_node?(child)
      html += translate_tml(buffer) if buffer != ''
      html += generate_html_token(child)
      buffer = ''
    else
      html += translate_tml(buffer) if buffer != ''

      container_value = translate_tree(child)
      if ignored_node?(child)
        html += container_value
      else
        html += generate_html_token(child, container_value)
      end

      buffer = ''
    end
  end

  html += translate_tml(buffer) if buffer != ''
  html
end
#valid_text_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 215

def valid_text_node?(node)
  return false unless node
  node.type == 3 && !empty_string?(node.inner_text)
end