Class: Tr8n::Tokenizers::Dom
- Inherits:
-
Object
- Object
- Tr8n::Tokenizers::Dom
- Defined in:
- lib/tr8n/tokenizers/dom.rb
Constant Summary collapse
# Matches an HTML character entity such as "&amp;": an ampersand, any run
# of non-semicolon characters, and the closing semicolon.
HTML_SPECIAL_CHAR_REGEX = /(&[^;]*;)/

# Matches a run of digits that fills a whole line, or that is delimited by
# a line boundary / whitespace on the left and a line boundary or one of
# ". , ; whitespace" on the right (the delimiters are part of the match).
INDEPENDENT_NUMBER_REGEX = /^(\d+)$|^(\d+[.,;\s])|(\s\d+)$|(\s\d+[,;\s])/

# Matches a verbose date: a short or full English month name, a day number,
# optional ", <number>" groups and an optional " at HH:MM UTC" tail.
# The escapes are single backslashes; the doubled form (\\s, \\d) matched a
# literal backslash followed by a letter and so never matched a date.
VERBOSE_DATE_REGEX = /(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(January|February|March|April|May|June|July|August|September|October|November|December))\s\d+(,\s\d+)*(,*\sat\s\d+:\d+(\sUTC))*)/
Instance Attribute Summary collapse
-
#context ⇒ Object
Returns the value of attribute context.
-
#options ⇒ Object
Returns the value of attribute options.
-
#tokens ⇒ Object
Returns the value of attribute tokens.
Instance Method Summary collapse
- #adjust_name(node) ⇒ Object
- #between_separators?(node) ⇒ Boolean
- #container_node?(node) ⇒ Boolean
- #contextualize(name, context) ⇒ Object
- #debug(doc) ⇒ Object
- #debug_translation(translation) ⇒ Object
- #debug_tree(node, depth) ⇒ Object
- #empty_string?(tml) ⇒ Boolean
- #generate_data_tokens(text) ⇒ Object
- #generate_html_token(node, value = nil) ⇒ Object
- #generate_tml_tags(node) ⇒ Object
- #has_child_nodes?(node) ⇒ Boolean
- #has_inline_or_text_siblings?(node) ⇒ Boolean
- #ignored_node?(node) ⇒ Boolean
-
#initialize(context = {}, options = {}) ⇒ Dom
constructor
A new instance of Dom.
- #inline_node?(node) ⇒ Boolean
- #node_info(node) ⇒ Object
- #non_translatable_node?(node) ⇒ Boolean
- #only_child?(node) ⇒ Boolean
- #option(name) ⇒ Object
- #replace_special_characters(text) ⇒ Object
- #reset_context ⇒ Object
- #sanitize_value(value) ⇒ Object
- #self_closing_node?(node) ⇒ Boolean
- #separator_node?(node) ⇒ Boolean
- #short_token?(token, value) ⇒ Boolean
- #translate(doc) ⇒ Object
- #translate_tml(tml) ⇒ Object
- #translate_tree(node) ⇒ Object
- #valid_text_node?(node) ⇒ Boolean
Constructor Details
#initialize(context = {}, options = {}) ⇒ Dom
Returns a new instance of Dom.
45 46 47 48 49 |
# File 'lib/tr8n/tokenizers/dom.rb', line 45
#
# Builds a tokenizer and seeds its token table from the given context.
#
# @param context [Hash] initial token values; reset_context merges them
#   into a fresh tokens hash
# @param options [Hash] per-instance options
def initialize(context = {}, options = {})
  # nil is treated as "nothing supplied" so reset_context can always merge.
  self.context = context || {}
  self.options = options || {}
  reset_context
end
Instance Attribute Details
#context ⇒ Object
Returns the value of attribute context.
43 44 45 |
# File 'lib/tr8n/tokenizers/dom.rb', line 43
#
# Returns the value of attribute context.
#
# @return [Object] the current value of @context
def context
  @context
end
#options ⇒ Object
Returns the value of attribute options.
43 44 45 |
# File 'lib/tr8n/tokenizers/dom.rb', line 43
#
# Returns the value of attribute options.
#
# @return [Object] the current value of @options
def options
  @options
end
#tokens ⇒ Object
Returns the value of attribute tokens.
43 44 45 |
# File 'lib/tr8n/tokenizers/dom.rb', line 43
#
# Returns the value of attribute tokens.
#
# @return [Object] the current value of @tokens
def tokens
  @tokens
end
Instance Method Details
#adjust_name(node) ⇒ Object
285 286 287 288 289 |
# File 'lib/tr8n/tokenizers/dom.rb', line 285
#
# Returns the token name for a node: its lowercased name, or the
# replacement configured for that name under the 'name_mapping' option.
#
# @param node [#name] node whose name is looked up
# @return [Object] the mapped value when the mapping has a truthy entry for
#   the name (Symbol key), otherwise the lowercased name
def adjust_name(node)
  name = node.name.downcase
  # A missing 'name_mapping' option is treated as an empty mapping.
  mapping = option('name_mapping') || {}
  mapping[name.to_sym] || name
end
#between_separators?(node) ⇒ Boolean
126 127 128 129 |
# File 'lib/tr8n/tokenizers/dom.rb', line 126
#
# Tells whether a node has a separator on one side and no valid text node
# on the other side.
#
# @param node [#previous_sibling, #next_sibling] node to inspect
# @return [Boolean] truthy when either (previous is a separator and next is
#   not valid text) or (next is a separator and previous is not valid text)
def between_separators?(node)
  before = node.previous_sibling
  after = node.next_sibling

  (separator_node?(before) && !valid_text_node?(after)) ||
    (separator_node?(after) && !valid_text_node?(before))
end
#container_node?(node) ⇒ Boolean
198 199 200 |
# File 'lib/tr8n/tokenizers/dom.rb', line 198
#
# Tells whether a node is an element that is not treated as inline.
#
# @param node [#type] node to inspect
# @return [Boolean] true when the node type is 1 (the element node type in
#   Nokogiri/DOM numbering) and inline_node? is false for it
def container_node?(node)
  node.type == 1 && !inline_node?(node)
end
#contextualize(name, context) ⇒ Object
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 |
# File 'lib/tr8n/tokenizers/dom.rb', line 291
#
# Registers +context+ in the tokens table under +name+, or under the first
# numbered variant of +name+ that is free or already holds the same context.
#
# Only the trailing run of digits is treated as the counter. Previously all
# occurrences of the counter's digits were removed from the name, so a name
# such as "col2x2" was rewritten to "colx3" instead of "col2x3".
#
# @param name [String] preferred token name
# @param context [Object] value to store for the token
# @return [String] the token name that now holds +context+
def contextualize(name, context)
  # Bump the numeric suffix while the name is taken by a different context.
  while tokens[name] && tokens[name] != context
    suffix = name[/\d+\z/]
    base = suffix ? name[0, name.length - suffix.length] : name
    name = "#{base}#{suffix.to_i + 1}"
  end

  tokens[name] = context
  name
end
#debug(doc) ⇒ Object
307 308 309 310 |
# File 'lib/tr8n/tokenizers/dom.rb', line 307
#
# Logs the structure of a document by walking it with debug_tree, starting
# at depth 0.
#
# The document is passed straight through: the class declares no +doc+
# attribute (only context, options and tokens), so assigning to self.doc
# raised NoMethodError.
#
# @param doc [Object] root node handed to debug_tree
# @return [Object] whatever debug_tree returns
def debug(doc)
  debug_tree(doc, 0)
end
#debug_translation(translation) ⇒ Object
156 157 158 |
# File 'lib/tr8n/tokenizers/dom.rb', line 156
#
# Wraps a translation in the template stored under the 'debug_format'
# option by substituting every "{$0}" placeholder.
#
# @param translation [String] text to insert
# @return [String] the debug template with the placeholder filled in
def debug_translation(translation)
  # Block form inserts the text verbatim; a replacement *string* would have
  # its backslash sequences (\0, \1, \\) interpreted by gsub.
  option('debug_format').gsub('{$0}') { translation }
end
#debug_tree(node, depth) ⇒ Object
312 313 314 315 316 317 318 319 320 |
# File 'lib/tr8n/tokenizers/dom.rb', line 312
#
# Logs one line for +node+ and then recurses into its children, one level
# deeper each time. The line is prefixed with depth + 1 "=" characters.
#
# NOTE(review): the listing concatenated the node object itself into the
# message, which raises TypeError for anything that is not a String. The
# node's class name is logged instead; confirm that this is the label that
# was intended.
#
# @param node [#children] node to describe
# @param depth [Integer] nesting level of +node+ (0 for the root)
# @return [Object] the result of iterating the children
def debug_tree(node, depth)
  padding = '=' * (depth + 1)
  Tr8n.logger.log("#{padding}=> #{node.class.name}: #{node_info(node)}")

  (node.children || []).each do |child|
    debug_tree(child, depth + 1)
  end
end
#empty_string?(tml) ⇒ Boolean
160 161 162 163 |
# File 'lib/tr8n/tokenizers/dom.rb', line 160
#
# Tells whether a string contains nothing but whitespace.
#
# @param tml [String, nil] text to check
# @return [Boolean] true for nil, "" and whitespace-only strings
def empty_string?(tml)
  # "No non-whitespace character" is the same test as stripping every \s
  # and comparing with "", without building a throwaway copy.
  tml.nil? || tml !~ /\S/
end
#generate_data_tokens(text) ⇒ Object
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 |
# File 'lib/tr8n/tokenizers/dom.rb', line 238
#
# Replaces stand-alone numbers in +text+ with data tokens such as "{num}".
#
# Does nothing unless the 'data_tokens.numeric' option is truthy. Each match
# of INDEPENDENT_NUMBER_REGEX is stripped of its delimiters, registered via
# contextualize under the name from 'data_tokens.numeric_name' (with the
# integer value as context), and the digits are swapped for "{token}" while
# the surrounding delimiters are kept.
#
# The listing iterated a MatchData with +each+ and called String#replace
# with two arguments, both of which raise; gsub with a block performs the
# per-match rewrite directly.
#
# @param text [String] text to scan
# @return [String] the text with numbers replaced, or +text+ itself when
#   numeric tokens are disabled
def generate_data_tokens(text)
  return text unless option('data_tokens.numeric')

  token_name = option('data_tokens.numeric_name')

  text.gsub(INDEPENDENT_NUMBER_REGEX) do |match|
    value = match.gsub(/[.,;\s]/, '')
    token = contextualize(token_name, value.to_i)
    match.sub(value, "{#{token}}")
  end
end
#generate_html_token(node, value = nil) ⇒ Object
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 |
# File 'lib/tr8n/tokenizers/dom.rb', line 254
#
# Renders the HTML for a node, with +value+ as its content.
#
# Attributes are emitted in sorted name order. A value is wrapped in single
# quotes unless it contains a single quote, in which case double quotes are
# used. Nodes for which self_closing_node? is true are rendered without
# content; without attributes, "br" and "hr" are rendered as "<br/>" and
# "<hr/>".
#
# @param node [#name, #attributes] node to render
# @param value [String, nil] content to place inside the tag; defaults to
#   the "{$0}" placeholder
# @return [String] the rendered tag
def generate_html_token(node, value = nil)
  name = node.name.downcase
  value ||= '{$0}'
  childless = self_closing_node?(node)
  attributes = node.attributes

  if attributes.length == 0
    return "<#{name}/>" if childless && %w[br hr].include?(name)
    return "<#{name}></#{name}>" if childless
    return "<#{name}>#{value}</#{name}>"
  end

  # Distinct local names: the listing reused `name` as a block parameter.
  attr = attributes.keys.sort.map do |key|
    attr_value = attributes[key].value
    quote = attr_value.include?("'") ? '"' : "'"
    "#{key}=#{quote}#{attr_value}#{quote}"
  end.join(' ')

  return "<#{name} #{attr}></#{name}>" if childless
  "<#{name} #{attr}>#{value}</#{name}>"
end
#generate_tml_tags(node) ⇒ Object
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/tr8n/tokenizers/dom.rb', line 131
#
# Converts a node and its descendants into a TML fragment.
#
# Text children (node type 3) contribute their text; every other child is
# converted recursively. The node's own HTML (from generate_html_token) is
# registered as the context of a token named after the node, and the result
# takes one of three forms:
#   "{token}"                 when self_closing_node? is true
#   "[token: value]"          when short_token? is true
#   "[token]value[/token]"    otherwise
#
# @param node [#children] node to convert
# @return [String] the TML fragment
def generate_tml_tags(node)
  # Children are processed first, so their tokens are registered before
  # the token of the node itself.
  buffer = node.children.map do |child|
    child.type == 3 ? child.inner_text : generate_tml_tags(child)
  end.join

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return "{#{token}}" if self_closing_node?(node)
  return "[#{token}: #{value}]" if short_token?(token, value)

  "[#{token}]#{value}[/#{token}]"
end
#has_child_nodes?(node) ⇒ Boolean
122 123 124 |
# File 'lib/tr8n/tokenizers/dom.rb', line 122
#
# Tells whether a node has at least one child.
#
# @param node [#children] node to inspect
# @return [Boolean] true when +children+ is present and non-empty; always
#   true or false (the listing could return nil)
def has_child_nodes?(node)
  children = node.children
  !children.nil? && children.length > 0
end
#has_inline_or_text_siblings?(node) ⇒ Boolean
178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/tr8n/tokenizers/dom.rb', line 178
#
# Tells whether any sibling of a node is an inline node or a valid text node.
#
# @param node [#parent] node whose siblings are inspected
# @return [Boolean] false when the node has no parent; otherwise true when
#   at least one other child of the parent satisfies inline_node? or
#   valid_text_node?
def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.any? do |child|
    child != node && (inline_node?(child) || valid_text_node?(child))
  end
end
#ignored_node?(node) ⇒ Boolean
206 207 208 209 |
# File 'lib/tr8n/tokenizers/dom.rb', line 206
#
# Tells whether a node is ignored.
#
# @param node [#type, #name] node to inspect
# @return [Boolean] true for every node whose type is not 1 (the element
#   node type in Nokogiri/DOM numbering), and for elements whose lowercased
#   name is listed under the 'nodes.ignored' option; false otherwise (the
#   listing returned an array index or nil)
def ignored_node?(node)
  return true if node.type != 1

  (option('nodes.ignored') || []).include?(node.name.downcase)
end
#inline_node?(node) ⇒ Boolean
190 191 192 193 194 195 196 |
# File 'lib/tr8n/tokenizers/dom.rb', line 190
#
# Tells whether a node is treated as inline.
#
# @param node [#type, #name] node to inspect
# @return [Boolean] true when the node type is 1 (the element node type in
#   Nokogiri/DOM numbering), its lowercased name is listed under the
#   'nodes.inline' option, and it is not the only child of its parent
def inline_node?(node)
  node.type == 1 &&
    (option('nodes.inline') || []).include?(node.name.downcase) &&
    !only_child?(node)
end
#node_info(node) ⇒ Object
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 |
# File 'lib/tr8n/tokenizers/dom.rb', line 322
#
# Builds a one-line description of a node for debug output.
#
# The description lists, in order: the node type; the node name for type 1
# nodes; "inline" plus "sentence" or "only translatable" for inline nodes
# (depending on has_inline_or_text_siblings?); "self closing"; "only child".
# For type 3 nodes the node text is appended after the bracketed list.
#
# The name is read with node.name, as everywhere else in this class, not
# with tagName.
#
# @param node [#type, #name, #inner_text] node to describe
# @return [String] "[a, b, ...]" or "[a, b, ...]: text"
def node_info(node)
  info = [node.type]
  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  node.type == 3 ? "#{summary}: #{node.inner_text}" : summary
end
#non_translatable_node?(node) ⇒ Boolean
93 94 95 96 97 98 |
# File 'lib/tr8n/tokenizers/dom.rb', line 93
#
# Tells whether a node's content must be passed through untranslated.
#
# @param node [#type, #name, #children, #inner_text, nil] node to inspect
# @return [Boolean] false for nil and for nodes whose type is not 1 (the
#   element node type in Nokogiri/DOM numbering); true for elements whose
#   lowercased name is listed under the 'nodes.scripts' option, and for
#   elements with no children and empty text
def non_translatable_node?(node)
  return false unless node
  return false unless node.type == 1
  return true if (option('nodes.scripts') || []).include?(node.name.downcase)

  node.children.length == 0 && node.inner_text == ''
end
#only_child?(node) ⇒ Boolean
173 174 175 176 |
# File 'lib/tr8n/tokenizers/dom.rb', line 173
#
# Tells whether a node is the single child of its parent.
#
# @param node [#parent] node to inspect
# @return [Boolean] false when the node has no parent; otherwise true when
#   the parent has exactly one child
def only_child?(node)
  parent = node.parent
  return false unless parent

  # length reads the size directly; count would walk the collection.
  parent.children.length == 1
end
#option(name) ⇒ Object
151 152 153 154 |
# File 'lib/tr8n/tokenizers/dom.rb', line 151
#
# Looks up an option by name: first in this instance's options (through
# Tr8n::Utils.hash_value), then in the translator options of Tr8n.config.
#
# The fallback is used only when the instance has no value (nil). With
# `value || default`, an explicit +false+ in the instance options was
# silently replaced by the configured default.
#
# @param name [String] option name, e.g. 'nodes.inline'
# @return [Object] the instance value, or the configured value
def option(name)
  value = Tr8n::Utils.hash_value(options, name)
  value.nil? ? Tr8n.config.translator_option(name) : value
end
#replace_special_characters(text) ⇒ Object
225 226 227 228 229 230 231 232 233 234 235 236 |
# File 'lib/tr8n/tokenizers/dom.rb', line 225
#
# Replaces HTML entities in +text+ with data tokens, e.g. "&amp;" becomes
# "{amp}", and records each entity in the context under its token name.
#
# Does nothing unless the 'data_tokens.special' option is truthy, matching
# how generate_data_tokens treats 'data_tokens.numeric'. The listing
# returned early when the option was *set*, iterated a MatchData with
# +each+ (which raises, also for nil when nothing matched), and had lost the
# slice bounds that strip the leading "&" and trailing ";".
#
# @param text [String] text to scan
# @return [String] the text with entities replaced, or +text+ itself when
#   special-character tokens are disabled
def replace_special_characters(text)
  return text unless option('data_tokens.special')

  text.gsub(HTML_SPECIAL_CHAR_REGEX) do |entity|
    token = entity[1...-1] # drop the leading "&" and the trailing ";"
    context[token] = entity
    "{#{token}}"
  end
end
#reset_context ⇒ Object
165 166 167 |
# File 'lib/tr8n/tokenizers/dom.rb', line 165
#
# Replaces the tokens table with a fresh hash holding a copy of the
# context entries, discarding tokens registered since the last reset.
#
# @return [Hash] the new tokens hash
def reset_context
  # Merging into a new hash keeps later token writes out of the context.
  self.tokens = {}.merge(context || {})
end
#sanitize_value(value) ⇒ Object
221 222 223 |
# File 'lib/tr8n/tokenizers/dom.rb', line 221
#
# Removes leading whitespace from a value.
#
# The pattern is anchored with \A (start of string). With ^, which matches
# at the start of every line, gsub also removed the indentation and blank
# lines inside multi-line values, gluing "Hello\n  world" into
# "Hello\nworld".
#
# @param value [String] text to clean
# @return [String] the text without leading whitespace
def sanitize_value(value)
  value.sub(/\A\s+/, '')
end
#self_closing_node?(node) ⇒ Boolean
202 203 204 |
# File 'lib/tr8n/tokenizers/dom.rb', line 202
#
# Tells whether a node has no content of its own.
#
# @param node [#children] node to inspect
# @return [Boolean] true when +children+ is nil/false or has no first
#   element
def self_closing_node?(node)
  children = node.children
  !children || !children.first
end
#separator_node?(node) ⇒ Boolean
216 217 218 219 |
# File 'lib/tr8n/tokenizers/dom.rb', line 216
#
# Tells whether a node splits the surrounding text into separate segments.
#
# @param node [#type, #name, nil] node to inspect
# @return [Boolean] false for nil; otherwise true when the node type is 1
#   (the element node type in Nokogiri/DOM numbering) and its lowercased
#   name is listed under the 'nodes.splitters' option (the listing returned
#   an array index or nil)
def separator_node?(node)
  return false unless node

  node.type == 1 && (option('nodes.splitters') || []).include?(node.name.downcase)
end
#short_token?(token, value) ⇒ Boolean
169 170 171 |
# File 'lib/tr8n/tokenizers/dom.rb', line 169
#
# Tells whether a token should be written in the short "[token: value]"
# form.
#
# @param token [String] token name
# @param value [String] token content
# @return [Boolean] true when the lowercased token is listed under the
#   'nodes.short' option, or when the value is shorter than 20 characters
def short_token?(token, value)
  # A missing 'nodes.short' option is treated as an empty list, like the
  # other 'nodes.*' lookups in this class.
  (option('nodes.short') || []).include?(token.downcase) || value.length < 20
end
#translate(doc) ⇒ Object
51 52 53 |
# File 'lib/tr8n/tokenizers/dom.rb', line 51
#
# Translates a document.
#
# @param doc [String, Object] HTML source, which is parsed with
#   Nokogiri::HTML.fragment, or an already parsed node, which is used as is
# @return [Object] the result of translate_tree for the document root
def translate(doc)
  doc = Nokogiri::HTML.fragment(doc) if doc.is_a?(String)
  translate_tree(doc)
end
#translate_tml(tml) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/tr8n/tokenizers/dom.rb', line 100
#
# Translates one TML fragment and resets the token table afterwards.
#
# Whitespace-only input is returned untouched. Otherwise numbers are first
# turned into data tokens (generate_data_tokens). With the
# 'split_sentences' option, every sentence from Tr8n::Utils.split_sentences
# is translated separately and substituted back into the text; without it,
# newlines are removed, whitespace runs are collapsed, the text is stripped
# and translated as a whole. With the 'debug' option, debug_translation is
# used in place of the language translation.
#
# NOTE(review): the sentence branch translates with
# Tr8n.session.current_language and the whole-text branch with
# Tr8n.session.target_language. Confirm whether both are meant to use the
# same language.
#
# @param tml [String] TML fragment
# @return [String] the translated fragment
def translate_tml(tml)
  return tml if empty_string?(tml)

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    translation = tml
    Tr8n::Utils.split_sentences(tml).each do |sentence|
      sentence_translation =
        if option('debug')
          debug_translation(sentence)
        else
          Tr8n.session.current_language.translate(sentence, tokens, options)
        end
      # Block form inserts the translation verbatim; a replacement string
      # would have its backslash sequences interpreted by gsub.
      translation = translation.gsub(sentence) { sentence_translation }
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip
  translation =
    if option('debug')
      debug_translation(tml)
    else
      Tr8n.session.target_language.translate(tml, tokens, options)
    end
  reset_context
  translation
end
#translate_tree(node) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/tr8n/tokenizers/dom.rb', line 55
#
# Translates a node and everything below it, returning the resulting HTML.
#
# Non-translatable nodes yield the text of their single child, or "" when
# they do not have exactly one child. Text nodes (type 3) are translated
# directly. For any other node the children are walked in order:
# * text and inline children (inline, with inline/text siblings, and not
#   between separators) are accumulated into one TML buffer;
# * a separator child flushes the buffer and is emitted as HTML;
# * any other child flushes the buffer and is translated recursively; its
#   result is emitted bare for ignored nodes, otherwise wrapped in the
#   child's own HTML tag.
# "Flushing" translates the buffer with translate_tml and appends it.
#
# @param node [Object] node to translate
# @return [String] translated HTML
def translate_tree(node)
  if non_translatable_node?(node)
    return node.children.first.inner_text if node.children.length == 1
    return ''
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''.dup
  buffer = ''.dup

  # Translates and emits the pending text run, then starts a new one.
  flush = lambda do
    html << translate_tml(buffer) unless buffer.empty?
    buffer = ''.dup
  end

  node.children.each do |child|
    if child.type == 3
      buffer << child.inner_text
    elsif inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      buffer << generate_tml_tags(child)
    elsif separator_node?(child)
      flush.call
      html << generate_html_token(child)
    else
      flush.call
      container_value = translate_tree(child)
      html << (ignored_node?(child) ? container_value : generate_html_token(child, container_value))
    end
  end

  flush.call
  html
end
#valid_text_node?(node) ⇒ Boolean
211 212 213 214 |
# File 'lib/tr8n/tokenizers/dom.rb', line 211
#
# Tells whether a node is a text node with visible content.
#
# @param node [#type, #inner_text, nil] node to inspect
# @return [Boolean] false for nil; otherwise true when the node type is 3
#   (the text node type in Nokogiri/DOM numbering) and its text is not
#   empty according to empty_string?
def valid_text_node?(node)
  return false unless node

  node.type == 3 && !empty_string?(node.inner_text)
end