Class: Tr8n::Tokenizers::Dom

Inherits:
Object
  • Object
show all
Defined in:
lib/tr8n/tokenizers/dom.rb

Constant Summary collapse

# Matches one HTML character entity: an ampersand, any run of non-semicolon
# characters, and the closing semicolon (e.g. "&nbsp;").
HTML_SPECIAL_CHAR_REGEX =
/(&[^;]*;)/
# Matches a number that stands on its own: the whole line, at the start of a
# line followed by a delimiter, at the end of a line preceded by whitespace,
# or surrounded by whitespace/delimiters.
INDEPENDENT_NUMBER_REGEX =
/^(\d+)$|^(\d+[.,;\s])|(\s\d+)$|(\s\d+[,;\s])/
# Matches verbose dates such as "Jan 5, 2014 at 10:30 UTC" or "December 25".
# The escapes are single backslashes: in a regexp literal "\\s" means a
# literal backslash followed by "s", which made the pattern unmatchable.
VERBOSE_DATE_REGEX =
/(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(January|February|March|April|May|June|July|August|September|October|November|December))\s\d+(,\s\d+)*(,*\sat\s\d+:\d+(\sUTC))*)/

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(context = {}, options = {}) ⇒ Dom

Returns a new instance of Dom.



45
46
47
48
49
# File 'lib/tr8n/tokenizers/dom.rb', line 45

# Stores the initial token context and the tokenizer options, then seeds the
# working token set from that context via reset_context.
#
# @param context [Hash] initial token values (defaults to an empty hash)
# @param options [Hash] tokenizer options (defaults to an empty hash)
def initialize(context = {}, options = {})
  self.context, self.options = context, options
  reset_context
end

Instance Attribute Details

#context ⇒ Object

Returns the value of attribute context.



43
44
45
# File 'lib/tr8n/tokenizers/dom.rb', line 43

# Reader for the context attribute.
#
# @return [Object] whatever is currently stored in @context
def context
  @context
end

#options ⇒ Object

Returns the value of attribute options.



43
44
45
# File 'lib/tr8n/tokenizers/dom.rb', line 43

# Reader for the options attribute.
#
# @return [Object] whatever is currently stored in @options
def options
  @options
end

#tokens ⇒ Object

Returns the value of attribute tokens.



43
44
45
# File 'lib/tr8n/tokenizers/dom.rb', line 43

# Reader for the tokens attribute.
#
# @return [Object] whatever is currently stored in @tokens
def tokens
  @tokens
end

Instance Method Details

#adjust_name(node) ⇒ Object



285
286
287
288
289
# File 'lib/tr8n/tokenizers/dom.rb', line 285

# Resolves the token name to use for a node.
#
# The node name is lower-cased and looked up, as a Symbol, in the
# 'name_mapping' option; a mapped value wins over the raw name.
#
# @param node [#name] node whose name should be mapped
# @return [Object] the mapped name when one exists, otherwise the
#   lower-cased node name
def adjust_name(node)
  name = node.name.downcase
  # A missing 'name_mapping' option is treated as "no mappings" rather than
  # raising NoMethodError on nil.
  mapping = option('name_mapping') || {}
  mapping[name.to_sym] || name
end

#between_separators?(node) ⇒ Boolean

Returns:

  • (Boolean)


126
127
128
129
# File 'lib/tr8n/tokenizers/dom.rb', line 126

# Tells whether a node is fenced off by a separator on one side while the
# other side holds no valid text node.
#
# @param node [#previous_sibling, #next_sibling] node to inspect
# @return [Boolean, nil] truthy when either sibling is a separator and the
#   opposite sibling is not a valid text node
def between_separators?(node)
  preceding = node.previous_sibling
  following = node.next_sibling

  (separator_node?(preceding) && !valid_text_node?(following)) ||
    (separator_node?(following) && !valid_text_node?(preceding))
end

#container_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


198
199
200
# File 'lib/tr8n/tokenizers/dom.rb', line 198

# A container is a type-1 node that inline_node? does not classify as inline.
#
# @param node [#type] node to inspect
# @return [Boolean]
def container_node?(node)
  return false unless node.type == 1

  !inline_node?(node)
end

#contextualize(name, context) ⇒ Object



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/tr8n/tokenizers/dom.rb', line 291

# Registers a token value under a name that is unique for that value.
#
# If the name is free, or already holds the same value, it is used as is.
# Otherwise the trailing number of the name is incremented (a name without
# one gets "1") until a usable name is found.
#
# Only the trailing digits are replaced; digits elsewhere in the name are
# kept, so "a1b1" becomes "a1b2" rather than "ab2".
#
# @param name [String] preferred token name
# @param context [Object] value to store for the token
# @return [String] the name the value was stored under
def contextualize(name, context)
  while tokens[name] && tokens[name] != context
    suffix = name[/\d+\z/]
    base = suffix ? name[0, name.length - suffix.length] : name
    # nil.to_i is 0, so a name without a numeric suffix starts at 1.
    name = base + (suffix.to_i + 1).to_s
  end

  tokens[name] = context
  name
end

#debug(doc) ⇒ Object



307
308
309
310
# File 'lib/tr8n/tokenizers/dom.rb', line 307

# Logs a description of every node in the given tree, starting at depth 0.
#
# The document is passed straight to debug_tree; the class exposes no doc
# writer (its attributes are context, options and tokens), so assigning
# self.doc raised NoMethodError before anything was logged.
#
# @param doc [Object] root node of the tree to dump
# @return [Object] whatever debug_tree returns
def debug(doc)
  debug_tree(doc, 0)
end

#debug_translation(translation) ⇒ Object



156
157
158
# File 'lib/tr8n/tokenizers/dom.rb', line 156

# Wraps a translation in the configured 'debug_format' template by
# substituting it for every "{$0}" placeholder.
#
# @param translation [String] text to embed
# @return [String] the debug format with the placeholder filled in
def debug_translation(translation)
  # Block form: the translation is inserted literally. As a replacement
  # string, sequences like "\0" or "\\" inside it would be interpreted.
  option('debug_format').gsub('{$0}') { translation }
end

#debug_tree(node, depth) ⇒ Object



312
313
314
315
316
317
318
319
320
# File 'lib/tr8n/tokenizers/dom.rb', line 312

# Logs one line for the node, then recurses into its children.
#
# Each line is prefixed with "=" repeated depth + 1 times, followed by
# "=> ", the node's class, and the summary from node_info. The class is
# interpolated because concatenating the node itself onto a String raises
# TypeError.
#
# @param node [#children] node to describe
# @param depth [Integer] nesting level, 0 for the root
def debug_tree(node, depth)
  padding = '=' * (depth + 1)

  Tr8n.logger.log("#{padding}=> #{node.class}: #{node_info(node)}")

  (node.children || []).each do |child|
    debug_tree(child, depth + 1)
  end
end

#empty_string?(tml) ⇒ Boolean

Returns:

  • (Boolean)


160
161
162
163
# File 'lib/tr8n/tokenizers/dom.rb', line 160

# Tells whether a string has no characters left once all whitespace
# (spaces, newlines, carriage returns, tabs) is removed.
#
# @param tml [String] text to test
# @return [Boolean]
def empty_string?(tml)
  tml.gsub(/[\s\n\r\t]/, '').empty?
end

#generate_data_tokens(text) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/tr8n/tokenizers/dom.rb', line 238

# Replaces stand-alone numbers in the text with data tokens.
#
# Does nothing unless the 'data_tokens.numeric' option is set. Every match
# of INDEPENDENT_NUMBER_REGEX has its digits registered through
# contextualize (as an Integer, under the 'data_tokens.numeric_name' token
# name) and swapped for "{token}"; surrounding whitespace and delimiters
# captured by the match are kept.
#
# The previous version called each on the MatchData returned by
# String#match and String#replace with two arguments, so any text that
# contained a number raised instead of being tokenized.
#
# @param text [String] text to scan
# @return [String] text with numbers replaced by tokens
def generate_data_tokens(text)
  return text unless option('data_tokens.numeric')

  token_name = option('data_tokens.numeric_name')

  text.gsub(INDEPENDENT_NUMBER_REGEX) do |match|
    value = match.gsub(/[.,;\s]/, '')
    token = contextualize(token_name, value.to_i)
    # Block form keeps the token text literal in the replacement.
    match.sub(value) { "{#{token}}" }
  end
end

#generate_html_token(node, value = nil) ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# File 'lib/tr8n/tokenizers/dom.rb', line 254

# Builds the HTML template string for a node.
#
# The tag name is lower-cased, attributes are emitted sorted by name, and
# each attribute value is wrapped in single quotes unless it contains one,
# in which case double quotes are used. Nodes with children wrap +value+
# (default "{$0}"); nodes without children get no body.
#
# Childless br/hr nodes are always written as "<br/>" / "<hr/>", including
# when they carry attributes. Previously that case produced
# "<br ...></br>"; HTML parsers read the stray "</br>" as a second line
# break.
#
# @param node [#name, #attributes] node to render; attributes is iterated
#   as name/attribute pairs whose attribute responds to #value
# @param value [String, nil] inner content, "{$0}" when nil
# @return [String] the rendered tag
def generate_html_token(node, value = nil)
  tag = node.name.downcase
  value ||= '{$0}'

  pairs = node.attributes.map { |attr_name, attribute| [attr_name, attribute.value] }
  rendered = pairs.sort_by(&:first).map do |key, attr_value|
    quote = attr_value.include?("'") ? '"' : "'"
    " #{key}=#{quote}#{attr_value}#{quote}"
  end.join
  opening = "<#{tag}#{rendered}"

  if self_closing_node?(node)
    return "#{opening}/>" if %w[br hr].include?(tag)
    return "#{opening}></#{tag}>"
  end

  "#{opening}>#{value}</#{tag}>"
end

#generate_tml_tags(node) ⇒ Object



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/tr8n/tokenizers/dom.rb', line 131

# Converts an inline node and its descendants into TML markup.
#
# Children are processed first: type-3 nodes contribute their text, any
# other child is converted recursively. The node's own HTML template is
# then registered as a token, and the result takes one of three shapes:
# "{token}" for a childless node, "[token: value]" when short_token?
# approves, otherwise "[token]value[/token]".
#
# @param node [#children] node to convert
# @return [String] TML for the node
def generate_tml_tags(node)
  inner = node.children.map do |child|
    child.type == 3 ? child.inner_text : generate_tml_tags(child)
  end.join

  token = contextualize(adjust_name(node), generate_html_token(node))
  value = sanitize_value(inner)

  if self_closing_node?(node)
    "{#{token}}"
  elsif short_token?(token, value)
    "[#{token}: #{value}]"
  else
    "[#{token}]#{value}[/#{token}]"
  end
end

#has_child_nodes?(node) ⇒ Boolean

Returns:

  • (Boolean)


122
123
124
# File 'lib/tr8n/tokenizers/dom.rb', line 122

# Tells whether a node has at least one child.
#
# @param node [#children] node to inspect
# @return [Boolean, nil] nil/false when children is nil/false, otherwise
#   whether the collection is non-empty
def has_child_nodes?(node)
  kids = node.children
  return kids unless kids

  kids.length > 0
end

#has_inline_or_text_siblings?(node) ⇒ Boolean

Returns:

  • (Boolean)


178
179
180
181
182
183
184
185
186
187
188
# File 'lib/tr8n/tokenizers/dom.rb', line 178

# Tells whether any sibling of the node is an inline node or a valid text
# node. A node without a parent has no siblings.
#
# @param node [#parent] node to inspect
# @return [Boolean]
def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.any? do |sibling|
    sibling != node && (inline_node?(sibling) || valid_text_node?(sibling))
  end
end

#ignored_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


206
207
208
209
# File 'lib/tr8n/tokenizers/dom.rb', line 206

# Tells whether a node is skipped when HTML is re-emitted: anything that is
# not a type-1 node, or a type-1 node whose lower-cased name appears in the
# 'nodes.ignored' option.
#
# @param node [#type, #name] node to inspect
# @return [true, Integer, nil] true for non type-1 nodes, otherwise the
#   name's position in the ignored list (nil when absent)
def ignored_node?(node)
  return true unless node.type == 1

  ignored = option('nodes.ignored') || []
  ignored.index(node.name.downcase)
end

#inline_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


190
191
192
193
194
195
196
# File 'lib/tr8n/tokenizers/dom.rb', line 190

# Tells whether a node counts as inline: it must be a type-1 node, its
# lower-cased name must be listed in the 'nodes.inline' option, and it must
# not be the only child of its parent.
#
# @param node [#type, #name] node to inspect
# @return [Boolean, nil] truthy only when all three conditions hold
def inline_node?(node)
  node.type == 1 &&
    (option('nodes.inline') || []).index(node.name.downcase) &&
    !only_child?(node)
end

#node_info(node) ⇒ Object



322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
# File 'lib/tr8n/tokenizers/dom.rb', line 322

# Builds a one-line summary of a node for debug output.
#
# The summary lists the node type, the node name for type-1 nodes, and the
# flags "inline" (plus "sentence" or "only translatable" depending on its
# siblings), "self closing" and "only child". For type-3 nodes the text is
# appended after the bracketed list.
#
# The name is read with node.name, the accessor every other helper here
# uses; the previous node.tagName call is not used anywhere else.
#
# @param node [#type] node to describe
# @return [String] e.g. "[1, b, inline, sentence]" or "[3]: some text"
def node_info(node)
  info = [node.type]
  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  node.type == 3 ? "#{summary}: #{node.inner_text}" : summary
end

#non_translatable_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


93
94
95
96
97
98
# File 'lib/tr8n/tokenizers/dom.rb', line 93

# Tells whether a node must be passed over by the translator: a type-1 node
# whose lower-cased name is listed in the 'nodes.scripts' option, or a
# type-1 node with no children and empty text.
#
# @param node [Object, nil] node to inspect; nil is never non-translatable
# @return [Boolean]
def non_translatable_node?(node)
  return false unless node
  return false unless node.type == 1

  script_names = option('nodes.scripts') || []
  return true if script_names.index(node.name.downcase)

  node.children.length.zero? && node.inner_text == ''
end

#only_child?(node) ⇒ Boolean

Returns:

  • (Boolean)


173
174
175
176
# File 'lib/tr8n/tokenizers/dom.rb', line 173

# Tells whether a node is the sole child of its parent. A node without a
# parent is never an only child.
#
# @param node [#parent] node to inspect
# @return [Boolean]
def only_child?(node)
  parent = node.parent
  parent ? parent.children.count == 1 : false
end

#option(name) ⇒ Object



151
152
153
154
# File 'lib/tr8n/tokenizers/dom.rb', line 151

# Looks up a tokenizer option by name.
#
# The instance options are consulted first through Tr8n::Utils.hash_value;
# when that yields nil or false the value comes from
# Tr8n.config.translator_option instead.
# NOTE(review): because of the `||`, an explicit false in the instance
# options cannot override the configured default — confirm this is intended.
#
# @param name [String] option key, e.g. 'nodes.inline'
# @return [Object] the resolved option value
def option(name)
  Tr8n::Utils.hash_value(options, name) || Tr8n.config.translator_option(name)
end

#replace_special_characters(text) ⇒ Object



225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/tr8n/tokenizers/dom.rb', line 225

# Replaces HTML character entities in the text with data tokens.
#
# Does nothing unless the 'data_tokens.special' option is set, matching
# how generate_data_tokens treats 'data_tokens.numeric' (the guard used to
# be inverted). Every match of HTML_SPECIAL_CHAR_REGEX is stored in the
# context under the entity name without "&" and ";" and replaced by
# "{name}"; a bare "&;" has no name and is left as is.
#
# The previous version iterated the MatchData (or nil) returned by
# String#match and sliced with a negative length, which yields nil, so it
# could not produce a token.
#
# @param text [String] text to scan
# @return [String] text with entities replaced by tokens
def replace_special_characters(text)
  return text unless option('data_tokens.special')

  text.gsub(HTML_SPECIAL_CHAR_REGEX) do |entity|
    token = entity[1..-2]
    next entity if token.empty?

    context[token] = entity
    "{#{token}}"
  end
end

#reset_context ⇒ Object



165
166
167
# File 'lib/tr8n/tokenizers/dom.rb', line 165

# Replaces the working token set with a fresh Hash holding the entries of
# the context, so later token changes do not touch the context itself.
#
# @return [Hash] the new token hash
def reset_context
  fresh = {}
  fresh.update(self.context)
  self.tokens = fresh
end

#sanitize_value(value) ⇒ Object



221
222
223
# File 'lib/tr8n/tokenizers/dom.rb', line 221

# Removes leading whitespace from a token value.
#
# The pattern is anchored with "^", which in Ruby matches at the start of
# every line, so whitespace at the start of each line is removed — not
# only at the start of the string.
#
# @param value [String] raw token content
# @return [String] a copy without the leading whitespace runs
def sanitize_value(value)
  leading_whitespace = /^\s+/
  value.gsub(leading_whitespace, '')
end

#self_closing_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


202
203
204
# File 'lib/tr8n/tokenizers/dom.rb', line 202

# Tells whether a node has no content: its children collection is missing
# or has no first element.
#
# @param node [#children] node to inspect
# @return [Boolean]
def self_closing_node?(node)
  !(node.children && node.children.first)
end

#separator_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


216
217
218
219
# File 'lib/tr8n/tokenizers/dom.rb', line 216

# Tells whether a node splits the surrounding text: a type-1 node whose
# lower-cased name is listed in the 'nodes.splitters' option.
#
# @param node [Object, nil] node to inspect; nil is never a separator
# @return [false, Integer, nil] false for nil or non type-1 nodes,
#   otherwise the name's position in the splitter list (nil when absent)
def separator_node?(node)
  return false unless node && node.type == 1

  splitters = option('nodes.splitters') || []
  splitters.index(node.name.downcase)
end

#short_token?(token, value) ⇒ Boolean

Returns:

  • (Boolean)


169
170
171
# File 'lib/tr8n/tokenizers/dom.rb', line 169

# Decides whether a token is written in the short "[token: value]" form:
# either its lower-cased name is listed in the 'nodes.short' option or the
# value is shorter than 20 characters.
#
# @param token [String] token name
# @param value [String] token content
# @return [Integer, Boolean] truthy when the short form should be used
def short_token?(token, value)
  # A missing 'nodes.short' option means "no forced short tokens", as with
  # the other node lists, instead of raising NoMethodError on nil.
  short_names = option('nodes.short') || []
  short_names.index(token.downcase) || value.length < 20
end

#translate(doc) ⇒ Object



51
52
53
# File 'lib/tr8n/tokenizers/dom.rb', line 51

# Translates a document.
#
# A String is first parsed with Nokogiri::HTML.fragment; anything else is
# handed to translate_tree unchanged.
#
# @param doc [String, Object] HTML source or an already parsed node tree
# @return [Object] the result of translate_tree
def translate(doc)
  tree = doc
  tree = Nokogiri::HTML.fragment(doc) if doc.is_a?(String)
  translate_tree(tree)
end

#translate_tml(tml) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/tr8n/tokenizers/dom.rb', line 100

# Translates one TML fragment.
#
# Blank input is returned untouched. Otherwise numeric data tokens are
# generated and the text is translated, either sentence by sentence (when
# the 'split_sentences' option is set) or as a whole after newlines are
# removed and runs of whitespace are collapsed. With the 'debug' option the
# text is wrapped by debug_translation instead of being translated. The
# token set is reset after each translated fragment.
#
# NOTE(review): the sentence branch translates with
# Tr8n.session.current_language while the whole-text branch uses
# Tr8n.session.target_language — confirm that difference is intended.
#
# @param tml [String] fragment to translate
# @return [String] the translated fragment
def translate_tml(tml)
  return tml if empty_string?(tml)
  tml = generate_data_tokens(tml)

  if option('split_sentences')
    translation = tml
    Tr8n::Utils.split_sentences(tml).each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tr8n.session.current_language.translate(sentence, tokens, options)
      # Block form inserts the translation literally; as a replacement string
      # any backslash sequences in it (e.g. "\0") would be interpreted.
      translation = translation.gsub(sentence) { sentence_translation }
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tr8n.session.target_language.translate(tml, tokens, options)
  reset_context
  translation
end

#translate_tree(node) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/tr8n/tokenizers/dom.rb', line 55

# Translates a node tree and returns the resulting HTML.
#
# Non-translatable nodes yield the text of their single child (or an empty
# string). A type-3 node is translated directly. For any other node the
# children are walked in order: text and qualifying inline children are
# accumulated in a buffer as TML; a separator or any other child first
# flushes that buffer through translate_tml, then contributes either its
# own HTML token (separator) or its recursively translated content, wrapped
# in its HTML token unless the child is ignored.
#
# @param node [#children, #type] root of the subtree to translate
# @return [String] translated HTML for the subtree
def translate_tree(node)
  if non_translatable_node?(node)
    kids = node.children
    return kids.count == 1 ? kids.first.inner_text : ''
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  # Translates whatever text has been collected so far and empties the buffer.
  flush = lambda do
    html += translate_tml(buffer) if buffer != ''
    buffer = ''
  end

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
      next
    end

    if inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      buffer += generate_tml_tags(child)
      next
    end

    separator = separator_node?(child)
    flush.call

    if separator
      html += generate_html_token(child)
    else
      container_value = translate_tree(child)
      html += ignored_node?(child) ? container_value : generate_html_token(child, container_value)
    end
  end

  flush.call
  html
end

#valid_text_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


211
212
213
214
# File 'lib/tr8n/tokenizers/dom.rb', line 211

# Tells whether a node is a type-3 node whose text is not blank according
# to empty_string?.
#
# @param node [Object, nil] node to inspect; nil is never valid
# @return [Boolean]
def valid_text_node?(node)
  return false unless node
  return false unless node.type == 3

  !empty_string?(node.inner_text)
end