Class: Tml::Tokenizers::Dom

Inherits: Object
Defined in:
lib/tml/tokenizers/dom.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(context = {}, options = {}) ⇒ Dom

Returns a new instance of Dom.



# File 'lib/tml/tokenizers/dom.rb', line 41

def initialize(context = {}, options = {})
  self.context = context
  self.options = options
  reset_context
end
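
A minimal usage sketch, assuming a Tml application and session have already been configured elsewhere (the empty context hash and the 'debug' option shown here are illustrative):

  tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => false)
  tokenizer.translate('<p>Hello <b>World</b></p>')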

Instance Attribute Details

#context ⇒ Object

Returns the value of attribute context.



# File 'lib/tml/tokenizers/dom.rb', line 39

def context
  @context
end

#options ⇒ Object

Returns the value of attribute options.



# File 'lib/tml/tokenizers/dom.rb', line 39

def options
  @options
end

#tokens ⇒ Object

Returns the value of attribute tokens.



# File 'lib/tml/tokenizers/dom.rb', line 39

def tokens
  @tokens
end

Instance Method Details

#adjust_name(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 313

def adjust_name(node)
  name = node.name.downcase
  map = option('name_mapping')
  map[name.to_sym] ? map[name.to_sym] : name
end
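
The mapping is read from the 'name_mapping' option (typically supplied by the translator configuration). A sketch with a hypothetical mapping of { b: 'bold', i: 'italic' }:

  adjust_name(Nokogiri::HTML.fragment('<b>hi</b>').children.first) # => 'bold'
  adjust_name(Nokogiri::HTML.fragment('<p>hi</p>').children.first) # => 'p'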

#between_separators?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 130

def between_separators?(node)
  (separator_node?(node.previous_sibling) and !valid_text_node?(node.next_sibling)) or
  (separator_node?(node.next_sibling) and !valid_text_node?(node.previous_sibling))
end

#container_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 202

def container_node?(node)
  node.type == 1 && !inline_node?(node)
end

#contextualize(name, context) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 319

def contextualize(name, context)
  if self.tokens[name] and self.tokens[name] != context
    index = 0
    matches = name.match(/\d+$/)
    if matches and matches.length > 0
      index = matches[matches.length-1].to_i
      name = name.gsub(index.to_s, '')
    end
    name += (index + 1).to_s
    return contextualize(name, context)
  end

  self.tokens[name] = context
  name
end
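
A token name is reused when it is registered with an identical context; if the name is already taken by a different context, a numeric suffix is appended. An illustrative sketch:

  contextualize('link', "<a href='/a'>{$0}</a>") # => 'link'
  contextualize('link', "<a href='/b'>{$0}</a>") # => 'link1' (name taken by a different context)
  contextualize('link', "<a href='/a'>{$0}</a>") # => 'link'  (same context, name reused)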

#debug(doc) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 335

def debug(doc)
  self.doc = doc
  debug_tree(self.doc, 0)
end

#debug_translation(translation) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 160

def debug_translation(translation)
  option('debug_format').gsub('{$0}', translation)
end

#debug_tree(node, depth) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 340

def debug_tree(node, depth)
  padding = ('=' * (depth+1))

  Tml.logger.log(padding + '=> ' + node.name.to_s + ': ' + node_info(node))

  (node.children || []).each do |child|
    debug_tree(child, depth+1)
  end
end

#empty_string?(tml) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 164

def empty_string?(tml)
  tml = tml.gsub(/[\s\n\r\t]/, '')
  tml == ''
end

#generate_data_tokens(text) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 229

def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    matches = text.scan(option('data_tokens.special.regex'))
    matches.each do |match|
      token = match[1..-2] # strip the leading and trailing delimiter characters
      self.context[token] = match
      text = text.gsub(match, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    formats = option('data_tokens.date.formats')
    formats.each do |format|
      regex = format[0]
      # date_format = format[1]

      matches = text.scan(regex)
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          date = match.first
          token = self.contextualize(token_name, date)
          replacement = "{#{token}}"
          text = text.gsub(date, replacement)
        end
      end
    end
  end

  rules = option('data_tokens.rules')
  if rules
    rules.each do |rule|
      next unless rule[:enabled]
      matches = text.scan(rule[:regex])

      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          value = match.first.strip

          unless value == ''
            token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
            text = text.gsub(value, value.gsub(value, "{#{token}}"))
          end
        end
      end
    end
  end

  text
end
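
The substitutions performed here depend entirely on the configured 'data_tokens' options (special-token regex, date formats, numeric rules). A sketch, assuming a hypothetical enabled rule named 'num' whose regex matches the integer:

  generate_data_tokens('You have 5 messages')
  # => 'You have {num} messages' (5 is registered as the context of the 'num' token)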

#generate_html_token(node, value = nil) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 282

def generate_html_token(node, value = nil)
  name = node.name.downcase
  attributes = node.attributes
  attributes_hash = {}
  value = (!value ? '{$0}' : value)

  if attributes.length == 0
    if self_closing_node?(node)
      return '<' + name + '/>' if %w(br hr).index(name)
      return '<' + name + '>' + '</' + name + '>'
    end
    return '<' + name + '>' + value + '</' + name + '>'
  end

  attributes.each do |name, attribute|
    attributes_hash[name] = attribute.value
  end

  keys = attributes_hash.keys.sort

  attr = []
  keys.each do |key|
    quote = attributes_hash[key].index("'") ? '"' : "'"
    attr << (key + '=' + quote + attributes_hash[key] + quote)
  end
  attr = attr.join(' ')

  return '<' + name + ' ' + attr + '>' + '</' + name + '>' if self_closing_node?(node)
  '<' + name + ' ' + attr + '>' + value + '</' + name + '>'
end
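
A sketch of the generated placeholder markup; attribute names are sorted, and single quotes are used unless an attribute value itself contains one:

  node = Nokogiri::HTML.fragment("<a href='/about' class='nav'>About</a>").children.first
  generate_html_token(node)          # => "<a class='nav' href='/about'>{$0}</a>"
  generate_html_token(node, 'Info')  # => "<a class='nav' href='/about'>Info</a>"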

#generate_tml_tags(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 135

def generate_tml_tags(node)
  buffer = ''
  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    else
      buffer += generate_tml_tags(child)
    end
  end

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return '{' + token + '}' if self_closing_node?(node)
  return '[' + token + ': ' + value + ']' if short_token?(token, value)

  '[' + token + ']' + value + '[/' + token + ']'
end
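
A sketch of the resulting TML, assuming a name mapping of 'b' => 'bold' and a value short enough to qualify as a short token:

  node = Nokogiri::HTML.fragment('<b>Hello</b>').children.first
  generate_tml_tags(node) # => '[bold: Hello]'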

#has_child_nodes?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 126

def has_child_nodes?(node)
  node.children and node.children.length > 0
end

#has_inline_or_text_siblings?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 182

def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.each do |child|
    unless child == node
      return true if inline_node?(child) || valid_text_node?(child)
    end
  end

  false
end

#ignored_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 210

def ignored_node?(node)
  return true if (node.type != 1)
  (option('nodes.ignored') || []).index(node.name.downcase)
end

#inline_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 194

def inline_node?(node)
  (
    node.type == 1 and
    (option('nodes.inline') || []).index(node.name.downcase) and
    !only_child?(node)
  )
end

#no_translate_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 88

def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes
  node.attributes.each do |name, attribute|
    return true if name == 'notranslate' or attribute.value.index('notranslate')
  end
  false
end

#node_info(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 350

def node_info(node)
  info = []
  info << node.type

  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    if has_inline_or_text_siblings?(node)
      info << 'sentence'
    else
      info << 'only translatable'
    end
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  return "[#{info.join(', ')}]: " + node.inner_text if node.type == 3
  "[#{info.join(', ')}]"
end

#non_translatable_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 96

def non_translatable_node?(node)
  return false unless node
  return true if node.type == 1 && (option('nodes.scripts') || []).index(node.name.downcase)
  return true if node.type == 1 && node.children.length === 0 && node.inner_text == ''
  return true if no_translate_node?(node)
  false
end

#only_child?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 177

def only_child?(node)
  return false unless node.parent
  node.parent.children.count == 1
end

#option(name) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 155

def option(name)
  value = Tml::Utils.hash_value(self.options, name)
  value || Tml.config.translator_option(name)
end

#reset_context ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 169

def reset_context
  self.tokens = {}.merge(self.context)
end

#sanitize_value(value) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 225

def sanitize_value(value)
  value.gsub(/^\s+/, '')
end

#self_closing_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 206

def self_closing_node?(node)
  !node.children || !node.children.first
end

#separator_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 220

def separator_node?(node)
  return false unless node
  node.type == 1 && (option('nodes.splitters') || []).index(node.name.downcase)
end

#short_token?(token, value) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 173

def short_token?(token, value)
  option('nodes.short').index(token.downcase) || value.length < 20
end

#translate(doc) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 47

def translate(doc)
  translate_tree(doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc)
end
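
Accepts either an HTML string or a Nokogiri node; strings are first parsed with Nokogiri::HTML.fragment. A sketch, assuming an active Tml session with a target language:

  tokenizer = Tml::Tokenizers::Dom.new
  tokenizer.translate('<p>Hello <b>World</b></p>')
  tokenizer.translate(Nokogiri::HTML.fragment('<p>Hello</p>'))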

#translate_tml(tml) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 104

def translate_tml(tml)
  return tml if empty_string?(tml)
  tml = generate_data_tokens(tml)

  if option('split_sentences')
    sentences = Tml::Utils.split_sentences(tml)
    translation = tml
    sentences.each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      translation = translation.gsub(sentence, sentence_translation)
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end

#translate_tree(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 51

def translate_tree(node)
  if non_translatable_node?(node)
    return node.inner_html
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    elsif inline_node?(child) and has_inline_or_text_siblings?(child) and !between_separators?(child)
      buffer += generate_tml_tags(child)
    elsif separator_node?(child)
      html += translate_tml(buffer) if buffer != ''
      html += generate_html_token(child)
      buffer = ''
    else
      html += translate_tml(buffer) if buffer != ''

      container_value = translate_tree(child)
      if ignored_node?(child)
        html += container_value
      else
        html += generate_html_token(child, container_value)
      end

      buffer = ''
    end
  end

  html += translate_tml(buffer) if buffer != ''
  html
end

#valid_text_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 215

def valid_text_node?(node)
  return false unless node
  node.type == 3 && !empty_string?(node.inner_text)
end