Class: RubyRTF::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-rtf/parser.rb

Overview

Handles the parsing of RTF content into a RubyRTF::Document

Constant Summary collapse

STOP_CHARS =
[' ', '\\', '{', '}', "\r", "\n", ';']
BLACKLISTED =

Keys that aren’t inherited

[:paragraph, :newline, :tab, :lquote, :rquote, :ldblquote, :rdblquote]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(unknown_control_warning_enabled: true) ⇒ Parser

Returns a new instance of Parser.

Parameters:

  • unknown_control_warning_enabled (Boolean) (defaults to: true)

    Whether to write unknown control directive warnings to STDERR



14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/ruby-rtf/parser.rb', line 14

# Sets up a parser with an empty document, an empty context stack and one
# shared default formatting frame.
#
# @param unknown_control_warning_enabled [Boolean] whether unknown control
#   directives are reported through `warn`
def initialize(unknown_control_warning_enabled: true)
  @unknown_control_warning_enabled = unknown_control_warning_enabled

  @doc = RubyRTF::Document.new
  @context_stack = []
  @seen = {}

  # The bottom frame of the formatting stack and the modifiers of the first
  # section must be the very same Hash object, otherwise the first frame
  # ends up getting lost.
  base_modifiers = {}
  @current_section = { :text => '', :modifiers => base_modifiers }
  @formatting_stack = [base_modifiers]
end

Instance Attribute Details

#current_sectionObject

Returns the value of attribute current_section.



6
7
8
# File 'lib/ruby-rtf/parser.rb', line 6

# @return [Hash] the section currently being filled; it carries the
#   accumulated :text and the active :modifiers
def current_section; @current_section; end

#docObject (readonly)

Returns the value of attribute doc.



11
12
13
# File 'lib/ruby-rtf/parser.rb', line 11

# @return [RubyRTF::Document] the document being populated by this parser
def doc; @doc; end

#encodingObject

Returns the value of attribute encoding.



6
7
8
# File 'lib/ruby-rtf/parser.rb', line 6

# @return [String, nil] the source encoding name, or nil while none has
#   been assigned
def encoding; @encoding; end

#formatting_stackArray (readonly)

Returns The current formatting block to use as the basis for new sections.

Returns:

  • (Array)

    The current formatting block to use as the basis for new sections



9
10
11
# File 'lib/ruby-rtf/parser.rb', line 9

# @return [Array] the stack of formatting frames; the last entry is the
#   basis for new sections
def formatting_stack; @formatting_stack; end

Instance Method Details

#add_modifier_section(mods = {}, text = nil) ⇒ Object



495
496
497
498
499
500
501
# File 'lib/ruby-rtf/parser.rb', line 495

# Emits a one-off section (e.g. a tab or a quote) and then opens a fresh
# section after it, so the modifiers of the one-off section do not stay on
# the formatting stack.
#
# @param mods [Hash] modifiers for the one-off section
# @param text [String, nil] text for the one-off section
# @return [Object] whatever the final pop_formatting! returns
def add_modifier_section(mods = {}, text = nil)
  # the one-off section itself; its frame is dropped again right away
  force_section!(mods, text)
  pop_formatting!

  # a clean follow-up section, with its frame likewise dropped
  force_section!({}, nil)
  pop_formatting!
end

#add_section!(mods = {}) ⇒ Object



503
504
505
506
507
508
509
# File 'lib/ruby-rtf/parser.rb', line 503

# Applies modifiers to the current section while it is still empty;
# once it holds text, a new section is started instead.
#
# @param mods [Hash] modifiers to apply
# @return [Object] the merged modifiers Hash, or the result of force_section!
def add_section!(mods = {})
  return force_section!(mods) unless current_section[:text].empty?

  current_section[:modifiers].merge!(mods)
end

#current_contextObject



535
536
537
# File 'lib/ruby-rtf/parser.rb', line 535

# @return [Object] the innermost open context from the context stack, or
#   the document itself when the stack has no usable entry
def current_context
  innermost = @context_stack.last
  innermost ? innermost : doc
end

#force_section!(mods = {}, text = nil) ⇒ Object



513
514
515
516
517
518
519
520
521
522
523
524
# File 'lib/ruby-rtf/parser.rb', line 513

# Closes the current section and starts a new one.
#
# The current section is appended to the current context. The new section
# inherits every non-blacklisted modifier from the top formatting frame,
# but a modifier passed in by the caller now takes precedence over an
# inherited one (previously the inherited value silently replaced it, so a
# second font size or colour after text was lost).
#
# @param mods [Hash] modifiers for the new section; this Hash is filled in
#   with the inherited values and becomes both the new top formatting frame
#   and the new section's :modifiers
# @param text [String, nil] initial text for the new section
# @return [Hash] the new current section
def force_section!(mods = {}, text =  nil)
  current_context << @current_section

  inherited = formatting_stack.last || {}
  inherited.each_pair do |key, value|
    next if BLACKLISTED.include?(key)
    # explicit modifiers win over inherited formatting
    next if mods.key?(key)

    mods[key] = value
  end
  formatting_stack.push(mods)

  @current_section = {:text => (text || ''), :modifiers => mods}
end

#handle_control(name, val, src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Handle a given control

Parameters:

  • name (Symbol)

    The control name

  • val (Integer|nil)

    The control's value, or nil if none is associated

  • src (String)

    The source document

  • current_pos (Integer)

    The current document position

Returns:

  • (Integer)

    The new current position



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# File 'lib/ruby-rtf/parser.rb', line 125

# Dispatches one parsed control to the code that handles it.
#
# Table-like groups (font table, colour table, stylesheet, info, ignorable
# destinations) are handed to their parse_* method, which returns the new
# position. Formatting controls open or modify sections. Controls that are
# not recognised are reported once per name (tracked in @seen) when warnings
# are enabled.
#
# @param name [Symbol] the control name
# @param val [Integer, String, nil] the control's value, if any
# @param src [String] the source document
# @param current_pos [Integer] the current document position
# @return [Integer] the new current position
def handle_control(name, val, src, current_pos)
  case(name)
  when :rtf then ;
  when :deff then @doc.default_font = val
  when :ansicpg then self.encoding = "windows-#{val}"
  when *[:ansi, :mac, :pc, :pca] then @doc.character_set = name
  when :fonttbl then current_pos = parse_font_table(src, current_pos)
  when :colortbl then current_pos = parse_colour_table(src, current_pos)
  when :stylesheet then current_pos = parse_stylesheet(src, current_pos)
  when :info  then current_pos = parse_info(src, current_pos)
  when :* then current_pos = parse_skip(src, current_pos)

  when :f then add_section!(:font => @doc.font_table[val])

  # RTF font sizes are in half-points. divide by 2 to get points
  when :fs then add_section!(:font_size => (val.to_f / 2.0))
  # For :b, :i and :ul any supplied value takes the first branch (0 is
  # truthy in Ruby). That branch pops the top formatting frame directly,
  # without the guard in pop_formatting!, and then starts a section.
  # NOTE(review): a value of 1 (e.g. \b1) also lands in that branch —
  # confirm whether that is intended.
  when :b then
    if val
      @formatting_stack.pop
      add_section!
    else
      add_section!(:bold => true)
    end

  when :i then
    if val
      @formatting_stack.pop
      add_section!
    else
      add_section!(:italic => true)
    end

  when :ul then
    if val
      @formatting_stack.pop
      add_section!
    else
      add_section!(:underline => true)
    end
  when :ulnone then
    current_section[:modifiers][:underline] = false
    @formatting_stack.pop

  when :super then add_section!(:superscript => true)
  when :sub then add_section!(:subscript => true)
  when :strike then add_section!(:strikethrough => true)
  when :scaps then add_section!(:smallcaps => true)
  when :ql then add_section!(:justification => :left)
  when :qr then add_section!(:justification => :right)
  when :qj then add_section!(:justification => :full)
  when :qc then add_section!(:justification => :center)
  when :fi then add_section!(:first_line_indent => RubyRTF.twips_to_points(val))
  when :li then add_section!(:left_indent => RubyRTF.twips_to_points(val))
  when :ri then add_section!(:right_indent => RubyRTF.twips_to_points(val))
  when :margl then add_section!(:left_margin => RubyRTF.twips_to_points(val))
  when :margr then add_section!(:right_margin => RubyRTF.twips_to_points(val))
  when :margt then add_section!(:top_margin => RubyRTF.twips_to_points(val))
  when :margb then add_section!(:bottom_margin => RubyRTF.twips_to_points(val))
  when :sb then add_section!(:space_before => RubyRTF.twips_to_points(val))
  when :sa then add_section!(:space_after => RubyRTF.twips_to_points(val))
  when :cf then add_section!(:foreground_colour => @doc.colour_table[val])
  when :cb then add_section!(:background_colour => @doc.colour_table[val])
  # for :hex the value is the already-decoded character from parse_control
  when :hex then current_section[:text] << val
  when :uc then @skip_byte = val.to_i
  when :u then
    # NOTE(review): when the last \uc value was 0 the code point is reduced
    # modulo 100 — the reason is not visible here; confirm before changing.
    if @skip_byte && @skip_byte == 0
      val = val % 100
      @skip_byte = nil
    end
    if val == 32 || val == 8232
      add_modifier_section({:newline => true}, "\n")
    else
      # negative values are shifted up by 65_536 before being packed
      val += 65_536 if val < 0
      # NOTE(review): values below 10_000 have their decimal digits re-read
      # as hexadecimal (val.to_s.hex) — confirm this is intended.
      char = if val < 10_000
               [val.to_s.hex].pack('U*')
             else
               [val].pack('U*')
             end
      current_section[:text] << char
    end

  when *[:rquote, :lquote] then add_modifier_section({name => true}, "'")
  when *[:rdblquote, :ldblquote] then add_modifier_section({name => true}, '"')

  # escaped literal characters
  when :'{' then current_section[:text] << "{"
  when :'}' then current_section[:text] << "}"
  when :'\\' then current_section[:text] << '\\'

  when :~ then add_modifier_section({:nbsp => true}, " ")

  when :tab then add_modifier_section({:tab => true}, "\t")
  when :emdash then add_modifier_section({:emdash => true}, "--")
  when :endash then add_modifier_section({:endash => true}, "-")

  when *[:line, :"\n"] then add_modifier_section({:newline => true}, "\n")
  when :"\r" then ;

  when :par then add_modifier_section({:paragraph => true})
  when *[:pard, :plain] then reset_current_section!

  when :trowd then
    # Reuse the table held by the document's last section when there is
    # one (adding a row to it); otherwise create a table and attach it to
    # a section.
    table = nil
    table = doc.sections.last[:modifiers][:table] if doc.sections.last && doc.sections.last[:modifiers][:table]
    if table
      table.add_row
    else
      table = RubyRTF::Table.new

      if !current_section[:text].empty?
        force_section!({:table => table})
      else
        current_section[:modifiers][:table] = table
        pop_formatting!
      end

      force_section!
      pop_formatting!
    end

    # sections added from here on go into the current cell of the table
    @context_stack.push(table.current_row.current_cell)

  when :trgaph then
    raise "trgaph outside of a table?" if !current_context.respond_to?(:table)
    current_context.table.half_gap = RubyRTF.twips_to_points(val)

  when :trleft then
    raise "trleft outside of a table?" if !current_context.respond_to?(:table)
    current_context.table.left_margin = RubyRTF.twips_to_points(val)

  when :cellx then
    raise "cellx outside of a table?" if !current_context.respond_to?(:row)
    current_context.row.end_positions.push(RubyRTF.twips_to_points(val))

  when :intbl then ;

  when :cell then
    pop_formatting!

    # remember the table before the cell context is popped below
    table = current_context.table if current_context.respond_to?(:table)

    force_section! #unless current_section[:text].empty?
    reset_current_section!

    @context_stack.pop

    # only add a cell if the row isn't full already
    if table && table.current_row && (table.current_row.cells.length < table.current_row.end_positions.length)
      cell = table.current_row.add_cell
      @context_stack.push(cell)
    end

  when :row then
    if current_context.sections.empty?
      # empty row
      table = current_context.table
      table.rows.pop

      @context_stack.pop
    end
  when :pict then add_section!(picture: true)
  when :jpegblip then add_section!(picture_format:'jpeg')
  when :pngblip then add_section!(picture_format:'png')
  when *[:dibitmap, :wbitmap] then add_section!(picture_format:'bmp')
  when *[:wmetafile, :pmmetafile] then add_section!(picture_format:'wmf')
  when :pich then add_section!(picture_height: RubyRTF.twips_to_points(val))
  when :picw then add_section!(picture_width: RubyRTF.twips_to_points(val))
  when :picscalex then add_section!(picture_scale_x: val.to_i)
  when :picscaley then add_section!(picture_scale_y: val.to_i)

  else
    # warn only the first time a given unknown control is seen
    unless @seen[name]
      @seen[name] = true
      if @unknown_control_warning_enabled
        warn "Unknown control #{name.inspect} with #{val} at #{current_pos}"
      end
    end
  end
  current_pos
end

#parse(src) ⇒ RubyRTF::Document

Parses a given string into a RubyRTF::Document

Parameters:

  • src (String)

    The document to parse

Returns:

Raises:



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/ruby-rtf/parser.rb', line 33

# Walks the source character by character and builds the document.
#
# Backslashes start a control (parsed, then dispatched), braces open and
# close groups, carriage returns and newlines are dropped, and everything
# else is appended to the current section's text.
#
# @param src [String] the document to parse
# @return [RubyRTF::Document] the populated document
# @raise [RubyRTF::InvalidDocument] if the opening {\rtf1 is missing or the
#   braces do not balance
def parse(src)
  unless src =~ /\{\\rtf1/
    raise RubyRTF::InvalidDocument.new("Opening \\rtf1 missing")
  end

  depth = 0
  current_pos = 0
  total = src.length

  until current_pos >= total
    char = src[current_pos]
    current_pos += 1

    if char == '\\'
      name, val, current_pos = parse_control(src, current_pos)
      current_pos = handle_control(name, val, src, current_pos)
    elsif char == '{'
      add_section!
      depth += 1
    elsif char == '}'
      pop_formatting!
      add_section!
      depth -= 1
    elsif char != "\r" && char != "\n"
      current_section[:text] << char
    end
  end

  # flush trailing text that was never closed off by a control or group
  current_context << current_section unless current_section[:text].empty?

  raise RubyRTF::InvalidDocument.new("Unbalanced {}s") unless depth == 0
  @doc
end

#parse_colour_table(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the colour table group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
# File 'lib/ruby-rtf/parser.rb', line 385

# Parses the colour table group and appends each colour to the document's
# colour table.
#
# A leading ';' produces an entry flagged with use_default. Every following
# ';' closes the colour built from the controls before it. Characters that
# carry no meaning here (for example a space after ';') are skipped, and
# parsing stops at the end of the input, so unusual or truncated tables no
# longer make the parser spin forever.
#
# @param src [String] the source document
# @param current_pos [Integer] the starting position
# @return [Integer] the new current position: the index of the closing '}',
#   or the end of the input when the group is never closed
def parse_colour_table(src, current_pos)
  if src[current_pos] == ';'
    colour = RubyRTF::Colour.new
    colour.use_default = true

    @doc.colour_table << colour

    current_pos += 1
  end

  colour = RubyRTF::Colour.new

  loop do
    case src[current_pos]
    # nil means the input ended before the group was closed
    when nil, '}' then break

    when '\\'
      ctrl, val, current_pos = parse_control(src, current_pos + 1)

      case ctrl
      when :red then colour.red = val
      when :green then colour.green = val
      when :blue then colour.blue = val
      when :ctint then colour.tint = val
      when :cshade then colour.shade = val
      when :cmaindarkone, :cmainlightone, :cmaindarktwo, :cmainlighttwo, :caccentone,
           :caccenttwo, :caccentthree, :caccentfour, :caccentfive, :caccentsix,
           :chyperlink, :cfollowedhyperlink, :cbackgroundone, :ctextone,
           :cbackgroundtwo, :ctexttwo
        # theme name is the control name without its leading 'c'
        colour.theme = ctrl.to_s[1..-1].to_sym
      end

    when ';'
      @doc.colour_table << colour

      colour = RubyRTF::Colour.new
      current_pos += 1

    else
      # line breaks and any other stray character: step over it so the
      # loop always makes progress
      current_pos += 1
    end
  end

  current_pos
end

#parse_control(src, current_pos = 0) ⇒ String, ...

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses a control switch

Parameters:

  • src (String)

    The fragment to parse

  • current_pos (Integer) (defaults to: 0)

    The position in the string where the control starts (just after the backslash)

Returns:

  • (String, String|Integer, Integer)

    The name, optional control value and the new current position



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/ruby-rtf/parser.rb', line 80

# Parses a single control that starts at current_pos (the character right
# after the backslash).
#
# Three shapes are recognised:
# * a hex escape (') followed by two hex digits, returned as :hex with the
#   decoded character (converted to UTF-8 when an encoding is set);
# * a control symbol: a single character that is neither a letter nor '*',
#   e.g. a brace, a backslash or a tilde; it is returned as a one-character
#   Symbol and nothing after it is consumed;
# * a control word: letters with an optional numeric value; one trailing
#   space is consumed as the delimiter.
#
# Control symbols such as tilde or hyphen previously raised NoMethodError
# (or were misread as the word following them), as did a backslash at the
# very end of the input; both are handled now.
#
# @param src [String] the fragment to parse
# @param current_pos [Integer] the position the control starts at
# @return [Array(Symbol, (String|Integer|nil), Integer)] the name, the
#   optional control value and the new current position
def parse_control(src, current_pos = 0)
  max_len = src.length
  start = current_pos

  # handle hex special
  if src[current_pos] == "'"
    # to_s guards against a truncated escape at the end of the input
    val = src[(current_pos + 1), 2].to_s.hex.chr
    val = val.force_encoding(encoding).encode('UTF-8') if encoding
    return [:hex, val, current_pos + 3]
  end

  # a lone backslash at the very end of the input: nothing left to name
  return [''.to_sym, nil, current_pos] if start >= max_len

  # control symbol: exactly one non-letter character
  first = src[start]
  return [first.to_sym, nil, start + 1] unless first =~ /[A-Za-z*]/

  current_pos += 1 while current_pos < max_len && !STOP_CHARS.include?(src[current_pos])

  contents = src[start, current_pos - start]
  m = contents.match(/([\*a-z]+)(\-?\d+)?\*?/)
  # nothing recognisable as a control word: fall back to a one-character symbol
  return [first.to_sym, nil, start + 1] if m.nil?

  ctrl = m[1].to_sym
  val = m[2].nil? ? nil : m[2].to_i

  # we advance past the optional space if present
  current_pos += 1 if src[current_pos] == ' '

  [ctrl, val, current_pos]
end

#parse_font_table(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the font table group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# File 'lib/ruby-rtf/parser.rb', line 312

# Parses the font table group and stores each font in the document's font
# table under its font number.
#
# Fonts may be wrapped in their own groups or listed directly. Plain text
# is appended to the font's name, or to the alternate name, panose or
# non-tagged name while inside the matching sub-destination.
#
# Parsing stops at the end of the input instead of raising on a truncated
# table, and text that appears before any font has been opened is ignored
# rather than raising NoMethodError.
#
# @param src [String] the source document
# @param current_pos [Integer] the starting position
# @return [Integer] the new current position
def parse_font_table(src, current_pos)
  group = 1

  font = nil
  in_extra = nil

  loop do
    case src[current_pos]
    # input ended before the group was closed
    when nil then break

    when '{'
      font = RubyRTF::Font.new if group == 1
      in_extra = nil

      group += 1

    when '}'
      group -= 1

      if group <= 1
        break if font.nil?
        font.cleanup_names
        @doc.font_table[font.number] = font
      end

      in_extra = nil

      break if group == 0

    when '\\'
      ctrl, val, current_pos = parse_control(src, current_pos + 1)

      font = RubyRTF::Font.new if font.nil?

      case ctrl
      when :f then font.number = val
      when :fprq then font.pitch = val
      when :fcharset then font.character_set = val
      when :flomajor, :fhimajor, :fdbmajor, :fbimajor,
           :flominor, :fhiminor, :fdbminor, :fbiminor
        font.theme = ctrl.to_s[1..-1].to_sym

      when :falt, :fname, :panose then in_extra = ctrl
      else
        # family controls are the family name with a leading 'f'
        cmd = ctrl.to_s[1..-1].to_sym
        font.family_command = cmd if RubyRTF::Font::FAMILIES.include?(cmd)
      end

      # need to next as parse_control will leave current_pos at the
      # next character already so current_pos += 1 below would move us too far
      next

    when "\r", "\n"
      # line breaks carry no meaning inside the table

    else
      # text with no font to attach it to is dropped
      unless font.nil?
        case in_extra
        when :falt then font.alternate_name << src[current_pos]
        when :panose then font.panose << src[current_pos]
        when :fname then font.non_tagged_name << src[current_pos]
        when nil then font.name << src[current_pos]
        end
      end
    end
    current_pos += 1
  end

  current_pos
end

#parse_info(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the info group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



458
459
460
461
462
463
464
465
466
467
468
469
470
471
# File 'lib/ruby-rtf/parser.rb', line 458

# Skips over the info group without recording any of its contents.
#
# Nested groups are tracked so that the position returned is that of the
# brace closing the info group itself. A backslash escapes the character
# after it, so literal \{ and \} do not affect the nesting count. Scanning
# stops at the end of the input, so an unterminated group no longer loops
# forever.
#
# @param src [String] the source document
# @param current_pos [Integer] the starting position
# @return [Integer] the index of the closing '}', or the input length when
#   the group is never closed
def parse_info(src, current_pos)
  depth = 1
  limit = src.length

  while current_pos < limit
    case src[current_pos]
    when '\\' then current_pos += 1 # step over the escaped character
    when '{' then depth += 1
    when '}'
      depth -= 1
      break if depth == 0
    end
    current_pos += 1
  end

  [current_pos, limit].min
end

#parse_skip(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses a comment group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



480
481
482
483
484
485
486
487
488
489
490
491
492
493
# File 'lib/ruby-rtf/parser.rb', line 480

# Skips over a group whose contents are to be ignored.
#
# Nested groups are tracked so that the position returned is that of the
# brace closing the skipped group itself. A backslash escapes the character
# after it, so literal \{ and \} do not affect the nesting count. Scanning
# stops at the end of the input, so an unterminated group no longer loops
# forever.
#
# @param src [String] the source document
# @param current_pos [Integer] the starting position
# @return [Integer] the index of the closing '}', or the input length when
#   the group is never closed
def parse_skip(src, current_pos)
  depth = 1
  limit = src.length

  while current_pos < limit
    case src[current_pos]
    when '\\' then current_pos += 1 # step over the escaped character
    when '{' then depth += 1
    when '}'
      depth -= 1
      break if depth == 0
    end
    current_pos += 1
  end

  [current_pos, limit].min
end

#parse_stylesheet(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the stylesheet group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



436
437
438
439
440
441
442
443
444
445
446
447
448
449
# File 'lib/ruby-rtf/parser.rb', line 436

def parse_stylesheet(src, current_pos)
  group = 1
  while (true)
    case(src[current_pos])
    when '{' then group += 1
    when '}' then
      group -= 1
      break if group == 0
    end
    current_pos += 1
  end

  current_pos
end

#pop_formatting!Nil

Note:

This will not allow you to remove the default formatting parameters

Pop the current top element off the formatting stack.

Returns:

  • (Nil)


543
544
545
# File 'lib/ruby-rtf/parser.rb', line 543

# Removes the top frame from the formatting stack, but never the last
# remaining (default) frame.
#
# @return [Hash, nil] the removed frame, or nil when nothing was removed
def pop_formatting!
  return if formatting_stack.length <= 1

  formatting_stack.pop
end

#reset_current_section!Nil

Resets the current section to default formatting

Returns:

  • (Nil)


529
530
531
532
533
# File 'lib/ruby-rtf/parser.rb', line 529

# Empties the current section's modifiers in place, keeping only a
# :paragraph marker when one was present (its value becomes true).
#
# @return [true, nil] true when the paragraph marker was restored
def reset_current_section!
  modifiers = current_section[:modifiers]
  had_paragraph = modifiers.key?(:paragraph)

  modifiers.clear
  modifiers[:paragraph] = true if had_paragraph
end