Class: RubyRTF::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-rtf/parser.rb

Overview

Handles the parsing of RTF content into a RubyRTF::Document

Constant Summary collapse

STOP_CHARS =
[' ', '\\', '{', '}', "\r", "\n", ';']
BLACKLISTED =

Keys that aren’t inherited

[:paragraph, :newline, :tab, :lquote, :rquote, :ldblquote, :rdblquote]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initializeParser

Returns a new instance of Parser.



13
14
15
16
17
18
19
20
21
22
# File 'lib/ruby-rtf/parser.rb', line 13

# Builds a parser with an empty document and fresh parsing state.
#
# A single base modifier hash is shared between the bottom of the formatting
# stack and the first (empty) section, so both refer to the same object.
def initialize
  # control names that have already been reported as unknown
  @seen = {}
  @context_stack = []
  @doc = RubyRTF::Document.new

  base_modifiers = {}
  @current_section = {:text => '', :modifiers => base_modifiers}
  @formatting_stack = [base_modifiers]
end

Instance Attribute Details

#current_sectionObject

Returns the value of attribute current_section.



6
7
8
# File 'lib/ruby-rtf/parser.rb', line 6

# @return [Object] the section currently being built (the parser stores a
#   Hash with :text and :modifiers keys here)
def current_section; @current_section; end

#docObject (readonly)

Returns the value of attribute doc.



11
12
13
# File 'lib/ruby-rtf/parser.rb', line 11

# @return [Object] the document object held in @doc
def doc; @doc; end

#formatting_stackArray (readonly)

Returns The current formatting block to use as the basis for new sections.

Returns:

  • (Array)

    The current formatting block to use as the basis for new sections



9
10
11
# File 'lib/ruby-rtf/parser.rb', line 9

# @return [Array] the stack of modifier hashes; the last entry is the basis
#   for new sections
def formatting_stack; @formatting_stack; end

Instance Method Details

#add_modifier_section(mods = {}, text = nil) ⇒ Object



443
444
445
446
447
448
449
# File 'lib/ruby-rtf/parser.rb', line 443

# Emits a stand-alone section carrying +mods+ / +text+ and then opens a fresh
# empty section after it. The formatting pushed by each forced section is
# popped again straight away, so neither stays on the formatting stack.
#
# @param mods [Hash] modifiers for the stand-alone section
# @param text [String, nil] text for the stand-alone section
# @return [Object] whatever the final pop_formatting! call returns
def add_modifier_section(mods = {}, text = nil)
  marker_args = [mods, text]
  follow_up_args = []

  [marker_args, follow_up_args].map do |section_args|
    force_section!(*section_args)
    pop_formatting!
  end.last
end

#add_section!(mods = {}) ⇒ Object



451
452
453
454
455
456
457
# File 'lib/ruby-rtf/parser.rb', line 451

# Applies +mods+ to the document flow.
#
# While the current section has no text yet, the modifiers are merged into it
# in place; once it holds text, a new section is forced instead.
#
# @param mods [Hash] modifiers to apply
def add_section!(mods = {})
  return force_section!(mods) unless current_section[:text].empty?

  current_section[:modifiers].merge!(mods)
end

#current_contextObject



480
481
482
# File 'lib/ruby-rtf/parser.rb', line 480

# @return [Object] the innermost entry of the context stack, or the document
#   itself when the stack has no (truthy) top entry
def current_context
  innermost = @context_stack.last
  return innermost if innermost

  doc
end

#force_section!(mods = {}, text = nil) ⇒ Object



461
462
463
464
465
466
467
468
469
470
471
# File 'lib/ruby-rtf/parser.rb', line 461

# Closes the current section and unconditionally starts a new one.
#
# The current section is appended to the current context (even if its text is
# empty). The new section inherits every modifier from the top of the
# formatting stack except the BLACKLISTED keys. Modifiers passed in +mods+
# take precedence over inherited ones, so a newly requested value (e.g. a
# different :font_size) is not overwritten by the previous section's value.
#
# @param mods [Hash] modifiers for the new section; inherited keys are added
#   to this hash in place and it is pushed onto the formatting stack
# @param text [String, nil] initial text of the new section ('' when nil)
# @return [Hash] the new current section
def force_section!(mods = {}, text = nil)
  current_context << @current_section

  formatting_stack.last.each_pair do |key, value|
    next if BLACKLISTED.include?(key)
    # explicit modifiers win over inherited ones
    mods[key] = value unless mods.key?(key)
  end
  formatting_stack.push(mods)

  @current_section = {:text => (text || ''), :modifiers => mods}
end

#handle_control(name, val, src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Handle a given control

Parameters:

  • name (Symbol)

    The control name

  • val (Integer|nil)

    The control's value, or nil if none is associated

  • src (String)

    The source document

  • current_pos (Integer)

    The current document position

Returns:

  • (Integer)

    The new current position



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/ruby-rtf/parser.rb', line 118

# Dispatches a single parsed control to the matching action.
#
# Header destinations (font table, colour table, stylesheet, info, ignorable
# "\*" groups) are handed to their dedicated parsers, which return the new
# position. Formatting controls add or extend sections, special characters
# append text, and table controls maintain the context stack. Unknown
# controls are reported on STDERR once per control name.
#
# @api private
# @param name [Symbol] The control name
# @param val [Integer, String, nil] The control's value, or nil if it has none
# @param src [String] The source document
# @param current_pos [Integer] The current document position
# @return [Integer] The new current position
def handle_control(name, val, src, current_pos)
  case(name)
  when :rtf then ;
  when :deff then @doc.default_font = val
  when *[:ansi, :mac, :pc, :pca] then @doc.character_set = name
  when :fonttbl then current_pos = parse_font_table(src, current_pos)
  when :colortbl then current_pos = parse_colour_table(src, current_pos)
  when :stylesheet then current_pos = parse_stylesheet(src, current_pos)
  when :info  then current_pos = parse_info(src, current_pos)
  when :* then current_pos = parse_skip(src, current_pos)

  when :f then add_section!(:font => @doc.font_table[val])

  # RTF font sizes are in half-points. divide by 2 to get points
  when :fs then add_section!(:font_size => (val.to_f / 2.0))
  when :b then add_section!(:bold => true)
  when :i then add_section!(:italic => true)
  when :ul then add_section!(:underline => true)
  when :super then add_section!(:superscript => true)
  when :sub then add_section!(:subscript => true)
  when :strike then add_section!(:strikethrough => true)
  when :scaps then add_section!(:smallcaps => true)
  when :ql then add_section!(:justification => :left)
  when :qr then add_section!(:justification => :right)
  when :qj then add_section!(:justification => :full)
  when :qc then add_section!(:justification => :center)
  when :fi then add_section!(:first_line_indent => RubyRTF.twips_to_points(val))
  when :li then add_section!(:left_indent => RubyRTF.twips_to_points(val))
  when :ri then add_section!(:right_indent => RubyRTF.twips_to_points(val))
  when :margl then add_section!(:left_margin => RubyRTF.twips_to_points(val))
  when :margr then add_section!(:right_margin => RubyRTF.twips_to_points(val))
  when :margt then add_section!(:top_margin => RubyRTF.twips_to_points(val))
  when :margb then add_section!(:bottom_margin => RubyRTF.twips_to_points(val))
  when :sb then add_section!(:space_before => RubyRTF.twips_to_points(val))
  when :sa then add_section!(:space_after => RubyRTF.twips_to_points(val))
  when :cf then add_section!(:foreground_colour => @doc.colour_table[val])
  when :cb then add_section!(:background_colour => @doc.colour_table[val])
  when :hex then current_section[:text] << val
  when :u then
    # \uN carries the code point as a signed 16-bit *decimal* number, so
    # negative values are wrapped by adding 65536. The character is built
    # directly from the code point instead of eval-ing a "\uXXXX" escape,
    # which misread decimal digits as hex for values below 10000.
    codepoint = val < 0 ? val + 65_536 : val
    current_section[:text] << [codepoint].pack('U')

  when *[:rquote, :lquote] then add_modifier_section({name => true}, "'")
  when *[:rdblquote, :ldblquote] then add_modifier_section({name => true}, '"')

  when :'{' then current_section[:text] << "{"
  when :'}' then current_section[:text] << "}"
  when :'\\' then current_section[:text] << '\\'

  when :~ then add_modifier_section({:nbsp => true}, " ")

  when :tab then add_modifier_section({:tab => true}, "\t")
  when :emdash then add_modifier_section({:emdash => true}, "--")
  when :endash then add_modifier_section({:endash => true}, "-")

  when *[:line, :"\n"] then add_modifier_section({:newline => true}, "\n")
  when :"\r" then ;

  when :par then add_modifier_section({:paragraph => true})
  when *[:pard, :plain] then reset_current_section!

  when :trowd then
    # reuse the table of the last document section if it has one (new row),
    # otherwise start a new table in its own section
    table = nil
    table = doc.sections.last[:modifiers][:table] if doc.sections.last && doc.sections.last[:modifiers][:table]
    if table
      table.add_row
    else
      table = RubyRTF::Table.new

      if !current_section[:text].empty?
        force_section!({:table => table})
      else
        current_section[:modifiers][:table] = table
        pop_formatting!
      end

      force_section!
      pop_formatting!
    end

    # subsequent sections are collected by the current cell
    @context_stack.push(table.current_row.current_cell)

  when :trgaph then
    raise "trgaph outside of a table?" if !current_context.respond_to?(:table)
    current_context.table.half_gap = RubyRTF.twips_to_points(val)

  when :trleft then
    raise "trleft outside of a table?" if !current_context.respond_to?(:table)
    current_context.table.left_margin = RubyRTF.twips_to_points(val)

  when :cellx then
    raise "cellx outside of a table?" if !current_context.respond_to?(:row)
    current_context.row.end_positions.push(RubyRTF.twips_to_points(val))

  when :intbl then ;

  when :cell then
    pop_formatting!

    # remember the table before the cell context is popped below
    table = current_context.table if current_context.respond_to?(:table)

    force_section! #unless current_section[:text].empty?
    reset_current_section!

    @context_stack.pop

    # only add a cell if the row isn't full already
    if table && table.current_row && (table.current_row.cells.length < table.current_row.end_positions.length)
      cell = table.current_row.add_cell
      @context_stack.push(cell)
    end

  when :row then
    if current_context.sections.empty?
      # empty row
      table = current_context.table
      table.rows.pop

      @context_stack.pop
    end

  else
    # warn only once per unknown control name
    unless @seen[name]
      @seen[name] = true
      STDERR.puts "Unknown control #{name.inspect} with #{val} at #{current_pos}"
    end
  end
  current_pos
end

#parse(src) ⇒ RubyRTF::Document

Parses a given string into a RubyRTF::Document

Parameters:

  • src (String)

    The document to parse

Returns:

Raises:



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/ruby-rtf/parser.rb', line 29

# Walks +src+ one character at a time and builds up the document.
#
# Backslashes start a control, which is parsed and then handled; braces open
# and close groups; carriage returns and newlines are ignored; every other
# character is appended to the current section's text.
#
# @param src [String] The document to parse
# @return [Object] the document held in @doc
# @raise [RubyRTF::InvalidDocument] if "{\rtf1" does not occur in +src+ or the
#   braces are unbalanced
def parse(src)
  unless src =~ /\{\\rtf1/
    raise RubyRTF::InvalidDocument.new("Opening \\rtf1 missing")
  end

  depth = 0
  total = src.length
  current_pos = 0

  until current_pos >= total
    ch = src[current_pos]
    current_pos += 1

    if ch == '\\'
      name, val, current_pos = parse_control(src, current_pos)
      current_pos = handle_control(name, val, src, current_pos)
    elsif ch == '{'
      add_section!
      depth += 1
    elsif ch == '}'
      pop_formatting!
      add_section!
      depth -= 1
    elsif ch != "\r" && ch != "\n"
      current_section[:text] << ch
    end
  end

  # keep trailing text that was never closed off by a new section
  current_context << current_section unless current_section[:text].empty?

  raise RubyRTF::InvalidDocument.new("Unbalanced {}s") if depth != 0
  @doc
end

#parse_colour_table(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the colour table group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/ruby-rtf/parser.rb', line 333

# Parses the colour table group, appending one colour to the document's
# colour table for every ';' terminated entry.
#
# A leading ';' denotes the "use default" colour. Parsing stops at the
# closing '}' of the group, or at the end of +src+ if the group is never
# closed. Any character that is not part of an entry (line breaks, spaces,
# stray text) is skipped so that unexpected input cannot stall the loop.
#
# @api private
# @param src [String] The source document
# @param current_pos [Integer] The starting position
# @return [Integer] The new current position (the index of the closing '}',
#   or the end of +src+ for a truncated document)
def parse_colour_table(src, current_pos)
  if src[current_pos] == ';'
    colour = RubyRTF::Colour.new
    colour.use_default = true

    @doc.colour_table << colour

    current_pos += 1
  end

  colour = RubyRTF::Colour.new

  while (true)
    case(src[current_pos])
    when '\\' then
      ctrl, val, current_pos = parse_control(src, current_pos + 1)

      case(ctrl)
      when :red then colour.red = val
      when :green then colour.green = val
      when :blue then colour.blue = val
      when :ctint then colour.tint = val
      when :cshade then colour.shade = val
      when *[:cmaindarkone, :cmainlightone, :cmaindarktwo, :cmainlighttwo, :caccentone,
             :caccenttwo, :caccentthree, :caccentfour, :caccentfive, :caccentsix,
             :chyperlink, :cfollowedhyperlink, :cbackgroundone, :ctextone,
             :cbackgroundtwo, :ctexttwo] then
        # theme name is the control name without its leading "c"
        colour.theme = ctrl.to_s[1..-1].to_sym
      end

    when ';' then
      @doc.colour_table << colour

      colour = RubyRTF::Colour.new
      current_pos += 1

    # nil means we ran off the end of the document
    when '}', nil then break

    else
      # line breaks, spaces or anything else unexpected: step over it
      current_pos += 1
    end
  end

  current_pos
end

#parse_control(src, current_pos = 0) ⇒ String, ...

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses a control switch

Parameters:

  • src (String)

    The fragment to parse

  • current_pos (Integer) (defaults to: 0)

    The position in the string at which the control starts (just after the leading backslash)

Returns:

  • (Symbol, String|Integer|nil, Integer)

    The name, optional control value and the new current position



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/ruby-rtf/parser.rb', line 76

# Parses a control switch starting just after its backslash.
#
# Three forms are recognised:
# * hex escape ("'hh")  - returns :hex with the decoded single byte
# * control symbol      - a single character that is not a letter or "*"
#                         (for example "{", "~", "-"); it has no value and
#                         consumes exactly one character
# * control word        - read up to the next STOP_CHARS entry, split into a
#                         name and an optional integer value; one trailing
#                         space is consumed as the delimiter
#
# @api private
# @param src [String] The fragment to parse
# @param current_pos [Integer] The position in the string at which the
#   control starts (just after the backslash)
# @return [Array] [name (Symbol), value (String for :hex, Integer or nil
#   otherwise), new current position (Integer)]
# @raise [RubyRTF::InvalidDocument] if +current_pos+ is at or beyond the end
#   of +src+, i.e. the backslash was the last character
def parse_control(src, current_pos = 0)
  max_len = src.length
  start = current_pos

  if start >= max_len
    raise RubyRTF::InvalidDocument.new("Unexpected end of document after \\")
  end

  # handle hex special
  if src[start] == "'"
    val = src[(start + 1), 2].hex.chr
    return [:hex, val, start + 3]
  end

  # Control symbol: exactly one character. Without this check "~b" would be
  # read as the word "b", and a lone "~" or "-" would not match the regex.
  unless src[start] =~ /[A-Za-z*]/
    return [src[start].to_sym, nil, start + 1]
  end

  while (true)
    break if current_pos >= max_len
    break if STOP_CHARS.include?(src[current_pos])

    current_pos += 1
  end

  contents = src[start, current_pos - start]

  # fall back to the raw contents as the name if the pattern does not match
  # (e.g. an all upper-case word)
  ctrl = contents.to_sym
  val = nil

  m = contents.match(/([\*a-z]+)(\-?\d+)?\*?/)
  if m
    ctrl = m[1].to_sym
    val = m[2].to_i unless m[2].nil?
  end

  # we advance past the optional space if present
  current_pos += 1 if src[current_pos] == ' '

  [ctrl, val, current_pos]
end

#parse_font_table(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the font table group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
# File 'lib/ruby-rtf/parser.rb', line 261

# Parses the font table group and registers every font found in the
# document's font table under its font number.
#
# A font is created when a nested group opens at the top level of the table,
# or on the first control when no font exists yet. Plain characters are
# appended to the font's name, or to the alternate name / panose /
# non-tagged name while inside the corresponding sub-group.
#
# The loop tolerates an empty table, text that appears before any font has
# been started (it is skipped) and a document that ends before the group is
# closed (parsing stops at the end of +src+).
#
# @api private
# @param src [String] The source document
# @param current_pos [Integer] The starting position
# @return [Integer] The new current position (the index of the group's
#   closing '}', or the end of +src+ for a truncated document)
def parse_font_table(src, current_pos)
  group = 1

  font = nil
  in_extra = nil

  while (true)
    case(src[current_pos])
    # ran off the end of the document
    when nil then break

    when '{' then
      font = RubyRTF::Font.new if group == 1
      in_extra = nil

      group += 1

    when '}' then
      group -= 1

      # an empty table closes without ever having created a font
      if group <= 1 && font
        font.cleanup_names
        @doc.font_table[font.number] = font
      end

      in_extra = nil

      break if group == 0

    when '\\' then
      ctrl, val, current_pos = parse_control(src, current_pos + 1)

      font = RubyRTF::Font.new if font.nil?

      case(ctrl)
      when :f then font.number = val
      when :fprq then font.pitch = val
      when :fcharset then font.character_set = val
      when *[:flomajor, :fhimajor, :fdbmajor, :fbimajor,
             :flominor, :fhiminor, :fdbminor, :fbiminor] then
        font.theme = ctrl.to_s[1..-1].to_sym

      when *[:falt, :fname, :panose] then in_extra = ctrl
      else
        # family controls are "f" + family name (checked against FAMILIES)
        cmd = ctrl.to_s[1..-1].to_sym
        if RubyRTF::Font::FAMILIES.include?(cmd)
          font.family_command = cmd
        end
      end

      # need to next as parse_control will leave current_pos at the
      # next character already so current_pos += 1 below would move us too far
      next
    when *["\r", "\n"] then ;
    else
      # text before any font has been started belongs to no font: skip it
      if font
        case(in_extra)
        when :falt then font.alternate_name << src[current_pos]
        when :panose then font.panose << src[current_pos]
        when :fname then font.non_tagged_name << src[current_pos]
        when nil then font.name << src[current_pos]
        end
      end
    end
    current_pos += 1
  end

  current_pos
end

#parse_info(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the info group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



406
407
408
409
410
411
412
413
414
415
416
417
418
419
# File 'lib/ruby-rtf/parser.rb', line 406

# Skips over the info group without interpreting its contents.
#
# Scans forward from +current_pos+, tracking nested groups, until the brace
# that closes the group is found. A character preceded by a backslash is
# never treated as a group delimiter, so escaped braces inside the text do
# not end the group early. Scanning stops at the end of +src+ if the group is
# never closed.
#
# @api private
# @param src [String] The source document
# @param current_pos [Integer] The starting position
# @return [Integer] The new current position (the index of the closing '}',
#   or src.length for a truncated document)
def parse_info(src, current_pos)
  depth = 1
  len = src.length

  while current_pos < len
    case src[current_pos]
    # step over the escaped character together with its backslash
    when '\\' then current_pos += 1
    when '{' then depth += 1
    when '}' then
      depth -= 1
      break if depth == 0
    end
    current_pos += 1
  end

  # a trailing backslash can push the position one past the end
  [current_pos, len].min
end

#parse_skip(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses a comment group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



428
429
430
431
432
433
434
435
436
437
438
439
440
441
# File 'lib/ruby-rtf/parser.rb', line 428

# Skips the remainder of the current group without interpreting it.
#
# Scans forward from +current_pos+, tracking nested groups, until the brace
# that closes the group is found. A character preceded by a backslash is
# never treated as a group delimiter, so escaped braces inside the text do
# not end the group early. Scanning stops at the end of +src+ if the group is
# never closed.
#
# @api private
# @param src [String] The source document
# @param current_pos [Integer] The starting position
# @return [Integer] The new current position (the index of the closing '}',
#   or src.length for a truncated document)
def parse_skip(src, current_pos)
  depth = 1
  len = src.length

  while current_pos < len
    case src[current_pos]
    # step over the escaped character together with its backslash
    when '\\' then current_pos += 1
    when '{' then depth += 1
    when '}' then
      depth -= 1
      break if depth == 0
    end
    current_pos += 1
  end

  # a trailing backslash can push the position one past the end
  [current_pos, len].min
end

#parse_stylesheet(src, current_pos) ⇒ Integer

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Parses the stylesheet group

Parameters:

  • src (String)

    The source document

  • current_pos (Integer)

    The starting position

Returns:

  • (Integer)

    The new current position



384
385
386
387
388
389
390
391
392
393
394
395
396
397
# File 'lib/ruby-rtf/parser.rb', line 384

# Skips over the stylesheet group without interpreting its contents.
#
# Scans forward from +current_pos+, tracking nested groups, until the brace
# that closes the group is found. A character preceded by a backslash is
# never treated as a group delimiter, so escaped braces inside the text do
# not end the group early. Scanning stops at the end of +src+ if the group is
# never closed.
#
# @api private
# @param src [String] The source document
# @param current_pos [Integer] The starting position
# @return [Integer] The new current position (the index of the closing '}',
#   or src.length for a truncated document)
def parse_stylesheet(src, current_pos)
  depth = 1
  len = src.length

  while current_pos < len
    case src[current_pos]
    # step over the escaped character together with its backslash
    when '\\' then current_pos += 1
    when '{' then depth += 1
    when '}' then
      depth -= 1
      break if depth == 0
    end
    current_pos += 1
  end

  # a trailing backslash can push the position one past the end
  [current_pos, len].min
end

#pop_formatting!Nil

Note:

This will not allow you to remove the default formatting parameters

Pop the current top element off the formatting stack.

Returns:

  • (Nil)


488
489
490
# File 'lib/ruby-rtf/parser.rb', line 488

# Removes the top entry of the formatting stack, but never the last
# remaining (bottom) entry.
#
# @return [Object, nil] the removed entry, or nil when nothing was removed
def pop_formatting!
  return if formatting_stack.length <= 1

  formatting_stack.pop
end

#reset_current_section!Nil

Resets the current section to default formatting

Returns:

  • (Nil)


476
477
478
# File 'lib/ruby-rtf/parser.rb', line 476

# Empties the modifier hash of the current section in place; the section's
# text is left untouched.
#
# @return [Hash] the (now empty) modifier hash
def reset_current_section!
  active_modifiers = current_section[:modifiers]
  active_modifiers.clear
end