Class: HTMLParser

Inherits:
SGMLParser show all
Defined in:
lib/web/htmlparser/html-parser.rb,
lib/html-parser.rb

Overview

:nodoc: all

Constant Summary

Constants inherited from SGMLParser

SGMLParser::Attrfind, SGMLParser::Charref, SGMLParser::Commentclose, SGMLParser::Commentopen, SGMLParser::Endbracket, SGMLParser::Endtagopen, SGMLParser::Entitydefs, SGMLParser::Entityref, SGMLParser::Incomplete, SGMLParser::Interesting, SGMLParser::Special, SGMLParser::Starttagopen, SGMLParser::Tagfind

Instance Method Summary collapse

Methods inherited from SGMLParser

#close, #feed, #finish_endtag, #finish_starttag, #goahead, #handle_charref, #handle_comment, #handle_endtag, #handle_entityref, #handle_special, #handle_starttag, #has_context, #parse_comment, #parse_endtag, #parse_special, #parse_starttag, #report_unbalanced, #reset, #setliteral, #setnomoretags, #unknown_charref, #unknown_entityref

Constructor Details

#initialize(formatter, verbose = nil) ⇒ HTMLParser

Returns a new instance of HTMLParser.



7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/html-parser.rb', line 7

# Creates a parser that reports parsed content to +formatter+.
#
# formatter:: object stored in @formatter; the tag handlers send their
#             formatting calls to it.
# verbose::   handed unchanged to the superclass constructor.
def initialize(formatter, verbose = nil)
  super(verbose)
  @formatter = formatter

  # Nothing captured or seen yet.
  @savedata = @title = @base = @anchor = nil

  # Integer flags/counters: "isindex seen" and literal-text nesting depth.
  @isindex = @nofill = 0

  # Every href seen so far, and the stack of currently open lists.
  @anchorlist = []
  @list_stack = []
end

Instance Method Details

#anchor_bgn(href, name, type) ⇒ Object



47
48
49
50
51
52
# File 'lib/html-parser.rb', line 47

# Records the start of an anchor.
#
# +href+ becomes the current anchor (@anchor) and, when it is non-nil,
# is also appended to @anchorlist. +name+ and +type+ are accepted but
# not used here.
def anchor_bgn(href, name, type)
  @anchor = href
  return unless href

  @anchorlist << href
end

#anchor_end ⇒ Object



54
55
56
57
58
59
# File 'lib/html-parser.rb', line 54

# Ends the current anchor, if there is one, by resetting @anchor to nil.
def anchor_end
  return unless @anchor

  # Emitting a "[n]" marker is disabled in this version:
  #handle_data(format "[%d]", @anchorlist.length)
  @anchor = nil
end

#ddpop(bl = 0) ⇒ Object



305
306
307
308
309
310
311
312
313
# File 'lib/html-parser.rb', line 305

# Ends the current paragraph and, if the innermost open list entry is a
# 'dd', removes that entry and pops the margin pushed for it.
#
# bl:: value passed straight through to the formatter's end_paragraph.
def ddpop(bl = 0)
  @formatter.end_paragraph(bl)
  return if @list_stack.empty?

  innermost = @list_stack.last
  return unless innermost[0] == 'dd'

  @list_stack.pop
  @formatter.pop_margin
end

#do_base(attrs) ⇒ Object



82
83
84
85
86
87
88
# File 'lib/html-parser.rb', line 82

# Handles <base>: stores the value of the 'href' attribute in @base.
# If 'href' appears more than once, the last occurrence wins.
#
# attrs:: enumerable of [name, value] pairs.
def do_base(attrs)
  attrs.each do |key, val|
    @base = val if key == 'href'
  end
end

#do_br(attrs) ⇒ Object



380
381
382
# File 'lib/html-parser.rb', line 380

# Handles <br>: asks the formatter for a line break. +attrs+ is ignored.
def do_br(attrs)
  fmt = @formatter
  fmt.add_line_break
end

#do_dd(attrs) ⇒ Object



299
300
301
302
303
# File 'lib/html-parser.rb', line 299

# Handles <dd>: closes any previous 'dd' via ddpop, pushes a 'dd' margin
# on the formatter and records the new entry on @list_stack.
# +attrs+ is ignored.
def do_dd(attrs)
  ddpop
  @formatter.push_margin('dd')
  @list_stack.push(['dd', '', 0])
end

#do_dt(attrs) ⇒ Object



295
296
297
# File 'lib/html-parser.rb', line 295

# Handles <dt>: delegates to ddpop with its default argument.
# +attrs+ is ignored.
def do_dt(attrs)
  ddpop()
end

#do_hr(attrs) ⇒ Object



384
385
386
# File 'lib/html-parser.rb', line 384

# Handles <hr>: asks the formatter for a horizontal rule.
# +attrs+ is ignored.
def do_hr(attrs)
  fmt = @formatter
  fmt.add_hor_rule
end

#do_img(attrs) ⇒ Object



388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# File 'lib/html-parser.rb', line 388

# Handles <img>: collects the image attributes and forwards them to
# handle_image(src, alt, ismap, align, width, height).
#
# Defaults when an attribute is absent: alt is '(image)', width and
# height are 0, everything else is nil.
#
# 'width' and 'height' are converted with Integer(). Values that are not
# plain integers (for example "100%" or "50px") used to raise and abort
# the whole parse; they are now ignored and the previous value is kept.
#
# attrs:: enumerable of [name, value] pairs.
def do_img(attrs)
  align = nil
  alt = '(image)'
  ismap = nil
  src = nil
  width = 0
  height = 0

  # Integer() raises ArgumentError on malformed text and TypeError on nil;
  # in both cases keep the fallback instead of propagating the error.
  as_int = lambda do |text, fallback|
    begin
      Integer(text)
    rescue ArgumentError, TypeError
      fallback
    end
  end

  attrs.each do |attrname, value|
    case attrname
    when 'align'  then align = value
    when 'alt'    then alt = value
    when 'ismap'  then ismap = value
    when 'src'    then src = value
    when 'width'  then width = as_int.call(value, width)
    when 'height' then height = as_int.call(value, height)
    end
  end

  handle_image(src, alt, ismap, align, width, height)
end

#do_isindex(attrs) ⇒ Object



90
91
92
# File 'lib/html-parser.rb', line 90

# Handles <isindex>: sets the @isindex flag to 1. +attrs+ is ignored.
def do_isindex(attrs)
  seen = 1
  @isindex = seen
end

#do_li(attrs) ⇒ Object



233
234
235
236
237
238
239
240
241
242
# File 'lib/html-parser.rb', line 233

# Handles <li>: ends the current paragraph and emits the item label.
#
# Inside an open list, the label comes from the innermost @list_stack
# entry and that entry's counter (index 2) is incremented in place.
# With no open list, the label is '*' and the counter 0.
# +attrs+ is ignored.
def do_li(attrs)
  @formatter.end_paragraph(0)

  marker = '*'
  number = 0
  if @list_stack && !@list_stack.empty?
    entry = @list_stack.last
    marker = entry[1]
    number = entry[2] + 1
    entry[2] = number
  end

  @formatter.add_label_data(marker, number)
end


94
95
# File 'lib/html-parser.rb', line 94

# Handles <link>: intentionally does nothing.
def do_link(attrs); end

#do_meta(attrs) ⇒ Object



97
98
# File 'lib/html-parser.rb', line 97

# Handles <meta>: intentionally does nothing.
def do_meta(attrs); end

#do_nextid(attrs) ⇒ Object

Deprecated



100
101
# File 'lib/html-parser.rb', line 100

# Handles <nextid>: intentionally does nothing.
# Deprecated
def do_nextid(attrs); end

#do_p(attrs) ⇒ Object



164
165
166
# File 'lib/html-parser.rb', line 164

# Handles <p>: ends the current paragraph, passing 1 to the formatter.
# +attrs+ is ignored.
def do_p(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
end

#do_plaintext(attrs) ⇒ Object



418
419
420
421
# File 'lib/html-parser.rb', line 418

# Handles <plaintext>: starts preformatted output exactly as start_pre
# does, then calls setnomoretags.
def do_plaintext(attrs)
  start_pre(attrs)
  # Tell SGML parser
  setnomoretags
end

#end_a ⇒ Object



376
377
378
# File 'lib/html-parser.rb', line 376

# Handles </a>: delegates to anchor_end.
def end_a
  anchor_end()
end

#end_address ⇒ Object



204
205
206
207
# File 'lib/html-parser.rb', line 204

# Handles </address>: ends the paragraph (argument 0) and pops the font
# pushed by start_address.
def end_address
  fmt = @formatter
  fmt.end_paragraph(0)
  fmt.pop_font
end

#end_b ⇒ Object



346
347
348
# File 'lib/html-parser.rb', line 346

# Handles </b>: pops the font pushed by start_b.
def end_b
  fmt = @formatter
  fmt.pop_font
end

#end_blockquote ⇒ Object



214
215
216
217
# File 'lib/html-parser.rb', line 214

# Handles </blockquote>: ends the paragraph (argument 1) and pops the
# margin pushed by start_blockquote.
def end_blockquote
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_margin
end

#end_body ⇒ Object



72
# File 'lib/html-parser.rb', line 72

# Handles </body>: intentionally does nothing.
def end_body; end

#end_cite ⇒ Object



316
# File 'lib/html-parser.rb', line 316

# Handles </cite>: treated exactly like </i>.
def end_cite
  end_i
end

#end_code ⇒ Object



319
# File 'lib/html-parser.rb', line 319

# Handles </code>: treated exactly like </tt>.
def end_code
  end_tt
end

#end_dir ⇒ Object



279
280
281
# File 'lib/html-parser.rb', line 279

# Handles </dir>: treated exactly like </ul>.
def end_dir
  end_ul()
end

#end_dl ⇒ Object



288
289
290
291
292
293
# File 'lib/html-parser.rb', line 288

# Handles </dl>: closes any open 'dd' via ddpop(1), then removes the
# innermost remaining entry from @list_stack if there is one.
def end_dl
  ddpop(1)
  @list_stack.pop unless @list_stack.empty?
end

#end_em ⇒ Object



322
# File 'lib/html-parser.rb', line 322

# Handles </em>: treated exactly like </i>.
def end_em
  end_i
end

#end_h1 ⇒ Object



109
110
111
112
# File 'lib/html-parser.rb', line 109

# Handles </h1>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h1.
def end_h1
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_h2 ⇒ Object



119
120
121
122
# File 'lib/html-parser.rb', line 119

# Handles </h2>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h2.
def end_h2
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_h3 ⇒ Object



129
130
131
132
# File 'lib/html-parser.rb', line 129

# Handles </h3>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h3.
def end_h3
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_h4 ⇒ Object



139
140
141
142
# File 'lib/html-parser.rb', line 139

# Handles </h4>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h4.
def end_h4
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_h5 ⇒ Object



149
150
151
152
# File 'lib/html-parser.rb', line 149

# Handles </h5>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h5.
def end_h5
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_h6 ⇒ Object



159
160
161
162
# File 'lib/html-parser.rb', line 159

# Handles </h6>: ends the paragraph (argument 1) and pops the heading
# font pushed by start_h6.
def end_h6
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
end

#end_head ⇒ Object



69
# File 'lib/html-parser.rb', line 69

# Handles </head>: intentionally does nothing.
def end_head; end

#end_html ⇒ Object



66
# File 'lib/html-parser.rb', line 66

# Handles </html>: intentionally does nothing.
def end_html; end

#end_i ⇒ Object



339
340
341
# File 'lib/html-parser.rb', line 339

# Handles </i>: pops the font pushed by start_i.
def end_i
  fmt = @formatter
  fmt.pop_font
end

#end_kbd ⇒ Object



325
# File 'lib/html-parser.rb', line 325

# Handles </kbd>: treated exactly like </tt>.
def end_kbd
  end_tt
end

#end_listing ⇒ Object



195
196
197
# File 'lib/html-parser.rb', line 195

# Handles </listing>: treated exactly like </pre>.
def end_listing
  end_pre()
end

#end_menu ⇒ Object



271
272
273
# File 'lib/html-parser.rb', line 271

# Handles </menu>: treated exactly like </ul>.
def end_menu
  end_ul()
end

#end_ol ⇒ Object



259
260
261
262
263
264
265
# File 'lib/html-parser.rb', line 259

# Handles </ol>: drops the innermost @list_stack entry, ends the
# paragraph (argument 0) and pops the margin pushed by start_ol.
#
# Note: an empty Array is truthy in Ruby, so the guard below only skips
# the pop when @list_stack is nil; popping an empty stack is a no-op.
def end_ol
  @list_stack.pop if @list_stack
  fmt = @formatter
  fmt.end_paragraph(0)
  fmt.pop_margin
end

#end_pre ⇒ Object



174
175
176
177
178
179
# File 'lib/html-parser.rb', line 174

# Handles </pre>: ends the paragraph (argument 1), pops the font pushed
# by start_pre and decrements the @nofill nesting counter, never letting
# it drop below 0.
def end_pre
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.pop_font
  @nofill -= 1
  @nofill = 0 if @nofill < 0
end

#end_samp ⇒ Object



328
# File 'lib/html-parser.rb', line 328

# Handles </samp>: treated exactly like </tt>.
def end_samp
  end_tt
end

#end_strong ⇒ Object



331
# File 'lib/html-parser.rb', line 331

# Handles </strong>: treated exactly like </b>.
def end_strong
  end_b
end

#end_title ⇒ Object



78
79
80
# File 'lib/html-parser.rb', line 78

# Handles </title>: stops text capture and stores the text returned by
# save_end in @title.
def end_title
  captured = save_end
  @title = captured
end

#end_tt ⇒ Object



353
354
355
# File 'lib/html-parser.rb', line 353

# Handles </tt>: pops the font pushed by start_tt.
def end_tt
  fmt = @formatter
  fmt.pop_font
end

#end_ul ⇒ Object



225
226
227
228
229
230
231
# File 'lib/html-parser.rb', line 225

# Handles </ul>: drops the innermost @list_stack entry, ends the
# paragraph (argument 0) and pops the margin pushed by start_ul.
#
# Note: an empty Array is truthy in Ruby, so the guard below only skips
# the pop when @list_stack is nil; popping an empty stack is a no-op.
def end_ul
  @list_stack.pop if @list_stack
  fmt = @formatter
  fmt.end_paragraph(0)
  fmt.pop_margin
end

#end_var ⇒ Object



334
# File 'lib/html-parser.rb', line 334

# Handles </var>: treated exactly like </i>.
def end_var
  end_i
end

#end_xmp ⇒ Object



186
187
188
# File 'lib/html-parser.rb', line 186

# Handles </xmp>: treated exactly like </pre>.
def end_xmp
  end_pre()
end

#handle_data(data) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
# File 'lib/html-parser.rb', line 21

# Routes a run of character data.
#
# While a capture is active (@savedata is set) the text is appended to
# the capture buffer and nothing is sent to the formatter. Otherwise it
# goes to the formatter as flowing data when @nofill is 0, or as literal
# data when @nofill is non-zero.
def handle_data(data)
  if @savedata
    @savedata += data
  elsif @nofill == 0
    @formatter.add_flowing_data(data)
  else
    @formatter.add_literal_data(data)
  end
end

#handle_image(src, alt, *args) ⇒ Object



61
62
63
# File 'lib/html-parser.rb', line 61

# Renders an image as text: only the +alt+ text is used, passed to
# handle_data. +src+ and any additional arguments are ignored.
def handle_image(src, alt, *args)
  alt_text = alt
  handle_data(alt_text)
end

#save_bgn ⇒ Object



33
34
35
# File 'lib/html-parser.rb', line 33

# Starts capturing character data: resets the capture buffer @savedata to
# an empty string, which makes handle_data append to it from now on.
def save_bgn
  buffer = ''
  @savedata = buffer
end

#save_end ⇒ Object



37
38
39
40
41
42
43
44
45
# File 'lib/html-parser.rb', line 37

# Stops capturing character data and returns what was captured.
#
# @savedata is reset to nil. A missing capture yields ''. When @nofill is
# 0, runs of whitespace are collapsed to single spaces and leading and
# trailing whitespace is dropped; otherwise the text is returned as is.
def save_end
  captured = @savedata
  @savedata = nil
  captured = '' if captured.nil?
  return captured unless @nofill == 0

  captured.split.join(" ")
end

#start_a(attrs) ⇒ Object



357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
# File 'lib/html-parser.rb', line 357

# Handles <a>: extracts 'href', 'name' and 'type' and passes them to
# anchor_bgn(href, name, type). Attributes that are absent are nil.
#
# Every attribute value is stripped of surrounding whitespace before it
# is examined; 'type' is additionally downcased.
#
# attrs:: enumerable of [name, value] pairs.
def start_a(attrs)
  href = name = type = nil

  attrs.each do |attrname, raw|
    value = raw.strip
    case attrname
    when 'href' then href = value
    when 'name' then name = value
    when 'type' then type = value.downcase
    end
  end

  anchor_bgn(href, name, type)
end

#start_address(attrs) ⇒ Object



199
200
201
202
# File 'lib/html-parser.rb', line 199

# Handles <address>: ends the paragraph (argument 0) and pushes a font
# with only the second argument set to 1 (the same font start_i pushes).
# +attrs+ is ignored.
def start_address(attrs)
  fmt = @formatter
  fmt.end_paragraph(0)
  fmt.push_font(nil, 1, nil, nil)
end

#start_b(attrs) ⇒ Object



343
344
345
# File 'lib/html-parser.rb', line 343

# Handles <b>: pushes a font with only the third argument set to 1
# (presumably the bold flag -- confirm against the formatter).
# +attrs+ is ignored.
def start_b(attrs)
  fmt = @formatter
  fmt.push_font(nil, nil, 1, nil)
end

#start_blockquote(attrs) ⇒ Object



209
210
211
212
# File 'lib/html-parser.rb', line 209

# Handles <blockquote>: ends the paragraph (argument 1) and pushes a
# 'blockquote' margin. +attrs+ is ignored.
def start_blockquote(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_margin('blockquote')
end

#start_body(attrs) ⇒ Object



71
# File 'lib/html-parser.rb', line 71

# Handles <body>: intentionally does nothing.
def start_body(attrs); end

#start_cite(attrs) ⇒ Object



315
# File 'lib/html-parser.rb', line 315

# Handles <cite>: treated exactly like <i>.
def start_cite(attrs)
  start_i(attrs)
end

#start_code(attrs) ⇒ Object



318
# File 'lib/html-parser.rb', line 318

# Handles <code>: treated exactly like <tt>.
def start_code(attrs)
  start_tt(attrs)
end

#start_dir(attrs) ⇒ Object



275
276
277
# File 'lib/html-parser.rb', line 275

# Handles <dir>: treated exactly like <ul>.
def start_dir(attrs)
  opened = start_ul(attrs)
  opened
end

#start_dl(attrs) ⇒ Object



283
284
285
286
# File 'lib/html-parser.rb', line 283

# Handles <dl>: ends the paragraph (argument 1) and records a 'dl' entry
# on @list_stack. No margin is pushed here. +attrs+ is ignored.
def start_dl(attrs)
  @formatter.end_paragraph(1)
  @list_stack.push(['dl', '', 0])
end

#start_em(attrs) ⇒ Object



321
# File 'lib/html-parser.rb', line 321

# Handles <em>: treated exactly like <i>.
def start_em(attrs)
  start_i(attrs)
end

#start_h1(attrs) ⇒ Object



104
105
106
107
# File 'lib/html-parser.rb', line 104

# Handles <h1>: ends the paragraph (argument 1) and pushes the heading
# font ('h1', 0, 1, 0). +attrs+ is ignored.
def start_h1(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h1', 0, 1, 0)
end

#start_h2(attrs) ⇒ Object



114
115
116
117
# File 'lib/html-parser.rb', line 114

# Handles <h2>: ends the paragraph (argument 1) and pushes the heading
# font ('h2', 0, 1, 0). +attrs+ is ignored.
def start_h2(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h2', 0, 1, 0)
end

#start_h3(attrs) ⇒ Object



124
125
126
127
# File 'lib/html-parser.rb', line 124

# Handles <h3>: ends the paragraph (argument 1) and pushes the heading
# font ('h3', 0, 1, 0). +attrs+ is ignored.
def start_h3(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h3', 0, 1, 0)
end

#start_h4(attrs) ⇒ Object



134
135
136
137
# File 'lib/html-parser.rb', line 134

# Handles <h4>: ends the paragraph (argument 1) and pushes the heading
# font ('h4', 0, 1, 0). +attrs+ is ignored.
def start_h4(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h4', 0, 1, 0)
end

#start_h5(attrs) ⇒ Object



144
145
146
147
# File 'lib/html-parser.rb', line 144

# Handles <h5>: ends the paragraph (argument 1) and pushes the heading
# font ('h5', 0, 1, 0). +attrs+ is ignored.
def start_h5(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h5', 0, 1, 0)
end

#start_h6(attrs) ⇒ Object



154
155
156
157
# File 'lib/html-parser.rb', line 154

# Handles <h6>: ends the paragraph (argument 1) and pushes the heading
# font ('h6', 0, 1, 0). +attrs+ is ignored.
def start_h6(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font('h6', 0, 1, 0)
end

#start_head(attrs) ⇒ Object



68
# File 'lib/html-parser.rb', line 68

# Handles <head>: intentionally does nothing.
def start_head(attrs); end

#start_html(attrs) ⇒ Object



65
# File 'lib/html-parser.rb', line 65

# Handles <html>: intentionally does nothing.
def start_html(attrs); end

#start_i(attrs) ⇒ Object



336
337
338
# File 'lib/html-parser.rb', line 336

# Handles <i>: pushes a font with only the second argument set to 1
# (presumably the italic flag -- confirm against the formatter).
# +attrs+ is ignored.
def start_i(attrs)
  fmt = @formatter
  fmt.push_font(nil, 1, nil, nil)
end

#start_kbd(attrs) ⇒ Object



324
# File 'lib/html-parser.rb', line 324

# Handles <kbd>: treated exactly like <tt>.
def start_kbd(attrs)
  start_tt(attrs)
end

#start_listing(attrs) ⇒ Object



190
191
192
193
# File 'lib/html-parser.rb', line 190

# Handles <listing>: starts preformatted output exactly as start_pre
# does, then calls setliteral('listing').
def start_listing(attrs)
  start_pre(attrs)
  # Tell SGML parser
  setliteral('listing')
end

#start_menu(attrs) ⇒ Object



267
268
269
# File 'lib/html-parser.rb', line 267

# Handles <menu>: treated exactly like <ul>.
def start_menu(attrs)
  opened = start_ul(attrs)
  opened
end

#start_ol(attrs) ⇒ Object



244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/html-parser.rb', line 244

# Handles <ol>: ends the paragraph (argument 0), pushes an 'ol' margin
# and records the new list on @list_stack with its counter at 0.
#
# The item label defaults to '1.'. A 'type' attribute whose value is
# exactly one character long replaces it with that character followed by
# '.'; longer or empty values are ignored.
#
# attrs:: enumerable of [name, value] pairs.
def start_ol(attrs)
  @formatter.end_paragraph(0)
  @formatter.push_margin('ol')

  label = '1.'
  attrs.each do |key, val|
    next unless key == 'type'

    label = val + '.' if val.length == 1
  end

  @list_stack << ['ol', label, 0]
end

#start_pre(attrs) ⇒ Object



168
169
170
171
172
# File 'lib/html-parser.rb', line 168

# Handles <pre>: ends the paragraph (argument 1), pushes a font with only
# the fourth argument set to 1 (the same font start_tt pushes) and
# increments the @nofill nesting counter, which makes handle_data emit
# literal data. +attrs+ is ignored.
def start_pre(attrs)
  fmt = @formatter
  fmt.end_paragraph(1)
  fmt.push_font(nil, nil, nil, 1)
  @nofill += 1
end

#start_samp(attrs) ⇒ Object



327
# File 'lib/html-parser.rb', line 327

# Handles <samp>: treated exactly like <tt>.
def start_samp(attrs)
  start_tt(attrs)
end

#start_strong(attrs) ⇒ Object



330
# File 'lib/html-parser.rb', line 330

# Handles <strong>: treated exactly like <b>.
def start_strong(attrs)
  start_b(attrs)
end

#start_title(attrs) ⇒ Object



74
75
76
# File 'lib/html-parser.rb', line 74

# Handles <title>: begins capturing the title text via save_bgn.
# +attrs+ is ignored.
def start_title(attrs)
  save_bgn()
end

#start_tt(attrs) ⇒ Object



350
351
352
# File 'lib/html-parser.rb', line 350

# Handles <tt>: pushes a font with only the fourth argument set to 1
# (presumably the fixed-width flag -- confirm against the formatter).
# +attrs+ is ignored.
def start_tt(attrs)
  fmt = @formatter
  fmt.push_font(nil, nil, nil, 1)
end

#start_ul(attrs) ⇒ Object



219
220
221
222
223
# File 'lib/html-parser.rb', line 219

# Handles <ul>: ends the paragraph (argument 0), pushes a 'ul' margin and
# records the new list on @list_stack with label '*' and counter 0.
# +attrs+ is ignored.
def start_ul(attrs)
  fmt = @formatter
  fmt.end_paragraph(0)
  fmt.push_margin('ul')
  @list_stack.push(['ul', '*', 0])
end

#start_var(attrs) ⇒ Object



333
# File 'lib/html-parser.rb', line 333

# Handles <var>: treated exactly like <i>.
def start_var(attrs)
  start_i(attrs)
end

#start_xmp(attrs) ⇒ Object



181
182
183
184
# File 'lib/html-parser.rb', line 181

# Handles <xmp>: starts preformatted output exactly as start_pre does,
# then calls setliteral('xmp').
def start_xmp(attrs)
  start_pre(attrs)
  # Tell SGML parser
  setliteral('xmp')
end

#unknown_endtag(tag) ⇒ Object



426
427
# File 'lib/html-parser.rb', line 426

# Called for an end tag that has no handler: silently ignored.
def unknown_endtag(tag); end

#unknown_starttag(tag, attrs) ⇒ Object



423
424
# File 'lib/html-parser.rb', line 423

# Called for a start tag that has no handler: silently ignored.
def unknown_starttag(tag, attrs); end