Class: HTMLParser
- Inherits:
-
SGMLParser
show all
- Defined in:
- lib/web/htmlparser/html-parser.rb,
lib/html-parser.rb
Overview
Constant Summary
Constants inherited
from SGMLParser
SGMLParser::Attrfind, SGMLParser::Charref, SGMLParser::Commentclose, SGMLParser::Commentopen, SGMLParser::Endbracket, SGMLParser::Endtagopen, SGMLParser::Entitydefs, SGMLParser::Entityref, SGMLParser::Incomplete, SGMLParser::Interesting, SGMLParser::Special, SGMLParser::Starttagopen, SGMLParser::Tagfind
Instance Method Summary
collapse
Methods inherited from SGMLParser
#close, #feed, #finish_endtag, #finish_starttag, #goahead, #handle_charref, #handle_comment, #handle_endtag, #handle_entityref, #handle_special, #handle_starttag, #has_context, #parse_comment, #parse_endtag, #parse_special, #parse_starttag, #report_unbalanced, #reset, #setliteral, #setnomoretags, #unknown_charref, #unknown_entityref
Constructor Details
#initialize(formatter, verbose = nil) ⇒ HTMLParser
Returns a new instance of HTMLParser.
7
8
9
10
11
12
13
14
15
16
17
18
|
# File 'lib/html-parser.rb', line 7
# Builds a parser that sends its output to +formatter+.
#
# formatter - object that receives the formatting calls made by the handlers.
# verbose   - passed straight through to the superclass constructor.
def initialize(formatter, verbose = nil)
  super(verbose)
  @formatter = formatter

  # Nothing captured or seen yet.
  @savedata = @title = @base = @anchor = nil

  # Counters/flags start at zero.
  @isindex = @nofill = 0

  # Separate arrays: hrefs collected so far, and the stack of open lists.
  @anchorlist = []
  @list_stack = []
end
|
Instance Method Details
#anchor_bgn(href, name, type) ⇒ Object
47
48
49
50
51
52
|
# File 'lib/html-parser.rb', line 47
# Records +href+ as the current anchor and, when it is truthy, appends it to
# @anchorlist. +name+ and +type+ are accepted but not used here.
def anchor_bgn(href, name, type)
  @anchor = href
  @anchorlist << href if @anchor
end
|
#anchor_end ⇒ Object
54
55
56
57
58
59
|
# File 'lib/html-parser.rb', line 54
# Clears the current anchor, if one is set.
def anchor_end
  @anchor = nil if @anchor
end
|
#ddpop(bl = 0) ⇒ Object
305
306
307
308
309
310
311
312
313
|
# File 'lib/html-parser.rb', line 305
# Ends the current paragraph (forwarding +bl+ to the formatter) and, when the
# innermost @list_stack entry is a 'dd', removes it and pops the margin.
def ddpop(bl=0)
  @formatter.end_paragraph(bl)
  innermost = @list_stack.last
  return unless innermost && innermost[0] == 'dd'

  @list_stack.pop
  @formatter.pop_margin
end
|
#do_base(attrs) ⇒ Object
82
83
84
85
86
87
88
|
# File 'lib/html-parser.rb', line 82
# Scans the attribute pairs and stores the value of each 'href' in @base
# (the last one wins).
def do_base(attrs)
  attrs.each do |key, val|
    @base = val if key == 'href'
  end
end
|
#do_br(attrs) ⇒ Object
380
381
382
|
# File 'lib/html-parser.rb', line 380
# Asks the formatter for a line break; +attrs+ is ignored.
def do_br(attrs)
  @formatter.add_line_break
end
|
#do_dd(attrs) ⇒ Object
299
300
301
302
303
|
# File 'lib/html-parser.rb', line 299
# Closes any open 'dd' via ddpop, then pushes a 'dd' margin and a matching
# ['dd', '', 0] entry onto @list_stack. +attrs+ is ignored.
def do_dd(attrs)
  ddpop
  @formatter.push_margin('dd')
  @list_stack << ['dd', '', 0]
end
|
#do_dt(attrs) ⇒ Object
295
296
297
|
# File 'lib/html-parser.rb', line 295
# Delegates to ddpop with its default argument; +attrs+ is ignored.
def do_dt(attrs)
  ddpop
end
|
#do_hr(attrs) ⇒ Object
384
385
386
|
# File 'lib/html-parser.rb', line 384
# Asks the formatter for a horizontal rule; +attrs+ is ignored.
def do_hr(attrs)
  @formatter.add_hor_rule
end
|
#do_img(attrs) ⇒ Object
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
|
# File 'lib/html-parser.rb', line 388
# Collects the image attributes from +attrs+ (name/value pairs) and passes
# them to handle_image as (src, alt, ismap, align, width, height).
#
# Defaults: alt is '(image)', width and height are 0, everything else is nil.
# A width/height that Integer() cannot convert (e.g. "100%", "", nil) is
# ignored and the previously held value is kept, so malformed markup no
# longer raises out of the parser.
#
# Returns whatever handle_image returns.
def do_img(attrs)
  align = nil
  alt = '(image)'
  ismap = nil
  src = nil
  width = 0
  height = 0
  attrs.each do |attrname, value|
    case attrname
    when 'align' then align = value
    when 'alt'   then alt = value
    when 'ismap' then ismap = value
    when 'src'   then src = value
    when 'width', 'height'
      begin
        pixels = Integer(value)
      rescue ArgumentError, TypeError
        # Not a plain integer: leave the dimension untouched.
        next
      end
      if attrname == 'width'
        width = pixels
      else
        height = pixels
      end
    end
  end
  handle_image(src, alt, ismap, align, width, height)
end
|
#do_isindex(attrs) ⇒ Object
90
91
92
|
# File 'lib/html-parser.rb', line 90
# Sets the @isindex flag to 1; +attrs+ is ignored.
def do_isindex(attrs)
  @isindex = 1
end
|
#do_li(attrs) ⇒ Object
233
234
235
236
237
238
239
240
241
242
|
# File 'lib/html-parser.rb', line 233
# Ends the current paragraph and emits a list-item label.
#
# Inside a list, the innermost @list_stack entry supplies the label and its
# counter (third slot) is incremented in place before use. Outside any list
# the label is '*' with counter 0. +attrs+ is ignored.
def do_li(attrs)
  @formatter.end_paragraph(0)
  if @list_stack && !@list_stack.empty?
    entry = @list_stack.last
    label = entry[1]
    counter = entry[2] + 1
    entry[2] = counter
  else
    label = '*'
    counter = 0
  end
  @formatter.add_label_data(label, counter)
end
|
#do_link(attrs) ⇒ Object
94
95
|
# File 'lib/html-parser.rb', line 94
# Intentionally empty: +attrs+ is accepted and nothing is done.
def do_link(attrs)
  # no-op
end
|
97
98
|
# File 'lib/html-parser.rb', line 97
# Intentionally empty: +attrs+ is accepted and nothing is done.
def do_meta(attrs)
  # no-op
end
|
#do_nextid(attrs) ⇒ Object
100
101
|
# File 'lib/html-parser.rb', line 100
# Intentionally empty: +attrs+ is accepted and nothing is done.
def do_nextid(attrs)
  # no-op
end
|
#do_p(attrs) ⇒ Object
164
165
166
|
# File 'lib/html-parser.rb', line 164
# Ends the current paragraph, passing 1 to the formatter; +attrs+ is ignored.
def do_p(attrs)
  @formatter.end_paragraph(1)
end
|
#do_plaintext(attrs) ⇒ Object
418
419
420
421
|
# File 'lib/html-parser.rb', line 418
# Enters preformatted mode via start_pre, then calls the inherited
# setnomoretags (listed among the SGMLParser methods).
def do_plaintext(attrs)
  start_pre(attrs)
  setnomoretags
end
|
376
377
378
|
# File 'lib/html-parser.rb', line 376
# Delegates to anchor_end.
def end_a
  anchor_end
end
|
#end_address ⇒ Object
204
205
206
207
|
# File 'lib/html-parser.rb', line 204
# Ends the current paragraph (passing 0) and pops the font that
# start_address pushed.
def end_address
  @formatter.end_paragraph(0)
  @formatter.pop_font
end
|
346
347
348
|
# File 'lib/html-parser.rb', line 346
# Pops the most recently pushed font from the formatter.
def end_b
  @formatter.pop_font
end
|
#end_blockquote ⇒ Object
214
215
216
217
|
# File 'lib/html-parser.rb', line 214
# Ends the current paragraph (passing 1) and pops the margin that
# start_blockquote pushed.
def end_blockquote
  @formatter.end_paragraph(1)
  @formatter.pop_margin
end
|
72
|
# File 'lib/html-parser.rb', line 72
# Intentionally empty: nothing is done.
def end_body
  # no-op
end
|
316
|
# File 'lib/html-parser.rb', line 316
# Delegates to end_i.
def end_cite
  end_i
end
|
319
|
# File 'lib/html-parser.rb', line 319
# Delegates to end_tt.
def end_code
  end_tt
end
|
279
280
281
|
# File 'lib/html-parser.rb', line 279
# Delegates to end_ul.
def end_dir
  end_ul
end
|
288
289
290
291
292
293
|
# File 'lib/html-parser.rb', line 288
# Closes any open 'dd' via ddpop(1), then removes the innermost
# @list_stack entry if there is one.
def end_dl
  ddpop(1)
  @list_stack.pop unless @list_stack.length == 0
end
|
322
|
# File 'lib/html-parser.rb', line 322
# Delegates to end_i.
def end_em
  end_i
end
|
109
110
111
112
|
# File 'lib/html-parser.rb', line 109
# Ends the current paragraph (passing 1) and pops the font pushed by start_h1.
def end_h1
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
119
120
121
122
|
# File 'lib/html-parser.rb', line 119
# Ends the current paragraph (passing 1) and pops the font pushed by start_h2.
def end_h2
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
129
130
131
132
|
# File 'lib/html-parser.rb', line 129
# Ends the current paragraph (passing 1) and pops the font pushed by start_h3.
def end_h3
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
139
140
141
142
|
# File 'lib/html-parser.rb', line 139
# Ends the current paragraph (passing 1) and pops the font pushed by start_h4.
def end_h4
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
149
150
151
152
|
# File 'lib/html-parser.rb', line 149
# Ends the current paragraph (passing 1) and pops the font pushed by start_h5.
def end_h5
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
159
160
161
162
|
# File 'lib/html-parser.rb', line 159
# Ends the current paragraph (passing 1) and pops the font pushed by start_h6.
def end_h6
  @formatter.end_paragraph(1)
  @formatter.pop_font
end
|
69
|
# File 'lib/html-parser.rb', line 69
# Intentionally empty: nothing is done.
def end_head
  # no-op
end
|
66
|
# File 'lib/html-parser.rb', line 66
# Intentionally empty: nothing is done.
def end_html
  # no-op
end
|
339
340
341
|
# File 'lib/html-parser.rb', line 339
# Pops the most recently pushed font from the formatter.
def end_i
  @formatter.pop_font
end
|
325
|
# File 'lib/html-parser.rb', line 325
# Delegates to end_tt.
def end_kbd
  end_tt
end
|
#end_listing ⇒ Object
195
196
197
|
# File 'lib/html-parser.rb', line 195
# Delegates to end_pre.
def end_listing
  end_pre
end
|
271
272
273
|
# File 'lib/html-parser.rb', line 271
# Delegates to end_ul.
#
# NOTE(review): the method name was missing from this listing, so the text
# parsed as an empty redefinition of end_ul. The name end_menu is restored
# from the listing's alphabetical position (between end_listing and end_ol)
# and its source line (271) - confirm against lib/html-parser.rb.
def end_menu
  end_ul
end
|
259
260
261
262
263
264
265
|
# File 'lib/html-parser.rb', line 259
# Removes the innermost @list_stack entry (a no-op on an empty stack), then
# ends the paragraph (passing 0) and pops the margin.
def end_ol
  @list_stack.pop if @list_stack
  @formatter.end_paragraph(0)
  @formatter.pop_margin
end
|
174
175
176
177
178
179
|
# File 'lib/html-parser.rb', line 174
# Ends the current paragraph (passing 1), pops the font, and decrements the
# @nofill nesting counter without letting it go below zero.
def end_pre
  @formatter.end_paragraph(1)
  @formatter.pop_font
  @nofill -= 1
  @nofill = 0 if @nofill < 0
end
|
328
|
# File 'lib/html-parser.rb', line 328
# Delegates to end_tt.
def end_samp
  end_tt
end
|
#end_strong ⇒ Object
331
|
# File 'lib/html-parser.rb', line 331
# Delegates to end_b.
def end_strong
  end_b
end
|
#end_title ⇒ Object
78
79
80
|
# File 'lib/html-parser.rb', line 78
# Stores the text returned by save_end in @title.
def end_title
  @title = save_end
end
|
353
354
355
|
# File 'lib/html-parser.rb', line 353
# Pops the most recently pushed font from the formatter.
def end_tt
  @formatter.pop_font
end
|
225
226
227
228
229
230
231
|
# File 'lib/html-parser.rb', line 225
# Removes the innermost @list_stack entry (a no-op on an empty stack), then
# ends the paragraph (passing 0) and pops the margin.
def end_ul
  @list_stack.pop if @list_stack
  @formatter.end_paragraph(0)
  @formatter.pop_margin
end
|
334
|
# File 'lib/html-parser.rb', line 334
# Delegates to end_i.
def end_var
  end_i
end
|
186
187
188
|
# File 'lib/html-parser.rb', line 186
# Delegates to end_pre.
def end_xmp
  end_pre
end
|
#handle_data(data) ⇒ Object
21
22
23
24
25
26
27
28
29
30
31
|
# File 'lib/html-parser.rb', line 21
# Routes a chunk of text.
#
# While a capture is active (@savedata is set) the text is appended to it.
# Otherwise it goes to the formatter: as flowing data when @nofill is 0,
# as literal data when it is not.
def handle_data(data)
  if @savedata
    @savedata += data
  elsif @nofill == 0
    @formatter.add_flowing_data(data)
  else
    @formatter.add_literal_data(data)
  end
end
|
#handle_image(src, alt, *args) ⇒ Object
61
62
63
|
# File 'lib/html-parser.rb', line 61
# Emits the image's +alt+ text through handle_data. +src+ and any extra
# arguments are accepted but unused here.
def handle_image(src, alt, *args)
  handle_data(alt)
end
|
33
34
35
|
# File 'lib/html-parser.rb', line 33
# Starts a capture: @savedata becomes an empty string, which makes
# handle_data accumulate text instead of sending it to the formatter.
def save_bgn
  @savedata = ''
end
|
37
38
39
40
41
42
43
44
45
|
# File 'lib/html-parser.rb', line 37
# Finishes a capture and returns the collected text.
#
# @savedata is reset to nil. A missing capture yields ''. When @nofill is 0
# the text is split on whitespace and re-joined with single spaces.
def save_end
  text = @savedata
  @savedata = nil
  text = '' if text.nil?
  text = text.split.join(" ") if @nofill == 0
  text
end
|
#start_a(attrs) ⇒ Object
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
|
# File 'lib/html-parser.rb', line 357
# Extracts href, name and type from the attribute pairs and hands them to
# anchor_bgn. Every attribute value is stripped of surrounding whitespace
# (including ones that are not used); the type is also lower-cased.
# Attributes that are absent are passed as nil.
def start_a(attrs)
  href = name = type = nil
  attrs.each do |attrname, raw|
    value = raw.strip
    case attrname
    when 'href' then href = value
    when 'name' then name = value
    when 'type' then type = value.downcase
    end
  end
  anchor_bgn(href, name, type)
end
|
#start_address(attrs) ⇒ Object
199
200
201
202
|
# File 'lib/html-parser.rb', line 199
# Ends the current paragraph (passing 0) and pushes the font spec
# (nil, 1, nil, nil) - the same spec start_i uses. +attrs+ is ignored.
def start_address(attrs)
  @formatter.end_paragraph(0)
  @formatter.push_font(nil, 1, nil, nil)
end
|
#start_b(attrs) ⇒ Object
343
344
345
|
# File 'lib/html-parser.rb', line 343
# Pushes the font spec (nil, nil, 1, nil) onto the formatter.
# NOTE(review): the third slot is presumably the bold flag - confirm
# against the formatter. +attrs+ is ignored.
def start_b(attrs)
  @formatter.push_font(nil, nil, 1, nil)
end
|
#start_blockquote(attrs) ⇒ Object
209
210
211
212
|
# File 'lib/html-parser.rb', line 209
# Ends the current paragraph (passing 1) and pushes a 'blockquote' margin.
# +attrs+ is ignored.
def start_blockquote(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_margin('blockquote')
end
|
#start_body(attrs) ⇒ Object
71
|
# File 'lib/html-parser.rb', line 71
# Intentionally empty: +attrs+ is accepted and nothing is done.
def start_body(attrs)
  # no-op
end
|
#start_cite(attrs) ⇒ Object
315
|
# File 'lib/html-parser.rb', line 315
# Delegates to start_i with the same attributes.
def start_cite(attrs)
  start_i(attrs)
end
|
#start_code(attrs) ⇒ Object
318
|
# File 'lib/html-parser.rb', line 318
# Delegates to start_tt with the same attributes.
def start_code(attrs)
  start_tt(attrs)
end
|
#start_dir(attrs) ⇒ Object
275
276
277
|
# File 'lib/html-parser.rb', line 275
# Delegates to start_ul with the same attributes.
def start_dir(attrs)
  start_ul(attrs)
end
|
#start_dl(attrs) ⇒ Object
283
284
285
286
|
# File 'lib/html-parser.rb', line 283
# Ends the current paragraph (passing 1) and pushes a ['dl', '', 0] entry
# onto @list_stack. +attrs+ is ignored.
def start_dl(attrs)
  @formatter.end_paragraph(1)
  @list_stack.push(['dl', '', 0])
end
|
#start_em(attrs) ⇒ Object
321
|
# File 'lib/html-parser.rb', line 321
# Delegates to start_i with the same attributes.
def start_em(attrs)
  start_i(attrs)
end
|
#start_h1(attrs) ⇒ Object
104
105
106
107
|
# File 'lib/html-parser.rb', line 104
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h1', 0, 1, 0). +attrs+ is ignored.
def start_h1(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h1', 0, 1, 0)
end
|
#start_h2(attrs) ⇒ Object
114
115
116
117
|
# File 'lib/html-parser.rb', line 114
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h2', 0, 1, 0). +attrs+ is ignored.
def start_h2(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h2', 0, 1, 0)
end
|
#start_h3(attrs) ⇒ Object
124
125
126
127
|
# File 'lib/html-parser.rb', line 124
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h3', 0, 1, 0). +attrs+ is ignored.
def start_h3(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h3', 0, 1, 0)
end
|
#start_h4(attrs) ⇒ Object
134
135
136
137
|
# File 'lib/html-parser.rb', line 134
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h4', 0, 1, 0). +attrs+ is ignored.
def start_h4(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h4', 0, 1, 0)
end
|
#start_h5(attrs) ⇒ Object
144
145
146
147
|
# File 'lib/html-parser.rb', line 144
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h5', 0, 1, 0). +attrs+ is ignored.
def start_h5(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h5', 0, 1, 0)
end
|
#start_h6(attrs) ⇒ Object
154
155
156
157
|
# File 'lib/html-parser.rb', line 154
# Ends the current paragraph (passing 1) and pushes the font spec
# ('h6', 0, 1, 0). +attrs+ is ignored.
def start_h6(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font('h6', 0, 1, 0)
end
|
#start_head(attrs) ⇒ Object
68
|
# File 'lib/html-parser.rb', line 68
# Intentionally empty: +attrs+ is accepted and nothing is done.
def start_head(attrs)
  # no-op
end
|
#start_html(attrs) ⇒ Object
65
|
# File 'lib/html-parser.rb', line 65
# Intentionally empty: +attrs+ is accepted and nothing is done.
def start_html(attrs)
  # no-op
end
|
#start_i(attrs) ⇒ Object
336
337
338
|
# File 'lib/html-parser.rb', line 336
# Pushes the font spec (nil, 1, nil, nil) onto the formatter.
# NOTE(review): the second slot is presumably the italic flag - confirm
# against the formatter. +attrs+ is ignored.
def start_i(attrs)
  @formatter.push_font(nil, 1, nil, nil)
end
|
#start_kbd(attrs) ⇒ Object
324
|
# File 'lib/html-parser.rb', line 324
# Delegates to start_tt with the same attributes.
def start_kbd(attrs)
  start_tt(attrs)
end
|
#start_listing(attrs) ⇒ Object
190
191
192
193
|
# File 'lib/html-parser.rb', line 190
# Enters preformatted mode via start_pre, then calls the inherited
# setliteral with 'listing'.
def start_listing(attrs)
  start_pre(attrs)
  setliteral('listing')
end
|
267
268
269
|
# File 'lib/html-parser.rb', line 267
# Delegates to start_ul with the same attributes.
#
# NOTE(review): the method name was missing from this listing, leaving a
# syntactically invalid `def (attrs)`. The name start_menu is restored from
# the listing's alphabetical position (between start_listing and start_ol)
# and its source line (267) - confirm against lib/html-parser.rb.
def start_menu(attrs)
  start_ul(attrs)
end
|
#start_ol(attrs) ⇒ Object
244
245
246
247
248
249
250
251
252
253
254
255
256
257
|
# File 'lib/html-parser.rb', line 244
# Opens an ordered list: ends the paragraph (passing 0), pushes an 'ol'
# margin and pushes ['ol', label, 0] onto @list_stack.
#
# The label defaults to '1.'. A 'type' attribute whose value is exactly one
# character long replaces it with that character followed by '.'; longer or
# empty type values leave the label unchanged.
def start_ol(attrs)
  @formatter.end_paragraph(0)
  @formatter.push_margin('ol')
  label = '1.'
  attrs.each do |key, val|
    next unless key == 'type' && val.length == 1

    label = val + '.'
  end
  @list_stack << ['ol', label, 0]
end
|
#start_pre(attrs) ⇒ Object
168
169
170
171
172
|
# File 'lib/html-parser.rb', line 168
# Ends the current paragraph (passing 1), pushes the font spec
# (nil, nil, nil, 1) and increments the @nofill nesting counter, which makes
# handle_data emit literal rather than flowing data. +attrs+ is ignored.
def start_pre(attrs)
  @formatter.end_paragraph(1)
  @formatter.push_font(nil, nil, nil, 1)
  @nofill += 1
end
|
#start_samp(attrs) ⇒ Object
327
|
# File 'lib/html-parser.rb', line 327
# Delegates to start_tt with the same attributes.
def start_samp(attrs)
  start_tt(attrs)
end
|
#start_strong(attrs) ⇒ Object
330
|
# File 'lib/html-parser.rb', line 330
# Delegates to start_b with the same attributes.
def start_strong(attrs)
  start_b(attrs)
end
|
#start_title(attrs) ⇒ Object
74
75
76
|
# File 'lib/html-parser.rb', line 74
# Starts capturing text via save_bgn; +attrs+ is ignored.
def start_title(attrs)
  save_bgn
end
|
#start_tt(attrs) ⇒ Object
350
351
352
|
# File 'lib/html-parser.rb', line 350
# Pushes the font spec (nil, nil, nil, 1) onto the formatter.
# NOTE(review): the fourth slot is presumably the fixed-width flag - confirm
# against the formatter. +attrs+ is ignored.
def start_tt(attrs)
  @formatter.push_font(nil, nil, nil, 1)
end
|
#start_ul(attrs) ⇒ Object
219
220
221
222
223
|
# File 'lib/html-parser.rb', line 219
# Opens an unordered list: ends the paragraph (passing 0), pushes a 'ul'
# margin and pushes ['ul', '*', 0] onto @list_stack. +attrs+ is ignored.
def start_ul(attrs)
  @formatter.end_paragraph(0)
  @formatter.push_margin('ul')
  @list_stack.push(['ul', '*', 0])
end
|
#start_var(attrs) ⇒ Object
333
|
# File 'lib/html-parser.rb', line 333
# Delegates to start_i with the same attributes.
def start_var(attrs)
  start_i(attrs)
end
|
#start_xmp(attrs) ⇒ Object
181
182
183
184
|
# File 'lib/html-parser.rb', line 181
# Enters preformatted mode via start_pre, then calls the inherited
# setliteral with 'xmp'.
def start_xmp(attrs)
  start_pre(attrs)
  setliteral('xmp')
end
|
#unknown_endtag(tag) ⇒ Object
426
427
|
# File 'lib/html-parser.rb', line 426
# Intentionally empty: +tag+ is accepted and nothing is done.
def unknown_endtag(tag)
  # no-op
end
|
#unknown_starttag(tag, attrs) ⇒ Object
423
424
|
# File 'lib/html-parser.rb', line 423
# Intentionally empty: +tag+ and +attrs+ are accepted and nothing is done.
def unknown_starttag(tag, attrs)
  # no-op
end
|