Class: Boilerpipe::SAX::TagActions::AnchorText

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/sax/tag_actions/anchor_text.rb

Instance Method Summary collapse

Instance Method Details

#append_anchor_text_end(handler) ⇒ Object



35
36
37
38
39
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 35

# Writes the "anchor text end" marker into the handler's token stream.
#
# The handler is first asked to append a space, then the ANCHOR_TEXT_END
# marker is appended as a token, followed by a single-space token.
#
# @param handler [#append_space, #append_token] content handler receiving the tokens
# @return [Object] whatever the final handler.append_token call returns
def append_anchor_text_end(handler)
  handler.append_space
  end_marker = Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END
  handler.append_token(end_marker)
  handler.append_token(' ')
end

#append_anchor_text_start(handler) ⇒ Object



29
30
31
32
33
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 29

# Writes the "anchor text start" marker into the handler's token stream.
#
# The handler is first asked to append a space, then the ANCHOR_TEXT_START
# marker is appended as a token, followed by a single-space token.
#
# @param handler [#append_space, #append_token] content handler receiving the tokens
# @return [Object] whatever the final handler.append_token call returns
def append_anchor_text_start(handler)
  handler.append_space
  start_marker = Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START
  handler.append_token(start_marker)
  handler.append_token(' ')
end

#changes_tag_level? ⇒ Boolean

Returns:

  • (Boolean)


25
26
27
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 25

# Reports whether this tag action changes the tag level.
#
# For the anchor-text action the answer is unconditionally true.
#
# @return [Boolean] always true
def changes_tag_level?
  true
end

#end_tag(handler, name) ⇒ Object



19
20
21
22
23
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 19

# Handles the closing of an anchor tag.
#
# Decrements the handler's anchor counter, then emits the anchor-text end
# marker unless the handler still reports being inside an anchor tag or
# inside an ignorable element.
#
# @param handler [Object] content handler exposing in_anchor_tag (read/write),
#   in_anchor_tag? and in_ignorable_element?
# @param name [String] tag name; not used by this method
# @return [false] always false
def end_tag(handler, name)
  handler.in_anchor_tag -= 1

  # The ignorable check is only evaluated when the anchor check is falsy.
  suppress_marker = handler.in_anchor_tag? || handler.in_ignorable_element?
  append_anchor_text_end(handler) unless suppress_marker

  false
end

#nested_achor_tag_error_recovering(handler, name) ⇒ Object



41
42
43
44
45
46
47
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 41

# Recovery hook used when an anchor start is seen while the handler already
# reports being inside an anchor.
#
# Nested A elements are not allowed per specification, so reaching this
# method probably indicates a bug in the upstream HTML/XML parser (the
# original note cites NekoHTML bug #2909310).
# NOTE(review): whether Nokogiri can produce such nesting is unconfirmed.
# No warning is printed; recovery consists solely of running the regular
# end-tag handling.
#
# @param handler [Object] content handler, forwarded to end_tag
# @param name [String] tag name, forwarded to end_tag
# @return [Object] the result of end_tag
def nested_achor_tag_error_recovering(handler, name)
  end_tag(handler, name)
end

#start(handler, name, attrs) ⇒ Object

Marks this tag as “anchor” (this should usually only be set for the <A> tag). Anchor tags may not be nested. There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe encounters such nesting, this implementation does not raise an exception; it tries to recover by calling #nested_achor_tag_error_recovering.



6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/boilerpipe/sax/tag_actions/anchor_text.rb', line 6

# Handles the opening of an anchor tag.
#
# The handler's anchor counter is incremented in every case. If the handler
# already reported being inside an anchor before the increment, the nested
# tag is passed to nested_achor_tag_error_recovering and no start marker is
# written. Otherwise the anchor-text start marker is appended, unless the
# handler is inside an ignorable element.
#
# @param handler [Object] content handler exposing in_anchor_tag (read/write),
#   in_anchor_tag? and in_ignorable_element?
# @param name [String] tag name, forwarded to the recovery hook
# @param attrs [Object] tag attributes; not used by this method
# @return [false] always false, including on the nested-anchor path (which
#   previously fell out through a bare `return` and yielded nil)
def start(handler, name, attrs)
  # Nesting must be sampled before the counter is bumped.
  already_in_anchor = handler.in_anchor_tag?
  handler.in_anchor_tag += 1

  if already_in_anchor
    nested_achor_tag_error_recovering(handler, name)
  elsif !handler.in_ignorable_element?
    append_anchor_text_start(handler)
  end

  false
end