Module: Ircbot::Utils::HtmlParser

Defined in:
lib/ircbot/utils/html_parser.rb

Instance Method Summary

Instance Method Details

#get_title(html) ⇒ Object



6
7
8
9
# File 'lib/ircbot/utils/html_parser.rb', line 6

# Extracts the text of the first <title> element found in +html+.
#
# Matching is case-insensitive and spans newlines. The opening tag may carry
# attributes (e.g. <title lang="en">). The captured text is stripped and then
# passed through trim_tags before being returned.
#
# @param html [String, nil] the document to search; nil yields ""
# @return [String] the cleaned title, or "" when no title element is present
def get_title(html)
  # \b keeps longer tag names (e.g. <titles>) from matching; [^>]* permits
  # attributes on the opening tag.
  found = %r{<title\b[^>]*>(.*?)</title>}mi.match(html)
  return '' unless found

  trim_tags(found[1].strip)
end

#trim_tags(html) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/ircbot/utils/html_parser.rb', line 11

# Reduces an HTML fragment to its plain-text content.
#
# Steps, in order:
# 1. drop <head>, <script>, <style> and <noscript> elements with their bodies;
# 2. drop comments (before the generic tag pass, so a ">" inside a comment
#    cannot cut the comment short and leak its tail into the output);
# 3. drop remaining tags, including tags whose attributes wrap across lines;
# 4. drop leftover <!...> declarations;
# 5. collapse whitespace runs to one space, strip, and decode HTML entities.
#
# The argument itself is left untouched (a copy is edited), so frozen strings
# are accepted.
#
# @param html [String] markup to clean
# @return [String] a new string holding the visible text
def trim_tags(html)
  text = html.dup

  %w[head script style noscript].each do |tag|
    text.gsub!(%r{<#{tag}.*?>.*?</#{tag}>}mi, '')
  end

  text.gsub!(/<!--.*?-->/m, '')
  # [^<>] matches newlines, so multi-line tags are removed, while a stray "<"
  # in running text does not swallow everything up to the next tag's ">".
  text.gsub!(/<[^<>]*>/, '')
  text.gsub!(/<!\w.*?>/m, '')

  CGI.unescapeHTML(text.gsub(/\s+/, ' ').strip)
end