Class: GuessHtmlEncoding::HTMLScanner

Inherits:
Object
  • Object
show all
Defined in:
lib/guess_html_encoding.rb

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ HTMLScanner

Returns a new instance of HTMLScanner.



59
60
61
# File 'lib/guess_html_encoding.rb', line 59

# Builds a scanner around the given HTML.
#
# @param html the markup to scan, stored unchanged in @html (presumably a
#   String: #encoding calls #length and #[] on it — confirm against callers)
def initialize(html)
  @html = html
end

Instance Method Details

#encoding ⇒ Object

Returns the encoding sniffed from the content of an HTML page, as determined using an implementation of the algorithm to ‘prescan a byte stream to determine its encoding’, as specified by the HTML specification: www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/guess_html_encoding.rb', line 67

# Sniffs the character encoding declared inside the HTML held in @html.
#
# Walks the document from the start, one construct at a time:
# * HTML comments are skipped up to their closing '-->';
# * '<meta' followed by whitespace or '/' is handed to charset_from_meta,
#   and the first charset it reports ends the scan;
# * other '<!', '</' and '<?' constructs are skipped up to the next '>'.
# Anything else is stepped over one character at a time. An unterminated
# comment or construct consumes the rest of the document.
#
# NOTE(review): positions are String character offsets, not byte offsets —
# presumably callers pass a binary (ASCII-8BIT) string; confirm.
#
# @return [Object, nil] whatever charset_from_meta reported as the charset
#   (presumably a String — confirm), or nil when no meta tag yields one
def encoding
  position = 0
  charset = nil
  length = @html.length

  while position < length

    # First look for a standard HTML comment (ie <!-- blah -->)
    if @html[position, 4] == '<!--'

      # Search from two characters in, so the dashes of '<!--' may double as
      # the dashes of the terminator (ie '<!-->' is a complete comment).
      # Running past the end when there is no terminator stops the loop.
      search_from = position + 2
      position = @html.index('-->', search_from) || (search_from + length)

    # Then look for the start of a meta tag
    elsif @html[position, 6] =~ /\A<meta[\s\/]/i

      # The helper receives the text starting at the character right after
      # 'meta', and reports how far to advance when it finds no charset.
      charset, position_increment = charset_from_meta(@html[position + 5, length])

      break if charset

      position += position_increment

    # Then look for <! or </ or <?
    elsif @html[position, 2] =~ /\A<[!\/?]/

      # Advance position to the first > that appears next in string, or end
      position = @html.index('>', position) || (position + length)

    end

    # Advance position to next character
    position += 1
  end

  charset
end