Class: HTML::Encoder

Inherits: Object
Defined in:
lib/HTML/Encoder.rb,
lib/HTML/Encoder/Unicode.rb

Defined Under Namespace

Classes: Unicode

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Encoder

Returns a new instance of Encoder.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/HTML/Encoder.rb', line 7

def initialize()

    @entity2char = {
        'amp'   => '&',  # ampersand 
        'gt'    => '>',  # greater than
        'lt'    => '<',  # less than
        'quot'  => '"',  # double quote
        'apos'  => "'",  # single quote

        # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
        'AElig'  => 198.chr,  # capital AE diphthong (ligature)
        'Aacute' => 193.chr,  # capital A, acute accent
        'Acirc'  => 194.chr,  # capital A, circumflex accent
        'Agrave' => 192.chr,  # capital A, grave accent
        'Aring'  => 197.chr,  # capital A, ring
        'Atilde' => 195.chr,  # capital A, tilde
        'Auml'   => 196.chr,  # capital A, dieresis or umlaut mark
        'Ccedil' => 199.chr,  # capital C, cedilla
        'ETH'    => 208.chr,  # capital Eth, Icelandic
        'Eacute' => 201.chr,  # capital E, acute accent
        'Ecirc'  => 202.chr,  # capital E, circumflex accent
        'Egrave' => 200.chr,  # capital E, grave accent
        'Euml'   => 203.chr,  # capital E, dieresis or umlaut mark
        'Iacute' => 205.chr,  # capital I, acute accent
        'Icirc'  => 206.chr,  # capital I, circumflex accent
        'Igrave' => 204.chr,  # capital I, grave accent
        'Iuml'   => 207.chr,  # capital I, dieresis or umlaut mark
        'Ntilde' => 209.chr,  # capital N, tilde
        'Oacute' => 211.chr,  # capital O, acute accent
        'Ocirc'  => 212.chr,  # capital O, circumflex accent
        'Ograve' => 210.chr,  # capital O, grave accent
        'Oslash' => 216.chr,  # capital O, slash
        'Otilde' => 213.chr,  # capital O, tilde
        'Ouml'   => 214.chr,  # capital O, dieresis or umlaut mark
        'THORN'  => 222.chr,  # capital THORN, Icelandic
        'Uacute' => 218.chr,  # capital U, acute accent
        'Ucirc'  => 219.chr,  # capital U, circumflex accent
        'Ugrave' => 217.chr,  # capital U, grave accent
        'Uuml'   => 220.chr,  # capital U, dieresis or umlaut mark
        'Yacute' => 221.chr,  # capital Y, acute accent
        'aacute' => 225.chr,  # small a, acute accent
        'acirc'  => 226.chr,  # small a, circumflex accent
        'aelig'  => 230.chr,  # small ae diphthong (ligature)
        'agrave' => 224.chr,  # small a, grave accent
        'aring'  => 229.chr,  # small a, ring
        'atilde' => 227.chr,  # small a, tilde
        'auml'   => 228.chr,  # small a, dieresis or umlaut mark
        'ccedil' => 231.chr,  # small c, cedilla
        'eacute' => 233.chr,  # small e, acute accent
        'ecirc'  => 234.chr,  # small e, circumflex accent
        'egrave' => 232.chr,  # small e, grave accent
        'eth'    => 240.chr,  # small eth, Icelandic
        'euml'   => 235.chr,  # small e, dieresis or umlaut mark
        'iacute' => 237.chr,  # small i, acute accent
        'icirc'  => 238.chr,  # small i, circumflex accent
        'igrave' => 236.chr,  # small i, grave accent
        'iuml'   => 239.chr,  # small i, dieresis or umlaut mark
        'ntilde' => 241.chr,  # small n, tilde
        'oacute' => 243.chr,  # small o, acute accent
        'ocirc'  => 244.chr,  # small o, circumflex accent
        'ograve' => 242.chr,  # small o, grave accent
        'oslash' => 248.chr,  # small o, slash
        'otilde' => 245.chr,  # small o, tilde
        'ouml'   => 246.chr,  # small o, dieresis or umlaut mark
        'szlig'  => 223.chr,  # small sharp s, German (sz ligature)
        'thorn'  => 254.chr,  # small thorn, Icelandic
        'uacute' => 250.chr,  # small u, acute accent
        'ucirc'  => 251.chr,  # small u, circumflex accent
        'ugrave' => 249.chr,  # small u, grave accent
        'uuml'   => 252.chr,  # small u, dieresis or umlaut mark
        'yacute' => 253.chr,  # small y, acute accent
        'yuml'   => 255.chr,  # small y, dieresis or umlaut mark

        # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
        'copy'   => 169.chr,  # copyright sign
        'reg'    => 174.chr,  # registered sign
        'nbsp'   => 160.chr,  # non breaking space

        # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
        'iexcl'  => 161.chr,
        'cent'   => 162.chr,
        'pound'  => 163.chr,
        'curren' => 164.chr,
        'yen'    => 165.chr,
        'brvbar' => 166.chr,
        'sect'   => 167.chr,
        'uml'    => 168.chr,
        'ordf'   => 170.chr,
        'laquo'  => 171.chr,
        'not'    => 172.chr,
        'shy'    => 173.chr,
        'macr'   => 175.chr,
        'deg'    => 176.chr,
        'plusmn' => 177.chr,
        'sup1'   => 185.chr,
        'sup2'   => 178.chr,
        'sup3'   => 179.chr,
        'acute'  => 180.chr,
        'micro'  => 181.chr,
        'para'   => 182.chr,
        'middot' => 183.chr,
        'cedil'  => 184.chr,
        'ordm'   => 186.chr,
        'raquo'  => 187.chr,
        'frac14' => 188.chr,
        'frac12' => 189.chr,
        'frac34' => 190.chr,
        'iquest' => 191.chr,
        'times'  => 215.chr,
        'divide' => 247.chr,
    }

    if RUBY_VERSION > '1.8.7'
        HTML::Encoder::Unicode.unicode_mapping.each{ |k,v|
            @entity2char[k] = v
        }
    end

    @char2entity = Hash[@entity2char.map { |k, v| [v, "&#{k};"] }]

    for i in 0..255
        unless @char2entity.has_key?( i.chr )
            @char2entity[i.chr] = "&##{i};";
        end
    end

end

Instance Method Details

#encode(string, *args) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/HTML/Encoder.rb', line 135

def encode( string, *args )

    if (! args[0].nil? and ! args[0].to_s.empty?)
        lookup = {}
        args[0].to_s.each_char{ |c|
            lookup[c] = @char2entity[c].nil? ? num_entity(c) : @char2entity[c]
        }
        string = string.to_s.gsub( /./ ) {|c| lookup[c].nil? ? c : lookup[c] }
    else             
        # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
        string = string.to_s.gsub( /([^\n\r\t !\#\$%\(-;=?-~])/ ) {|c| 
            @char2entity[c].nil? ? num_entity(c) : @char2entity[c]
        }
    end

    return string

end

#encode_hex(*args) ⇒ Object



154
155
156
157
158
159
160
# File 'lib/HTML/Encoder.rb', line 154

def encode_hex( *args )
    tmp = @char2entity
    @char2entity = {}
    string = encode( *args )
    @char2entity = tmp
    return string
end

#num_entity(char) ⇒ Object



162
163
164
# File 'lib/HTML/Encoder.rb', line 162

def num_entity( char )
    return sprintf( '&#x%X;', char.unpack('C')[0] )
end