Class: HTree::Encoder

Inherits:
Object
  • Object
show all
Defined in:
lib/htree/encoder.rb

Constant Summary collapse

# Maps each character that has to be escaped in text and attribute values
# to the character reference that replaces it. The escaping helpers look a
# matched character up here, so every value must be a real reference:
# '&' has to become '&amp;', not a bare '&'.
ChRef =
{
  '&' => '&amp;',
  '<' => '&lt;',
  '>' => '&gt;',
  '"' => '&quot;',
}
# Maps a $KCODE value to a MIME charset name. Encoder.internal_charset
# indexes this table with $KCODE when the Encoding constant is not defined.
KcodeCharset =

:stopdoc:

{
  'EUC' => 'EUC-JP',
  'SJIS' => 'Shift_JIS',
  'UTF8' => 'UTF-8',
  'NONE' => 'ISO-8859-1',
}
# Maps a charset name to a regexp that is anchored at the start of the
# string (\A) and matches one leading byte sequence in that charset.
# The `n` flag makes the patterns byte-oriented and `x` permits the
# multi-line layout. Encoder#initialize looks up the entry for the internal
# encoding and keeps it in @charpat.
FirstCharPattern =
{
  # EUC-JP: one byte 00-7f, two bytes a1-fe, 8e followed by one byte a1-fe,
  # or 8f followed by two bytes a1-fe.
  'EUC-JP' => /\A(?:
     [\x00-\x7f]
    |[\xa1-\xfe][\xa1-\xfe]
    |\x8e[\xa1-\xfe]
    |\x8f[\xa1-\xfe][\xa1-\xfe])/nx,
  # Shift_JIS: one byte 00-7f or a1-df, or a lead byte 81-9f / e0-fc
  # followed by a trail byte 40-7e or 80-fc.
  'Shift_JIS' => /\A(?:
     [\x00-\x7f]
    |[\x81-\x9f][\x40-\x7e\x80-\xfc]
    |[\xa1-\xdf]
    |[\xe0-\xfc][\x40-\x7e\x80-\xfc])/nx,
  # UTF-8: one byte 00-7f, or a lead byte c0-fd followed by one to five
  # continuation bytes 80-bf (the count is fixed by the lead byte range).
  'UTF-8' => /\A(?:
     [\x00-\x7f]
    |[\xc0-\xdf][\x80-\xbf]
    |[\xe0-\xef][\x80-\xbf][\x80-\xbf]
    |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]
    |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf]
    |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])/nx,
  # ISO-8859-1: any single byte.
  'ISO-8859-1' => /\A[\x00-\xff]/n
}
# Maps an output encoding to an ordered list of candidate sub-charsets.
# Encoder#initialize opens one converter per candidate, and
# Encoder#minimal_charset reports the first candidate whose converter is
# still registered, falling back to the output encoding itself.
# An empty list (or a missing key) means there are no candidates.
SubCharset =
{
  'ISO-2022-JP-2' => ['US-ASCII', 'ISO-2022-JP'],
  'ISO-2022-JP-3' => ['US-ASCII', 'ISO-2022-JP'],
  'UTF-16BE' => [],
  'UTF-16LE' => [],
  'UTF-16' => [],
}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output_encoding, internal_encoding = HTree::Encoder.internal_charset) ⇒ Encoder

Returns a new instance of Encoder.



22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/htree/encoder.rb', line 22

# Sets up an encoder that converts text from +internal_encoding+ into
# +output_encoding+.
#
# output_encoding::   charset name handed to Iconv as the target.
# internal_encoding:: charset name of the strings that will be fed in;
#                     defaults to HTree::Encoder.internal_charset.
#
# Besides the main converter, one extra converter is opened for every
# sub-charset that SubCharset lists for the output encoding (none when the
# output encoding has no entry). HTML output mode starts switched off.
def initialize(output_encoding, internal_encoding=HTree::Encoder.internal_charset)
  @buf = ''
  @output_encoding = output_encoding
  @internal_encoding = internal_encoding
  @ic = Iconv.new(output_encoding, @internal_encoding)
  @charpat = FirstCharPattern[internal_encoding]
  @subcharset_ic = {}
  @subcharset_list = SubCharset[output_encoding] || []
  @subcharset_list.each do |name|
    @subcharset_ic[name] = Iconv.new(name, @internal_encoding)
  end
  @html_output = false
end

Class Method Details

.internal_charset ⇒ Object

HTree::Encoder.internal_charset returns the MIME charset corresponding to $KCODE.

  • ‘ISO-8859-1’ when $KCODE==‘NONE’

  • ‘UTF-8’ when $KCODE==‘UTF8’

  • ‘EUC-JP’ when $KCODE==‘EUC’

  • ‘Shift_JIS’ when $KCODE==‘SJIS’

This mapping ignores EUC-KR and, at the very least, every single-byte charset other than ISO-8859-1. This should be fixed when Ruby is m17nized (multilingualized).



14
15
16
17
18
19
20
# File 'lib/htree/encoder.rb', line 14

# Returns the charset name used for strings inside the program.
#
# When the Encoding constant exists, this is the name of
# Encoding.default_external; otherwise it is the KcodeCharset entry for
# the current $KCODE.
def Encoder.internal_charset
  return Encoding.default_external.name if Object.const_defined?(:Encoding)

  KcodeCharset[$KCODE]
end

Instance Method Details

#finish ⇒ Object

:startdoc:



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/htree/encoder.rb', line 147

# Closes the main converter, appends whatever it returns to the output
# buffer and returns that buffer.
#
# Each sub-charset converter is closed as well; a converter is dropped from
# @subcharset_ic when its closing output differs from the main converter's
# or when closing it raises Iconv::Failure.
def finish
  tail = @ic.close
  @buf << tail
  @subcharset_ic.delete_if do |_name, converter|
    begin
      converter.close != tail
    rescue Iconv::Failure
      true
    end
  end
  @buf
end

#finish_with_xmldecl ⇒ Object



160
161
162
163
164
165
# File 'lib/htree/encoder.rb', line 160

# Finishes the output and returns it prefixed with an XML declaration.
#
# The declaration names the charset reported by minimal_charset and is
# itself converted from US-ASCII into the output encoding.
def finish_with_xmldecl
  body = finish
  # minimal_charset is consulted only after finish has run, as finish may
  # still remove sub-charset converters.
  declaration = "<?xml version=\"1.0\" encoding=\"#{minimal_charset}\"?>"
  encoded_declaration = Iconv.conv(@output_encoding, 'US-ASCII', declaration)
  encoded_declaration + body
end

#html_output=(flag) ⇒ Object



41
42
43
# File 'lib/htree/encoder.rb', line 41

# Switches HTML output mode on or off.
#
# flag:: stored as given; the other output methods only test its
#        truthiness.
def html_output=(flag)
  @html_output = flag
end

#html_output? ⇒ Boolean

:stopdoc:

Returns:

  • (Boolean)


37
38
39
# File 'lib/htree/encoder.rb', line 37

# Reports whether HTML output mode is active.
#
# Returns the stored flag itself, exactly as it was last assigned.
def html_output?
  @html_output
end

#minimal_charset ⇒ Object



167
168
169
170
171
172
173
174
# File 'lib/htree/encoder.rb', line 167

# Returns the first entry of @subcharset_list that still has a converter
# registered in @subcharset_ic, or the output encoding when none has.
def minimal_charset
  surviving = @subcharset_list.find { |name| @subcharset_ic.include?(name) }
  surviving || @output_encoding
end

#output_cdata_content(content, context) ⇒ Object



62
63
64
65
66
67
68
69
70
71
# File 'lib/htree/encoder.rb', line 62

# Writes the content of a CDATA-style element.
#
# content:: collection of nodes.
# context:: passed through to each node's +output+ in non-HTML mode.
#
# In HTML mode only the HTree::Text nodes are kept; they are concatenated
# and written through +output_cdata+. Otherwise every node is written with
# its regular +output+ method.
def output_cdata_content(content, context)
  unless @html_output
    return content.each { |node| node.output(self, context) }
  end

  # xxx: should raise an error for non-text node?
  text_nodes = content.grep(HTree::Text)
  HTree::Text.concat(*text_nodes).output_cdata(self)
end

#output_cdata_content_do(out, pre, body, post) ⇒ Object



45
46
47
48
49
50
51
52
53
54
# File 'lib/htree/encoder.rb', line 45

# Runs +body+, wrapped by +pre+ and +post+ when HTML output mode is on.
#
# out::  passed to +post+ and returned unchanged.
# pre::  callable invoked with no arguments before +body+ (HTML mode only).
# body:: callable invoked with no arguments in either mode.
# post:: callable invoked with +out+ after +body+ (HTML mode only).
def output_cdata_content_do(out, pre, body, post)
  # Read the flag once so both wrappers follow the same decision.
  html = @html_output
  pre.call if html
  body.call
  post.call(out) if html
  out
end

#output_cdata_for_html(*args) ⇒ Object



73
74
75
76
77
78
79
# File 'lib/htree/encoder.rb', line 73

# Joins +args+ into one string and writes it with output_string.
#
# Raises ArgumentError when the joined string contains "</"; nothing is
# written in that case.
def output_cdata_for_html(*args)
  joined = args.join('')
  raise ArgumentError, "cdata contains '</' : #{joined.inspect}" if joined =~ %r{</}

  output_string joined
end

#output_dynamic_attvalue(string) ⇒ Object



137
138
139
140
141
142
143
# File 'lib/htree/encoder.rb', line 137

# Escapes +string+ for use as an attribute value and writes it with
# output_text.
#
# An object that responds to +rcdata+ contributes its +rcdata+ and only
# <, > and " are replaced, leaving & alone. Anything else is converted
# with +to_s+ and & is replaced as well.
def output_dynamic_attvalue(string)
  if string.respond_to?(:rcdata)
    source = string.rcdata
    unsafe = /[<>"]/
  else
    source = string.to_s
    unsafe = /[&<>"]/
  end
  output_text(source.gsub(unsafe) { |char| ChRef[char] })
end

#output_dynamic_text(string) ⇒ Object



129
130
131
132
133
134
135
# File 'lib/htree/encoder.rb', line 129

# Escapes +string+ for use as text content and writes it with output_text.
#
# An object that responds to +rcdata+ contributes its +rcdata+ and only
# < and > are replaced, leaving & alone. Anything else is converted with
# +to_s+ and & is replaced as well.
def output_dynamic_text(string)
  if string.respond_to?(:rcdata)
    source = string.rcdata
    unsafe = /[<>]/
  else
    source = string.to_s
    unsafe = /[&<>]/
  end
  output_text(source.gsub(unsafe) { |char| ChRef[char] })
end

#output_slash_if_xml ⇒ Object



56
57
58
59
60
# File 'lib/htree/encoder.rb', line 56

# Writes a "/" through output_string unless HTML output mode is on.
def output_slash_if_xml
  output_string('/') unless @html_output
end

#output_string(internal_str, external_str = @ic.iconv(internal_str)) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/htree/encoder.rb', line 81

# Appends +external_str+ to the output buffer and returns nil.
#
# internal_str:: the text in the internal encoding.
# external_str:: the same text already converted for output; defaults to
#                converting +internal_str+ with the main converter.
#
# Each sub-charset converter is fed +internal_str+ too. A converter is
# dropped from @subcharset_ic when its result differs from +external_str+
# or when it raises Iconv::Failure.
def output_string(internal_str, external_str=@ic.iconv(internal_str))
  if @buf.empty? && @buf.respond_to?(:force_encoding) # xxx: should be fixed Ruby itself
    @buf.force_encoding(external_str.encoding)
  end
  @buf << external_str
  @subcharset_ic.delete_if do |_name, converter|
    begin
      converter.iconv(internal_str) != external_str
    rescue Iconv::Failure
      true
    end
  end
  nil
end

#output_text(string) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/htree/encoder.rb', line 94

# Converts +string+ with the main converter and writes it, replacing
# anything the converter rejects instead of failing.
#
# On Iconv::IllegalSequence or Iconv::InvalidCharacter the part that was
# converted successfully is written first. The offending character is then
# written as a numeric character reference ("&#N;"), or as "?" when no
# character or code point can be obtained, and the conversion is retried on
# the remainder of the string.
def output_text(string)
  begin
    output_string string, @ic.iconv(string)
  rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => e
    # Write the prefix that did convert: the internal text up to the failure
    # point, paired with its already-converted form.
    success = e.success
    output_string string[0, string.length - e.failed.length], success
    unless /\A./m =~ e.failed
      # No leading character could be matched: skip one element of the
      # failed part, emit a placeholder and try again with what is left.
      # xxx: should be configurable?
      #raise ArgumentError, "cannot extract first character: #{e.failed.dump}"
      string = e.failed[1, e.failed.length-1]
      output_string '?'
      retry
    end
    # $& / $' come from the match just above: the first character of the
    # failed part and everything after it.
    char = $&
    rest = $'
    begin
      # Obtain the code point by converting the single character to UTF-8,
      # then build a numeric character reference from it.
      ucode = Iconv.conv("UTF-8", @internal_encoding, char).unpack("U")[0]
      char = "&##{ucode};"
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter
      # The character is not valid in the internal encoding either.
      # xxx: should be configurable?
      char = '?'
    end
    output_string char
    # Re-run the outer begin block on the text after the replaced character.
    string = rest
    retry
  end
end