Module: FeedParserUtilities

Included in:
FeedParser
Defined in:
lib/rfeedparser/scrub.rb,
lib/rfeedparser/aliases.rb,
lib/rfeedparser/utilities.rb,
lib/rfeedparser/markup_helpers.rb,
lib/rfeedparser/encoding_helpers.rb

Defined Under Namespace

Classes: SanitizerDoc

Constant Summary collapse

# Maps alternative encoding names to the canonical name used for conversion.
# Adapted from python2.4's encodings/aliases.py.
#
# Keys are lower-case and use underscores as separators (the one mixed-case
# key is kept only for backward compatibility); values are the canonical
# spellings.
Encoding_Aliases = {
  'unicode'            => 'utf-16',

  # MacOSX does not have Unicode as a separate encoding nor even
  # aliased. My Ubuntu box has it as a separate encoding but I cannot
  # for the life of me figure out where the source code for UNICODE.so
  # is (supposedly, in libc6 .deb but that's a damn lie), so I don't
  # know what it expects. After some extensive research, I've decided
  # to alias it to utf-16 much like Python does when it is built with
  # --enable-unicode=ucs2. This could be seriously wrong. I have no idea.

  # ascii codec
  '646'                => 'ascii',
  'ansi_x3.4_1968'     => 'ascii',
  'ansi_x3_4_1968'     => 'ascii', # some email headers use this non-standard name
  'ansi_x3.4_1986'     => 'ascii',
  'cp367'              => 'ascii',
  'csascii'            => 'ascii',
  'ibm367'             => 'ascii',
  'iso646_us'          => 'ascii',
  'iso_646.irv_1991'   => 'ascii',
  'iso_ir_6'           => 'ascii',
  'us'                 => 'ascii',
  'us_ascii'           => 'ascii',

  # big5 codec
  'big5_tw'            => 'big5',
  'csbig5'             => 'big5',

  # big5hkscs codec
  'big5_hkscs'         => 'big5hkscs',
  'hkscs'              => 'big5hkscs',

  # cp037 codec
  '037'                => 'cp037',
  'csibm037'           => 'cp037',
  'ebcdic_cp_ca'       => 'cp037',
  'ebcdic_cp_nl'       => 'cp037',
  'ebcdic_cp_us'       => 'cp037',
  'ebcdic_cp_wt'       => 'cp037',
  'ibm037'             => 'cp037',
  'ibm039'             => 'cp037',

  # cp1026 codec
  '1026'               => 'cp1026',
  'csibm1026'          => 'cp1026',
  'ibm1026'            => 'cp1026',

  # cp1140 codec
  '1140'               => 'cp1140',
  'ibm1140'            => 'cp1140',

  # cp1250 codec
  '1250'               => 'cp1250',
  'windows_1250'       => 'cp1250',

  # cp1251 codec
  '1251'               => 'cp1251',
  'windows_1251'       => 'cp1251',

  # cp1252 codec
  '1252'               => 'cp1252',
  'windows_1252'       => 'cp1252',

  # cp1253 codec
  '1253'               => 'cp1253',
  'windows_1253'       => 'cp1253',

  # cp1254 codec
  '1254'               => 'cp1254',
  'windows_1254'       => 'cp1254',

  # cp1255 codec
  '1255'               => 'cp1255',
  'windows_1255'       => 'cp1255',

  # cp1256 codec
  '1256'               => 'cp1256',
  'windows_1256'       => 'cp1256',

  # cp1257 codec
  '1257'               => 'cp1257',
  'windows_1257'       => 'cp1257',

  # cp1258 codec
  '1258'               => 'cp1258',
  'windows_1258'       => 'cp1258',

  # cp424 codec
  '424'                => 'cp424',
  'csibm424'           => 'cp424',
  'ebcdic_cp_he'       => 'cp424',
  'ibm424'             => 'cp424',

  # cp437 codec
  '437'                => 'cp437',
  'cspc8codepage437'   => 'cp437',
  'ibm437'             => 'cp437',

  # cp500 codec
  '500'                => 'cp500',
  'csibm500'           => 'cp500',
  'ebcdic_cp_be'       => 'cp500',
  'ebcdic_cp_ch'       => 'cp500',
  'ibm500'             => 'cp500',

  # cp775 codec
  '775'                => 'cp775',
  'cspc775baltic'      => 'cp775',
  'ibm775'             => 'cp775',

  # cp850 codec
  '850'                => 'cp850',
  'cspc850multilingual' => 'cp850',
  'ibm850'             => 'cp850',

  # cp852 codec
  '852'                => 'cp852',
  'cspcp852'           => 'cp852',
  'ibm852'             => 'cp852',

  # cp855 codec
  '855'                => 'cp855',
  'csibm855'           => 'cp855',
  'ibm855'             => 'cp855',

  # cp857 codec
  '857'                => 'cp857',
  'csibm857'           => 'cp857',
  'ibm857'             => 'cp857',

  # cp860 codec
  '860'                => 'cp860',
  'csibm860'           => 'cp860',
  'ibm860'             => 'cp860',

  # cp861 codec
  '861'                => 'cp861',
  'cp_is'              => 'cp861',
  'csibm861'           => 'cp861',
  'ibm861'             => 'cp861',

  # cp862 codec
  '862'                => 'cp862',
  'cspc862latinhebrew' => 'cp862',
  'ibm862'             => 'cp862',

  # cp863 codec
  '863'                => 'cp863',
  'csibm863'           => 'cp863',
  'ibm863'             => 'cp863',

  # cp864 codec
  '864'                => 'cp864',
  'csibm864'           => 'cp864',
  'ibm864'             => 'cp864',

  # cp865 codec
  '865'                => 'cp865',
  'csibm865'           => 'cp865',
  'ibm865'             => 'cp865',

  # cp866 codec
  '866'                => 'cp866',
  'csibm866'           => 'cp866',
  'ibm866'             => 'cp866',

  # cp869 codec
  '869'                => 'cp869',
  'cp_gr'              => 'cp869',
  'csibm869'           => 'cp869',
  'ibm869'             => 'cp869',

  # cp932 codec
  '932'                => 'cp932',
  'ms932'              => 'cp932',
  'mskanji'            => 'cp932',
  'ms_kanji'           => 'cp932',

  # cp949 codec
  '949'                => 'cp949',
  'ms949'              => 'cp949',
  'uhc'                => 'cp949',

  # cp950 codec
  '950'                => 'cp950',
  'ms950'              => 'cp950',

  # euc_jp codec
  'euc_jp'             => 'euc-jp',
  'eucjp'              => 'euc-jp',
  'ujis'               => 'euc-jp',
  'u_jis'              => 'euc-jp',

  # euc_kr codec
  'euc_kr'             => 'euc-kr',
  'euckr'              => 'euc-kr',
  'korean'             => 'euc-kr',
  'ksc5601'            => 'euc-kr',
  'ks_c_5601'          => 'euc-kr',
  'ks_c_5601_1987'     => 'euc-kr',
  'ksx1001'            => 'euc-kr',
  'ks_x_1001'          => 'euc-kr',

  # gb18030 codec
  'gb18030_2000'       => 'gb18030',

  # gb2312 codec
  'chinese'            => 'gb2312',
  'csiso58gb231280'    => 'gb2312',
  'euc_cn'             => 'gb2312',
  'euccn'              => 'gb2312',
  'eucgb2312_cn'       => 'gb2312',
  'gb2312_1980'        => 'gb2312',
  'gb2312_80'          => 'gb2312',
  'iso_ir_58'          => 'gb2312',

  # gbk codec
  '936'                => 'gbk',
  'cp936'              => 'gbk',
  'ms936'              => 'gbk',

  # hp-roman8 codec
  'hp_roman8'          => 'hp-roman8',
  'roman8'             => 'hp-roman8',
  'r8'                 => 'hp-roman8',
  'cshproman8'         => 'hp-roman8',
  'csHPRoman8'         => 'hp-roman8', # original mixed-case key, kept for backward compatibility

  # iso2022_jp codec
  'iso2022_jp'         => 'iso-2022-jp',
  'csiso2022jp'        => 'iso-2022-jp',
  'iso2022jp'          => 'iso-2022-jp',
  'iso_2022_jp'        => 'iso-2022-jp',

  # iso2022_jp_1 codec
  'iso2022_jp_1'       => 'iso-2022-jp-1',
  'iso2022jp_1'        => 'iso-2022-jp-1',
  'iso_2022_jp_1'      => 'iso-2022-jp-1',

  # iso2022_jp_2 codec
  'iso2022_jp_2'       => 'iso-2022-jp-2',
  'iso2022jp_2'        => 'iso-2022-jp-2',
  'iso_2022_jp_2'      => 'iso-2022-jp-2',

  # iso2022_jp_3 codec
  'iso2022_jp_3'       => 'iso-2022-jp-3',
  'iso2022jp_3'        => 'iso-2022-jp-3',
  'iso_2022_jp_3'      => 'iso-2022-jp-3',

  # iso2022_kr codec
  'iso2022_kr'         => 'iso-2022-kr',
  'csiso2022kr'        => 'iso-2022-kr',
  'iso2022kr'          => 'iso-2022-kr',
  'iso_2022_kr'        => 'iso-2022-kr',

  # iso8859_10 codec
  'iso8859_10'         => 'iso-8859-10',
  'csisolatin6'        => 'iso-8859-10',
  'iso_8859_10'        => 'iso-8859-10',
  'iso_8859_10_1992'   => 'iso-8859-10',
  'iso_ir_157'         => 'iso-8859-10',
  'l6'                 => 'iso-8859-10',
  'latin6'             => 'iso-8859-10',

  # iso8859_13 codec
  'iso8859_13'         => 'iso-8859-13',
  'iso_8859_13'        => 'iso-8859-13',

  # iso8859_14 codec
  'iso8859_14'         => 'iso-8859-14',
  'iso_8859_14'        => 'iso-8859-14',
  'iso_8859_14_1998'   => 'iso-8859-14',
  'iso_celtic'         => 'iso-8859-14',
  'iso_ir_199'         => 'iso-8859-14',
  'l8'                 => 'iso-8859-14',
  'latin8'             => 'iso-8859-14',

  # iso8859_15 codec
  'iso8859_15'         => 'iso-8859-15',
  'iso_8859_15'        => 'iso-8859-15',

  # iso8859_1 codec
  'latin_1'            => 'iso-8859-1',
  'cp819'              => 'iso-8859-1',
  'csisolatin1'        => 'iso-8859-1',
  'ibm819'             => 'iso-8859-1',
  'iso8859'            => 'iso-8859-1',
  'iso_8859_1'         => 'iso-8859-1',
  'iso_8859_1_1987'    => 'iso-8859-1',
  'iso_ir_100'         => 'iso-8859-1',
  'l1'                 => 'iso-8859-1',
  'latin'              => 'iso-8859-1',
  'latin1'             => 'iso-8859-1',

  # iso8859_2 codec
  'iso8859_2'          => 'iso-8859-2',
  'csisolatin2'        => 'iso-8859-2',
  'iso_8859_2'         => 'iso-8859-2',
  'iso_8859_2_1987'    => 'iso-8859-2',
  'iso_ir_101'         => 'iso-8859-2',
  'l2'                 => 'iso-8859-2',
  'latin2'             => 'iso-8859-2',

  # iso8859_3 codec
  'iso8859_3'          => 'iso-8859-3',
  'csisolatin3'        => 'iso-8859-3',
  'iso_8859_3'         => 'iso-8859-3',
  'iso_8859_3_1988'    => 'iso-8859-3',
  'iso_ir_109'         => 'iso-8859-3',
  'l3'                 => 'iso-8859-3',
  'latin3'             => 'iso-8859-3',

  # iso8859_4 codec
  'iso8859_4'          => 'iso-8859-4',
  'csisolatin4'        => 'iso-8859-4',
  'iso_8859_4'         => 'iso-8859-4',
  'iso_8859_4_1988'    => 'iso-8859-4',
  'iso_ir_110'         => 'iso-8859-4',
  'l4'                 => 'iso-8859-4',
  'latin4'             => 'iso-8859-4',

  # iso8859_5 codec
  'iso8859_5'          => 'iso-8859-5',
  'csisolatincyrillic' => 'iso-8859-5',
  'cyrillic'           => 'iso-8859-5',
  'iso_8859_5'         => 'iso-8859-5',
  'iso_8859_5_1988'    => 'iso-8859-5',
  'iso_ir_144'         => 'iso-8859-5',

  # iso8859_6 codec
  'iso8859_6'          => 'iso-8859-6',
  'arabic'             => 'iso-8859-6',
  'asmo_708'           => 'iso-8859-6',
  'csisolatinarabic'   => 'iso-8859-6',
  'ecma_114'           => 'iso-8859-6',
  'iso_8859_6'         => 'iso-8859-6',
  'iso_8859_6_1987'    => 'iso-8859-6',
  'iso_ir_127'         => 'iso-8859-6',

  # iso8859_7 codec
  'iso8859_7'          => 'iso-8859-7',
  'csisolatingreek'    => 'iso-8859-7',
  'ecma_118'           => 'iso-8859-7',
  'elot_928'           => 'iso-8859-7',
  'greek'              => 'iso-8859-7',
  'greek8'             => 'iso-8859-7',
  'iso_8859_7'         => 'iso-8859-7',
  'iso_8859_7_1987'    => 'iso-8859-7',
  'iso_ir_126'         => 'iso-8859-7',

  # iso8859_8 codec
  'iso8859_8'          => 'iso-8859-8',
  'csisolatinhebrew'   => 'iso-8859-8',
  'hebrew'             => 'iso-8859-8',
  'iso_8859_8'         => 'iso-8859-8',
  'iso_8859_8_1988'    => 'iso-8859-8',
  'iso_ir_138'         => 'iso-8859-8',

  # iso8859_9 codec
  'iso8859_9'          => 'iso-8859-9',
  'csisolatin5'        => 'iso-8859-9',
  'iso_8859_9'         => 'iso-8859-9',
  'iso_8859_9_1989'    => 'iso-8859-9',
  'iso_ir_148'         => 'iso-8859-9',
  'l5'                 => 'iso-8859-9',
  'latin5'             => 'iso-8859-9',

  # iso8859_11 codec
  'iso8859_11'         => 'iso-8859-11',
  'thai'               => 'iso-8859-11',
  'iso_8859_11'        => 'iso-8859-11',
  'iso_8859_11_2001'   => 'iso-8859-11',

  # iso8859_16 codec
  'iso8859_16'         => 'iso-8859-16',
  'iso_8859_16'        => 'iso-8859-16',
  'iso_8859_16_2001'   => 'iso-8859-16',
  'iso_ir_226'         => 'iso-8859-16',
  'l10'                => 'iso-8859-16',
  'latin10'            => 'iso-8859-16',

  # cskoi8r codec
  'koi8_r'             => 'cskoi8r',

  # mac_cyrillic codec
  'mac_cyrillic'       => 'maccyrillic',

  # shift_jis codec
  'csshiftjis'         => 'shift_jis',
  'shiftjis'           => 'shift_jis',
  'sjis'               => 'shift_jis',
  's_jis'              => 'shift_jis',

  # shift_jisx0213 codec
  'shiftjisx0213'      => 'shift_jisx0213',
  'sjisx0213'          => 'shift_jisx0213',
  's_jisx0213'         => 'shift_jisx0213',

  # utf_16 codec
  'utf_16'             => 'utf-16',
  'u16'                => 'utf-16',
  'utf16'              => 'utf-16',

  # utf_16_be codec
  'utf_16_be'          => 'utf-16be',
  'unicodebigunmarked' => 'utf-16be',
  'utf_16be'           => 'utf-16be',

  # utf_16_le codec
  'utf_16_le'          => 'utf-16le',
  'unicodelittleunmarked' => 'utf-16le',
  'utf_16le'           => 'utf-16le',

  # utf_7 codec
  'utf_7'              => 'utf-7',
  'u7'                 => 'utf-7',
  'utf7'               => 'utf-7',

  # utf_8 codec
  'utf_8'              => 'utf-8',
  'u8'                 => 'utf-8',
  'utf'                => 'utf-8',
  'utf8'               => 'utf-8',
  'utf8_ucs2'          => 'utf-8',
  'utf8_ucs4'          => 'utf-8',
}

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.extract_tuple(atime) ⇒ Object



15
16
17
# File 'lib/rfeedparser/utilities.rb', line 15

# Convenience wrapper: forwards +atime+ to
# FeedParser::FeedTimeParser.extract_tuple and returns whatever it returns.
def extract_tuple(atime)
  time_parser = FeedParser::FeedTimeParser
  time_parser.extract_tuple(atime)
end

.parse_date(date_string) ⇒ Object



10
11
12
# File 'lib/rfeedparser/utilities.rb', line 10

# Convenience wrapper: forwards +date_string+ to
# FeedParser::FeedTimeParser.parse_date and returns whatever it returns.
def parse_date(date_string)
  time_parser = FeedParser::FeedTimeParser
  time_parser.parse_date(date_string)
end

.SanitizerDoc(html) ⇒ Object



200
201
202
# File 'lib/rfeedparser/scrub.rb', line 200

# Factory shortcut: builds a document from +html+ with Hpricot.make and wraps
# it in a new SanitizerDoc instance.
def SanitizerDoc(html)
  parsed = Hpricot.make(html)
  SanitizerDoc.new(parsed)
end

Instance Method Details

#_ebcdic_to_ascii(s) ⇒ Object



26
27
28
# File 'lib/rfeedparser/encoding_helpers.rb', line 26

# Converts +s+ from "ebcdic-cp-be" to "iso88591" through Iconv and returns the
# first element of the result Iconv hands back.
def _ebcdic_to_ascii(s)
  converted = Iconv.iconv("iso88591", "ebcdic-cp-be", s)
  converted[0]
end

#getCharacterEncoding(http_headers, xml_data) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/rfeedparser/encoding_helpers.rb', line 30

# Get the character encoding of the XML document.
#
# http_headers - Hash-like object keyed by header name ('content-type' is the
#                only key read); nil is treated as "no headers".
# xml_data     - the raw document as a String.
#
# Returns a five-element Array:
#   [true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding,
#    acceptable_content_type]
# where http_encoding is the charset parameter of the Content-Type header,
# xml_encoding is the (down-cased) encoding named in the XML declaration,
# sniffed_xml_encoding is what the leading bytes suggested, and
# acceptable_content_type is true only for XML media types.
def getCharacterEncoding(http_headers, xml_data)
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil

  # A nil header collection means "no HTTP information"; the final fallback
  # branch below already expects that case, so do not blow up here.
  headers = http_headers || {}
  http_content_type, charset = headers['content-type'].to_s.split(';', 2)
  # "application/xml ; charset=x" must still be recognised as application/xml.
  http_content_type = http_content_type.strip if http_content_type

  encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
  http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]

  http_encoding = nil if http_encoding && http_encoding.empty?
  # FIXME Open-Uri returns iso8859-1 if there is no charset header,
  # but that doesn't pass the tests. Open-Uri claims it is following
  # the right RFC. Are they wrong or do we need to change the tests?

  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration.  This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    head = xml_data[0..3]
    if head == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif head == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif head == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif head == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif head == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif head == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32be', 'utf-8')
    elsif head == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    # Best effort: any failure while sniffing/converting simply means no
    # declaration could be read.
    xml_encoding_match = nil
  end

  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # These declared names say nothing about byte order, so the sniffed
    # (byte-order specific) encoding wins when there is one.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding && xencodings.include?(xml_encoding)
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif !headers.empty? && !headers.has_key?('content-type')
    # Headers were supplied but none of them is a Content-Type.
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  [true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type]
end

#index_match(stri, regexp, offset) ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/rfeedparser/encoding_helpers.rb', line 17

# Finds the first occurrence of +regexp+ in +stri+ at or after +offset+.
#
# Returns a two-element Array: the index of the occurrence and the result of
# matching +regexp+ against the substring that starts at that index, or
# [nil, nil] when nothing is found.
def index_match(stri, regexp, offset)
  position = stri.index(regexp, offset)
  if position
    [position, stri[position..-1].match(regexp)]
  else
    [nil, nil]
  end
end

#py2rtime(pytuple) ⇒ Object



20
21
22
# File 'lib/rfeedparser/utilities.rb', line 20

# Builds a UTC Time from the first six elements of +pytuple+ (passed
# positionally to Time.utc). Returns nil when +pytuple+ is nil or empty.
def py2rtime(pytuple)
  return nil if pytuple.nil? || pytuple.empty?

  leading_fields = pytuple[0..5]
  Time.utc(*leading_fields)
end

#resolveRelativeURIs(htmlSource, baseURI, encoding) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/rfeedparser/markup_helpers.rb', line 28

# Rewrites relative URIs found in URI-carrying attributes of +htmlSource+ so
# they are joined onto +baseURI+ (via urljoin), and returns the document
# serialised with to_html.
#
# htmlSource - markup handed to Hpricot().
# baseURI    - base passed as the first argument to urljoin.
# encoding   - accepted but not used by this method.
def resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger

  # [element name, attribute name] pairs whose values are URIs.
  uri_attributes = [
    %w[a href],
    %w[applet codebase],
    %w[area href],
    %w[blockquote cite],
    %w[body background],
    %w[del cite],
    %w[form action],
    %w[frame longdesc],
    %w[frame src],
    %w[iframe longdesc],
    %w[iframe src],
    %w[head profile],
    %w[img longdesc],
    %w[img src],
    %w[img usemap],
    %w[input src],
    %w[input usemap],
    %w[ins cite],
    %w[link href],
    %w[object classid],
    %w[object codebase],
    %w[object data],
    %w[object usemap],
    %w[q cite],
    %w[script src]
  ]

  document = Hpricot(htmlSource)
  uri_attributes.each do |tag_name, attr_name|
    document.search(tag_name).each do |node|
      raw_value = node.attributes[attr_name]
      # Nothing to resolve when the attribute is absent or blank.
      next unless raw_value && !raw_value.empty?

      # Values that cannot be encoded/parsed are left untouched.
      parsed = begin
        Addressable::URI.parse(Addressable::URI.encode(raw_value))
      rescue StandardError
        nil
      end
      next unless parsed && parsed.relative?

      node.raw_attributes[attr_name] = urljoin(baseURI, raw_value)
    end
  end
  document.to_html
end

#sanitizeHTML(html, encoding) ⇒ Object



205
206
207
208
209
210
211
# File 'lib/rfeedparser/scrub.rb', line 205

# Escapes stray "<!" openers, runs the markup through SanitizerDoc#scrub and
# returns the scrubbed result with surrounding whitespace stripped.
#
# html     - markup to sanitize.
# encoding - accepted but not used by this method.
def sanitizeHTML(html,encoding)
  # FIXME Tidy not yet supported
  # "<!" is escaped unless it opens a DOCTYPE, a comment ("<!--") or a
  # "<![" section.
  escaped = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
  SanitizerDoc(escaped).scrub.strip
end

#stripDoctype(data) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rfeedparser/markup_helpers.rb', line 3

# Strips the DOCTYPE (and every <!ENTITY ...> declaration) from an XML
# document.
#
# Returns [rss_version, stripped_data]: rss_version is 'rss091n' when the
# first DOCTYPE mentions "netscape" (case-insensitively) and nil otherwise;
# stripped_data is the document without entity declarations and without its
# first DOCTYPE.
def stripDoctype(data)
  # /m lets "." span newlines, so multi-line declarations are caught too.
  without_entities = data.gsub(/<!ENTITY(.*?)>/m, '')

  doctype_pattern = /<!DOCTYPE(.*?)>/m
  first_doctype = without_entities.scan(doctype_pattern).first
  doctype = first_doctype ? first_doctype[0] : ''

  version = doctype.downcase.include?('netscape') ? 'rss091n' : nil
  [version, without_entities.sub(doctype_pattern, '')]
end

#toUTF8(data, encoding) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/rfeedparser/encoding_helpers.rb', line 126

# Converts +data+ to UTF-8 and makes sure it starts with a UTF-8 XML
# declaration.
#
# data     - the document as a String.
# encoding - the encoding to try; overridden when +data+ starts with a
#            byte-order mark, which is also removed.
#
# Returns the converted document. An existing XML declaration at the very
# start is replaced; otherwise a declaration line is prepended. Any error
# raised by uconvert propagates to the caller.
def toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  # NOTE(review): the slices below assume byte-wise String indexing; confirm
  # behaviour on runtimes where String#[] indexes characters.
  bom_encoding = nil
  bom_length = 0
  if data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00"
    bom_encoding = 'utf-16be'
    bom_length = 2
  elsif data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00"
    bom_encoding = 'utf-16le'
    bom_length = 2
  elsif data[0..2] == "\xef\xbb\xbf"
    bom_encoding = 'utf-8'
    bom_length = 3
  elsif data[0..3] == "\x00\x00\xfe\xff"
    bom_encoding = 'utf-32be'
    bom_length = 4
  elsif data[0..3] == "\xff\xfe\x00\x00"
    bom_encoding = 'utf-32le'
    bom_length = 4
  end

  if bom_encoding
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying #{bom_encoding} instead\n" if encoding != bom_encoding
    end
    # The BOM is authoritative: it overrides whatever the caller suggested.
    encoding = bom_encoding
    data = data[bom_length..-1]
  end

  newdata = uconvert(data, encoding, 'utf-8')
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

  # \A (not ^) so only a declaration at the very start of the document is
  # replaced; ^ would also hit a later "<?xml-stylesheet ...?>" line.
  declmatch = /\A<\?xml[^>]*?>/
  newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
  if declmatch =~ newdata
    newdata.sub(declmatch, newdecl)
  else
    newdecl + "\n" + newdata
  end
end

#uconvert(data, from_encoding, to_encoding = 'utf-8') ⇒ Object



11
12
13
14
15
# File 'lib/rfeedparser/encoding_helpers.rb', line 11

# Converts +data+ from +from_encoding+ to +to_encoding+ with Iconv.
#
# Both encoding names are first resolved through Encoding_Aliases. The name is
# looked up as given and, failing that, in normalised form (stripped,
# lower-cased, runs of characters other than letters, digits and "." turned
# into "_"), so spellings such as "Latin-1" find the table's "latin_1" entry.
# Names the table does not know are handed to Iconv unchanged.
#
# Returns the first element of the Array Iconv.iconv returns.
def uconvert(data, from_encoding, to_encoding = 'utf-8')
  resolve = lambda do |name|
    normalized = name.to_s.strip.downcase.gsub(/[^a-z0-9.]+/, '_')
    Encoding_Aliases[name] || Encoding_Aliases[normalized] || name
  end
  Iconv.iconv(resolve.call(to_encoding), resolve.call(from_encoding), data)[0]
end

#unicode(data, from_encoding) ⇒ Object



5
6
7
8
9
# File 'lib/rfeedparser/encoding_helpers.rb', line 5

# Takes a single string and converts it from the encoding named by
# +from_encoding+ to 'unicode' by delegating to uconvert.
def unicode(data, from_encoding)
  target_encoding = 'unicode'
  uconvert(data, from_encoding, target_encoding)
end