Module: FeedParserUtilities

Included in: FeedParser

Defined in: lib/rfeedparser/scrub.rb,
lib/rfeedparser/aliases.rb,
lib/rfeedparser/utilities.rb,
lib/rfeedparser/markup_helpers.rb,
lib/rfeedparser/encoding_helpers.rb

Defined Under Namespace

Constant Summary collapse

Encoding_Aliases = Adapted from python2.4’s encodings/aliases.py

{
  # Alias table adapted from Python 2.4's encodings/aliases.py. Keys are
  # alias spellings; values are the encoding names handed to the converter.

  # Mac OS X has no separate "unicode" encoding, nor an alias for one.
  # Ubuntu ships one, but what byte layout it expects could not be
  # determined. It is therefore aliased to utf-16, much like Python does
  # when it is built with --enable-unicode=ucs2. This could be wrong.
  'unicode'            => 'utf-16',

  # ascii codec
  '646'                => 'ascii',
  'ansi_x3.4_1968'     => 'ascii',
  'ansi_x3_4_1968'     => 'ascii', # some email headers use this non-standard name
  'ansi_x3.4_1986'     => 'ascii',
  'cp367'              => 'ascii',
  'csascii'            => 'ascii',
  'ibm367'             => 'ascii',
  'iso646_us'          => 'ascii',
  'iso_646.irv_1991'   => 'ascii',
  'iso_ir_6'           => 'ascii',
  'us'                 => 'ascii',
  'us_ascii'           => 'ascii',

  # big5 codec
  'big5_tw'            => 'big5',
  'csbig5'             => 'big5',

  # big5hkscs codec
  'big5_hkscs'         => 'big5hkscs',
  'hkscs'              => 'big5hkscs',

  # cp037 codec
  '037'                => 'cp037',
  'csibm037'           => 'cp037',
  'ebcdic_cp_ca'       => 'cp037',
  'ebcdic_cp_nl'       => 'cp037',
  'ebcdic_cp_us'       => 'cp037',
  'ebcdic_cp_wt'       => 'cp037',
  'ibm037'             => 'cp037',
  'ibm039'             => 'cp037',

  # cp1026 codec
  '1026'               => 'cp1026',
  'csibm1026'          => 'cp1026',
  'ibm1026'            => 'cp1026',

  # cp1140 codec
  '1140'               => 'cp1140',
  'ibm1140'            => 'cp1140',

  # cp1250 codec
  '1250'               => 'cp1250',
  'windows_1250'       => 'cp1250',

  # cp1251 codec
  '1251'               => 'cp1251',
  'windows_1251'       => 'cp1251',

  # cp1252 codec
  '1252'               => 'cp1252',
  'windows_1252'       => 'cp1252',

  # cp1253 codec
  '1253'               => 'cp1253',
  'windows_1253'       => 'cp1253',

  # cp1254 codec
  '1254'               => 'cp1254',
  'windows_1254'       => 'cp1254',

  # cp1255 codec
  '1255'               => 'cp1255',
  'windows_1255'       => 'cp1255',

  # cp1256 codec
  '1256'               => 'cp1256',
  'windows_1256'       => 'cp1256',

  # cp1257 codec
  '1257'               => 'cp1257',
  'windows_1257'       => 'cp1257',

  # cp1258 codec
  '1258'               => 'cp1258',
  'windows_1258'       => 'cp1258',

  # cp424 codec
  '424'                => 'cp424',
  'csibm424'           => 'cp424',
  'ebcdic_cp_he'       => 'cp424',
  'ibm424'             => 'cp424',

  # cp437 codec
  '437'                => 'cp437',
  'cspc8codepage437'   => 'cp437',
  'ibm437'             => 'cp437',

  # cp500 codec
  '500'                => 'cp500',
  'csibm500'           => 'cp500',
  'ebcdic_cp_be'       => 'cp500',
  'ebcdic_cp_ch'       => 'cp500',
  'ibm500'             => 'cp500',

  # cp775 codec
  '775'                => 'cp775',
  'cspc775baltic'      => 'cp775',
  'ibm775'             => 'cp775',

  # cp850 codec
  '850'                => 'cp850',
  'cspc850multilingual' => 'cp850',
  'ibm850'             => 'cp850',

  # cp852 codec
  '852'                => 'cp852',
  'cspcp852'           => 'cp852',
  'ibm852'             => 'cp852',

  # cp855 codec
  '855'                => 'cp855',
  'csibm855'           => 'cp855',
  'ibm855'             => 'cp855',

  # cp857 codec
  '857'                => 'cp857',
  'csibm857'           => 'cp857',
  'ibm857'             => 'cp857',

  # cp860 codec
  '860'                => 'cp860',
  'csibm860'           => 'cp860',
  'ibm860'             => 'cp860',

  # cp861 codec
  '861'                => 'cp861',
  'cp_is'              => 'cp861',
  'csibm861'           => 'cp861',
  'ibm861'             => 'cp861',

  # cp862 codec
  '862'                => 'cp862',
  'cspc862latinhebrew' => 'cp862',
  'ibm862'             => 'cp862',

  # cp863 codec
  '863'                => 'cp863',
  'csibm863'           => 'cp863',
  'ibm863'             => 'cp863',

  # cp864 codec
  '864'                => 'cp864',
  'csibm864'           => 'cp864',
  'ibm864'             => 'cp864',

  # cp865 codec
  '865'                => 'cp865',
  'csibm865'           => 'cp865',
  'ibm865'             => 'cp865',

  # cp866 codec
  '866'                => 'cp866',
  'csibm866'           => 'cp866',
  'ibm866'             => 'cp866',

  # cp869 codec
  '869'                => 'cp869',
  'cp_gr'              => 'cp869',
  'csibm869'           => 'cp869',
  'ibm869'             => 'cp869',

  # cp932 codec
  '932'                => 'cp932',
  'ms932'              => 'cp932',
  'mskanji'            => 'cp932',
  'ms_kanji'           => 'cp932',

  # cp949 codec
  '949'                => 'cp949',
  'ms949'              => 'cp949',
  'uhc'                => 'cp949',

  # cp950 codec
  '950'                => 'cp950',
  'ms950'              => 'cp950',

  # euc_jp codec
  'euc_jp'             => 'euc-jp',
  'eucjp'              => 'euc-jp',
  'ujis'               => 'euc-jp',
  'u_jis'              => 'euc-jp',

  # euc_kr codec
  'euc_kr'             => 'euc-kr',
  'euckr'              => 'euc-kr',
  'korean'             => 'euc-kr',
  'ksc5601'            => 'euc-kr',
  'ks_c_5601'          => 'euc-kr',
  'ks_c_5601_1987'     => 'euc-kr',
  'ksx1001'            => 'euc-kr',
  'ks_x_1001'          => 'euc-kr',

  # gb18030 codec
  'gb18030_2000'       => 'gb18030',

  # gb2312 codec
  'chinese'            => 'gb2312',
  'csiso58gb231280'    => 'gb2312',
  'euc_cn'             => 'gb2312',
  'euccn'              => 'gb2312',
  'eucgb2312_cn'       => 'gb2312',
  'gb2312_1980'        => 'gb2312',
  'gb2312_80'          => 'gb2312',
  'iso_ir_58'          => 'gb2312',

  # gbk codec
  '936'                => 'gbk',
  'cp936'              => 'gbk',
  'ms936'              => 'gbk',

  # hp-roman8 codec
  'hp_roman8'          => 'hp-roman8',
  'roman8'             => 'hp-roman8',
  'r8'                 => 'hp-roman8',
  'csHPRoman8'         => 'hp-roman8',

  # iso2022_jp codec
  'iso2022_jp'         => 'iso-2022-jp',
  'csiso2022jp'        => 'iso-2022-jp',
  'iso2022jp'          => 'iso-2022-jp',
  'iso_2022_jp'        => 'iso-2022-jp',

  # iso2022_jp_1 codec
  'iso2022_jp_1'       => 'iso-2022-jp-1',
  'iso2022jp_1'        => 'iso-2022-jp-1',
  'iso_2022_jp_1'      => 'iso-2022-jp-1',

  # iso2022_jp_2 codec
  'iso2022_jp_2'       => 'iso-2022-jp-2',
  'iso2022jp_2'        => 'iso-2022-jp-2',
  'iso_2022_jp_2'      => 'iso-2022-jp-2',

  # iso2022_jp_3 codec
  'iso2022_jp_3'       => 'iso-2022-jp-3',
  'iso2022jp_3'        => 'iso-2022-jp-3',
  'iso_2022_jp_3'      => 'iso-2022-jp-3',

  # iso2022_kr codec
  'iso2022_kr'         => 'iso-2022-kr',
  'csiso2022kr'        => 'iso-2022-kr',
  'iso2022kr'          => 'iso-2022-kr',
  'iso_2022_kr'        => 'iso-2022-kr',

  # iso8859_10 codec
  'iso8859_10'         => 'iso-8859-10',
  'csisolatin6'        => 'iso-8859-10',
  'iso_8859_10'        => 'iso-8859-10',
  'iso_8859_10_1992'   => 'iso-8859-10',
  'iso_ir_157'         => 'iso-8859-10',
  'l6'                 => 'iso-8859-10',
  'latin6'             => 'iso-8859-10',

  # iso8859_13 codec
  'iso8859_13'         => 'iso-8859-13',
  'iso_8859_13'        => 'iso-8859-13',

  # iso8859_14 codec
  'iso8859_14'         => 'iso-8859-14',
  'iso_8859_14'        => 'iso-8859-14',
  'iso_8859_14_1998'   => 'iso-8859-14',
  'iso_celtic'         => 'iso-8859-14',
  'iso_ir_199'         => 'iso-8859-14',
  'l8'                 => 'iso-8859-14',
  'latin8'             => 'iso-8859-14',

  # iso8859_15 codec
  'iso8859_15'         => 'iso-8859-15',
  'iso_8859_15'        => 'iso-8859-15',

  # iso8859_1 codec
  'latin_1'            => 'iso-8859-1',
  'cp819'              => 'iso-8859-1',
  'csisolatin1'        => 'iso-8859-1',
  'ibm819'             => 'iso-8859-1',
  'iso8859'            => 'iso-8859-1',
  'iso_8859_1'         => 'iso-8859-1',
  'iso_8859_1_1987'    => 'iso-8859-1',
  'iso_ir_100'         => 'iso-8859-1',
  'l1'                 => 'iso-8859-1',
  'latin'              => 'iso-8859-1',
  'latin1'             => 'iso-8859-1',

  # iso8859_2 codec
  'iso8859_2'          => 'iso-8859-2',
  'csisolatin2'        => 'iso-8859-2',
  'iso_8859_2'         => 'iso-8859-2',
  'iso_8859_2_1987'    => 'iso-8859-2',
  'iso_ir_101'         => 'iso-8859-2',
  'l2'                 => 'iso-8859-2',
  'latin2'             => 'iso-8859-2',

  # iso8859_3 codec
  'iso8859_3'          => 'iso-8859-3',
  'csisolatin3'        => 'iso-8859-3',
  'iso_8859_3'         => 'iso-8859-3',
  'iso_8859_3_1988'    => 'iso-8859-3',
  'iso_ir_109'         => 'iso-8859-3',
  'l3'                 => 'iso-8859-3',
  'latin3'             => 'iso-8859-3',

  # iso8859_4 codec
  'iso8859_4'          => 'iso-8859-4',
  'csisolatin4'        => 'iso-8859-4',
  'iso_8859_4'         => 'iso-8859-4',
  'iso_8859_4_1988'    => 'iso-8859-4',
  'iso_ir_110'         => 'iso-8859-4',
  'l4'                 => 'iso-8859-4',
  'latin4'             => 'iso-8859-4',

  # iso8859_5 codec
  'iso8859_5'          => 'iso-8859-5',
  'csisolatincyrillic' => 'iso-8859-5',
  'cyrillic'           => 'iso-8859-5',
  'iso_8859_5'         => 'iso-8859-5',
  'iso_8859_5_1988'    => 'iso-8859-5',
  'iso_ir_144'         => 'iso-8859-5',

  # iso8859_6 codec
  'iso8859_6'          => 'iso-8859-6',
  'arabic'             => 'iso-8859-6',
  'asmo_708'           => 'iso-8859-6',
  'csisolatinarabic'   => 'iso-8859-6',
  'ecma_114'           => 'iso-8859-6',
  'iso_8859_6'         => 'iso-8859-6',
  'iso_8859_6_1987'    => 'iso-8859-6',
  'iso_ir_127'         => 'iso-8859-6',

  # iso8859_7 codec
  'iso8859_7'          => 'iso-8859-7',
  'csisolatingreek'    => 'iso-8859-7',
  'ecma_118'           => 'iso-8859-7',
  'elot_928'           => 'iso-8859-7',
  'greek'              => 'iso-8859-7',
  'greek8'             => 'iso-8859-7',
  'iso_8859_7'         => 'iso-8859-7',
  'iso_8859_7_1987'    => 'iso-8859-7',
  'iso_ir_126'         => 'iso-8859-7',

  # iso8859_8 codec
  'iso8859_8'          => 'iso-8859-8',
  'csisolatinhebrew'   => 'iso-8859-8',
  'hebrew'             => 'iso-8859-8',
  'iso_8859_8'         => 'iso-8859-8',
  'iso_8859_8_1988'    => 'iso-8859-8',
  'iso_ir_138'         => 'iso-8859-8',

  # iso8859_9 codec
  'iso8859_9'          => 'iso-8859-9',
  'csisolatin5'        => 'iso-8859-9',
  'iso_8859_9'         => 'iso-8859-9',
  'iso_8859_9_1989'    => 'iso-8859-9',
  'iso_ir_148'         => 'iso-8859-9',
  'l5'                 => 'iso-8859-9',
  'latin5'             => 'iso-8859-9',

  # iso8859_11 codec
  'iso8859_11'         => 'iso-8859-11',
  'thai'               => 'iso-8859-11',
  'iso_8859_11'        => 'iso-8859-11',
  'iso_8859_11_2001'   => 'iso-8859-11',

  # iso8859_16 codec
  'iso8859_16'         => 'iso-8859-16',
  'iso_8859_16'        => 'iso-8859-16',
  'iso_8859_16_2001'   => 'iso-8859-16',
  'iso_ir_226'         => 'iso-8859-16',
  'l10'                => 'iso-8859-16',
  'latin10'            => 'iso-8859-16',

  # cskoi8r codec
  'koi8_r'             => 'cskoi8r',

  # mac_cyrillic codec
  'mac_cyrillic'       => 'maccyrillic',

  # shift_jis codec
  'csshiftjis'         => 'shift_jis',
  'shiftjis'           => 'shift_jis',
  'sjis'               => 'shift_jis',
  's_jis'              => 'shift_jis',

  # shift_jisx0213 codec
  'shiftjisx0213'      => 'shift_jisx0213',
  'sjisx0213'          => 'shift_jisx0213',
  's_jisx0213'         => 'shift_jisx0213',

  # utf_16 codec
  'utf_16'             => 'utf-16',
  'u16'                => 'utf-16',
  'utf16'              => 'utf-16',

  # utf_16_be codec
  'utf_16_be'          => 'utf-16be',
  'unicodebigunmarked' => 'utf-16be',
  'utf_16be'           => 'utf-16be',

  # utf_16_le codec
  'utf_16_le'          => 'utf-16le',
  'unicodelittleunmarked' => 'utf-16le',
  'utf_16le'           => 'utf-16le',

  # utf_7 codec
  'utf_7'              => 'utf-7',
  'u7'                 => 'utf-7',
  'utf7'               => 'utf-7',

  # utf_8 codec
  'utf_8'              => 'utf-8',
  'u8'                 => 'utf-8',
  'utf'                => 'utf-8',
  'utf8'               => 'utf-8',
  'utf8_ucs2'          => 'utf-8',
  'utf8_ucs4'          => 'utf-8',
}

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.extract_tuple(atime) ⇒ `Object`



15
16
17

# File 'lib/rfeedparser/utilities.rb', line 15

# Convenience wrapper: hands +atime+ to
# FeedParser::FeedTimeParser.extract_tuple and returns whatever it returns.
def extract_tuple(atime)
  time_parser = FeedParser::FeedTimeParser
  time_parser.extract_tuple(atime)
end

.parse_date(date_string) ⇒ `Object`



10
11
12

# File 'lib/rfeedparser/utilities.rb', line 10

# Convenience wrapper: hands +date_string+ to
# FeedParser::FeedTimeParser.parse_date and returns whatever it returns.
def parse_date(date_string)
  time_parser = FeedParser::FeedTimeParser
  time_parser.parse_date(date_string)
end

.SanitizerDoc(html) ⇒ `Object`



200
201
202

# File 'lib/rfeedparser/scrub.rb', line 200

# Factory helper: parses +html+ with Hpricot.make and wraps the parsed
# result in a new SanitizerDoc instance.
def SanitizerDoc(html)
  parsed_nodes = Hpricot.make(html)
  SanitizerDoc.new(parsed_nodes)
end

Instance Method Details

#_ebcdic_to_ascii(s) ⇒ `Object`



26
27
28

# File 'lib/rfeedparser/encoding_helpers.rb', line 26

# Converts the string +s+ from "ebcdic-cp-be" to "iso88591" via Iconv and
# returns the first element of Iconv's result.
def _ebcdic_to_ascii(s)
  converted = Iconv.iconv("iso88591", "ebcdic-cp-be", s)
  converted.first
end

#getCharacterEncoding(http_headers, xml_data) ⇒ `Object`

# File 'lib/rfeedparser/encoding_helpers.rb', line 30

# Works out the character encoding of an XML document from the HTTP
# Content-Type header, the document's leading bytes, and its XML
# declaration.
#
# http_headers - Hash-like object; only the 'content-type' key is read.
#                nil is treated as "no headers".
# xml_data     - String holding the raw document.
#
# Returns a five-element Array:
#   [true_encoding, http_encoding, xml_encoding,
#    sniffed_xml_encoding, acceptable_content_type]
# Any of the three middle entries may be nil. The re-encoded copy of
# xml_data made while sniffing is only used to read the XML declaration;
# it is not returned.
def getCharacterEncoding(http_headers, xml_data)
  $stderr << "In getCharacterEncoding\n" if $debug
  sniffed_xml_encoding = nil
  xml_encoding = nil
  true_encoding = nil

  # Treat a missing header collection like an empty one instead of raising.
  content_type_header = http_headers ? http_headers['content-type'] : nil
  http_content_type, charset = content_type_header.to_s.split(';', 2)

  encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
  http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]

  http_encoding = nil if http_encoding && http_encoding.empty?
  # FIXME Open-Uri returns iso8859-1 if there is no charset header,
  # but that doesn't pass the tests. Open-Uri claims its following
  # the right RFC. Are they wrong or do we need to change the tests?

  # Must sniff for non-ASCII-compatible character encodings before
  # searching for XML declaration.  This heuristic is defined in
  # section F of the XML specification:
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
  begin
    if xml_data[0..3] == "\x4c\x6f\xa7\x94"
      # EBCDIC
      xml_data = _ebcdic_to_ascii(xml_data)
    elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
      # UTF-16BE
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
      # UTF-16BE with BOM
      sniffed_xml_encoding = 'utf-16be'
      xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
      # UTF-16LE
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
    elsif xml_data.size >= 4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
      # UTF-16LE with BOM (the trailing-bytes check keeps this from
      # claiming a UTF-32LE BOM, which starts with the same two bytes)
      sniffed_xml_encoding = 'utf-16le'
      xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\x00\x3c"
      # UTF-32BE
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
    elsif xml_data[0..3] == "\x3c\x00\x00\x00"
      # UTF-32LE
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
    elsif xml_data[0..3] == "\x00\x00\xfe\xff"
      # UTF-32BE with BOM
      sniffed_xml_encoding = 'utf-32be'
      xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
    elsif xml_data[0..3] == "\xff\xfe\x00\x00"
      # UTF-32LE with BOM
      sniffed_xml_encoding = 'utf-32le'
      xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
    elsif xml_data[0..2] == "\xef\xbb\xbf"
      # UTF-8 with BOM
      sniffed_xml_encoding = 'utf-8'
      xml_data = xml_data[3..-1]
    else
      # ASCII-compatible
    end
    xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
  rescue
    # Best effort: if sniffing or conversion fails, carry on as though the
    # document had no readable XML declaration.
    xml_encoding_match = nil
  end
  if xml_encoding_match
    xml_encoding = xml_encoding_match[1].downcase
    # These declared names do not pin down byte order/width; when the
    # leading bytes told us more, prefer the sniffed encoding.
    xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
    if sniffed_xml_encoding && xencodings.include?(xml_encoding)
      xml_encoding = sniffed_xml_encoding
    end
  end

  acceptable_content_type = false
  application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
  text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

  if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || xml_encoding || 'utf-8'
  elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
    acceptable_content_type = true
    true_encoding = http_encoding || 'us-ascii'
  elsif /^text\// =~ http_content_type
    true_encoding = http_encoding || 'us-ascii'
  elsif http_headers && !http_headers.empty? && !http_headers.has_key?('content-type')
    true_encoding = xml_encoding || 'iso-8859-1'
  else
    true_encoding = xml_encoding || 'utf-8'
  end
  [true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type]
end

#index_match(stri, regexp, offset) ⇒ `Object`

# File 'lib/rfeedparser/encoding_helpers.rb', line 17

# Finds the first occurrence of +regexp+ in +stri+ at or after +offset+.
#
# Returns [position, match], where position is the index reported by
# String#index and match is the result of matching +regexp+ against the
# substring that starts at that position. Returns [nil, nil] when nothing
# is found.
def index_match(stri, regexp, offset)
  position = stri.index(regexp, offset)
  if position
    tail = stri[position..-1]
    [position, tail.match(regexp)]
  else
    [nil, nil]
  end
end

#py2rtime(pytuple) ⇒ `Object`



20
21
22

# File 'lib/rfeedparser/utilities.rb', line 20

# Builds a UTC Time from the first six elements of +pytuple+ (passed to
# Time.utc in order). Returns nil when +pytuple+ is nil or empty.
def py2rtime(pytuple)
  return nil if pytuple.nil? || pytuple.empty?

  leading_fields = pytuple[0..5]
  Time.utc(*leading_fields)
end

#resolveRelativeURIs(htmlSource, baseURI, encoding) ⇒ `Object`

# File 'lib/rfeedparser/markup_helpers.rb', line 28

# Rewrites relative URI attributes in an HTML fragment so they are joined
# onto +baseURI+.
#
# htmlSource - markup handed to Hpricot for parsing.
# baseURI    - base passed as the first argument to urljoin.
# encoding   - accepted but not used by this method.
#
# Returns the document serialised with to_html.
def resolveRelativeURIs(htmlSource, baseURI, encoding)
  $stderr << "entering resolveRelativeURIs\n" if $debug # FIXME write a decent logger
  # [element name, attribute name] pairs whose values are treated as URIs.
  uri_attributes = [
    %w[a href],
    %w[applet codebase],
    %w[area href],
    %w[blockquote cite],
    %w[body background],
    %w[del cite],
    %w[form action],
    %w[frame longdesc],
    %w[frame src],
    %w[iframe longdesc],
    %w[iframe src],
    %w[head profile],
    %w[img longdesc],
    %w[img src],
    %w[img usemap],
    %w[input src],
    %w[input usemap],
    %w[ins cite],
    %w[link href],
    %w[object classid],
    %w[object codebase],
    %w[object data],
    %w[object usemap],
    %w[q cite],
    %w[script src]
  ]
  document = Hpricot(htmlSource)
  uri_attributes.each do |tag_name, attr_name|
    document.search(tag_name).each do |element|
      raw_value = element.attributes[attr_name]
      # Values that cannot be encoded/parsed are simply left untouched.
      parsed = begin
        Addressable::URI.parse(Addressable::URI.encode(raw_value))
      rescue StandardError
        nil
      end
      next unless raw_value && !raw_value.empty? && parsed && parsed.relative?

      element.raw_attributes[attr_name] = urljoin(baseURI, raw_value)
    end
  end
  document.to_html
end

#sanitizeHTML(html, encoding) ⇒ `Object`

# File 'lib/rfeedparser/scrub.rb', line 205

# Scrubs an HTML fragment: escapes the "<" of every "<!" that does not
# introduce a DOCTYPE, a comment ("<!--") or a "<![" section, runs the
# result through SanitizerDoc(...).scrub, and strips surrounding whitespace.
#
# encoding is accepted but not used by this method.
def sanitizeHTML(html, encoding)
  # FIXME Tidy not yet supported
  escaped = html.gsub(/<!((?!DOCTYPE|--|\[))/, '&lt;!\1')
  SanitizerDoc(escaped).scrub.strip
end

#stripDoctype(data) ⇒ `Object`

# File 'lib/rfeedparser/markup_helpers.rb', line 3

# Strips ENTITY declarations and the first DOCTYPE from an XML document.
#
# Returns [rss_version, stripped_data]:
#   rss_version   - 'rss091n' when the DOCTYPE text mentions "netscape"
#                   (case-insensitively), otherwise nil.
#   stripped_data - the document with every <!ENTITY ...> declaration and
#                   the first <!DOCTYPE ...> removed.
def stripDoctype(data)
  # /m lets "." span newlines, so multi-line declarations are matched too.
  without_entities = data.gsub(/<!ENTITY(.*?)>/m, '')

  doctype_pattern = /<!DOCTYPE(.*?)>/m
  doctype_text = without_entities[doctype_pattern, 1] || ''

  rss_version = doctype_text.downcase =~ /netscape/ ? 'rss091n' : nil
  [rss_version, without_entities.sub(doctype_pattern, '')]
end

#toUTF8(data, encoding) ⇒ `Object`

# File 'lib/rfeedparser/encoding_helpers.rb', line 126

# Converts an XML document to UTF-8 and makes its XML declaration say so.
#
# data     - String holding the document.
# encoding - name of the encoding to convert from. If +data+ starts with a
#            recognised byte-order mark, the BOM is removed and the
#            encoding it implies is used instead of this argument.
#
# Returns the converted document. An existing leading <?xml ...?>
# declaration is replaced with one naming utf-8; if there is none, one is
# prepended on its own line.
# Any error raised by uconvert propagates to the caller unchanged.
def toUTF8(data, encoding)
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
  # NOTE we must use double quotes when dealing with \x encodings!
  # The UTF-16 checks require that the two units after the BOM are not
  # both zero, so a UTF-32LE BOM (ff fe 00 00) is not mistaken for UTF-16LE.
  bom_encoding, bom_length =
    if data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00"
      ['utf-16be', 2]
    elsif data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00"
      ['utf-16le', 2]
    elsif data[0..2] == "\xef\xbb\xbf"
      ['utf-8', 3]
    elsif data[0..3] == "\x00\x00\xfe\xff"
      ['utf-32be', 4]
    elsif data[0..3] == "\xff\xfe\x00\x00"
      ['utf-32le', 4]
    end

  if bom_encoding
    if $debug
      $stderr << "stripping BOM\n"
      $stderr << "trying #{bom_encoding} instead\n" if encoding != bom_encoding
    end
    encoding = bom_encoding
    data = data[bom_length..-1]
  end

  newdata = uconvert(data, encoding, 'utf-8')
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

  declmatch = /^<\?xml[^>]*?>/
  newdecl = "<?xml version='1.0' encoding='utf-8'?>"
  if declmatch =~ newdata
    newdata.sub(declmatch, newdecl)
  else
    newdecl + "\n" + newdata
  end
end

#uconvert(data, from_encoding, to_encoding = 'utf-8') ⇒ `Object`

# File 'lib/rfeedparser/encoding_helpers.rb', line 11

# Converts +data+ from +from_encoding+ to +to_encoding+ using Iconv.
#
# Each encoding name is first looked up in Encoding_Aliases; a name with
# no entry there is passed to Iconv as given.
#
# Returns the first element of Iconv.iconv's result.
def uconvert(data, from_encoding, to_encoding = 'utf-8')
  source_name, target_name = [from_encoding, to_encoding].map do |name|
    Encoding_Aliases[name] || name
  end
  converted = Iconv.iconv(target_name, source_name, data)
  converted[0]
end

#unicode(data, from_encoding) ⇒ `Object`

# File 'lib/rfeedparser/encoding_helpers.rb', line 5

# Takes a single string and converts it from +from_encoding+ to the
# encoding named 'unicode', by delegating to uconvert.
def unicode(data, from_encoding)
  target_encoding = 'unicode'
  uconvert(data, from_encoding, target_encoding)
end

Module: FeedParserUtilities

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.extract_tuple(atime) ⇒ Object

.parse_date(date_string) ⇒ Object

.SanitizerDoc(html) ⇒ Object

Instance Method Details

#_ebcdic_to_ascii(s) ⇒ Object

#getCharacterEncoding(http_headers, xml_data) ⇒ Object

#index_match(stri, regexp, offset) ⇒ Object

#py2rtime(pytuple) ⇒ Object

#resolveRelativeURIs(htmlSource, baseURI, encoding) ⇒ Object

#sanitizeHTML(html, encoding) ⇒ Object

#stripDoctype(data) ⇒ Object

#toUTF8(data, encoding) ⇒ Object

#uconvert(data, from_encoding, to_encoding = 'utf-8') ⇒ Object

#unicode(data, from_encoding) ⇒ Object

.extract_tuple(atime) ⇒ `Object`

.parse_date(date_string) ⇒ `Object`

.SanitizerDoc(html) ⇒ `Object`

#_ebcdic_to_ascii(s) ⇒ `Object`

#getCharacterEncoding(http_headers, xml_data) ⇒ `Object`

#index_match(stri, regexp, offset) ⇒ `Object`

#py2rtime(pytuple) ⇒ `Object`

#resolveRelativeURIs(htmlSource, baseURI, encoding) ⇒ `Object`

#sanitizeHTML(html, encoding) ⇒ `Object`

#stripDoctype(data) ⇒ `Object`

#toUTF8(data, encoding) ⇒ `Object`

#uconvert(data, from_encoding, to_encoding = 'utf-8') ⇒ `Object`

#unicode(data, from_encoding) ⇒ `Object`