Class: PDF::Reader::Encoding::WinAnsiEncoding
- Inherits:
-
PDF::Reader::Encoding
- Object
- PDF::Reader::Encoding
- PDF::Reader::Encoding::WinAnsiEncoding
- Defined in:
- lib/pdf/reader/encoding.rb
Constant Summary
Constants inherited from PDF::Reader::Encoding
Instance Attribute Summary
Attributes inherited from PDF::Reader::Encoding
Instance Method Summary collapse
-
#to_utf8(str, tounicode = nil) ⇒ Object
convert a WinAnsiEncoding string into UTF-8.
Methods inherited from PDF::Reader::Encoding
Instance Method Details
#to_utf8(str, tounicode = nil) ⇒ Object
convert a WinAnsiEncoding string into UTF-8
751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 |
# File 'lib/pdf/reader/encoding.rb', line 751

# Byte values in the 0x80-0x9F range where WinAnsiEncoding (cp1252) differs
# from ISO-8859-1, mapped to the Unicode codepoint each one represents.
# Bytes that are absent from this table are passed through unchanged.
CP1252_TO_UNICODE = {
  0x80 => 0x20AC, # EURO SIGN
  0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
  0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
  0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
  0x85 => 0x2026, # HORIZONTAL ELLIPSIS
  0x86 => 0x2020, # DAGGER
  0x87 => 0x2021, # DOUBLE DAGGER
  0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
  0x89 => 0x2030, # PER MILLE SIGN
  0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
  0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
  0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
  0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
  0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
  0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
  0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
  0x95 => 0x2022, # BULLET
  0x96 => 0x2013, # EN DASH
  0x97 => 0x2014, # EM DASH
  0x98 => 0x02DC, # SMALL TILDE
  0x99 => 0x2122, # TRADE MARK SIGN
  0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
  0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  0x9C => 0x0153, # LATIN SMALL LIGATURE OE (was wrongly 0x0152, the capital form)
  0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
  0x9F => 0x0178  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}.freeze

# Convert a WinAnsiEncoding string into UTF-8.
#
# The approach is borrowed from REXML::Encoding.decode_cp1252; for further
# reading see http://www.intertwingly.net/stories/2004/04/14/i18n.html
#
# @param str [String] the string to convert; it is read byte by byte
# @param tounicode [Object, nil] accepted for interface compatibility; this
#   implementation does not use it
# @return [String] the converted text, tagged as UTF-8
def to_utf8(str, tounicode = nil)
  # Split into raw byte values, then let the encoding's differences table
  # override individual entries.
  byte_codes = process_differences(str.unpack('C*'))

  # Translate the bytes that cp1252 added on top of ISO-8859-1. Anything not
  # in the table (including non-integer entries) is kept as-is.
  codepoints = byte_codes.map { |code| CP1252_TO_UNICODE.fetch(code, code) }

  # convert any glyph names to unicode codepoints
  codepoints = process_glyphnames(codepoints)

  # replace characters that didn't convert to unicode nicely with something valid
  codepoints = codepoints.map { |cp| cp ? cp : PDF::Reader::Encoding::UNKNOWN_CHAR }

  # pack all our Unicode codepoints into a UTF-8 string
  utf8 = codepoints.pack("U*")

  # set the string's encoding correctly under ruby 1.9+
  utf8.force_encoding("UTF-8") if utf8.respond_to?(:force_encoding)
  utf8
end