Class: EncodedWord
- Inherits: Object
- Inheritance chain: Object → EncodedWord
- Defined in:
- lib/encoded_word.rb
Instance Method Summary
- #combine_all_mlog_plain ⇒ Object
- #concat_one_line(words) ⇒ Object
- #decode ⇒ Object
- #decode_all_mlog ⇒ Object
- #decode_attach(attach) ⇒ Object
- #decode_attaches(parts) ⇒ Object
- #decode_subject(sub) ⇒ Object
- #format_date(engdate) ⇒ Object
- #getmail(line, at) ⇒ Object
- #initialize(inputdir) ⇒ EncodedWord (constructor): a new instance of EncodedWord.
- #mime_decode(input, out_charset = 'utf-8') ⇒ Object
- #mysplit(line, sep = "\t") ⇒ Object
- #trim_emails(emails) ⇒ Object
- #word_decode(input, out_charset = 'utf-8') ⇒ Object
Constructor Details
#initialize(inputdir) ⇒ EncodedWord
Returns a new instance of EncodedWord.
6 7 8 9 10 11 12 |
# File 'lib/encoded_word.rb', line 6
# Stores the directory that the other methods scan for input files.
#
# When the platform defines File::ALT_SEPARATOR, every occurrence of it in
# +inputdir+ is replaced with File::SEPARATOR before the path is stored.
#
# @param inputdir [String] path of the directory to process
def initialize(inputdir)
  alt = File::ALT_SEPARATOR
  @inputdir = alt ? inputdir.gsub(alt, File::SEPARATOR) : inputdir
end
Instance Method Details
#combine_all_mlog_plain ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/encoded_word.rb', line 19
# Merges every `*.plain` file found (recursively) under @inputdir into one
# UTF-8 CSV file named `all_mlog.csv` inside @inputdir.
#
# Each input line is split on tabs with #mysplit. Fields 4, 5 and 6 have
# their "\a" separators replaced by ';'. The path of every file visited is
# echoed to stdout.
#
# NOTE(review): `key` is read from field 3, the same field as `from`;
# confirm that this is intended.
# NOTE(review): values are written between double quotes without escaping
# embedded double quotes.
def combine_all_mlog_plain
  File.open(File.join(@inputdir, 'all_mlog.csv'), 'w:utf-8') do |out|
    out.puts '"key","","","date","from","to","cc","bcc","subject","attach"'
    Dir.glob(File.join(@inputdir, '**', '*.plain')).each do |f|
      puts f
      # Block form so the input handle is closed as soon as the file is read.
      File.open(f, 'r:utf-8') do |input|
        input.each_line do |line|
          parts = mysplit(line)
          key = parts[3]
          date = parts[0]
          subject = parts[2]
          from = parts[3]
          # to_s keeps short lines (missing trailing fields) from raising.
          to = parts[4].to_s.gsub("\a", ';')
          cc = parts[5].to_s.gsub("\a", ';')
          attach = parts[6].to_s.gsub("\a", ';')
          out.puts %Q("#{key}","","","#{date}","#{from}","#{to}","#{cc}","","#{subject}","#{attach}")
        end
      end
    end
  end
end
#concat_one_line(words) ⇒ Object
66 67 68 69 70 |
# File 'lib/encoded_word.rb', line 66
# Concatenates +words+ into one string, splits that string on "\n" with
# #mysplit and glues the pieces back together without the separators.
#
# @param words [Array<String>] fragments to concatenate
# @return [String] the concatenated text
def concat_one_line(words)
  mysplit(words.join(''), "\n").join('')
end
#decode ⇒ Object
14 15 16 17 |
# File 'lib/encoded_word.rb', line 14
# Runs the two processing passes in order: #decode_all_mlog first, then
# #combine_all_mlog_plain. Returns whatever the second pass returns.
def decode
  decode_all_mlog
  combine_all_mlog_plain
end
#decode_all_mlog ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/encoded_word.rb', line 39
# Converts every `*.mlog` file found (recursively) under @inputdir into a
# sibling `<name>.mlog.plain` file written as UTF-8.
#
# Each input line is split on tabs with #mysplit. A line whose second field
# has already been seen in the same file is skipped. The external encoding
# of the file being read is remembered in @input_enc. The path of every
# file visited is echoed to stdout.
def decode_all_mlog
  pattern = File.join(@inputdir, File.join('**', '*.mlog'))
  Dir.glob(pattern).each do |f|
    puts f
    seen = {}
    File.open(f + '.plain', 'w:utf-8') do |out|
      File.open(f) do |input|
        @input_enc = input.external_encoding
        input.each_line do |line|
          parts = mysplit(line)
          next if seen.key?(parts[1])

          seen[parts[1]] = 0
          record = [
            format_date(parts[0]),     # date
            parts[1],                  # message-id
            decode_subject(parts[2]),  # subject
            trim_emails(parts[3]),     # from
            trim_emails(parts[4]),     # to
            trim_emails(parts[5]),     # cc
            decode_attaches(parts)
          ]
          out.puts record.join("\t")
        end
      end
    end
  end
end
#decode_attach(attach) ⇒ Object
152 153 154 155 156 157 158 159 160 161 |
# File 'lib/encoded_word.rb', line 152
# Decodes one attachment field: splits it on "\a", runs each piece through
# #word_decode and joins the results with #concat_one_line.
#
# @param attach [String, nil] raw attachment field
# @return [String] decoded text, or '' when +attach+ is nil or empty
def decode_attach(attach)
  return '' if !attach || attach.empty?

  decoded = mysplit(attach, "\a").map { |piece| word_decode(piece) }
  concat_one_line(decoded)
end
#decode_attaches(parts) ⇒ Object
143 144 145 146 147 148 149 150 |
# File 'lib/encoded_word.rb', line 143
# Decodes every field from index 6 onward with #decode_attach and joins the
# results with "\a".
#
# @param parts [Array<String>] all fields of one record
# @return [String] joined decoded fields, or '' when there is no field at index 6 or later
def decode_attaches(parts)
  decoded = parts.drop(6).map { |field| decode_attach(field) }
  return '' if decoded.empty?

  decoded.join("\a")
end
#decode_subject(sub) ⇒ Object
101 102 103 104 105 106 107 108 109 110 |
# File 'lib/encoded_word.rb', line 101
# Decodes a subject field: splits it on "\a", runs each piece through
# #word_decode and joins the results with #concat_one_line.
#
# @param sub [String, nil] raw subject field
# @return [String] decoded text, or '' when +sub+ is nil or empty
def decode_subject(sub)
  return '' if !sub || sub.empty?

  decoded = mysplit(sub, "\a").map { |piece| word_decode(piece) }
  concat_one_line(decoded)
end
#format_date(engdate) ⇒ Object
94 95 96 97 98 99 |
# File 'lib/encoded_word.rb', line 94
# Parses +engdate+ with DateTime.parse, shifts it to the +09:00 offset and
# formats it as "YYYY/MM/DD HH:MM:SS".
#
# @param engdate [String, nil] date text to parse
# @return [String] formatted date, or '' when +engdate+ is nil or empty
def format_date(engdate)
  return '' if !engdate || engdate.empty?

  DateTime.parse(engdate).new_offset('+0900').strftime("%Y/%m/%d %H:%M:%S")
end
#getmail(line, at) ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/encoded_word.rb', line 127
# Extracts the address-like token surrounding position +at+ in +line+.
#
# The token is widened to the left while the preceding character is one of
# `. _ a-z A-Z 0-9 -`, and to the right while the following character is one
# of `. a-z A-Z 0-9 -` (no underscore on the right-hand side).
#
# @param line [String] text containing the address
# @param at [Integer] index of the '@' character inside +line+
# @return [String] the slice of +line+ covering the token
def getmail(line, at)
  left_ok = /[\._a-zA-Z0-9-]/
  right_ok = /[\.a-zA-Z0-9-]/

  first = at
  first -= 1 while first > 0 && line[first - 1] =~ left_ok

  final = at
  final += 1 while final + 1 < line.length && line[final + 1] =~ right_ok

  line[first..final]
end
#mime_decode(input, out_charset = 'utf-8') ⇒ Object
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/encoded_word.rb', line 176
# Decodes every `=?charset?B|Q?payload?=` encoded-word found in +input+ and
# returns the result as a new string; +input+ itself is left untouched, so
# frozen strings are accepted.
#
# "B" payloads are Base64-decoded. "Q" payloads are quoted-printable-decoded
# after turning '_' into a space, as the Q encoding of RFC 2047 requires.
# Each decoded payload is transcoded from its declared charset to
# +out_charset+, replacing invalid or undefined characters.
#
# If anything raises (for example an unknown charset), the error is printed
# to stdout and +input+ is returned unchanged.
#
# @param input [String, nil] header text that may contain encoded-words
# @param out_charset [String] target encoding for the decoded payloads
# @return [String] decoded text, or '' when +input+ is nil or empty
def mime_decode(input, out_charset = 'utf-8')
  return '' unless input && input.length > 0

  input.gsub(/=\?([A-Za-z0-9_-]+)\?([BQbq])\?([^\?]+)\?=/) do
    charset = Regexp.last_match(1)
    enc = Regexp.last_match(2).upcase
    payload = Regexp.last_match(3)
    raw = if enc == 'Q'
            payload.tr('_', ' ').unpack('M*').first
          else
            payload.unpack('m*').first
          end
    raw.encode(out_charset, charset, :undef => :replace, :invalid => :replace)
  end
rescue => e
  puts e
  input
end
#mysplit(line, sep = "\t") ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/encoded_word.rb', line 72
# Splits +line+ on every occurrence of +sep+, keeping empty fields
# (including trailing ones). If +line+ ends with "\n", that single trailing
# newline is left out of the last field.
#
# After each hit the cursor advances by the length of +sep+, so separators
# longer than one character are consumed completely. The advance is at
# least one character so the scan always makes progress.
#
# @param line [String, nil] text to split
# @param sep [String] separator to split on
# @return [Array<String>] the fields, or [] when +line+ is nil or empty
def mysplit(line, sep = "\t")
  return [] unless line
  return [] unless line.length > 0

  advance = [sep.length, 1].max
  stop = line[-1] == "\n" ? -2 : -1
  fields = []
  cursor = 0
  while (hit = line.index(sep, cursor))
    fields << line[cursor...hit]
    cursor = hit + advance
  end
  fields << line[cursor..stop]
  fields
end
#trim_emails(emails) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/encoded_word.rb', line 112
# Pulls the address-like token around every '@' out of +emails+ and joins
# the tokens with "\a".
#
# The text is first transcoded to UTF-8 from @input_enc, replacing invalid
# or undefined characters; each '@' position is then handed to #getmail.
#
# @param emails [String, nil] raw address field
# @return [String] extracted tokens joined by "\a", or '' when +emails+ is
#   nil or empty or contains no '@'
def trim_emails(emails)
  return '' if !emails || emails.empty?

  text = emails.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
  found = []
  cursor = 0
  while (at = text.index('@', cursor))
    found << getmail(text, at)
    cursor = at + 1
  end
  found.join("\a")
end
#word_decode(input, out_charset = 'utf-8') ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/encoded_word.rb', line 163
# Decodes the first well-formed `=?charset?B|Q?payload?=` encoded-word found
# in +input+ and returns only that decoded word.
#
# +input+ is first transcoded to UTF-8 from @input_enc (falling back to the
# string's own encoding when @input_enc is unset). "B" payloads are
# Base64-decoded; "Q" payloads are quoted-printable-decoded after turning
# '_' into a space, as the Q encoding of RFC 2047 requires. The payload is
# then transcoded from its declared charset to +out_charset+, replacing
# invalid or undefined characters.
#
# When +input+ holds no well-formed encoded-word, or the declared charset
# cannot be converted (the error is printed to stdout), +input+ is returned
# unchanged.
#
# NOTE(review): text before and after the encoded-word, and any further
# encoded-words in the same string, are not part of the return value;
# confirm callers rely on that.
#
# @param input [String] one header fragment
# @param out_charset [String] target encoding for the decoded payload
# @return [String] the decoded word, or +input+ when nothing was decoded
def word_decode(input, out_charset = 'utf-8')
  source_enc = @input_enc || input.encoding
  u8 = input.encode('utf-8', source_enc, :undef => :replace, :invalid => :replace)
  # Match the whole word at once: looking for the first "?=" on its own
  # stops too early on Q payloads that begin with an escaped byte ("?Q?=E3").
  found = /=\?([A-Za-z0-9_-]+)\?([BQbq])\?([^\?]+)\?=/.match(u8)
  return input unless found

  charset = found[1]
  payload = found[3]
  raw = if found[2].upcase == 'Q'
          payload.tr('_', ' ').unpack('M*').first
        else
          payload.unpack('m*').first
        end
  raw.encode(out_charset, charset, :undef => :replace, :invalid => :replace)
rescue EncodingError => e
  puts e
  input
end