Class: EncodedWord
- Inherits:
-
Object
- Object
- EncodedWord
- Defined in:
- lib/encoded_word.rb
Instance Method Summary
- #combine_all_mlog_plain ⇒ Object
- #decode ⇒ Object
- #decode_all_mlog ⇒ Object
- #decode_attach(attach) ⇒ Object
- #decode_attaches(parts) ⇒ Object
- #decode_subject(sub) ⇒ Object
- #format_date(engdate) ⇒ Object
- #getmail(line, at) ⇒ Object
- #initialize(inputdir) ⇒ EncodedWord (constructor): Returns a new instance of EncodedWord.
- #mime_decode(input, out_charset = 'utf-8') ⇒ Object
- #mysplit(line, sep = "\t") ⇒ Object
- #trim_emails(emails) ⇒ Object
Constructor Details
#initialize(inputdir) ⇒ EncodedWord
Returns a new instance of EncodedWord.
6 7 8 9 10 11 12 |
# File 'lib/encoded_word.rb', line 6
# Remembers the directory the other methods work in.
#
# On platforms that define File::ALT_SEPARATOR, every occurrence of it in
# the path is replaced with File::SEPARATOR; otherwise the path is stored
# unchanged.
#
# @param inputdir [String] path of the directory to process
def initialize(inputdir)
  alt = File::ALT_SEPARATOR
  @inputdir = alt ? inputdir.gsub(alt, File::SEPARATOR) : inputdir
end
Instance Method Details
#combine_all_mlog_plain ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/encoded_word.rb', line 19
# Merges every `*.plain` file found (recursively) under @inputdir into a
# single `all_mlog.csv` written to @inputdir.
#
# Each input line is split with #mysplit and read positionally:
# 0 = date, 2 = subject, 3 = from, 4 = to, 5 = cc, 6 = attachments.
# "\a" separators inside to/cc/attachments become ';' in the CSV.
# The path of every processed file is echoed to stdout.
#
# NOTE(review): "key" is filled from field 3, i.e. the same value as "from".
# Field 1 is never read here - confirm whether field 3 is really the
# intended key.
# NOTE(review): values are wrapped in double quotes but embedded '"'
# characters are not escaped - confirm the consumer tolerates that.
#
# @return [Array<String>] the list of `.plain` files that were read
def combine_all_mlog_plain
  File.open(File.join(@inputdir, 'all_mlog.csv'), 'w:utf-8') do |out|
    out.puts '"key","","","date","from","to","cc","bcc","subject","attach"'
    Dir.glob(File.join(@inputdir, '**', '*.plain')).each do |f|
      puts f
      # Block form so the input handle is closed as soon as the file is read
      # (the handle used to stay open until garbage collection).
      File.open(f, 'r:utf-8') do |input|
        input.each_line do |line|
          parts = mysplit(line)
          key = parts[3]
          date = parts[0]
          subject = parts[2]
          from = parts[3]
          # to_s: a short or blank line yields nil fields; emit them as empty
          # columns instead of raising NoMethodError.
          to = parts[4].to_s.gsub("\a", ';')
          cc = parts[5].to_s.gsub("\a", ';')
          attach = parts[6].to_s.gsub("\a", ';')
          out.puts %Q("#{key}","","","#{date}","#{from}","#{to}","#{cc}","","#{subject}","#{attach}")
        end
      end
    end
  end
end
#decode ⇒ Object
14 15 16 17 |
# File 'lib/encoded_word.rb', line 14
# Entry point: runs #decode_all_mlog and then #combine_all_mlog_plain,
# in that order.
#
# @return [Object] whatever #combine_all_mlog_plain returns
def decode
  decode_all_mlog
  combine_all_mlog_plain
end
#decode_all_mlog ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/encoded_word.rb', line 39
# Converts every `*.mlog` file found (recursively) under @inputdir into a
# sibling `<name>.mlog.plain` file written as UTF-8.
#
# Each input line is split with #mysplit; a line whose field 1
# (message-id) was already seen in the same file is skipped. The output
# line holds seven tab-separated fields: formatted date, message-id,
# decoded subject, from, to, cc and the decoded attachment names.
#
# Side effects: prints each file name to stdout and stores the input
# file's external encoding in @input_enc for the decode helpers.
#
# @return [Array<String>] the list of `.mlog` files that were processed
def decode_all_mlog
  pattern = File.join(@inputdir, '**', '*.mlog')
  Dir.glob(pattern).each do |mlog|
    puts mlog
    seen = {} # message-ids already written for this file
    File.open("#{mlog}.plain", 'w:utf-8') do |out|
      File.open(mlog) do |input|
        @input_enc = input.external_encoding
        input.each_line do |line|
          fields = mysplit(line)
          message_id = fields[1]
          next if seen.key?(message_id)

          seen[message_id] = 0
          out.puts [
            format_date(fields[0]),    # date
            message_id,                # message-id
            decode_subject(fields[2]), # subject
            trim_emails(fields[3]),    # from
            trim_emails(fields[4]),    # to
            trim_emails(fields[5]),    # cc
            decode_attaches(fields)    # attachments (fields 6 and up)
          ].join("\t")
        end
      end
    end
  end
end
#decode_attach(attach) ⇒ Object
142 143 144 145 146 147 148 149 150 151 |
# File 'lib/encoded_word.rb', line 142
# Decodes one attachment field.
#
# The text is transcoded from @input_enc to UTF-8 (undefined and invalid
# sequences are replaced), handed to #mysplit with "\a" as separator, each
# resulting piece is passed through #mime_decode, and the pieces are
# concatenated without a separator.
#
# @param attach [String, nil] raw attachment field
# @return [String] decoded text, or '' for nil/empty input
def decode_attach(attach)
  return '' if !attach || attach.length.zero?

  utf8 = attach.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
  mysplit(utf8, "\a").map { |piece| mime_decode(piece) }.join('')
end
#decode_attaches(parts) ⇒ Object
133 134 135 136 137 138 139 140 |
# File 'lib/encoded_word.rb', line 133
# Decodes all attachment fields of a split line.
#
# Every element from index 6 onwards is run through #decode_attach and the
# results are joined with "\a".
#
# @param parts [Array<String>] fields of one line
# @return [String] "\a"-joined decoded attachments, or '' when there are
#   no fields beyond index 5
def decode_attaches(parts)
  decoded = parts.drop(6).map { |attach| decode_attach(attach) }
  return '' if decoded.empty?

  decoded.join("\a")
end
#decode_subject(sub) ⇒ Object
95 96 97 98 99 100 |
# File 'lib/encoded_word.rb', line 95
# Decodes a subject field.
#
# The text is transcoded from @input_enc to UTF-8 (undefined and invalid
# sequences are replaced), all "\a" characters are removed, and the result
# is passed through #mime_decode.
#
# @param sub [String, nil] raw subject field
# @return [String] decoded subject, or '' for nil/empty input
def decode_subject(sub)
  return '' if !sub || sub.length.zero?

  utf8 = sub.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
  mime_decode(utf8.gsub("\a", ''))
end
#format_date(engdate) ⇒ Object
88 89 90 91 92 93 |
# File 'lib/encoded_word.rb', line 88
# Re-formats a date string as "YYYY/MM/DD HH:MM:SS" in the +09:00 offset.
#
# The input is parsed with DateTime.parse, so whatever that method raises
# for unparseable text propagates to the caller.
#
# @param engdate [String, nil] date text to parse
# @return [String] formatted timestamp, or '' for nil/empty input
def format_date(engdate)
  return '' if !engdate || engdate.length.zero?

  DateTime.parse(engdate)
          .new_offset('+0900')
          .strftime("%Y/%m/%d %H:%M:%S")
end
#getmail(line, at) ⇒ Object
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/encoded_word.rb', line 117
# Extracts the address-like token surrounding the character at index +at+.
#
# Walking left from +at+, characters matching [._a-zA-Z0-9-] are kept;
# walking right, characters matching [.a-zA-Z0-9-] are kept (no underscore
# on the right-hand side). The first non-matching character on each side
# ends the token; the string boundaries end it otherwise.
#
# @param line [String] text containing the address
# @param at [Integer] index of the pivot character (callers pass the '@')
# @return [String] the substring covering left part, pivot and right part
def getmail(line, at)
  left_stop = (at - 1).downto(0).find { |i| line[i] !~ /[\._a-zA-Z0-9-]/ }
  first = left_stop ? left_stop + 1 : 0

  # line[line.length] is nil, which never matches, so the scan always stops
  # at the end of the string at the latest.
  right_stop = (at + 1).upto(line.length).find { |i| line[i] !~ /[\.a-zA-Z0-9-]/ }
  final = right_stop ? right_stop - 1 : line.length - 1

  line[first..final]
end
#mime_decode(input, out_charset = 'utf-8') ⇒ Object
153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/encoded_word.rb', line 153
# Decodes every MIME encoded-word ("=?charset?B|Q?text?=") found in +input+.
#
# Differences from a naive in-place decode:
# * +input+ is never modified; a new string is returned, so frozen strings
#   work as well.
# * +out_charset+ is honoured for every encoded-word, not only the first.
# * In Q encoding an underscore stands for a space and is decoded as such.
# * An encoded-word that cannot be decoded (e.g. unknown charset) is left
#   as-is while the remaining words are still decoded.
# * Each encoded-word is decoded exactly once; decoded text is not
#   re-scanned.
#
# Decoding stays best-effort: if the scan itself fails, +input+ is returned
# unchanged.
#
# @param input [String, nil] text that may contain encoded-words
# @param out_charset [String] encoding name for the decoded text
# @return [String] decoded text, or '' for nil/empty input
def mime_decode(input, out_charset = 'utf-8')
  return '' if !input || input.empty?

  unpack_format = { "B"=>"m*", "Q"=>"M*" }
  input.gsub(/=\?([A-Za-z0-9_-]+)\?([BQbq])\?([^\?]+)\?=/) do |encoded|
    match = Regexp.last_match
    begin
      charset = match[1]
      enc = match[2].upcase
      word = match[3]
      # Q encoding writes a space as "_"; a literal underscore arrives as =5F.
      word = word.tr('_', ' ') if enc == 'Q'
      word = word.unpack(unpack_format[enc]).first
      word.encode(out_charset, charset, :undef=>:replace, :invalid=>:replace)
    rescue StandardError
      # Best effort: keep the undecodable encoded-word verbatim.
      encoded
    end
  end
rescue StandardError
  input
end
#mysplit(line, sep = "\t") ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/encoded_word.rb', line 66
# Splits +line+ on +sep+, keeping empty fields (including a trailing one).
#
# A single trailing "\n" is dropped from the last field. The separator
# argument is honoured: the search used to be hard-coded to "\t", so a
# call such as mysplit(text, "\a") never split on "\a".
#
# @param line [String, nil] text to split
# @param sep [String] separator; an empty separator yields the whole line
#   as one field
# @return [Array<String>] the fields; [] for nil or empty input
def mysplit(line, sep = "\t")
  return [] unless line
  return [] unless line.length > 0

  # Index of the last character to keep: strip exactly one trailing newline.
  last = line[-1] == "\n" ? -2 : -1
  # An empty separator would match at every position and never advance.
  return [line[0..last]] if sep.to_s.empty?

  parts = []
  start = 0
  while (hit = line.index(sep, start))
    parts << line[start...hit]
    start = hit + sep.length
  end
  parts << line[start..last]
  parts
end
#trim_emails(emails) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/encoded_word.rb', line 102
# Reduces an address field to the bare addresses it contains.
#
# The text is transcoded from @input_enc to UTF-8 (undefined and invalid
# sequences are replaced); then, for every '@' found, #getmail extracts the
# surrounding token. The tokens are joined with "\a".
#
# @param emails [String, nil] raw address field
# @return [String] "\a"-joined addresses; '' for nil/empty input or when
#   the text contains no '@'
def trim_emails(emails)
  return '' if !emails || emails.length.zero?

  text = emails.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
  found = []
  at = text.index('@')
  while at
    found << getmail(text, at)
    at = text.index('@', at + 1)
  end
  found.join("\a")
end