Class: EncodedWord

Inherits:
Object
  • Object
show all
Defined in:
lib/encoded_word.rb

Instance Method Summary collapse

Constructor Details

#initialize(inputdir) ⇒ EncodedWord

Returns a new instance of EncodedWord.



6
7
8
9
10
11
12
# File 'lib/encoded_word.rb', line 6

# Creates a decoder working on the directory +inputdir+.
#
# When File::ALT_SEPARATOR is defined, every occurrence of it in
# +inputdir+ is replaced with File::SEPARATOR before the path is stored
# in @inputdir; otherwise the path is stored exactly as given.
#
# @param inputdir [String] directory path kept in @inputdir
def initialize(inputdir)
    alt = File::ALT_SEPARATOR
    @inputdir = alt ? inputdir.gsub(alt, File::SEPARATOR) : inputdir
end

Instance Method Details

#combine_all_mlog_plain ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/encoded_word.rb', line 19

# Merges every "*.plain" file found under @inputdir (searched recursively)
# into "all_mlog.csv" inside @inputdir, written as UTF-8.
#
# The file starts with a fixed header row. Every input line is split on
# tabs with mysplit and written as one row of ten double-quoted fields:
# key, two empty columns, date, from, to, cc, an empty bcc column, subject
# and attach. BEL ("\a") separators inside to/cc/attach are turned into ";".
#
# Rows with fewer than seven tab-separated fields produce empty values for
# the missing columns instead of raising, and double quotes embedded in a
# value are doubled so the row stays parseable as CSV.
def combine_all_mlog_plain
    File.open(File.join(@inputdir, 'all_mlog.csv'), 'w:utf-8') do |out|
        out.puts '"key","","","date","from","to","cc","bcc","subject","attach"'
        Dir.glob(File.join(@inputdir, '**', '*.plain')).each do |f|
            puts f
            # Block form so each input file is closed as soon as it has been read.
            File.open(f, 'r:utf-8') do |input|
                input.each_line do |line|
                    parts = mysplit(line)
                    # NOTE(review): key is read from column 3, the same column as
                    # "from"; column 1 (the message-id written by decode_all_mlog)
                    # is not used. Confirm this is intended.
                    key = parts[3]
                    date = parts[0]
                    subject = parts[2]
                    from = parts[3]
                    to = parts[4].to_s.gsub("\a", ';')
                    cc = parts[5].to_s.gsub("\a", ';')
                    attach = parts[6].to_s.gsub("\a", ';')
                    row = [key, '', '', date, from, to, cc, '', subject, attach]
                    # Double any embedded quote so it cannot terminate the field early.
                    out.puts row.map { |v| %Q("#{v.to_s.gsub('"', '""')}") }.join(',')
                end
            end
        end
    end
end

#concat_one_line(words) ⇒ Object



66
67
68
69
70
# File 'lib/encoded_word.rb', line 66

# Concatenates +words+ into one string and removes its line breaks by
# splitting the result on "\n" with mysplit and joining the pieces back
# together without a separator.
#
# @param words [Array] pieces to concatenate
# @return [String] the concatenation with "\n" characters removed
def concat_one_line(words)
    mysplit(words.join(''), "\n").join('')
end

#decode ⇒ Object



14
15
16
17
# File 'lib/encoded_word.rb', line 14

# Runs the two processing steps in order: decode_all_mlog first, then
# combine_all_mlog_plain. Returns whatever combine_all_mlog_plain returns.
def decode
    decode_all_mlog
    combine_all_mlog_plain
end

#decode_all_mlog ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/encoded_word.rb', line 39

# Converts every "*.mlog" file found under @inputdir (searched recursively)
# into a UTF-8 file named "<original name>.plain" next to it.
#
# Each input line is split on tabs with mysplit. A line whose second field
# (used as the message-id) was already seen in the same file is skipped.
# For every remaining line one tab-separated row is written with the
# columns: formatted date, message-id, decoded subject, from, to, cc and
# the decoded attachment names (fields 6 and up).
#
# The external encoding of the file being read is remembered in @input_enc,
# which trim_emails uses as the source encoding.
def decode_all_mlog
    Dir.glob(File.join(@inputdir, '**', '*.mlog')).each do |f|
        puts f
        seen_ids = {}
        File.open(f + '.plain', 'w:utf-8') do |out|
            File.open(f) do |input|
                @input_enc = input.external_encoding
                input.each_line do |line|
                    parts = mysplit(line)
                    message_id = parts[1]
                    # Only the first line carrying a given message-id is kept.
                    next if seen_ids.key?(message_id)
                    seen_ids[message_id] = 0

                    row = [
                        format_date(parts[0]),    # date
                        message_id,               # message-id
                        decode_subject(parts[2]), # subject
                        trim_emails(parts[3]),    # from
                        trim_emails(parts[4]),    # to
                        trim_emails(parts[5]),    # cc
                        decode_attaches(parts)    # attachments
                    ]
                    out.puts row.join("\t")
                end
            end
        end
    end
end

#decode_attach(attach) ⇒ Object



152
153
154
155
156
157
158
159
160
161
# File 'lib/encoded_word.rb', line 152

# Decodes one attachment field.
#
# The field is split on BEL ("\a") with mysplit, every piece is passed
# through word_decode, and the results are merged by concat_one_line.
#
# @param attach [String, nil] raw attachment field
# @return [String] decoded text, or '' when +attach+ is nil or empty
def decode_attach(attach)
    return '' if !attach || attach.length.zero?
    decoded = mysplit(attach, "\a").map { |piece| word_decode(piece) }
    concat_one_line(decoded)
end

#decode_attaches(parts) ⇒ Object



143
144
145
146
147
148
149
150
# File 'lib/encoded_word.rb', line 143

# Decodes all attachment fields of a split log line.
#
# Every element of +parts+ from index 6 onward is run through
# decode_attach and the results are joined with BEL ("\a").
#
# @param parts [Array] fields of one log line
# @return [String] joined decoded attachments, or '' when there are none
def decode_attaches(parts)
    decoded = parts.drop(6).map { |raw| decode_attach(raw) }
    decoded.empty? ? '' : decoded.join("\a")
end

#decode_subject(sub) ⇒ Object



101
102
103
104
105
106
107
108
109
110
# File 'lib/encoded_word.rb', line 101

# Decodes a subject field.
#
# The field is split on BEL ("\a") with mysplit, every piece is passed
# through word_decode, and the results are merged by concat_one_line.
#
# @param sub [String, nil] raw subject field
# @return [String] decoded subject, or '' when +sub+ is nil or empty
def decode_subject(sub)
    return '' if !sub || sub.length.zero?
    pieces = mysplit(sub, "\a")
    concat_one_line(pieces.map { |piece| word_decode(piece) })
end

#format_date(engdate) ⇒ Object



94
95
96
97
98
99
# File 'lib/encoded_word.rb', line 94

# Reformats a date string as "YYYY/MM/DD HH:MM:SS" at UTC offset +09:00.
#
# The text is parsed with DateTime.parse, so an unparseable value raises
# whatever DateTime.parse raises.
#
# @param engdate [String, nil] date text to parse
# @return [String] formatted timestamp, or '' when +engdate+ is nil or empty
def format_date(engdate)
    return '' if !engdate || engdate.length.zero?
    DateTime.parse(engdate).new_offset('+0900').strftime("%Y/%m/%d %H:%M:%S")
end

#getmail(line, at) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/encoded_word.rb', line 127

# Extracts the address surrounding the character at index +at+ of +line+.
#
# Walking left from +at+, characters matching [._a-zA-Z0-9-] are accepted;
# walking right, characters matching [.a-zA-Z0-9-] are accepted. The
# returned slice runs between the first rejected character on each side,
# or to the start/end of +line+ when nothing is rejected.
#
# @param line [String] text containing the address
# @param at [Integer] index of the "@" to expand around
# @return [String] the extracted slice of +line+
def getmail(line, at)
    left_stop = (at - 1).downto(0).find { |i| !(line[i] =~ /[\._a-zA-Z0-9-]/) }
    # The right-hand scan includes line.length itself, where line[i] is nil
    # and therefore never matches, so it stops at the end of the string.
    right_stop = (at + 1).upto(line.length).find { |i| !(line[i] =~ /[\.a-zA-Z0-9-]/) }
    first = left_stop ? left_stop + 1 : 0
    last = right_stop ? right_stop - 1 : line.length - 1
    line[first..last]
end

#mysplit(line, sep = "\t") ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/encoded_word.rb', line 72

# Splits +line+ on +sep+, keeping empty fields (including a trailing one).
#
# If +line+ ends with "\n", that final newline is left out of the last
# field. Separators longer than one character are skipped in full, so no
# part of the separator ends up in the following field.
#
# @param line [String, nil] text to split
# @param sep [String] separator to split on (tab by default)
# @return [Array<String>] the fields; [] when +line+ is nil or empty
def mysplit(line, sep = "\t")
    return [] unless line && line.length > 0

    # Distance to move past a match. Never less than 1 so the scan always
    # makes progress; anything without #length is stepped over one
    # character at a time.
    step = sep.respond_to?(:length) ? [sep.length, 1].max : 1

    parts = []
    start = 0
    while (hit = line.index(sep, start))
        parts << line[start...hit]
        start = hit + step
    end

    # Drop a single trailing newline from the final field.
    tail_end = line[-1] == "\n" ? -2 : -1
    parts << line[start..tail_end]
    parts
end

#trim_emails(emails) ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/encoded_word.rb', line 112

# Reduces an address field to the bare addresses it contains.
#
# The text is transcoded to UTF-8 from @input_enc (undefined or invalid
# characters are replaced), then getmail is called for every "@" found,
# scanning left to right. The extracted addresses are joined with BEL
# ("\a").
#
# @param emails [String, nil] raw address field
# @return [String] extracted addresses joined with "\a"; '' when +emails+
#   is nil or empty or contains no "@"
def trim_emails(emails)
    return '' if !emails || emails.length.zero?
    text = emails.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
    found = []
    at = text.index('@')
    while at
        found << getmail(text, at)
        at = text.index('@', at + 1)
    end
    found.join("\a")
end

#word_decode(input, out_charset = 'utf-8') ⇒ Object



163
164
165
166
167
168
169
170
171
# File 'lib/encoded_word.rb', line 163

# Decodes the MIME encoded-words ("=?charset?B|Q?payload?=") found in
# +input+.
#
# Every encoded-word is replaced in place by its decoded text, so text
# around or between encoded-words is kept. "B" payloads are base64-decoded;
# "Q" payloads are quoted-printable-decoded after "_" has been mapped to a
# space, as RFC 2047 section 4.2 requires. An encoded-word whose charset
# cannot be converted is left in its encoded form instead of raising.
#
# @param input [String] text that may contain encoded-words
# @param out_charset [String] encoding of the returned string
# @return [String] the decoded text, or +input+ itself (untouched) when it
#   contains no encoded-word
def word_decode(input, out_charset = 'utf-8')
    # NOTE(review): @u8_enc is not assigned anywhere in the visible code, so
    # it is nil here; presumably encode then uses the string's own encoding
    # as the source. Confirm.
    u8 = input.encode('utf-8', @u8_enc, :undef=>:replace, :invalid=>:replace)
    pattern = /=\?([A-Za-z0-9_-]+)\?([BQbq])\?([^\?]+)\?=/
    return input unless u8 =~ pattern

    decoded = u8.gsub(pattern) do |encoded_word|
        m = Regexp.last_match
        charset = m[1]
        raw = if m[2].upcase == 'B'
            m[3].unpack('m*').first
        else
            # In Q encoding an underscore stands for a space.
            m[3].tr('_', ' ').unpack('M*').first
        end
        begin
            raw.encode('utf-8', charset, :undef=>:replace, :invalid=>:replace)
        rescue EncodingError, ArgumentError
            # Unknown or unsupported charset: keep the word as it was.
            encoded_word
        end
    end
    decoded.encode(out_charset, 'utf-8', :undef=>:replace, :invalid=>:replace)
end