10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# File 'lib/CKIP_Client.rb', line 10
# Sends +text+ to the CKIP server described by <tt>config/<sys>.yml</tt>
# (looked up next to this file) and returns the first line of the reply.
#
# The config file is read for the keys 'host', 'port', 'username' and
# 'password'. Random pauses are taken before and after the request;
# presumably to throttle the request rate -- confirm with the service terms.
#
# @param sys [String] basename of the YAML config file to use
# @param text [String] text to submit; must be encoded as UTF-8, Big5 or
#   Big5-UAO
# @return [String] first line of the server response, converted to UTF-8
# @raise [RuntimeError] if +text+ uses any other encoding
# @raise [Timeout::Error] if the server does not answer within the time limit
# @raise [Encoding::InvalidByteSequenceError] if the response is not valid in
#   the encoding of +text+
#
# NOTE(review): +text+ and the credentials are interpolated into the XML
# request without escaping, so '<' or '&' in them produce malformed XML.
def self.get(sys, text)
  text_encoding = text.encoding.to_s
  unless ['Big5', 'Big5-UAO', 'UTF-8'].include?(text_encoding)
    raise 'Encoding ERROR!! CKIP_Client only support UTF-8 or Big5 or Big5-UAO encodings.'
  end

  # Big5-UAO input is announced to the server as plain Big5.
  input_encoding = text_encoding == 'Big5-UAO' ? 'Big5' : text_encoding
  # Factor that grows with the text size (about 0.2 for short text, tending
  # to 2.0); it scales both the timeout and the pause after the request.
  sst = 2.0 - 2304.0 / (text.size + 1280)
  # File.read closes the file handle; File.open(...).read left it open.
  config = YAML.load(File.read(File.dirname(__FILE__) + "/config/#{sys}.yml"))
  sleep rand * 0.25 + 0.1
  request = [
    '<?xml version="1.0" ?>',
    "<wordsegmentation version=\"0.1\" charsetcode=\"#{input_encoding.downcase}\">",
    '<option showcategory="1" />',
    "<authentication username=\"#{config['username']}\" password=\"#{config['password']}\" />",
    "<text>#{text}</text>",
    '</wordsegmentation>'
  ].join("\n")

  # Kept in a local so that a failed call can neither touch a socket left
  # over from an earlier call nor race with a concurrent one.
  socket = nil
  begin
    time0 = Time.now
    xml_result = Timeout.timeout(8.0 * (sst + 1.0)) do
      socket = TCPSocket.open(config['host'], config['port'])
      socket.write(request)
      socket.gets.force_encoding(text_encoding)
    end
    time1 = Time.now - time0
    sleep((rand + 0.5) * sst + time1 * 0.35)
    if xml_result.valid_encoding?
      return xml_result.encode!('UTF-8')
    else
      # Report the word characters of the input that do not appear in the
      # part of the response that survives transcoding.
      trans_text = xml_result.encode('UTF-32', :undef => :replace, :invalid => :replace).encode(text_encoding)
      text2 = text.gsub(/[^[:word:]]+/, '')
      trans_text.each_char { |c| text2.delete!(c) }
      puts "!!contains unsupported character: #{text2}!!"
      raise Encoding::InvalidByteSequenceError
    end
  rescue Timeout::Error
    time1 = Time.now - time0
    puts "!!!Timeout: waited for #{time1.round(2)}s and no response from CKIP server!!!"
    raise
  ensure
    # The socket is nil when connecting failed or timed out; closing
    # unconditionally would replace the real error with a NoMethodError.
    socket.close if socket
  end
end
|