Class: ICU::UCharsetDetector

Inherits:
Object
  • Object
show all
Defined in:
lib/uchardet.rb,
ext/uchardet/uchardet.c

Overview

:main: README

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#new(text = nil, declared_encoding = nil) ⇒ Object

Create a new charset detector. Optionally set input text and declared encoding.



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'ext/uchardet/uchardet.c', line 162

/*
 * Initializer for the charset detector object.
 *
 * Accepts zero, one or two optional arguments: the input text and the
 * declared encoding. For each argument:
 *   - when it is nil (or omitted), the matching attribute is stored as nil
 *     through the plain attribute setter;
 *   - otherwise it is handed to the set_text / set_declared_encoding helper
 *     (defined elsewhere in this file).
 *
 * Returns self.
 */
static VALUE
UCharsetDetector_initialize(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;

    /* "02": no mandatory arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);

    if (!NIL_P(input_text)) {
        set_text(self, input_text);
    } else {
        UCharsetDetector_set_text(self, Qnil);
    }

    if (!NIL_P(encoding_hint)) {
        set_declared_encoding(self, encoding_hint);
    } else {
        UCharsetDetector_set_declared_encoding(self, Qnil);
    }

    return self;
}

Class Method Details

.detect(*args) ⇒ Object

Shortcut for ICU::UCharsetDetector#detect



19
20
21
# File 'lib/uchardet.rb', line 19

# Class-level convenience: builds a fresh detector and forwards every
# argument to its #detect, returning whatever that call returns.
def self.detect(*args)
  detector = new
  detector.detect(*args)
end

.detect_all(*args) ⇒ Object

Shortcut for ICU::UCharsetDetector#detect_all



26
27
28
# File 'lib/uchardet.rb', line 26

# Class-level convenience: builds a fresh detector and forwards every
# argument to its #detect_all, returning whatever that call returns.
def self.detect_all(*args)
  detector = new
  detector.detect_all(*args)
end

.detectable_charsets ⇒ Object

Shortcut for ICU::UCharsetDetector#detectable_charsets



33
34
35
# File 'lib/uchardet.rb', line 33

# Class-level convenience: builds a fresh detector and returns the result
# of its #detectable_charsets.
def self.detectable_charsets
  detector = new
  detector.detectable_charsets
end

Instance Method Details

#declared_encoding ⇒ Object

Get the declared encoding for charset detection.



101
102
103
104
105
# File 'ext/uchardet/uchardet.c', line 101

/*
 * Reader for the declared encoding: returns the current value of the
 * @declared_encoding instance variable of self.
 */
static VALUE
UCharsetDetector_get_declared_encoding(VALUE self)
{
    VALUE stored_encoding = rb_iv_get(self, "@declared_encoding");

    return stored_encoding;
}

#declared_encoding= ⇒ Object

Set the declared encoding for charset detection. The declared encoding of an input text is an encoding obtained by the user from an http header or xml declaration or similar source that can be provided as an additional hint to the charset detector.



116
117
118
119
120
# File 'ext/uchardet/uchardet.c', line 116

/*
 * Writer for the declared encoding: stores declared_encoding in the
 * @declared_encoding instance variable of self and returns the result of
 * rb_iv_set.
 *
 * Only the instance variable is written here; nothing is passed to the
 * underlying detector by this function.
 */
static VALUE
UCharsetDetector_set_declared_encoding(VALUE self, VALUE declared_encoding)
{
    VALUE stored_encoding = rb_iv_set(self, "@declared_encoding", declared_encoding);

    return stored_encoding;
}

#detect(text = nil, declared_encoding = nil) ⇒ Object

Return the charset that best matches the supplied input data.

Note though, that because the detection only looks at the start of the input data, there is a possibility that the returned charset will fail to handle the full set of input data.

The function will fail if

  • no charset appears to match the data

  • no input text has been provided (with text or set with #text= )



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# File 'ext/uchardet/uchardet.c', line 197

/*
 * Runs charset detection and returns the single best match.
 *
 * Accepts up to two optional arguments (text, declared_encoding), which are
 * forwarded to the set_text / set_declared_encoding helpers before
 * detection runs.
 *
 * Returns a Hash with three Symbol keys:
 *   :encoding   - String, from ucsdet_getName
 *   :confidence - Integer, from ucsdet_getConfidence
 *   :language   - String, from ucsdet_getLanguage
 *
 * The ICU status is passed to ensure() after every ICU call.
 * NOTE(review): ensure() is defined elsewhere in this file; presumably it
 * raises a Ruby exception when the status is a failure - confirm.
 */
static VALUE
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;
    UCharsetDetector *csd;
    UErrorCode err = U_ZERO_ERROR;

    /* "02": no mandatory arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);
    set_text(self, input_text);
    set_declared_encoding(self, encoding_hint);

    Data_Get_Struct(self, UCharsetDetector, csd);

    const UCharsetMatch *best = ucsdet_detect(csd, &err);
    ensure(err);

    const char *name = ucsdet_getName(best, &err);
    ensure(err);

    int32_t confidence = ucsdet_getConfidence(best, &err);
    ensure(err);

    const char *language = ucsdet_getLanguage(best, &err);
    ensure(err);

    VALUE result = rb_hash_new();
    VALUE encoding_key = ID2SYM(rb_intern("encoding"));
    rb_hash_aset(result, encoding_key, rb_str_new2(name));
    VALUE confidence_key = ID2SYM(rb_intern("confidence"));
    rb_hash_aset(result, confidence_key, INT2NUM(confidence));
    VALUE language_key = ID2SYM(rb_intern("language"));
    rb_hash_aset(result, language_key, rb_str_new2(language));

    return result;
}

#detect_all(text = nil, declared_encoding = nil) ⇒ Object

Find all charset matches that appear to be consistent with the input, returning an array of results. The results are ordered with the best quality match first.

Because the detection only looks at a limited amount of the input byte data, some of the returned charsets may fail to handle all of the input data.

Return an error if

  • no charset appears to match the data

  • no input text has been provided (with text or set with #text= )



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'ext/uchardet/uchardet.c', line 247

/*
 * Runs charset detection and returns every candidate match.
 *
 * Accepts up to two optional arguments (text, declared_encoding), which are
 * forwarded to the set_text / set_declared_encoding helpers before
 * detection runs.
 *
 * Returns an Array holding one Hash per match, in the order produced by
 * ucsdet_detectAll. Each Hash has three Symbol keys:
 *   :encoding   - String, from ucsdet_getName
 *   :confidence - Integer, from ucsdet_getConfidence
 *   :language   - String, from ucsdet_getLanguage
 *
 * The ICU status is passed to ensure() after every ICU call.
 * NOTE(review): ensure() is defined elsewhere in this file; presumably it
 * raises a Ruby exception when the status is a failure - confirm.
 */
static VALUE
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
{
    VALUE input_text;
    VALUE encoding_hint;
    UCharsetDetector *csd;
    UErrorCode err = U_ZERO_ERROR;
    int32_t match_count;

    /* "02": no mandatory arguments, up to two optional ones. */
    rb_scan_args(argc, argv, "02", &input_text, &encoding_hint);
    set_text(self, input_text);
    set_declared_encoding(self, encoding_hint);

    Data_Get_Struct(self, UCharsetDetector, csd);

    const UCharsetMatch **candidates = ucsdet_detectAll(csd, &match_count, &err);
    ensure(err);

    VALUE results = rb_ary_new();

    for (int32_t idx = 0; idx < match_count; idx++) {
        const UCharsetMatch *candidate = candidates[idx];

        const char *name = ucsdet_getName(candidate, &err);
        ensure(err);

        int32_t confidence = ucsdet_getConfidence(candidate, &err);
        ensure(err);

        const char *language = ucsdet_getLanguage(candidate, &err);
        ensure(err);

        VALUE entry = rb_hash_new();
        VALUE encoding_key = ID2SYM(rb_intern("encoding"));
        rb_hash_aset(entry, encoding_key, rb_str_new2(name));
        VALUE confidence_key = ID2SYM(rb_intern("confidence"));
        rb_hash_aset(entry, confidence_key, INT2NUM(confidence));
        VALUE language_key = ID2SYM(rb_intern("language"));
        rb_hash_aset(entry, language_key, rb_str_new2(language));

        rb_ary_push(results, entry);
    }

    return results;
}

#detectable_charsets ⇒ Object

Get array of names of all detectable charsets that are known to the charset detection service.



295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'ext/uchardet/uchardet.c', line 295

/*
 * Returns an Array of Strings naming every charset yielded by
 * ucsdet_getAllDetectableCharsets for this detector.
 *
 * The ICU status is checked with ensure() after the enumeration is opened
 * and again once iteration has finished, so an error that terminates the
 * iteration early is reported instead of silently yielding a truncated
 * list.
 *
 * NOTE(review): ensure() is defined elsewhere in this file; presumably it
 * raises a Ruby exception when the status is a failure - confirm. The
 * enumeration is closed before that final check so it is released even if
 * ensure() does not return.
 */
static VALUE
UCharsetDetector_get_detectable_charsets(VALUE self)
{
    UCharsetDetector *detector;
    Data_Get_Struct(self, UCharsetDetector, detector);
    UErrorCode status = U_ZERO_ERROR;

    UEnumeration *charsets = ucsdet_getAllDetectableCharsets(detector, &status);
    ensure(status);

    VALUE ary = rb_ary_new();
    int32_t result_length;
    const char *charset_name;

    /*
     * uenum_next returns NULL both at the end of the enumeration and on
     * error; the two cases are told apart by the status check after the
     * loop. Per the ICU error-code convention a call made with a failure
     * status already set does nothing, so the loop also stops on the next
     * call if an earlier one failed.
     */
    while ((charset_name = uenum_next(charsets, &result_length, &status)) != NULL) {
        rb_ary_push(ary, rb_str_new2(charset_name));
    }

    /* Release the enumeration first: the status check below may not return. */
    uenum_close(charsets);
    ensure(status);

    return ary;
}

#input_filtered= ⇒ Object

Enable filtering of input text. If filtering is enabled, text within angle brackets (“<” and “>”) will be removed before detection, which will remove most HTML or xml markup.



61
62
63
64
65
66
67
68
69
# File 'ext/uchardet/uchardet.c', line 61

/*
 * Turns the detector's input filter on or off.
 *
 * flag is interpreted with Ruby truthiness (RTEST): a truthy value enables
 * the filter via ucsdet_enableInputFilter, anything else disables it.
 *
 * Returns self.
 */
static VALUE
UCharsetDetector_set_input_filtered(VALUE self, VALUE flag)
{
    UCharsetDetector *csd;

    Data_Get_Struct(self, UCharsetDetector, csd);

    if (RTEST(flag)) {
        ucsdet_enableInputFilter(csd, TRUE);
    } else {
        ucsdet_enableInputFilter(csd, FALSE);
    }

    return self;
}

#input_filtered ⇒ Boolean

Return the input-filtering flag value of this charset detector.

Returns:

  • (Boolean)


44
45
46
47
48
49
50
51
# File 'ext/uchardet/uchardet.c', line 44

/*
 * Reports whether the detector's input filter is enabled.
 *
 * Returns Qtrue when ucsdet_isInputFilterEnabled reports a non-zero value
 * for this detector, Qfalse otherwise.
 */
static VALUE
UCharsetDetector_get_input_filtered(VALUE self)
{
    UCharsetDetector *csd;

    Data_Get_Struct(self, UCharsetDetector, csd);

    if (ucsdet_isInputFilterEnabled(csd)) {
        return Qtrue;
    }
    return Qfalse;
}

#text ⇒ Object

Get input text for this detector.



77
78
79
80
81
# File 'ext/uchardet/uchardet.c', line 77

/*
 * Reader for the input text: returns the current value of the @text
 * instance variable of self.
 */
static VALUE
UCharsetDetector_get_text(VALUE self)
{
    VALUE stored_text = rb_iv_get(self, "@text");

    return stored_text;
}

#text= ⇒ Object

Set input text for this detector.



89
90
91
92
93
# File 'ext/uchardet/uchardet.c', line 89

/*
 * Writer for the input text: stores text in the @text instance variable of
 * self and returns the result of rb_iv_set.
 *
 * Only the instance variable is written here; nothing is passed to the
 * underlying detector by this function.
 */
static VALUE
UCharsetDetector_set_text(VALUE self, VALUE text)
{
    VALUE stored_text = rb_iv_set(self, "@text", text);

    return stored_text;
}