Module: NKF

Defined in:
nkf.c

Constant Summary collapse

AUTO =

Auto-Detect

INT2FIX(_AUTO)
JIS =

ISO-2022-JP

INT2FIX(_JIS)
EUC =

EUC-JP

INT2FIX(_EUC)
SJIS =

Shift_JIS

INT2FIX(_SJIS)
BINARY =

BINARY

INT2FIX(_BINARY)
NOCONV =

No conversion

INT2FIX(_NOCONV)
ASCII =

ASCII

INT2FIX(_ASCII)
UTF8 =

UTF-8

INT2FIX(_UTF8)
UTF16 =

UTF-16

INT2FIX(_UTF16)
UTF32 =

UTF-32

INT2FIX(_UTF32)
UNKNOWN =

UNKNOWN

INT2FIX(_UNKNOWN)
VERSION =

Full version string of nkf

rb_str_new2(RUBY_NKF_VERSION)
NKF_VERSION =

Version of nkf

rb_str_new2(NKF_VERSION)
NKF_RELEASE_DATE =

Release date of nkf

rb_str_new2(NKF_RELEASE_DATE)

Class Method Summary collapse

Class Method Details

.guess1(str) ⇒ Integer

Returns the guessed encoding of str as an integer.

Algorithm described in: Ken Lunde. 'Understanding Japanese Information Processing' Sebastopol, CA: O'Reilly & Associates.

case NKF.guess1(input)
when NKF::JIS
  "ISO-2022-JP"
when NKF::SJIS
  "Shift_JIS"
when NKF::EUC
  "EUC-JP"
when NKF::UNKNOWN
  "UNKNOWN(ASCII)"
when NKF::BINARY
  "BINARY"
end

Returns:

  • (Integer)


# File 'nkf.c'

/*
 *  call-seq:
 *     NKF.guess1(str)  -> integer
 *
 *  Returns guessed encoding of _str_ as integer.
 *
 *  The string is scanned byte by byte and the first byte (or byte pair)
 *  that falls into a deciding range determines the result.  An empty
 *  string, or a string that is exhausted before any deciding byte is
 *  seen, yields NKF::UNKNOWN.
 *
 *  Algorithm described in:
 *  Ken Lunde. `Understanding Japanese Information Processing'
 *  Sebastopol, CA: O'Reilly & Associates.
 *
 *      case NKF.guess1(input)
 *      when NKF::JIS
 *        "ISO-2022-JP"
 *      when NKF::SJIS
 *        "Shift_JIS"
 *      when NKF::EUC
 *        "EUC-JP"
 *      when NKF::UNKNOWN
 *        "UNKNOWN(ASCII)"
 *      when NKF::BINARY
 *        "BINARY"
 *      end
 */

static VALUE
rb_nkf_guess1(obj, src)
  VALUE obj, src;
{
  /* p is the scan cursor, pend points one past the last byte of src. */
  unsigned char *p;
  unsigned char *pend;
  /* Bytes seen so far in a run of byte pairs whose first byte is 0xa4;
   * maintained by INCR below. */
  int sequence_counter = 0;

  StringValue(src);
  p = (unsigned char *)RSTRING(src)->ptr;
  pend = p + RSTRING(src)->len;
  if (p == pend) return INT2FIX(_UNKNOWN);

  /*
   * INCR advances p by one byte.  It contains hidden returns:
   *   - returns _UNKNOWN from the enclosing function when the end of the
   *     string is reached, so *p is always readable after INCR;
   *   - returns _EUC once sequence_counter reaches 6, i.e. after three
   *     consecutive byte pairs that each start with 0xa4.
   * The counter is reset whenever an odd-numbered byte of the run is not
   * 0xa4.
   * NOTE(review): 0xa4 is presumably the EUC-JP lead byte of the hiragana
   * row -- confirm against the referenced algorithm.
   * The macro is not #undef'd after the function.
   */
#define INCR do {\
      p++;\
      if (p==pend) return INT2FIX(_UNKNOWN);\
      sequence_counter++;\
      if (sequence_counter % 2 == 1 && *p != 0xa4)\
    sequence_counter = 0;\
      if (6 <= sequence_counter) {\
      sequence_counter = 0;\
      return INT2FIX(_EUC);\
      }\
  } while (0)

  /* Seed the run counter when the very first byte is already 0xa4. */
  if (*p == 0xa4)
    sequence_counter = 1;

  while (p<pend) {
    /* An ESC byte anywhere is taken as ISO-2022-JP. */
    if (*p == '\033') {
      return INT2FIX(_JIS);
    }
    /* Bytes below 0x06, DEL (0x7f) and 0xff are treated as binary data. */
    if (*p < '\006' || *p == 0x7f || *p == 0xff) {
      return INT2FIX(_BINARY);
    }
    /* 0x81-0x8d and 0x8f-0x9f decide for Shift_JIS immediately;
     * 0x8e is excluded here and handled separately below. */
    if (0x81 <= *p && *p <= 0x8d) {
      return INT2FIX(_SJIS);
    }
    if (0x8f <= *p && *p <= 0x9f) {
      return INT2FIX(_SJIS);
    }
    if (*p == 0x8e) {	/* SS2 */
      /* Decide on the byte following 0x8e; undecided pairs fall through
       * to the INCR at the bottom of the loop. */
      INCR;
      if ((0x40 <= *p && *p <= 0x7e) ||
      (0x80 <= *p && *p <= 0xa0) ||
      (0xe0 <= *p && *p <= 0xfc))
    return INT2FIX(_SJIS);
    }
    else if (0xa1 <= *p && *p <= 0xdf) {
      /* First byte in 0xa1-0xdf: inspect the byte(s) that follow. */
      INCR;
      if (0xf0 <= *p && *p <= 0xfe)
    return INT2FIX(_EUC);
      if (0xe0 <= *p && *p <= 0xef) {
    /* Keep scanning while bytes stay >= 0x40, looking for one that
     * decides; INCR may also return from inside this loop. */
    while (p < pend && *p >= 0x40) {
      if (*p >= 0x81) {
        if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
          return INT2FIX(_SJIS);
        }
        else if (0xfd <= *p && *p <= 0xfe) {
          return INT2FIX(_EUC);
        }
      }
      INCR;
    }
      }
      else if (*p <= 0x9f) {
    return INT2FIX(_SJIS);
      }
    }
    else if (0xf0 <= *p && *p <= 0xfe) {
      return INT2FIX(_EUC);
    }
    else if (0xe0 <= *p && *p <= 0xef) {
      /* First byte in 0xe0-0xef: the following byte decides, if at all. */
      INCR;
      if ((0x40 <= *p && *p <= 0x7e) ||
      (0x80 <= *p && *p <= 0xa0)) {
    return INT2FIX(_SJIS);
      }
      if (0xfd <= *p && *p <= 0xfe) {
    return INT2FIX(_EUC);
      }
    }
    /* Nothing decided for this byte: step to the next one. */
    INCR;
  }
  return INT2FIX(_UNKNOWN);
}

.guess2(str) ⇒ Integer

Returns the guessed encoding of str as an integer, using nkf's own detection routine.

case NKF.guess(input)
when NKF::ASCII
  "ASCII"
when NKF::JIS
  "ISO-2022-JP"
when NKF::SJIS
  "Shift_JIS"
when NKF::EUC
  "EUC-JP"
when NKF::UTF8
  "UTF-8"
when NKF::UTF16
  "UTF-16"
when NKF::UNKNOWN
  "UNKNOWN"
when NKF::BINARY
  "BINARY"
end

Returns:

  • (Integer)


# File 'nkf.c'

/*
 *  call-seq:
 *     NKF.guess2(str)  -> integer
 *
 *  Guesses the encoding of _str_ with the nkf routine and returns it as
 *  one of the NKF integer constants.
 *
 *     case NKF.guess2(input)
 *     when NKF::ASCII
 *       "ASCII"
 *     when NKF::JIS
 *       "ISO-2022-JP"
 *     when NKF::SJIS
 *       "Shift_JIS"
 *     when NKF::EUC
 *       "EUC-JP"
 *     when NKF::UTF8
 *       "UTF-8"
 *     when NKF::UTF16
 *       "UTF-16"
 *     when NKF::UNKNOWN
 *       "UNKNOWN"
 *     when NKF::BINARY
 *       "BINARY"
 *     end
 *
 *  NKF::BINARY is returned when the input code is flagged as mixed;
 *  NKF::UNKNOWN is returned for a detected code name other than the
 *  ones listed above.
 */

static VALUE
rb_nkf_guess2(obj, src)
  VALUE obj, src;
{
  const char *detected;

  reinit();

  /* Point the converter's input at the bytes of src. */
  input_ctr = 0;
  StringValue(src);
  input = (unsigned char *)RSTRING(src)->ptr;
  i_len = RSTRING(src)->len;

  if (x0201_f == WISH_TRUE) {
    x0201_f = iso2022jp_f ? NO_X0201 : TRUE;
  }

  /* kanji_convert() runs with guess_f raised; the flag is lowered again
   * right afterwards. */
  guess_f = TRUE;
  kanji_convert(NULL);
  guess_f = FALSE;

  if (is_inputcode_mixed) return INT2FIX(_BINARY);

  /* Translate the detected code name into the matching constant. */
  detected = input_codename;
  if (detected[0] == '\0')                   return INT2FIX(_ASCII);
  if (strcmp(detected, "ISO-2022-JP") == 0)  return INT2FIX(_JIS);
  if (strcmp(detected, "EUC-JP") == 0)       return INT2FIX(_EUC);
  if (strcmp(detected, "Shift_JIS") == 0)    return INT2FIX(_SJIS);
  if (strcmp(detected, "UTF-8") == 0)        return INT2FIX(_UTF8);
  if (strcmp(detected, "UTF-16") == 0)       return INT2FIX(_UTF16);

  /* A non-empty name that matched none of the above. */
  return INT2FIX(_UNKNOWN);
}

.nkf(opt, str) ⇒ String

Convert str and return converted result. Conversion details are specified by opt as String.

require 'nkf'
output = NKF.nkf("-s", input)

Note: By default, nkf decodes MIME-encoded strings. If you do not want the input to be decoded, use NKF.nkf with the -m0 flag.

Returns:



# File 'nkf.c'

/*
 *  call-seq:
 *     NKF.nkf(opt, str)   -> string
 *
 *  Convert _str_ and return converted result.
 *  Conversion details are specified by _opt_ as String.
 *
 *     require 'nkf'
 *     output = NKF.nkf("-s", input)
 *
 *  *Note*
 *  By default, nkf decodes MIME encoded strings.
 *  If you do not want the input to be decoded, use NKF.nkf with the
 *  <b>-m0</b> flag.
 */

static VALUE
rb_nkf_kconv(obj, opt, src)
  VALUE obj, opt, src;
{
  char *opt_ptr;
  /* Never read back.  NOTE(review): presumably keeps the freshly created
   * result string visible on the stack for the GC -- confirm. */
  volatile VALUE v;

  /* Reset converter state, then apply the options given in opt. */
  reinit();
  StringValue(opt);
  opt_ptr = RSTRING(opt)->ptr;
  nkf_split_options(opt_ptr);

  incsize = INCSIZE;

  /* Point the converter's input at the bytes of src. */
  input_ctr = 0;
  StringValue(src);
  input = (unsigned char *)RSTRING(src)->ptr;
  i_len = RSTRING(src)->len;
  /* Initial output buffer: three times the input length plus 10 bytes. */
  result = rb_str_new(0, i_len*3 + 10);
  v = result;

  output_ctr = 0;
  output     = (unsigned char *)RSTRING(result)->ptr;
  o_len      = RSTRING(result)->len;
  *output    = '\0';

  if(x0201_f == WISH_TRUE)
    x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);

  kanji_convert(NULL);
  /* Terminate the string and set its length to the number of bytes
   * written (output_ctr). */
  RSTRING(result)->ptr[output_ctr] = '\0';
  RSTRING(result)->len = output_ctr;
  /* Carry src's taint state over to the result. */
  OBJ_INFECT(result, src);

  return result;
}