Kuromoji

A Ruby wrapper for the Kuromoji Japanese morphological analyzer.

Requires Java.

Installation

Add this line to your application's Gemfile:

gem 'kuromoji-ruby'

And then execute:

$ bundle

Or install it yourself as:

$ gem install kuromoji-ruby

Usage

You have to set the JAVA_HOME environment variable:

# .bashrc
# This is an OS X example.
export JAVA_HOME=`/usr/libexec/java_home`

and then run:

Kuromoji.tokenize("あそこにいるのチャウチャウちゃうんちゃう")
#=> {"あそこ"=>"名詞,代名詞,一般,*,*,*,あそこ,アソコ,アソコ",
 "に"=>"助詞,格助詞,一般,*,*,*,に,ニ,ニ",
 "いる"=>"動詞,自立,*,*,一段,基本形,いる,イル,イル",
 "の"=>"助詞,連体化,*,*,*,*,の,ノ,ノ",
 "チャウチャウ"=>"名詞,一般,*,*,*,*,*",
 "ちゃう"=>"動詞,自立,*,*,五段・ワ行促音便,基本形,ちゃう,チャウ,チャウ",
 "ん"=>"名詞,非自立,一般,*,*,*,ん,ン,ン"}
Kuromoji.reading("吉田篤")
#=> {"吉田"=>"ヨシダ", "篤"=>"アツシ"}

Command line interface:

Commands:
  kuromoji all_features    # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -a
  kuromoji help [COMMAND]  # Describe available commands or one specific command
  kuromoji reading         # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -r
  kuromoji tokenize        # echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -t

Example

Output is in JSON format:

$ echo '関西国際空港は、大阪府大阪市の南西38kmにまたがる会社管理空港である' | kuromoji -t | jq .

#[
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 0,
#    "surface_form": "関西国際空港",
#    "base_form": "関西国際空港",
#    "reading": "カンサイコクサイクウコウ",
#    "parts_of_speech": "名詞,固有名詞,組織,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 6,
#    "surface_form": "は",
#    "base_form": "は",
#    "reading": "ハ",
#    "parts_of_speech": "助詞,係助詞,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 7,
#    "surface_form": "、",
#    "base_form": "、",
#    "reading": "、",
#    "parts_of_speech": "記号,読点,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 8,
#    "surface_form": "大阪",
#    "base_form": "大阪",
#    "reading": "オオサカ",
#    "parts_of_speech": "名詞,固有名詞,地域,一般"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 10,
#    "surface_form": "府",
#    "base_form": "府",
#    "reading": "フ",
#    "parts_of_speech": "名詞,接尾,地域,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 11,
#    "surface_form": "大阪",
#    "base_form": "大阪",
#    "reading": "オオサカ",
#    "parts_of_speech": "名詞,固有名詞,地域,一般"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 13,
#    "surface_form": "市",
#    "base_form": "市",
#    "reading": "シ",
#    "parts_of_speech": "名詞,接尾,地域,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 14,
#    "surface_form": "の",
#    "base_form": "の",
#    "reading": "ノ",
#    "parts_of_speech": "助詞,連体化,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 15,
#    "surface_form": "南西",
#    "base_form": "南西",
#    "reading": "ナンセイ",
#    "parts_of_speech": "名詞,一般,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": false,
#    "position": 17,
#    "surface_form": "38",
#    "base_form": null,
#    "reading": null,
#    "parts_of_speech": "名詞,数,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": false,
#    "position": 19,
#    "surface_form": "km",
#    "base_form": null,
#    "reading": null,
#    "parts_of_speech": "名詞,一般,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 21,
#    "surface_form": "に",
#    "base_form": "に",
#    "reading": "ニ",
#    "parts_of_speech": "助詞,格助詞,一般,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 22,
#    "surface_form": "またがる",
#    "base_form": "またがる",
#    "reading": "マタガル",
#    "parts_of_speech": "動詞,自立,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 26,
#    "surface_form": "会社",
#    "base_form": "会社",
#    "reading": "カイシャ",
#    "parts_of_speech": "名詞,一般,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 28,
#    "surface_form": "管理",
#    "base_form": "管理",
#    "reading": "カンリ",
#    "parts_of_speech": "名詞,サ変接続,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 30,
#    "surface_form": "空港",
#    "base_form": "空港",
#    "reading": "クウコウ",
#    "parts_of_speech": "名詞,一般,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 32,
#    "surface_form": "で",
#    "base_form": "だ",
#    "reading": "デ",
#    "parts_of_speech": "助動詞,*,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": true,
#    "position": 33,
#    "surface_form": "ある",
#    "base_form": "ある",
#    "reading": "アル",
#    "parts_of_speech": "助動詞,*,*,*"
#  },
#  {
#    "is_user": false,
#    "is_known": false,
#    "position": 35,
#    "surface_form": "\n",
#    "base_form": null,
#    "reading": null,
#    "parts_of_speech": "記号,空白,*,*"
#  }
#]

Contributing

  1. Fork it ( http://github.com/<github-username>/kuromoji/fork )
  2. Create your feature branch (git checkout -b my-new-feature)
  3. Commit your changes (git commit -am 'Add some feature')
  4. Push to the branch (git push origin my-new-feature)
  5. Create new Pull Request