安装 ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip
安装
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip
安装后需要重启elasticsearch服务
查看当前已安装插件
GET _cat/plugins
结果:
node01 analysis-ik 5.6.4
node01 analysis-pinyin 5.6.4
测试中文分词器,支持ik_max_word和ik_smart两种方式
GET _analyze { "analyzer":"ik_max_word", "text":"中华人民共和国国歌" }
结果 { "tokens": [ { "token": "中华人民共和国", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 0 }, { "token": "中华人民", "start_offset": 0, "end_offset": 4, "type": "CN_WORD", "position": 1 }, { "token": "中华", "start_offset": 0, "end_offset": 2, "type": "CN_WORD", "position": 2 }, { "token": "华人", "start_offset": 1, "end_offset": 3, "type": "CN_WORD", "position": 3 }, { "token": "人民共和国", "start_offset": 2, "end_offset": 7, "type": "CN_WORD", "position": 4 }, { "token": "人民", "start_offset": 2, "end_offset": 4, "type": "CN_WORD", "position": 5 }, { "token": "共和国", "start_offset": 4, "end_offset": 7, "type": "CN_WORD", "position": 6 }, { "token": "共和", "start_offset": 4, "end_offset": 6, "type": "CN_WORD", "position": 7 }, { "token": "国", "start_offset": 6, "end_offset": 7, "type": "CN_CHAR", "position": 8 }, { "token": "国歌", "start_offset": 7, "end_offset": 9, "type": "CN_WORD", "position": 9 } ] }
将上面请求中的 analyzer 改为 ik_smart,则会尽可能少地返回词语: { "tokens": [ { "token": "中华人民共和国", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 0 }, { "token": "国歌", "start_offset": 7, "end_offset": 9, "type": "CN_WORD", "position": 1 } ] }
ik分词器支持自定义词库
vi config/IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"> <properties> <comment>IK Analyzer 扩展配置</comment> <!--用户可以在这里配置自己的扩展字典 --> <entry key="ext_dict">zhouls.dic</entry> <!--用户可以在这里配置自己的扩展停止词字典--> <entry key="ext_stopwords"></entry> <!--用户可以在这里配置远程扩展字典 --> <!-- <entry key="remote_ext_dict">words_location</entry> --> <!--用户可以在这里配置远程扩展停止词字典--> <!-- <entry key="remote_ext_stopwords">words_location</entry> --> </properties>
#配置完成需要重启服务
简单测试拼音分词
PUT test08
{
"index": {
"analysis": {
"analyzer": {
"pinyin_analyzer": {
"tokenizer": "my_pinyin",
"filter": "word_delimiter"
}
},
"tokenizer": {
"my_pinyin": {
"type": "pinyin",
"first_letter": "none",
"padding_char": " "
}
}
}
}
}
GET test08/_analyze
{
"text":"刘德华",
"analyzer":"pinyin_analyzer"
}
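结果(注意:上面配置中的 first_letter、padding_char 看起来是旧版拼音插件的参数名,5.6.4 版本中可能不生效,所以结果里仍然包含首字母 token "ldh";如果不需要首字母,可尝试改用 "keep_first_letter": false,具体请以插件 README 为准)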
{
"tokens": [
{
"token": "liu",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "ldh",
"start_offset": 0,
"end_offset": 3,
"type": "word",
"position": 0
},
{
"token": "de",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "hua",
"start_offset": 2,
"end_offset": 3,
"type": "word",
"position": 2
}
]
}
同时支持中文和拼音的分词器
PUT test06 { "settings":{ "number_of_shards":"1", "index.refresh_interval":"15s", "index":{ "analysis":{ "analyzer":{ "ik_pinyin_analyzer":{ "type":"custom", "tokenizer":"ik_smart", "filter":"pinyin_filter" } }, "filter":{ "pinyin_filter":{ "type":"pinyin", "keep_first_letter": false } } } } }, "mappings": { "doc":{ "properties": { "name":{ "type": "text", "analyzer": "ik_pinyin_analyzer" } } } } } POST test06/_analyze { "analyzer": "ik_pinyin_analyzer", "text":"中华人民共和国国歌" }
结果 { "tokens": [ { "token": "zhong", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 0 }, { "token": "hua", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 1 }, { "token": "ren", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 2 }, { "token": "min", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 3 }, { "token": "gong", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 4 }, { "token": "he", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 5 }, { "token": "guo", "start_offset": 0, "end_offset": 7, "type": "CN_WORD", "position": 6 }, { "token": "guo", "start_offset": 7, "end_offset": 9, "type": "CN_WORD", "position": 7 }, { "token": "ge", "start_offset": 7, "end_offset": 9, "type": "CN_WORD", "position": 8 } ] }
参考文档:
https://blog.csdn.net/u013905744/article/details/80935846
https://www.cnblogs.com/xing901022/p/5910139.html
https://blog.csdn.net/qq_28018283/article/details/80396937