Elasticsearch 同义词导致 start_offset 改变是怎么回事?

设置的同义词如下:

托尼-克罗斯=>托尼-克罗斯,克罗斯,托尼克罗斯,托尼,tk

index setting如下:

{
  "settings": {
    "index": {
      "analysis": {
        "filter": {
          "my_synonym": {
            "type": "synonym",
            "synonyms_path": "my_synonym.txt",
            "lenient": "true"
          }
        },
        "analyzer": {
          "my_ik_analyzer": {
            "filter": [
              "my_synonym"
            ],
            "type": "custom",
            "tokenizer": "my_ik_token"
          }
        },
        "tokenizer": {
          "my_ik_token": {
            "type": "ik_max_word"
          }
        }
      }
    }
  }
}

tokenizer(my_ik_token)对“托尼-克罗斯”的分词结果为:

{  
    "tokens":[  
        {  
            "token":"托尼",  
            "start_offset":0,  
            "end_offset":2,  
            "type":"CN_WORD",  
            "position":0  
        },  
        {  
            "token":"克罗斯",  
            "start_offset":3,  
            "end_offset":6,  
            "type":"CN_WORD",  
            "position":1  
        },  
        {  
            "token":"罗斯",  
            "start_offset":4,  
            "end_offset":6,  
            "type":"CN_WORD",  
            "position":2  
        }  
    ]  
}

加上了 synonym filter 的 analyzer(my_ik_analyzer)分词结果为:

{
    "tokens": [
        {
            "token": "托尼",
            "start_offset": 0,
            "end_offset": 2,
            "type": "SYNONYM",
            "position": 0
        },
        {
            "token": "克罗斯",
            "start_offset": 0,
            "end_offset": 2,
            "type": "SYNONYM",
            "position": 0
        },
        {
            "token": "托尼",
            "start_offset": 0,
            "end_offset": 2,
            "type": "SYNONYM",
            "position": 0
        },
        {
            "token": "托尼",
            "start_offset": 0,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 0
        },
        {
            "token": "tk",
            "start_offset": 0,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 0
        },
        {
            "token": "克罗斯",
            "start_offset": 3,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 1
        },
        {
            "token": "罗斯",
            "start_offset": 3,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 1
        },
        {
            "token": "尼克",
            "start_offset": 3,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 1
        },
        {
            "token": "罗斯",
            "start_offset": 4,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 2
        },
        {
            "token": "克罗斯",
            "start_offset": 4,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 2
        },
        {
            "token": "罗斯",
            "start_offset": 4,
            "end_offset": 6,
            "type": "SYNONYM",
            "position": 3
        }
    ]
}

可以看到克罗斯出现了两次,其中有一次的 start_offset 和 end_offset 是错误的。

阅读 1.3k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题