In Elasticsearch, after setting the content field's analyzer to ik, how do I use a terms aggregation to build something like a hot-words feature?

Mapping of the index msg2017-04 (type 201038447):

{
  "msg2017-04": {
    "mappings": {
      "201038447": {
        "properties": {
          "@timestamp": {
            "type": "date"
          },
          "content": {
            "type": "text",
            "boost": 8,
            "analyzer": "ik_smart",
            "include_in_all": true
          },
          "createTime": {
            "type": "date"
          }
        }
      }
    }
  }
}

Settings of the index:

{
  "msg2017-04": {
    "settings": {
      "index": {
        "creation_date": "1492398234434",
        "number_of_shards": "5",
        "number_of_replicas": "1",
        "uuid": "yiGoDhL1T3WLexG79e5uQg",
        "version": {
          "created": "5020299"
        },
        "provided_name": "msg2017-04"
      }
    }
  }
}

Environment:

  • Linux

  • Elasticsearch 5.2.2

  • ik analyzer plugin installed

The aggregation request and its result:

// request
GET /msg2017-04/_search?pretty
{
    "size": 1,
    "aggs": {
      "fenci" : {
        "terms" : { 
          "field" : "content.ik_smart"
        }
      }
    }
}

// response
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 105,
    "max_score": 1,
    "hits": [
      {
        "_index": "msg2017-04",
        "_type": "7510570179@chatroom",
        "_id": "5067959408840553063",
        "_score": 1,
        "_source": {
          "wxid": "wxid_1idf7gf5jgh822",
          "msgId": "69",
          "msgSvrId": "5067959408840553063",
          "type": 0,
          "isSend": "1",
          "status": "2",
          "speakerId": "",
          "content": "rhh",
          "imei": "867464024215618",
          "room": "7510570179@chatroom",
          "roomName": "和湖光山色hzhzh",
          "roomOwner": "mikezhangsky",
          "roomMembers": "mikezhangsky;wxid_1idf7gf5jgh822;wxid_j56srpxywn5n22;wxid_90uy0wlz229e22;sun461629376",
          "roomSize": "5",
          "createTime": "2017-04-07T03:08:37",
          "@timestamp": "2017-04-17T03:14:15"
        }
      }
    ]
  },
  "aggregations": {
    "fenci": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": []
    }
  }
}
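
The buckets come back empty because the aggregation targets content.ik_smart, but the mapping above defines no sub-field with that name (ik_smart is only the analyzer of content, not a field), so the terms aggregation runs against a field that does not exist. To inspect what ik_smart actually produces, the _analyze API can be used; the text below is just a made-up example:

// hypothetical example text, not taken from the index
GET /msg2017-04/_analyze
{
  "analyzer": "ik_smart",
  "text": "实时统计一段时间内的热词"
}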

I want to aggregate on the Chinese tokens, so that I can get real-time statistics of the hot words within a given time period, similar to Weibo's trending searches.
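
A minimal sketch of one way to get token-level buckets, assuming Elasticsearch 5.x (as in the environment above) and that the extra heap usage is acceptable: a terms aggregation on a text field needs fielddata, which is disabled by default but is an updatable mapping parameter, so it can be switched on for the existing content field and the aggregation can then target content directly. The buckets then become the ik_smart tokens:

// enable fielddata on the existing text field; the other parameters are
// repeated unchanged from the current mapping
PUT /msg2017-04/_mapping/201038447
{
  "properties": {
    "content": {
      "type": "text",
      "boost": 8,
      "analyzer": "ik_smart",
      "include_in_all": true,
      "fielddata": true
    }
  }
}

// aggregate on the analyzed tokens of content
GET /msg2017-04/_search
{
  "size": 0,
  "aggs": {
    "fenci": {
      "terms": { "field": "content" }
    }
  }
}

Note that each terms bucket counts the documents containing a token, not the total number of occurrences of that token.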

1 Answer

This is covered in the official documentation on multi-fields: the city field below is indexed as analyzed text for full-text search, while the city.raw sub-field is an un-analyzed keyword that can be used for sorting and aggregations.

PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "city": {
          "type": "text",
          "fields": {
            "raw": { 
              "type":  "keyword"
            }
          }
        }
      }
    }
  }
}

PUT my_index/my_type/1
{
  "city": "New York"
}

PUT my_index/my_type/2
{
  "city": "York"
}

GET my_index/_search
{
  "query": {
    "match": {
      "city": "york" 
    }
  },
  "sort": {
    "city.raw": "asc" 
  },
  "aggs": {
    "Cities": {
      "terms": {
        "field": "city.raw" 
      }
    }
  }
}
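
city.raw in the example above is a keyword field, so aggregating on it buckets whole, untokenized values such as "New York" and "York"; that suits exact strings, but for per-word hot terms the buckets need to be the analyzed tokens, as in the fielddata sketch earlier in the thread. For the "hot words within a given time period" part, a range filter on createTime can be combined with the terms aggregation; the date window and size below are only illustrative:

GET /msg2017-04/_search
{
  "size": 0,
  "query": {
    "range": {
      "createTime": {
        "gte": "2017-04-01T00:00:00",
        "lte": "2017-04-17T23:59:59"
      }
    }
  },
  "aggs": {
    "hot_words": {
      "terms": {
        "field": "content",
        "size": 20
      }
    }
  }
}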

