一、参考
二、ES 中的窗口聚合函数
2.1 索引数据
订单ID | 订单价格 | 订单时间 | 用户 |
---|---|---|---|
1 | 20 | 2021/09/01 01:00:00 | u1 |
2 | 30 | 2021/09/01 01:01:00 | u2 |
3 | 200 | 2021/09/01 01:01:30 | u1 |
4 | 300 | 2021/09/01 01:02:00 | u2 |
5 | 10 | 2021/09/01 01:02:30 | u1 |
6 | 5 | 2021/09/01 01:03:00 | u1 |
7 | 100 | 2021/09/01 01:03:30 | u2 |
8 | 1000 | 2021/09/01 01:04:00 | u2 |
PUT test-order/
{
"mappings": {
"properties": {
"order_id": {
"type": "keyword"
},
"price": {
"type": "long"
},
"username": {
"type": "keyword"
},
"ts": {
"type": "date"
}
}
}
}
POST _bulk
{"index":{"_index":"test-order"}}
{"order_id":"1", "price": 20, "username": "u1", "ts": "2021-09-01T01:00:00Z"}
{"index":{"_index":"test-order"}}
{"order_id":"2", "price": 30, "username": "u2", "ts": "2021-09-01T01:01:00Z"}
{"index":{"_index":"test-order"}}
{"order_id":"3", "price": 200, "username": "u1", "ts": "2021-09-01T01:01:30Z"}
{"index":{"_index":"test-order"}}
{"order_id":"4", "price": 300, "username": "u2", "ts": "2021-09-01T01:02:00Z"}
{"index":{"_index":"test-order"}}
{"order_id":"5", "price": 10, "username": "u1", "ts": "2021-09-01T01:02:30Z"}
{"index":{"_index":"test-order"}}
{"order_id":"6", "price": 5, "username": "u1", "ts": "2021-09-01T01:03:00Z"}
{"index":{"_index":"test-order"}}
{"order_id":"7", "price": 100, "username": "u2", "ts": "2021-09-01T01:03:30Z"}
{"index":{"_index":"test-order"}}
{"order_id":"8", "price": 1000, "username": "u2", "ts": "2021-09-01T01:04:00Z"}
2.2 普通的时间聚合
GET test-order/_search
{
"size": 0,
"query": {
"range": {
"ts": {
"gte": "2021-09-01T01:00:00Z",
"lte": "2021-09-01T01:10:00Z"
}
}
},
"aggs": {
"a1": {
"date_histogram": {
"field": "ts",
"fixed_interval": "30s"
},
"aggs": {
"a2": {
"sum": {
"field": "price"
}
}
}
}
}
}
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"a1" : {
"buckets" : [
{
"key_as_string" : "2021-09-01T01:00:00.000Z",
"key" : 1630458000000,
"doc_count" : 1,
"a2" : {
"value" : 20.0
}
},
{
"key_as_string" : "2021-09-01T01:00:30.000Z",
"key" : 1630458030000,
"doc_count" : 0,
"a2" : {
"value" : 0.0
}
},
{
"key_as_string" : "2021-09-01T01:01:00.000Z",
"key" : 1630458060000,
"doc_count" : 1,
"a2" : {
"value" : 30.0
}
},
{
"key_as_string" : "2021-09-01T01:01:30.000Z",
"key" : 1630458090000,
"doc_count" : 1,
"a2" : {
"value" : 200.0
}
},
{
"key_as_string" : "2021-09-01T01:02:00.000Z",
"key" : 1630458120000,
"doc_count" : 1,
"a2" : {
"value" : 300.0
}
},
{
"key_as_string" : "2021-09-01T01:02:30.000Z",
"key" : 1630458150000,
"doc_count" : 1,
"a2" : {
"value" : 10.0
}
},
{
"key_as_string" : "2021-09-01T01:03:00.000Z",
"key" : 1630458180000,
"doc_count" : 1,
"a2" : {
"value" : 5.0
}
},
{
"key_as_string" : "2021-09-01T01:03:30.000Z",
"key" : 1630458210000,
"doc_count" : 1,
"a2" : {
"value" : 100.0
}
},
{
"key_as_string" : "2021-09-01T01:04:00.000Z",
"key" : 1630458240000,
"doc_count" : 1,
"a2" : {
"value" : 1000.0
}
}
]
}
}
}
2.3 pipeline聚合中实现窗口聚合
GET test-order/_search
{
"size": 0,
"aggs": {
"a1": {
"date_histogram": {
"field": "ts",
"fixed_interval": "30s"
},
"aggs": {
"the_sum": {
"sum": {
"field": "price"
}
},
"the_window": {
"moving_fn": {
"buckets_path": "the_sum",
"window": 2,
"script": "MovingFunctions.sum(values)",
"shift": 1,
"gap_policy": "keep_values"
}
}
}
}
}
}
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 8,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"a1" : {
"buckets" : [
{
"key_as_string" : "2021-09-01T01:00:00.000Z",
"key" : 1630458000000,
"doc_count" : 1,
"the_sum" : {
"value" : 20.0
},
"the_window" : {
"value" : 20.0
}
},
{
"key_as_string" : "2021-09-01T01:00:30.000Z",
"key" : 1630458030000,
"doc_count" : 0,
"the_sum" : {
"value" : 0.0
},
"the_window" : {
"value" : 20.0
}
},
{
"key_as_string" : "2021-09-01T01:01:00.000Z",
"key" : 1630458060000,
"doc_count" : 1,
"the_sum" : {
"value" : 30.0
},
"the_window" : {
"value" : 30.0
}
},
{
"key_as_string" : "2021-09-01T01:01:30.000Z",
"key" : 1630458090000,
"doc_count" : 1,
"the_sum" : {
"value" : 200.0
},
"the_window" : {
"value" : 230.0
}
},
{
"key_as_string" : "2021-09-01T01:02:00.000Z",
"key" : 1630458120000,
"doc_count" : 1,
"the_sum" : {
"value" : 300.0
},
"the_window" : {
"value" : 500.0
}
},
{
"key_as_string" : "2021-09-01T01:02:30.000Z",
"key" : 1630458150000,
"doc_count" : 1,
"the_sum" : {
"value" : 10.0
},
"the_window" : {
"value" : 310.0
}
},
{
"key_as_string" : "2021-09-01T01:03:00.000Z",
"key" : 1630458180000,
"doc_count" : 1,
"the_sum" : {
"value" : 5.0
},
"the_window" : {
"value" : 15.0
}
},
{
"key_as_string" : "2021-09-01T01:03:30.000Z",
"key" : 1630458210000,
"doc_count" : 1,
"the_sum" : {
"value" : 100.0
},
"the_window" : {
"value" : 105.0
}
},
{
"key_as_string" : "2021-09-01T01:04:00.000Z",
"key" : 1630458240000,
"doc_count" : 1,
"the_sum" : {
"value" : 1000.0
},
"the_window" : {
"value" : 1100.0
}
}
]
}
}
}
三、flink中的 window 函数
3.1 flink 创建表
order.csv
如下
1,u1,20,2021-09-01 01:00:00
2,u2,30,2021-09-01 01:01:00
3,u1,200,2021-09-01 01:01:30
4,u2,300,2021-09-01 01:02:00
5,u1,10,2021-09-01 01:02:30
6,u1,5,2021-09-01 01:03:00
7,u2,100,2021-09-01 01:03:30
8,u2,1000,2021-09-01 01:04:00
/*
 * Test order table backed by a local CSV file.
 * Columns are declared in the same order as the fields of order.csv
 * (order id, user name, price, timestamp).
 */
CREATE TABLE test_order (
    order_id STRING,
    username STRING,
    price    INT,
    ts       TIMESTAMP(3),
    -- ts is the event-time attribute; its watermark trails ts by 1 second
    WATERMARK FOR ts AS ts - INTERVAL '1' SECOND
) WITH (
    'connector' = 'filesystem',
    'path' = '/Users/yz/work/github/flinkLearn/csvs/order.csv',
    'format' = 'csv'
);
3.2 滚动窗口聚合 tumble
-- Tumbling window: total price per 30-second window over ts,
-- reported together with each window's start and end time.
SELECT
    TUMBLE_START(ts, INTERVAL '30' SECOND) AS t_start,
    TUMBLE_END(ts, INTERVAL '30' SECOND)   AS t_end,
    SUM(price)                             AS sum_price
FROM test_order
GROUP BY
    TUMBLE(ts, INTERVAL '30' SECOND);
3.3 滑动窗口聚合 hop
-- Hopping (sliding) window: slide (30s) < window size (1m).
-- In this case the first window opens earlier than the first row;
-- it is moved forward in time by (window size - slide).
SELECT
    HOP_START(ts, INTERVAL '30' SECOND, INTERVAL '1' MINUTE) AS h_start,
    HOP_END(ts, INTERVAL '30' SECOND, INTERVAL '1' MINUTE)   AS h_end,
    SUM(price)                                               AS sum_price
FROM test_order
GROUP BY
    HOP(ts, INTERVAL '30' SECOND, INTERVAL '1' MINUTE);
-- Hopping (sliding) window: slide (30s) > window size (10s).
-- NOTE(review): with a slide larger than the size, consecutive windows leave
-- gaps; rows falling into a gap presumably belong to no window - confirm.
SELECT
    HOP_START(ts, INTERVAL '30' SECOND, INTERVAL '10' SECOND) AS h_start,
    HOP_END(ts, INTERVAL '30' SECOND, INTERVAL '10' SECOND)   AS h_end,
    SUM(price)                                                AS sum_price
FROM test_order
GROUP BY
    HOP(ts, INTERVAL '30' SECOND, INTERVAL '10' SECOND);
四、比较
| | ES | Flink |
---|---|---|
简单时间聚合 | 使用 date_histogram 实现 | 使用 tumble 滚动窗口函数实现。⚠️ 如果某个时间段内没有数据,Flink 不会输出该窗口,而 ES 会自动补出空桶(doc_count 为 0),例如:2021-09-01T01:00:30.000Z |
滑动窗口 | 使用 moving_fn pipeline 聚合函数实现 | 使用 hop 滑动窗口函数实现。⚠️ 当滑动步长小于窗口时长时,在数据的开始和结束处会多出窗口 |
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。