有一天研发反馈说Grafana的Panel出现了突刺
image.png

将irate改成rate之后,突刺少了很多。下面直接看一下相关的代码:
https://github.com/prometheus...

先看下rate的

// extrapolatedRate is shown here as an excerpt (the "... ..." line marks code
// that was left out, including the definitions of samples, ms, sampledInterval,
// durationToStart, durationToEnd and averageDurationBetweenSamples).
//
// The visible part takes the difference between the last and the first sample
// of the range, adds back the pre-reset value for every counter reset when
// isCounter is set, scales the result by an extrapolation factor, and, when
// isRate is set, divides by the range length in seconds. The result is
// appended to enh.Out as a single Sample.
func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNodeHelper, isCounter, isRate bool) Vector {
    ... ...
    if samples.Points[0].H != nil {
       resultHistogram = histogramRate(samples.Points, isCounter)
       if resultHistogram == nil {
          // Points are a mix of floats and histograms, or the histograms
          // are not compatible with each other.
          // TODO(beorn7): find a way of communicating the exact reason
          return enh.Out
       }
    } else {
       /*
          resultValue starts as the last sample value minus the first sample value.
          The loop below corrects it for counter resets: when the machine
          restarts, a Counter starts counting from 0 again.
          Example: six values within 60 seconds, with a reset after the fourth one:
          2 4 6 8 2 4
          resultValue = 4 - 2 = 2
       */
       resultValue = samples.Points[len(samples.Points)-1].V - samples.Points[0].V
       prevValue := samples.Points[0].V
       // We have to iterate through everything even in the non-counter
       // case because we have to check that everything is a float.
       // TODO(beorn7): Find a way to check that earlier, e.g. by
       // handing in a []FloatPoint and a []HistogramPoint separately.
       for _, currPoint := range samples.Points[1:] {
          if currPoint.H != nil {
             return nil // Range contains a mix of histograms and floats.
          }
          if !isCounter {
             continue
          }
          /*
             In the example, 2 < 8, so a reset is detected and the value seen
             just before the reset is added back:
             resultValue += 8 => 10
          */
          if currPoint.V < prevValue {
             resultValue += prevValue
          }
          prevValue = currPoint.V
       }
       /*
          resultValue ends up as 10. Had there been no reset after the fourth
          sample, the series would have continued as
          2 4 6 8 10 12
          and resultValue = 12 - 2 = 10, i.e. the same result.
       */
    }
    
    
    // If the first/last samples are close to the boundaries of the range,
    // extrapolate the result. This is as we expect that another sample
    // will exist given the spacing between samples we've seen thus far,
    // with an allowance for noise.
    /*
       extrapolationThreshold: 1.1 times the average interval between samples.
       extrapolateToInterval: starts as sampledInterval (per the original note,
       the time between the first and the last sample; its definition is in
       the elided part).
    */
    extrapolationThreshold := averageDurationBetweenSamples * 1.1
    extrapolateToInterval := sampledInterval
    
    
    // Extend extrapolateToInterval towards both range boundaries: by the full
    // gap when it is below the threshold, otherwise by half an average interval.
    if durationToStart < extrapolationThreshold {
       extrapolateToInterval += durationToStart
    } else {
       extrapolateToInterval += averageDurationBetweenSamples / 2
    }
    if durationToEnd < extrapolationThreshold {
       extrapolateToInterval += durationToEnd
    } else {
       extrapolateToInterval += averageDurationBetweenSamples / 2
    }
    // The scaling factor is the extrapolated interval divided by sampledInterval.
    factor := extrapolateToInterval / sampledInterval
    if isRate {
       factor /= ms.Range.Seconds()
    }
    if resultHistogram == nil {
       /* The rate function takes this branch. Read together with the code
       above, the goal is to extrapolate resultValue; with isRate set this is
       effectively
       resultValue = (resultValue / ms.Range.Seconds()) * (extrapolateToInterval / sampledInterval)
       */
       resultValue *= factor
    } else {
       resultHistogram.Scale(factor)
    }
    
    
    return append(enh.Out, Sample{
       Point: Point{V: resultValue, H: resultHistogram},
    })
}

继续看irate的

// instantValue derives a single value from the two most recent points of the
// first series in vals[0] and appends it to out.
//
// When isRate is true, a drop between the two points is treated as a counter
// reset (the last value alone is taken as the increase) and the result is
// converted to a per-second figure. When isRate is false, the plain difference
// between the two points is used.
//
// out is returned unchanged if the series has fewer than two points or if both
// points carry the same timestamp.
func instantValue(vals []parser.Value, out Vector, isRate bool) Vector {
    series := vals[0].(Matrix)[0]
    pts := series.Points
    n := len(pts)
    // Two points are the minimum needed; otherwise emit nothing for this series.
    if n < 2 {
        return out
    }

    last, prev := pts[n-1], pts[n-2]

    // Default: difference between the last sample and the one before it.
    increase := last.V - prev.V
    if isRate && last.V < prev.V {
        // Counter reset (for example a service restart): the counter started
        // again from zero, so the last value is the whole increase.
        increase = last.V
    }

    elapsed := last.T - prev.T
    if elapsed == 0 {
        // Identical timestamps would mean dividing by zero below.
        return out
    }

    if isRate {
        // The division by 1000 turns the timestamp delta into the divisor for
        // a per-second value.
        increase /= float64(elapsed) / 1000
    }

    // Ad-hoc debug output used for the investigation: dumps the raw points and
    // the computed value between two marker lines.
    fmt.Println(1111)
    fmt.Printf("samples.Points:%v\n", pts)
    fmt.Printf("resultValue:%v\n", increase)
    fmt.Println(2222)

    result := Sample{
        Point: Point{V: increase},
    }
    return append(out, result)
}

此时直接访问Prometheus打印出出问题时间点的原始数据

./promtool query range 'http://127.0.0.1:10902/' 'sum(irate(xxx_origin_count{adtype="video", cloud="aws", country="other", env="release", idType="backup_system_id", job="hb", os="android"}[10m])) by(region)' --start=1676026800 --end=1676028000

image.png
第三个点是突刺的起点,手动计算下这个Point的value是多少
image.png

结合上面的分析可以得到如下的结论:

  1. rate有外推(extrapolation)的逻辑,会遍历区间内的所有Sample,考虑到了Counter重置(例如重启)和采样时间偏差,结果更加平滑;
  2. irate只计算最后两个Sample,简单粗暴,更节省资源,但突刺会比rate明显;

根据以往经验,还是建议使用rate函数,irate在某些情况下突刺较多。本文中的突刺是由于公司内部有一套聚合逻辑,导致原始数据波动较大;而重启这类情况有时也是避免不了的,此时使用rate可以避免突刺。

https://pshizhsysu.gitbook.io...


六铉
1 声望0 粉丝

没有读懂源码以前,无脑试错总是效率很低的!