用 C 语言最快实现正弦、余弦和平方根（不需要很精确）

我在谷歌上搜索了过去一个小时的问题，但只有泰勒级数或一些示例代码太慢或根本无法编译。好吧，我在谷歌上找到的大多数答案是“谷歌它，它已经被问过了”，但遗憾 的是它不是……

我在低端 Pentium 4 上分析我的游戏，发现大约 85% 的执行时间浪费在计算正弦、余弦和平方根（来自 Visual Studio 中的标准 C++ 库），这似乎严重依赖 CPU（在我的 I7 上，相同的功能只获得了 5% 的执行时间，而且游戏速度 _更快_）。我不能优化这三个函数，也不能一次计算正弦和余弦（相互依赖），但我的模拟不需要太准确的结果，所以我可以接受更快的近似。

所以，问题是：在 C++ 中计算浮点数的正弦、余弦和平方根的最快方法是什么？

编辑查找表更痛苦，因为在现代 CPU 上产生的 Cache Miss 比泰勒级数更昂贵。这些天 CPU 的速度实在是太快了，而缓存却不是。

我犯了一个错误，虽然我需要计算泰勒级数的几个阶乘，现在我看到它们可以实现为常数。

所以更新的问题：平方根是否也有任何快速优化？

编辑2

我正在使用平方根来计算距离，而不是归一化 - 不能使用快速逆平方根算法（如评论中所指出的： http ://en.wikipedia.org/wiki/Fast_inverse_square_root

编辑3

我也不能对平方距离进行操作，我需要精确的距离进行计算

原文由 PiotrK 发布，翻译遵循 CC BY-SA 4.0 许可协议

最快取决于您的需求。这是 [-1,1] 输入范围的特定情况的余弦实现：

与 std::cos 的平均距离为 0.078 ULPS，在 -1 输入附近最大为 1 ULPS
每个 cos 1.4 个周期 (AVX2)
每个 cos 0.8 个周期（AVX512）：~0.25 纳秒
- ~4 GFLOPS 的 cos
- 约 36 GFLOPS 的乘法和加法运算
- 看起来像内存带宽瓶颈（小型工作负载为 0.56 个周期）
除了 squared-x 没有额外的内存需求

https://godbolt.org/z/T6br8azKP

 #include<iostream>
        // only optimized for [-1,1] input range
        // computes Simd number of cosines at once.
        // Auto-vectorization-friendly since all elementary operations separeted into their own loops
        template<typename Type, int Simd>
        inline
        void cosFast(Type * const __restrict__ data, Type * const __restrict__ result) noexcept
        {
            alignas(64)
            Type xSqr[Simd];

            for(int i=0;i<Simd;i++)
            {
                xSqr[i] =   data[i]*data[i];
            }
            // optimized coefficients by genetic algorithm
            // c1 * x^8 + c2 * x^6 + c3 * x^4 + c4 * x^2 + c5
            // looks like Chebyshev Polynomial of first kind but with different coefficients
            for(int i=0;i<Simd;i++)
            {
                result[i] =     Type(2.375724425540681750135263e-05);
            }

            // Horner Scheme is used for quick&precise computation of polynomial
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.001387603183718333355045615);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.04166606225906388516477818);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.4999999068460709850114654);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.9999999771350314148321559);
            }

        }

#include<cstring>
template<typename T>
uint32_t GetUlpDifference(T a, T b)
{
    uint32_t aBitValue;
    uint32_t bBitValue;
    std::memcpy(&aBitValue,&a,sizeof(T));
    std::memcpy(&bBitValue,&b,sizeof(T));
    return (aBitValue > bBitValue) ?
           (aBitValue - bBitValue) :
           (bBitValue - aBitValue);
}
#include<vector>
template<typename Type>
float computeULP(std::vector<Type> real, std::vector<Type> approximation)
{
    int ctr = 0;
    Type diffSum = 0;
    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        diffSum += diff;
    }
    return diffSum/ctr;
}

template<typename Type>
float computeMaxULP(std::vector<Type> real, std::vector<Type> approximation, std::vector<Type> input)
{
    int ctr = 0;
    Type mx = 0;
    int index = -1;
    Type rr = 0;
    Type aa = 0;
    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        if(mx<diff)
        {
            mx = diff;
            rr=r;
            aa=approximation[ctr-1];
            index = ctr-1;
        }
    }
    std::cout<<"(index="<<index<<": "<<rr<<" <--> "<<aa<<")"<<std::endl;
    std::cout<<"most problematic input: "<<input[index]<<std::endl;
    return mx;
}
#include<cmath>
#include <stdint.h>
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

inline
uint64_t readTSC() {
    uint64_t tsc = __rdtsc();
    return tsc;
}
void test()
{
    constexpr int n = 8192*64;
    std::vector<float> a(n),b(n),c(n);
    for(int k=0;k<10;k++)
    {
        for(int i=0;i<n;i++)
            a[i]=(i-(n/2))/(float)(n/2);

        // approximation
        auto t1 = readTSC();
        for(int i=0;i<n;i+=16)
            cosFast<float,16>(a.data()+i,b.data()+i);
        auto t2 = readTSC();

        // exact
        for(int i=0;i<n;i++)
            c[i] = std::cos(a[i]);
        auto t3 = readTSC();
        std::cout<<"avg. ulps: "<<computeULP(b,c)<<std::endl;
        std::cout<<"max. ulps: "<<computeMaxULP(b,c,a)<<"--> max ulps"<<std::endl;
        std::cout<<"approximation cycles: "<<t2-t1<<std::endl;
        std::cout<<"std::cos cycles: "<<t3-t2<<std::endl;
        std::cout<<"approximation cycles per cos: "<<(t2-t1)/(float)n<<std::endl;
        std::cout<<"std::cos cycles per cos: "<<(t3-t2)/(float)n<<std::endl;
        std::cout<<"---------------------------"<<std::endl;
    }
}

int main()
{
    test();
    return 0;
}

通常，切比雪夫多项式用于减少范围作为范围缩小版本的后处理，但您需要速度。你要速度吗？然后，您需要权衡输入的计算范围或准确性或两者兼而有之。（至少只有这样的 5 次多项式）

既然你问的解决方案不太准确，我在这里也放了一个不太准确的范围缩小版本 + 相同的 cosFast 一起服务于大角度范围（使用 -100M 弧度到 100M 弧度进行测试）：

 #include<iostream>
#include<cmath>
        // only optimized for [-1,1] input range
        template<typename Type, int Simd>
        inline
        void cosFast(Type * const __restrict__ data, Type * const __restrict__ result) noexcept
        {
            alignas(64)
            Type xSqr[Simd];

            for(int i=0;i<Simd;i++)
            {
                xSqr[i] =   data[i]*data[i];
            }
            // optimized coefficients by genetic algorithm
            // c1 * x^8 + c2 * x^6 + c3 * x^4 + c4 * x^2 + c5
            // looks like Chebyshev Polynomial of first kind but with different coefficients
            for(int i=0;i<Simd;i++)
            {
                result[i] =     Type(2.375724425540681750135263e-05);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.001387603183718333355045615);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.04166606225906388516477818);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.4999999068460709850114654);
            }
            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.9999999771350314148321559);
            }

        }

        // optimized for [-any,+any] input range!!
        template<typename Type, int Simd>
        inline
        void cosFastFullRange(Type * const __restrict__ data, Type * const __restrict__ result) noexcept
        {
            // reduce range to [-pi,+pi] by modf(input, 2pi) - pi { at high precision }
            // divide by 4 (multiply 0.25)
            // compute on [-1,+1] range
            // compute T4(cos(x)) chebyshev (   8cos(x)^4 - 8cos(x)^2 + 1   )
            // return

            alignas(64)
            double wrapAroundHighPrecision[Simd];

            alignas(64)
            double wrapAroundHighPrecisionTmp[Simd];

            alignas(64)
            double reducedData[Simd];

            alignas(64)
            double reducedDataTmp[Simd];

            alignas(64)
            Type xSqr[Simd];

            // these have to be as high precision as possible to let wide-range of inputs be used
            constexpr double pi =  /*Type(std::acos(-1));*/ double(3.1415926535897932384626433832795028841971693993751058209749445923);

            constexpr double twoPi = double(2.0 * pi);

            constexpr double twoPiInv = double(1.0/twoPi);


            // range reduction start
            // from -any,any to -pi,pi
            for(int i=0;i<Simd;i++)
            {
                wrapAroundHighPrecision[i] = data[i];
            }


            for(int i=0;i<Simd;i++)
            {
                wrapAroundHighPrecisionTmp[i] = wrapAroundHighPrecision[i] * twoPiInv;
            }


            for(int i=0;i<Simd;i++)
            {
                wrapAroundHighPrecisionTmp[i] = std::floor(wrapAroundHighPrecisionTmp[i]);
            }


            for(int i=0;i<Simd;i++)
            {
                wrapAroundHighPrecisionTmp[i] = twoPi*wrapAroundHighPrecisionTmp[i];
            }


            for(int i=0;i<Simd;i++)
            {
                reducedData[i] = wrapAroundHighPrecision[i] - wrapAroundHighPrecisionTmp[i];
            }


            for(int i=0;i<Simd;i++)
            {
                reducedDataTmp[i] = reducedData[i]-twoPi;
            }


            for(int i=0;i<Simd;i++)
            {
                reducedData[i]=reducedData[i]<double(0.0)?reducedDataTmp[i]:reducedData[i];
            }

            // shift range left by pi to make symmetric
            for(int i=0;i<Simd;i++)
            {
                reducedData[i] = reducedData[i] - pi;
            }

            // division by 4 to make it inside -1,+1 range
            // will require de-reduction of range later
            for(int i=0;i<Simd;i++)
            {
                reducedData[i] = reducedData[i]*double(0.25);
            }


            for(int i=0;i<Simd;i++)
            {
                reducedData[i] =    reducedData[i]*reducedData[i];
            }

            // from here, polynomial approximation is made
            // (looks like Chebyshev with fractional coefficients)
            for(int i=0;i<Simd;i++)
            {
                xSqr[i] =   reducedData[i];
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     Type(2.375724425540681750135263e-05);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.001387603183718333355045615);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.04166606225906388516477818);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(-0.4999999068460709850114654);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(0.9999999771350314148321559);
            }

            // from here, de-reduction of range is hapening by
            // Chebyshev L_4(cos_approximated(x)) = 8x^4 - 8x^2 + 1
            for(int i=0;i<Simd;i++)
            {
                xSqr[i] =   result[i]*result[i];
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     Type(8.0)*xSqr[i] - Type(8.0);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     result[i]*xSqr[i] + Type(1.0);
            }


            for(int i=0;i<Simd;i++)
            {
                result[i] =     -result[i];
            }
        }

#include<cstring>
template<typename T>
uint32_t GetUlpDifference(T a, T b)
{
    uint32_t aBitValue;
    uint32_t bBitValue;
    std::memcpy(&aBitValue,&a,sizeof(T));
    std::memcpy(&bBitValue,&b,sizeof(T));
    return (aBitValue > bBitValue) ?
           (aBitValue - bBitValue) :
           (bBitValue - aBitValue);
}
#include<vector>
template<typename Type>
float computeULP(std::vector<Type> real, std::vector<Type> approximation)
{
    int ctr = 0;
    Type diffSum = 0;
    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        diffSum += diff;
    }
    return diffSum/ctr;
}

template<typename Type>
float computeMaxULP(std::vector<Type> real, std::vector<Type> approximation, std::vector<Type> input)
{
    int ctr = 0;
    Type mx = 0;
    int index = -1;
    Type rr = 0;
    Type aa = 0;
    for(auto r:real)
    {
        Type diff = GetUlpDifference(r,approximation[ctr++]);
        if(mx<diff)
        {
            mx = diff;
            rr=r;
            aa=approximation[ctr-1];
            index = ctr-1;
        }
    }
    std::cout<<"(index="<<index<<": "<<rr<<" <--> "<<aa<<")"<<std::endl;
    std::cout<<"most problematic input: "<<input[index]<<std::endl;
    return mx;
}

#include <stdint.h>
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

inline
uint64_t readTSC() {
    uint64_t tsc = __rdtsc();
    return tsc;
}
void test()
{
    constexpr int n = 8192*64;
    std::vector<float> a(n),b(n),c(n);
    for(int k=0;k<10;k++)
    {
        // huge-angle initialization
        // -100M to +100M radians
        for(int i=0;i<n;i++)
            a[i]=100000000.0f*(i-(n/2))/(float)(n/2);

        // approximation
        auto t1 = readTSC();
        for(int i=0;i<n;i+=16)
            cosFastFullRange<float,16>(a.data()+i,b.data()+i);
        auto t2 = readTSC();

        // exact
        for(int i=0;i<n;i++)
            c[i] = std::cos(a[i]);
        auto t3 = readTSC();
        std::cout<<"avg. ulps: "<<computeULP(b,c)<<std::endl;
        std::cout<<"max. ulps: "<<computeMaxULP(b,c,a)<<"--> max ulps"<<std::endl;
        std::cout<<"approximation cycles: "<<t2-t1<<std::endl;
        std::cout<<"std::cos cycles: "<<t3-t2<<std::endl;
        std::cout<<"approximation cycles per cos: "<<(t2-t1)/(float)n<<std::endl;
        std::cout<<"std::cos cycles per cos: "<<(t3-t2)/(float)n<<std::endl;
        std::cout<<"---------------------------"<<std::endl;
    }
}

int main()
{
    test();
    return 0;
}

https://godbolt.org/z/M654boWY9

如您所见，它有 15-25 ulps 的平均误差（取决于样本数量，而在受限版本中 -1,1 范围内的所有可表示对象上只有 0.075），对于小结果来说它很重要，但对于游戏-就像以某个角度发射火箭这样的事情在游戏中不需要火箭科学，所以这应该可以在最大 0.0000001 位错误的情况下使用。此外，对于 AVX2，它现在每 cos 4 个周期更慢（如果您需要像 +/-100M 这样的大角度，双精度范围缩小会大大降低速度，这很重要）。

通常范围缩小需要 1000+ 位的 PI。我在那里只使用 64 位浮点，所以它远非科学。

原文由 huseyin tugrul buyukisik 发布，翻译遵循 CC BY-SA 4.0 许可协议

用 C 语言最快实现正弦、余弦和平方根（不需要很精确）

你尚未登录，登录后可以

c++模板类链表链接错误？

c++98环境循环单链表类的私有结构体与引用其的成员函数顺序问题？

头文件保护为什么报warring?

请问下这种数据结构怎么选择?

为什么我的这个std::sort会排序数组之外的一个元素?

我希望能让自己设计的qt界面更美观,更有条理,应该看哪些书/学哪些技术?

员工电脑装了公司内部的证书，那么通过公共wifi可以拦截并解密用户的微信信息吗？

Stack Overflow 翻译