galois

# BGR888ToYUV444

``````void BGR888ToYUV444(unsigned char *yuv, unsigned char *bgr, int pixel_num)
{
int i;
for (i = 0; i < pixel_num; ++i) {
uint8_t r = bgr[i * 3];
uint8_t g = bgr[i * 3 + 1];
uint8_t b = bgr[i * 3 + 2];

uint8_t y = 0.299 * r + 0.587 * g + 0.114 * b;
uint8_t u = -0.169 * r - 0.331 * g + 0.5 * b + 128;
uint8_t v = 0.5 * r - 0.419 * g - 0.081 * b + 128;

yuv[i * 3] = y;
yuv[i * 3 + 1] = u;
yuv[i * 3 + 2] = v;
}
}``````

（细心的看官可能会问到双精度浮点数double的运算吧？遗憾的是，根据Guide，NEON并不支持double，你可以考虑使用VFP/Vector Floating Point，但VFP并非SIMD单元）。

# 从浮点运算到整型运算

``````// Refer to: https://en.wikipedia.org/wiki/YUV (Full swing for BT.601)
void BGR888ToYUV444(unsigned char *yuv, unsigned char *bgr, int pixel_num)
{
int i;
for (i = 0; i < pixel_num; ++i) {
uint8_t r = bgr[i * 3];
uint8_t g = bgr[i * 3 + 1];
uint8_t b = bgr[i * 3 + 2];

// 1. Multiply transform matrix (Y′: unsigned, U/V: signed)
uint16_t y_tmp = 76 * r + 150 * g + 29 * b;
int16_t u_tmp = -43 * r - 84 * g + 127 * b;
int16_t v_tmp = 127 * r - 106 * g - 21 * b;

// 2. Scale down (">>8") to 8-bit values with rounding ("+128") (Y′: unsigned, U/V: signed)
y_tmp = (y_tmp + 128) >> 8;
u_tmp = (u_tmp + 128) >> 8;
v_tmp = (v_tmp + 128) >> 8;

// 3. Add an offset to the values to eliminate any negative values (all results are 8-bit unsigned)
yuv[i * 3] = (uint8_t) y_tmp;
yuv[i * 3 + 1] = (uint8_t) (u_tmp + 128);
yuv[i * 3 + 2] = (uint8_t) (v_tmp + 128);
}
}``````

# NEON Intrinsics

``````void BGR888ToYUV444(unsigned char * __restrict__ yuv, unsigned char * __restrict__ bgr, int pixel_num)
{
const uint8x8_t u8_zero = vdup_n_u8(0);
const uint16x8_t u16_rounding = vdupq_n_u16(128);
const int16x8_t s16_rounding = vdupq_n_s16(128);
const int8x16_t s8_rounding = vdupq_n_s8(128);

int count = pixel_num / 16;

int i;
for (i = 0; i < count; ++i) {
uint8x16x3_t pixel_bgr = vld3q_u8(bgr);

uint8x8_t high_r = vget_high_u8(pixel_bgr.val[0]);
uint8x8_t low_r = vget_low_u8(pixel_bgr.val[0]);
uint8x8_t high_g = vget_high_u8(pixel_bgr.val[1]);
uint8x8_t low_g = vget_low_u8(pixel_bgr.val[1]);
uint8x8_t high_b = vget_high_u8(pixel_bgr.val[2]);
uint8x8_t low_b = vget_low_u8(pixel_bgr.val[2]);

// NOTE:
// declaration may not appear after executable statement in block
uint16x8_t high_y;
uint16x8_t low_y;
uint8x8_t scalar = vdup_n_u8(76);
int16x8_t high_u;
int16x8_t low_u;
int16x8_t signed_scalar = vdupq_n_s16(-43);
int16x8_t high_v;
int16x8_t low_v;
uint8x16x3_t pixel_yuv;
int8x16_t u;
int8x16_t v;

// 1. Multiply transform matrix (Y′: unsigned, U/V: signed)
high_y = vmull_u8(high_r, scalar);
low_y = vmull_u8(low_r, scalar);

high_u = vmulq_s16(signed_high_r, signed_scalar);
low_u = vmulq_s16(signed_low_r, signed_scalar);

signed_scalar = vdupq_n_s16(127);
high_v = vmulq_s16(signed_high_r, signed_scalar);
low_v = vmulq_s16(signed_low_r, signed_scalar);

scalar = vdup_n_u8(150);
high_y = vmlal_u8(high_y, high_g, scalar);
low_y = vmlal_u8(low_y, low_g, scalar);

signed_scalar = vdupq_n_s16(-84);
high_u = vmlaq_s16(high_u, signed_high_g, signed_scalar);
low_u = vmlaq_s16(low_u, signed_low_g, signed_scalar);

signed_scalar = vdupq_n_s16(-106);
high_v = vmlaq_s16(high_v, signed_high_g, signed_scalar);
low_v = vmlaq_s16(low_v, signed_low_g, signed_scalar);

scalar = vdup_n_u8(29);
high_y = vmlal_u8(high_y, high_b, scalar);
low_y = vmlal_u8(low_y, low_b, scalar);

signed_scalar = vdupq_n_s16(127);
high_u = vmlaq_s16(high_u, signed_high_b, signed_scalar);
low_u = vmlaq_s16(low_u, signed_low_b, signed_scalar);

signed_scalar = vdupq_n_s16(-21);
high_v = vmlaq_s16(high_v, signed_high_b, signed_scalar);
low_v = vmlaq_s16(low_v, signed_low_b, signed_scalar);

// 2. Scale down (">>8") to 8-bit values with rounding ("+128") (Y′: unsigned, U/V: signed)
// 3. Add an offset to the values to eliminate any negative values (all results are 8-bit unsigned)

pixel_yuv.val[0] = vcombine_u8(vqshrn_n_u16(low_y, 8), vqshrn_n_u16(high_y, 8));

u = vcombine_s8(vqshrn_n_s16(low_u, 8), vqshrn_n_s16(high_u, 8));

v = vcombine_s8(vqshrn_n_s16(low_v, 8), vqshrn_n_s16(high_v, 8));

pixel_yuv.val[1] = vreinterpretq_u8_s8(u);

pixel_yuv.val[2] = vreinterpretq_u8_s8(v);

// Store
vst3q_u8(yuv, pixel_yuv);

bgr += 3 * 16;
yuv += 3 * 16;
}

// Handle leftovers
for (i = count * 16; i < pixel_num; ++i) {
uint8_t r = bgr[i * 3];
uint8_t g = bgr[i * 3 + 1];
uint8_t b = bgr[i * 3 + 2];

uint16_t y_tmp = 76 * r + 150 * g + 29 * b;
int16_t u_tmp = -43 * r - 84 * g + 127 * b;
int16_t v_tmp = 127 * r - 106 * g - 21 * b;

y_tmp = (y_tmp + 128) >> 8;
u_tmp = (u_tmp + 128) >> 8;
v_tmp = (v_tmp + 128) >> 8;

yuv[i * 3] = (uint8_t) y_tmp;
yuv[i * 3 + 1] = (uint8_t) (u_tmp + 128);
yuv[i * 3 + 2] = (uint8_t) (v_tmp + 128);
}
}``````

• 我用`vld3q_u8`指令从内存一次加载16个像素（也就是`uint8_t`类型的B、G、R三个通道的数值），将各个通道的16个数值放到一个Q-register中（也就是用了三个Q-register，每个分别存放16个B、16个G和16个R），`vst3q_u8`的写入操作也是类似的，这充分利用128-bit的带宽，使内存存取（memory access）的次数尽可能少。但是，后边运算的向量化程度其实仍然不变，只能同时进行8个16-bit整型运算，也就是说，对于运算的部分，理想加速比（speedup）是8而非16。（当然，一次从内存加载多少数据是一个design choice，没有绝对的答案，看官也可以尝试别的方式XD）
• 对于像素个数不能被16整除的情况，可能会有剩下的1到15个像素没被上述NEON代码处理到，这里我把它们称作leftovers，简单粗暴地跑了之前的for循环。其实leftover的情况如果占比高的话，还有更高明的处理方式，请各位看官参见Guide的6.2 Handling non-multiple array lengths一节。
• 我把Y通道计算的一部分放到一起，稍作解释，其他的很多运算都是类似的：
``````// 复制8个值为128的uint16_t类型整数到某个Q-register，该Q-register对应的C变量是u16_rounding
const uint16x8_t u16_rounding = vdupq_n_u16(128);
// 复制8个值为128的uint8_t类型整数到某个D-register，该D-register对应的C变量是scalar
uint8x8_t scalar = vdup_n_u8(76);
// pixel_bgr.val[0]为一个Q-register，16个uint8_t类型的数，对应R通道
// pixel_bgr.val[1]为一个Q-register，16个uint8_t类型的数，对应G通道
// pixel_bgr.val[2]为一个Q-register，16个uint8_t类型的数，对应B通道
uint8x16x3_t pixel_bgr = vld3q_u8(bgr);
// 一个Q-register对应有两个D-register
// 这是拿到对应R通道的Q-register高位的D-register，有8个R值
// 参见Guide中Register overlap一节（这部分内容很重要！）
// 其他如low_r、high_g的情况相似，这里从略
uint8x8_t high_r = vget_high_u8(pixel_bgr.val[0]);
// 对8个R值执行乘76操作
// vmull是变长指令（常以单词末尾额外的L作为标记），源操作数是两个uint8x8_t的向量，目的操作数是uint16x8_t的向量
// 到这一步：Y = R * 76
// low_y的情况类似，从略
high_y = vmull_u8(high_r, scalar);
// 把scalar的8个128改为8个150
scalar = vdup_n_u8(150);
// 执行乘加运算
// 到这一步：Y = R * 76 + G * 150
high_y = vmlal_u8(high_y, high_g, scalar);
// 把scalar的8个150改为8个29
scalar = vdup_n_u8(29);
// 执行乘加运算
// 到这一步：Y = R * 76 + G * 150 + B * 29
high_y = vmlal_u8(high_y, high_b, scalar);
// 到这一步：Y = (R * 76 + G * 150 + B * 29) + 128
// vqshrn_n_u16是变窄指令（常以单词末尾额外的N作为标记，N为Narrow），将uint16x8_t的向量压缩为uint8x8_t
// 到这一步：Y = ((R * 76 + G * 150 + B * 29) + 128) >> 8
// vcombine_u8用于将两个D-register的值组装到一个Q-register中
pixel_yuv.val[0] = vcombine_u8(vqshrn_n_u16(low_y, 8), vqshrn_n_u16(high_y, 8));``````
• 看官也许已经注意到了，在前面的`BGR888ToYUV444`函数中，我并非像上面的代码一样，将一个通道的运算放到一块，而是有意将Y、U、V三个通道的运算打散搅在一起。这是为了尽可能减少data dependency所引起的stall，这也是优化NEON代码需要着重考虑的方向之一。
• 对NEON稍有了解的细心看官可能会问：乘法和加法所需要的128、76等常数为何不用立即数——例如NEON的`vmul_n`——而是采用先批量复制常数到向量再做向量运算？这是因为我们很多运算的源操作数是`int8x8_t``uint8x8_t`的向量，`vmul_n`等API很不幸不支持这种格式。这里也良心提醒看官，写NEON intrinsics或者汇编一定要对照Guide后面的附录列出的格式，否则编译器常常会报一些风马牛不相及的错误，把人往坑里带——我当初可是各种踩坑各种在不相关的地方纠结啊555...

155 声望
16 粉丝