在《H264/H265 NALU 起始码搜索性能优化(1)》的基础上,这几天又思考了一下,发现上次的SIMD实现算法还有一些优化余地。利用start code的出现应该是小概率事件这一特性,可以将几次比较的结果合并到一个int64的变量m里面,这样只需要判断m是否为0这一个条件,就可以确定16个byte里面(准确地讲是14个byte,最高的两个byte需要跨16字节边界进行判断)到底是否存在start code。此外,通过调整对m中各个位段的判断顺序,还可以保证添加到输出结果中的记录依然保序,从而省去最后进行排序的步骤。
代码如下:
vector<int> split_nalu_simd3(const uint8_t *video_data, size_t size)
{
size_t index = 0;
vector<int> list_index;
op_data od;
size_t next_block = 31;
// TODO:
// 需要确保video_data按照16字节对齐
// 如果16字节没有对齐,需要提前处理
while(index + 16 <= (size_t)size){
__m128i src = _mm_load_si128((__m128i*)(video_data+index));
__m128i mask = _mm_load_si128((__m128i*)od.mask_1);
__m128i dest = _mm_load_si128((__m128i*)od.dest_1);
__m128i tmp = _mm_and_si128(src, mask);
__m128i tmp2 = _mm_cmpeq_epi32(tmp, dest);
int64_t m = _mm_movemask_epi8(tmp2); // 将tmp2中的结果合并到int里面
mask = _mm_load_si128((__m128i*)od.mask_2);
dest = _mm_load_si128((__m128i*)od.dest_2);
tmp = _mm_and_si128(src, mask);
tmp2 = _mm_cmpeq_epi32(tmp, dest);
m = (((int64_t)_mm_movemask_epi8(tmp2) << 16) | m);
src = _mm_srli_si128(src, 2); // 左移2个字节,再进行比较
mask = _mm_load_si128((__m128i*)od.mask_1);
dest = _mm_load_si128((__m128i*)od.dest_1);
tmp = _mm_and_si128(src, mask);
tmp2 = _mm_cmpeq_epi32(tmp, dest);
m = (((int64_t)_mm_movemask_epi8(tmp2) << 32) | m);
mask = _mm_load_si128((__m128i*)od.mask_2);
dest = _mm_load_si128((__m128i*)od.dest_2);
tmp = _mm_and_si128(src, mask);
tmp2 = _mm_cmpeq_epi32(tmp, dest);
m = (((int64_t)_mm_movemask_epi8(tmp2) << 48) | m);
if (srs_unlikely(m != 0l)){
if (srs_unlikely(m & 0xFl)){
list_index.emplace_back(index);
}
if (srs_unlikely(m & 0xF0000l)){
list_index.emplace_back(index + 1);
}
if (srs_unlikely(m & 0xF00000000l)){
list_index.emplace_back(index + 2);
}
if (srs_unlikely(m & 0xF000000000000l)){
list_index.emplace_back(index + 3);
}
if (srs_unlikely(m & 0xF0l)){
list_index.emplace_back(index + 4);
}
if ((m & 0xF00000l)){
list_index.emplace_back(index + 5);
}
if (srs_unlikely(m & 0xF000000000l)){
list_index.emplace_back(index + 6);
}
if (srs_unlikely(m & 0xF0000000000000l)){
list_index.emplace_back(index + 7);
}
if (srs_unlikely(m & 0xF00l)){
list_index.emplace_back(index + 8);
}
if (srs_unlikely(m & 0xF000000l)){
list_index.emplace_back(index + 9);
}
if (srs_unlikely(m & 0xF0000000000l)){
list_index.emplace_back(index + 10);
}
if (srs_unlikely(m & 0xF00000000000000l)){
list_index.emplace_back(index + 11);
}
if (srs_unlikely(m & 0xF000l)){
list_index.emplace_back(index + 12);
}
if (srs_unlikely(m & 0xF0000000l)){
list_index.emplace_back(index + 13);
}
}
index += 16;
uint32_t tail = _mm_extract_epi32(src, 3);
//
// 处理拖尾字节,如果是00 00 或者是00
// 需要比对下一个16字节的block中是否有01 或者00 01
//
if (tail == 0x0u){
if (index + 1 <= size && video_data[index] == 0x01){
list_index.emplace_back(index-2);
continue;
}
}
if ((tail & 0x0000FF00u) == 0x0u){
if (index + 2 <= size && video_data[index] == 0x00 && video_data[index+1] == 0x01){
list_index.emplace_back(index-1);
}
}
}
while(index + 4 < (size_t)size){
uint32_t code = *(uint64_t*)(video_data+index);
if ((code & 0x00FFFFFFu) == 0x00010000u){
list_index.emplace_back(index);
index += 3;
continue;
}
index++;
}
//输出到list_index中的起始码位置已经保序,不需要再进行sort
//std::sort(list_index.begin(), list_index.end());
return list_index;
}
结果确实比较令人满意,在split_nalu_simd2的基础上又提升了30%的性能。大数据量查找对比结果显示所需时间是split_nalu的2/5到1/3左右,还是非常不错的!