memcmp是最基本的库函数了。下文选择几版代码,来对比分析性能。
代码选自linux4.4/lib/string.c
/*
 * memcmp - byte-wise comparison of two memory regions.
 * @cs:    first region
 * @ct:    second region
 * @count: number of bytes to compare
 *
 * Returns <0, 0 or >0 according to the first differing byte
 * (compared as unsigned char), exactly like the standard memcmp.
 */
int memcmp(const void *cs, const void *ct, size_t count)
{
	const unsigned char *p1 = cs;
	const unsigned char *p2 = ct;

	while (count--) {
		int diff = *p1++ - *p2++;
		if (diff != 0)
			return diff;
	}
	return 0;
}
一个byte一个byte的循环比较。c语言的简洁明了一览无遗。
代码选自uksm:https://github.com/dolohow/uksm/blob/master/uksm-4.9.patch
/*
 * memcmpx86_64 - compare two buffers 8 bytes at a time with REPE CMPSQ.
 *
 * Assumes n is a multiple of 8 (the caller compares whole pages) and
 * that the direction flag is clear, so CMPSQ walks upward.
 * Returns 0 if equal, -1 or +1 otherwise.  NOTE(review): the sign is
 * derived from the 64-bit word comparison (CF after CMPSQ), which on a
 * little-endian machine does NOT match memcmp's byte-order contract —
 * callers here only test for zero/non-zero, so that is acceptable.
 */
int memcmpx86_64(void *s1, void *s2, size_t n)
{
size_t num = n / 8;            /* quadword count for CMPSQ */
register int res;
__asm__ __volatile__
(
/* If num == 0, TEST sets ZF so the JE below returns res = 0
   (REPE CMPSQ with RCX == 0 would leave the flags untouched). */
"testq %q3,%q3\n\t"
"repe; cmpsq\n\t"              /* compare quadwords while equal */
"je 1f\n\t"                    /* all equal -> res stays 0 */
"sbbq %q0,%q0\n\t"             /* res = CF ? -1 : 0 */
"orq $1,%q0\n"                 /* map {0,-1} to {+1,-1} */
"1:"
/* res starts at 0 ("0"(0)); RSI/RDI/RCX are read-write because
   CMPSQ advances the pointers and decrements the count. */
: "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
: "0" (0)
: "cc");
return res;
}
因为是比对page页面的函数,size是8的整数倍,所以没有另外判断。
代码选自glibc-2.23/sysdeps/x86_64/memcmp.S
以下的代码是使用汇编语言实现,针对x64的加速,xmm寄存器是16byte宽的,效率更高。
/*
 * glibc x86_64 SSE2 memcmp.
 * Arguments per the SysV ABI: rdi = s1, rsi = s2, rdx = n.
 * Returns in eax the difference of the first mismatching bytes
 * (as unsigned chars), or 0 when the regions are equal.
 *
 * Core trick: rsi is rewritten early to (s2 - s1), so only rdi is
 * advanced and s2's data is always addressed as (%rdi, %rsi).
 * r11 holds the end pointer s1 + n; r10 holds loop bounds/residues.
 */
ENTRY (memcmp)
test %rdx, %rdx
jz L(finz)              /* n == 0 -> return 0 */
cmpq $1, %rdx
jle L(finr1b)           /* n == 1 -> single byte compare */
subq %rdi, %rsi         /* rsi = s2 - s1 from here on */
movq %rdx, %r10
cmpq $32, %r10
jge L(gt32)
/* Handle small chunks and last block of less than 32 bytes. */
/* Each power-of-two bit of the residual count r10 selects one
   step: 1B, 2B, 4B, 8B, then a final 16B SSE compare. */
L(small):
testq $1, %r10
jz L(s2b)
movzbl (%rdi), %eax     /* one odd byte first */
movzbl (%rdi, %rsi), %edx
subq $1, %r10
je L(finz1)
addq $1, %rdi
subl %edx, %eax
jnz L(exit)
L(s2b):
testq $2, %r10
jz L(s4b)
movzwl (%rdi), %eax     /* 2-byte step */
movzwl (%rdi, %rsi), %edx
subq $2, %r10
je L(fin2_7)
addq $2, %rdi
cmpl %edx, %eax
jnz L(fin2_7)
L(s4b):
testq $4, %r10
jz L(s8b)
movl (%rdi), %eax       /* 4-byte step */
movl (%rdi, %rsi), %edx
subq $4, %r10
je L(fin2_7)
addq $4, %rdi
cmpl %edx, %eax
jnz L(fin2_7)
L(s8b):
testq $8, %r10
jz L(s16b)
movq (%rdi), %rax       /* 8-byte step */
movq (%rdi, %rsi), %rdx
subq $8, %r10
je L(fin2_7)
addq $8, %rdi
cmpq %rdx, %rax
jnz L(fin2_7)
L(s16b):
/* Final 16 bytes: pcmpeqb produces 0xff per equal byte, so the
   mask is 0xffff iff the whole vector matches. */
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %edx
xorl %eax, %eax
subl $0xffff, %edx
jz L(finz)
bsfl %edx, %ecx         /* index of first mismatching byte */
leaq (%rdi, %rcx), %rcx
movzbl (%rcx), %eax
movzbl (%rsi, %rcx), %edx
jmp L(finz1)
.p2align 4,, 4
L(finr1b):
/* n == 1: rsi is still the raw s2 pointer here. */
movzbl (%rdi), %eax
movzbl (%rsi), %edx
L(finz1):
subl %edx, %eax         /* result = byte1 - byte2 */
L(exit):
ret
.p2align 4,, 4
L(fin2_7):
/* rax/rdx hold 2-8 byte words that may differ; locate the lowest
   differing bit, round down to its byte (little-endian => lowest
   address), shift both words and compare those single bytes. */
cmpq %rdx, %rax
jz L(finz)
movq %rax, %r11
subq %rdx, %r11
bsfq %r11, %rcx         /* lowest differing bit */
sarq $3, %rcx
salq $3, %rcx           /* round to bit offset of the byte */
sarq %cl, %rax
movzbl %al, %eax
sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
ret
.p2align 4,, 4
L(finz):
xorl %eax, %eax         /* regions equal -> 0 */
ret
/* For blocks bigger than 32 bytes
1. Advance one of the addr pointer to be 16B aligned.
2. Treat the case of both addr pointers aligned to 16B
separately to avoid movdqu.
3. Handle any blocks of greater than 64 consecutive bytes with
unrolling to reduce branches.
4. At least one addr pointer is 16B aligned, use memory version
of pcmbeqb.
*/
.p2align 4,, 4
L(gt32):
movq %rdx, %r11
addq %rdi, %r11         /* r11 = s1 + n (end pointer) */
movq %rdi, %r8
andq $15, %r8
jz L(16am)
/* Both pointers may be misaligned. */
/* Compare one unaligned 16B head, then bump rdi up to the next
   16B boundary (the overlap is re-compared, which is harmless). */
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %edx
subl $0xffff, %edx
jnz L(neq)
neg %r8
leaq 16(%rdi, %r8), %rdi
L(16am):
/* Handle two 16B aligned pointers separately. */
testq $15, %rsi         /* (s2 - s1) multiple of 16 => both aligned */
jz L(ATR)
testq $16, %rdi
jz L(A32)
movdqu (%rdi, %rsi), %xmm0
pcmpeqb (%rdi), %xmm0   /* aligned load folded into pcmpeqb */
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(A32):
movq %r11, %r10
andq $-32, %r10         /* r10 = end rounded down to 32B */
cmpq %r10, %rdi
jge L(mt16)
/* Pre-unroll to be ready for unrolled 64B loop. */
testq $32, %rdi
jz L(A64)
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(A64):
movq %r11, %r10
andq $-64, %r10         /* r10 = end rounded down to 64B */
cmpq %r10, %rdi
jge L(mt32)
L(A64main):
/* Main loop, 4 x 16B per iteration (s1 aligned, s2 unaligned). */
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A64main)
L(mt32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jge L(mt16)
L(A32main):
/* Residual 32B chunks after the 64B loop. */
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A32main)
L(mt16):
/* Fewer than 16 bytes remain: fall back to the small-block code
   with r10 = remaining count (rsi still holds s2 - s1). */
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
.p2align 4,, 4
L(neq):
/* A 16B compare mismatched: edx = mask - 0xffff, whose lowest set
   bit indexes the first differing byte. */
bsfl %edx, %ecx
movzbl (%rdi, %rcx), %eax
addq %rdi, %rsi         /* rebuild the absolute s2 pointer */
movzbl (%rsi,%rcx), %edx
jmp L(finz1)
.p2align 4,, 4
L(ATR):
/* Both pointers 16B aligned: movdqa loads, same loop structure. */
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jge L(mt16)
testq $16, %rdi
jz L(ATR32)
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
je L(mt16)
L(ATR32):
movq %r11, %r10
andq $-64, %r10
testq $32, %rdi
jz L(ATR64)
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(ATR64):
cmpq %rdi, %r10
je L(mt32)
L(ATR64main):
/* Aligned main loop, 4 x 16B per iteration. */
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(ATR64main)
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jge L(mt16)
L(ATR32res):
/* Aligned residual 32B chunks. */
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %r10, %rdi
jne L(ATR32res)
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
/* Align to 16byte to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
方法1修改一下,单次比较unsigned long的长度。注意:下面这版把两个unsigned long直接相减后截断成int,若64位差值的低32位恰好为0会把不相等误判为相等;它还默认count是sizeof(unsigned long)的整数倍(否则无符号count会下溢),返回值的正负号也与memcmp按字节比较的定义不一致,仅作性能对比示意,不要直接照搬。
/*
 * kmemcmp - memcmp variant that compares a word at a time.
 * @cs:    first region
 * @ct:    second region
 * @count: number of bytes to compare (any value, need not be a
 *         multiple of sizeof(unsigned long))
 *
 * Returns <0, 0 or >0 per the first differing byte, like memcmp.
 *
 * Fixes over the original version:
 *  - (int)(*su1 - *su2) could truncate a nonzero 64-bit difference
 *    to 0 (low 32 bits zero), misreporting unequal data as equal,
 *    and its sign disagreed with memcmp on little-endian machines;
 *    a differing word is now resolved byte by byte.
 *  - a count that is not a multiple of sizeof(unsigned long) made
 *    the unsigned counter underflow (near-infinite loop plus
 *    out-of-bounds reads); the tail is now handled byte-wise.
 *
 * NOTE(review): the fast path still dereferences unsigned long *
 * on potentially unaligned pointers — fine on x86, but not on
 * strict-alignment architectures.
 */
int kmemcmp(const void *cs, const void *ct, size_t count)
{
	const unsigned long *su1 = cs;
	const unsigned long *su2 = ct;
	const unsigned char *c1;
	const unsigned char *c2;

	/* Fast path: one unsigned long per iteration while equal. */
	while (count >= sizeof(unsigned long)) {
		if (*su1 != *su2)
			break;	/* resolve the differing word per byte below */
		su1++;
		su2++;
		count -= sizeof(unsigned long);
	}

	/* Tail bytes and/or the first differing word. */
	c1 = (const unsigned char *)su1;
	c2 = (const unsigned char *)su2;
	while (count--) {
		int res = *c1++ - *c2++;

		if (res != 0)
			return res;
	}
	return 0;
}
gcc使用O0,对两个4k大小的内存进行比较,一共比较2G,统计执行时间:
方法1用时大约20S。 方法2用时大约0.64S。 方法3用时大约0.20S。 方法4用时大约2.50S。 方法4的时间是方法1的八分之一左右,这个是预期的逻辑。使用XMM寄存器最快。
gcc使用O2,对两个4k大小的内存进行比较,一共比较2G,统计执行时间:
方法1用时大约3.4S。 方法2用时大约0.62S。 方法3用时大约0.20S。 方法4用时大约0.42S。 方法4的时间还是方法1的八分之一左右,但是他们都明显的缩短了时间,方法4甚至比方法2还要快一些,可见,gcc的优化效果真的好厉害。
真的应了一句话:没有对比,就没有伤害。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。