我一直在努力提高大型(数 GB 级)位数组运算的性能。我不是 SIMD 专家,但 SIMD 似乎在所有情况下都比标量运算慢。我尝试过多种优化,包括循环展开,但都无济于事。从生成的汇编代码来看,这似乎是因为标……
/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */ #ifdef ENABLE_PREFETCH #warning "SIMD PREFETCHING ENABLED" #else #warning "SIMD PREFETCHING DISABLED" #endif #ifdef ENABLE_SIMD_UNROLLING #warning "UNROLLING SIMD" #else #warning "NOT UNROLLING SIMD" #endif #ifdef AVOID_TEMP_VARS #warning "AVOIDING SIMD TEMPORARY VARIABLES" #else #warning "USING SIMD TEMPORARY VARIABLES" #endif #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <unistd.h> #include <string.h> #include <signal.h> #include <setjmp.h> #include <sys/time.h> #include <sys/types.h> #include <sys/wait.h> #include <emmintrin.h> #include <xmmintrin.h> #include <assert.h> #define __forceinline __attribute__((always_inline)) double microtime (void) { struct timeval time; gettimeofday(&time, NULL); return (double) time.tv_sec * 1E6 + (double) time.tv_usec; } __forceinline void simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size) { const __m128i *wrd_ptr = (__m128i *) src; const __m128i *wrd_end = (__m128i *) (src + block_size); __m128i *dst_ptr = (__m128i *) dst; _mm_empty(); do { __m128i xmm1; __m128i xmm2; #ifdef ENABLE_SIMD_UNROLLING # ifdef ENABLE_PREFETCH _mm_prefetch((src + 512), _MM_HINT_NTA); # endif xmm1 = _mm_load_si128(wrd_ptr++); xmm2 = _mm_load_si128(dst_ptr); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst_ptr++, xmm1); xmm1 = _mm_load_si128(wrd_ptr++); xmm2 = _mm_load_si128(dst_ptr); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst_ptr++, xmm1); xmm1 = _mm_load_si128(wrd_ptr++); xmm2 = _mm_load_si128(dst_ptr); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst_ptr++, xmm1); xmm1 = _mm_load_si128(wrd_ptr++); xmm2 = _mm_load_si128(dst_ptr); xmm1 = _mm_and_si128(xmm1, xmm2); _mm_store_si128(dst_ptr++, xmm1); #else # ifdef AVOID_TEMP_VARS xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr); # else xmm1 = _mm_load_si128(wrd_ptr); xmm2 = _mm_load_si128(dst_ptr); xmm1 = _mm_and_si128(xmm1, xmm2); # endif 
_mm_store_si128(dst_ptr, xmm1); ++dst_ptr; ++wrd_ptr; #endif } while (wrd_ptr < wrd_end); } __forceinline void word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size) { unsigned int *wrd_ptr = (unsigned int *) src; unsigned int *wrd_end = (unsigned int *) (src + block_size); unsigned int *dst_ptr = (unsigned int *) dst; do { dst_ptr[0] &= wrd_ptr[0]; dst_ptr[1] &= wrd_ptr[1]; dst_ptr[2] &= wrd_ptr[2]; dst_ptr[3] &= wrd_ptr[3]; dst_ptr += 4; wrd_ptr += 4; } while (wrd_ptr < wrd_end); } int main (int argc, char **argv) { unsigned char *dest; unsigned char *key1; unsigned char *key2; size_t minlen = (1024UL * 1024UL * 512UL); double start_time = 0.0f; double end_time = 0.0f; posix_memalign((void *) &key1, sizeof(__m128i), minlen); posix_memalign((void *) &key2, sizeof(__m128i), minlen); posix_memalign((void *) &dest, sizeof(__m128i), minlen); key1[128] = 0xff; key2[128] = 0x03; // 128-bit SIMD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); simd_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("Elapsed: %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]); // 4xWORD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); word_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("Elapsed: %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]); free(dest); free(key2); free(key1); return EXIT_SUCCESS; } /* vi: set et sw=2 ts=2: */这里发生的事情是你被懒惰的虚拟内存分配所困扰.如果您将代码更改为:
// 128-bit SIMD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); simd_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]); // 4xWORD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); word_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]); // 128-bit SIMD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); simd_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]); // 4xWORD Bitwise AND memcpy(dest, key1, minlen); start_time = microtime(); word_bitwise_and(dest, key2, minlen); end_time = microtime(); printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time)); assert(0x03 == dest[128]);
你应该看到这样的结果:
$./bitwise-and SIMD Elapsed : 630061.000000s Scalar Elapsed: 228156.000000s SIMD Elapsed : 182645.000000s Scalar Elapsed: 202697.000000s $
说明:第一次遍历新分配的大块内存时,此前从未使用过的页面要到这时才真正映射到物理内存,因此会触发缺页异常(page fault)。这使第一个基准测试的耗时人为偏高,而第一个被测的恰好是 SIMD 版本。在第二次及之后的基准测试中,所有页面都已完成映射,测得的结果更准确,并且正如预期的那样,SIMD 例程比标量例程略快。差异没有预想的那么大,因为每 2 次加载和 1 次存储只对应一条 ALU 指令,所以性能受限于 DRAM 带宽,而不是计算效率。
编写基准测试代码时的一般规则:在进行任何实际计时之前,务必先至少调用一次被测例程,使所有已分配的内存都完成物理页映射。之后在循环中多次运行被测例程,并忽略异常值。