By
Liang Zhu
更新日期:
SSE
SSE概述
SSE(Streaming SIMD Extensions,流式单指令多数据扩展)是 Intel 推出的一套指令集,它包括70条指令,其中包含单指令多数据(SIMD)浮点计算指令,以及额外的SIMD整数指令和高速缓存控制指令。其优势包括:更高分辨率的图像浏览和处理、高质量音频、MPEG2视频、同时进行MPEG2编码与解码,以及语音识别时占用更少的CPU资源;并带来更高的精度和更快的响应速度。
大部分涉及128位内存操作数的SSE指令,都要求内存操作数的首地址按16字节对齐,也就是内存地址的低4位为0;否则会引起CPU异常,导致指令执行失败。编译器不会检查此类错误。如果无法保证地址对齐,应使用 _mm_loadu_ps / _mm_storeu_ps 等非对齐版本的加载/存储指令。
SSE DEMO
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
| #include <stdio.h> #include <vector> #include <iostream> #include <unistd.h> #include <fcntl.h> #include <sys/types.h> #include <time.h> #include <stdlib.h> #include <xmmintrin.h>
using namespace std;
/**
 * Computes pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5f for every
 * i in [0, nSize) using SSE single-precision intrinsics.
 *
 * Four elements are processed per iteration with the packed (_ps)
 * instructions; the remaining nSize % 4 elements are processed one at a time
 * with the scalar (_ss) forms of the same instructions, so every element of
 * the output is written.
 *
 * Unaligned loads and stores are used, so the arrays do not have to start on
 * a 16-byte boundary.
 *
 * @param pArray1 first input array, at least nSize floats
 * @param pArray2 second input array, at least nSize floats
 * @param pResult output array, at least nSize floats
 * @param nSize   number of elements to process; nothing is done when <= 0
 */
void ComputeArrayCPlusPlusSSE(
    float* pArray1,
    float* pArray2,
    float* pResult,
    int nSize)
{
    const __m128 m0_5 = _mm_set_ps1(0.5f);
    int i = 0;

    /* Packed path: four floats per iteration. "nSize - i >= 4" is used
       instead of "i + 4 <= nSize" so the bound check cannot overflow. */
    for (; nSize - i >= 4; i += 4) {
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);
        const __m128 sumSq = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));
        _mm_storeu_ps(pResult + i, _mm_add_ps(_mm_sqrt_ps(sumSq), m0_5));
    }

    /* Scalar tail: the 0..3 elements left over when nSize is not a
       multiple of four. Only the lowest lane is loaded and stored. */
    for (; i < nSize; i++) {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);
        const __m128 sumSq = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));
        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumSq), m0_5));
    }
}
/**
 * Demo driver for ComputeArrayCPlusPlusSSE.
 *
 * Fills two 64-element inputs with va[i] = i/10 and vb[i] = (i+1)/10, runs
 * the SSE kernel on them, and prints the 64 results to stdout with three
 * decimals each, separated by spaces and followed by a newline.
 *
 * The buffers are obtained from _mm_malloc with 16-byte alignment because
 * plain new[] does not guarantee the alignment required when the buffers are
 * accessed as __m128.
 */
void test()
{
    enum { kCount = 64, kAlignment = 16 };
    const size_t bytes = kCount * sizeof(float);

    float* va = static_cast<float*>(_mm_malloc(bytes, kAlignment));
    float* vb = static_cast<float*>(_mm_malloc(bytes, kAlignment));
    float* vo = static_cast<float*>(_mm_malloc(bytes, kAlignment));

    if (!va || !vb || !vo) {
        fprintf(stderr, "test: aligned allocation failed\n");
        if (va) _mm_free(va);
        if (vb) _mm_free(vb);
        if (vo) _mm_free(vo);
        return;
    }

    for (int i = 0; i < kCount; ++i) {
        /* Computed in double and then narrowed, as in the original demo. */
        va[i] = static_cast<float>(i / 10.0);
        vb[i] = static_cast<float>((i + 1) / 10.0);
    }

    ComputeArrayCPlusPlusSSE(va, vb, vo, kCount);

    for (int i = 0; i < kCount; ++i) {
        printf("%.3f ", vo[i]);
    }
    printf("\n");

    /* Memory from _mm_malloc must be released with _mm_free. */
    _mm_free(va);
    _mm_free(vb);
    _mm_free(vo);
}
/**
 * Prints the addresses returned by ten consecutive pairs of allocations
 * (new float[9] followed by new float[1]), one "a:<addr> b:<addr>" line per
 * pair.
 *
 * NOTE(review): presumably meant for eyeballing the alignment/spacing of
 * addresses handed out by operator new[] -- confirm with the author.
 *
 * All blocks are kept alive until every line has been printed, so the
 * allocator cannot hand the same address out twice, and are then released.
 */
void test2()
{
    enum { kRounds = 10 };
    float* blocksA[kRounds];
    float* blocksB[kRounds];

    for (int i = 0; i < kRounds; ++i) {
        blocksA[i] = new float[9];
        blocksB[i] = new float[1];
        /* %p expects a void*, so convert explicitly. */
        printf("a:%p b:%p\n",
               static_cast<void*>(blocksA[i]),
               static_cast<void*>(blocksB[i]));
    }

    for (int i = 0; i < kRounds; ++i) {
        delete [] blocksA[i];
        delete [] blocksB[i];
    }
}
/**
 * Program entry point: runs the SSE demo in test() and exits with status 0.
 * The command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    test();

    return 0;
}
|