Instructions函数对照表:02 xmmintrin.h与SSE指令集

作者:zyl910

更多详情见——SIMD函数整理:00 索引贴

R:寄存器。M:64位MM寄存器;X:128位XMM寄存器;Y:256位YMM寄存器。Name:函数名。Name2:另一种函数名。功能:功能描述。Asm:汇编指令。PCode:伪代码。

R | Name | Name2 | 功能 | Asm | PCode

X_MM_SHUFFLE混洗的掩码.4#(vs.71).aspx

X_MM_TRANSPOSE4_PS矩阵转置.4×4#(v=vs.71).aspx

X_MM_SET_EXCEPTION_STATE状态.设置异常状态#(v=vs.71).aspx

X_MM_GET_EXCEPTION_STATE状态.取得异常状态#(v=vs.71).aspx

X_MM_SET_EXCEPTION_MASK状态.设置异常掩码#(v=vs.71).aspx

X_MM_GET_EXCEPTION_MASK状态.取得异常掩码#(v=vs.71).aspx

X_MM_SET_ROUNDING_MODE状态.设置舍入模式#(v=vs.71).aspx

X_MM_GET_ROUNDING_MODE状态.取得舍入模式#(v=vs.71).aspx

X_MM_SET_FLUSH_ZERO_MODE状态.设置下溢清零模式#(v=vs.71).aspx

X_MM_GET_FLUSH_ZERO_MODE状态.取得下溢清零模式#(v=vs.71).aspx

X_mm_add_ss加法.单精.标量ADDSSr.fS[0]=m1.fS[0]+m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_add_ps加法.单精.紧缩ADDPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]+m2.fS[i]; }

X_mm_sub_ss减法.单精.标量SUBSSr.fS[0]=m1.fS[0]-m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_sub_ps减法.单精.紧缩SUBPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]-m2.fS[i]; }

X_mm_mul_ss乘法.单精.标量MULSSr.fS[0]=m1.fS[0]*m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_mul_ps乘法.单精.紧缩MULPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]*m2.fS[i]; }

X_mm_div_ss除法.单精.标量DIVSSr.fS[0]=m1.fS[0]/m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_div_ps除法.单精.紧缩DIVPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]/m2.fS[i]; }

X_mm_sqrt_ss平方根.单精.标量SQRTSSr.fS[0]=sqrt(m1.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_sqrt_ps平方根.单精.紧缩SQRTPSfor(i=0;i<4;++i){ r.fS[i]=sqrt(m1.fS[i]); }

X_mm_rcp_ss倒数.单精.标量RCPSSr.fS[0]=1/m1.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_rcp_ps倒数.单精.紧缩RCPPSfor(i=0;i<4;++i){ r.fS[i]=1/m1.fS[i]; }

X_mm_rsqrt_ss平方根的倒数.单精.标量RSQRTSSr.fS[0]=1/sqrt(m1.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_rsqrt_ps平方根的倒数.单精.紧缩RSQRTPSfor(i=0;i<4;++i){ r.fS[i]=1/sqrt(m1.fS[i]); }

X_mm_min_ss最小值.单精.标量MINSSr.fS[0]=min(m1.fS[0], m2.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_min_ps最小值.单精.紧缩MINPSfor(i=0;i<4;++i){ r.fS[i]=min(m1.fS[i], m2.fS[i]); }

X_mm_max_ss最大值.单精.标量MAXSSr.fS[0]=max(m1.fS[0], m2.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_max_ps最大值.单精.紧缩MAXPSfor(i=0;i<4;++i){ r.fS[i]=max(m1.fS[i], m2.fS[i]); }

X_mm_and_ps逻辑位与.单精ANDPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]&m2.fS[i]; }

X_mm_andnot_ps逻辑位与非.单精ANDNPSfor(i=0;i<4;++i){ r.fS[i]=(~m1.fS[i])&m2.fS[i]; }

X_mm_or_ps逻辑位或.单精ORPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]|m2.fS[i]; }

X_mm_xor_ps逻辑位异或.单精XORPSfor(i=0;i<4;++i){ r.fS[i]=m1.fS[i]^m2.fS[i]; }

X_mm_cmpeq_ss比较.等于.单精.标量CMPEQSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpeq_ps比较.等于.单精.紧缩CMPEQPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmplt_ss比较.小于.单精.标量CMPLTSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmplt_ps比较.小于.单精.紧缩CMPLTPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmple_ss比较.小于等于.单精.标量CMPLESSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmple_ps比较.小于等于.单精.紧缩CMPLEPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpgt_ss比较.大于.单精.标量CMPLTSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpgt_ps比较.大于.单精.紧缩CMPLTPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpge_ss比较.大于等于.单精.标量CMPLESSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpge_ps比较.大于等于.单精.紧缩CMPLEPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpneq_ss比较.不等于.单精.标量CMPNEQSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpneq_ps比较.不等于.单精.紧缩CMPNEQPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpnlt_ss比较.不小于.单精.标量CMPNLTSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpnlt_ps比较.不小于.单精.紧缩CMPNLTPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpnle_ss比较.不小于等于.单精.标量CMPNLESSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpnle_ps比较.不小于等于.单精.紧缩CMPNLEPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpngt_ss比较.不大于.单精.标量CMPNLTSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpngt_ps比较.不大于.单精.紧缩CMPNLTPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpnge_ss比较.不大于等于.单精.标量CMPNLESSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpnge_ps比较.不大于等于.单精.紧缩CMPNLEPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpord_ss比较.有序.单精.标量CMPORDSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpord_ps比较.有序.单精.紧缩CMPORDPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_cmpunord_ss比较.无序.单精.标量CMPUNORDSSr = BM(m1.fS[0] @ m2.fS[0])

X_mm_cmpunord_ps比较.无序.单精.紧缩CMPUNORDPSfor(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); }

X_mm_comieq_ss有序比较并设标志.相等.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_comilt_ss有序比较并设标志.小于.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_comile_ss有序比较并设标志.小于等于.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_comigt_ss有序比较并设标志.大于.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_comige_ss有序比较并设标志.大于等于.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_comineq_ss有序比较并设标志.不等于.单精COMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomieq_ss无序比较并设标志.相等.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomilt_ss无序比较并设标志.小于.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomile_ss无序比较并设标志.小于等于.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomigt_ss无序比较并设标志.大于.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomige_ss无序比较并设标志.大于等于.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_ucomineq_ss无序比较并设标志.不等于.单精UCOMISSr = EFLAGS(m1.fS[0] @ m2.fS[0])

X_mm_cvt_ss2si_mm_cvtss_si32转换.单精度至符32位.标量CVTSS2SIr=(int32)m1.fS[0]

X_mm_cvt_ps2pi_mm_cvtps_pi32转换.单精度至符32位.低位2个CVTPS2PIfor(i=0;i<2;++i){ r.iD[i]=(int32)m1.fS[i]; }

X_mm_cvtt_ss2si_mm_cvttss_si32截尾法转换.单精度至符32位.标量CVTTSS2SIr=(int32)TRUNC(m1.fS[0])

X_mm_cvtt_ps2pi_mm_cvttps_pi32截尾法转换.单精度至符32位.低位2个CVTTPS2PIfor(i=0;i<2;++i){ r.iD[i]=(int32)TRUNC(m1.fS[i]); }

X_mm_cvt_si2ss_mm_cvtsi32_ss转换.符32位至单精度.标量CVTSI2SSr.fS[0]=(float)m2; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_cvt_pi2ps_mm_cvtpi32_ps转换.符32位至单精度.低位2个CVTPI2PSfor(i=0;i<2;++i){ r.fS[i]=(float)m2.iD[i]; } for(i=2;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_cvtss_f32转换.提取低32位的单精度浮点数r=m1.fS[0]

X_mm_cvtss_si64转换.单精度至符64位.标量CVTSS2SIr=(int64)m1.fS[0]

X_mm_cvttss_si64截尾法转换.单精度至符64位.标量CVTTSS2SIr=(int64)TRUNC(m1.fS[0])

X_mm_cvtsi64_ss转换.符64位至单精度.标量CVTSI2SSr.fS[0]=(float)m2; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; }

X_mm_shuffle_ps混洗.单精.2源SHUFPSfor(i=0;i<2;++i){ r.fS[i]=m1.fS[(_Imm8>>(i*2)) & 3]; } for(i=2;i<4;++i){ r.fS[i]=m2.fS[(_Imm8>>(i*2)) & 3]; }

X_mm_unpackhi_ps高位解包.单精UNPCKHPSfor(i=0;i<2;++i){ r.fS[i*2]=m1.fS[2+i]; r.fS[i*2+1]=m2.fS[2+i]; }

X_mm_unpacklo_ps低位解包.单精UNPCKLPSfor(i=0;i<2;++i){ r.fS[i*2]=m1.fS[i]; r.fS[i*2+1]=m2.fS[i]; }

X_mm_loadh_pi高位传送.加载64位MOVHPS reg, memm1.mQ[1]=*m2;

X_mm_movehl_ps高到低传送.高位2组MOVHLPSr=m1; for(i=0;i<2;++i){ r.fS[i]=m2.fS[2+i]; }

X_mm_movelh_ps低到高传送.低位2组MOVLHPSr=m1; for(i=0;i<2;++i){ r.fS[2+i]=m2.fS[i]; }

X_mm_storeh_pi高位传送.存储64位MOVHPS mem, reg*A=m2.mQ[1];

X_mm_loadl_pi低位传送.加载64位MOVLPS reg, memm1.mQ[0]=*m2;

X_mm_storel_pi低位传送.存储64位MOVLPS mem, reg*A=m2.mQ[0];

X_mm_movemask_ps传送符号位生成掩码.单精MOVMSKPSr=0; for(i=0;i<4;++i){ r|=SBIT(m1.fS[i])<<i; }

M_m_pextrw_mm_extract_pi16传送.提取.16位PEXTRWr = ZX(m1.uW[imm8])

M_m_pinsrw_mm_insert_pi16传送.插入.16位PINSRWm1.uW[imm8]=(WORD)m2

M_m_pmaxsw_mm_max_pi16最大.带16位.紧缩PMAXSWfor(i=0;i<4;++i){ r.iW[i]=MAX(m1.iW[i],m2.iW[i]); }

M_m_pmaxub_mm_max_pu8最大.无8位.紧缩PMAXUBfor(i=0;i<8;++i){ r.uB[i]=MAX(m1.uB[i],m2.uB[i]); }

M_m_pminsw_mm_min_pi16最小.带16位.紧缩PMINSWfor(i=0;i<4;++i){ r.iW[i]=MIN(m1.iW[i],m2.iW[i]); }

M_m_pminub_mm_min_pu8最小.无8位.紧缩PMINUBfor(i=0;i<8;++i){ r.uB[i]=MIN(m1.uB[i],m2.uB[i]); }

M_m_pmovmskb_mm_movemask_pi8传送符号位生成掩码.字节PMOVMSKBr=0; for(i=0;i<8;++i){ r|=SBIT(m1.iB[i])<<i; }

M_m_pmulhuw_mm_mulhi_pu16乘法高位.无16位PMULHUWfor(i=0;i<4;++i){ r.uW[i]=hi16(m1.uW[i]*m2.uW[i]); }

M_m_pshufw_mm_shuffle_pi16混洗.字.1源PSHUFWfor(i=0;i<4;++i){ r.uW[i]=m1.uW[(imm8>>(i*2)) & 3]; }

M_m_maskmovq_mm_maskmove_si64选择性传送.8字节MASKMOVQfor(i=0;i<8;++i){ if(SBIT(m2.iB[i])) P[i]=m1.iB[i]; }

M_m_pavgb_mm_avg_pu8平均值.无8位PAVGBfor(i=0;i<8;++i){ r.uB[i]=AVG(m1.uB[i],m2.uB[i]); }

M_m_pavgw_mm_avg_pu16平均值.无16位PAVGWfor(i=0;i<4;++i){ r.uW[i]=AVG(m1.uW[i],m2.uW[i]); }

M_m_psadbw_mm_sad_pu8绝对差.无8位,再水平8求和PSADBWr=0; for(i=0;i<8;++i){ r.uW[0]+=ABS((WORD)m1.uB[i] - m2.uB[i]); }

X_mm_set_ss赋值.单精.标量r.fS[0]=arg[0]; for(i=1;i<4;++i){ r.fS[i]=0; }

X_mm_set_ps1_mm_set1_ps重复赋值.单精.紧缩for(i=0;i<4;++i){ r.fS[i]=arg[0]; }

X_mm_set_ps赋值.单精.紧缩for(i=0;i<4;++i){ r.fS[i]=arg[3-i]; }

X_mm_setr_ps逆序赋值.单精.紧缩for(i=0;i<4;++i){ r.fS[i]=arg[i]; }

X_mm_setzero_ps赋值为零.单精.紧缩r=0

X_mm_load_ss加载.单精.标量MOVSSr.fS[0]=_A[0]; for(i=1;i<4;++i){ r.fS[i]=0; }

X_mm_load_ps1_mm_load1_ps重复加载.单精.紧缩MOVSS + Shufflingfor(i=0;i<4;++i){ r.fS[i]=_A[0]; }

X_mm_load_ps加载.单精.紧缩.对齐MOVAPSfor(i=0;i<4;++i){ r.fS[i]=_A[i]; }

X_mm_loadr_ps逆序加载.单精.紧缩.对齐MOVAPS + Shufflingfor(i=0;i<4;++i){ r.fS[i]=_A[3-i]; }

X_mm_loadu_ps加载.单精.紧缩.非对齐MOVUPSfor(i=0;i<4;++i){ r.fS[i]=_A[i]; }

X_mm_store_ss存储.单精.标量MOVSS_A[0]=m1.fS[0]

X_mm_store_ps1_mm_store1_ps重复存储.单精.紧缩MOVSS + Shufflingfor(i=0;i<4;++i){ _A[i]=m1.fS[0]; }

X_mm_store_ps存储.单精.紧缩.对齐MOVAPSfor(i=0;i<4;++i){ _A[i]=m1.fS[i]; }

X_mm_storer_ps逆序存储.单精.紧缩.对齐MOVAPS + Shufflingfor(i=0;i<4;++i){ _A[i]=m1.fS[3-i]; }

X_mm_storeu_ps存储.单精.紧缩.非对齐MOVUPSfor(i=0;i<4;++i){ _A[i]=m1.fS[i]; }

X_mm_move_ss标量传送.单精MOVSSm1.fS[0]=m2.fS[0]

_mm_prefetch缓存.预取PREFETCH(v=vs.110).aspx

M_mm_stream_pi非时间性存储.mmMOVNTQ*_A=m1

X_mm_stream_ps非时间性存储.单精度MOVNTPS*_A=m1

_mm_sfence存储隔离SFENCE(v=vs.110).aspx

_mm_getcsr获取MXCSRSTMXCSRr=MXCSR

_mm_setcsr设置MXCSRLDMXCSRMXCSR=m1

_mm_mallocmm分配内存(IGL)#IGL

_mm_freemm释放内存(IGL)#IGL

X_mm_cvtpi16_ps转换.符16位至单精度.紧缩4个_inlinefor(i=0;i<4;++i){ r.fS[i]=(float)m1.iW[i]; }

X_mm_cvtpu16_ps转换.无16位至单精度.紧缩4个_inlinefor(i=0;i<4;++i){ r.fS[i]=(float)m1.uW[i]; }

X_mm_cvtps_pi16转换.单精度至符16位.紧缩4个_inline_mm_packs_pi32(_mm_cvtps_pi32(a), _mm_cvtps_pi32(_mm_movehl_ps(a, a)));

X_mm_cvtpi8_ps转换.符8位至单精度.低位4个_inline_mm_cvtpi16_ps(_mm_unpacklo_pi8(a, _mm_cmpgt_pi8(_mm_setzero_si64(), a)));

X_mm_cvtpu8_ps转换.无8位至单精度.低位4个_inline_mm_cvtpu16_ps(_mm_unpacklo_pi8(a, _mm_setzero_si64()));

X_mm_cvtps_pi8转换.单精度至符8位.低位4个_inline_mm_packs_pi16(_mm_cvtps_pi16(a), _mm_setzero_si64());

X_mm_cvtpi32x2_ps转换.符32位至单精度.2源_inline_mm_movelh_ps(_mm_cvt_pi2ps(_mm_setzero_ps(), a), _mm_cvt_pi2ps(_mm_setzero_ps(), b));

,香港服务器,网站空间,美国空间也许不是自己该去发挥的地方,还是让自己到最适合自己战斗的方面去吧!勇敢的接受自己的失败,

Instructions函数对照表:02 xmmintrin.h与SSE指令集

相关文章:

你感兴趣的文章:

标签云: