zyl910

优化技巧、硬件体系、图像处理、图形学、游戏编程、国际化与文本信息处理。

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

一、来源

  来源:《PC平台新技术MMX(上册):开发编程指南》第8章 MMX编码技术

  书籍信息——
http://e.360buy.com/30027396.html
PC平台新技术MMX(上册):开发编程指南
作 者: 吴乐南 编
出 版 社: 东南大学出版社
ISBN:9787810502528
出版时间:1997-10-01
页 数:149
字 数:237000
所属分类:
电子书 > 计算机与互联网 > 编程语言与程序设计
电子书 > 计算机与互联网 > 计算机工具书


二、整理后的代码

  代码——

#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <conio.h>
#include <assert.h>

// MMX, SSE, SSE2
#include <emmintrin.h>

// Unpack packed unsigned words into two groups of packed unsigned doublewords.
// Book section: 8.1 Unpacking data / 8.1.1 Unpacking unsigned numbers
//
// result: two zero-extended 32-bit doublewords taken from the two low words of the source.
// mm1_dst_hi: (out) two zero-extended 32-bit doublewords taken from the two high words of the source.
// mm0_src: source value (packed 16-bit unsigned numbers).
inline __m64 md_unpack_mud4muw(__m64& mm1_dst_hi, const __m64 mm0_src)
{
    const __m64 zeroWords = _mm_setzero_si64();    // [MMX] all-zero operand supplies the upper half of every doubleword
    const __m64 lowPair = _mm_unpacklo_pi16(mm0_src, zeroWords);    // [MMX] interleave low words: word -> doubleword
    mm1_dst_hi = _mm_unpackhi_pi16(mm0_src, zeroWords);    // [MMX] interleave high words: word -> doubleword
    return lowPair;
}

// Unpack packed signed words into two groups of packed signed doublewords.
// Book section: 8.1 Unpacking data / 8.1.2 Unpacking signed numbers
//
// result: two sign-extended 32-bit doublewords taken from the two low words of the source.
// mm1_dst_hi: (out, write-only) two sign-extended 32-bit doublewords taken from the two high words of the source.
// mm0_src: source value (packed 16-bit signed numbers).
inline __m64 md_unpack_mid4miw(__m64& mm1_dst_hi, const __m64 mm0_src)
{
    // Each unpack places a source word in the upper 16 bits of a doubleword;
    // the arithmetic right shift by 16 then sign-extends it and discards the
    // lower 16 bits. Because the lower half is discarded, the source itself is
    // used as the filler operand, so the output parameter is never read
    // (it may be uninitialized at the call site).
    mm1_dst_hi = _mm_srai_pi32(_mm_unpackhi_pi16(mm0_src, mm0_src), 16);    // two high words -> two signed doublewords
    return       _mm_srai_pi32(_mm_unpacklo_pi16(mm0_src, mm0_src), 16);    // two low words  -> two signed doublewords
}

// Interleaved pack with signed saturation: two groups of packed signed
// doublewords -> one group of packed signed words.
// Book section: 8.2 Packing data / 8.2.1 Interleaved pack with saturation
// Example: {[B1,B0], [A1,A0]} is packed to {[B1',A1',B0',A0']}.
// Note: a plain pack (_mm_packs_pi32) would turn {[B1,B0], [A1,A0]} into {[B1',B0',A1',A0']}.
//
// result: packed 16-bit signed numbers. Words 0 and 2 are the saturated doublewords of mm0_lo,
//         words 1 and 3 are the saturated doublewords of mm1_hi.
// mm0_lo: low source value (A).
// mm1_hi: high source value (B).
inline __m64 md_pack_s_cross_miw4mid(__m64 mm0_lo, __m64 mm1_hi)
{
    const __m64 packedA = _mm_packs_pi32(mm0_lo, mm0_lo);    // [MMX] saturating pack -> [A1',A0',A1',A0']
    const __m64 packedB = _mm_packs_pi32(mm1_hi, mm1_hi);    // [MMX] saturating pack -> [B1',B0',B1',B0']
    return _mm_unpacklo_pi16(packedA, packedB);    // [MMX] interleave the low two words of each operand
}

// Interleaved pack with wraparound (no saturation): two groups of packed
// unsigned doublewords -> one group of packed unsigned words.
// Book section: 8.2 Packing data / 8.2.2 Interleaved pack without saturation
// Example: {[B1,B0], [A1,A0]} is packed to {[B1',A1',B0',A0']}.
//
// result: packed 16-bit unsigned numbers. Words 0 and 2 are the low 16 bits of the doublewords
//         of mm0_lo, words 1 and 3 are the low 16 bits of the doublewords of mm1_hi.
// mm0_lo: low source value (A).
// mm1_hi: high source value (B).
inline __m64 md_pack_w_cross_muw4mud(__m64 mm0_lo, __m64 mm1_hi)
{
    const __m64 lowWordMask = _mm_set_pi16(0, (short)0xFFFF, 0, (short)0xFFFF);    // keeps bits 0..15 of each doubleword
    const __m64 evenWords = _mm_and_si64(mm0_lo, lowWordMask);    // clear the upper 16 bits of each A doubleword
    const __m64 oddWords = _mm_slli_pi32(mm1_hi, 16);    // move the lower 16 bits of each B doubleword to the upper half
    return _mm_or_si64(evenWords, oddWords);    // merge both halves
}

// Transpose a 2x2 matrix of packed doublewords in place.
// Book section: 8.3 Non-interleaved unpack
// Example: the 2x2 matrix [[A1,A0] [B1,B0]] becomes [[B0,A0] [B1,A1]].
//
// [A1 A0]    [B0 A0]
// [B1 B0] -> [B1 A1]
// msb<-lsb
//
// mm0_row0: (in/out) row 0 of the 2x2 matrix (A).
// mm1_row1: (in/out) row 1 of the 2x2 matrix (B).
inline void md_matrix_transpose_2x2_mmd(__m64& mm0_row0, __m64& mm1_row1)
{
    const __m64 savedRow0 = mm0_row0;    // row 0 is overwritten first, so keep a copy for the second unpack
    mm0_row0 = _mm_unpacklo_pi32(savedRow0, mm1_row1);    // [MMX] high dword = B0, low dword = A0
    mm1_row1 = _mm_unpackhi_pi32(savedRow0, mm1_row1);    // [MMX] high dword = B1, low dword = A1
}

// Multiply a complex number by a complex constant (packed words -> packed doublewords).
// Book section: 8.4 Multiplying a complex number by a constant
//
// result: the complex product; the high 32 bits hold the real part, the low 32 bits the imaginary part.
// mm0_src: multiplicand ([?,?,Dr,Di]); only the low doubleword is used.
// mm1_c: constant multiplier with its words already arranged as [Cr,-Ci,Ci,Cr].
inline __m64 md_complex_mul_c_mid4miw(__m64 mm0_src, const __m64 mm1_c)
{
    const __m64 duplicated = _mm_unpacklo_pi32(mm0_src, mm0_src);    // [MMX] replicate the low dword -> [Dr,Di,Dr,Di]
    return _mm_madd_pi16(duplicated, mm1_c);    // [MMX] multiply-add signed words -> [(Dr*Cr-Di*Ci), (Dr*Ci+Di*Cr)]
}

// Absolute difference of packed unsigned bytes.
// Book section: 8.5 Absolute difference / 8.5.1 Absolute difference of unsigned numbers
//
// result: per-byte absolute difference. Pseudocode: result[i] = abs(mm0[i] - mm1[i]).
// mm0: source operand A.
// mm1: source operand B.
inline __m64 md_absolute_deviation_mub(const __m64 mm0, const __m64 mm1)
{
    // An unsigned saturating subtract yields 0 wherever the subtrahend is the
    // larger value, so in every byte at most one of the two differences is
    // non-zero and OR-ing them selects it.
    const __m64 aMinusB = _mm_subs_pu8(mm0, mm1);    // A-B, clamped at 0
    const __m64 bMinusA = _mm_subs_pu8(mm1, mm0);    // B-A, clamped at 0
    return _mm_or_si64(aMinusB, bMinusA);    // combine the two one-sided differences
}

// Absolute difference of packed signed words.
// Book section: 8.5 Absolute difference / 8.5.2 Absolute difference of signed numbers
//
// result: per-word absolute difference. Pseudocode: result[i] = abs(mm0[i] - mm1[i]).
//         The final subtraction is the wraparound form (_mm_sub_pi16).
// mm0: source operand A.
// mm1: source operand B.
inline __m64 md_absolute_deviation_miw(const __m64 mm0, const __m64 mm1)
{
    // Swap mask: XOR(A,B) in every word where A>B, 0 in every word where A<=B.
    const __m64 swapBits = _mm_and_si64(_mm_xor_si64(mm0, mm1), _mm_cmpgt_pi16(mm0, mm1));
    const __m64 larger  = _mm_xor_si64(mm1, swapBits);    // B becomes A where A>B -> per-word maximum
    const __m64 smaller = _mm_xor_si64(mm0, swapBits);    // A becomes B where A>B -> per-word minimum
    return _mm_sub_pi16(larger, smaller);    // absolute difference = maximum - minimum
}

// Absolute value of packed signed words.
// Book section: 8.6 Absolute value
//
// result: per-word absolute value. Pseudocode: result[i] = abs(mm0[i]).
//         The subtraction saturates, so -32768 maps to 32767.
// mm0: source operand.
inline __m64 md_abs_miw(const __m64 mm0)
{
    // Sign mask: all zeros for a non-negative word, all ones (value -1) for a negative word.
    const __m64 signMask = _mm_srai_pi16(mm0, 15);
    // Two's-complement negation is "invert, then add one": XOR with the mask
    // inverts only the negative words, and subtracting the mask (-1) adds one
    // to exactly those words.
    const __m64 inverted = _mm_xor_si64(mm0, signMask);
    return _mm_subs_pi16(inverted, signMask);
}

// Clamp packed signed words to the interval [iLow, iHigh].
// Book section: 8.7 Clipping values / 8.7.1 Clipping signed numbers to an arbitrary signed range / [0]
//
// result: the clamped packed signed words.
//         Pseudocode: result[i] = (mm0[i]<iLow) ? iLow : ( (mm0[i]>iHigh) ? iHigh : mm0[i] ).
// mm0: source operand.
// iLow: lower bound. NOTE(review): presumably iLow <= iHigh is required - confirm with callers.
// iHigh: upper bound.
inline __m64 md_clamp_miw(const __m64 mm0, short iLow, short iHigh)
{
    const int biasedHigh = iHigh + 0x8000;    // upper bound in the biased (unsigned) domain
    const int biasedLow  = iLow + 0x8000;     // lower bound in the biased (unsigned) domain
    const int headroom   = 0xFFFF - biasedHigh;    // distance from the upper bound to 0xFFFF

    __m64 value = _mm_add_pi16(mm0, _mm_set1_pi16((short)0x8000));    // wraparound add of INT16_MIN: signed -> biased unsigned
    value = _mm_adds_pu16(value, _mm_set1_pi16( (short)headroom ));    // saturating add: anything above the upper bound sticks at 0xFFFF
    value = _mm_subs_pu16(value, _mm_set1_pi16( (short)(headroom+biasedLow) ));    // saturating subtract: anything below the lower bound sticks at 0
    return _mm_add_pi16(value, _mm_set1_pi16( iLow ));    // undo the offsets, back in the signed domain
}

// Clamp packed unsigned words to the interval [uLow, uHigh].
// Book section: 8.7 Clipping values / 8.7.2 (clipping unsigned numbers to an arbitrary unsigned range)
//
// result: the clamped packed unsigned words.
//         Pseudocode: result[i] = (mm0[i]<uLow) ? uLow : ( (mm0[i]>uHigh) ? uHigh : mm0[i] ).
// mm0: source operand.
// uLow: lower bound. NOTE(review): presumably uLow <= uHigh is required - confirm with callers.
// uHigh: upper bound.
inline __m64 md_clamp_muw(const __m64 mm0, unsigned short uLow, unsigned short uHigh)
{
    const unsigned headroom = 0xFFFFU - uHigh;    // distance from the upper bound to 0xFFFF

    __m64 value = _mm_adds_pu16(mm0, _mm_set1_pi16( (short)headroom ));    // saturating add: anything above the upper bound sticks at 0xFFFF
    value = _mm_subs_pu16(value, _mm_set1_pi16( (short)(headroom+uLow) ));    // saturating subtract: anything below the lower bound sticks at 0
    return _mm_add_pi16(value, _mm_set1_pi16( uLow ));    // undo the offset
}

// Return the constant 0.
// Book section: 8.8 Generating constants / [0] Produce a zero register in MM0
inline __m64 md_setzero_mmq()
{
    // The XOR-with-itself idiom from the book gives 0 for any input; the
    // operand is initialized only because the language requires a defined
    // value. The intrinsic _mm_setzero_si64() alone already returns 0.
    const __m64 anyValue = _mm_setzero_si64();
    return _mm_xor_si64(anyValue, anyValue);
}

// Return the constant with all bits set.
// Book section: 8.8 Generating constants / [1] Set all ones in register MM1;
// this is -1 in the value range of every packed data type.
inline __m64 md_setfull_mmq()
{
    // Comparing a value with itself for equality sets every bit; the operand
    // is initialized only because the language requires a defined value.
    const __m64 anyValue = _mm_setzero_si64();
    return _mm_cmpeq_pi8(anyValue, anyValue);
}

// Return the constant with every packed byte equal to 1.
// Book section: 8.8 Generating constants / [2] Produce the constant 1 in the value range
// of every packed byte [or packed word] (or packed doubleword)
inline __m64 md_set_1_mib()
{
    const __m64 zeroBytes = _mm_setzero_si64();
    // 0 - (-1) == 1 in every byte; the self-compare produces the -1 bytes.
    return _mm_sub_pi8(zeroBytes, _mm_cmpeq_pi8(zeroBytes, zeroBytes));
}

// Return the constant with every packed word equal to pow(2,n)-1.
// Book section: 8.8 Generating constants / [3] Produce the signed constant pow(2,n)-1
// in the value range of every packed word (or packed doubleword)
//
// n: exponent; the assert checks 1 <= n <= 16.
inline __m64 md_set_pow2n_sub1_miw(int n)
{
    assert((n>=1) && (n<=16));
    const __m64 zero = _mm_setzero_si64();
    const __m64 allOnes = _mm_cmpeq_pi8(zero, zero);    // every bit set
    const int shiftCount = 16-n;
    return _mm_srli_pi16(allOnes, shiftCount);    // logical right shift leaves the low n bits of each word set
}

// Return the constant with every packed word equal to -pow(2,n).
// Book section: 8.8 Generating constants / [4] Produce the signed constant -pow(2,n)
// in the value range of every packed word (or packed doubleword)
//
// n: exponent; the assert checks 0 <= n <= 15.
inline __m64 md_set_neg_pow2n_miw(int n)
{
    assert((n>=0) && (n<=15));
    const __m64 zero = _mm_setzero_si64();
    const __m64 allOnes = _mm_cmpeq_pi8(zero, zero);    // every bit set, i.e. -1 in each word
    return _mm_slli_pi16(allOnes, n);    // left shift clears the low n bits of each word
}

// Verification driver: runs every md_* helper on fixed inputs and prints the
// inputs and results as hexadecimal.
//
// cnt: number of times each helper call is repeated. The results printed for
//      the looped cases are only assigned inside the loops, so cnt is expected
//      to be at least 1.
//
// Values are printed through the m64_u32 member of __m64 (high doubleword
// first). The MMX state is cleared with _mm_empty() before returning.
void doTest(int cnt)
{
    __m64 t0,t1,t2;
    int i;

    // Packed unsigned words -> two groups of packed unsigned doublewords
    printf("md_unpack_mud4muw:\n");
    t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
    printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t2 = md_unpack_mud4muw(t1, t0);
    }
    printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    printf("\n");

    // Packed signed words -> two groups of packed signed doublewords
    printf("md_unpack_mid4miw:\n");
    t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
    printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t2 = md_unpack_mid4miw(t1, t0);
    }
    printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    printf("\n");

    // Two groups of packed signed doublewords -> interleaved saturating pack to packed signed words
    printf("md_pack_s_cross_miw4mid:\n");
    t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t0 = md_pack_s_cross_miw4mid(t2, t1);
    }
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    printf("\n");

    // Two groups of packed unsigned doublewords -> interleaved wraparound pack to packed unsigned words
    printf("md_pack_w_cross_muw4mud:\n");
    t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t0 = md_pack_w_cross_muw4mud(t2, t1);
    }
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    printf("\n");

    // 2x2 matrix transpose of packed doublewords (applied cnt times in place)
    printf("md_matrix_transpose_2x2_mmd:\n");
    t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        md_matrix_transpose_2x2_mmd(t1, t2);
    }
    printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    printf("\n");

    // Complex number times a constant (packed words -> packed doublewords)
    printf("md_complex_mul_c_mid4miw:\n");
    t1 = _mm_set_pi16(0,0, 1, 1);    // 1+i
    t2 = _mm_set_pi16(3,-2, 2,3);    // 3+2i.    (1+i)*(3+2i) = 1+5i
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t0 = md_complex_mul_c_mid4miw(t1, t2);
    }
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    printf("\n");

    // Absolute difference of packed unsigned bytes
    printf("md_absolute_deviation_mub:\n");
    t1 = _mm_set_pi8(1,2,3,4,5,6,7,8);
    t2 = _mm_set_pi8(8,7,6,5,4,3,2,1);
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t0 = md_absolute_deviation_mub(t1, t2);
    }
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    printf("\n");

    // Absolute difference of packed signed words
    printf("md_absolute_deviation_miw:\n");
    t1 = _mm_set_pi16(-1, 1, 3, 5);
    t2 = _mm_set_pi16( 2, 2, 2, 2);
    printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t0 = md_absolute_deviation_miw(t1, t2);
    }
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    printf("\n");

    // Absolute value of packed signed words
    printf("md_abs_miw4miw:\n");
    t0 = _mm_set_pi16(-1, 1, 3, -5);
    printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t1 = md_abs_miw(t0);
    }
    printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    printf("\n");

    // Clamp packed signed words to [iLow,iHigh]
    printf("md_clamp_miw:\n");
    t0 = _mm_set_pi16(-15, 1, 254, 257);
    printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t1 = md_clamp_miw(t0, -1, 255);
    }
    printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    printf("\n");

    // Clamp packed unsigned words to [uLow,uHigh]
    printf("md_clamp_muw:\n");
    t0 = _mm_set_pi16(1, 254, 257, 32769U);
    printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    for(i=0; i<cnt; ++i)
    {
        t1 = md_clamp_muw(t0, 16, 255);
    }
    printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    printf("\n");

    // Constant: 0
    printf("md_setzero_mmq:\t");
    t0 = md_setzero_mmq();
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    // Constant: all ones
    printf("md_setfull_mmq:\t");
    t0 = md_setfull_mmq();
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    // Constant: every packed byte is 1
    printf("md_set_1_mib:\t");
    t0 = md_set_1_mib();
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    // Constant: every packed word is pow(2,n)-1
    printf("md_set_pow2n_sub1_miw:\t");
    t0 = md_set_pow2n_sub1_miw(8);
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    // Constant: every packed word is -pow(2,n)
    printf("md_set_neg_pow2n_miw:\t");
    t0 = md_set_neg_pow2n_miw(15);
    printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    // MMX registers alias the x87 register stack: clear the MMX state (EMMS)
    // so that floating-point code executed after this function is not affected.
    _mm_empty();
}

// Program entry point: runs the verification once with a loop count of 1 or 2.
int main(int argc, char* argv[])
{
    // The loop count comes from rand() so it is not a compile-time constant,
    // which keeps the compiler from optimizing the test loops away.
    const int loopCount = 1 + (rand() & 1);
    doTest(loopCount);
    return 0;
}

 


三、编译器生成的汇编代码

  VC6编译器生成的汇编代码——

; Listing generated by Microsoft (R) Optimizing Compiler Version 12.00.9044.0 

    TITLE    E:\zylKanbox\Doc\Program\ASM\x86\SIMD\my\md\md01_mmxguide_ch08\md01_mmxguide_ch08.cpp
    .386P
include listing.inc
if @Version gt 510
.model FLAT
else
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
CONST    SEGMENT DWORD USE32 PUBLIC 'CONST'
CONST    ENDS
_BSS    SEGMENT DWORD USE32 PUBLIC 'BSS'
_BSS    ENDS
$$SYMBOLS    SEGMENT BYTE USE32 'DEBSYM'
$$SYMBOLS    ENDS
_TLS    SEGMENT DWORD USE32 PUBLIC 'TLS'
_TLS    ENDS
;    COMDAT ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_01BJG@?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@
_DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
_DATA    ENDS
;    COMDAT ??8@YAHABU_GUID@@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT __mm_cvtpi16_ps
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT __mm_cvtpu16_ps
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT __mm_cvtps_pi16
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_unpack_mud4muw@@YA?AT__m64@@AAT1@T1@@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_unpack_mid4miw@@YA?AT__m64@@AAT1@T1@@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_pack_s_cross_miw4mid@@YA?AT__m64@@T1@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_pack_w_cross_muw4mud@@YA?AT__m64@@T1@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_matrix_transpose_2x2_mmd@@YAXAAT__m64@@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_complex_mul_c_mid4miw@@YA?AT__m64@@T1@T1@@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_absolute_deviation_mub@@YA?AT__m64@@T1@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_absolute_deviation_miw@@YA?AT__m64@@T1@0@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_abs_miw@@YA?AT__m64@@T1@@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_clamp_miw@@YA?AT__m64@@T1@FF@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_clamp_muw@@YA?AT__m64@@T1@GG@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_setzero_mmq@@YA?AT__m64@@XZ
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_setfull_mmq@@YA?AT__m64@@XZ
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_set_1_mib@@YA?AT__m64@@XZ
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_set_pow2n_sub1_miw@@YA?AT__m64@@H@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?md_set_neg_pow2n_miw@@YA?AT__m64@@H@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT ?doTest@@YAXH@Z
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
;    COMDAT _main
_TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT    ENDS
FLAT    GROUP _DATA, CONST, _BSS
    ASSUME    CS: FLAT, DS: FLAT, SS: FLAT
endif

INCLUDELIB LIBC
INCLUDELIB OLDNAMES

PUBLIC    ?doTest@@YAXH@Z                    ; doTest
PUBLIC    ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@    ; `string'
PUBLIC    ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
PUBLIC    ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
PUBLIC    ??_C@_01BJG@?6?$AA@                ; `string'
PUBLIC    ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@    ; `string'
PUBLIC    ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@    ; `string'
PUBLIC    ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
PUBLIC    ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@    ; `string'
PUBLIC    ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@    ; `string'
PUBLIC    ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ ; `string'
PUBLIC    ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ ; `string'
PUBLIC    ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ ; `string'
PUBLIC    ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ ; `string'
PUBLIC    ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@        ; `string'
PUBLIC    ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@        ; `string'
PUBLIC    ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@        ; `string'
PUBLIC    ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@        ; `string'
PUBLIC    ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@        ; `string'
PUBLIC    ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@        ; `string'
PUBLIC    ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@    ; `string'
PUBLIC    ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@    ; `string'
EXTRN    _printf:NEAR
;    COMDAT ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@
; File E:\zylKanbox\Doc\Program\ASM\x86\SIMD\my\md\md01_mmxguide_ch08\md01_mmxguide_ch08.cpp
_DATA    SEGMENT
??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@ DB 'md_unpack_mud4muw:', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
_DATA    SEGMENT
??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ DB '[%.8X%.8X] -> ', 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
_DATA    SEGMENT
??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ DB '['
    DB    '%.8X%.8X],[%.8X%.8X]', 0aH, 00H        ; `string'
_DATA    ENDS
;    COMDAT ??_C@_01BJG@?6?$AA@
_DATA    SEGMENT
??_C@_01BJG@?6?$AA@ DB 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@
_DATA    SEGMENT
??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@ DB 'md_unpack_mid4miw:', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@
_DATA    SEGMENT
??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@ DB 'md_pack_s_cross_miw4m'
    DB    'id:', 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
_DATA    SEGMENT
??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ DB '['
    DB    '%.8X%.8X],[%.8X%.8X] -> ', 00H        ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
_DATA    SEGMENT
??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ DB '[%.8X%.8X]', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@
_DATA    SEGMENT
??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@ DB 'md_pack_w_cross_muw4m'
    DB    'ud:', 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@
_DATA    SEGMENT
??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ DB 'md_matrix_transpo'
    DB    'se_2x2_mmd:', 0aH, 00H            ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@
_DATA    SEGMENT
??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ DB 'md_complex_mul_c_mid'
    DB    '4miw:', 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@
_DATA    SEGMENT
??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ DB 'md_absolute_deviati'
    DB    'on_mub:', 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@
_DATA    SEGMENT
??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ DB 'md_absolute_deviati'
    DB    'on_miw:', 0aH, 00H                ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@
_DATA    SEGMENT
??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@ DB 'md_abs_miw4miw:', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@
_DATA    SEGMENT
??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@ DB 'md_clamp_miw:', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@
_DATA    SEGMENT
??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@ DB 'md_clamp_muw:', 0aH, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@
_DATA    SEGMENT
??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@ DB 'md_setzero_mmq:', 09H, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@
_DATA    SEGMENT
??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@ DB 'md_setfull_mmq:', 09H, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@
_DATA    SEGMENT
??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@ DB 'md_set_1_mib:', 09H, 00H ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@
_DATA    SEGMENT
??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@ DB 'md_set_pow2n_sub1_miw:', 09H
    DB    00H                        ; `string'
_DATA    ENDS
;    COMDAT ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@
_DATA    SEGMENT
??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@ DB 'md_set_neg_pow2n_miw:', 09H
    DB    00H                        ; `string'
; Function compile flags: /Ogty
_DATA    ENDS
;    COMDAT ?doTest@@YAXH@Z
_TEXT    SEGMENT
_cnt$ = 8
_t0$ = -40
_t1$ = -32
_t2$ = -24
?doTest@@YAXH@Z PROC NEAR                ; doTest, COMDAT

; 232  : {

    push    ebp
    mov    ebp, esp
    and    esp, -8                    ; fffffff8H
    sub    esp, 40                    ; 00000028H
    push    esi
    push    edi

; 233  :     __m64 t0,t1,t2;
; 234  :     int i;
; 235  : 
; 236  :     // 紧缩无符号字 解包为 两组紧缩无符号双字
; 237  :     printf("md_unpack_mud4muw:\n");

    push    OFFSET FLAT:??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@ ; `string'
    call    _printf

; 238  :     t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);

    mov    DWORD PTR -24+[esp+52], -1985229329    ; 89abcdefH
    mov    DWORD PTR -24+[esp+56], 19088743    ; 01234567H
    movq    mm0, MMWORD PTR -24+[esp+52]
    movq    MMWORD PTR -8+[esp+52], mm0
    movq    MMWORD PTR _t0$[esp+52], mm0

; 239  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+52]
    mov    ecx, DWORD PTR _t0$[esp+56]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf

; 240  :     for(i=0; i<cnt; ++i)

    mov    esi, DWORD PTR _cnt$[ebp]
    xor    edi, edi
    add    esp, 16                    ; 00000010H
    cmp    esi, edi
    jle    SHORT $L43808

; 241  :     {
; 242  :         t2 = md_unpack_mud4muw(t1, t0);

    movq    mm1, MMWORD PTR _t0$[esp+48]
    pxor    mm0, mm0
    movq    mm2, mm0
    movq    mm3, mm1
    punpckhwd mm3, mm2
    movq    MMWORD PTR _t1$[esp+48], mm3
    punpcklwd mm1, mm0
    movq    MMWORD PTR _t2$[esp+48], mm1
$L43808:

; 243  :     }
; 244  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    edx, DWORD PTR _t2$[esp+48]
    mov    eax, DWORD PTR _t2$[esp+52]
    mov    ecx, DWORD PTR _t1$[esp+48]
    push    edx
    mov    edx, DWORD PTR _t1$[esp+56]
    push    eax
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 245  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 246  : 
; 247  :     // 紧缩带符号字 解包为 两组紧缩带符号双字
; 248  :     printf("md_unpack_mid4miw:\n");

    push    OFFSET FLAT:??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@ ; `string'
    call    _printf

; 249  :     t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);

    movq    mm0, MMWORD PTR -8+[esp+76]
    movq    MMWORD PTR _t0$[esp+76], mm0

; 250  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+76]
    mov    ecx, DWORD PTR _t0$[esp+80]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 40                    ; 00000028H

; 251  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43818

; 252  :     {
; 253  :         t2 = md_unpack_mid4miw(t1, t0);

    movq    mm0, MMWORD PTR _t0$[esp+48]
    movq    mm1, mm0
    movq    mm2, mm0
    mov    eax, esi
    punpcklwd mm2, mm1
    psrad    mm2, 16                    ; 00000010H
    movq    MMWORD PTR _t2$[esp+48], mm2
$L43816:
    movq    mm1, mm0
    dec    eax
    movq    mm2, MMWORD PTR _t1$[esp+48]
    punpckhwd mm2, mm1
    psrad    mm2, 16                    ; 00000010H
    movq    MMWORD PTR _t1$[esp+48], mm2
    jne    SHORT $L43816
$L43818:

; 254  :     }
; 255  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    edx, DWORD PTR _t2$[esp+48]
    mov    eax, DWORD PTR _t2$[esp+52]
    mov    ecx, DWORD PTR _t1$[esp+48]
    push    edx
    mov    edx, DWORD PTR _t1$[esp+56]
    push    eax
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 256  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 257  : 
; 258  :     // 两组紧缩带符号双字 交叉饱和紧缩为 紧缩带符号字
; 259  :     printf("md_pack_s_cross_miw4mid:\n");

    push    OFFSET FLAT:??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@ ; `string'
    call    _printf

; 260  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);

    mov    DWORD PTR -24+[esp+76], 991778        ; 000f2222H
    mov    DWORD PTR -24+[esp+80], 4369        ; 00001111H
    movq    mm0, MMWORD PTR -24+[esp+76]

; 261  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);

    mov    DWORD PTR -24+[esp+76], -8739        ; ffffddddH
    movq    MMWORD PTR _t1$[esp+76], mm0
    mov    DWORD PTR -24+[esp+80], -13108        ; ffffccccH

; 262  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    edx, DWORD PTR _t1$[esp+76]
    movq    MMWORD PTR -16+[esp+76], mm0
    movq    mm0, MMWORD PTR -24+[esp+76]
    movq    MMWORD PTR -8+[esp+76], mm0
    movq    MMWORD PTR _t2$[esp+76], mm0
    mov    eax, DWORD PTR _t2$[esp+76]
    mov    ecx, DWORD PTR _t2$[esp+80]
    push    eax
    mov    eax, DWORD PTR _t1$[esp+84]
    push    ecx
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 48                    ; 00000030H

; 263  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43824

; 264  :     {
; 265  :         t0 = md_pack_s_cross_miw4mid(t2, t1);

    movq    mm0, MMWORD PTR _t1$[esp+48]
    movq    mm1, mm0
    packssdw mm0, mm1
    movq    mm1, mm0
    movq    mm0, MMWORD PTR _t2$[esp+48]
    movq    mm2, mm0
    packssdw mm0, mm2
    punpcklwd mm0, mm1
    movq    MMWORD PTR _t0$[esp+48], mm0
$L43824:

; 266  :     }
; 267  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    ecx, DWORD PTR _t0$[esp+48]
    mov    edx, DWORD PTR _t0$[esp+52]
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 268  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 269  : 
; 270  :     // 两组紧缩无符号双字 交叉环绕紧缩为 紧缩无符号字
; 271  :     printf("md_pack_w_cross_muw4mud:\n");

    push    OFFSET FLAT:??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@ ; `string'
    call    _printf

; 272  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);

    movq    mm0, MMWORD PTR -16+[esp+68]
    movq    MMWORD PTR _t1$[esp+68], mm0

; 273  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);

    movq    mm0, MMWORD PTR -8+[esp+68]

; 274  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    edx, DWORD PTR _t1$[esp+68]
    movq    MMWORD PTR _t2$[esp+68], mm0
    mov    eax, DWORD PTR _t2$[esp+68]
    mov    ecx, DWORD PTR _t2$[esp+72]
    push    eax
    mov    eax, DWORD PTR _t1$[esp+76]
    push    ecx
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 40                    ; 00000028H

; 275  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43832

; 276  :     {
; 277  :         t0 = md_pack_w_cross_muw4mud(t2, t1);

    movq    mm0, MMWORD PTR _t1$[esp+48]
    or    eax, -1
    pslld    mm0, 16                    ; 00000010H
    mov    WORD PTR -32+[esp+48], ax
    mov    WORD PTR -32+[esp+50], di
    mov    WORD PTR -32+[esp+52], ax
    mov    WORD PTR -32+[esp+54], di
    movq    mm1, MMWORD PTR -32+[esp+48]
    movq    mm2, MMWORD PTR _t2$[esp+48]
    pand    mm2, mm1
    por    mm2, mm0
    movq    MMWORD PTR _t0$[esp+48], mm2
$L43832:

; 278  :     }
; 279  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    ecx, DWORD PTR _t0$[esp+48]
    mov    edx, DWORD PTR _t0$[esp+52]
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 280  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 281  : 
; 282  :     // 2x2矩阵转置.紧缩双字
; 283  :     printf("md_matrix_transpose_2x2_mmd:\n");

    push    OFFSET FLAT:??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ ; `string'
    call    _printf

; 284  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);

    movq    mm0, MMWORD PTR -16+[esp+68]
    movq    MMWORD PTR _t1$[esp+68], mm0

; 285  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);

    movq    mm0, MMWORD PTR -8+[esp+68]

; 286  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    edx, DWORD PTR _t1$[esp+68]
    movq    MMWORD PTR _t2$[esp+68], mm0
    mov    eax, DWORD PTR _t2$[esp+68]
    mov    ecx, DWORD PTR _t2$[esp+72]
    push    eax
    mov    eax, DWORD PTR _t1$[esp+76]
    push    ecx
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 40                    ; 00000028H

; 287  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43841

; 278  :     }
; 279  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, esi
$L43839:

; 288  :     {
; 289  :         md_matrix_transpose_2x2_mmd(t1, t2);

    movq    mm1, MMWORD PTR _t2$[esp+48]
    movq    mm0, MMWORD PTR _t1$[esp+48]
    movq    mm2, mm1
    dec    eax
    movq    mm3, mm0
    punpckldq mm3, mm2
    movq    MMWORD PTR _t1$[esp+48], mm3
    punpckhdq mm0, mm1
    movq    MMWORD PTR _t2$[esp+48], mm0
    jne    SHORT $L43839
$L43841:

; 290  :     }
; 291  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    ecx, DWORD PTR _t2$[esp+48]
    mov    edx, DWORD PTR _t2$[esp+52]
    mov    eax, DWORD PTR _t1$[esp+48]
    push    ecx
    mov    ecx, DWORD PTR _t1$[esp+56]
    push    edx
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 292  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 293  : 
; 294  :     // 复数与常量相乘(紧缩字->紧缩双字)
; 295  :     printf("md_complex_mul_c_mid4miw:\n");

    push    OFFSET FLAT:??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ ; `string'
    call    _printf

; 296  :     t1 = _mm_set_pi16(0,0, 1, 1);    // 1+i

    mov    eax, 1
    mov    WORD PTR -24+[esp+80], di
    mov    WORD PTR -24+[esp+76], ax
    mov    WORD PTR -24+[esp+78], ax
    mov    WORD PTR -24+[esp+82], di

; 297  :     t2 = _mm_set_pi16(3,-2, 2,3);    // 3+2i.    (1+i)*(3+2i) = 1+5i

    mov    eax, 3
    movq    mm0, MMWORD PTR -24+[esp+76]
    mov    WORD PTR -24+[esp+76], ax
    movq    MMWORD PTR _t1$[esp+76], mm0
    mov    WORD PTR -24+[esp+78], 2
    mov    WORD PTR -24+[esp+80], -2        ; fffffffeH
    mov    WORD PTR -24+[esp+82], ax
    movq    mm0, MMWORD PTR -24+[esp+76]

; 298  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    ecx, DWORD PTR _t1$[esp+76]
    movq    MMWORD PTR _t2$[esp+76], mm0
    mov    edx, DWORD PTR _t2$[esp+76]
    mov    eax, DWORD PTR _t2$[esp+80]
    push    edx
    mov    edx, DWORD PTR _t1$[esp+84]
    push    eax
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 48                    ; 00000030H

; 299  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43845

; 300  :     {
; 301  :         t0 = md_complex_mul_c_mid4miw(t1, t2);

    movq    mm1, MMWORD PTR _t2$[esp+48]
    movq    mm0, MMWORD PTR _t1$[esp+48]
    movq    mm2, mm0
    punpckldq mm0, mm2
    pmaddwd    mm0, mm1
    movq    MMWORD PTR _t0$[esp+48], mm0
$L43845:

; 302  :     }
; 303  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+48]
    mov    ecx, DWORD PTR _t0$[esp+52]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 304  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 305  : 
; 306  :     // 无符号紧缩字节的绝对差
; 307  :     printf("md_absolute_deviation_mub:\n");

    push    OFFSET FLAT:??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ ; `string'
    call    _printf

; 308  :     t1 = _mm_set_pi8(1,2,3,4,5,6,7,8);

    mov    al, 5
    mov    cl, 3
    mov    dl, 2
    mov    BYTE PTR -24+[esp+68], 8
    mov    BYTE PTR -24+[esp+69], 7
    mov    BYTE PTR -24+[esp+70], 6
    mov    BYTE PTR -24+[esp+71], al
    mov    BYTE PTR -24+[esp+72], 4
    mov    BYTE PTR -24+[esp+73], cl
    mov    BYTE PTR -24+[esp+74], dl
    mov    BYTE PTR -24+[esp+75], 1
    movq    mm0, MMWORD PTR -24+[esp+68]

; 309  :     t2 = _mm_set_pi8(8,7,6,5,4,3,2,1);

    mov    BYTE PTR -24+[esp+68], 1
    movq    MMWORD PTR _t1$[esp+68], mm0
    mov    BYTE PTR -24+[esp+69], dl
    mov    BYTE PTR -24+[esp+70], cl
    mov    BYTE PTR -24+[esp+71], 4
    mov    BYTE PTR -24+[esp+72], al
    mov    BYTE PTR -24+[esp+73], 6
    mov    BYTE PTR -24+[esp+74], 7
    mov    BYTE PTR -24+[esp+75], 8
    movq    mm0, MMWORD PTR -24+[esp+68]

; 310  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    ecx, DWORD PTR _t1$[esp+68]
    movq    MMWORD PTR _t2$[esp+68], mm0
    mov    edx, DWORD PTR _t2$[esp+68]
    mov    eax, DWORD PTR _t2$[esp+72]
    push    edx
    mov    edx, DWORD PTR _t1$[esp+76]
    push    eax
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 40                    ; 00000028H

; 311  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43852

; 312  :     {
; 313  :         t0 = md_absolute_deviation_mub(t1, t2);

    movq    mm0, MMWORD PTR _t1$[esp+48]
    movq    mm1, MMWORD PTR _t2$[esp+48]
    movq    mm2, mm0
    movq    mm3, mm1
    psubusb    mm3, mm2
    psubusb    mm0, mm1
    por    mm0, mm3
    movq    MMWORD PTR _t0$[esp+48], mm0
$L43852:

; 314  :     }
; 315  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+48]
    mov    ecx, DWORD PTR _t0$[esp+52]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 316  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 317  : 
; 318  :     // 带符号紧缩字的绝对差
; 319  :     printf("md_absolute_deviation_miw:\n");

    push    OFFSET FLAT:??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ ; `string'
    call    _printf

; 320  :     t1 = _mm_set_pi16(-1, 1, 3, 5);

    mov    WORD PTR -24+[esp+68], 5
    mov    WORD PTR -24+[esp+70], 3
    mov    WORD PTR -24+[esp+72], 1
    mov    WORD PTR -24+[esp+74], -1
    movq    mm0, MMWORD PTR -24+[esp+68]

; 321  :     t2 = _mm_set_pi16( 2, 2, 2, 2);

    mov    eax, 2
    movq    MMWORD PTR _t1$[esp+68], mm0
    mov    WORD PTR -24+[esp+68], ax
    mov    WORD PTR -24+[esp+70], ax
    mov    WORD PTR -24+[esp+72], ax
    mov    WORD PTR -24+[esp+74], ax

; 322  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);

    mov    ecx, DWORD PTR _t1$[esp+68]
    movq    mm0, MMWORD PTR -24+[esp+68]
    movq    MMWORD PTR _t2$[esp+68], mm0
    mov    edx, DWORD PTR _t2$[esp+68]
    mov    eax, DWORD PTR _t2$[esp+72]
    push    edx
    mov    edx, DWORD PTR _t1$[esp+76]
    push    eax
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 40                    ; 00000028H

; 323  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43859

; 324  :     {
; 325  :         t0 = md_absolute_deviation_miw(t1, t2);

    movq    mm2, MMWORD PTR _t2$[esp+48]
    movq    mm1, MMWORD PTR _t1$[esp+48]
    movq    mm0, mm2
    movq    mm3, mm1
    pcmpgtw    mm3, mm0
    movq    mm4, mm1
    pxor    mm4, mm0
    movq    mm0, mm4
    pand    mm0, mm3
    movq    mm3, mm0
    pxor    mm1, mm3
    pxor    mm2, mm0
    psubw    mm2, mm1
    movq    MMWORD PTR _t0$[esp+48], mm2
$L43859:

; 326  :     }
; 327  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+48]
    mov    ecx, DWORD PTR _t0$[esp+52]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 328  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 329  : 
; 330  :     // 带符号紧缩字的绝对值
; 331  :     printf("md_abs_miw4miw:\n");

    push    OFFSET FLAT:??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@ ; `string'
    call    _printf

; 332  :     t0 = _mm_set_pi16(-1, 1, 3, -5);

    mov    WORD PTR -24+[esp+68], -5        ; fffffffbH
    mov    WORD PTR -24+[esp+70], 3
    mov    WORD PTR -24+[esp+72], 1
    mov    WORD PTR -24+[esp+74], -1
    movq    mm0, MMWORD PTR -24+[esp+68]
    movq    MMWORD PTR _t0$[esp+68], mm0

; 333  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);

    mov    edx, DWORD PTR _t0$[esp+68]
    mov    eax, DWORD PTR _t0$[esp+72]
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 32                    ; 00000020H

; 334  :     for(i=0; i<cnt; ++i)

    cmp    esi, edi
    jle    SHORT $L43865

; 335  :     {
; 336  :         t1 = md_abs_miw(t0);

    movq    mm1, MMWORD PTR _t0$[esp+48]
    movq    mm0, mm1
    psraw    mm0, 15                    ; 0000000fH
    movq    mm2, mm0
    pxor    mm1, mm0
    psubsw    mm1, mm2
    movq    MMWORD PTR _t1$[esp+48], mm1
$L43865:

; 337  :     }
; 338  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);

    mov    ecx, DWORD PTR _t1$[esp+48]
    mov    edx, DWORD PTR _t1$[esp+52]
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 339  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 340  : 
; 341  :     // 将带符号紧缩字限制在[iLow,iHigh]区间
; 342  :     printf("md_clamp_miw:\n");

    push    OFFSET FLAT:??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@ ; `string'
    call    _printf

; 343  :     t0 = _mm_set_pi16(-15, 1, 254, 257);

    mov    edi, 254                ; 000000feH
    mov    WORD PTR -24+[esp+68], 257        ; 00000101H
    mov    WORD PTR -24+[esp+70], di
    mov    WORD PTR -24+[esp+72], 1
    mov    WORD PTR -24+[esp+74], -15        ; fffffff1H
    movq    mm0, MMWORD PTR -24+[esp+68]
    movq    MMWORD PTR _t0$[esp+68], mm0

; 344  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+68]
    mov    ecx, DWORD PTR _t0$[esp+72]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 32                    ; 00000020H

; 345  :     for(i=0; i<cnt; ++i)

    test    esi, esi
    jle    SHORT $L43871

; 346  :     {
; 347  :         t1 = md_clamp_miw(t0, -1, 255);

    or    dx, -1
    mov    ax, -257                ; fffffeffH
    movd    mm0, dx
    mov    cx, 32512                ; 00007f00H
    movq    mm1, mm0
    mov    dx, -32768                ; ffff8000H
    punpcklwd mm1, mm0
    movq    mm0, mm1
    punpcklwd mm1, mm0
    movd    mm0, ax
    movq    mm2, mm0
    punpcklwd mm2, mm0
    movq    mm0, mm2
    punpcklwd mm2, mm0
    movd    mm0, cx
    movq    mm3, mm0
    punpcklwd mm3, mm0
    movq    mm0, mm3
    punpcklwd mm3, mm0
    movd    mm0, dx
    movq    mm4, mm0
    punpcklwd mm4, mm0
    movq    mm0, mm4
    punpcklwd mm4, mm0
    movq    mm0, MMWORD PTR _t0$[esp+48]
    paddw    mm0, mm4
    paddusw    mm0, mm3
    psubusw    mm0, mm2
    paddw    mm0, mm1
    movq    MMWORD PTR _t1$[esp+48], mm0
$L43871:

; 348  :     }
; 349  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);

    mov    eax, DWORD PTR _t1$[esp+48]
    mov    ecx, DWORD PTR _t1$[esp+52]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 350  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 351  : 
; 352  :     // 将无符号紧缩字限制在[uLow,uHigh]区间
; 353  :     printf("md_clamp_muw:\n");

    push    OFFSET FLAT:??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@ ; `string'
    call    _printf

; 354  :     t0 = _mm_set_pi16(1, 254, 257, 32769U);

    mov    WORD PTR -24+[esp+68], -32767        ; ffff8001H
    mov    WORD PTR -24+[esp+70], 257        ; 00000101H
    mov    WORD PTR -24+[esp+72], di
    mov    WORD PTR -24+[esp+74], 1
    movq    mm0, MMWORD PTR -24+[esp+68]
    movq    MMWORD PTR _t0$[esp+68], mm0

; 355  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);

    mov    edx, DWORD PTR _t0$[esp+68]
    mov    eax, DWORD PTR _t0$[esp+72]
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    call    _printf
    add    esp, 32                    ; 00000020H

; 356  :     for(i=0; i<cnt; ++i)

    test    esi, esi
    jle    SHORT $L43877

; 357  :     {
; 358  :         t1 = md_clamp_muw(t0, 16, 255);

    mov    cx, 16                    ; 00000010H
    mov    dx, -240                ; ffffff10H
    movd    mm0, cx
    mov    ax, -256                ; ffffff00H
    movq    mm1, mm0
    punpcklwd mm1, mm0
    movq    mm0, mm1
    punpcklwd mm1, mm0
    movd    mm0, dx
    movq    mm2, mm0
    punpcklwd mm2, mm0
    movq    mm0, mm2
    punpcklwd mm2, mm0
    movd    mm0, ax
    movq    mm3, mm0
    punpcklwd mm3, mm0
    movq    mm0, mm3
    punpcklwd mm3, mm0
    movq    mm0, MMWORD PTR _t0$[esp+48]
    paddusw    mm0, mm3
    psubusw    mm0, mm2
    paddw    mm0, mm1
    movq    MMWORD PTR _t1$[esp+48], mm0
$L43877:

; 359  :     }
; 360  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);

    mov    ecx, DWORD PTR _t1$[esp+48]
    mov    edx, DWORD PTR _t1$[esp+52]
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 361  :     printf("\n");

    push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
    call    _printf

; 362  : 
; 363  :     // 返回常数:0
; 364  :     printf("md_setzero_mmq:\t");

    push    OFFSET FLAT:??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@ ; `string'
    call    _printf

; 365  :     t0 = md_setzero_mmq();

    pxor    mm0, mm0
    movq    MMWORD PTR -24+[esp+68], mm0
    movq    mm1, mm0
    pxor    mm0, mm1
    movq    MMWORD PTR _t0$[esp+68], mm0

; 366  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+68]
    mov    ecx, DWORD PTR _t0$[esp+72]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 367  : 
; 368  :     // 返回常数:全1
; 369  :     printf("md_setfull_mmq:\t");

    push    OFFSET FLAT:??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@ ; `string'
    call    _printf

; 370  :     t0 = md_setfull_mmq();

    movq    mm0, MMWORD PTR -24+[esp+84]
    movq    mm1, mm0
    pcmpeqb    mm0, mm1
    movq    MMWORD PTR _t0$[esp+84], mm0

; 371  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    edx, DWORD PTR _t0$[esp+84]
    mov    eax, DWORD PTR _t0$[esp+88]
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 372  : 
; 373  :     // 返回常数:每个紧缩字节为1
; 374  :     printf("md_set_1_mib:\t");

    push    OFFSET FLAT:??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@ ; `string'
    call    _printf

; 375  :     t0 = md_set_1_mib();

    movq    mm0, MMWORD PTR -24+[esp+100]
    movq    mm1, mm0
    movq    mm2, mm0
    pcmpeqb    mm2, mm1
    psubb    mm0, mm2
    movq    MMWORD PTR _t0$[esp+100], mm0

; 376  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    ecx, DWORD PTR _t0$[esp+100]
    mov    edx, DWORD PTR _t0$[esp+104]
    push    ecx
    push    edx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf
    add    esp, 64                    ; 00000040H

; 377  : 
; 378  :     // 返回常数:每个紧缩字为pow(2,n)-1
; 379  :     printf("md_set_pow2n_sub1_miw:\t");

    push    OFFSET FLAT:??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@ ; `string'
    call    _printf

; 380  :     t0 = md_set_pow2n_sub1_miw(8);

    movq    mm0, MMWORD PTR -24+[esp+52]
    movq    mm1, mm0
    pcmpeqb    mm0, mm1
    psrlw    mm0, 8
    movq    MMWORD PTR _t0$[esp+52], mm0

; 381  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    eax, DWORD PTR _t0$[esp+52]
    mov    ecx, DWORD PTR _t0$[esp+56]
    push    eax
    push    ecx
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf

; 382  : 
; 383  :     // 返回常数:每个紧缩字为-pow(2,n)
; 384  :     printf("md_set_neg_pow2n_miw:\t");

    push    OFFSET FLAT:??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@ ; `string'
    call    _printf

; 385  :     t0 = md_set_neg_pow2n_miw(15);

    movq    mm0, MMWORD PTR -24+[esp+68]
    movq    mm1, mm0
    pcmpeqb    mm0, mm1
    psllw    mm0, 15                    ; 0000000fH
    movq    MMWORD PTR _t0$[esp+68], mm0

; 386  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);

    mov    edx, DWORD PTR _t0$[esp+68]
    mov    eax, DWORD PTR _t0$[esp+72]
    push    edx
    push    eax
    push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    call    _printf
    add    esp, 32                    ; 00000020H

; 387  : 
; 388  : }

    pop    edi
    pop    esi
    mov    esp, ebp
    pop    ebp
    ret    0
?doTest@@YAXH@Z ENDP                    ; doTest
_TEXT    ENDS
PUBLIC    _main
EXTRN    _rand:NEAR
; Function compile flags: /Ogty
;    COMDAT _main
_TEXT    SEGMENT
; _main -- compiled body of the C main() function echoed in the source lines below.
; Calls doTest once with an argument of 1 or 2 derived from rand(), then returns 0.
_main    PROC NEAR                    ; COMDAT

; 392  :     doTest((rand()&1) + 1);    // use a random number as the loop count so the compiler cannot optimize the loop away

; rand()'s result is taken from eax; keeping only bit 0 and adding one yields 1 or 2.
    call    _rand
    and    eax, 1
    inc    eax
; The single 4-byte argument is pushed for doTest, and the caller removes it
; from the stack again right after the call returns.
    push    eax
    call    ?doTest@@YAXH@Z                ; doTest
    add    esp, 4

; 393  :     return 0;

; eax carries the value returned by main; xor-ing it with itself clears it to 0.
    xor    eax, eax

; 394  : }

    ret    0
_main    ENDP
_TEXT    ENDS
END

 

posted on 2012-04-26 21:59  zyl910  阅读(2354)  评论(0)  编辑  收藏  举报