_mm_maddubs_pi16

Microsoft Specific

Emits the Supplemental Streaming SIMD Extensions 3 (SSSE3) instruction pmaddubsw. This instruction multiplies and adds integers.

__m64 _mm_maddubs_pi16( 
   __m64 a,
   __m64 b
);

Parameters

  • [in] a
    A 64-bit parameter that contains eight 8-bit unsigned integers.

  • [in] b
    A 64-bit parameter that contains eight 8-bit signed integers.

Return value

A 64-bit result that contains four 16-bit signed integers, where each result element represents the saturated sum of adjacent SIMD products. This can be expressed with the following equations:

r0 := SATURATE_16((a0 * b0) + (a1 * b1))
r1 := SATURATE_16((a2 * b2) + (a3 * b3))
r2 := SATURATE_16((a4 * b4) + (a5 * b5))
r3 := SATURATE_16((a6 * b6) + (a7 * b7))

Requirements

Intrinsic: _mm_maddubs_pi16

Architecture: x86, x64

Header file <tmmintrin.h>

Remarks

r0-r3 are the sequentially ordered 16-bit components of return value r. r0 indicates the least significant 16 bits.

a0-a7 and b0-b7 are the sequentially ordered 8-bit components of parameters a and b, respectively. a0 and b0 are the least significant 8 bits. Parameter a contains unsigned bytes. Parameter b contains signed bytes.

SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x))

Before you use this intrinsic, software must ensure that the underlying processor supports the instruction.

Example

#include <stdio.h>
#include <tmmintrin.h>

/* Scalar reference for SATURATE_16: clamps x to the range [-32768, 32767]. */
static int saturate_16(int x)
{
    if (x > 32767) {
        return 32767;
    }
    if (x < -32768) {
        return -32768;
    }
    return x;
}

/*
 * Demonstrates _mm_maddubs_pi16 by comparing its four 16-bit results with a
 * scalar reference computed from the same inputs.
 *
 * Each result element i is the saturated sum of the products of byte pairs
 * (2*i) and (2*i + 1), where a holds unsigned bytes and b holds signed bytes.
 */
int main ()
{
    /* Input bytes, least significant element first. */
    static const unsigned char a_bytes[8] = {   1,    1, 255, 255,  2, 16,    0,   3 };
    static const signed char   b_bytes[8] = { 127, -127, 127, 127, -4,  2, -128, -15 };

    __m64 a, b, expected;
    int i;

    /* Load the test vectors into the two operands. */
    for (i = 0; i < 8; i++) {
        a.m64_u8[i] = a_bytes[i];
        b.m64_i8[i] = b_bytes[i];
    }

    /* Scalar reference: multiply adjacent byte pairs, add, then saturate. */
    for (i = 0; i < 4; i++) {
        int even_product = a.m64_u8[2 * i] * b.m64_i8[2 * i];
        int odd_product = a.m64_u8[2 * i + 1] * b.m64_i8[2 * i + 1];

        expected.m64_i16[i] = saturate_16(even_product + odd_product);
    }

    __m64 res = _mm_maddubs_pi16(a, b);

    printf_s("Res0 should be %d: %d\nRes1 should be %d: %d\n",
                expected.m64_i16[0], res.m64_i16[0], expected.m64_i16[1], res.m64_i16[1]);
    printf_s("Res2 should be %d: %d\nRes3 should be %d: %d\n",
                expected.m64_i16[2], res.m64_i16[2], expected.m64_i16[3], res.m64_i16[3]);

    /* Clear the MMX state (EMMS) now that the __m64 work is finished. */
    _mm_empty();

    return 0;
}
Res0 should be 0: 0
Res1 should be 32767: 32767
Res2 should be 24: 24
Res3 should be -45: -45

See Also

Reference

Compiler Intrinsics