I've found some time today to work on that problem.
First I changed the ASM code. I've checked how M$ VS 2017 handles the _mm_set_epi32 intrinsic. This is the
new routine:
// XMVectorSetBinaryConstant
// Builds a 4-lane vector from the four integer arguments: each lane is meant to
// become 1.0f when bit 0 of the matching argument is set and 0.0f otherwise
// (lane 0 <- C0, lane 1 <- C1, lane 2 <- C2, lane 3 <- C3).
// Hand-written i386 SSE2 translation of the _mm_set_epi32 based C++ routine
// quoted in the trailing comments.
// Inputs : C0..C3 passed by reference (constref).
// Output : TXMVECTOR function result.
// Uses   : XMM0..XMM3 as scratch; g_XMOne is declared outside this routine
//          (presumably four 1.0f lanes - TODO confirm).
function XMVectorSetBinaryConstant(constref C0: UINT32; constref C1: UINT32; constref C2: UINT32; constref C3: UINT32): TXMVECTOR;
assembler;
const
// Per-lane integer 1, used both to isolate bit 0 and as the compare value.
g_vMask1: TXMVECTORU32 = (u: (1, 1, 1, 1));
asm
// Move the parms to a vector
// __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0);
// NOTE(review): in the generated i386 listing C0..C2 arrive as pointers in
// eax/edx/ecx, so [C0]..[C2] dereference them. C3 is the only parameter on the
// stack (ebp+12); because it is constref that slot holds the ADDRESS of C3, and
// the next line is emitted as "movd 12(%ebp),%xmm0", i.e. it loads the pointer,
// not the value. Presumably it needs a two-step load once a register is free
// (mov eax, C3 / movd xmm0, dword ptr [eax]) - TODO confirm.
movd xmm0,dword ptr [C3]
movd xmm1,dword ptr[C2]
movd xmm2,dword ptr[C1]
movd xmm3,dword ptr[C0]
// Interleave the four scalars: (C0,C2) and (C1,C3) first, then merge both
// pairs so the lanes end up in the order C0,C1,C2,C3.
punpckldq xmm3,xmm1
punpckldq xmm2,xmm0
punpckldq xmm3,xmm2 // XMM3 = vTemp
// Mask off the low bits
PAND XMM3, [g_vMask1] // vTemp = _mm_and_si128(vTemp,g_vMask1);
// 0xFFFFFFFF on true bits
PCMPEQD XMM3, [g_vMask1] // vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
// 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f
PAND XMM3, [g_XMOne] // vTemp = _mm_and_si128(vTemp,g_XMOne);
// NOTE(review): the listing reports "$result located at ebp+8, size=OS_32",
// so the slot appears to hold a 4-byte pointer to the caller's TXMVECTOR, not
// the vector itself. The store below is emitted as "movups %xmm3,8(%ebp)": it
// writes 16 bytes over that pointer, over C3 and over the caller's stack above
// the two arguments (the routine ends with "ret $8"), and never reaches the
// real result buffer. Presumably the pointer has to be loaded first
// (mov eax, result / movups [eax], xmm3) - TODO confirm.
MOVUPS TXMVECTOR([result]), XMM3// return _mm_castsi128_ps(vTemp);
end;
When I now set a breakpoint at the "movd xmm1,dword ptr[C2]" line, I see in the debugger that the value of XMM0 is not what it should be.
Now I looked at the .s file.
DIRECTX.MATH_$$_XMVECTORSETBINARYCONSTANT$LONGWORD$LONGWORD$LONGWORD$LONGWORD$$TXMVECTOR:
.Lc128:
.Ll314:
# [2903] g_vMask1: TXMVECTORU32 = (u: (1, 1, 1, 1));
pushl %ebp
.Lc130:
.Lc131:
movl %esp,%ebp
.Lc132:
# Var C0 located in register eax
# Var C1 located in register edx
# Var C2 located in register ecx
# Var C3 located at ebp+12, size=OS_32
# Var $result located at ebp+8, size=OS_32
.Ll315:
# [2907] movd xmm0,dword ptr [C3]
movd 12(%ebp),%xmm0
.Ll316:
# [2908] movd xmm1,dword ptr[C2]
movd (%ecx),%xmm1
.Ll317:
# [2909] movd xmm2,dword ptr[C1]
movd (%edx),%xmm2
.Ll318:
# [2910] movd xmm3,dword ptr[C0]
movd (%eax),%xmm3
.Ll319:
# [2911] punpckldq xmm3,xmm1
punpckldq %xmm1,%xmm3
.Ll320:
# [2912] punpckldq xmm2,xmm0
punpckldq %xmm0,%xmm2
.Ll321:
# [2913] punpckldq xmm3,xmm2 // XMM3 = vTemp
punpckldq %xmm2,%xmm3
.Ll322:
# [2915] PAND XMM3, [g_vMask1] // vTemp = _mm_and_si128(vTemp,g_vMask1);
pand TC_$DIRECTX.MATH$_$XMVECTORSETBINARYCONSTANT$crcD1D7FBA5_$$_G_VMASK1,%xmm3
.Ll323:
# [2917] PCMPEQD XMM3, [g_vMask1] // vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
pcmpeqd TC_$DIRECTX.MATH$_$XMVECTORSETBINARYCONSTANT$crcD1D7FBA5_$$_G_VMASK1,%xmm3
.Ll324:
# [2919] PAND XMM3, [g_XMOne] // vTemp = _mm_and_si128(vTemp,g_XMOne);
pand TC_$DIRECTX.MATH_$$_G_XMONE,%xmm3
.Ll325:
# [2920] MOVUPS TXMVECTOR([result]), XMM3// return _mm_castsi128_ps(vTemp);
movups %xmm3,8(%ebp)
.Ll326:
# [2921] end;
leave
ret $8
.Lc129:
.Lt14:
.Ll327:
C3 is located on the stack, so I changed the function to
function XMVectorSetBinaryConstant(constref C0: UINT32; constref C1: UINT32; constref C2: UINT32; const C3: UINT32): TXMVECTOR;
assembler;
The .s output is
DIRECTX.MATH_$$_XMVECTORSETBINARYCONSTANT$LONGWORD$LONGWORD$LONGWORD$LONGWORD$$TXMVECTOR:
.Lc128:
.Ll314:
# [2903] g_vMask1: TXMVECTORU32 = (u: (1, 1, 1, 1));
pushl %ebp
.Lc130:
.Lc131:
movl %esp,%ebp
.Lc132:
# Var C0 located in register eax
# Var C1 located in register edx
# Var C2 located in register ecx
# Var C3 located at ebp+12, size=OS_32
# Var $result located at ebp+8, size=OS_32
.Ll315:
# [2907] movd xmm0,dword ptr [C3]
movd 12(%ebp),%xmm0
.Ll316:
# [2908] movd xmm1,dword ptr[C2]
movd (%ecx),%xmm1
.Ll317:
# [2909] movd xmm2,dword ptr[C1]
movd (%edx),%xmm2
.Ll318:
# [2910] movd xmm3,dword ptr[C0]
movd (%eax),%xmm3
.Ll319:
# [2911] punpckldq xmm3,xmm1
punpckldq %xmm1,%xmm3
.Ll320:
# [2912] punpckldq xmm2,xmm0
punpckldq %xmm0,%xmm2
.Ll321:
# [2913] punpckldq xmm3,xmm2 // XMM3 = vTemp
punpckldq %xmm2,%xmm3
.Ll322:
# [2915] PAND XMM3, [g_vMask1] // vTemp = _mm_and_si128(vTemp,g_vMask1);
pand TC_$DIRECTX.MATH$_$XMVECTORSETBINARYCONSTANT$crcD1D7FBA5_$$_G_VMASK1,%xmm3
.Ll323:
# [2917] PCMPEQD XMM3, [g_vMask1] // vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1);
pcmpeqd TC_$DIRECTX.MATH$_$XMVECTORSETBINARYCONSTANT$crcD1D7FBA5_$$_G_VMASK1,%xmm3
.Ll324:
# [2919] PAND XMM3, [g_XMOne] // vTemp = _mm_and_si128(vTemp,g_XMOne);
pand TC_$DIRECTX.MATH_$$_G_XMONE,%xmm3
.Ll325:
# [2920] MOVUPS TXMVECTOR([result]), XMM3// return _mm_castsi128_ps(vTemp);
movups %xmm3,8(%ebp)
.Ll326:
# [2921] end;
leave
ret $8
.Lc129:
.Lt14:
.Ll327:
As you can see, the output is the same (with `const` instead of `constref`, the stack slot at ebp+12 now holds the value of C3 itself rather than its address, so the same `movd` reads the right data). More importantly, the value in XMM0 is now valid.
The only remaining problem is that the result is still not valid.
If I change the routine so that the result is also in a register and not on the stack, everything works perfectly (that is, I pass a TXMVector as input instead of the four UINT32, so both the in-var and the out-var are in registers).
Does this seem to be a problem when the result lies on the stack?
And I have found this post:
https://forum.lazarus.freepascal.org/index.php?topic=29097.0
This is the bug tracker entry:
https://bugs.freepascal.org/view.php?id=32710#c104254
So I think the problem is the same on Windows?