// Very rough, simplified colour distance (RGBA to 8-bit).
// Of limited practical use since real camera data is 8-bit, but written because
// simulating the 32-bit images was too slow in plain Pascal.
{$ifdef iacamarker}
{$info iacamarker on in production code!}
{$endif}
const
  // Byte-shuffle control loaded into ymm10 and used with vpshufb in
  // asmColourDistance. One row per 128-bit lane: within a lane it collects
  // byte 0 of each of the four dwords, then byte 1, byte 2 and byte 3, so the
  // four 4-byte pixels of a lane become four dwords holding one byte plane each.
  splitsh6 : array[0..31] of byte = (
    $00,$04,$08,$0C,  $01,$05,$09,$0D,  $02,$06,$0A,$0E,  $03,$07,$0B,$0F,
    $00,$04,$08,$0C,  $01,$05,$09,$0D,  $02,$06,$0A,$0E,  $03,$07,$0B,$0F);

  // Dword-permute control loaded into ymm9 and used with vpermd in
  // asmColourDistance. Read as eight little-endian dword indices it is
  // 0,4,1,5,2,6,3,7: the matching dwords of the low and high lane are placed
  // next to each other, so every qword of the result holds one byte plane.
  permto8 : array[0..31] of byte = (
    $00,$00,$00,$00,  $04,$00,$00,$00,
    $01,$00,$00,$00,  $05,$00,$00,$00,
    $02,$00,$00,$00,  $06,$00,$00,$00,
    $03,$00,$00,$00,  $07,$00,$00,$00);
// http://threadlocalmutex.com/?p=8
//
// asmColourDistance - reduce two 4-byte-per-pixel (RGBA) images to one 8-bit
// distance image, split-channel variant (AVX2).
//
// For every pixel, each of the four bytes of both inputs is halved (>> 1), the
// absolute per-byte difference is taken, and the four differences are summed
// with signed saturation, so every output byte is in 0..127.
// NOTE: all four byte planes are summed, i.e. the fourth (alpha) byte takes
// part in the distance as well.
//
// Calling convention: Win64.
//   rcx       = prgba1     first source image
//   rdx       = prgba2     second source image
//   r8        = pdest8     destination, one byte per pixel
//   r9d       = countinner pixels per row; only whole blocks of 32 pixels are
//                          processed (countinner div 32 blocks per row)
//   [rsp+$28] = countouter number of rows
// Only the low 32 bits of both counts are read (integer is assumed to be
// 32-bit here); the upper halves are not defined by the ABI.
// The three pointers simply advance block by block; nothing is added between
// rows, so the data is treated as one contiguous run of
// countouter * (countinner div 32) blocks.
// If countouter <= 0 or countinner < 32 nothing is read or written.
// Unaligned loads/stores are used, so the buffers and the constant tables do
// not have to be 32-byte aligned.
//
// Preserved as callee-saved: rbx (only written by the IACA markers) and
// xmm6, xmm7, xmm9, xmm10, xmm11. Scratch: rax, r9, r11, ymm0-ymm5, flags.
procedure asmColourDistance(prgba1,prgba2 : pbyte;pdest8:pbyte;countinner,countouter: integer);assembler; nostackframe;
asm
        // ---- argument checks; nothing has been modified yet, so a plain ret is enough
        mov     eax, DWORD PTR [rsp+$28]        // countouter: above return address + 32-byte shadow space
        test    eax, eax
        jle     @exit                           // no rows
        cmp     r9d, 32
        jl      @exit                           // less than one 32-pixel block per row
        // convert the pixel count into inner-loop iterations (32 pixels each)
        shr     r9d, 5

        // ---- prologue
        // On entry rsp = 8 (mod 16); after subtracting 88 it is 16-byte
        // aligned, which the vmovaps saves below rely on.
        mov     QWORD PTR [rsp+8], rbx          // first shadow slot is ours to use
        sub     rsp, 88                         // 5 * 16 bytes of xmm saves + 8 bytes padding
        vmovaps XMMWORD PTR [rsp],    xmm6
        vmovaps XMMWORD PTR [rsp+16], xmm7
        vmovaps XMMWORD PTR [rsp+32], xmm9
        vmovaps XMMWORD PTR [rsp+48], xmm10
        vmovaps XMMWORD PTR [rsp+64], xmm11

        // ---- loop-invariant constants
        mov     r11d, $7F7F7F7F
        vmovd   xmm0, r11d
        vpbroadcastd ymm11, xmm0                // ymm11 = $7F in every byte
        // unaligned loads: do not depend on the alignment of the typed constants
        vmovdqu ymm9,  [rip+permto8]            // vpermd control
        vmovdqu ymm10, [rip+splitsh6]           // vpshufb control

@louter:
        mov     r11d, r9d                       // r11d = blocks left in this row
        align 16
@linner:
{$ifdef iacamarker}
        mov ebx, 111 // Start marker bytes
        db $64, $67, $90 // Start marker bytes
{$endif}
        // Each of the four loads handles 8 pixels (32 bytes) of both images:
        //   word shift right by 1 + mask $7F = per-byte halving,
        //   subtract + abs = per-byte absolute difference (0..127),
        //   vpshufb + vpermd = one byte plane per qword (planes 0,1,2,3).
        // ymm4..ymm7 collect planes 0..3; load k fills qword k-1 of each.

        // first load
        vmovdqu ymm0, [rcx]
        vpsrlw  ymm0, ymm0, 1
        vpand   ymm0, ymm0, ymm11
        vmovdqu ymm1, [rdx]
        vpsrlw  ymm1, ymm1, 1
        vpand   ymm1, ymm1, ymm11
        vpsubsb ymm1, ymm1, ymm0
        vpabsb  ymm0, ymm1                      // absolute difference
        vpshufb ymm0, ymm0, ymm10               // arrange dwords together
        vpermd  ymm4, ymm9, ymm0                // gather qwords together
        // ymm4 already has plane 0 in qword 0; rotate planes 1..3 into qword 0
        vpermq  ymm5, ymm4, 1+2*4+3*16+0*64
        vpermq  ymm6, ymm4, 2+1*4+3*16+0*64
        vpermq  ymm7, ymm4, 3+1*4+2*16+0*64

        // second load
        vmovdqu ymm0, [rcx+32]
        vpsrlw  ymm0, ymm0, 1
        vpand   ymm0, ymm0, ymm11
        vmovdqu ymm1, [rdx+32]
        vpsrlw  ymm1, ymm1, 1
        vpand   ymm1, ymm1, ymm11
        vpsubsb ymm1, ymm1, ymm0
        vpabsb  ymm0, ymm1
        vpshufb ymm0, ymm0, ymm10
        vpermd  ymm2, ymm9, ymm0
        // move each plane into qword 1 and blend it in (dwords 2 and 3)
        vpermq  ymm1, ymm2, 1+0*4+3*16+2*64
        vpblendd ymm4, ymm4, ymm1, 4+8
        vpblendd ymm5, ymm5, ymm2, 4+8          // plane 1 is already in qword 1
        vpermq  ymm0, ymm2, 3+2*4+1*16+0*64
        vpblendd ymm6, ymm6, ymm0, 4+8
        vpermq  ymm2, ymm2, 1+3*4+2*16+0*64
        vpblendd ymm7, ymm7, ymm2, 4+8

        // third load
        vmovdqu ymm0, [rcx+64]
        vpsrlw  ymm0, ymm0, 1
        vpand   ymm0, ymm0, ymm11
        vmovdqu ymm1, [rdx+64]
        vpsrlw  ymm1, ymm1, 1
        vpand   ymm1, ymm1, ymm11
        vpsubsb ymm1, ymm1, ymm0
        vpabsb  ymm0, ymm1
        vpshufb ymm0, ymm0, ymm10
        vpermd  ymm2, ymm9, ymm0
        // move each plane into qword 2 and blend it in (dwords 4 and 5)
        vpermq  ymm1, ymm2, 1+2*4+0*16+3*64
        vpblendd ymm4, ymm4, ymm1, 16+32
        vpermq  ymm0, ymm2, 3+2*4+1*16+0*64
        vpblendd ymm5, ymm5, ymm0, 16+32
        vpblendd ymm6, ymm6, ymm2, 16+32        // plane 2 is already in qword 2
        vpermq  ymm1, ymm2, 2+1*4+3*16+0*64
        vpblendd ymm7, ymm7, ymm1, 16+32

        // fourth load
        vmovdqu ymm0, [rcx+96]
        vpsrlw  ymm0, ymm0, 1
        vpand   ymm0, ymm0, ymm11
        vmovdqu ymm1, [rdx+96]
        vpsrlw  ymm1, ymm1, 1
        vpand   ymm1, ymm1, ymm11
        vpsubsb ymm1, ymm1, ymm0
        vpabsb  ymm0, ymm1
        vpshufb ymm0, ymm0, ymm10
        vpermd  ymm2, ymm9, ymm0
        // move each plane into qword 3 and blend it in (dwords 6 and 7)
        vpermq  ymm1, ymm2, 1+2*4+3*16+0*64
        vpblendd ymm4, ymm4, ymm1, 64+128
        vpermq  ymm0, ymm2, 3+2*4+0*16+1*64
        vpblendd ymm5, ymm5, ymm0, 64+128
        vpermq  ymm3, ymm2, 1+3*4+0*16+2*64
        vpblendd ymm6, ymm6, ymm3, 64+128
        vpblendd ymm7, ymm7, ymm2, 64+128       // plane 3 is already in qword 3

        // ymm4..ymm7 now hold the absolute differences of the four byte planes
        // for 32 consecutive pixels. We could do something more interesting,
        // but for now just add all four together (saturating at 127).
        vpaddsb ymm4, ymm4, ymm5
        vpaddsb ymm6, ymm6, ymm7
        vpaddsb ymm4, ymm4, ymm6
        vmovdqu [r8], ymm4
{$ifdef iacamarker}
        mov ebx, 222 // End marker bytes
        db $64, $67, $90 // End marker bytes
{$endif}
        add     rdx, 128                        // 32 pixels * 4 bytes
        add     rcx, 128
        add     r8, 32                          // 32 pixels * 1 byte
        dec     r11d
        jne     @linner
        dec     eax
        jne     @louter

        // ---- epilogue
        vzeroupper
        vmovaps xmm6,  XMMWORD PTR [rsp]
        vmovaps xmm7,  XMMWORD PTR [rsp+16]
        vmovaps xmm9,  XMMWORD PTR [rsp+32]
        vmovaps xmm10, XMMWORD PTR [rsp+48]
        vmovaps xmm11, XMMWORD PTR [rsp+64]
        add     rsp, 88
        mov     rbx, QWORD PTR [rsp+8]
@exit:
        ret
end;