function UTF8Len_aligned16(p16: pchar; BlockCount: PtrInt):PtrInt;assembler;
{ Counts UTF-8 continuation bytes (bit pattern 10xxxxxx, i.e. bytes that are
  NOT the first byte of a codepoint) in BlockCount consecutive 16-byte blocks
  starting at p16, using SSE2.

  p16        - start of the data; must be 16-byte aligned because the blocks
               are loaded with MOVDQA.
  BlockCount - number of 16-byte blocks to scan. Zero or negative values scan
               nothing and return 0.
  Result     - number of continuation bytes found. NOTE(review): the caller
               presumably subtracts this from the byte count to obtain the
               number of codepoints - confirm at the call site.

  Target: i386, Intel assembler syntax. The code relies on BlockCount arriving
  in edx and on p16 resolving to a register (presumably eax under the register
  calling convention - confirm). Only eax, ecx, edx, xmm0..xmm5 and the flags
  are modified, so no register has to be saved on the stack. }
Label
  NextBlock, Done;
asm
  { xmm1 = running totals, kept in the low dword of each 64-bit half }
  PXOR xmm1, xmm1
  { Nothing to scan: skip the loop. Without this check a BlockCount of 0
    would read one block anyway and then wrap the counter around. }
  TEST edx, edx
  JLE Done
  { xmm0 = 16 x $00, the reference operand for PSADBW }
  PXOR xmm0, xmm0
  { xmm2 = 16 x $80: all ones -> $0001 per word -> $01 per byte -> $80 per byte }
  PCMPEQW xmm2, xmm2
  PSRLW xmm2, 15
  PACKUSWB xmm2, xmm2
  PSLLQ xmm2, 7
  { xmm3 = 16 x $FF, used to invert a block }
  PCMPEQW xmm3, xmm3
NextBlock:
  { On big data a prefetch increased the speed from 13.x to 15.x times, }
  { on small data it decreased the speed, so it is left disabled. }
  //PREFETCHNTA [p16+256]
  { xmm4 = next 16 bytes }
  MOVDQA xmm4, [p16]
  { xmm5 = not xmm4 }
  MOVDQA xmm5, xmm4
  PANDN xmm5, xmm3
  { Bit 0 of each byte of xmm5 = inverted bit 6 of the source byte. Bits that }
  { spill in from the neighbouring byte are discarded by the PAND below. }
  PSRLQ xmm5, 6
  { Bit 0 of each byte of xmm4 = bit 7 of the source byte, all other bits 0 }
  PAND xmm4, xmm2
  PSRLQ xmm4, 7
  { Each byte is now 1 if the source byte is 10xxxxxx, otherwise 0 }
  PAND xmm5, xmm4
  { Sum the 0/1 bytes of each 8-byte half and add them to the running totals }
  PSADBW xmm5, xmm0
  PADDD xmm1, xmm5
  { Next 16 bytes }
  ADD p16, 16
  DEC edx
  JNZ NextBlock
Done:
  { Result = total of the low half + total of the high half }
  MOVD eax, xmm1
  PSRLDQ xmm1, 8
  MOVD ecx, xmm1
  ADD eax, ecx
end;