{ %norun }
{ %cpu=i386,x86_64 }
{ %opt=-Sew -vw }
{$mode objfpc}
{ The test checks that MOVSS instruction assembles without warning.
  Running it could be a nice bonus, but it turns out that we have no portable
  way to detect SSE4.1 support (for DPPS), so disabled for now. }
uses cpu;

{$asmmode att}
procedure test1; assembler;
var
  s: single;
asm
   movss  s, %xmm6
   movss  %xmm6, s
{$ifdef cpui386}
   movss  (%eax, %edx), %xmm7
   movss  %xmm7, (%eax, %edx)
{$endif}
{$ifdef cpux86_64}
   movss  (%rax, %rdx), %xmm7
   movss  %xmm7, (%rax, %rdx)
{$endif}
end;

{$asmmode intel}
procedure test2; assembler;
var
  s: single;
asm
  movss  [s], xmm6
  movss  xmm6, [s]
{$ifdef cpui386}
  movss  [eax+edx], xmm7
  movss  xmm7, [eax+edx]
{$endif}
{$ifdef cpux86_64}
  movss  [rax+rdx], xmm7
  movss  xmm7, [rax+rdx]
{$endif}
end;


type
   TVector4 = packed record
     X, Y, Z, W: Single;
   end;

 function _VectorDotProductSSE4(Vector1, Vector2: TVector4): Single; assembler;
 asm
   MOVUPS XMM0, [Vector1]
   MOVUPS XMM1, [Vector2]
   DPPS XMM0, XMM1, $71 { Only perform calculations on the X, Y and Z coordinates; only store result in the first element }
   MOVSS Result, XMM0 { Store result - first element of XMM0 }
 end;

var
  v: tvector4;
  r: single;
begin
  v.x:=1;
  v.y:=1;
  v.z:=1;
  v.w:=1;
  r:=_vectordotproductSSE4(v,v);
  if r<>3 then
    halt(1);
  writeln('ok');
end.