; vp10/encoder/x86/dct_ssse3_x86_64.asm

;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp10

%include "third_party/x86inc/x86inc.asm"

; This file provides an SSSE3 version of the forward transformation. Some
; of the macro definitions were originally derived from the FFmpeg project.
; The current version applies to x86 64-bit only.

SECTION .text

%if ARCH_X86_64
; matrix transpose
; INTERLEAVE_2X  width, a, b, tmp
;   %1  element-width suffix appended to punpckh/punpckl
;       (this file passes wd, dq and qdq)
;   %2  register number of the first input
;   %3  register number of the second input
;   %4  register number of a scratch register
;
; Result: m%2 holds the low-half interleave of the two inputs and m%3
; holds the high-half interleave.  The register number %4 ends up naming
; the register that still contains the original second input, so it can
; be reused as scratch by the next invocation.
;
; The statement order matters: the high interleave must read m%2 before
; the low interleave overwrites it.
%macro INTERLEAVE_2X 4
  ; High halves go to the scratch register.  The three-operand form relies
  ; on the instruction wrappers from x86inc.asm.
  punpckh%1          m%4, m%2, m%3
  ; Low halves are produced in place in the first input.
  punpckl%1          m%2, m%3
  ; Rename registers (x86inc SWAP) so that the high result is reachable as
  ; m%3; no data is moved by this line.
  SWAP               %3,  %4
%endmacro

; INTERLEAVE_2X_STAGE  width, a0, b0, a1, b1, a2, b2, a3, b3, tmp
; Runs INTERLEAVE_2X with the given element-width suffix on four register
; pairs (a0,b0) .. (a3,b3), in that order, sharing one scratch register
; number for all four.
%macro INTERLEAVE_2X_STAGE 10
  INTERLEAVE_2X  %1, %2, %3, %10
  INTERLEAVE_2X  %1, %4, %5, %10
  INTERLEAVE_2X  %1, %6, %7, %10
  INTERLEAVE_2X  %1, %8, %9, %10
%endmacro

; TRANSPOSE8X8  r0, r1, r2, r3, r4, r5, r6, r7, tmp
; Transposes the 8x8 matrix of 16-bit elements whose rows are held in
; m%1 .. m%8, using m%9 as scratch.  The work is done in three interleave
; stages (16-bit, 32-bit, then 64-bit granularity) followed by a register
; renaming that puts the rows back in order under the numbers %1 .. %8.
%macro TRANSPOSE8X8 9
  ; Stage 1: interleave 16-bit elements of neighbouring registers.
  INTERLEAVE_2X_STAGE  wd,  %1, %2,  %3, %4,  %5, %6,  %7, %8,  %9

  ; Stage 2: interleave 32-bit groups.
  INTERLEAVE_2X_STAGE  dq,  %1, %3,  %2, %4,  %5, %7,  %6, %8,  %9

  ; Stage 3: interleave 64-bit halves.
  INTERLEAVE_2X_STAGE  qdq, %1, %5,  %3, %7,  %2, %6,  %4, %8,  %9

  ; Final reordering is pure register renaming (x86inc SWAP).
  SWAP  %2, %5
  SWAP  %4, %7
%endmacro

; HMD8_BUTTERFLY_2X  a, b, c, d
; Two add/subtract butterflies on packed 16-bit lanes:
;   m%1 <- m%1 + m%2      m%2 <- m%1 - m%2
;   m%3 <- m%3 + m%4      m%4 <- m%3 - m%4
; The differences are first computed into m8 / m9 and then renamed into
; place with SWAP, so register numbers 8 and 9 are clobbered.
; paddw/psubw are the wrapping forms; no saturation or scaling is applied.
%macro HMD8_BUTTERFLY_2X 4
  psubw              m8, m%1, m%2
  psubw              m9, m%3, m%4
  paddw              m%1, m%2
  paddw              m%3, m%4
  SWAP               %2, 8
  SWAP               %4, 9
%endmacro

; HMD8_1D
; One-dimensional 8-point Hadamard pass over registers m0 .. m7, applied
; independently to each 16-bit lane.  Three butterfly stages combine
; registers at distance 1, 2 and 4.  Results are left in m0 .. m7;
; m8 and m9 are used as scratch.
%macro HMD8_1D 0
  ; Stage 1: partners are (0,1) (2,3) (4,5) (6,7).
  HMD8_BUTTERFLY_2X  0, 1, 2, 3
  HMD8_BUTTERFLY_2X  4, 5, 6, 7

  ; Stage 2: partners are (0,2) (1,3) (4,6) (5,7).
  HMD8_BUTTERFLY_2X  0, 2, 1, 3
  HMD8_BUTTERFLY_2X  4, 6, 5, 7

  ; Stage 3: partners are (0,4) (1,5) (2,6) (3,7).
  HMD8_BUTTERFLY_2X  0, 4, 1, 5
  HMD8_BUTTERFLY_2X  2, 6, 3, 7
%endmacro

;-----------------------------------------------------------------------
; hadamard_8x8(input, stride, output)
;
; 8x8 Hadamard transform on 16-bit data: a 1-D pass (HMD8_1D), an 8x8
; transpose, then a second 1-D pass.
;
; In:   input  - address of the first row; each row is read as one
;                16-byte vector (eight 16-bit values)
;       stride - consecutive rows are 2 * stride bytes apart
;                (presumably stride is counted in 16-bit elements -
;                confirm against the C prototype)
;       output - receives 8 vectors of 16 bytes, stored back to back
;                (128 bytes)
; Uses: two extra general-purpose registers and xmm registers m0 .. m9.
;
; All loads and stores use mova, so input rows and output are expected
; to be 16-byte aligned.
; NOTE(review): strideq is consumed as a full-width register.  If the C
; prototype declares stride as a 32-bit int, its upper half may need to be
; sign-extended first - confirm against the declaration.
;-----------------------------------------------------------------------
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 10, input, stride, output, row_step, pair_step
  lea                row_stepq,  [2 * strideq]   ; byte distance to next row
  lea                pair_stepq, [4 * strideq]   ; byte distance of two rows

  ; Load the eight input rows, two at a time.
  mova               m0, [inputq]
  mova               m1, [inputq + row_stepq]
  add                inputq, pair_stepq
  mova               m2, [inputq]
  mova               m3, [inputq + row_stepq]
  add                inputq, pair_stepq
  mova               m4, [inputq]
  mova               m5, [inputq + row_stepq]
  add                inputq, pair_stepq
  mova               m6, [inputq]
  mova               m7, [inputq + row_stepq]

  ; First pass, transpose, second pass.  m9 is free to serve as the
  ; transpose scratch because HMD8_1D only leaves results in m0 .. m7.
  HMD8_1D
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  HMD8_1D

  ; Store the result contiguously (mmsize is 16 under INIT_XMM).
  mova              [outputq + 0 * mmsize], m0
  mova              [outputq + 1 * mmsize], m1
  mova              [outputq + 2 * mmsize], m2
  mova              [outputq + 3 * mmsize], m3
  mova              [outputq + 4 * mmsize], m4
  mova              [outputq + 5 * mmsize], m5
  mova              [outputq + 6 * mmsize], m6
  mova              [outputq + 7 * mmsize], m7

  RET
%endif