30 #ifndef _D_LINE_ARITH_SSE_H
31 #define _D_LINE_ARITH_SSE_H
33 #include <emmintrin.h>
44 struct addLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
45 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
47 for (
int i = 0; i < size; i++)
48 lOut[i] = lIn1[i] > (UINT8)(numeric_limits<UINT8>::max() - lIn2[i])
49 ? numeric_limits<UINT8>::max()
52 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
56 __m128i *l1 = (__m128i *) lIn1;
57 __m128i *l2 = (__m128i *) lIn2;
58 __m128i *l3 = (__m128i *) lOut;
60 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
62 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
63 r0 = _mm_load_si128(l1);
64 r1 = _mm_load_si128(l2);
65 r1 = _mm_adds_epu8(r0, r1);
66 _mm_store_si128(l3, r1);
69 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
75 struct addNoSatLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
76 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
78 for (
int i = 0; i < size; i++)
79 lOut[i] = lIn1[i] + lIn2[i];
81 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
85 __m128i *l1 = (__m128i *) lIn1;
86 __m128i *l2 = (__m128i *) lIn2;
87 __m128i *l3 = (__m128i *) lOut;
89 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
91 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
92 r0 = _mm_load_si128(l1);
93 r1 = _mm_load_si128(l2);
94 r1 = _mm_add_epi8(r0, r1);
95 _mm_store_si128(l3, r1);
98 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
104 struct subLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
105 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
107 for (
int i = 0; i < size; i++)
108 lOut[i] = lIn1[i] < (UINT8)(numeric_limits<UINT8>::max() + lIn2[i])
109 ? numeric_limits<UINT8>::min()
112 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
116 __m128i *l1 = (__m128i *) lIn1;
117 __m128i *l2 = (__m128i *) lIn2;
118 __m128i *l3 = (__m128i *) lOut;
120 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
122 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
123 r0 = _mm_load_si128(l1);
124 r1 = _mm_load_si128(l2);
125 r1 = _mm_subs_epu8(r0, r1);
126 _mm_store_si128(l3, r1);
129 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
135 struct subNoSatLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
136 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
138 for (
int i = 0; i < size; i++)
139 lOut[i] = lIn1[i] - lIn2[i];
141 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
145 __m128i *l1 = (__m128i *) lIn1;
146 __m128i *l2 = (__m128i *) lIn2;
147 __m128i *l3 = (__m128i *) lOut;
149 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
151 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
152 r0 = _mm_load_si128(l1);
153 r1 = _mm_load_si128(l2);
154 r1 = _mm_sub_epi8(r0, r1);
155 _mm_store_si128(l3, r1);
158 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
164 struct supLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
165 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
167 for (
int i = 0; i < size; i++)
168 lOut[i] = lIn1[i] > lIn2[i] ? lIn1[i] : lIn2[i];
170 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
174 __m128i *l1 = (__m128i *) lIn1;
175 __m128i *l2 = (__m128i *) lIn2;
176 __m128i *l3 = (__m128i *) lOut;
178 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
180 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
181 r0 = _mm_load_si128(l1);
182 r1 = _mm_load_si128(l2);
183 r1 = _mm_max_epu8(r0, r1);
184 _mm_store_si128(l3, r1);
187 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
193 struct infLine<UINT8> :
public binaryLineFunctionBase<UINT8> {
194 inline void _exec(UINT8 *lIn1, UINT8 *lIn2,
size_t size, UINT8 *lOut)
196 for (
int i = 0; i < size; i++)
197 lOut[i] = lIn1[i] < lIn2[i] ? lIn1[i] : lIn2[i];
199 inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2,
size_t size,
203 __m128i *l1 = (__m128i *) lIn1;
204 __m128i *l2 = (__m128i *) lIn2;
205 __m128i *l3 = (__m128i *) lOut;
207 unsigned long alignLen = size - size % SIMD_VEC_SIZE;
209 for (
size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
210 r0 = _mm_load_si128(l1);
211 r1 = _mm_load_si128(l2);
212 r1 = _mm_min_epu8(r0, r1);
213 _mm_store_si128(l3, r1);
216 _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,