SMIL  1.0.4
DLineArith_SSE.h
1 /*
2  * Copyright (c) 2011-2016, Matthieu FAESSEL and ARMINES
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of Matthieu FAESSEL, or ARMINES nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
21  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #ifndef _D_LINE_ARITH_SSE_H
31 #define _D_LINE_ARITH_SSE_H
32 
33 #include <emmintrin.h>
34 
35 namespace smil
36 {
42  /* @cond */
43  template <>
44  struct addLine<UINT8> : public binaryLineFunctionBase<UINT8> {
45  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
46  {
47  for (int i = 0; i < size; i++)
48  lOut[i] = lIn1[i] > (UINT8)(numeric_limits<UINT8>::max() - lIn2[i])
49  ? numeric_limits<UINT8>::max()
50  : lIn1[i] + lIn2[i];
51  }
52  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
53  UINT8 *lOut)
54  {
55  __m128i r0, r1;
56  __m128i *l1 = (__m128i *) lIn1;
57  __m128i *l2 = (__m128i *) lIn2;
58  __m128i *l3 = (__m128i *) lOut;
59 
60  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
61 
62  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
63  r0 = _mm_load_si128(l1);
64  r1 = _mm_load_si128(l2);
65  r1 = _mm_adds_epu8(r0, r1);
66  _mm_store_si128(l3, r1);
67  }
68 
69  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
70  lOut + alignLen);
71  }
72  };
73 
74  template <>
75  struct addNoSatLine<UINT8> : public binaryLineFunctionBase<UINT8> {
76  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
77  {
78  for (int i = 0; i < size; i++)
79  lOut[i] = lIn1[i] + lIn2[i];
80  }
81  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
82  UINT8 *lOut)
83  {
84  __m128i r0, r1;
85  __m128i *l1 = (__m128i *) lIn1;
86  __m128i *l2 = (__m128i *) lIn2;
87  __m128i *l3 = (__m128i *) lOut;
88 
89  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
90 
91  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
92  r0 = _mm_load_si128(l1);
93  r1 = _mm_load_si128(l2);
94  r1 = _mm_add_epi8(r0, r1);
95  _mm_store_si128(l3, r1);
96  }
97 
98  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
99  lOut + alignLen);
100  }
101  };
102 
103  template <>
104  struct subLine<UINT8> : public binaryLineFunctionBase<UINT8> {
105  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
106  {
107  for (int i = 0; i < size; i++)
108  lOut[i] = lIn1[i] < (UINT8)(numeric_limits<UINT8>::max() + lIn2[i])
109  ? numeric_limits<UINT8>::min()
110  : lIn1[i] - lIn2[i];
111  }
112  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
113  UINT8 *lOut)
114  {
115  __m128i r0, r1;
116  __m128i *l1 = (__m128i *) lIn1;
117  __m128i *l2 = (__m128i *) lIn2;
118  __m128i *l3 = (__m128i *) lOut;
119 
120  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
121 
122  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
123  r0 = _mm_load_si128(l1);
124  r1 = _mm_load_si128(l2);
125  r1 = _mm_subs_epu8(r0, r1);
126  _mm_store_si128(l3, r1);
127  }
128 
129  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
130  lOut + alignLen);
131  }
132  };
133 
134  template <>
135  struct subNoSatLine<UINT8> : public binaryLineFunctionBase<UINT8> {
136  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
137  {
138  for (int i = 0; i < size; i++)
139  lOut[i] = lIn1[i] - lIn2[i];
140  }
141  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
142  UINT8 *lOut)
143  {
144  __m128i r0, r1;
145  __m128i *l1 = (__m128i *) lIn1;
146  __m128i *l2 = (__m128i *) lIn2;
147  __m128i *l3 = (__m128i *) lOut;
148 
149  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
150 
151  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
152  r0 = _mm_load_si128(l1);
153  r1 = _mm_load_si128(l2);
154  r1 = _mm_sub_epi8(r0, r1);
155  _mm_store_si128(l3, r1);
156  }
157 
158  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
159  lOut + alignLen);
160  }
161  };
162 
163  template <>
164  struct supLine<UINT8> : public binaryLineFunctionBase<UINT8> {
165  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
166  {
167  for (int i = 0; i < size; i++)
168  lOut[i] = lIn1[i] > lIn2[i] ? lIn1[i] : lIn2[i];
169  }
170  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
171  UINT8 *lOut)
172  {
173  __m128i r0, r1;
174  __m128i *l1 = (__m128i *) lIn1;
175  __m128i *l2 = (__m128i *) lIn2;
176  __m128i *l3 = (__m128i *) lOut;
177 
178  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
179 
180  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
181  r0 = _mm_load_si128(l1);
182  r1 = _mm_load_si128(l2);
183  r1 = _mm_max_epu8(r0, r1);
184  _mm_store_si128(l3, r1);
185  }
186 
187  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
188  lOut + alignLen);
189  }
190  };
191 
192  template <>
193  struct infLine<UINT8> : public binaryLineFunctionBase<UINT8> {
194  inline void _exec(UINT8 *lIn1, UINT8 *lIn2, size_t size, UINT8 *lOut)
195  {
196  for (int i = 0; i < size; i++)
197  lOut[i] = lIn1[i] < lIn2[i] ? lIn1[i] : lIn2[i];
198  }
199  inline void _exec_aligned(UINT8 *lIn1, UINT8 *lIn2, size_t size,
200  UINT8 *lOut)
201  {
202  __m128i r0, r1;
203  __m128i *l1 = (__m128i *) lIn1;
204  __m128i *l2 = (__m128i *) lIn2;
205  __m128i *l3 = (__m128i *) lOut;
206 
207  unsigned long alignLen = size - size % SIMD_VEC_SIZE;
208 
209  for (size_t i = 0; i < alignLen; i += 16, l1++, l2++, l3++) {
210  r0 = _mm_load_si128(l1);
211  r1 = _mm_load_si128(l2);
212  r1 = _mm_min_epu8(r0, r1);
213  _mm_store_si128(l3, r1);
214  }
215 
216  _exec(lIn1 + alignLen, lIn2 + alignLen, size % SIMD_VEC_SIZE,
217  lOut + alignLen);
218  }
219  };
224 } // namespace smil
225 
226 #endif // _D_LINE_ARITH_SSE_H