/[pcsx2_0.9.7]/trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp

Revision 280 - Thu Dec 23 12:02:12 2010 UTC by william
File size: 26653 byte(s)
Log: re-commit (had local access denied errors when committing)

/* ZZ Open GL graphics plugin
 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "GS.h"
#include "Mem.h"
#include "Mem_Swizzle.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif

// WARNING: an sfence instruction must be issued after the SwizzleBlock SSE2 functions

// Current port of the ASM functions to intrinsics
#define INTRINSIC_PORT_16
#define INTRINSIC_PORT_8
#define INTRINSIC_PORT_4
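
// Illustrative usage sketch (hypothetical caller code, for documentation):
// the routines below end with non-temporal stores (_mm_stream_si128), so per
// the warning above the caller is expected to fence once swizzling is done:
//
//   SwizzleBlock32(dst, src, pitch);
//   _mm_sfence(); // drain the write-combining buffers before reading back
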
#ifdef ZEROGS_SSE2
// Masks used by SwizzleBlock24 to filter the 24-bit pixels during the
// expansion to 32 bits
static const __aligned16 u32 mask_24b_H[4] = {0xFF000000, 0x0000FFFF, 0xFF000000, 0x0000FFFF};
static const __aligned16 u32 mask_24b_L[4] = {0x00FFFFFF, 0x00000000, 0x00FFFFFF, 0x00000000};

template<bool aligned>
__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0 = _mm_load_si128((__m128i*)src);            // 5 4 1 0
            src_1 = _mm_load_si128((__m128i*)(src+16));       // 13 12 9 8
            src_2 = _mm_load_si128((__m128i*)(src+pitch));    // 7 6 3 2
            src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
        } else {
            src_0 = _mm_loadu_si128((__m128i*)src);            // 5 4 1 0
            src_1 = _mm_loadu_si128((__m128i*)(src+16));       // 13 12 9 8
            src_2 = _mm_loadu_si128((__m128i*)(src+pitch));    // 7 6 3 2
            src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
        }

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

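// Illustrative scalar equivalent of one iteration above (hypothetical code,
// for documentation only): each pass consumes two source rows and emits the
// pixels in the 0..15 column order noted in the comments.
//
//   u64 *s0 = (u64*)src, *s1 = (u64*)(src + pitch), *d = (u64*)dst;
//   d[0] = s0[0]; d[1] = s1[0];   // dst_0 = 3 2 1 0
//   d[2] = s0[1]; d[3] = s1[1];   // dst_1 = 7 6 5 4
//   d[4] = s0[2]; d[5] = s1[2];   // dst_2 = 11 10 9 8
//   d[6] = s0[3]; d[7] = s1[3];   // dst_3 = 15 14 13 12
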
template<bool aligned>
__forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0_L;
    __m128i src_0_H;
    __m128i src_2_L;
    __m128i src_2_H;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0_L = _mm_load_si128((__m128i*)src);            // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_load_si128((__m128i*)(src+16));       // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_load_si128((__m128i*)(src+pitch));    // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_load_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        } else {
            src_0_L = _mm_loadu_si128((__m128i*)src);            // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_loadu_si128((__m128i*)(src+16));       // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_loadu_si128((__m128i*)(src+pitch));    // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        }

        // Interleave L and H to obtain 32-bit packets
        __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5H 5L 4H 4L 1H 1L 0H 0L
        __m128i dst_1_tmp = _mm_unpacklo_epi16(src_2_L, src_2_H); // 7H 7L 6H 6L 3H 3L 2H 2L
        __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13H 13L 12H 12L 9H 9L 8H 8L
        __m128i dst_3_tmp = _mm_unpackhi_epi16(src_2_L, src_2_H); // 15H 15L 14H 14L 11H 11L 10H 10L

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

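// Note on the 16-bit layout above: the first 16 bytes of a source row hold
// the "L" halves and the next 16 bytes the "H" halves, so the epi16 unpacks
// produce one 32-bit packet (kH << 16) | kL per pair k, which is then
// reordered exactly like the 32-bit format.
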
// Template the code to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // load 4 lines of 16x8-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shuffle_epi32(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 3 & 1)
        src_3 = _mm_shuffle_epi32(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shuffle_epi32(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 2 & 0)
        src_2 = _mm_shuffle_epi32(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (bytes 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (bytes 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (bytes 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (bytes 3 & 1)

    // Interleave bytes 1 & 0 to obtain 16-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0, src_1); // 13L 12L 9L 8L 5L 4L 1L 0L
    __m128i src_1_L = _mm_unpacklo_epi8(src_2, src_3); // 15L 14L 11L 10L 7L 6L 3L 2L
    // Interleave bytes 3 & 2 to obtain 16-bit packets
    __m128i src_0_H = _mm_unpackhi_epi8(src_0, src_1); // 13H 12H 9H 8H 5H 4H 1H 0H
    __m128i src_1_H = _mm_unpackhi_epi8(src_2, src_3); // 15H 14H 11H 10H 7H 6H 3H 2H

    // Interleave H and L to obtain 32-bit packets
    __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi16(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi16(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

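// Note on SwizzleColumn8 above: rows are deliberately loaded in the order
// 0, 2, 1, 3, and every other pair of rows is pre-rotated with
// _mm_shuffle_epi32(x, 0xB1) (which swaps adjacent dwords) so that, after
// the byte/word/qword unpacks, the 8-bit pixels land in the same 0..15
// column order as the 32-bit path. Which pair gets rotated alternates with
// the parity of the column INDEX.
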
template<bool aligned>
__forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn8_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);
}

// Template the code to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // Build a mask (broadcast a u32 into 4 packed u32), equivalent to
    // _mm_set1_epi32(0x0f0f0f0f)
    const u32 mask_template = 0x0f0f0f0f;
    __m128i mask = _mm_cvtsi32_si128(mask_template);
    mask = _mm_shuffle_epi32(mask, 0);

    // load 4 lines of 32x4-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shufflelo_epi16(src_1, 0xB1);
        src_1 = _mm_shufflehi_epi16(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
        src_3 = _mm_shufflelo_epi16(src_3, 0xB1);
        src_3 = _mm_shufflehi_epi16(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shufflelo_epi16(src_0, 0xB1);
        src_0 = _mm_shufflehi_epi16(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
        src_2 = _mm_shufflelo_epi16(src_2, 0xB1);
        src_2 = _mm_shufflehi_epi16(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)

    // ** Interleave half-bytes to obtain 8-bit packets
    // Shift values to ease the 4-bit filtering.
    // Note: use a 64-bit packet shift to allow 4-bit shifts
    __m128i src_0_shift = _mm_srli_epi64(src_0, 4); // ? 13 12 9 8 5 4 1 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_1_shift = _mm_slli_epi64(src_1, 4); // 12 9 8 5 4 1 0 ? ... (half-bytes 7 & 5 & 3 & 1)
    __m128i src_2_shift = _mm_srli_epi64(src_2, 4); // ? 15 14 11 10 7 6 3 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_3_shift = _mm_slli_epi64(src_3, 4); // 14 11 10 7 6 3 2 ? ... (half-bytes 7 & 5 & 3 & 1)

    // 12 - 8 - 4 - 0 - (HB odd) || - 12 - 8 - 4 - 0 (HB even) => 12 8 4 0 (bytes 3 & 2 & 1 & 0)
    src_0 = _mm_or_si128(_mm_andnot_si128(mask, src_1_shift), _mm_and_si128(mask, src_0));
    // - 13 - 9 - 5 - 1 (HB even) || 13 - 9 - 5 - 1 - (HB odd) => 13 9 5 1 (bytes 3 & 2 & 1 & 0)
    src_1 = _mm_or_si128(_mm_and_si128(mask, src_0_shift), _mm_andnot_si128(mask, src_1));

    // 14 - 10 - 6 - 2 - (HB odd) || - 14 - 10 - 6 - 2 (HB even) => 14 10 6 2 (bytes 3 & 2 & 1 & 0)
    src_2 = _mm_or_si128(_mm_andnot_si128(mask, src_3_shift), _mm_and_si128(mask, src_2));
    // - 15 - 11 - 7 - 3 (HB even) || 15 - 11 - 7 - 3 - (HB odd) => 15 11 7 3 (bytes 3 & 2 & 1 & 0)
    src_3 = _mm_or_si128(_mm_and_si128(mask, src_2_shift), _mm_andnot_si128(mask, src_3));

    // reorder the 8-bit packets
    __m128i src_0_tmp = _mm_unpacklo_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 1 & 0)
    __m128i src_1_tmp = _mm_unpackhi_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 3 & 2)
    __m128i src_2_tmp = _mm_unpacklo_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 1 & 0)
    __m128i src_3_tmp = _mm_unpackhi_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 3 & 2)

    // interleave bytes to obtain 32-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0_tmp, src_1_tmp); // 2.13 0.13 2.12 0.12 2.9 0.9 2.8 0.8 2.5 0.5 2.4 0.4 2.1 0.1 2.0 0.0
    __m128i src_0_H = _mm_unpackhi_epi8(src_0_tmp, src_1_tmp); // 3.13 1.13 3.12 1.12 3.9 1.9 3.8 1.8 3.5 1.5 3.4 1.4 3.1 1.1 3.0 1.0
    __m128i src_1_L = _mm_unpacklo_epi8(src_2_tmp, src_3_tmp); // 2.15 0.15 2.14 0.14 2.11 0.11 2.10 0.10 2.7 0.7 2.6 0.6 2.3 0.3 2.2 0.2
    __m128i src_1_H = _mm_unpackhi_epi8(src_2_tmp, src_3_tmp); // 3.15 1.15 3.14 1.14 3.11 1.11 3.10 1.10 3.7 1.7 3.6 1.6 3.3 1.3 3.2 1.2

    __m128i dst_0_tmp = _mm_unpacklo_epi8(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi8(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi8(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi8(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

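// Illustrative example of the half-byte merge above (hypothetical values,
// for documentation): with mask = 0x0f0f..., take one byte of src_0 = |b a|
// and the corresponding byte of src_1 = |d c| (high|low nibbles):
//
//   src_1_shift = src_1 << 4                              -> |c 0|
//   src_0'      = (~mask & src_1_shift) | (mask & src_0)  -> |c a|
//   src_1'      = (mask & src_0 >> 4) | (~mask & src_1)   -> |d b|
//
// i.e. the even-position nibbles of two source rows are packed into one
// byte, and the odd-position nibbles into another.
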
template<bool aligned>
__forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn4_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);
}

template<bool FOUR_BIT, bool UPPER>
__forceinline void SwizzleBlock8H_4H(u8 *dst, u8 *src, int pitch)
{
    __m128i zero_128 = _mm_setzero_si128();
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;
    __m128i src_0_init_H;
    __m128i src_0_init_L;
    __m128i src_2_init_H;
    __m128i src_2_init_L;
    __m128i src_0_init;
    __m128i src_2_init;

    __m128i upper_mask = _mm_cvtsi32_si128(0xF0F0F0F0);
    // Build the write_mask (broadcast a u32 into 4 packed u32)
    __m128i write_mask;
    if (FOUR_BIT) {
        if (UPPER) write_mask = _mm_cvtsi32_si128(0xF0000000);
        else       write_mask = _mm_cvtsi32_si128(0x0F000000);
    } else {
        write_mask = _mm_cvtsi32_si128(0xFF000000);
    }
    write_mask = _mm_shuffle_epi32(write_mask, 0);

    for (int i = 3; i >= 0; --i) {
        if (FOUR_BIT) {
            src_0_init = _mm_cvtsi32_si128(*(u32*)src);
            src_2_init = _mm_cvtsi32_si128(*(u32*)(src + pitch));
        } else {
            src_0_init = _mm_loadl_epi64((__m128i*)src);
            src_2_init = _mm_loadl_epi64((__m128i*)(src + pitch));
        }

        // Convert to 8 bits
        if (FOUR_BIT) {
            src_0_init_H = _mm_and_si128(upper_mask, src_0_init);
            src_0_init_L = _mm_andnot_si128(upper_mask, src_0_init);
            src_2_init_H = _mm_and_si128(upper_mask, src_2_init);
            src_2_init_L = _mm_andnot_si128(upper_mask, src_2_init);

            if (UPPER) {
                src_0_init_L = _mm_slli_epi32(src_0_init_L, 4);
                src_2_init_L = _mm_slli_epi32(src_2_init_L, 4);
            } else {
                src_0_init_H = _mm_srli_epi32(src_0_init_H, 4);
                src_2_init_H = _mm_srli_epi32(src_2_init_H, 4);
            }

            // Repack the src to keep the half-byte order
            src_0_init = _mm_unpacklo_epi8(src_0_init_L, src_0_init_H);
            src_2_init = _mm_unpacklo_epi8(src_2_init_L, src_2_init_H);
        }

        // transform to 16 bits (insert 0 in the low bits)
        src_0_init = _mm_unpacklo_epi8(zero_128, src_0_init);
        src_2_init = _mm_unpacklo_epi8(zero_128, src_2_init);

        // transform to 32 bits (insert 0 in the low bits)
        src_0 = _mm_unpacklo_epi16(zero_128, src_0_init);
        src_1 = _mm_unpackhi_epi16(zero_128, src_0_init);
        src_2 = _mm_unpacklo_epi16(zero_128, src_2_init);
        src_3 = _mm_unpackhi_epi16(zero_128, src_2_init);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

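// Note on SwizzleBlock8H_4H above: the 8H/4HH/4HL formats only own the top
// byte (or one top nibble) of each 32-bit destination word, so the previous
// destination value is loaded, masked with ~write_mask, and merged in. This
// read-modify-write preserves whatever data shares the low bits of those
// words.
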
// Special swizzle macros - which I converted to functions.

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_sse2_I<true>(dst, src, pitch);
}

__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
{
    __m128i mask_H = _mm_load_si128((__m128i*)mask_24b_H);
    __m128i mask_L = _mm_load_si128((__m128i*)mask_24b_L);
    // Build the write_mask (broadcast a u32 into 4 packed u32)
    __m128i write_mask = _mm_cvtsi32_si128(0x00FFFFFF);
    write_mask = _mm_shuffle_epi32(write_mask, 0);

    for (int i = 3; i >= 0; --i) {
        // Note: src can be out of bounds of GS memory (but there is some
        // spare allocation to avoid a tricky corner case)
        __m128i src_0 = _mm_loadu_si128((__m128i*)src);
        __m128i src_1 = _mm_loadu_si128((__m128i*)(src+12));
        __m128i src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        __m128i src_3 = _mm_loadu_si128((__m128i*)(src+pitch+12));

        // transform the 24-bit values into 32-bit ones
        // 1/ Partially align the data
        src_0 = _mm_slli_si128(src_0, 2);
        src_0 = _mm_shufflelo_epi16(src_0, 0x39);

        src_1 = _mm_slli_si128(src_1, 2);
        src_1 = _mm_shufflelo_epi16(src_1, 0x39);

        src_2 = _mm_slli_si128(src_2, 2);
        src_2 = _mm_shufflelo_epi16(src_2, 0x39);

        src_3 = _mm_slli_si128(src_3, 2);
        src_3 = _mm_shufflelo_epi16(src_3, 0x39);

        // 2/ Filter the 24-bit pixels & do the conversion
        __m128i src_0_H = _mm_and_si128(src_0, mask_H);
        __m128i src_0_L = _mm_and_si128(src_0, mask_L);
        src_0_H = _mm_slli_si128(src_0_H, 1);
        src_0 = _mm_or_si128(src_0_H, src_0_L);

        __m128i src_1_H = _mm_and_si128(src_1, mask_H);
        __m128i src_1_L = _mm_and_si128(src_1, mask_L);
        src_1_H = _mm_slli_si128(src_1_H, 1);
        src_1 = _mm_or_si128(src_1_H, src_1_L);

        __m128i src_2_H = _mm_and_si128(src_2, mask_H);
        __m128i src_2_L = _mm_and_si128(src_2, mask_L);
        src_2_H = _mm_slli_si128(src_2_H, 1);
        src_2 = _mm_or_si128(src_2_H, src_2_L);

        __m128i src_3_H = _mm_and_si128(src_3, mask_H);
        __m128i src_3_L = _mm_and_si128(src_3, mask_L);
        src_3_H = _mm_slli_si128(src_3_H, 1);
        src_3 = _mm_or_si128(src_3_H, src_3_L);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

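// Note on SwizzleBlock24 above: each iteration reads two rows of 8 pixels
// x 3 bytes (hence the +12 offsets for the second halves of the rows),
// widens them to one pixel per 32-bit lane, reorders them like the 32-bit
// format, and merges with the old destination so only the low 24 bits of
// each word are overwritten.
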
__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock16_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock8_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock4_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_sse2_I<false>(dst, src, pitch);
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock16u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock8u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock4u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<false, false>(dst, src, pitch);
}

__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<true, true>(dst, src, pitch);
}

__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<true, false>(dst, src, pitch);
}

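// Illustrative dispatch sketch (hypothetical caller code, for documentation):
// the plain entry points use aligned loads, the *u variants unaligned ones,
// so a caller would typically pick based on the source pointer:
//
//   if (((uptr)src & 15) == 0)
//       SwizzleBlock32(dst, src, pitch);   // src is 16-byte aligned
//   else
//       SwizzleBlock32u(dst, src, pitch);  // unaligned source
//   _mm_sfence(); // see the warning at the top of the file
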
#else

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_c(dst, src, pitch);
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock16_c(dst, src, pitch);
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8_c(dst, src, pitch);
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock4_c(dst, src, pitch);
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_c(dst, src, pitch);
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock16_c(dst, src, pitch);
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8_c(dst, src, pitch);
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock4_c(dst, src, pitch);
}

__forceinline void __fastcall SwizzleBlock32_mask(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if (WriteMask == 0xffffffff)
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = ((u32*)src)[i];
    }
    else
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = (((u32*)dst)[d[i]] & ~WriteMask) | (((u32*)src)[i] & WriteMask);
    }
}

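// Note on SwizzleBlock32_mask above: g_columnTable32[j][i] holds the
// swizzled u32 index inside the 8x8 block for the linear pixel (j, i), so
// the loop is a plain table-driven scatter; the masked branch does a
// read-modify-write to touch only the bits selected by WriteMask.
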
__forceinline void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch)
{
    SwizzleBlock32_mask(dst, src, srcpitch, 0xffffffff);
}

__forceinline void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable16[0][0];

    for (int j = 0; j < 8; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            ((u16*)dst)[d[i]] = ((u16*)src)[i];
}

__forceinline void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable8[0][0];

    for (int j = 0; j < 16; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            dst[d[i]] = src[i];
}

__forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable4[0][0];

    for (int j = 0; j < 16; j++, d += 32, src += srcpitch)
    {
        for (int i = 0; i < 32; i++)
        {
            u32 addr = d[i];
            u8 c = (src[i>>1] >> ((i & 1) << 2)) & 0x0f;
            u32 shift = (addr & 1) << 2;
            dst[addr >> 1] = (dst[addr >> 1] & (0xf0 >> shift)) | (c << shift);
        }
    }
}

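// Note on SwizzleBlock4_c above: d[i] is a nibble address, so addr >> 1
// selects the destination byte and (addr & 1) << 2 selects its low or high
// nibble; the source nibble is extracted the same way from src[i >> 1].
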
__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    // Note: src can be out of bounds of GS memory (but there is some spare
    // allocation to avoid a tricky corner case)
    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch - 24)
    {
        for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
        {
            pblock[bx] = *(u32*)pnewsrc;
        }
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
}

__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 16;
        pblock[2] = u << 8;
        pblock[3] = u;
        u = *(u32*)(pnewsrc + 4);
        pblock[4] = u << 24;
        pblock[5] = u << 16;
        pblock[6] = u << 8;
        pblock[7] = u;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xff000000);
}

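// Note on SwizzleBlock8H (and the 4HH/4HL variants below): each source byte
// (or nibble) is staged into the top bits of a u32 in tempblock, then
// SwizzleBlock32_mask scatters the block while writing only the masked high
// byte (or high/low nibble of the high byte) of each destination word.
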
__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 28;
        pblock[1] = u << 24;
        pblock[2] = u << 20;
        pblock[3] = u << 16;
        pblock[4] = u << 12;
        pblock[5] = u << 8;
        pblock[6] = u << 4;
        pblock[7] = u;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xf0000000);
}

__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 20;
        pblock[2] = u << 16;
        pblock[3] = u << 12;
        pblock[4] = u << 8;
        pblock[5] = u << 4;
        pblock[6] = u;
        pblock[7] = u >> 4;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
}
#endif
