/[pcsx2_0.9.7]/trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp

Revision 273 - Fri Nov 12 01:10:22 2010 UTC by william
File size: 23683 byte(s)
Auto Commited Import of: pcsx2-0.9.7-DEBUG (upstream: v0.9.7.4013 local: v0.9.7.197-latest) in ./trunk

/* ZZ Open GL graphics plugin
 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "GS.h"
#include "Mem.h"
#include "Mem_Swizzle.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif

// Current ports of the ASM functions to intrinsics
#define INTRINSIC_PORT_32
#define INTRINSIC_PORT_16
#define INTRINSIC_PORT_8
#define INTRINSIC_PORT_4
#ifdef ZEROGS_SSE2
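// SwizzleBlock32_sse2_I: SSE2 intrinsic port of the 32-bit block swizzle.
// Reads an 8x8 block of 32-bit pixels from src (two rows of 32 bytes per
// iteration, rows 'pitch' bytes apart), reorders the pixels into the swizzled
// block layout with 64-bit unpacks, and streams the 256-byte result to dst
// with non-temporal stores. When WriteMask is not 0xffffffff, only the masked
// bits are written; the remaining bits are reloaded from dst and preserved.
// The 'aligned' template parameter selects aligned vs. unaligned source loads.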
template<bool aligned>
__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    if (WriteMask == 0xffffffff) {
        for (int i = 3; i >= 0; --i) {
            // load
            if (aligned) {
                src_0 = _mm_load_si128((__m128i*)src);            // 5 4 1 0
                src_1 = _mm_load_si128((__m128i*)(src+16));       // 13 12 9 8
                src_2 = _mm_load_si128((__m128i*)(src+pitch));    // 7 6 3 2
                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            } else {
                src_0 = _mm_loadu_si128((__m128i*)src);            // 5 4 1 0
                src_1 = _mm_loadu_si128((__m128i*)(src+16));       // 13 12 9 8
                src_2 = _mm_loadu_si128((__m128i*)(src+pitch));    // 7 6 3 2
                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            }

            // Reorder
            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

            // store
            _mm_stream_si128((__m128i*)dst, dst_0);
            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
            _mm_stream_si128(((__m128i*)dst)+3, dst_3);

            // update the pointers
            dst += 64;
            src += 2*pitch;
        }
    }
    else
    {
        // Build the mask (broadcast the u32 WriteMask to all 4 packed u32s)
        __m128i mask = _mm_cvtsi32_si128(WriteMask);
        mask = _mm_shuffle_epi32(mask, 0);

        for (int i = 3; i >= 0; --i) {
            // load
            if (aligned) {
                src_0 = _mm_load_si128((__m128i*)src);            // 5 4 1 0
                src_1 = _mm_load_si128((__m128i*)(src+16));       // 13 12 9 8
                src_2 = _mm_load_si128((__m128i*)(src+pitch));    // 7 6 3 2
                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            } else {
                src_0 = _mm_loadu_si128((__m128i*)src);            // 5 4 1 0
                src_1 = _mm_loadu_si128((__m128i*)(src+16));       // 13 12 9 8
                src_2 = _mm_loadu_si128((__m128i*)(src+pitch));    // 7 6 3 2
                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            }

            // Apply the WriteMask before reordering
            src_0 = _mm_and_si128(src_0, mask);
            src_1 = _mm_and_si128(src_1, mask);
            src_2 = _mm_and_si128(src_2, mask);
            src_3 = _mm_and_si128(src_3, mask);

            // Reorder
            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

            // Load the previous value and apply ~mask to keep the unwritten bits
            __m128i old_dst_0 = _mm_andnot_si128(mask, _mm_load_si128((__m128i*)dst));
            __m128i old_dst_1 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+1));
            __m128i old_dst_2 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+2));
            __m128i old_dst_3 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+3));

            // Build the final value
            dst_0 = _mm_or_si128(dst_0, old_dst_0);
            dst_1 = _mm_or_si128(dst_1, old_dst_1);
            dst_2 = _mm_or_si128(dst_2, old_dst_2);
            dst_3 = _mm_or_si128(dst_3, old_dst_3);

            // store
            _mm_stream_si128((__m128i*)dst, dst_0);
            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
            _mm_stream_si128(((__m128i*)dst)+3, dst_3);

            // update the pointers
            dst += 64;
            src += 2*pitch;
        }
    }
    // FIXME: an sfence is normally required after the non-temporal stores, but issuing
    // it here would hurt performance; this function is called from a loop, so the
    // fence would be better placed once after that loop.
}

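// SwizzleBlock16_sse2_I: SSE2 intrinsic port of the 16-bit block swizzle.
// Reads a 16x8 block of 16-bit pixels from src (two rows of 32 bytes per
// iteration), pairs the L and H halves of each row with 16-bit unpacks to
// rebuild 32-bit packets, reorders them into the swizzled block layout, and
// streams the 256-byte result to dst.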
template<bool aligned>
__forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0_L;
    __m128i src_0_H;
    __m128i src_2_L;
    __m128i src_2_H;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0_L = _mm_load_si128((__m128i*)src);            // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_load_si128((__m128i*)(src+16));       // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_load_si128((__m128i*)(src+pitch));    // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_load_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        } else {
            src_0_L = _mm_loadu_si128((__m128i*)src);            // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_loadu_si128((__m128i*)(src+16));       // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_loadu_si128((__m128i*)(src+pitch));    // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        }

        // Interleave L and H to obtain 32-bit packets
        __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5H 5L 4H 4L 1H 1L 0H 0L
        __m128i dst_1_tmp = _mm_unpacklo_epi16(src_2_L, src_2_H); // 7H 7L 6H 6L 3H 3L 2H 2L
        __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13H 13L 12H 12L 9H 9L 8H 8L
        __m128i dst_3_tmp = _mm_unpackhi_epi16(src_2_L, src_2_H); // 15H 15L 14H 14L 11H 11L 10H 10L

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
    // FIXME: an sfence is normally required after the non-temporal stores, but issuing
    // it here would hurt performance; this function is called from a loop, so the
    // fence would be better placed once after that loop.
}

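// SwizzleColumn8_sse2_I: swizzles one column (4 rows of 16 8-bit pixels) of an
// 8-bit block. The shuffle pattern differs between odd and even columns, so the
// column index is passed as the INDEX template parameter.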
// Templated on the column index (INDEX) to improve code reuse.
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // load 4 lines of 16 8-bit packets each
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align the pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shuffle_epi32(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (byte 3 & 1)
        src_3 = _mm_shuffle_epi32(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (byte 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shuffle_epi32(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (byte 2 & 0)
        src_2 = _mm_shuffle_epi32(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (byte 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (byte 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (byte 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (byte 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (byte 3 & 1)

    // Interleave bytes 1 & 0 to obtain 16-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0, src_1); // 13L 12L 9L 8L 5L 4L 1L 0L
    __m128i src_1_L = _mm_unpacklo_epi8(src_2, src_3); // 15L 14L 11L 10L 7L 6L 3L 2L
    // Interleave bytes 3 & 2 to obtain 16-bit packets
    __m128i src_0_H = _mm_unpackhi_epi8(src_0, src_1); // 13H 12H 9H 8H 5H 4H 1H 0H
    __m128i src_1_H = _mm_unpackhi_epi8(src_2, src_3); // 15H 14H 11H 10H 7H 6H 3H 2H

    // Interleave H and L to obtain 32-bit packets
    __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi16(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi16(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

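// SwizzleBlock8_sse2_I: swizzles a full 16x16 block of 8-bit pixels by
// processing its four 16x4 columns in turn, advancing dst by 64 bytes and
// src by 4 rows between columns.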
template<bool aligned>
__forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn8_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: an sfence is normally required after the non-temporal stores, but issuing
    // it here would hurt performance; this function is called from a loop, so the
    // fence would be better placed once after that loop.
}

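// SwizzleColumn4_sse2_I: swizzles one column (4 rows of 32 4-bit pixels) of a
// 4-bit block. Nibbles are first merged into bytes using the 0x0f0f0f0f mask
// and 4-bit shifts, then reordered like the 8-bit case; as above, the column
// index is passed as the INDEX template parameter.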
// Templated on the column index (INDEX) to improve code reuse.
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // Build a mask (broadcast the u32 to all 4 packed u32s)
    const u32 mask_template = 0x0f0f0f0f;
    __m128i mask = _mm_cvtsi32_si128(mask_template);
    mask = _mm_shuffle_epi32(mask, 0);

    // load 4 lines of 32 4-bit packets each
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align the pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shufflelo_epi16(src_1, 0xB1);
        src_1 = _mm_shufflehi_epi16(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-byte 7 & 5 & 3 & 1)
        src_3 = _mm_shufflelo_epi16(src_3, 0xB1);
        src_3 = _mm_shufflehi_epi16(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-byte 7 & 5 & 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shufflelo_epi16(src_0, 0xB1);
        src_0 = _mm_shufflehi_epi16(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-byte 6 & 4 & 2 & 0)
        src_2 = _mm_shufflelo_epi16(src_2, 0xB1);
        src_2 = _mm_shufflehi_epi16(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-byte 6 & 4 & 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (half-byte 6 & 4 & 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (half-byte 7 & 5 & 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (half-byte 6 & 4 & 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (half-byte 7 & 5 & 3 & 1)

    // ** Interleave half-bytes (nibbles) to obtain 8-bit packets
    // Shift the values to ease the 4-bit filtering.
    // Note: a 64-bit packed shift is used to allow 4-bit shifts.
    __m128i src_0_shift = _mm_srli_epi64(src_0, 4); // ? 13 12 9 8 5 4 1 ... (half-byte 6 & 4 & 2 & 0)
    __m128i src_1_shift = _mm_slli_epi64(src_1, 4); // 12 9 8 5 4 1 0 ? ... (half-byte 7 & 5 & 3 & 1)
    __m128i src_2_shift = _mm_srli_epi64(src_2, 4); // ? 15 14 11 10 7 6 3 ... (half-byte 6 & 4 & 2 & 0)
    __m128i src_3_shift = _mm_slli_epi64(src_3, 4); // 14 11 10 7 6 3 2 ? ... (half-byte 7 & 5 & 3 & 1)

    // 12 - 8 - 4 - 0 - (HB odd) || - 12 - 8 - 4 - 0 (HB even) => 12 8 4 0 (byte 3 & 2 & 1 & 0)
    src_0 = _mm_or_si128(_mm_andnot_si128(mask, src_1_shift), _mm_and_si128(mask, src_0));
    // - 13 - 9 - 5 - 1 (HB even) || 13 - 9 - 5 - 1 - (HB odd) => 13 9 5 1 (byte 3 & 2 & 1 & 0)
    src_1 = _mm_or_si128(_mm_and_si128(mask, src_0_shift), _mm_andnot_si128(mask, src_1));

    // 14 - 10 - 6 - 2 - (HB odd) || - 14 - 10 - 6 - 2 (HB even) => 14 10 6 2 (byte 3 & 2 & 1 & 0)
    src_2 = _mm_or_si128(_mm_andnot_si128(mask, src_3_shift), _mm_and_si128(mask, src_2));
    // - 15 - 11 - 7 - 3 (HB even) || 15 - 11 - 7 - 3 - (HB odd) => 15 11 7 3 (byte 3 & 2 & 1 & 0)
    src_3 = _mm_or_si128(_mm_and_si128(mask, src_2_shift), _mm_andnot_si128(mask, src_3));

    // reorder the 8-bit packets
    __m128i src_0_tmp = _mm_unpacklo_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (byte 1 & 0)
    __m128i src_1_tmp = _mm_unpackhi_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (byte 3 & 2)
    __m128i src_2_tmp = _mm_unpacklo_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (byte 1 & 0)
    __m128i src_3_tmp = _mm_unpackhi_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (byte 3 & 2)

    // interleave bytes to obtain 32-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0_tmp, src_1_tmp); // 2.13 0.13 2.12 0.12 2.9 0.9 2.8 0.8 2.5 0.5 2.4 0.4 2.1 0.1 2.0 0.0
    __m128i src_0_H = _mm_unpackhi_epi8(src_0_tmp, src_1_tmp); // 3.13 1.13 3.12 1.12 3.9 1.9 3.8 1.8 3.5 1.5 3.4 1.4 3.1 1.1 3.0 1.0
    __m128i src_1_L = _mm_unpacklo_epi8(src_2_tmp, src_3_tmp); // 2.15 0.15 2.14 0.14 2.11 0.11 2.10 0.10 2.7 0.7 2.6 0.6 2.3 0.3 2.2 0.2
    __m128i src_1_H = _mm_unpackhi_epi8(src_2_tmp, src_3_tmp); // 3.15 1.15 3.14 1.14 3.11 1.11 3.10 1.10 3.7 1.7 3.6 1.6 3.3 1.3 3.2 1.2

    __m128i dst_0_tmp = _mm_unpacklo_epi8(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi8(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi8(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi8(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

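// SwizzleBlock4_sse2_I: swizzles a full 32x16 block of 4-bit pixels by
// processing its four 32x4 columns in turn, advancing dst by 64 bytes and
// src by 4 rows between columns.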
template<bool aligned>
__forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn4_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: an sfence is normally required after the non-temporal stores, but issuing
    // it here would hurt performance; this function is called from a loop, so the
    // fence would be better placed once after that loop.
}
#endif

// special swizzle macros - which I converted to functions.
#ifdef ZEROGS_SSE2

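// Each SwizzleBlockN wrapper below forwards to the intrinsic port when the
// matching INTRINSIC_PORT_N macro is defined, and otherwise falls back to the
// original assembly routine (SwizzleBlockN_sse2). The *u variants handle
// unaligned source pointers. The commented-out WriteMask arguments show that
// only the 32-bit path actually uses the mask.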
__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_32
    SwizzleBlock32_sse2_I<true>(dst, src, pitch, WriteMask);
#else
    SwizzleBlock32_sse2(dst, src, pitch, WriteMask);
#endif
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock16_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock8_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock4_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_32
    SwizzleBlock32_sse2_I<false>(dst, src, pitch, WriteMask);
#else
    SwizzleBlock32u_sse2(dst, src, pitch, WriteMask);
#endif
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock16u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock8u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock4u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

#else

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock32_c(dst, src, pitch, WriteMask);
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock16_c(dst, src, pitch/*, WriteMask*/);
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock8_c(dst, src, pitch/*, WriteMask*/);
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock4_c(dst, src, pitch/*, WriteMask*/);
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock32_c(dst, src, pitch, WriteMask);
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock16_c(dst, src, pitch/*, WriteMask*/);
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock8_c(dst, src, pitch/*, WriteMask*/);
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock4_c(dst, src, pitch/*, WriteMask*/);
}

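// Plain C reference implementations, used when ZEROGS_SSE2 is not defined.
// They walk the g_columnTable* lookup tables, which map each pixel's linear
// (row-major) position within a block to its swizzled offset, and copy the
// pixels one at a time.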
__forceinline void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if (WriteMask == 0xffffffff)
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = ((u32*)src)[i];
    }
    else
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = (((u32*)dst)[d[i]] & ~WriteMask) | (((u32*)src)[i] & WriteMask);
    }
}

__forceinline void __fastcall SwizzleBlock24_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if (WriteMask == 0x00ffffff)
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = ((u32*)src)[i];
    }
    else
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = (((u32*)dst)[d[i]] & ~WriteMask) | (((u32*)src)[i] & WriteMask);
    }
}

__forceinline void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    for (int j = 0; j < 8; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            ((u16*)dst)[d[i]] = ((u16*)src)[i];
}

__forceinline void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable8[0][0];

    for (int j = 0; j < 16; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            dst[d[i]] = src[i];
}

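// SwizzleBlock4_c handles packed 4-bit pixels: each source byte holds two
// nibbles, so the source nibble is selected by the low bit of i, and the
// destination nibble position comes from the low bit of the table entry.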
__forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable4[0][0];

    for (int j = 0; j < 16; j++, d += 32, src += srcpitch)
    {
        for (int i = 0; i < 32; i++)
        {
            u32 addr = d[i];
            u8 c = (src[i>>1] >> ((i & 1) << 2)) & 0x0f;
            u32 shift = (addr & 1) << 2;
            dst[addr >> 1] = (dst[addr >> 1] & (0xf0 >> shift)) | (c << shift);
        }
    }
}

#endif

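// SwizzleBlock24 expands 24-bit source pixels into a temporary 32-bit block,
// copying the very last pixel byte by byte because a full 32-bit read could
// run past the end of the source, then reuses SwizzleBlock32 with WriteMask
// 0x00ffffff so the upper byte of each destination word is left untouched.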
__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch - 24)
    {
        for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
        {
            pblock[bx] = *(u32*)pnewsrc;
        }
    }

    for (int bx = 0; bx < 7; ++bx, pnewsrc += 3)
    {
        /* might be 1 byte out of bounds of GS memory */
        pblock[bx] = *(u32*)pnewsrc;
    }

    /* do 3 bytes for the last copy */
    *((u8*)pblock + 28) = pnewsrc[0];
    *((u8*)pblock + 29) = pnewsrc[1];
    *((u8*)pblock + 30) = pnewsrc[2];

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
}

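// SwizzleBlock8H places each 8-bit source pixel into the top byte of a 32-bit
// word in the temporary block, then calls SwizzleBlock32 with mask 0xff000000
// so only that high byte is written to the destination.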
__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 16;
        pblock[2] = u << 8;
        pblock[3] = u;
        u = *(u32*)(pnewsrc + 4);
        pblock[4] = u << 24;
        pblock[5] = u << 16;
        pblock[6] = u << 8;
        pblock[7] = u;
    }

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000);
}

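// SwizzleBlock4HH places each 4-bit source pixel into the topmost nibble
// (bits 28-31) of a 32-bit word in the temporary block, then calls
// SwizzleBlock32 with mask 0xf0000000.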
__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 28;
        pblock[1] = u << 24;
        pblock[2] = u << 20;
        pblock[3] = u << 16;
        pblock[4] = u << 12;
        pblock[5] = u << 8;
        pblock[6] = u << 4;
        pblock[7] = u;
    }

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000);
}

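// SwizzleBlock4HL places each 4-bit source pixel into bits 24-27 of a 32-bit
// word in the temporary block, then calls SwizzleBlock32 with mask 0x0f000000.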
__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 20;
        pblock[2] = u << 16;
        pblock[3] = u << 12;
        pblock[4] = u << 8;
        pblock[5] = u << 4;
        pblock[6] = u;
        pblock[7] = u >> 4;
    }

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
}
