
Contents of /branch/debug/0.X/0.9.X/0.9.7/ramdump-lateset/plugins/zzogl-pg/opengl/ZZClut.cpp



Revision 314
Sun Dec 26 18:56:19 2010 UTC by william
File size: 34505 byte(s)
** merged upstream r4049 (re-integration of GregMiscellaneous branch)
** applied patch to GigTranser.cpp in ZZOgl from r4140 to change 'static int count = 0;' to 'static int path1_count = 0;'
1 /* ZZ Open GL graphics plugin
2 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
3 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
18 */
19
20 #include "GS.h"
21 #include "Mem.h"
22 #include "Util.h"
23
24 #if defined(ZEROGS_SSE2)
25 #include <emmintrin.h>
26 #endif
27
28 // Local Clut buffer:
29 // It supports both 32-bit and 16-bit color formats. The size of the buffer is 1 KB.
30 // The 16-bit entries are arranged in 2 columns; one row holds one 32-bit color.
31 // 256 0
32 // 271 1
33 // ... ..
34 // 510 254
35 // 511 255
36 //
37 // CSA -> clut buffer offset:
38 // 16-bit format: CSA < 32 <=> 16 entries, 16 half-rows of the buffer (for example 0 to 15)
39 // 32-bit format: CSA < 16 <=> 16 entries, 16 full rows of the buffer (for example 256|0 to 271|15)
40
41 static const __aligned16 int s_clut_16bits_mask[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
42
43 template <class T>
44 __forceinline T* GetClutBufferAddress(u32 csa) { return NULL; } // only the specializations below are used
45
46 template <>
47 __forceinline u32* GetClutBufferAddress<u32>(u32 csa)
48 {
49 return (u32*)(g_pbyGSClut + 64 * (csa & 15));
50 }
51
52 template <>
53 __forceinline u16* GetClutBufferAddress<u16>(u32 csa)
54 {
55 return (u16*)(g_pbyGSClut + 64 * (csa & 15) + (csa >= 16 ? 2 : 0));
56 }
57
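// Worked example (illustration only, not part of the original source): with the 1KB buffer
// starting at g_pbyGSClut, the specializations above map CSA values as follows:
//   GetClutBufferAddress<u32>(2)  -> g_pbyGSClut + 128       (32-bit clut: row 2)
//   GetClutBufferAddress<u16>(2)  -> g_pbyGSClut + 128       (16-bit clut, CSA < 16: right column)
//   GetClutBufferAddress<u16>(18) -> g_pbyGSClut + 128 + 2   (16-bit clut, CSA >= 16: left column)
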
58 /* *****************************************************************
59 * Local memory -> Clut buffer
60 * *****************************************************************/
61
62 #ifdef ZEROGS_SSE2
63 __forceinline void GSMem_to_ClutBuffer__T32_I8_CSM1_sse2(u32* vm, u32 csa)
64 {
65 u32* clut = GetClutBufferAddress<u32>(csa);
66
67 __m128i* src = (__m128i*)vm;
68 __m128i* dst = (__m128i*)clut;
69
70 for (int j = 0; j < 64; j += 32, src += 32, dst += 32)
71 {
72 for (int i = 0; i < 16; i += 4)
73 {
74 __m128i r0 = _mm_load_si128(&src[i+0]);
75 __m128i r1 = _mm_load_si128(&src[i+1]);
76 __m128i r2 = _mm_load_si128(&src[i+2]);
77 __m128i r3 = _mm_load_si128(&src[i+3]);
78
79 _mm_store_si128(&dst[i*2+0], _mm_unpacklo_epi64(r0, r1));
80 _mm_store_si128(&dst[i*2+1], _mm_unpacklo_epi64(r2, r3));
81 _mm_store_si128(&dst[i*2+2], _mm_unpackhi_epi64(r0, r1));
82 _mm_store_si128(&dst[i*2+3], _mm_unpackhi_epi64(r2, r3));
83
84 __m128i r4 = _mm_load_si128(&src[i+0+16]);
85 __m128i r5 = _mm_load_si128(&src[i+1+16]);
86 __m128i r6 = _mm_load_si128(&src[i+2+16]);
87 __m128i r7 = _mm_load_si128(&src[i+3+16]);
88
89 _mm_store_si128(&dst[i*2+4], _mm_unpacklo_epi64(r4, r5));
90 _mm_store_si128(&dst[i*2+5], _mm_unpacklo_epi64(r6, r7));
91 _mm_store_si128(&dst[i*2+6], _mm_unpackhi_epi64(r4, r5));
92 _mm_store_si128(&dst[i*2+7], _mm_unpackhi_epi64(r6, r7));
93 }
94 }
95 }
96
97 __forceinline void GSMem_to_ClutBuffer__T32_I4_CSM1_sse2(u32* vm, u32 csa)
98 {
99 u32* clut = GetClutBufferAddress<u32>(csa);
100
101 __m128i* src = (__m128i*)vm;
102 __m128i* dst = (__m128i*)clut;
103
104 __m128i r0 = _mm_load_si128(&src[0]);
105 __m128i r1 = _mm_load_si128(&src[1]);
106 __m128i r2 = _mm_load_si128(&src[2]);
107 __m128i r3 = _mm_load_si128(&src[3]);
108
109 _mm_store_si128(&dst[0], _mm_unpacklo_epi64(r0, r1));
110 _mm_store_si128(&dst[1], _mm_unpacklo_epi64(r2, r3));
111 _mm_store_si128(&dst[2], _mm_unpackhi_epi64(r0, r1));
112 _mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3));
113 }
114
115
116 template<bool CSA_0_15, bool HIGH_16BITS_VM>
117 __forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
118 {
119 __m128i vm_0;
120 __m128i vm_1;
121 __m128i vm_2;
122 __m128i vm_3;
123 __m128i clut_0;
124 __m128i clut_1;
125 __m128i clut_2;
126 __m128i clut_3;
127
128 __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
129
130 // !HIGH_16BITS_VM
131 // CSA in 0-15
132 // Replace lower 16 bits of clut with lower 16 bits of vm
133 // CSA in 16-31
134 // Replace higher 16 bits of clut with lower 16 bits of vm
135
136 // HIGH_16BITS_VM
137 // CSA in 0-15
138 // Replace lower 16 bits of clut with higher 16 bits of vm
139 // CSA in 16-31
140 // Replace higher 16 bits of clut with higher 16 bits of vm
141 if(HIGH_16BITS_VM && CSA_0_15) {
142 // move up to low
143 vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
144 vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
145 vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
146 vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
147 vm_0 = _mm_srli_epi32(vm_0, 16);
148 vm_1 = _mm_srli_epi32(vm_1, 16);
149 vm_2 = _mm_srli_epi32(vm_2, 16);
150 vm_3 = _mm_srli_epi32(vm_3, 16);
151 } else if(HIGH_16BITS_VM && !CSA_0_15) {
152 // Remove lower 16 bits
153 vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
154 vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
155 vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
156 vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
157 } else if(!HIGH_16BITS_VM && CSA_0_15) {
158 // Remove higher 16 bits
159 vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
160 vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
161 vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
162 vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
163 } else if(!HIGH_16BITS_VM && !CSA_0_15) {
164 // move low to high
165 vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
166 vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
167 vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
168 vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
169 vm_0 = _mm_slli_epi32(vm_0, 16);
170 vm_1 = _mm_slli_epi32(vm_1, 16);
171 vm_2 = _mm_slli_epi32(vm_2, 16);
172 vm_3 = _mm_slli_epi32(vm_3, 16);
173 }
174
175 // Unswizzle the data
176 __m128i row_0 = _mm_unpacklo_epi64(vm_0, vm_1); // 3 2 1 0
177 __m128i row_1 = _mm_unpacklo_epi64(vm_2, vm_3); // 7 6 5 4
178 __m128i row_2 = _mm_unpackhi_epi64(vm_0, vm_1); // 11 10 9 8
179 __m128i row_3 = _mm_unpackhi_epi64(vm_2, vm_3); // 15 14 13 12
180
181 // load old data & remove useless part
182 if(CSA_0_15) {
183 // Remove lower 16 bits
184 clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
185 clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
186 clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
187 clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
188 } else {
189 // Remove higher 16 bits
190 clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
191 clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
192 clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
193 clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
194 }
195
196 // Merge old & new data
197 clut_0 = _mm_or_si128(clut_0, row_0);
198 clut_1 = _mm_or_si128(clut_1, row_1);
199 clut_2 = _mm_or_si128(clut_2, row_2);
200 clut_3 = _mm_or_si128(clut_3, row_3);
201
202 _mm_store_si128((__m128i*)clut, clut_0);
203 _mm_store_si128((__m128i*)clut+1, clut_1);
204 _mm_store_si128((__m128i*)clut+2, clut_2);
205 _mm_store_si128((__m128i*)clut+3, clut_3);
206 }
207
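// Scalar reference for the <CSA_0_15 = true, HIGH_16BITS_VM = false> case above (illustration
// only; it assumes, like the SSE2 path, that 'vm' and 'clut' each point at 16 aligned dwords):
//
//   static const int deswizzle[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
//   for (int i = 0; i < 16; i++) {
//       u32 color16 = vm[deswizzle[i]] & 0xffff;       // low 16 bits of the swizzled source dword
//       clut[i] = (clut[i] & 0xffff0000) | color16;    // merge into the right (low halfword) column
//   }
//
// The other template cases differ only in which halfword is taken from vm and which halfword
// of the clut dword is replaced.
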
208 __forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_sse2(u32* vm, u32 csa)
209 {
210 u32* clut = GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
211
212 if (csa > 15) {
213 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<false, false>(vm, clut);
214 } else {
215 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<true, false>(vm, clut);
216 }
217 }
218
219 __forceinline void GSMem_to_ClutBuffer__T16_I8_CSM1_sse2(u32* vm, u32 csa)
220 {
221 // update the right clut column (csa < 16)
222 u32* clut = GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
223
224 u32 csa_right = (csa < 16) ? 16 - csa : 0;
225
226 for(int i = (csa_right/2); i > 0 ; --i) {
227 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<true,false>(vm, clut);
228 clut += 16;
229 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<true,true>(vm, clut);
230 clut += 16;
231 vm += 16; // go down one column
232 }
233
234 // update the left clut column
235 u32 csa_left = (csa >= 16) ? 16 : csa;
236
237 // In case csa_right is odd (so csa_left is also odd), we cross the clut column
238 if(csa_right & 0x1) {
239 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<true,false>(vm, clut);
240 // go back to the base before processing left clut column
241 clut = GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
242
243 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<false,true>(vm, clut);
244 clut += 16;
245 vm += 16; // go down one column
246 } else if(csa_right != 0) {
247 // go back to the base before processing left clut column
248 clut = GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
249
250 }
251
252 for(int i = (csa_left/2); i > 0 ; --i) {
253 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<false,false>(vm, clut);
254 clut += 16;
255 GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<false,true>(vm, clut);
256 clut += 16;
257 vm += 16; // go down one column
258 }
259 }
260
261 #endif // ZEROGS_SSE2
262
263 __forceinline void GSMem_to_ClutBuffer__T16_I8_CSM1_c(u32* _vm, u32 csa)
264 {
265 const static u32 map[] =
266 {
267 0, 2, 8, 10, 16, 18, 24, 26,
268 4, 6, 12, 14, 20, 22, 28, 30,
269 1, 3, 9, 11, 17, 19, 25, 27,
270 5, 7, 13, 15, 21, 23, 29, 31
271 };
272
273 u16* vm = (u16*)_vm;
274 u16* clut = GetClutBufferAddress<u16>(csa);
275
276 int left = ((u32)(uptr)clut & 2) ? 512 : 512 - (((u32)(uptr)clut) & 0x3ff) / 2;
277
278 for (int j = 0; j < 8; j++, vm += 32, clut += 64, left -= 32)
279 {
280 if (left == 32)
281 {
282 assert(left == 32);
283
284 for (int i = 0; i < 16; i++)
285 clut[2*i] = vm[map[i]];
286
287 clut = (u16*)((uptr)clut & ~0x3ff) + 1;
288
289 for (int i = 16; i < 32; i++)
290 clut[2*i] = vm[map[i]];
291 }
292 else
293 {
294 if (left == 0)
295 {
296 clut = (u16*)((uptr)clut & ~0x3ff) + 1;
297 left = -1;
298 }
299
300 for (int i = 0; i < 32; i++)
301 clut[2*i] = vm[map[i]];
302 }
303 }
304 }
305
306 __forceinline void GSMem_to_ClutBuffer__T32_I8_CSM1_c(u32* vm, u32 csa)
307 {
308 u64* src = (u64*)vm;
309 u64* dst = (u64*)GetClutBufferAddress<u32>(csa);
310
311 for (int j = 0; j < 2; j++, src += 32)
312 {
313 for (int i = 0; i < 4; i++, dst += 16, src += 8)
314 {
315 dst[0] = src[0];
316 dst[1] = src[2];
317 dst[2] = src[4];
318 dst[3] = src[6];
319 dst[4] = src[1];
320 dst[5] = src[3];
321 dst[6] = src[5];
322 dst[7] = src[7];
323
324 dst[8] = src[32];
325 dst[9] = src[32+2];
326 dst[10] = src[32+4];
327 dst[11] = src[32+6];
328 dst[12] = src[32+1];
329 dst[13] = src[32+3];
330 dst[14] = src[32+5];
331 dst[15] = src[32+7];
332 }
333 }
334 }
335
336 __forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_c(u32* _vm, u32 csa)
337 {
338 u16* dst = GetClutBufferAddress<u16>(csa);
339 u16* src = (u16*)_vm;
340
341 dst[0] = src[0];
342 dst[2] = src[2];
343 dst[4] = src[8];
344 dst[6] = src[10];
345 dst[8] = src[16];
346 dst[10] = src[18];
347 dst[12] = src[24];
348 dst[14] = src[26];
349 dst[16] = src[4];
350 dst[18] = src[6];
351 dst[20] = src[12];
352 dst[22] = src[14];
353 dst[24] = src[20];
354 dst[26] = src[22];
355 dst[28] = src[28];
356 dst[30] = src[30];
357 }
358
359 __forceinline void GSMem_to_ClutBuffer__T32_I4_CSM1_c(u32* vm, u32 csa)
360 {
361 u64* src = (u64*)vm;
362 u64* dst = (u64*)GetClutBufferAddress<u32>(csa);
363
364 dst[0] = src[0];
365 dst[1] = src[2];
366 dst[2] = src[4];
367 dst[3] = src[6];
368 dst[4] = src[1];
369 dst[5] = src[3];
370 dst[6] = src[5];
371 dst[7] = src[7];
372 }
373
374 // Main GSmem to Clutbuffer function
375 /*__forceinline*/ void GSMem_to_ClutBuffer(tex0Info &tex0)
376 {
377 int entries = PSMT_IS8CLUT(tex0.psm) ? 256 : 16;
378
379 u8* _src = g_pbyGSMemory + 256 * tex0.cbp;
380
381 if (tex0.csm)
382 {
383 switch (tex0.cpsm)
384 {
385 // 16-bit psm
386 // eggomania uses non-16-bit textures for CSM2
387
388 case PSMCT16:
389 {
390 u16* src = (u16*)_src;
391 u16 *dst = GetClutBufferAddress<u16>(tex0.csa);
392
393 for (int i = 0; i < entries; ++i)
394 {
395 *dst = src[getPixelAddress16_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
396 dst += 2;
397
398 // check for wrapping
399 if (((uptr)dst & 0x3ff) == 0) dst = GetClutBufferAddress<u16>(16);
400 }
401 break;
402 }
403
404 case PSMCT16S:
405 {
406 u16* src = (u16*)_src;
407 u16 *dst = GetClutBufferAddress<u16>(tex0.csa);
408
409 for (int i = 0; i < entries; ++i)
410 {
411 *dst = src[getPixelAddress16S_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
412 dst += 2;
413
414 // check for wrapping
415 if (((uptr)dst & 0x3ff) == 0) dst = GetClutBufferAddress<u16>(16);
416 }
417 break;
418 }
419
420 case PSMCT32:
421 case PSMCT24:
422 {
423 u32* src = (u32*)_src;
424 u32 *dst = GetClutBufferAddress<u32>(tex0.csa);
425
426 // check that the last read address stays inside GS memory
427
428 if (src + getPixelAddress32_0(gs.clut.cou + entries - 1, gs.clut.cov, gs.clut.cbw) >= (u32*)g_pbyGSMemory + 0x00100000)
429 ZZLog::Error_Log("texClutWrite out of bounds.");
430 else
431 for (int i = 0; i < entries; ++i)
432 {
433 *dst = src[getPixelAddress32_0(gs.clut.cou+i, gs.clut.cov, gs.clut.cbw)];
434 dst++;
435 }
436 break;
437 }
438
439 default:
440 {
441 //ZZLog::Debug_Log("Unknown cpsm: %x (%x).", tex0.cpsm, tex0.psm);
442 break;
443 }
444 }
445 }
446 else
447 {
448 u32* src = (u32*)_src;
449
450 if (entries == 16)
451 {
452 if (tex0.cpsm < 2) {
453 #ifdef ZEROGS_SSE2
454 GSMem_to_ClutBuffer__T32_I4_CSM1_sse2(src, tex0.csa);
455 #else
456 GSMem_to_ClutBuffer__T32_I4_CSM1_c(src, tex0.csa);
457 #endif
458 } else {
459 #ifdef ZEROGS_SSE2
460 GSMem_to_ClutBuffer__T16_I4_CSM1_sse2(src, tex0.csa);
461 #else
462 GSMem_to_ClutBuffer__T16_I4_CSM1_c(src, tex0.csa);
463 #endif
464 }
465 }
466 else
467 {
468 if (tex0.cpsm < 2) {
469 #ifdef ZEROGS_SSE2
470 GSMem_to_ClutBuffer__T32_I8_CSM1_sse2(src, tex0.csa);
471 #else
472 GSMem_to_ClutBuffer__T32_I8_CSM1_c(src, tex0.csa);
473 #endif
474 } else {
475 #ifdef ZEROGS_SSE2
476 GSMem_to_ClutBuffer__T16_I8_CSM1_sse2(src, tex0.csa);
477 #else
478 GSMem_to_ClutBuffer__T16_I8_CSM1_c(src, tex0.csa);
479 #endif
480 }
481
482 }
483 }
484 }
485
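// Hypothetical call-site sketch (illustration only; the real caller lives elsewhere in the plugin
// and the variable names are assumptions):
//
//   tex0Info tex0;
//   tex0.cbp  = clut_block_pointer; // clut data is read from g_pbyGSMemory + 256 * cbp
//   tex0.csa  = 0;                  // clut entry offset
//   tex0.cpsm = PSMCT32;            // 32-bit clut format -> T32 paths
//   tex0.csm  = 0;                  // CSM1 (swizzled layout), handled by the helpers above
//   tex0.psm  = PSMT8;              // 8-bit indexed texture -> 256 clut entries
//   GSMem_to_ClutBuffer(tex0);      // deswizzle/copy the clut into the local 1KB buffer
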
486 /* *****************************************************************
487 * Clut buffer -> local C array (linear)
488 * *****************************************************************/
489 template <class T>
490 /*__forceinline*/ void ClutBuffer_to_Array(T* dst, u32 csa, u32 clutsize) {}
491
492 template <>
493 /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
494 {
495 u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
496 memcpy_amd((u8*)dst, clut, clutsize);
497 }
498
499 template <>
500 /*__forceinline*/ void ClutBuffer_to_Array<u16>(u16* dst, u32 csa, u32 clutsize)
501 {
502 u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
503
504 // which side to copy
505 u32 clutsize_right;
506 u32 clutsize_left;
507 if (csa < 16) {
508 clutsize_right = min(clutsize, (16-csa)*64);
509 clutsize_left = clutsize - clutsize_right;
510 } else {
511 clutsize_right = 0;
512 clutsize_left = clutsize;
513 }
514
515 while (clutsize_right > 0)
516 {
517 #ifdef ZEROGS_SSE2
518 // only lower 16 bits of dword are valid
519 __m128i clut_0 = _mm_load_si128((__m128i*)clut);
520 __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
521 __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
522 __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
523
524 clut_0 = _mm_shufflelo_epi16(clut_0, 0x88);
525 clut_1 = _mm_shufflelo_epi16(clut_1, 0x88);
526 clut_2 = _mm_shufflelo_epi16(clut_2, 0x88);
527 clut_3 = _mm_shufflelo_epi16(clut_3, 0x88);
528
529 clut_0 = _mm_shufflehi_epi16(clut_0, 0x88); // - - 3 2 1 0 - -
530 clut_1 = _mm_shufflehi_epi16(clut_1, 0x88);
531 clut_2 = _mm_shufflehi_epi16(clut_2, 0x88);
532 clut_3 = _mm_shufflehi_epi16(clut_3, 0x88);
533
534 clut_0 = _mm_srli_si128(clut_0, 4);
535 clut_1 = _mm_srli_si128(clut_1, 4);
536 clut_2 = _mm_srli_si128(clut_2, 4);
537 clut_3 = _mm_srli_si128(clut_3, 4);
538
539 _mm_store_si128((__m128i*)dst, _mm_unpacklo_epi64(clut_0, clut_1));
540 _mm_store_si128((__m128i*)dst+1, _mm_unpacklo_epi64(clut_2, clut_3));
541 #else
542 for(int i = 0; i < 16; ++i)
543 dst[i] = clut[2*i];
544 #endif
545
546 dst += 16;
547 clut += 32;
548 clutsize_right -= 32;
549 }
550
551 if(csa < 16) {
552 // go back to the base before processing left clut column
553 clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
554 }
555
556 while (clutsize_left > 0)
557 {
558 #ifdef ZEROGS_SSE2
559 // only higher 16 bits of dword are valid
560 __m128i clut_0 = _mm_load_si128((__m128i*)clut);
561 __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
562 __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
563 __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
564
565 clut_0 = _mm_shufflelo_epi16(clut_0, 0x88);
566 clut_1 = _mm_shufflelo_epi16(clut_1, 0x88);
567 clut_2 = _mm_shufflelo_epi16(clut_2, 0x88);
568 clut_3 = _mm_shufflelo_epi16(clut_3, 0x88);
569
570 clut_0 = _mm_shufflehi_epi16(clut_0, 0x88); // - - 3 2 1 0 - -
571 clut_1 = _mm_shufflehi_epi16(clut_1, 0x88);
572 clut_2 = _mm_shufflehi_epi16(clut_2, 0x88);
573 clut_3 = _mm_shufflehi_epi16(clut_3, 0x88);
574
575 clut_0 = _mm_srli_si128(clut_0, 4);
576 clut_1 = _mm_srli_si128(clut_1, 4);
577 clut_2 = _mm_srli_si128(clut_2, 4);
578 clut_3 = _mm_srli_si128(clut_3, 4);
579
580 _mm_store_si128((__m128i*)dst, _mm_unpacklo_epi64(clut_0, clut_1));
581 _mm_store_si128((__m128i*)dst+1, _mm_unpacklo_epi64(clut_2, clut_3));
582 #else
583 // Note +1 because we change higher 16 bits
584 for(int i = 0; i < 16; ++i)
585 dst[i] = clut[2*i+1];
586 #endif
587
588 dst += 16;
589 clut += 32;
590 clutsize_left -= 32;
591 }
592 }
593
594 /* *****************************************************************
595 * Compare: Clut buffer <-> Local Memory
596 * *****************************************************************/
597 // false -> identical
598 // true -> different
599 template <class T>
600 /*__forceinline*/ bool Cmp_ClutBuffer_GSMem(T* GSmem, u32 csa, u32 clutsize);
601
602 template <>
603 /*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
604 {
605 u64* _GSmem = (u64*) GSmem;
606 u64* clut = (u64*)GetClutBufferAddress<u32>(csa);
607
608 while(clutsize > 0) {
609 #ifdef ZEROGS_SSE2
610 // Note: local memory data is swizzled
611 __m128i GSmem_0 = _mm_load_si128((__m128i*)_GSmem); // 9 8 1 0
612 __m128i GSmem_1 = _mm_load_si128((__m128i*)_GSmem+1); // 11 10 3 2
613 __m128i GSmem_2 = _mm_load_si128((__m128i*)_GSmem+2); // 13 12 5 4
614 __m128i GSmem_3 = _mm_load_si128((__m128i*)_GSmem+3); // 15 14 7 6
615
616 __m128i clut_0 = _mm_load_si128((__m128i*)clut);
617 __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
618 __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
619 __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
620
621 __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_0, GSmem_1), clut_0);
622
623 __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_2, GSmem_3), clut_1);
624 result = _mm_and_si128(result, result_tmp);
625
626 result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_0, GSmem_1), clut_2);
627 result = _mm_and_si128(result, result_tmp);
628
629 result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_2, GSmem_3), clut_3);
630 result = _mm_and_si128(result, result_tmp);
631
632 u32 result_int = _mm_movemask_epi8(result);
633 if (result_int != 0xFFFF)
634 return true;
635 #else
636 // I see no point in keeping an MMX version; the SSE2 version is probably faster.
637 // Keep a slow, portable C version for reference/debugging.
638 // Note: local memory data is swizzled
639 if (clut[0] != _GSmem[0] || clut[1] != _GSmem[2] || clut[2] != _GSmem[4] || clut[3] != _GSmem[6]
640 || clut[4] != _GSmem[1] || clut[5] != _GSmem[3] || clut[6] != _GSmem[5] || clut[7] != _GSmem[7])
641 return true;
642 #endif
643
644 // go to the next memory block
645 _GSmem += 32;
646
647 // go back to the previous memory block then down one memory column
648 if (clutsize & 0x40) {
649 _GSmem -= (64-8);
650 }
651 // In case the previous operation (down one column) crossed the block boundary,
652 // go to the next block
653 if (clutsize == 0x240) {
654 _GSmem += 32;
655 }
656
657 clut += 8;
658 clutsize -= 64;
659 }
660
661 return false;
662 }
663
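// Hypothetical dirty-check sketch (illustration only; the caller and buffer names are assumptions):
//
//   u32* gs_clut_src = (u32*)(g_pbyGSMemory + 256 * tex0.cbp);
//   if (Cmp_ClutBuffer_GSMem<u32>(gs_clut_src, tex0.csa, 256 * sizeof(u32)))
//       GSMem_to_ClutBuffer(tex0);  // contents differ -> refresh the local clut buffer
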
664 #ifdef ZEROGS_SSE2
665 template<bool CSA_0_15, bool HIGH_16BITS_VM>
666 __forceinline bool Cmp_ClutBuffer_GSMem_core(u16* GSmem, u16* clut)
667 {
668 __m128i GSmem_0;
669 __m128i GSmem_1;
670 __m128i GSmem_2;
671 __m128i GSmem_3;
672 __m128i clut_0;
673 __m128i clut_1;
674 __m128i clut_2;
675 __m128i clut_3;
676
677 __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
678
679 // !HIGH_16BITS_VM
680 // CSA in 0-15
681 // cmp lower 16 bits of clut with lower 16 bits of GSmem
682 // CSA in 16-31
683 // cmp higher 16 bits of clut with lower 16 bits of GSmem
684
685 // HIGH_16BITS_VM
686 // CSA in 0-15
687 // cmp lower 16 bits of clut with higher 16 bits of GSmem
688 // CSA in 16-31
689 // cmp higher 16 bits of clut with higher 16 bits of GSmem
690 if(HIGH_16BITS_VM && CSA_0_15) {
691 // move up to low
692 GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0
693 GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2
694 GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4
695 GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6
696 GSmem_0 = _mm_srli_epi32(GSmem_0, 16);
697 GSmem_1 = _mm_srli_epi32(GSmem_1, 16);
698 GSmem_2 = _mm_srli_epi32(GSmem_2, 16);
699 GSmem_3 = _mm_srli_epi32(GSmem_3, 16);
700 } else if(HIGH_16BITS_VM && !CSA_0_15) {
701 // Remove lower 16 bits
702 GSmem_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0
703 GSmem_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2
704 GSmem_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4
705 GSmem_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6
706 } else if(!HIGH_16BITS_VM && CSA_0_15) {
707 // Remove higher 16 bits
708 GSmem_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0
709 GSmem_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2
710 GSmem_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4
711 GSmem_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6
712 } else if(!HIGH_16BITS_VM && !CSA_0_15) {
713 // move low to high
714 GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0
715 GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2
716 GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4
717 GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6
718 GSmem_0 = _mm_slli_epi32(GSmem_0, 16);
719 GSmem_1 = _mm_slli_epi32(GSmem_1, 16);
720 GSmem_2 = _mm_slli_epi32(GSmem_2, 16);
721 GSmem_3 = _mm_slli_epi32(GSmem_3, 16);
722 }
723
724 // Unswizzle the data
725 __m128i row_0 = _mm_unpacklo_epi64(GSmem_0, GSmem_1); // 3 2 1 0
726 __m128i row_1 = _mm_unpacklo_epi64(GSmem_2, GSmem_3); // 7 6 5 4
727 __m128i row_2 = _mm_unpackhi_epi64(GSmem_0, GSmem_1); // 11 10 9 8
728 __m128i row_3 = _mm_unpackhi_epi64(GSmem_2, GSmem_3); // 15 14 13 12
729
730 // load old data & remove useless part
731 if(!CSA_0_15) {
732 // Remove lower 16 bits
733 clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
734 clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
735 clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
736 clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
737 } else {
738 // Remove higher 16 bits
739 clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
740 clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
741 clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
742 clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
743 }
744
745 // Do the comparison
746 __m128i result = _mm_cmpeq_epi16(row_0, clut_0);
747 __m128i result_tmp = _mm_cmpeq_epi16(row_1, clut_1);
748 result = _mm_and_si128(result, result_tmp);
749
750 result_tmp = _mm_cmpeq_epi16(row_2, clut_2);
751 result = _mm_and_si128(result, result_tmp);
752
753 result_tmp = _mm_cmpeq_epi16(row_3, clut_3);
754 result = _mm_and_si128(result, result_tmp);
755
756 u32 result_int = _mm_movemask_epi8(result);
757 if(CSA_0_15) {
758 // only lower 16bits must be checked
759 if ((result_int&0x3333) != 0x3333)
760 return true;
761 } else {
762 // only higher 16bits must be checked
763 if ((result_int&0xCCCC) != 0xCCCC)
764 return true;
765 }
766
767 return false;
768 }
769 #endif
770
771 template <>
772 /*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u16>(u16* GSmem, u32 csa, u32 clutsize)
773 {
774 #ifdef ZEROGS_SSE2
775 u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
776
777 // Special case: only one CSA block to check
778 if(clutsize == 32) {
779 if (csa < 16)
780 return Cmp_ClutBuffer_GSMem_core<true, false>(GSmem, clut);
781 else
782 return Cmp_ClutBuffer_GSMem_core<false, false>(GSmem, clut);
783 }
784
785 // which side to cmp
786 s32 clutsize_right; // Note clutsize_right could be negative !
787 u32 clutsize_left;
788 if (csa < 16) {
789 // the '-32' is a trick to handle the case where csa is odd more easily
790 clutsize_right = min(clutsize, (16-csa)*32) -32;
791 clutsize_left = clutsize - clutsize_right;
792 } else {
793 clutsize_right = 0;
794 clutsize_left = clutsize;
795 }
796
797 while(clutsize_right > 0) {
798 if (Cmp_ClutBuffer_GSMem_core<true, false>(GSmem, clut))
799 return true;
800 clut += 32;
801
802 if (Cmp_ClutBuffer_GSMem_core<true, true>(GSmem, clut))
803 return true;
804 clut += 32;
805
806 GSmem += 32; // go down one column
807 clutsize_right -= 64;
808 }
809
810 if(csa < 16) {
811 // because of the extra -32, clutsize_right is zero when csa is odd
812 if (clutsize_right == 0) {
813 // cross the clut
814 if (Cmp_ClutBuffer_GSMem_core<true, false>(GSmem, clut))
815 return true;
816 clut += 32;
817
818 if (Cmp_ClutBuffer_GSMem_core<false, true>(GSmem, clut))
819 return true;
820
821 GSmem += 32; // go down one column
822 clutsize_left -= 32;
823 }
824
825 // go back to the base before processing left clut column
826 clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
827 }
828
829 while(clutsize_left > 0) {
830 if (Cmp_ClutBuffer_GSMem_core<false, false>(GSmem, clut))
831 return true;
832 clut += 32;
833
834 if (Cmp_ClutBuffer_GSMem_core<false, true>(GSmem, clut))
835 return true;
836 clut += 32;
837
838 GSmem += 32; // go down one column
839 clutsize_left -= 64;
840 }
841
842 return false;
843 #else
844 // This function only exists as a performance optimization, so for a plain C build
845 // just report the clut as different.
846 return true;
847 #endif
848 }
849
850 /* *****************************************************************
851 * Compare: Clut buffer <-> local C array (linear)
852 * *****************************************************************/
853 // false -> identical
854 // true -> different
855 template <class T>
856 /*__forceinline*/ bool Cmp_ClutBuffer_SavedClut(T* saved_clut, u32 csa, u32 clutsize);
857
858 template <>
859 /*__forceinline*/ bool Cmp_ClutBuffer_SavedClut<u32>(u32* saved_clut, u32 csa, u32 clutsize)
860 {
861 u32* clut = GetClutBufferAddress<u32>(csa);
862 return !!memcmp_mmx(saved_clut, clut, clutsize);
863 }
864
865 template <>
866 /*__forceinline*/ bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u32 csa, u32 clutsize)
867 {
868 assert((clutsize&31) == 0);
869
870 #ifdef ZEROGS_SSE2
871 __m128i zero_128 = _mm_setzero_si128();
872 #endif
873 u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2
874
875 // which side to cmp
876 u32 clutsize_right;
877 u32 clutsize_left;
878 if (csa < 16) {
879 clutsize_right = min(clutsize, (16-csa)*32);
880 clutsize_left = clutsize - clutsize_right;
881 } else {
882 clutsize_right = 0;
883 clutsize_left = clutsize;
884 }
885
886 while (clutsize_right > 0)
887 {
888 #ifdef ZEROGS_SSE2
889 // only lower 16 bits of dword are valid
890 __m128i clut_0 = _mm_load_si128((__m128i*)clut);
891 __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
892 __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
893 __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
894
895 // values must be converted to 32 bits
896 __m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
897 __m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);
898
899 __m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_0, zero_128), clut_0);
900 __m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_0, zero_128), clut_1);
901 result = _mm_and_si128(result, result_tmp);
902
903 result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_1, zero_128), clut_2);
904 result = _mm_and_si128(result, result_tmp);
905
906 result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_1, zero_128), clut_3);
907 result = _mm_and_si128(result, result_tmp);
908
909 u32 result_int = _mm_movemask_epi8(result);
910 // only lower 16bits must be checked
911 if ((result_int&0x3333) != 0x3333)
912 return true;
913 #else
914 for (int i = 0; i < 16; ++i)
915 if (saved_clut[i] != clut[2*i]) return true;
916 #endif
917
918 saved_clut += 16;
919 clut += 32;
920 clutsize_right -= 32;
921 }
922
923 if(csa < 16) {
924 // go back to the base before processing left clut column
925 clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
926 }
927
928 while (clutsize_left > 0)
929 {
930 #ifdef ZEROGS_SSE2
931 // only higher 16 bits of dword are valid
932 __m128i clut_0 = _mm_load_si128((__m128i*)clut);
933 __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
934 __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
935 __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);
936
937 // values must be converted to 32 bits (with 0 in the lower 16 bits)
938 __m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
939 __m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);
940
941 __m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_0), clut_0);
942 __m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_0), clut_1);
943 result = _mm_and_si128(result, result_tmp);
944
945 result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_1), clut_2);
946 result = _mm_and_si128(result, result_tmp);
947
948 result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_1), clut_3);
949 result = _mm_and_si128(result, result_tmp);
950
951 u32 result_int = _mm_movemask_epi8(result);
952 // only higher 16bits must be checked
953 if ((result_int&0xCCCC) != 0xCCCC)
954 return true;
955 #else
956 // Note +1 because we change higher 16 bits
957 for (int i = 0; i < 16; ++i)
958 if (saved_clut[i] != clut[2*i+1]) return true;
959 #endif
960
961 saved_clut += 16;
962 clut += 32;
963 clutsize_left -= 32;
964 }
965
966 return false;
967 }
968
969
970 /* *****************************************************************
971 * Resolve color of clut texture
972 * *****************************************************************/
973
974 // used to build clut textures (note that this is for both 16 and 32 bit cluts)
975 template <class T>
976 /*__forceinline*/ void Build_Clut_Texture(u32 psm, u32 height, T* pclut, u8* psrc, T* pdst)
977 {
978 switch (psm)
979 {
980 case PSMT8:
981 for (u32 i = 0; i < height; ++i)
982 {
983 for (int j = 0; j < GPU_TEXWIDTH / 2; ++j)
984 {
985 pdst[0] = pclut[psrc[0]];
986 pdst[1] = pclut[psrc[1]];
987 pdst[2] = pclut[psrc[2]];
988 pdst[3] = pclut[psrc[3]];
989 pdst[4] = pclut[psrc[4]];
990 pdst[5] = pclut[psrc[5]];
991 pdst[6] = pclut[psrc[6]];
992 pdst[7] = pclut[psrc[7]];
993 pdst += 8;
994 psrc += 8;
995 }
996 }
997 break;
998
999 case PSMT4:
1000 for (u32 i = 0; i < height; ++i)
1001 {
1002 for (int j = 0; j < GPU_TEXWIDTH; ++j)
1003 {
1004 pdst[0] = pclut[psrc[0] & 15];
1005 pdst[1] = pclut[psrc[0] >> 4];
1006 pdst[2] = pclut[psrc[1] & 15];
1007 pdst[3] = pclut[psrc[1] >> 4];
1008 pdst[4] = pclut[psrc[2] & 15];
1009 pdst[5] = pclut[psrc[2] >> 4];
1010 pdst[6] = pclut[psrc[3] & 15];
1011 pdst[7] = pclut[psrc[3] >> 4];
1012
1013 pdst += 8;
1014 psrc += 4;
1015 }
1016 }
1017 break;
1018
1019 case PSMT8H:
1020 for (u32 i = 0; i < height; ++i)
1021 {
1022 for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
1023 {
1024 pdst[0] = pclut[psrc[3]];
1025 pdst[1] = pclut[psrc[7]];
1026 pdst[2] = pclut[psrc[11]];
1027 pdst[3] = pclut[psrc[15]];
1028 pdst[4] = pclut[psrc[19]];
1029 pdst[5] = pclut[psrc[23]];
1030 pdst[6] = pclut[psrc[27]];
1031 pdst[7] = pclut[psrc[31]];
1032 pdst += 8;
1033 psrc += 32;
1034 }
1035 }
1036 break;
1037
1038 case PSMT4HH:
1039 for (u32 i = 0; i < height; ++i)
1040 {
1041 for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
1042 {
1043 pdst[0] = pclut[psrc[3] >> 4];
1044 pdst[1] = pclut[psrc[7] >> 4];
1045 pdst[2] = pclut[psrc[11] >> 4];
1046 pdst[3] = pclut[psrc[15] >> 4];
1047 pdst[4] = pclut[psrc[19] >> 4];
1048 pdst[5] = pclut[psrc[23] >> 4];
1049 pdst[6] = pclut[psrc[27] >> 4];
1050 pdst[7] = pclut[psrc[31] >> 4];
1051 pdst += 8;
1052 psrc += 32;
1053 }
1054 }
1055 break;
1056
1057 case PSMT4HL:
1058 for (u32 i = 0; i < height; ++i)
1059 {
1060 for (int j = 0; j < GPU_TEXWIDTH / 8; ++j)
1061 {
1062 pdst[0] = pclut[psrc[3] & 15];
1063 pdst[1] = pclut[psrc[7] & 15];
1064 pdst[2] = pclut[psrc[11] & 15];
1065 pdst[3] = pclut[psrc[15] & 15];
1066 pdst[4] = pclut[psrc[19] & 15];
1067 pdst[5] = pclut[psrc[23] & 15];
1068 pdst[6] = pclut[psrc[27] & 15];
1069 pdst[7] = pclut[psrc[31] & 15];
1070 pdst += 8;
1071 psrc += 32;
1072 }
1073 }
1074 break;
1075
1076 default:
1077 assert(0);
1078 }
1079 }
1080
1081 // Instantiate the Build_Clut_Texture template...
1082 template void Build_Clut_Texture<u32>(u32 psm, u32 height, u32* pclut, u8* psrc, u32* pdst);
1083 template void Build_Clut_Texture<u16>(u32 psm, u32 height, u16* pclut, u8* psrc, u16* pdst);
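
// Hypothetical end-to-end sketch for a 256-entry, 32-bit clut (illustration only; the buffer
// names and the call site are assumptions):
//
//   u32 clut_array[256];                                    // linear clut, one u32 per entry
//   ClutBuffer_to_Array<u32>(clut_array, tex0.csa, 1024);   // copy the 1KB clut buffer out linearly
//   Build_Clut_Texture<u32>(PSMT8, height, clut_array,      // look up each 8-bit index in the clut
//                           indexed_src, rgba_dst);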
