/[pcsx2_0.9.7]/trunk/plugins/zzogl-pg/opengl/x86.cpp

Contents of /trunk/plugins/zzogl-pg/opengl/x86.cpp



Revision 280
Thu Dec 23 12:02:12 2010 UTC by william
File size: 34616 byte(s)
re-commit (had local access denied errors when committing)
/* ZZ Open GL graphics plugin
 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "GS.h"
#include "Mem.h"
#include "x86.h"

#if defined(ZEROGS_SSE2)
#include <emmintrin.h>
#endif

// swizzling

// These were only used in the old version of RESOLVE_32_BITS. Keeping for reference.
#if 0

/* FrameSwizzleBlock32 */
void __fastcall FrameSwizzleBlock32_c(u32* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if (WriteMask == 0xffffffff)
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                dst[d[j]] = (src[j]);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                dst[d[j]] = ((src[j])&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}
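
// A minimal stand-alone sketch of the masked merge used throughout these swizzle
// routines (illustrative only, not part of the plugin): bits set in the mask are
// taken from the new value, cleared bits keep the old destination value.
static inline u32 MergeWithWriteMask_example(u32 dstval, u32 srcval, u32 mask)
{
    // e.g. mask == 0x00ffffff updates RGB and leaves the destination alpha byte intact
    return (srcval & mask) | (dstval & ~mask);
}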

// The A2 variants below average two horizontally adjacent source pixels, and the
// A4 variants average a 2x2 block of source pixels, before swizzling them in.
void __fastcall FrameSwizzleBlock32A2_c(u32* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if( WriteMask == 0xffffffff ) {
        for(int i = 0; i < 8; ++i, d += 8) {
            for(int j = 0; j < 8; ++j) {
                dst[d[j]] = ((src[2*j] + src[2*j+1]) >> 1);
            }
            src += srcpitch;
        }
    }
    else {
        for(int i = 0; i < 8; ++i, d += 8) {
            for(int j = 0; j < 8; ++j) {
                dst[d[j]] = (((src[2*j] + src[2*j+1]) >> 1)&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall FrameSwizzleBlock32A4_c(u32* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if( WriteMask == 0xffffffff ) {
        for(int i = 0; i < 8; ++i, d += 8) {
            for(int j = 0; j < 8; ++j) {
                dst[d[j]] = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2);
            }
            src += srcpitch << 1;
        }
    }
    else {
        for(int i = 0; i < 8; ++i, d += 8) {
            for(int j = 0; j < 8; ++j) {
                dst[d[j]] = (((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2)&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch << 1;
        }
    }
}

#define FrameSwizzleBlock24_c FrameSwizzleBlock32_c
#define FrameSwizzleBlock24A2_c FrameSwizzleBlock32A2_c
#define FrameSwizzleBlock24A4_c FrameSwizzleBlock32A4_c

/* FrameSwizzleBlock16 */
void __fastcall FrameSwizzleBlock16_c(u16* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if (WriteMask == 0xffff)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = (src[j]);
                dst[d[j]] = RGBA32to16(temp);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = (src[j]);
                u32 dsrc = RGBA32to16(temp);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall FrameSwizzleBlock16A2_c(u16* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if (WriteMask == 0xffff)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = ((src[2*j] + src[2*j+1]) >> 1);
                dst[d[j]] = RGBA32to16(temp);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = ((src[2*j] + src[2*j+1]) >> 1);
                u32 dsrc = RGBA32to16(temp);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall FrameSwizzleBlock16A4_c(u16* dst, u32* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if (WriteMask == 0xffff)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2);
                dst[d[j]] = RGBA32to16(temp);
            }
            src += srcpitch << 1;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                u32 temp = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2);
                u32 dsrc = RGBA32to16(temp);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch << 1;
        }
    }
}


/* Frame16SwizzleBlock32 */
void __fastcall Frame16SwizzleBlock32_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if( WriteMask == 0xffffffff )
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                dst[d[j]] = Float16ToARGB(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                u32 dsrc = Float16ToARGB(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock32A2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if( WriteMask == 0xffffffff )
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock32A4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if( WriteMask == 0xffffffff )
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB(dsrc16);
            }
            src += srcpitch << 1;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch << 1;
        }
    }
}

/* Frame16SwizzleBlock32Z */
void __fastcall Frame16SwizzleBlock32Z_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];
    if( WriteMask == 0xffffffff ) /* breaks Kingdom Hearts (KH) text if not checked */
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                dst[d[j]] = Float16ToARGB_Z(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                u32 dsrc = Float16ToARGB_Z(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock32ZA2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];
    if( WriteMask == 0xffffffff ) /* breaks Kingdom Hearts (KH) text if not checked */
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB_Z(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB_Z(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock32ZA4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];
    if( WriteMask == 0xffffffff ) /* breaks Kingdom Hearts (KH) text if not checked */
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB_Z(dsrc16);
            }
            src += srcpitch << 1;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 8)
        {
            for(int j = 0; j < 8; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB_Z(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch << 1;
        }
    }
}


/* Frame16SwizzleBlock16 */
void __fastcall Frame16SwizzleBlock16_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                dst[d[j]] = Float16ToARGB16(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[j]);
                u32 dsrc = Float16ToARGB16(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock16A2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB16(dsrc16);
            }
            src += srcpitch;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB16(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch;
        }
    }
}

void __fastcall Frame16SwizzleBlock16A4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8)
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                dst[d[j]] = Float16ToARGB16(dsrc16);
            }
            src += srcpitch << 1;
        }
    }
    else
    {
        for(int i = 0; i < 8; ++i, d += 16)
        {
            for(int j = 0; j < 16; ++j)
            {
                Vector_16F dsrc16 = (src[2*j]);
                u32 dsrc = Float16ToARGB16(dsrc16);
                dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask);
            }
            src += srcpitch << 1;
        }
    }
}

/* Frame16SwizzleBlock16Z */
void __fastcall Frame16SwizzleBlock16Z_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    for(int i = 0; i < 8; ++i, d += 16)
    {
        for(int j = 0; j < 16; ++j)
        {
            Vector_16F dsrc16 = (src[j]);
            dst[d[j]] = Float16ToARGB16_Z(dsrc16);
        }
        src += srcpitch;
    }
}

void __fastcall Frame16SwizzleBlock16ZA2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    for(int i = 0; i < 8; ++i, d += 16)
    {
        for(int j = 0; j < 16; ++j)
        {
            Vector_16F dsrc16 = (src[2*j]);
            dst[d[j]] = Float16ToARGB16_Z(dsrc16);
        }
        src += srcpitch;
    }
}

void __fastcall Frame16SwizzleBlock16ZA4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable16[0][0];

    for(int i = 0; i < 8; ++i, d += 16)
    {
        for(int j = 0; j < 16; ++j)
        {
            Vector_16F dsrc16 = (src[2*j]);
            dst[d[j]] = Float16ToARGB16_Z(dsrc16);
        }
        src += srcpitch << 1;
    }
}
#endif

#ifdef ZEROGS_SSE2

//void __fastcall WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32* clut)
//{
//    __asm {
//        mov eax, vm
//        mov ecx, clut
//        mov edx, 8
//    }
//
//Extract32x2:
//    __asm {
//        movdqa xmm0, qword ptr [eax]
//        movdqa xmm1, qword ptr [eax+16]
//        movdqa xmm2, qword ptr [eax+32]
//        movdqa xmm3, qword ptr [eax+48]
//
//        // rearrange
//        pshuflw xmm0, xmm0, 0xd8
//        pshufhw xmm0, xmm0, 0xd8
//        pshuflw xmm1, xmm1, 0xd8
//        pshufhw xmm1, xmm1, 0xd8
//        pshuflw xmm2, xmm2, 0xd8
//        pshufhw xmm2, xmm2, 0xd8
//        pshuflw xmm3, xmm3, 0xd8
//        pshufhw xmm3, xmm3, 0xd8
//
//        movdqa xmm4, xmm0
//        movdqa xmm6, xmm2
//
//        shufps xmm0, xmm1, 0x88
//        shufps xmm2, xmm3, 0x88
//
//        shufps xmm4, xmm1, 0xdd
//        shufps xmm6, xmm3, 0xdd
//
//        pshufd xmm0, xmm0, 0xd8
//        pshufd xmm2, xmm2, 0xd8
//        pshufd xmm4, xmm4, 0xd8
//        pshufd xmm6, xmm6, 0xd8
//
//        // left column
//        movhlps xmm1, xmm0
//        movlhps xmm0, xmm2
//        //movdqa xmm7, [ecx]
//
//        movdqa [ecx], xmm0
//        shufps xmm1, xmm2, 0xe4
//        movdqa [ecx+16], xmm1
//
//        // right column
//        movhlps xmm3, xmm4
//        movlhps xmm4, xmm6
//        movdqa [ecx+32], xmm4
//        shufps xmm3, xmm6, 0xe4
//        movdqa [ecx+48], xmm3
//
//        add eax, 16*4
//        add ecx, 16*8
//        sub edx, 1
//        cmp edx, 0
//        jne Extract32x2
//    }
//}

#if 0
extern "C" void __fastcall WriteCLUT_T32_I8_CSM1_sse2(u32* vm, u32* clut)
{
    __m128i* src = (__m128i*)vm;
    __m128i* dst = (__m128i*)clut;

    for (int j = 0; j < 64; j += 32, src += 32, dst += 32)
    {
        for (int i = 0; i < 16; i += 4)
        {
            __m128i r0 = _mm_load_si128(&src[i+0]);
            __m128i r1 = _mm_load_si128(&src[i+1]);
            __m128i r2 = _mm_load_si128(&src[i+2]);
            __m128i r3 = _mm_load_si128(&src[i+3]);

            _mm_store_si128(&dst[i*2+0], _mm_unpacklo_epi64(r0, r1));
            _mm_store_si128(&dst[i*2+1], _mm_unpacklo_epi64(r2, r3));
            _mm_store_si128(&dst[i*2+2], _mm_unpackhi_epi64(r0, r1));
            _mm_store_si128(&dst[i*2+3], _mm_unpackhi_epi64(r2, r3));

            __m128i r4 = _mm_load_si128(&src[i+0+16]);
            __m128i r5 = _mm_load_si128(&src[i+1+16]);
            __m128i r6 = _mm_load_si128(&src[i+2+16]);
            __m128i r7 = _mm_load_si128(&src[i+3+16]);

            _mm_store_si128(&dst[i*2+4], _mm_unpacklo_epi64(r4, r5));
            _mm_store_si128(&dst[i*2+5], _mm_unpacklo_epi64(r6, r7));
            _mm_store_si128(&dst[i*2+6], _mm_unpackhi_epi64(r4, r5));
            _mm_store_si128(&dst[i*2+7], _mm_unpackhi_epi64(r6, r7));
        }
    }
}


extern "C" void __fastcall WriteCLUT_T32_I4_CSM1_sse2(u32* vm, u32* clut)
{
    __m128i* src = (__m128i*)vm;
    __m128i* dst = (__m128i*)clut;

    __m128i r0 = _mm_load_si128(&src[0]);
    __m128i r1 = _mm_load_si128(&src[1]);
    __m128i r2 = _mm_load_si128(&src[2]);
    __m128i r3 = _mm_load_si128(&src[3]);

    _mm_store_si128(&dst[0], _mm_unpacklo_epi64(r0, r1));
    _mm_store_si128(&dst[1], _mm_unpacklo_epi64(r2, r3));
    _mm_store_si128(&dst[2], _mm_unpackhi_epi64(r0, r1));
    _mm_store_si128(&dst[3], _mm_unpackhi_epi64(r2, r3));
}
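
// Data movement of the unpacklo/unpackhi pairs above, viewed as 64-bit words
// (same ordering as the scalar WriteCLUT_T32_I4_CSM1_c reference in the #if 0
// block near the end of this file):
//   dst64[0..7] = src64[0], src64[2], src64[4], src64[6], src64[1], src64[3], src64[5], src64[7]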

static const __aligned16 int s_clut_16bits_mask[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
static const __aligned16 int s_clut16mask2[4] = { 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0000, 0xffff0000,
                                                 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
                                               };

template<bool CSA_0_15, bool HIGH_16BITS_VM>
void __fastcall WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
{
    __m128i vm_0;
    __m128i vm_1;
    __m128i vm_2;
    __m128i vm_3;
    __m128i clut_0;
    __m128i clut_1;
    __m128i clut_2;
    __m128i clut_3;

    __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);

    // !HIGH_16BITS_VM
    //   CSA in 0-15:  Replace lower 16 bits of clut0 with lower 16 bits of vm
    //   CSA in 16-31: Replace higher 16 bits of clut0 with lower 16 bits of vm

    // HIGH_16BITS_VM
    //   CSA in 0-15:  Replace lower 16 bits of clut0 with higher 16 bits of vm
    //   CSA in 16-31: Replace higher 16 bits of clut0 with higher 16 bits of vm
    // (A scalar sketch of these four cases follows this function.)
    if(HIGH_16BITS_VM && CSA_0_15) {
        // move up to low
        vm_0 = _mm_load_si128((__m128i*)vm);   // 9 8 1 0
        vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
        vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
        vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
        vm_0 = _mm_srli_epi32(vm_0, 16);
        vm_1 = _mm_srli_epi32(vm_1, 16);
        vm_2 = _mm_srli_epi32(vm_2, 16);
        vm_3 = _mm_srli_epi32(vm_3, 16);
    } else if(HIGH_16BITS_VM && !CSA_0_15) {
        // Remove lower 16 bits
        vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm));   // 9 8 1 0
        vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
        vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
        vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && CSA_0_15) {
        // Remove higher 16 bits
        vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm));   // 9 8 1 0
        vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
        vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
        vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && !CSA_0_15) {
        // move low to high
        vm_0 = _mm_load_si128((__m128i*)vm);   // 9 8 1 0
        vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
        vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
        vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
        vm_0 = _mm_slli_epi32(vm_0, 16);
        vm_1 = _mm_slli_epi32(vm_1, 16);
        vm_2 = _mm_slli_epi32(vm_2, 16);
        vm_3 = _mm_slli_epi32(vm_3, 16);
    }

    // Unswizzle the data
    __m128i row_0 = _mm_unpacklo_epi32(vm_0, vm_1); // 3 2 1 0
    __m128i row_1 = _mm_unpacklo_epi32(vm_2, vm_3); // 7 6 5 4
    __m128i row_2 = _mm_unpackhi_epi32(vm_0, vm_1); // 11 10 9 8
    __m128i row_3 = _mm_unpackhi_epi32(vm_2, vm_3); // 15 14 13 12

    // load old data & remove useless part
    if(CSA_0_15) {
        // Remove lower 16 bits
        clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    } else {
        // Remove higher 16 bits
        clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    }

    // Merge old & new data
    clut_0 = _mm_or_si128(clut_0, row_0);
    clut_1 = _mm_or_si128(clut_1, row_1);
    clut_2 = _mm_or_si128(clut_2, row_2);
    clut_3 = _mm_or_si128(clut_3, row_3);

    _mm_store_si128((__m128i*)clut, clut_0);
    _mm_store_si128((__m128i*)clut+1, clut_1);
    _mm_store_si128((__m128i*)clut+2, clut_2);
    _mm_store_si128((__m128i*)clut+3, clut_3);
}
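
// For reference, a scalar sketch of what the SSE2 core above computes (illustrative
// only, not used by the plugin). perm[] reproduces the unpacklo/unpackhi ordering of
// the four vm registers; the 16-bit halves are then merged exactly as in the
// intrinsic version.
template<bool CSA_0_15, bool HIGH_16BITS_VM>
static void WriteCLUT_T16_I4_CSM1_core_scalar_sketch(const u32* vm, u32* clut)
{
    static const int perm[16] = { 0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15 };

    for (int i = 0; i < 16; ++i)
    {
        u32 v = vm[perm[i]];
        u32 vm16 = HIGH_16BITS_VM ? (v >> 16) : (v & 0xffff);

        if (CSA_0_15)
            clut[i] = (clut[i] & 0xffff0000) | vm16;          // replace the low 16 bits
        else
            clut[i] = (clut[i] & 0x0000ffff) | (vm16 << 16);  // replace the high 16 bits
    }
}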

extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32 csa)
{
    u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));

    if (csa > 15) {
        WriteCLUT_T16_I4_CSM1_core_sse2<false, false>(vm, clut);
    } else {
        WriteCLUT_T16_I4_CSM1_core_sse2<true, false>(vm, clut);
    }
}

extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2_old(u32* vm, u32* clut)
{
#define YET_ANOTHER_INTRINSIC
#ifdef YET_ANOTHER_INTRINSIC
    __m128i vm0 = _mm_load_si128((__m128i*)vm);
    __m128i vm1 = _mm_load_si128((__m128i*)vm+1);
    __m128i vm2 = _mm_load_si128((__m128i*)vm+2);
    __m128i vm3 = _mm_load_si128((__m128i*)vm+3);

    // rearrange 16-bit words
    vm0 = _mm_shufflehi_epi16(vm0, 0x88);
    vm0 = _mm_shufflelo_epi16(vm0, 0x88); // 6 4 6 4 2 0 2 0
    vm1 = _mm_shufflehi_epi16(vm1, 0x88);
    vm1 = _mm_shufflelo_epi16(vm1, 0x88); // 14 12 14 12 10 8 10 8

    // Note: MSVC complains about direct c-cast...
    // vm0 = (__m128i)_mm_shuffle_ps((__m128)vm0, (__m128)vm1, 0x88); // 14 12 10 8 6 4 2 0
    __m128 vm0_f = (_mm_shuffle_ps((__m128&)vm0, (__m128&)vm1, 0x88)); // 14 12 10 8 6 4 2 0
    vm0 = (__m128i&)vm0_f;
    vm0 = _mm_shuffle_epi32(vm0, 0xD8); // 14 12 6 4 10 8 2 0

    // *** Same job for vm2 and vm3
    vm2 = _mm_shufflehi_epi16(vm2, 0x88);
    vm2 = _mm_shufflelo_epi16(vm2, 0x88);
    vm3 = _mm_shufflehi_epi16(vm3, 0x88);
    vm3 = _mm_shufflelo_epi16(vm3, 0x88);

    // Note: MSVC complains about direct c-cast...
    // vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
    __m128 vm2_f = (_mm_shuffle_ps((__m128&)vm2, (__m128&)vm3, 0x88));
    vm2 = (__m128i&)vm2_f;
    vm2 = _mm_shuffle_epi32(vm2, 0xD8);

    // Create a zero register.
    __m128i zero_128 = _mm_setzero_si128();

    if ((u32)clut & 0x0F) {
        // Unaligned write.

        u16* clut_word_ptr = (u16*)clut;
        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask2);

        // Load previous data and clear high 16 bits of double words
        __m128i clut_0 = _mm_load_si128((__m128i*)(clut_word_ptr-1));   // 6 5 4 3 2 1 0 x
        __m128i clut_2 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+2); // 22 21 20 19 18 17 16 15
        clut_0 = _mm_and_si128(clut_0, clut_mask); // - 5 - 3 - 1 - x
        clut_2 = _mm_and_si128(clut_2, clut_mask); // - 21 - 19 - 17 - 15

        // Convert vm0 from 16 bits to 32 bits (zero extended)
        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128);  // - 10 - 8 - 2 - 0
        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4

        // shift the values to align them with the clut
        vm0_low = _mm_slli_epi32(vm0_low, 16);   // 10 - 8 - 2 - 0 -
        vm0_high = _mm_slli_epi32(vm0_high, 16); // 14 - 12 - 6 - 4 -

        // Merge old and new data
        clut_0 = _mm_or_si128(clut_0, vm0_low);  // 10 5 8 3 2 1 0 x
        clut_2 = _mm_or_si128(clut_2, vm0_high); // 14 21 12 19 6 17 4 15

        // Save the result
        _mm_store_si128((__m128i*)(clut_word_ptr-1), clut_0);
        _mm_store_si128((__m128i*)(clut_word_ptr-1)+2, clut_2);

        // *** Same job for clut_1 and clut_3
        __m128i clut_1 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+1);
        __m128i clut_3 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+3);
        clut_1 = _mm_and_si128(clut_1, clut_mask);
        clut_3 = _mm_and_si128(clut_3, clut_mask);

        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
        vm2_low = _mm_slli_epi32(vm2_low, 16);
        vm2_high = _mm_slli_epi32(vm2_high, 16);

        clut_1 = _mm_or_si128(clut_1, vm2_low);
        clut_3 = _mm_or_si128(clut_3, vm2_high);

        _mm_store_si128((__m128i*)(clut_word_ptr-1)+1, clut_1);
        _mm_store_si128((__m128i*)(clut_word_ptr-1)+3, clut_3);
    } else {
        // Standard write

        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask);

        // Load previous data and clear low 16 bits of double words
        __m128i clut_0 = _mm_and_si128(_mm_load_si128((__m128i*)clut), clut_mask);   // 7 - 5 - 3 - 1 -
        __m128i clut_2 = _mm_and_si128(_mm_load_si128((__m128i*)clut+2), clut_mask); // 23 - 21 - 19 - 17 -

        // Convert vm0 from 16 bits to 32 bits (zero extended)
        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128);  // - 10 - 8 - 2 - 0
        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4

        // Merge old and new data
        clut_0 = _mm_or_si128(clut_0, vm0_low);  // 7 10 5 8 3 2 1 0
        clut_2 = _mm_or_si128(clut_2, vm0_high); // 23 14 21 12 19 6 17 4

        // Save the result
        _mm_store_si128((__m128i*)clut, clut_0);
        _mm_store_si128((__m128i*)clut+2, clut_2);

        // *** Same job for clut_1 and clut_3
        __m128i clut_1 = _mm_and_si128(_mm_load_si128((__m128i*)clut+1), clut_mask);
        __m128i clut_3 = _mm_and_si128(_mm_load_si128((__m128i*)clut+3), clut_mask);

        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);

        clut_1 = _mm_or_si128(clut_1, vm2_low);
        clut_3 = _mm_or_si128(clut_3, vm2_high);

        _mm_store_si128((__m128i*)clut+1, clut_1);
        _mm_store_si128((__m128i*)clut+3, clut_3);
    }

#else
#if defined(_MSC_VER)
    __asm
    {
        mov eax, vm
        mov ecx, clut
        movdqa xmm0, qword ptr [eax]
        movdqa xmm1, qword ptr [eax+16]
        movdqa xmm2, qword ptr [eax+32]
        movdqa xmm3, qword ptr [eax+48]

        // rearrange
        pshuflw xmm0, xmm0, 0x88
        pshufhw xmm0, xmm0, 0x88
        pshuflw xmm1, xmm1, 0x88
        pshufhw xmm1, xmm1, 0x88
        pshuflw xmm2, xmm2, 0x88
        pshufhw xmm2, xmm2, 0x88
        pshuflw xmm3, xmm3, 0x88
        pshufhw xmm3, xmm3, 0x88

        shufps xmm0, xmm1, 0x88
        shufps xmm2, xmm3, 0x88

        pshufd xmm0, xmm0, 0xd8
        pshufd xmm2, xmm2, 0xd8

        pxor xmm6, xmm6

        test ecx, 15
        jnz WriteUnaligned

        movdqa xmm7, s_clut16mask // saves upper 16 bits

        // have to save interlaced with the old data
        movdqa xmm4, [ecx]
        movdqa xmm5, [ecx+32]
        movhlps xmm1, xmm0
        movlhps xmm0, xmm2 // lower 8 colors

        pand xmm4, xmm7
        pand xmm5, xmm7

        shufps xmm1, xmm2, 0xe4 // upper 8 colors
        movdqa xmm2, xmm0
        movdqa xmm3, xmm1

        punpcklwd xmm0, xmm6
        punpcklwd xmm1, xmm6
        por xmm0, xmm4
        por xmm1, xmm5

        punpckhwd xmm2, xmm6
        punpckhwd xmm3, xmm6

        movdqa [ecx], xmm0
        movdqa [ecx+32], xmm1

        movdqa xmm5, xmm7
        pand xmm7, [ecx+16]
        pand xmm5, [ecx+48]

        por xmm2, xmm7
        por xmm3, xmm5

        movdqa [ecx+16], xmm2
        movdqa [ecx+48], xmm3
        jmp End

WriteUnaligned:
        // ecx is offset by 2
        sub ecx, 2

        movdqa xmm7, s_clut16mask2 // saves lower 16 bits

        // have to save interlaced with the old data
        movdqa xmm4, [ecx]
        movdqa xmm5, [ecx+32]
        movhlps xmm1, xmm0
        movlhps xmm0, xmm2 // lower 8 colors

        pand xmm4, xmm7
        pand xmm5, xmm7

        shufps xmm1, xmm2, 0xe4 // upper 8 colors
        movdqa xmm2, xmm0
        movdqa xmm3, xmm1

        punpcklwd xmm0, xmm6
        punpcklwd xmm1, xmm6
        pslld xmm0, 16
        pslld xmm1, 16
        por xmm0, xmm4
        por xmm1, xmm5

        punpckhwd xmm2, xmm6
        punpckhwd xmm3, xmm6
        pslld xmm2, 16
        pslld xmm3, 16

        movdqa [ecx], xmm0
        movdqa [ecx+32], xmm1

        movdqa xmm5, xmm7
        pand xmm7, [ecx+16]
        pand xmm5, [ecx+48]

        por xmm2, xmm7
        por xmm3, xmm5

        movdqa [ecx+16], xmm2
        movdqa [ecx+48], xmm3

End:
    }
#else
    __asm__ __volatile__(".intel_syntax noprefix\n"
        "movdqa xmm0, xmmword ptr [%[vm]]\n"
        "movdqa xmm1, xmmword ptr [%[vm]+16]\n"
        "movdqa xmm2, xmmword ptr [%[vm]+32]\n"
        "movdqa xmm3, xmmword ptr [%[vm]+48]\n"

        // rearrange
        "pshuflw xmm0, xmm0, 0x88\n"
        "pshufhw xmm0, xmm0, 0x88\n"
        "pshuflw xmm1, xmm1, 0x88\n"
        "pshufhw xmm1, xmm1, 0x88\n"
        "pshuflw xmm2, xmm2, 0x88\n"
        "pshufhw xmm2, xmm2, 0x88\n"
        "pshuflw xmm3, xmm3, 0x88\n"
        "pshufhw xmm3, xmm3, 0x88\n"

        "shufps xmm0, xmm1, 0x88\n"
        "shufps xmm2, xmm3, 0x88\n"

        "pshufd xmm0, xmm0, 0xd8\n"
        "pshufd xmm2, xmm2, 0xd8\n"

        "pxor xmm6, xmm6\n"

        "test %[clut], 15\n"
        "jnz WriteUnaligned\n"

        "movdqa xmm7, %[s_clut16mask]\n" // saves upper 16 bits

        // have to save interlaced with the old data
        "movdqa xmm4, [%[clut]]\n"
        "movdqa xmm5, [%[clut]+32]\n"
        "movhlps xmm1, xmm0\n"
        "movlhps xmm0, xmm2\n" // lower 8 colors

        "pand xmm4, xmm7\n"
        "pand xmm5, xmm7\n"

        "shufps xmm1, xmm2, 0xe4\n" // upper 8 colors
        "movdqa xmm2, xmm0\n"
        "movdqa xmm3, xmm1\n"

        "punpcklwd xmm0, xmm6\n"
        "punpcklwd xmm1, xmm6\n"
        "por xmm0, xmm4\n"
        "por xmm1, xmm5\n"

        "punpckhwd xmm2, xmm6\n"
        "punpckhwd xmm3, xmm6\n"

        "movdqa [%[clut]], xmm0\n"
        "movdqa [%[clut]+32], xmm1\n"

        "movdqa xmm5, xmm7\n"
        "pand xmm7, [%[clut]+16]\n"
        "pand xmm5, [%[clut]+48]\n"

        "por xmm2, xmm7\n"
        "por xmm3, xmm5\n"

        "movdqa [%[clut]+16], xmm2\n"
        "movdqa [%[clut]+48], xmm3\n"
        "jmp WriteCLUT_T16_I4_CSM1_End\n"

        "WriteUnaligned:\n"
        // %[clut] is offset by 2
        "sub %[clut], 2\n"

        "movdqa xmm7, %[s_clut16mask2]\n" // saves lower 16 bits

        // have to save interlaced with the old data
        "movdqa xmm4, [%[clut]]\n"
        "movdqa xmm5, [%[clut]+32]\n"
        "movhlps xmm1, xmm0\n"
        "movlhps xmm0, xmm2\n" // lower 8 colors

        "pand xmm4, xmm7\n"
        "pand xmm5, xmm7\n"

        "shufps xmm1, xmm2, 0xe4\n" // upper 8 colors
        "movdqa xmm2, xmm0\n"
        "movdqa xmm3, xmm1\n"

        "punpcklwd xmm0, xmm6\n"
        "punpcklwd xmm1, xmm6\n"
        "pslld xmm0, 16\n"
        "pslld xmm1, 16\n"
        "por xmm0, xmm4\n"
        "por xmm1, xmm5\n"

        "punpckhwd xmm2, xmm6\n"
        "punpckhwd xmm3, xmm6\n"
        "pslld xmm2, 16\n"
        "pslld xmm3, 16\n"

        "movdqa [%[clut]], xmm0\n"
        "movdqa [%[clut]+32], xmm1\n"

        "movdqa xmm5, xmm7\n"
        "pand xmm7, [%[clut]+16]\n"
        "pand xmm5, [%[clut]+48]\n"

        "por xmm2, xmm7\n"
        "por xmm3, xmm5\n"

        "movdqa [%[clut]+16], xmm2\n"
        "movdqa [%[clut]+48], xmm3\n"
        "WriteCLUT_T16_I4_CSM1_End:\n"
        "\n"
        ".att_syntax\n"
        :
        : [vm] "r" (vm), [clut] "r" (clut), [s_clut16mask] "m" (*s_clut16mask), [s_clut16mask2] "m" (*s_clut16mask2)
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
#endif // _MSC_VER
#endif
}

__forceinline void WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa)
{
    // update the right clut column (csa < 16)
    u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));
    u32 csa_right = (csa < 16) ? 16 - csa : 0;

    for(int i = (csa_right/2); i > 0 ; --i) {
        WriteCLUT_T16_I4_CSM1_core_sse2<true,false>(vm, clut);
        clut += 16;
        WriteCLUT_T16_I4_CSM1_core_sse2<true,true>(vm, clut);
        clut += 16;
        vm += 16; // go down one column
    }

    // update the left clut column
    u32 csa_left = (csa >= 16) ? 16 : csa;

    // In case csa_right is odd (so csa_left is also odd), we cross the clut column
    if(csa_right & 0x1) {
        WriteCLUT_T16_I4_CSM1_core_sse2<true,false>(vm, clut);
        // go back to the base before processing left clut column
        clut = (u32*)(g_pbyGSClut);
        WriteCLUT_T16_I4_CSM1_core_sse2<false,true>(vm, clut);
    } else if(csa_right != 0) {
        // go back to the base before processing left clut column
        clut = (u32*)(g_pbyGSClut);
    }

    for(int i = (csa_left/2); i > 0 ; --i) {
        WriteCLUT_T16_I4_CSM1_core_sse2<false,false>(vm, clut);
        clut += 16;
        WriteCLUT_T16_I4_CSM1_core_sse2<false,true>(vm, clut);
        clut += 16;
        vm += 16; // go down one column
    }
}
#endif

#endif // ZEROGS_SSE2

#if 0
void __fastcall WriteCLUT_T16_I8_CSM1_c(u32* _vm, u32* _clut)
{
    const static u32 map[] =
    {
        0, 2, 8, 10, 16, 18, 24, 26,
        4, 6, 12, 14, 20, 22, 28, 30,
        1, 3, 9, 11, 17, 19, 25, 27,
        5, 7, 13, 15, 21, 23, 29, 31
    };

    u16* vm = (u16*)_vm;
    u16* clut = (u16*)_clut;

    int left = ((u32)(uptr)clut & 2) ? 512 : 512 - (((u32)(uptr)clut) & 0x3ff) / 2;

    for (int j = 0; j < 8; j++, vm += 32, clut += 64, left -= 32)
    {
        if (left == 32)
        {
            assert(left == 32);

            for (int i = 0; i < 16; i++)
                clut[2*i] = vm[map[i]];

            clut = (u16*)((uptr)clut & ~0x3ff) + 1;

            for (int i = 16; i < 32; i++)
                clut[2*i] = vm[map[i]];
        }
        else
        {
            if (left == 0)
            {
                clut = (u16*)((uptr)clut & ~0x3ff) + 1;
                left = -1;
            }

            for (int i = 0; i < 32; i++)
                clut[2*i] = vm[map[i]];
        }
    }
}

void __fastcall WriteCLUT_T32_I8_CSM1_c(u32* vm, u32* clut)
{
    u64* src = (u64*)vm;
    u64* dst = (u64*)clut;

    for (int j = 0; j < 2; j++, src += 32)
    {
        for (int i = 0; i < 4; i++, dst += 16, src += 8)
        {
            dst[0] = src[0];
            dst[1] = src[2];
            dst[2] = src[4];
            dst[3] = src[6];
            dst[4] = src[1];
            dst[5] = src[3];
            dst[6] = src[5];
            dst[7] = src[7];

            dst[8] = src[32];
            dst[9] = src[32+2];
            dst[10] = src[32+4];
            dst[11] = src[32+6];
            dst[12] = src[32+1];
            dst[13] = src[32+3];
            dst[14] = src[32+5];
            dst[15] = src[32+7];
        }
    }
}

void __fastcall WriteCLUT_T16_I4_CSM1_c(u32* _vm, u32* _clut)
{
    u16* dst = (u16*)_clut;
    u16* src = (u16*)_vm;

    dst[0] = src[0];
    dst[2] = src[2];
    dst[4] = src[8];
    dst[6] = src[10];
    dst[8] = src[16];
    dst[10] = src[18];
    dst[12] = src[24];
    dst[14] = src[26];
    dst[16] = src[4];
    dst[18] = src[6];
    dst[20] = src[12];
    dst[22] = src[14];
    dst[24] = src[20];
    dst[26] = src[22];
    dst[28] = src[28];
    dst[30] = src[30];
}

void __fastcall WriteCLUT_T32_I4_CSM1_c(u32* vm, u32* clut)
{
    u64* src = (u64*)vm;
    u64* dst = (u64*)clut;

    dst[0] = src[0];
    dst[1] = src[2];
    dst[2] = src[4];
    dst[3] = src[6];
    dst[4] = src[1];
    dst[5] = src[3];
    dst[6] = src[5];
    dst[7] = src[7];
}

#endif

void SSE2_UnswizzleZ16Target(u16* dst, u16* src, int iters)
{

#if defined(_MSC_VER)
    __asm
    {
        mov edx, iters
        pxor xmm7, xmm7
        mov eax, dst
        mov ecx, src

Z16Loop:
        // unpack 64 bytes at a time
        movdqa xmm0, [ecx]
        movdqa xmm2, [ecx+16]
        movdqa xmm4, [ecx+32]
        movdqa xmm6, [ecx+48]

        movdqa xmm1, xmm0
        movdqa xmm3, xmm2
        movdqa xmm5, xmm4

        punpcklwd xmm0, xmm7
        punpckhwd xmm1, xmm7
        punpcklwd xmm2, xmm7
        punpckhwd xmm3, xmm7

        // start saving
        movdqa [eax], xmm0
        movdqa [eax+16], xmm1

        punpcklwd xmm4, xmm7
        punpckhwd xmm5, xmm7

        movdqa [eax+32], xmm2
        movdqa [eax+48], xmm3

        movdqa xmm0, xmm6
        punpcklwd xmm6, xmm7

        movdqa [eax+64], xmm4
        movdqa [eax+80], xmm5

        punpckhwd xmm0, xmm7

        movdqa [eax+96], xmm6
        movdqa [eax+112], xmm0

        add ecx, 64
        add eax, 128
        sub edx, 1
        jne Z16Loop
    }
#else // _MSC_VER

    __asm__ __volatile__(".intel_syntax\n"
        "pxor %%xmm7, %%xmm7\n"

        "Z16Loop:\n"
        // unpack 64 bytes at a time
        "movdqa %%xmm0, [%[src]]\n"
        "movdqa %%xmm2, [%[src]+16]\n"
        "movdqa %%xmm4, [%[src]+32]\n"
        "movdqa %%xmm6, [%[src]+48]\n"

        "movdqa %%xmm1, %%xmm0\n"
        "movdqa %%xmm3, %%xmm2\n"
        "movdqa %%xmm5, %%xmm4\n"

        "punpcklwd %%xmm0, %%xmm7\n"
        "punpckhwd %%xmm1, %%xmm7\n"
        "punpcklwd %%xmm2, %%xmm7\n"
        "punpckhwd %%xmm3, %%xmm7\n"

        // start saving
        "movdqa [%[dst]], %%xmm0\n"
        "movdqa [%[dst]+16], %%xmm1\n"

        "punpcklwd %%xmm4, %%xmm7\n"
        "punpckhwd %%xmm5, %%xmm7\n"

        "movdqa [%[dst]+32], %%xmm2\n"
        "movdqa [%[dst]+48], %%xmm3\n"

        "movdqa %%xmm0, %%xmm6\n"
        "punpcklwd %%xmm6, %%xmm7\n"

        "movdqa [%[dst]+64], %%xmm4\n"
        "movdqa [%[dst]+80], %%xmm5\n"

        "punpckhwd %%xmm0, %%xmm7\n"

        "movdqa [%[dst]+96], %%xmm6\n"
        "movdqa [%[dst]+112], %%xmm0\n"

        "add %[src], 64\n"
        "add %[dst], 128\n"
        "sub %[iters], 1\n"
        "jne Z16Loop\n"

        ".att_syntax\n"
        : "=&r"(src), "=&r"(dst), "=&r"(iters)
        : [src] "0"(src), [dst] "1"(dst), [iters] "2"(iters)
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
#endif // _MSC_VER
}
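
#if 0
// Reference only: a minimal scalar sketch of SSE2_UnswizzleZ16Target above. Each
// 16-bit Z value is zero-extended to 32 bits, 32 values (64 source bytes ->
// 128 destination bytes) per iteration; the destination is written as 32-bit words.
static void UnswizzleZ16Target_scalar_sketch(u16* dst, const u16* src, int iters)
{
    u32* dst32 = (u32*)dst;

    for (int i = 0; i < iters * 32; ++i)
        dst32[i] = (u32)src[i];
}
#endif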
