/[pcsx2_0.9.7]/trunk/pcsx2/x86/microVU_Lower.inl
ViewVC logotype

Contents of /trunk/pcsx2/x86/microVU_Lower.inl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 280 - (show annotations) (download)
Thu Dec 23 12:02:12 2010 UTC (9 years, 2 months ago) by william
File size: 40156 byte(s)
re-commit (had local access denied errors when committing)
1 /* PCSX2 - PS2 Emulator for PCs
2 * Copyright (C) 2002-2010 PCSX2 Dev Team
3 *
4 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
5 * of the GNU Lesser General Public License as published by the Free Software Found-
6 * ation, either version 3 of the License, or (at your option) any later version.
7 *
8 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
10 * PURPOSE. See the GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License along with PCSX2.
13 * If not, see <http://www.gnu.org/licenses/>.
14 */
15
16 #pragma once
17
18 //------------------------------------------------------------------
19 // Micro VU Micromode Lower instructions
20 //------------------------------------------------------------------
21
22 //------------------------------------------------------------------
23 // DIV/SQRT/RSQRT
24 //------------------------------------------------------------------
25
26 // Test if Vector is +/- Zero
// Test if the scalar (lowest) element of xmmReg is +/- zero.
// Result is left in the x86 ZF flag, and both code paths agree on its
// meaning: ZF is SET when the value is NOT zero, so callers follow with
// xForwardJZ8 to skip the "is zero" handling.
// Clobbers xmmTemp; gprTemp is clobbered only on the non-SSE4 path.
static __fi void testZero(const xmm& xmmReg, const xmm& xmmTemp, const x32& gprTemp)
{
	xXOR.PS(xmmTemp, xmmTemp);     // xmmTemp = 0
	xCMPEQ.SS(xmmTemp, xmmReg);    // lane0 = all-ones if xmmReg.x == +/-0
	if (!x86caps.hasStreamingSIMD4Extensions) {
		xMOVMSKPS(gprTemp, xmmTemp);
		xTEST(gprTemp, 1);         // ZF = !(sign mask & 1)
	}
	else xPTEST(xmmTemp, xmmTemp); // ZF = (compare mask == 0)
}
37
38 // Test if Vector is Negative (Set Flags and Makes Positive)
39 static __fi void testNeg(mV, const xmm& xmmReg, const x32& gprTemp)
40 {
41 xMOVMSKPS(gprTemp, xmmReg);
42 xTEST(gprTemp, 1);
43 xForwardJZ8 skip;
44 xMOV(ptr32[&mVU->divFlag], divI);
45 xAND.PS(xmmReg, ptr128[mVUglob.absclip]);
46 skip.SetTarget();
47 }
48
// DIV Q, Fs[fsf], Ft[ftf] -- Q = Fs / Ft with VU divide-unit semantics:
//   0/0  -> invalid (I) flag set, result forced to +/- fmax
//   x/0  -> divide  (D) flag set, result forced to +/- fmax
//   else -> I/D flags cleared, clamped SSE division result
// The sign of the forced +/-fmax result is sign(Fs) XOR sign(Ft).
mVUop(mVU_DIV) {
	pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 7); }
	pass2 {
		xmm Ft;
		if (_Ftf_) Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));
		else Ft = mVU->regAlloc->allocReg(_Ft_);
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		const xmm& t1 = mVU->regAlloc->allocReg();

		testZero(Ft, t1, gprT1); // Test if Ft is zero
		xForwardJZ8 cjmp;        // Skip if not zero

		testZero(Fs, t1, gprT1); // Test if Fs is zero
		xForwardJZ8 ajmp;
		xMOV(ptr32[&mVU->divFlag], divI); // Set invalid flag (0/0)
		xForwardJump8 bjmp;
		ajmp.SetTarget();
		xMOV(ptr32[&mVU->divFlag], divD); // Zero divide (only when not 0/0)
		bjmp.SetTarget();

		// Divide-by-zero result: +/-fmax with combined sign of operands
		xXOR.PS(Fs, Ft);
		xAND.PS(Fs, ptr128[mVUglob.signbit]);
		xOR.PS (Fs, ptr128[mVUglob.maxvals]); // If division by zero, then xmmFs = +/- fmax

		xForwardJump8 djmp;
		cjmp.SetTarget();
		xMOV(ptr32[&mVU->divFlag], 0); // Clear I/D flags
		SSE_DIVSS(mVU, Fs, Ft);
		mVUclamp1(Fs, t1, 8, 1);       // clamp result to valid VU float range
		djmp.SetTarget();

		writeQreg(Fs, mVUinfo.writeQ); // commit result to the Q register instance

		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(Ft);
		mVU->regAlloc->clearNeeded(t1);
	}
	pass3 { mVUlog("DIV Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); }
}
88
// SQRT Q, Ft[ftf] -- Q = sqrt(|Ft|); a negative input sets the invalid (I)
// flag (via testNeg) but the sqrt is still taken on the absolute value.
mVUop(mVU_SQRT) {
	pass1 { mVUanalyzeFDIV(mVU, 0, 0, _Ft_, _Ftf_, 7); }
	pass2 {
		const xmm& Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));

		xMOV(ptr32[&mVU->divFlag], 0); // Clear I/D flags
		testNeg(mVU, Ft, gprT1);       // Check for negative sqrt (sets I flag, abs)

		if (CHECK_VU_OVERFLOW) xMIN.SS(Ft, ptr32[mVUglob.maxvals]); // Clamp infinities (only need to do positive clamp since xmmFt is positive)
		xSQRT.SS(Ft, Ft);
		writeQreg(Ft, mVUinfo.writeQ);

		mVU->regAlloc->clearNeeded(Ft);
	}
	pass3 { mVUlog("SQRT Q, vf%02d%s", _Ft_, _Ftf_String); }
}
105
// RSQRT Q, Fs[fsf], Ft[ftf] -- Q = Fs / sqrt(|Ft|):
//   negative Ft        -> invalid (I) flag (via testNeg), abs taken first
//   Fs/0 after sqrt    -> divide (D) flag, result forced to +/- fmax
//   0/0 after sqrt     -> invalid (I) flag, result forced to +/- fmax
// Note: the zero tests run on sqrt(Ft), so the +/-fmax sign comes from Fs alone.
mVUop(mVU_RSQRT) {
	pass1 { mVUanalyzeFDIV(mVU, _Fs_, _Fsf_, _Ft_, _Ftf_, 13); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		const xmm& Ft = mVU->regAlloc->allocReg(_Ft_, 0, (1 << (3 - _Ftf_)));
		const xmm& t1 = mVU->regAlloc->allocReg();

		xMOV(ptr32[&mVU->divFlag], 0); // Clear I/D flags
		testNeg(mVU, Ft, gprT1);       // Check for negative sqrt

		xSQRT.SS(Ft, Ft);
		testZero(Ft, t1, gprT1); // Test if Ft is zero
		xForwardJZ8 ajmp;        // Skip if not zero

		testZero(Fs, t1, gprT1); // Test if Fs is zero
		xForwardJZ8 bjmp;        // Skip if none are
		xMOV(ptr32[&mVU->divFlag], divI); // Set invalid flag (0/0)
		xForwardJump8 cjmp;
		bjmp.SetTarget();
		xMOV(ptr32[&mVU->divFlag], divD); // Zero divide flag (only when not 0/0)
		cjmp.SetTarget();

		xAND.PS(Fs, ptr128[mVUglob.signbit]);
		xOR.PS (Fs, ptr128[mVUglob.maxvals]); // xmmFs = +/-Max

		xForwardJump8 djmp;
		ajmp.SetTarget();
		SSE_DIVSS(mVU, Fs, Ft);
		mVUclamp1(Fs, t1, 8, 1); // clamp result to valid VU float range
		djmp.SetTarget();

		writeQreg(Fs, mVUinfo.writeQ);

		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(Ft);
		mVU->regAlloc->clearNeeded(t1);
	}
	pass3 { mVUlog("RSQRT Q, vf%02d%s, vf%02d%s", _Fs_, _Fsf_String, _Ft_, _Ftf_String); }
}
145
146 //------------------------------------------------------------------
147 // EATAN/EEXP/ELENG/ERCPR/ERLENG/ERSADD/ERSQRT/ESADD/ESIN/ESQRT/ESUM
148 //------------------------------------------------------------------
149
// One term of the EATAN polynomial series:
//   t2 *= Fs^2;  PQ += t2 * (*addr)
// (t2 accumulates successive odd powers of Fs; addr is the series coefficient.)
// Wrapped in do { } while (0) so the macro expands safely as a single
// statement inside if/else bodies; the argument is parenthesized to avoid
// precedence surprises on expansion.
#define EATANhelper(addr) do { \
	SSE_MULSS(mVU, t2, Fs);            \
	SSE_MULSS(mVU, t2, Fs);            \
	xMOVAPS  (t1, t2);                 \
	xMUL.SS  (t1, ptr32[(addr)]);      \
	SSE_ADDSS(mVU, PQ, t1);            \
} while (0)
157
// ToDo: Can Be Optimized Further? (takes approximately (~115 cycles + mem access time) on a c2d)
// Shared tail of the EATAN family: evaluates the arctangent polynomial
// (coefficients T1..T8) on the pre-divided argument in Fs, adds Pi/4, and
// flips xmmPQ back so the result lands in the correct P instance.
// Clobbers t1 and t2 as scratch.
static __fi void mVU_EATAN_(mV, const xmm& PQ, const xmm& Fs, const xmm& t1, const xmm& t2) {
	xMOVSS(PQ, Fs);
	xMUL.SS(PQ, ptr32[mVUglob.T1]); // PQ = Fs * T1 (first series term)
	xMOVAPS(t2, Fs);                // t2 accumulates odd powers of Fs
	EATANhelper(mVUglob.T2);
	EATANhelper(mVUglob.T3);
	EATANhelper(mVUglob.T4);
	EATANhelper(mVUglob.T5);
	EATANhelper(mVUglob.T6);
	EATANhelper(mVUglob.T7);
	EATANhelper(mVUglob.T8);
	xADD.SS(PQ, ptr32[mVUglob.Pi4]);
	xPSHUF.D(PQ, PQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back to the valid P instance
}
173
// EATAN P, Fs[fsf] -- arctangent approximation of Fs.
// Computes the series argument (Fs - 1) / (Fs + 1) in Fs, then defers to
// the shared polynomial tail mVU_EATAN_ (which also flips xmmPQ back).
mVUop(mVU_EATAN) {
	pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 54); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		const xmm& t1 = mVU->regAlloc->allocReg();
		const xmm& t2 = mVU->regAlloc->allocReg();
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS (xmmPQ, Fs);
		xSUB.SS(Fs, ptr32[mVUglob.one]);    // Fs - 1
		xADD.SS(xmmPQ, ptr32[mVUglob.one]); // Fs + 1
		SSE_DIVSS(mVU, Fs, xmmPQ);          // (Fs-1)/(Fs+1)
		mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
		mVU->regAlloc->clearNeeded(t2);
	}
	pass3 { mVUlog("EATAN P"); }
}
192
// EATANxy P, Fs -- arctangent of Fs.y / Fs.x (two-argument form).
// Builds the series argument (y - x) / (y + x) from the x/y lanes, then
// defers to the shared polynomial tail mVU_EATAN_.
mVUop(mVU_EATANxy) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); }
	pass2 {
		const xmm& t1 = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
		const xmm& Fs = mVU->regAlloc->allocReg();
		const xmm& t2 = mVU->regAlloc->allocReg();
		xPSHUF.D(Fs, t1, 0x01);                               // Fs.x = source y lane
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS (xmmPQ, Fs);       // PQ.x = y
		SSE_SUBSS (mVU, Fs, t1);  // y-x, not y-1? ><
		SSE_ADDSS (mVU, t1, xmmPQ); // x + y
		SSE_DIVSS (mVU, Fs, t1);  // (y-x)/(y+x)
		mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
		mVU->regAlloc->clearNeeded(t2);
	}
	pass3 { mVUlog("EATANxy P"); }
}
212
// EATANxz P, Fs -- arctangent of Fs.z / Fs.x; identical to EATANxy but the
// second operand comes from the z lane (shuffle 0x02 instead of 0x01).
mVUop(mVU_EATANxz) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 54); }
	pass2 {
		const xmm& t1 = mVU->regAlloc->allocReg(_Fs_, 0, 0xf);
		const xmm& Fs = mVU->regAlloc->allocReg();
		const xmm& t2 = mVU->regAlloc->allocReg();
		xPSHUF.D(Fs, t1, 0x02);                               // Fs.x = source z lane
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS (xmmPQ, Fs);
		SSE_SUBSS (mVU, Fs, t1);    // z - x
		SSE_ADDSS (mVU, t1, xmmPQ); // x + z
		SSE_DIVSS (mVU, Fs, t1);    // (z-x)/(z+x)
		mVU_EATAN_(mVU, xmmPQ, Fs, t1, t2);
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
		mVU->regAlloc->clearNeeded(t2);
	}
	pass3 { mVUlog("EATANxz P"); }
}
232
// One term of the EEXP polynomial series:
//   t2 *= Fs;  xmmPQ += t2 * (*addr)
// (t2 accumulates successive powers of Fs; addr is the series coefficient.)
// Wrapped in do { } while (0) so the macro expands safely as a single
// statement inside if/else bodies; the argument is parenthesized to avoid
// precedence surprises on expansion.
#define eexpHelper(addr) do { \
	SSE_MULSS(mVU, t2, Fs);          \
	xMOVAPS  (t1, t2);               \
	xMUL.SS  (t1, ptr32[(addr)]);    \
	SSE_ADDSS(mVU, xmmPQ, t1);       \
} while (0)
239
// EEXP P, Fs[fsf] -- exponential approximation e^(-Fs):
// evaluates the polynomial 1 + E1*x + E2*x^2 + ... + E6*x^6, raises it to
// the 4th power (two self-multiplies), then takes the reciprocal.
mVUop(mVU_EEXP) {
	pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 44); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		const xmm& t1 = mVU->regAlloc->allocReg();
		const xmm& t2 = mVU->regAlloc->allocReg();
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS  (xmmPQ, Fs);
		xMUL.SS (xmmPQ, ptr32[mVUglob.E1]);
		xADD.SS (xmmPQ, ptr32[mVUglob.one]); // PQ = 1 + E1*x
		xMOVAPS (t1, Fs);
		SSE_MULSS(mVU, t1, Fs);              // t1 = x^2
		xMOVAPS (t2, t1);                    // t2 tracks powers of x for eexpHelper
		xMUL.SS (t1, ptr32[mVUglob.E2]);
		SSE_ADDSS(mVU, xmmPQ, t1);           // += E2*x^2
		eexpHelper(&mVUglob.E3);
		eexpHelper(&mVUglob.E4);
		eexpHelper(&mVUglob.E5);
		SSE_MULSS(mVU, t2, Fs);
		xMUL.SS (t2, ptr32[mVUglob.E6]);
		SSE_ADDSS(mVU, xmmPQ, t2);           // += E6*x^6
		SSE_MULSS(mVU, xmmPQ, xmmPQ);        // PQ = PQ^2
		SSE_MULSS(mVU, xmmPQ, xmmPQ);        // PQ = PQ^4
		xMOVSSZX (t2, ptr32[mVUglob.one]);
		SSE_DIVSS(mVU, t2, xmmPQ);           // 1 / PQ^4
		xMOVSS  (xmmPQ, t2);
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
		mVU->regAlloc->clearNeeded(t2);
	}
	pass3 { mVUlog("EEXP P"); }
}
273
// sumXYZ(): PQ.x = x ^ 2 + y ^ 2 + z ^ 2
// Uses SSE4 DPPS (mask 0x71: multiply x/y/z, store into lane0) when
// available; otherwise squares all lanes and folds x+y+z via shuffles.
// Note: Fs is clobbered on both paths.
static __fi void mVU_sumXYZ(mV, const xmm& PQ, const xmm& Fs) {
	if( x86caps.hasStreamingSIMD4Extensions ) {
		xDP.PS(Fs, Fs, 0x71); // dot(xyz, xyz) -> Fs.x
		xMOVSS(PQ, Fs);
	}
	else {
		SSE_MULPS(mVU, Fs, Fs);  // wzyx ^ 2
		xMOVSS   (PQ, Fs);       // x ^ 2
		xPSHUF.D (Fs, Fs, 0xe1); // wzyx -> wzxy
		SSE_ADDSS(mVU, PQ, Fs);  // x ^ 2 + y ^ 2
		xPSHUF.D (Fs, Fs, 0xD2); // wzxy -> wxyz
		SSE_ADDSS(mVU, PQ, Fs);  // x ^ 2 + y ^ 2 + z ^ 2
	}
}
289
// ELENG P, Fs -- vector length: P = sqrt(x^2 + y^2 + z^2).
mVUop(mVU_ELENG) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		mVU_sumXYZ(mVU, xmmPQ, Fs);
		xSQRT.SS (xmmPQ, xmmPQ);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ELENG P"); }
}
302
// ERCPR P, Fs[fsf] -- reciprocal: P = 1 / Fs[fsf].
mVUop(mVU_ERCPR) {
	pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS   (xmmPQ, Fs);
		xMOVSSZX (Fs, ptr32[mVUglob.one]);
		SSE_DIVSS(mVU, Fs, xmmPQ); // 1 / Fs
		xMOVSS   (xmmPQ, Fs);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ERCPR P"); }
}
317
// ERLENG P, Fs -- reciprocal length: P = 1 / sqrt(x^2 + y^2 + z^2).
mVUop(mVU_ERLENG) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 24); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		mVU_sumXYZ(mVU, xmmPQ, Fs);
		xSQRT.SS  (xmmPQ, xmmPQ);
		xMOVSSZX  (Fs, ptr32[mVUglob.one]);
		SSE_DIVSS (mVU, Fs, xmmPQ); // 1 / length
		xMOVSS    (xmmPQ, Fs);
		xPSHUF.D  (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ERLENG P"); }
}
333
// ERSADD P, Fs -- reciprocal sum of squares: P = 1 / (x^2 + y^2 + z^2).
mVUop(mVU_ERSADD) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 18); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		mVU_sumXYZ(mVU, xmmPQ, Fs);
		xMOVSSZX  (Fs, ptr32[mVUglob.one]);
		SSE_DIVSS (mVU, Fs, xmmPQ); // 1 / sum
		xMOVSS    (xmmPQ, Fs);
		xPSHUF.D  (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ERSADD P"); }
}
348
// ERSQRT P, Fs[fsf] -- reciprocal square root: P = 1 / sqrt(|Fs[fsf]|).
// The AND with absclip takes the absolute value before the sqrt.
mVUop(mVU_ERSQRT) {
	pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 18); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xAND.PS  (Fs, ptr128[mVUglob.absclip]); // |Fs|
		xSQRT.SS (xmmPQ, Fs);
		xMOVSSZX (Fs, ptr32[mVUglob.one]);
		SSE_DIVSS(mVU, Fs, xmmPQ); // 1 / sqrt
		xMOVSS   (xmmPQ, Fs);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ERSQRT P"); }
}
364
// ESADD P, Fs -- sum of squares: P = x^2 + y^2 + z^2.
mVUop(mVU_ESADD) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 11); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		mVU_sumXYZ(mVU, xmmPQ, Fs);
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ESADD P"); }
}
376
// One term of the ESIN polynomial series:
//   t2 *= t1;  xmmPQ += t2 * (*addr)
// (t1 holds x^2-ish powers, t2 accumulates the running odd power; addr is
// the series coefficient. Fs is reused as scratch.)
// Wrapped in do { } while (0) so the macro expands safely as a single
// statement inside if/else bodies; the argument is parenthesized to avoid
// precedence surprises on expansion.
#define esinHelper(addr) do { \
	SSE_MULSS(mVU, t2, t1);          \
	xMOVAPS  (Fs, t2);               \
	xMUL.SS  (Fs, ptr32[(addr)]);    \
	SSE_ADDSS(mVU, xmmPQ, Fs);       \
} while (0)
383
// ESIN P, Fs -- sine approximation via Taylor-style series with
// coefficients S2..S5 (the leading x term is Fs itself, moved into PQ first).
// NOTE(review): pass2 reads _Fsf_ for the allocation mask, but pass1 uses
// mVUanalyzeEFU2 (the no-fsf analyzer used by the vector EFU ops) -- the
// other fsf-based ops use mVUanalyzeEFU1 here. Looks inconsistent; confirm
// against the analyzer implementations before relying on it.
mVUop(mVU_ESIN) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 29); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		const xmm& t1 = mVU->regAlloc->allocReg();
		const xmm& t2 = mVU->regAlloc->allocReg();
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xMOVSS   (xmmPQ, Fs);        // PQ = x (first series term)
		xMOVAPS  (t1, Fs);
		SSE_MULSS(mVU, Fs, t1);      // Fs = x^2
		xMOVAPS  (t2, Fs);
		SSE_MULSS(mVU, Fs, t1);      // Fs = x^3
		xMOVAPS  (t1, Fs);
		xMUL.SS  (Fs, ptr32[mVUglob.S2]);
		SSE_ADDSS(mVU, xmmPQ, Fs);   // += S2 * x^3
		esinHelper(mVUglob.S3);
		esinHelper(mVUglob.S4);
		SSE_MULSS(mVU, t2, t1);
		xMUL.SS  (t2, ptr32[mVUglob.S5]);
		SSE_ADDSS(mVU, xmmPQ, t2);   // += S5 term
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
		mVU->regAlloc->clearNeeded(t2);
	}
	pass3 { mVUlog("ESIN P"); }
}
411
// ESQRT P, Fs[fsf] -- square root: P = sqrt(|Fs[fsf]|).
mVUop(mVU_ESQRT) {
	pass1 { mVUanalyzeEFU1(mVU, _Fs_, _Fsf_, 12); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xAND.PS (Fs, ptr128[mVUglob.absclip]); // |Fs| -- sqrt input must be non-negative
		xSQRT.SS(xmmPQ, Fs);
		xPSHUF.D(xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("ESQRT P"); }
}
424
// ESUM P, Fs -- horizontal sum of all four lanes: P = x + y + z + w.
// Implemented as a shuffle/add reduction: (wzyx + xyzw) folds pairs,
// then one more shuffle/add folds the remaining two partial sums.
mVUop(mVU_ESUM) {
	pass1 { mVUanalyzeEFU2(mVU, _Fs_, 12); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		const xmm& t1 = mVU->regAlloc->allocReg();
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip xmmPQ to get Valid P instance
		xPSHUF.D (t1, Fs, 0x1b); // reverse lanes
		SSE_ADDPS(mVU, Fs, t1);  // pairwise partial sums
		xPSHUF.D (t1, Fs, 0x01);
		SSE_ADDSS(mVU, Fs, t1);  // final fold into lane0
		xMOVSS   (xmmPQ, Fs);
		xPSHUF.D (xmmPQ, xmmPQ, mVUinfo.writeP ? 0x27 : 0xC6); // Flip back
		mVU->regAlloc->clearNeeded(Fs);
		mVU->regAlloc->clearNeeded(t1);
	}
	pass3 { mVUlog("ESUM P"); }
}
442
443 //------------------------------------------------------------------
444 // FCAND/FCEQ/FCGET/FCOR/FCSET
445 //------------------------------------------------------------------
446
// FCAND vi01, Imm24 -- vi01 = (clipFlag & Imm24) ? 1 : 0.
// The ADD 0xffffff / SHR 24 trick turns "any bit set" into a 0/1 result
// without a branch: any nonzero AND result carries into bit 24.
mVUop(mVU_FCAND) {
	pass1 { mVUanalyzeCflag(mVU, 1); }
	pass2 {
		mVUallocCFLAGa(mVU, gprT1, cFLAG.read);
		xAND(gprT1, _Imm24_);
		xADD(gprT1, 0xffffff);
		xSHR(gprT1, 24);
		mVUallocVIb(mVU, gprT1, 1); // result always goes to vi01
	}
	pass3 { mVUlog("FCAND vi01, $%x", _Imm24_); }
	pass4 { mVUregs.needExactMatch |= 4; } // requires exact clip-flag instance
}
459
// FCEQ vi01, Imm24 -- vi01 = (clipFlag == Imm24) ? 1 : 0.
// XOR yields 0 only on equality; SUB 1 then borrows into the sign bit,
// which SHR 31 extracts as the 0/1 result.
mVUop(mVU_FCEQ) {
	pass1 { mVUanalyzeCflag(mVU, 1); }
	pass2 {
		mVUallocCFLAGa(mVU, gprT1, cFLAG.read);
		xXOR(gprT1, _Imm24_);
		xSUB(gprT1, 1);
		xSHR(gprT1, 31);
		mVUallocVIb(mVU, gprT1, 1); // result always goes to vi01
	}
	pass3 { mVUlog("FCEQ vi01, $%x", _Imm24_); }
	pass4 { mVUregs.needExactMatch |= 4; } // requires exact clip-flag instance
}
472
// FCGET vit -- vit = clipFlag & 0xfff (the 12 most recent clip judgment bits).
mVUop(mVU_FCGET) {
	pass1 { mVUanalyzeCflag(mVU, _It_); }
	pass2 {
		mVUallocCFLAGa(mVU, gprT1, cFLAG.read);
		xAND(gprT1, 0xfff);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FCGET vi%02d", _Ft_); }
	pass4 { mVUregs.needExactMatch |= 4; } // requires exact clip-flag instance
}
483
// FCOR vi01, Imm24 -- vi01 = ((clipFlag | Imm24) == 0xffffff) ? 1 : 0.
mVUop(mVU_FCOR) {
	pass1 { mVUanalyzeCflag(mVU, 1); }
	pass2 {
		mVUallocCFLAGa(mVU, gprT1, cFLAG.read);
		xOR(gprT1, _Imm24_);
		xADD(gprT1, 1);  // If 24 1's will make 25th bit 1, else 0
		xSHR(gprT1, 24); // Get the 25th bit (also clears the rest of the garbage in the reg)
		mVUallocVIb(mVU, gprT1, 1); // result always goes to vi01
	}
	pass3 { mVUlog("FCOR vi01, $%x", _Imm24_); }
	pass4 { mVUregs.needExactMatch |= 4; } // requires exact clip-flag instance
}
496
// FCSET Imm24 -- load the clip flag directly from a 24-bit immediate.
mVUop(mVU_FCSET) {
	pass1 { cFLAG.doFlag = 1; }
	pass2 {
		xMOV(gprT1, _Imm24_);
		mVUallocCFLAGb(mVU, gprT1, cFLAG.write);
	}
	pass3 { mVUlog("FCSET $%x", _Imm24_); }
}
505
506 //------------------------------------------------------------------
507 // FMAND/FMEQ/FMOR
508 //------------------------------------------------------------------
509
// FMAND vit, vis -- vit = macFlag & vis (16-bit AND via the b-suffix regs).
mVUop(mVU_FMAND) {
	pass1 { mVUanalyzeMflag(mVU, _Is_, _It_); }
	pass2 {
		mVUallocMFLAGa(mVU, gprT1, mFLAG.read);
		mVUallocVIa(mVU, gprT2, _Is_);
		xAND(gprT1b, gprT2b); // 16-bit op: VI regs are 16-bit
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FMAND vi%02d, vi%02d", _Ft_, _Fs_); }
	pass4 { mVUregs.needExactMatch |= 2; } // requires exact mac-flag instance
}
521
// FMEQ vit, vis -- vit = (macFlag == vis) ? 1 : 0.
// Same XOR / SUB 1 / SHR 31 equality trick as FCEQ.
mVUop(mVU_FMEQ) {
	pass1 { mVUanalyzeMflag(mVU, _Is_, _It_); }
	pass2 {
		mVUallocMFLAGa(mVU, gprT1, mFLAG.read);
		mVUallocVIa(mVU, gprT2, _Is_);
		xXOR(gprT1, gprT2);
		xSUB(gprT1, 1);
		xSHR(gprT1, 31);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FMEQ vi%02d, vi%02d", _Ft_, _Fs_); }
	pass4 { mVUregs.needExactMatch |= 2; } // requires exact mac-flag instance
}
535
// FMOR vit, vis -- vit = macFlag | vis (16-bit OR via the b-suffix regs).
mVUop(mVU_FMOR) {
	pass1 { mVUanalyzeMflag(mVU, _Is_, _It_); }
	pass2 {
		mVUallocMFLAGa(mVU, gprT1, mFLAG.read);
		mVUallocVIa(mVU, gprT2, _Is_);
		xOR(gprT1b, gprT2b); // 16-bit op: VI regs are 16-bit
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FMOR vi%02d, vi%02d", _Ft_, _Fs_); }
	pass4 { mVUregs.needExactMatch |= 2; } // requires exact mac-flag instance
}
547
548 //------------------------------------------------------------------
549 // FSAND/FSEQ/FSOR/FSSET
550 //------------------------------------------------------------------
551
// FSAND vit, Imm12 -- vit = statusFlag & Imm12.
mVUop(mVU_FSAND) {
	pass1 { mVUanalyzeSflag(mVU, _It_); }
	pass2 {
		mVUallocSFLAGc(gprT1, gprT2, sFLAG.read);
		xAND(gprT1, _Imm12_);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FSAND vi%02d, $%x", _Ft_, _Imm12_); }
	pass4 { mVUregs.needExactMatch |= 1; } // requires exact status-flag instance
}
562
// FSOR vit, Imm12 -- vit = statusFlag | Imm12.
mVUop(mVU_FSOR) {
	pass1 { mVUanalyzeSflag(mVU, _It_); }
	pass2 {
		mVUallocSFLAGc(gprT1, gprT2, sFLAG.read);
		xOR(gprT1, _Imm12_);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FSOR vi%02d, $%x", _Ft_, _Imm12_); }
	pass4 { mVUregs.needExactMatch |= 1; } // requires exact status-flag instance
}
573
// FSEQ vit, Imm12 -- vit = (statusFlag == Imm12) ? 1 : 0.
// The 12-bit architectural immediate is first expanded into the recompiler's
// internal status-flag layout (each architectural bit maps to the nibble /
// bit positions listed per-line below), the live flag value is normalized by
// setBitFSEQ (presumably collapsing each nibble field to all-set/all-clear so
// the XOR compare is exact -- defined elsewhere; verify), then the usual
// XOR / SUB 1 / SHR 31 equality trick produces the 0/1 result.
mVUop(mVU_FSEQ) {
	pass1 { mVUanalyzeSflag(mVU, _It_); }
	pass2 {
		int imm = 0;
		if (_Imm12_ & 0x0001) imm |= 0x0000f00; // Z
		if (_Imm12_ & 0x0002) imm |= 0x000f000; // S
		if (_Imm12_ & 0x0004) imm |= 0x0010000; // U
		if (_Imm12_ & 0x0008) imm |= 0x0020000; // O
		if (_Imm12_ & 0x0010) imm |= 0x0040000; // I
		if (_Imm12_ & 0x0020) imm |= 0x0080000; // D
		if (_Imm12_ & 0x0040) imm |= 0x000000f; // ZS
		if (_Imm12_ & 0x0080) imm |= 0x00000f0; // SS
		if (_Imm12_ & 0x0100) imm |= 0x0400000; // US
		if (_Imm12_ & 0x0200) imm |= 0x0800000; // OS
		if (_Imm12_ & 0x0400) imm |= 0x1000000; // IS
		if (_Imm12_ & 0x0800) imm |= 0x2000000; // DS

		mVUallocSFLAGa(gprT1, sFLAG.read);
		setBitFSEQ(gprT1, 0x0f00); // Z bit
		setBitFSEQ(gprT1, 0xf000); // S bit
		setBitFSEQ(gprT1, 0x000f); // ZS bit
		setBitFSEQ(gprT1, 0x00f0); // SS bit
		xXOR(gprT1, imm);
		xSUB(gprT1, 1);
		xSHR(gprT1, 31);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("FSEQ vi%02d, $%x", _Ft_, _Imm12_); }
	pass4 { mVUregs.needExactMatch |= 1; } // requires exact status-flag instance
}
604
// FSSET Imm12 -- set the sticky status-flag bits from the immediate.
// Only the sticky bits (ZS/SS/US/OS/IS/DS) of Imm12 are used; they are
// expanded to the recompiler's internal flag layout. Non-sticky bits of the
// previous status value are preserved (the AND mask keeps them).
mVUop(mVU_FSSET) {
	pass1 { mVUanalyzeFSSET(mVU); }
	pass2 {
		int imm = 0;
		if (_Imm12_ & 0x0040) imm |= 0x000000f; // ZS
		if (_Imm12_ & 0x0080) imm |= 0x00000f0; // SS
		if (_Imm12_ & 0x0100) imm |= 0x0400000; // US
		if (_Imm12_ & 0x0200) imm |= 0x0800000; // OS
		if (_Imm12_ & 0x0400) imm |= 0x1000000; // IS
		if (_Imm12_ & 0x0800) imm |= 0x2000000; // DS
		if (!(sFLAG.doFlag || mVUinfo.doDivFlag)) {
			mVUallocSFLAGa(getFlagReg(sFLAG.write), sFLAG.lastWrite); // Get Prev Status Flag
		}
		xAND(getFlagReg(sFLAG.write), 0xfff00); // Keep Non-Sticky Bits
		if (imm) xOR(getFlagReg(sFLAG.write), imm);
	}
	pass3 { mVUlog("FSSET $%x", _Imm12_); }
}
623
624 //------------------------------------------------------------------
625 // IADD/IADDI/IADDIU/IAND/IOR/ISUB/ISUBIU
626 //------------------------------------------------------------------
627
// IADD vid, vis, vit -- 16-bit integer add: vid = vis + vit.
// When It == Is, the register is loaded once and added to itself.
mVUop(mVU_IADD) {
	pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		if (_It_ != _Is_) {
			mVUallocVIa(mVU, gprT2, _It_);
			xADD(gprT1b, gprT2b);
		}
		else xADD(gprT1b, gprT1b); // vis + vis
		mVUallocVIb(mVU, gprT1, _Id_);
	}
	pass3 { mVUlog("IADD vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); }
}
641
// IADDI vit, vis, Imm5 -- vit = vis + signed 5-bit immediate.
mVUop(mVU_IADDI) {
	pass1 { mVUanalyzeIADDI(mVU, _Is_, _It_, _Imm5_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		xADD(gprT1b, _Imm5_);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("IADDI vi%02d, vi%02d, %d", _Ft_, _Fs_, _Imm5_); }
}
651
// IADDIU vit, vis, Imm15 -- vit = vis + 15-bit immediate.
mVUop(mVU_IADDIU) {
	pass1 { mVUanalyzeIADDI(mVU, _Is_, _It_, _Imm15_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		xADD(gprT1b, _Imm15_);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("IADDIU vi%02d, vi%02d, %d", _Ft_, _Fs_, _Imm15_); }
}
661
// IAND vid, vis, vit -- vid = vis & vit.
// When It == Is, (vis & vis) == vis, so no AND is emitted at all.
mVUop(mVU_IAND) {
	pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		if (_It_ != _Is_) {
			mVUallocVIa(mVU, gprT2, _It_);
			xAND(gprT1, gprT2);
		}
		mVUallocVIb(mVU, gprT1, _Id_);
	}
	pass3 { mVUlog("IAND vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); }
}
674
// IOR vid, vis, vit -- vid = vis | vit.
// When It == Is, (vis | vis) == vis, so no OR is emitted at all.
mVUop(mVU_IOR) {
	pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		if (_It_ != _Is_) {
			mVUallocVIa(mVU, gprT2, _It_);
			xOR(gprT1, gprT2);
		}
		mVUallocVIb(mVU, gprT1, _Id_);
	}
	pass3 { mVUlog("IOR vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); }
}
687
// ISUB vid, vis, vit -- vid = vis - vit.
// When It == Is the result is always zero, so just XOR-clear and store.
mVUop(mVU_ISUB) {
	pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); }
	pass2 {
		if (_It_ != _Is_) {
			mVUallocVIa(mVU, gprT1, _Is_);
			mVUallocVIa(mVU, gprT2, _It_);
			xSUB(gprT1b, gprT2b);
			mVUallocVIb(mVU, gprT1, _Id_);
		}
		else {
			xXOR(gprT1, gprT1); // vis - vis == 0
			mVUallocVIb(mVU, gprT1, _Id_);
		}
	}
	pass3 { mVUlog("ISUB vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); }
}
704
// ISUBIU vit, vis, Imm15 -- vit = vis - 15-bit immediate.
mVUop(mVU_ISUBIU) {
	pass1 { mVUanalyzeIALU2(mVU, _Is_, _It_); }
	pass2 {
		mVUallocVIa(mVU, gprT1, _Is_);
		xSUB(gprT1b, _Imm15_);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("ISUBIU vi%02d, vi%02d, %d", _Ft_, _Fs_, _Imm15_); }
}
714
715 //------------------------------------------------------------------
716 // MFIR/MFP/MOVE/MR32/MTIR
717 //------------------------------------------------------------------
718
// MFIR vft, vis -- move integer register to float register: the (sign-extended,
// per the 'true' flag to mVUallocVIa) VI value is placed in the selected
// fields of vft. A NOP if no destination fields are set.
mVUop(mVU_MFIR) {
	pass1 { if (!_Ft_) { mVUlow.isNOP = 1; } analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeReg2(_Ft_, mVUlow.VF_write, 1); }
	pass2 {
		const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
		mVUallocVIa(mVU, gprT1, _Is_, true);
		xMOVDZX(Ft, gprT1); // value lands in lane0, upper lanes zeroed
		if (!_XYZW_SS) { mVUunpack_xyzw(Ft, Ft, 0); } // broadcast to all selected fields
		mVU->regAlloc->clearNeeded(Ft);
	}
	pass3 { mVUlog("MFIR.%s vf%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); }
}
730
// MFP vft, P -- move the EFU P register into the selected fields of vft.
mVUop(mVU_MFP) {
	pass1 { mVUanalyzeMFP(mVU, _Ft_); }
	pass2 {
		const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
		getPreg(mVU, Ft);
		mVU->regAlloc->clearNeeded(Ft);
	}
	pass3 { mVUlog("MFP.%s vf%02d, P", _XYZW_String, _Ft_); }
}
740
// MOVE vft, vfs -- copy selected fields of vfs into vft; the register
// allocator performs the copy itself via the src/dst allocReg form.
mVUop(mVU_MOVE) {
	pass1 { mVUanalyzeMOVE(mVU, _Fs_, _Ft_); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, _Ft_, _X_Y_Z_W);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("MOVE.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); }
}
749
// MR32 vft, vfs -- rotate vfs fields right: vft.xyzw = vfs.yzwx
// (shuffle 0x39). For a single-field write, broadcast the rotated source
// lane directly instead of shuffling the whole vector.
mVUop(mVU_MR32) {
	pass1 { mVUanalyzeMR32(mVU, _Fs_, _Ft_); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_);
		const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
		if (_XYZW_SS) mVUunpack_xyzw(Ft, Fs, (_X ? 1 : (_Y ? 2 : (_Z ? 3 : 0)))); // pick the rotated-in lane
		else xPSHUF.D(Ft, Fs, 0x39); // xyzw -> yzwx
		mVU->regAlloc->clearNeeded(Ft);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("MR32.%s vf%02d, vf%02d", _XYZW_String, _Ft_, _Fs_); }
}
762
// MTIR vit, vfs[fsf] -- move float register field to integer register.
// A NOP if the destination is vi00.
mVUop(mVU_MTIR) {
	pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeReg5(_Fs_, _Fsf_, mVUlow.VF_read[0]); analyzeVIreg2(_It_, mVUlow.VI_write, 1); }
	pass2 {
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
		xMOVD(gprT1, Fs); // raw 32-bit move of the selected field
		mVUallocVIb(mVU, gprT1, _It_);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("MTIR vi%02d, vf%02d%s", _Ft_, _Fs_, _Fsf_String); }
}
773
774 //------------------------------------------------------------------
775 // ILW/ILWR
776 //------------------------------------------------------------------
777
// ILW vit, Imm11(vis) -- integer load word: vit = zero-extended 16-bit read
// from VU data memory at (vis + Imm11), offset to the selected field
// (offsetSS). A NOP if the destination is vi00.
mVUop(mVU_ILW) {
	pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeVIreg2(_It_, mVUlow.VI_write, 4); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem + offsetSS);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			xADD(gprT2, _Imm11_);
			mVUaddrFix (mVU, gprT2); // wrap/scale VU address to host offset
			ptr += gprT2;
		}
		else // vi00 is hardwired zero: address is just the immediate
			ptr += getVUmem(_Imm11_);
		xMOVZX(gprT1, ptr16[ptr]);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("ILW.%s vi%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); }
}
795
// ILWR vit, (vis) -- register-indirect integer load word (no immediate);
// otherwise identical to ILW. A NOP if the destination is vi00.
mVUop(mVU_ILWR) {
	pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeVIreg2(_It_, mVUlow.VI_write, 4); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem + offsetSS);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		// vi00 (zero) leaves ptr at the VU memory base
		xMOVZX(gprT1, ptr16[ptr]);
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("ILWR.%s vi%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); }
}
810
811 //------------------------------------------------------------------
812 // ISW/ISWR
813 //------------------------------------------------------------------
814
// ISW vit, Imm11(vis) -- integer store word: writes vit into each selected
// field (x/y/z/w at +0/+4/+8/+12) of the VU memory quadword at (vis + Imm11).
mVUop(mVU_ISW) {
	pass1 { analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeVIreg1(_It_, mVUlow.VI_read[1]); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			xADD(gprT2, _Imm11_);
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		else // vi00 is hardwired zero: address is just the immediate
			ptr += getVUmem(_Imm11_);
		mVUallocVIa(mVU, gprT1, _It_);
		if (_X) xMOV(ptr32[ptr], gprT1);
		if (_Y) xMOV(ptr32[ptr+4], gprT1);
		if (_Z) xMOV(ptr32[ptr+8], gprT1);
		if (_W) xMOV(ptr32[ptr+12], gprT1);
	}
	pass3 { mVUlog("ISW.%s vi%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); }
}
835
// ISWR vit, (vis) -- register-indirect integer store word (no immediate);
// otherwise identical to ISW.
mVUop(mVU_ISWR) {
	pass1 { analyzeVIreg1(_Is_, mVUlow.VI_read[0]); analyzeVIreg1(_It_, mVUlow.VI_read[1]); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		// vi00 (zero) leaves ptr at the VU memory base
		mVUallocVIa(mVU, gprT1, _It_);
		if (_X) xMOV(ptr32[ptr], gprT1);
		if (_Y) xMOV(ptr32[ptr+4], gprT1);
		if (_Z) xMOV(ptr32[ptr+8], gprT1);
		if (_W) xMOV(ptr32[ptr+12], gprT1);
	}
	pass3 { mVUlog("ISWR.%s vi%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); }
}
853
854 //------------------------------------------------------------------
855 // LQ/LQD/LQI
856 //------------------------------------------------------------------
857
// LQ vft, Imm11(vis) -- load quadword from VU memory at (vis + Imm11) into
// the selected fields of vft.
mVUop(mVU_LQ) {
	pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 0); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			xADD(gprT2, _Imm11_);
			mVUaddrFix(mVU, gprT2);
			ptr += gprT2;
		}
		else // vi00 is hardwired zero: address is just the immediate
			ptr += getVUmem(_Imm11_);
		const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
		mVUloadReg(Ft, ptr, _X_Y_Z_W);
		mVU->regAlloc->clearNeeded(Ft);
	}
	pass3 { mVUlog("LQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); }
}
876
// LQD vft, --(vis) -- pre-decrement load: vis is decremented first (and
// written back), then the quadword at the new address is loaded. The VF
// write is skipped when the analyzer marked it dead (noWriteVF), but the
// vis decrement side effect still happens.
mVUop(mVU_LQD) {
	pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_Is_) {
			mVUallocVIa(mVU, gprT2, _Is_);
			xSUB(gprT2b, 1);
			mVUallocVIb(mVU, gprT2, _Is_); // write back decremented vis
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		// vi00: no decrement possible, address stays at VU memory base
		if (!mVUlow.noWriteVF) {
			const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
			mVUloadReg(Ft, ptr, _X_Y_Z_W);
			mVU->regAlloc->clearNeeded(Ft);
		}
	}
	pass3 { mVUlog("LQD.%s vf%02d, --vi%02d", _XYZW_String, _Ft_, _Is_); }
}
896
// LQI vft, (vis)++ -- post-increment load: the quadword at the ORIGINAL vis
// address is loaded (the pre-increment copy is kept in gprT2), then vis+1 is
// written back. VF write skipped when dead (noWriteVF); increment still happens.
mVUop(mVU_LQI) {
	pass1 { mVUanalyzeLQ(mVU, _Ft_, _Is_, 1); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_Is_) {
			mVUallocVIa(mVU, gprT1, _Is_);
			xMOV(gprT2, gprT1); // keep original address for the load
			xADD(gprT1b, 1);
			mVUallocVIb(mVU, gprT1, _Is_); // write back incremented vis
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		if (!mVUlow.noWriteVF) {
			const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
			mVUloadReg(Ft, ptr, _X_Y_Z_W);
			mVU->regAlloc->clearNeeded(Ft);
		}
	}
	pass3 { mVUlog("LQI.%s vf%02d, vi%02d++", _XYZW_String, _Ft_, _Fs_); }
}
917
918 //------------------------------------------------------------------
919 // SQ/SQD/SQI
920 //------------------------------------------------------------------
921
// SQ vfs, Imm11(vit) -- store the selected fields of vfs to VU memory at
// (vit + Imm11).
mVUop(mVU_SQ) {
	pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 0); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_It_) {
			mVUallocVIa(mVU, gprT2, _It_);
			xADD(gprT2, _Imm11_);
			mVUaddrFix(mVU, gprT2);
			ptr += gprT2;
		}
		else // vi00 is hardwired zero: address is just the immediate
			ptr += getVUmem(_Imm11_);
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		mVUsaveReg(Fs, ptr, _X_Y_Z_W, 1);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("SQ.%s vf%02d, vi%02d + %d", _XYZW_String, _Fs_, _Ft_, _Imm11_); }
}
940
// SQD vfs, --(vit) -- pre-decrement store: vit is decremented first (and
// written back), then vfs is stored at the new address.
mVUop(mVU_SQD) {
	pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_It_) {
			mVUallocVIa(mVU, gprT2, _It_);
			xSUB(gprT2b, 1);
			mVUallocVIb(mVU, gprT2, _It_); // write back decremented vit
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		// vi00: no decrement possible, address stays at VU memory base
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		mVUsaveReg(Fs, ptr, _X_Y_Z_W, 1);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("SQD.%s vf%02d, --vi%02d", _XYZW_String, _Fs_, _Ft_); }
}
958
// SQI vfs, (vit)++ -- post-increment store: vfs is stored at the ORIGINAL
// vit address (kept in gprT2), then vit+1 is written back.
mVUop(mVU_SQI) {
	pass1 { mVUanalyzeSQ(mVU, _Fs_, _It_, 1); }
	pass2 {
		xAddressVoid ptr(mVU->regs().Mem);
		if (_It_) {
			mVUallocVIa(mVU, gprT1, _It_);
			xMOV(gprT2, gprT1); // keep original address for the store
			xADD(gprT1b, 1);
			mVUallocVIb(mVU, gprT1, _It_); // write back incremented vit
			mVUaddrFix (mVU, gprT2);
			ptr += gprT2;
		}
		const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W);
		mVUsaveReg(Fs, ptr, _X_Y_Z_W, 1);
		mVU->regAlloc->clearNeeded(Fs);
	}
	pass3 { mVUlog("SQI.%s vf%02d, vi%02d++", _XYZW_String, _Fs_, _Ft_); }
}
977
978 //------------------------------------------------------------------
979 // RINIT/RGET/RNEXT/RXOR
980 //------------------------------------------------------------------
981
// RINIT R, vfs[fsf] -- seed the R register: R = 0x3f800000 | (Fs mantissa).
// Reading vf00.w (Fs==0, fsf==3) is the one vf00 field that is non-trivial,
// hence the special case; any other vf00 field yields the constant 1.0f seed.
mVUop(mVU_RINIT) {
	pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); }
	pass2 {
		if (_Fs_ || (_Fsf_ == 3)) {
			const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
			xMOVD(gprT1, Fs);
			xAND(gprT1, 0x007fffff); // keep mantissa bits only
			xOR (gprT1, 0x3f800000); // force exponent of 1.0f
			xMOV(ptr32[Rmem], gprT1);
			mVU->regAlloc->clearNeeded(Fs);
		}
		else xMOV(ptr32[Rmem], 0x3f800000); // vf00.x/y/z -> R = 1.0f
	}
	pass3 { mVUlog("RINIT R, vf%02d%s", _Fs_, _Fsf_String); }
}
997
// Helper shared by RGET/RNEXT: broadcast the 32-bit R value (already loaded
// into Rreg) to the selected fields of vf[Ft]. Skipped entirely when the
// analyzer flagged the write as dead (noWriteVF).
static __fi void mVU_RGET_(mV, const x32& Rreg) {
	if (!mVUlow.noWriteVF) {
		const xmm& Ft = mVU->regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W);
		xMOVDZX(Ft, Rreg);                            // Ft.x = R (upper lanes zeroed)
		if (!_XYZW_SS) mVUunpack_xyzw(Ft, Ft, 0);     // splat x to all selected fields
		mVU->regAlloc->clearNeeded(Ft);
	}
}
1006
// RGET: Copy the current R register value into the selected fields of vf[Ft]
// without advancing the pseudo-random sequence.
mVUop(mVU_RGET) {
	pass1 { mVUanalyzeR2(mVU, _Ft_, 1); }
	pass2 { xMOV(gprT1, ptr32[Rmem]); mVU_RGET_(mVU, gprT1); }
	pass3 { mVUlog("RGET.%s vf%02d, R", _XYZW_String, _Ft_); }
}
1012
// RNEXT: Advance R with the VU's 23-bit LFSR step, then write the new value
// to the selected fields of vf[Ft]. The feedback taps are bits 4 and 22 of
// the old value; the mantissa is shifted left by one and the XOR of the taps
// is injected, after which the result is renormalized to [1.0, 2.0).
mVUop(mVU_RNEXT) {
	pass1 { mVUanalyzeR2(mVU, _Ft_, 0); }
	pass2 {
		// algorithm from www.project-fao.org
		xMOV(gprT3, ptr32[Rmem]);  // gprT3 = old R
		xMOV(gprT1, gprT3);
		xSHR(gprT1, 4);
		xAND(gprT1, 1);            // gprT1 = bit 4 (tap 1)

		xMOV(gprT2, gprT3);
		xSHR(gprT2, 22);
		xAND(gprT2, 1);            // gprT2 = bit 22 (tap 2)

		xSHL(gprT3, 1);            // shift sequence left
		xXOR(gprT1, gprT2);        // feedback bit = tap1 ^ tap2
		xXOR(gprT3, gprT1);        // inject feedback into bit 0
		xAND(gprT3, 0x007fffff);   // keep mantissa only
		xOR (gprT3, 0x3f800000);   // force exponent -> value in [1.0, 2.0)
		xMOV(ptr32[Rmem], gprT3);  // store new R
		mVU_RGET_(mVU, gprT3);     // broadcast new R into vf[Ft]
	}
	pass3 { mVUlog("RNEXT.%s vf%02d, R", _XYZW_String, _Ft_); }
}
1036
// RXOR: XOR the mantissa of vf[Fs].fsf into R's mantissa in place.
// When Fs==0 and fsf!=3 the source field is constant zero, so the
// operation is a no-op and no code is emitted. R's exponent bits are
// untouched (only the low 23 bits are XORed), keeping R in [1.0, 2.0).
mVUop(mVU_RXOR) {
	pass1 { mVUanalyzeR1(mVU, _Fs_, _Fsf_); }
	pass2 {
		if (_Fs_ || (_Fsf_ == 3)) {
			const xmm& Fs = mVU->regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_)));
			xMOVD(gprT1, Fs);
			xAND(gprT1, 0x7fffff);     // mantissa of the source field only
			xXOR(ptr32[Rmem], gprT1);  // fold into R in memory
			mVU->regAlloc->clearNeeded(Fs);
		}
	}
	pass3 { mVUlog("RXOR R, vf%02d%s", _Fs_, _Fsf_String); }
}
1050
1051 //------------------------------------------------------------------
1052 // WaitP/WaitQ
1053 //------------------------------------------------------------------
1054
// WAITP: Stall until the EFU P-register result is ready. The stall count is
// p-1 (not p) because the result becomes visible one cycle before the counter
// expires; no code is emitted — the stall is folded into pipeline scheduling.
mVUop(mVU_WAITP) {
	pass1 { mVUstall = aMax(mVUstall, ((mVUregs.p) ? (mVUregs.p - 1) : 0)); }
	pass3 { mVUlog("WAITP"); }
}
1059
// WAITQ: Stall until the FDIV Q-register result is ready. No code is emitted;
// the stall is folded into pipeline scheduling in pass 1.
mVUop(mVU_WAITQ) {
	pass1 { mVUstall = aMax(mVUstall, mVUregs.q); }
	pass3 { mVUlog("WAITQ"); }
}
1064
1065 //------------------------------------------------------------------
1066 // XTOP/XITOP
1067 //------------------------------------------------------------------
1068
// XTOP: Load the VIF TOP register into vi[It]. NOP'd in pass 1 when It is 0
// (writes to vi00 are discarded).
mVUop(mVU_XTOP) {
	pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeVIreg2(_It_, mVUlow.VI_write, 1); }
	pass2 {
		xMOVZX(gprT1, ptr16[&mVU->getVifRegs().top]);  // zero-extend 16-bit TOP
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("XTOP vi%02d", _Ft_); }
}
1077
// XITOP: Load the VIF ITOP register into vi[It], masked to the VU's data
// memory size in quadwords (0x3ff for VU1, 0xff for VU0). NOP'd when It is 0.
mVUop(mVU_XITOP) {
	pass1 { if (!_It_) { mVUlow.isNOP = 1; } analyzeVIreg2(_It_, mVUlow.VI_write, 1); }
	pass2 {
		xMOVZX(gprT1, ptr16[&mVU->getVifRegs().itop]);  // zero-extend 16-bit ITOP
		xAND(gprT1, isVU1 ? 0x3ff : 0xff);              // clamp to VU mem range
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("XITOP vi%02d", _Ft_); }
}
1087
1088 //------------------------------------------------------------------
1089 // XGkick
1090 //------------------------------------------------------------------
1091
1092 extern void gsPath1Interrupt();
1093 extern bool SIGNAL_IMR_Pending;
1094
// Runtime handler for XGKICK: queue a GIF PATH1 transfer starting at the
// given VU1 data-memory quadword address. The packet is parsed/measured with
// GIFPath_ParseTagQuick, copied into Path1Buffer (handling wrap-around at the
// 0x400-qwc end of VU1 memory), and PATH1 processing is kicked via
// gsPath1Interrupt(). The direct-send path is disabled (commented out) in
// favor of always queueing, to avoid losing data when a SIGNAL exits the
// GIF loop early — see the warning below.
void __fastcall mVU_XGKICK_(u32 addr) {
	addr &= 0x3ff;                           // VU1 data mem is 0x400 quadwords
	u8* data = vuRegs[1].Mem + (addr*16);    // byte pointer to start of packet
	u32 diff = 0x400 - addr;                 // qwc until end of VU1 memory
	u32 size;                                // packet size in qwc (from tag parse)

	///////////////////////////////////////////////
	///////////////SIGNAL WARNING!!////////////////
	///////////////////////////////////////////////
	/* Due to the fact SIGNAL can cause the loop
	to leave early, we can end up missing data.
	The only way we can avoid this is to queue
	it :(, im relying on someone else to come
	up with a better solution! */

	/*if(gifRegs.stat.APATH <= GIF_APATH1 || (gifRegs.stat.APATH == GIF_APATH3 && gifRegs.stat.IP3 == true) && SIGNAL_IMR_Pending == false)
	{
		if(Path1WritePos != 0)
		{
			//Flush any pending transfers so things dont go up in the wrong order
			while(gifRegs.stat.P1Q == true) gsPath1Interrupt();
		}
		GetMTGS().PrepDataPacket(GIF_PATH_1, 0x400);
		size = GIFPath_CopyTag(GIF_PATH_1, (u128*)data, diff);
		GetMTGS().SendDataPacket();

		if(GSTransferStatus.PTH1 == STOPPED_MODE)
		{
			gifRegs.stat.OPH = false;
			gifRegs.stat.APATH = GIF_APATH_IDLE;
		}
	}
	else
	{*/
		//DevCon.Warning("GIF APATH busy %x Holding for later W %x, R %x", gifRegs.stat.APATH, Path1WritePos, Path1ReadPos);
		size = GIFPath_ParseTagQuick(GIF_PATH_1, data, diff);
		u8* pDest = &Path1Buffer[Path1WritePos*16];  // Path1WritePos counts qwc

		Path1WritePos += size;

		// NOTE(review): Path1WritePos is a qwc count but sizeof(Path1Buffer) is
		// bytes, so this check looks ~16x too lenient — verify Path1Buffer's
		// declared size/units before relying on it.
		pxAssumeMsg((Path1WritePos < sizeof(Path1Buffer)), "XGKick Buffer Overflow detected on Path1Buffer!");
		//DevCon.Warning("Storing size %x PATH 1", size);

		if (size > diff) {
			// Packet wraps past the end of VU1 memory: copy the tail from the
			// end of memory, then the remainder from the start.
			//DevCon.Status("XGkick Wrap!");
			memcpy_qwc(pDest, vuRegs[1].Mem + (addr*16), diff);
			memcpy_qwc(pDest+(diff*16), vuRegs[1].Mem, size-diff);
		}
		else {
			memcpy_qwc(pDest, vuRegs[1].Mem + (addr*16), size);
		}
		//if(!gifRegs.stat.P1Q) CPU_INT(28, 128);
		gifRegs.stat.P1Q = true;  // mark PATH1 queue non-empty
	//}
	gsPath1Interrupt();           // kick PATH1 processing of the queued data
}
1151
// Emit a call to mVU_XGKICK_ with the kick address taken either from the
// saved VIxgkick slot (memVI, used for delayed kicks) or directly from vi[Is].
// x86 registers are backed up/restored around the C call.
static __fi void mVU_XGKICK_DELAY(mV, bool memVI) {
	mVUbackupRegs(mVU);
	if (memVI) xMOV(gprT2, ptr32[&mVU->VIxgkick]);  // use address saved earlier
	else mVUallocVIa(mVU, gprT2, _Is_);             // use vi[Is] directly
	xCALL(mVU_XGKICK_);
	mVUrestoreRegs(mVU);
}
1159
// XGKICK: Start a GIF PATH1 transfer from VU1 memory at vi[Is].
// With mVU_XGKICK_CYCLES == 0 the kick happens immediately; otherwise the
// address is stashed in VIxgkick and the actual kick is emitted later (when
// the analyzer sets doXGKICK), emulating the instruction's delay.
mVUop(mVU_XGKICK) {
	pass1 { mVUanalyzeXGkick(mVU, _Is_, mVU_XGKICK_CYCLES); }
	pass2 {
		if (!mVU_XGKICK_CYCLES) { mVU_XGKICK_DELAY(mVU, 0); return; }       // immediate kick
		else if (mVUinfo.doXGKICK) { mVU_XGKICK_DELAY(mVU, 1); mVUinfo.doXGKICK = 0; }  // flush pending kick first
		mVUallocVIa(mVU, gprT1, _Is_);
		xMOV(ptr32[&mVU->VIxgkick], gprT1);  // save address for the delayed kick
	}
	pass3 { mVUlog("XGKICK vi%02d", _Fs_); }
}
1170
1171 //------------------------------------------------------------------
1172 // Branches/Jumps
1173 //------------------------------------------------------------------
1174
// Record branch type 'x' for this instruction across all passes.
// Special case: a conditional-free branch to the next instruction
// (Imm11 == 1) with no link register (_x_ == 0) is a no-op, so it is
// optimized away (isNOP) instead of being treated as a branch.
void setBranchA(mP, int x, int _x_) {
	pass1 {
		if (_Imm11_ == 1 && !_x_) {
			DevCon.WriteLn(Color_Green, "microVU%d: Branch Optimization", mVU->index);
			mVUlow.isNOP = 1;
			return;
		}
		mVUbranch = x;
		mVUlow.branch = x;
	}
	pass2 { if (_Imm11_ == 1 && !_x_) { return; } mVUbranch = x; }
	pass3 { mVUbranch = x; }
	pass4 { if (_Imm11_ == 1 && !_x_) { return; } mVUbranch = x; }
}
1189
// Emit the runtime resolution of a conditional branch that sits in another
// branch's delay slot ("bad"/"evil" branch). gprT1 holds the comparison
// result; JMPcc is the x86 condition that means "branch taken".
//
// badBranch case: store the condition, then select which address goes into
// badBranch — the branch target (branchAddrN) if taken, or the fall-through
// PC if not taken.
// evilBranch case: evilBranch gets the branch target if taken, otherwise it
// is overwritten with the previously computed badBranch address.
void condEvilBranch(mV, int JMPcc) {
	if (mVUlow.badBranch) {
		xMOV(ptr32[&mVU->branch], gprT1);          // save condition result
		xMOV(ptr32[&mVU->badBranch], branchAddrN); // assume taken
		xCMP(gprT1b, 0);
		xForwardJump8 cJMP((JccComparisonType)JMPcc);
		incPC(4); // Branch Not Taken
		xMOV(ptr32[&mVU->badBranch], xPC);         // not taken: fall-through PC
		incPC(-4);
		cJMP.SetTarget();
		return;
	}
	xMOV(ptr32[&mVU->evilBranch], branchAddr);     // assume taken
	xCMP(gprT1b, 0);
	xForwardJump8 cJMP((JccComparisonType)JMPcc);
	xMOV(gprT1, ptr32[&mVU->badBranch]); // Branch Not Taken
	xMOV(ptr32[&mVU->evilBranch], gprT1);          // not taken: use badBranch addr
	cJMP.SetTarget();
}
1209
// B: Unconditional relative branch. Code is only emitted when the branch
// itself sits in a delay slot (bad/evil branch), in which case the target
// address is recorded for later resolution.
mVUop(mVU_B) {
	setBranchA(mX, 1, 0);
	pass1 { mVUanalyzeNormBranch(mVU, 0, 0); }
	pass2 {
		if (mVUlow.badBranch) { xMOV(ptr32[&mVU->badBranch], branchAddrN); }
		if (mVUlow.evilBranch) { xMOV(ptr32[&mVU->evilBranch], branchAddr); }
	}
	pass3 { mVUlog("B [<a href=\"#addr%04x\">%04x</a>]", branchAddr, branchAddr); }
}
1219
// BAL: Unconditional branch and link — the return address (bSaveAddr) is
// written to vi[It] before the branch is taken. As with B, extra stores are
// emitted only for bad/evil (delay-slot) branches.
mVUop(mVU_BAL) {
	setBranchA(mX, 2, _It_);
	pass1 { mVUanalyzeNormBranch(mVU, _It_, 1); }
	pass2 {
		xMOV(gprT1, bSaveAddr);          // link: return address -> vi[It]
		mVUallocVIb(mVU, gprT1, _It_);
		if (mVUlow.badBranch) { xMOV(ptr32[&mVU->badBranch], branchAddrN); }
		if (mVUlow.evilBranch) { xMOV(ptr32[&mVU->evilBranch], branchAddr); }
	}
	pass3 { mVUlog("BAL vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Ft_, branchAddr, branchAddr); }
}
1231
// IBEQ: Branch if vi[Is] == vi[It]. The condition is computed as
// vi[Is] XOR vi[It] (zero iff equal) and stored in mVU->branch for the
// block-end dispatcher; delay-slot branches go through condEvilBranch.
// memReadIs/memReadIt mean the analyzer saved the register's pre-modified
// value in VIbackup (it is written earlier in the same pipeline slot).
mVUop(mVU_IBEQ) {
	setBranchA(mX, 3, 0);
	pass1 { mVUanalyzeCondBranch2(mVU, _Is_, _It_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);

		if (mVUlow.memReadIt) xXOR(gprT1, ptr32[&mVU->VIbackup]);
		else { mVUallocVIa(mVU, gprT2, _It_); xXOR(gprT1, gprT2); }

		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_Equal);
	}
	pass3 { mVUlog("IBEQ vi%02d, vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Ft_, _Fs_, branchAddr, branchAddr); }
}
1247
// IBGEZ: Branch if vi[Is] >= 0 (signed). vi[Is] (or its VIbackup copy) is
// stored in mVU->branch for the block-end dispatcher; delay-slot branches
// resolve immediately via condEvilBranch.
mVUop(mVU_IBGEZ) {
	setBranchA(mX, 4, 0);
	pass1 { mVUanalyzeCondBranch1(mVU, _Is_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);
		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_GreaterOrEqual);
	}
	pass3 { mVUlog("IBGEZ vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Fs_, branchAddr, branchAddr); }
}
1259
// IBGTZ: Branch if vi[Is] > 0 (signed). Same structure as IBGEZ with the
// Jcc_Greater condition.
mVUop(mVU_IBGTZ) {
	setBranchA(mX, 5, 0);
	pass1 { mVUanalyzeCondBranch1(mVU, _Is_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);
		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_Greater);
	}
	pass3 { mVUlog("IBGTZ vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Fs_, branchAddr, branchAddr); }
}
1271
// IBLEZ: Branch if vi[Is] <= 0 (signed). Same structure as IBGEZ with the
// Jcc_LessOrEqual condition.
mVUop(mVU_IBLEZ) {
	setBranchA(mX, 6, 0);
	pass1 { mVUanalyzeCondBranch1(mVU, _Is_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);
		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_LessOrEqual);
	}
	pass3 { mVUlog("IBLEZ vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Fs_, branchAddr, branchAddr); }
}
1283
// IBLTZ: Branch if vi[Is] < 0 (signed). Same structure as IBGEZ with the
// Jcc_Less condition.
mVUop(mVU_IBLTZ) {
	setBranchA(mX, 7, 0);
	pass1 { mVUanalyzeCondBranch1(mVU, _Is_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);
		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_Less);
	}
	pass3 { mVUlog("IBLTZ vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Fs_, branchAddr, branchAddr); }
}
1295
// IBNE: Branch if vi[Is] != vi[It]. Mirrors IBEQ (XOR-based comparison),
// but delay-slot branches resolve with the Jcc_NotEqual condition.
mVUop(mVU_IBNE) {
	setBranchA(mX, 8, 0);
	pass1 { mVUanalyzeCondBranch2(mVU, _Is_, _It_); }
	pass2 {
		if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU->VIbackup]);
		else mVUallocVIa(mVU, gprT1, _Is_);

		if (mVUlow.memReadIt) xXOR(gprT1, ptr32[&mVU->VIbackup]);
		else { mVUallocVIa(mVU, gprT2, _It_); xXOR(gprT1, gprT2); }

		if (!(isBadOrEvil)) xMOV(ptr32[&mVU->branch], gprT1);
		else condEvilBranch(mVU, Jcc_NotEqual);
	}
	pass3 { mVUlog("IBNE vi%02d, vi%02d [<a href=\"#addr%04x\">%04x</a>]", _Ft_, _Fs_, branchAddr, branchAddr); }
}
1311
1312 void normJumpPass2(mV) {
1313 if (!mVUlow.constJump.isValid || mVUlow.evilBranch) {
1314 mVUallocVIa(mVU, gprT1, _Is_);
1315 xSHL(gprT1, 3);
1316 xAND(gprT1, mVU->microMemSize - 8);
1317 xMOV(ptr32[&mVU->branch], gprT1);
1318 if (!mVUlow.evilBranch) xMOV(ptr32[&mVU->branch], gprT1);
1319 else xMOV(ptr32[&mVU->evilBranch], gprT1);
1320 if (mVUlow.badBranch) {
1321 xADD(gprT1, 8);
1322 xAND(gprT1, mVU->microMemSize - 8);
1323 xMOV(ptr32[&mVU->badBranch], gprT1);
1324 }
1325 }
1326 }
1327
// JR: Indirect jump to the address held in vi[Is] (branch type 9).
mVUop(mVU_JR) {
	mVUbranch = 9;
	pass1 { mVUanalyzeJump(mVU, _Is_, 0, 0); }
	pass2 { normJumpPass2(mVU); }
	pass3 { mVUlog("JR [vi%02d]", _Fs_); }
}
1334
// JALR: Indirect jump-and-link (branch type 10) — jump to vi[Is] and write
// the return address (bSaveAddr) into vi[It].
mVUop(mVU_JALR) {
	mVUbranch = 10;
	pass1 { mVUanalyzeJump(mVU, _Is_, _It_, 1); }
	pass2 {
		normJumpPass2(mVU);
		xMOV(gprT1, bSaveAddr);          // link: return address -> vi[It]
		mVUallocVIb(mVU, gprT1, _It_);
	}
	pass3 { mVUlog("JALR vi%02d, [vi%02d]", _Ft_, _Fs_); }
}

  ViewVC Help
Powered by ViewVC 1.1.22