/[pcsx2_0.9.7]/branch/debug/0.X/0.9.X/0.9.7/ramdump-lateset/common/src/Utilities/x86/MemcpyVibes.cpp

Revision 314
Sun Dec 26 18:56:19 2010 UTC by william
File size: 8887 byte(s)
** merged upstream r4049 (re-integration of GregMiscellaneous branch)
** applied patch to GigTranser.cpp in ZZOgl from r4140 to change 'static int count = 0;' to 'static int path1_count = 0;'
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
#include <xmmintrin.h>

using namespace x86Emitter;

// Max Number of qwc supported
#define _maxSize 0x400

typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
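// Note: _memcpy_vibes[] is a dispatch table indexed by the qwc count: entry [i]
// copies exactly i quadwords.  Only the emitter-based variants below populate and
// use it; the intrinsic variant under '#if 1' leaves it untouched.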

#if 1

// This version uses SSE intrinsics to perform an inline copy.  MSVC disasm shows pretty
// decent code generation on the whole, but it hasn't been benchmarked at all yet --air
__fi void memcpy_vibes(void * dest, const void * src, int size) {

	float (*destxmm)[4] = (float(*)[4])dest, (*srcxmm)[4] = (float(*)[4])src;
	size_t count = size & ~15, extra = size & 15;

	destxmm -= 8 - extra, srcxmm -= 8 - extra;
	switch (extra) {
		do {
			destxmm += 16, srcxmm += 16, count -= 16;
			_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
			case 15:
			_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
			case 14:
			_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
			case 13:
			_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
			case 12:
			_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
			case 11:
			_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
			case 10:
			_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
			case 9:
			_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
			case 8:
			_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
			case 7:
			_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
			case 6:
			_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
			case 5:
			_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
			case 4:
			_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
			case 3:
			_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
			case 2:
			_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
			case 1:
			_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
			case 0: NULL;
		} while (count);
	}
}
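// A minimal reference sketch of the intended semantics, assuming (per the _maxSize
// comment above) that 'size' is a count of 16-byte quadwords.  Kept under #if 0
// purely as an illustration of what the unrolled Duff's-device copy above boils
// down to; it is not part of the build, and the name is hypothetical.
#if 0
static void memcpy_vibes_reference(void* dest, const void* src, int size)
{
	const float* s = (const float*)src;
	float*       d = (float*)dest;
	for (int i = 0; i < size; ++i)                      // one 128-bit quadword per iteration
		_mm_store_ps(d + i*4, _mm_load_ps(s + i*4));    // aligned 16-byte load/store, as above
}
#endif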

#else
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...

static __pagealigned u8 _memCpyExec[__pagesize*16];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	int off = -(((_maxSize & 0xf) - 7) << 4);
	for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {

		_memcpy_vibes[i] = (_memCpyCall)xGetPtr();

		if (off >= 128) {
			off = -128;
			xADD(edx, 256);
			xADD(ecx, 256);
		}
		const xRegisterSSE xmm_t(x);
		xMOVAPS (xmm_t, ptr32[edx+off]);
		xMOVNTPS(ptr32[ecx+off], xmm_t);
	}

	_memcpy_vibes[0] = (_memCpyCall)xGetPtr();

	xRET();
	pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
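// Layout of the generated stream above: one long run of MOVAPS/MOVNTPS pairs
// (with periodic ADD edx/ecx,256 rebases to keep displacements in signed 8-bit
// range), terminated by a single RET.  _memcpy_vibes[i] points at the spot with
// exactly i pairs left before the RET, so calling it copies i quadwords, using
// non-temporal (cache-bypassing) stores for the destination.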

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	int offset = ((size & 0xf) - 7) << 4;
	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}
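// The offset math above cancels out the displacement baked into the chosen entry
// point: the generator emits the entry for 'size' with an initial displacement of
// 112 - 16*(size & 0xf), and the wrapper's ((size & 0xf) - 7) << 4 is exactly the
// negative of that.  Example: size == 9 gives an entry displacement of -32 and a
// wrapper offset of +32, so the first MOVAPS always reads from src+0 and the first
// MOVNTPS always writes to dest+0 (ecx = dest, edx = src under __fastcall).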

#else

// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...

static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];

void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	for (int i = 0; i < _maxSize+1; i++)
	{
		int off = 0;
		_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();

		for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
			if (off >= 128) {
				off = -128;
				xADD(edx, 256);
				xADD(ecx, 256);
			}
			const xRegisterSSE xmm_t(x);
			xMOVAPS(xmm_t, ptr32[edx+off]);
			xMOVAPS(ptr32[ecx+off], xmm_t);
		}

		xRET();
		pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
	}

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}

__fi void memcpy_vibes(void * dest, const void * src, int size) {
	_memcpy_vibes[size](dest, src);
}
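// Unlike the single-function variant above, every generated function here starts
// at displacement 0, so no pointer adjustment is needed before the call.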

#endif
#endif
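
// Typical usage sketch (illustrative only, not a call site from the original
// sources): the emitter-based variants above require gen_memcpy_vibes() to be run
// once at startup to build the dispatch table; after that, memcpy_vibes() copies
// 'size' 16-byte quadwords between 16-byte aligned buffers.
#if 0
static __aligned16 u8 s_example_src[64 * 16];    // hypothetical buffers, 64 quadwords each
static __aligned16 u8 s_example_dst[64 * 16];

static void memcpy_vibes_usage_example()
{
	gen_memcpy_vibes();                              // build the jump table (emitter variants only)
	memcpy_vibes(s_example_dst, s_example_src, 64);  // copy 64 quadwords (1024 bytes)
}
#endif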

// Since MemcpyVibes is already in the project, I'll just tuck the Linux version of memcpy_amd_qwc here
// for the moment, to get around compilation issues with having it in the headers.
#ifdef __LINUX__

// This can be moved later, but Linux doesn't even compile memcpyFast.cpp, so I figured I'd stick it here for now.
// Quadword Copy! Count is in QWCs (128 bits). Neither source nor dest needs to be aligned.
__fi void memcpy_amd_qwc(void *dest, const void *src, size_t qwc)
{
	// Optimization Analysis: This code is *nearly* optimal. Do not think that using XMM
	// registers will improve copy performance, because they won't. Use of XMMs is only
	// warranted in situations where both source and dest are guaranteed aligned to 16 bytes,
	// and even then the benefits are typically minimal (sometimes slower depending on the
	// amount of data being copied).
	//
	// Thus: MMX are alignment safe, fast, and widely available. Let's just stick with them.
	// --air

	// Linux Conversion note:
	// This code would benefit nicely from having inline-able GAS syntax, since it should
	// allow GCC to optimize the first 3 instructions out of existence in many scenarios.
	// And it's called enough times to probably merit the extra effort to ensure proper
	// optimization. --air

	__asm__ __volatile__
	(
		".intel_syntax noprefix\n"
		"sub %[qwc], 1\n"          // dec the counter to ease the counting of 16-byte blocks later (optimization)
		// Note: after this line, the real value of the counter is %[qwc] + 1
		"jle memcpy_qwc_1_%=\n"    // only one 16-byte block to copy? Or nothing.

		"cmp %[qwc], 127\n"        // "IN_CACHE_COPY/16"
		"jb memcpy_qwc_loop1_%=\n" // small copies should be cached (definite speedup --air)

		"memcpy_qwc_loop2_%=:\n"        // 32-byte blocks, uncached copy
		"prefetchnta [%[src] + 568]\n"  // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"      // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movntq [%[dest]+0], mm0\n"  // write 64 bits, bypassing the cache
		"movntq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movntq [%[dest]+16], mm2\n"
		"movntq [%[dest]+24], mm3\n"

		"add %[src],32\n"            // update source pointer
		"add %[dest],32\n"           // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop2_%=\n"   // last 64-byte block?
		"sfence\n"                   // flush the write buffer
		"jmp memcpy_qwc_1_%=\n"

		// 32-byte blocks, cached!
		// This *is* important. Removing this and using exclusively non-temporal stores
		// results in noticeable speed loss!

		"memcpy_qwc_loop1_%=:\n"
		"prefetchnta [%[src] + 568]\n"  // start reading ahead (tested: it helps! --air)

		"movq mm0,[%[src]+0]\n"      // read 64 bits
		"movq mm1,[%[src]+8]\n"
		"movq mm2,[%[src]+16]\n"
		"movq [%[dest]+0], mm0\n"    // write 64 bits, using ordinary cached stores this time
		"movq [%[dest]+8], mm1\n"
		"movq mm3,[%[src]+24]\n"
		"movq [%[dest]+16], mm2\n"
		"movq [%[dest]+24], mm3\n"

		"add %[src],32\n"            // update source pointer
		"add %[dest],32\n"           // update destination pointer
		"sub %[qwc],2\n"
		"jg memcpy_qwc_loop1_%=\n"   // stay in the cached loop until the last 64-byte block

		"memcpy_qwc_1_%=:\n"
		"cmp %[qwc],0\n"
		"jne memcpy_qwc_final_%=\n"
		"movq mm0,[%[src]]\n"
		"movq mm1,[%[src]+8]\n"
		"movq [%[dest]], mm0\n"
		"movq [%[dest]+8], mm1\n"

		"memcpy_qwc_final_%=:\n"
		"emms\n"                     // clean up the MMX state
		".att_syntax\n"
		: "=&r"(dest), "=&r"(src), "=&r"(qwc)
		: [dest]"0"(dest), [src]"1"(src), [qwc]"2"(qwc)
		: "memory", "mm0", "mm1", "mm2", "mm3"
	);
}
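
// A minimal portable reference for the routine above (illustrative only, not part
// of the original sources): qwc counts 128-bit quadwords, so the length in bytes
// is qwc*16, and no particular alignment is assumed for either pointer.
#if 0
static void memcpy_amd_qwc_reference(void *dest, const void *src, size_t qwc)
{
	u8 *d = (u8*)dest;
	const u8 *s = (const u8*)src;
	for (size_t i = 0; i < qwc * 16; ++i)   // plain byte copy; one quadword = 16 bytes
		d[i] = s[i];
}
#endif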
#endif
