/trunk/common/include/Utilities/win_memzero.h
Revision 62 - Tue Sep 7 11:08:22 2010 UTC by william
Auto Committed Import of: pcsx2-0.9.7-r3738-debug in ./trunk
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef _MSC_VER
#	pragma warning(disable:4063)	// case '1' is not a valid value for switch()
#endif

// These functions are meant for memset operations of constant length only.
// For dynamic-length clears, use the compiler-provided memset instead.

// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize its asm code better. The compiler
// won't optimize local variables very well because it insists on storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)  (see the sketch after the MZFbytes define below)

// All methods defined in this header use templates in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal compile-
// time code for each use of the function.

// Use of CLD (Clear Direction Flag):
// On Windows platforms the ABI declares that the direction flag should be cleared upon
// entry of *any* function. Therefore there is no need to issue CLD prior to our use of
// rep stosd here.

// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(align(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.

// MSVC Template Issue:
// MSVC treats int template parameters like macro insertions. That is, if you have a
// template parameter in the form of "func<10-5>()", MSVC inserts 10-5 into the
// templated function, causing order-of-operation problems (sigh). The normal fix would
// be to assign the template parameter to a static const int inside each function, but
// that won't fly with the enums optimization. So in order to fix the problem I define
// a macro that encapsulates the template parameter inside parentheses for us:

#define MZFbytes (_bytes)
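
// For example (arithmetic illustration, not from the original notes): with
// func<10-5>(), an unparenthesized use of the parameter such as "_bytes * 4"
// would expand to "10-5*4" and evaluate to -10, whereas the parenthesized
// "(_bytes) * 4" yields the intended (10-5)*4 == 20.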
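// A minimal sketch of the enum trick described above (hypothetical helper for
// illustration only -- not part of this header's API). The template parameter
// is mirrored into an enum, which the inline assembler can consume as an
// immediate constant instead of a stack-resident local:
template< size_t _count >
static __fi void enum_trick_sketch( void *dest )
{
	enum { count = (_count) };	// enum alias is visible to the __asm block below
	__asm
	{
		mov ecx, count			// immediate constant -- no stack round-trip
		mov edi, dest
		xor eax, eax
		rep stosd				// zeroes 'count' dwords at dest
	}
}
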
// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
static __fi void memzero_ptr( void *dest )
{
	if( MZFbytes == 0 ) return;

	// This function only works on 32-bit alignments. For anything else we just fall back
	// on the compiler-provided implementation of memset...

	if( (MZFbytes & 0x3) != 0 )
	{
		memset( dest, 0, MZFbytes );
		return;
	}

#if 0
	// SSE-based memory clear. Currently disabled so as to avoid an unnecessary dependence
	// on SSE cpu instruction sets. (memzero typically isn't used in any performance-critical
	// situations anyway)
	enum
	{
		remainder = MZFbytes & 127,
		bytes128 = MZFbytes / 128
	};

	// Initial check -- if the length is not a multiple of 16 then fall back on
	// using rep stosd methods. Handling these unaligned clears in a more efficient
	// manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
	// scenarios).

	if( (MZFbytes & 0xf) == 0 )
	{
		if( ((uptr)dest & 0xf) != 0 )
		{
			// UNALIGNED CLEAR MODE.
			// For unaligned clears we have a threshold of at least 128 vectors. Anything
			// less and it's probably better off just falling back on the rep stosd.
			if( bytes128 > 128 )
			{
				__asm
				{
					mov ecx,dest
					pxor xmm0,xmm0
					mov eax,bytes128

				_loop_6:
					movups [ecx],xmm0
					movups [ecx+0x10],xmm0
					movups [ecx+0x20],xmm0
					movups [ecx+0x30],xmm0
					movups [ecx+0x40],xmm0
					movups [ecx+0x50],xmm0
					movups [ecx+0x60],xmm0
					movups [ecx+0x70],xmm0
					sub ecx,-128		// adds 128, using a shorter sign-extended encoding
					sub eax,1
					jnz _loop_6;
				}
				if( remainder != 0 )
				{
					// Clear the remainder in reverse (using the decrementing eax as our indexer)
					__asm
					{
						mov eax, remainder

					_loop_5:
						movups [ecx+eax],xmm0;
						sub eax,16;
						jnz _loop_5;
					}
				}
				return;
			}
		}
		else if( bytes128 > 48 )
		{
			// ALIGNED CLEAR MODE
			// Data is aligned and the size of data is large enough to merit a nice
			// fancy chunk of unrolled goodness:

			__asm
			{
				mov ecx,dest
				pxor xmm0,xmm0
				mov eax,bytes128

			_loop_8:
				movaps [ecx],xmm0
				movaps [ecx+0x10],xmm0
				movaps [ecx+0x20],xmm0
				movaps [ecx+0x30],xmm0
				movaps [ecx+0x40],xmm0
				movaps [ecx+0x50],xmm0
				movaps [ecx+0x60],xmm0
				movaps [ecx+0x70],xmm0
				sub ecx,-128
				sub eax,1
				jnz _loop_8;
			}
			if( remainder != 0 )
			{
				// Clear the remainder in reverse (using the decrementing eax as our indexer)
				__asm
				{
					mov eax, remainder

				_loop_10:
					movaps [ecx+eax],xmm0
					sub eax,16;
					jnz _loop_10;
				}
			}
			return;
		}
	}
#endif

	// This function only works on 32-bit alignments.
	pxAssume( (MZFbytes & 0x3) == 0 );
	pxAssume( ((uptr)dest & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2
	};

	// This switch statement handles 5 special-case sizes (small blocks)
	// in addition to the generic large block that uses rep stosd.

	switch( remdat )
	{
		case 1:
			*(u32*)dest = 0;
		return;

		case 2:
			*(u64*)dest = 0;
		return;

		case 3:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
				stosd
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat
				mov edi, dest
				xor eax, eax
				rep stosd
			}
		return;
	}
}
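
// For example, memzero_ptr<16>( dest ) reduces remdat to 4 at compile time, so
// the call inlines down to just the four-stosd sequence of "case 4" above.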

// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
static __fi void memset_8( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x3) != 0 )
	{
		// Unaligned data length. No point in doing an optimized inline version (too complicated!)
		// So fall back on the compiler implementation:

		memset( dest, data, MZFbytes );
		return;
	}

	/*static const size_t remainder = MZFbytes & 127;
	static const size_t bytes128 = MZFbytes / 128;
	if( bytes128 > 32 )
	{
		// This function only works on 128-bit alignments.
		pxAssume( (MZFbytes & 0xf) == 0 );
		pxAssume( ((uptr)dest & 0xf) == 0 );

		__asm
		{
			mov eax,bytes128
			mov ecx,dest
			movss xmm0,data

			align 16

		_loop_8:
			movaps [ecx],xmm0;
			movaps [ecx+0x10],xmm0;
			movaps [ecx+0x20],xmm0;
			movaps [ecx+0x30],xmm0;
			movaps [ecx+0x40],xmm0;
			movaps [ecx+0x50],xmm0;
			movaps [ecx+0x60],xmm0;
			movaps [ecx+0x70],xmm0;
			sub ecx,-128
			dec eax;
			jnz _loop_8;
		}
		if( remainder != 0 )
		{
			// Fill the remainder in reverse (using the decrementing eax as our indexer)
			__asm
			{
				mov eax, remainder

			_loop_10:
				movaps [ecx+eax],xmm0;
				sub eax,16;
				jnz _loop_10;
			}
		}
	}*/

	// This function only works on 32-bit alignments of data copied.
	pxAssume( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<8) + (data<<16) + (data<<24)
	};
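	// e.g. data == 0x5A replicates to data32 == 0x5A5A5A5A, so every 32-bit
	// store below writes four copies of the 8-bit value at once.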

	// Switch statement to execute the x86/32 "stosd" fills.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

// An optimized memset for 16 bit destination data.
template< u16 data, size_t _bytes >
static __fi void memset_16( void *dest )
{
	if( MZFbytes == 0 ) return;

	// Assertion: data length must be a multiple of 16 bits
	pxAssume( (MZFbytes & 0x1) == 0 );

	if( (MZFbytes & 0x3) != 0 )
	{
		// Unaligned data length. No point in doing an optimized inline version (too
		// complicated with remainders and such).

		_memset16_unaligned( dest, data, MZFbytes );
		return;
	}

	//u64 _xmm_backup[2];

	// This function only works on 32-bit alignments of data copied.
	pxAssume( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<16)
	};
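	// e.g. data == 0x1234 replicates to data32 == 0x12341234, so every 32-bit
	// store below writes two copies of the 16-bit value at once.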

	// Switch statement to execute the x86/32 "stosd" fills.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

// An optimized memset for 32 bit destination data.
template< u32 data, size_t _bytes >
static __fi void memset_32( void *dest )
{
	if( MZFbytes == 0 ) return;

	// Assertion: data length must be a multiple of 32 bits. If it is not, the C++
	// optimizing compiler will probably just generate mysteriously broken code in
	// Release builds. ;)
	pxAssume( (MZFbytes & 0x3) == 0 );

	//u64 _xmm_backup[2];

	enum
	{
		remdat = MZFbytes>>2,
		data32 = data
	};

	// Switch statement to execute the x86/32 "stosd" fills.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc. No need to include sizeof() crap, this does it automatically
// for you!
template< typename T >
static __fi void memzero( T& object )
{
	memzero_ptr<sizeof(T)>( &object );
}

// This method fills an object with the given 8 bit value.
template< u8 data, typename T >
static __fi void memset8( T& object )
{
	memset_8<data, sizeof(T)>( &object );
}

// This method fills an object with the given 16 bit value.
template< u16 data, typename T >
static __fi void memset16( T& object )
{
	memset_16<data, sizeof(T)>( &object );
}

// This method fills an object with the given 32 bit value.
template< u32 data, typename T >
static __fi void memset32( T& object )
{
	memset_32<data, sizeof(T)>( &object );
}

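// Usage sketch (hypothetical buffer, for illustration only):
//
//   u32 framebuf[256];
//   memzero( framebuf );               // zero-clear; size deduced via sizeof()
//   memset32<0x80808080>( framebuf );  // fill with a 32-bit constant pattern
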
#undef MZFbytes
