Contents of /trunk/common/include/Utilities/win_memzero.h


Revision 31 - Tue Sep 7 03:24:11 2010 UTC by william
File size: 12402 byte(s)
committing r3113 initial commit again...
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef _MSC_VER
#	pragma warning(disable:4063)	// case '1' is not a valid value for switch()
#endif

// These functions are meant for memset operations of constant length only.
// For dynamic length clears, use the C-compiler provided memset instead.

// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize its asm code better. The compiler
// won't optimize local variables very well because it insists on storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)

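// (Illustrative sketch, not part of the original header -- the pattern used
// throughout this file boils down to:
//
//		template< size_t _bytes >
//		static __forceinline void example( void* dest )
//		{
//			enum { byteCount = _bytes };	// enum constant is visible to inline asm...
//			__asm mov eax, byteCount		// ...whereas "mov eax, _bytes" would not assemble.
//		}
// )
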
// All methods defined in this header use templates in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal compile-
// time code for each use of the function.

// Use of CLD (Clear Direction Flag):
// On Windows platforms the ABI declares that the direction flag should be cleared upon
// entry of *any* function. Therefore there is no need to have CLD prior to our use of
// rep stosd here.

// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(align(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.

// MSVC Template Issue:
// MSVC treats int template parameters like macro insertions. That is, if you have
// a template parameter in the form of "func<10-5>()", MSVC inserts 10-5 into the
// templated function, causing order-of-operation problems (sigh). The normal fix would
// be to assign the template parameter to a static const int inside each function, but that
// won't fly with the enums optimization. So in order to fix the problem I define a macro
// that encapsulates the template parameter inside parenthesis for us:

#define MZFbytes (_bytes)
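
// (Illustrative example, not in the original: with a call like memzero_ptr<16-4>(),
// MSVC's textual insertion would turn a bare "_bytes / 128" into "16-4 / 128", which
// evaluates as 16-(4/128) == 16 rather than (16-4)/128 == 0. The parenthesis in the
// macro restores the intended grouping.)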

// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
static __forceinline void memzero_ptr( void *dest )
{
	if( MZFbytes == 0 ) return;

	// This function only works on 32-bit alignments. For anything else we just fall back
	// on the compiler-provided implementation of memset...

	if( (MZFbytes & 0x3) != 0 )
	{
		memset( dest, 0, MZFbytes );
		return;
	}

	enum
	{
		remainder = MZFbytes & 127,
		bytes128 = MZFbytes / 128
	};

	// Initial check -- if the length is not a multiple of 16 then fall back on
	// using rep stosd methods. Handling these unaligned clears in a more efficient
	// manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
	// scenarios).

	if( (MZFbytes & 0xf) == 0 )
	{
		u64 _xmm_backup[2];

		if( ((uptr)dest & 0xf) != 0 )
		{
			// UNALIGNED COPY MODE.
			// For unaligned copies we have a threshold of at least 128 vectors. Anything
			// less and it's probably better off just falling back on the rep stosd.
			if( bytes128 > 128 )
			{
				__asm
				{
					movups _xmm_backup,xmm0;
					mov ecx,dest
					pxor xmm0,xmm0
					mov eax,bytes128

					align 16

				_loop_6:
					movups [ecx],xmm0;
					movups [ecx+0x10],xmm0;
					movups [ecx+0x20],xmm0;
					movups [ecx+0x30],xmm0;
					movups [ecx+0x40],xmm0;
					movups [ecx+0x50],xmm0;
					movups [ecx+0x60],xmm0;
					movups [ecx+0x70],xmm0;
					sub ecx,-128
					dec eax;
					jnz _loop_6;
				}
				if( remainder != 0 )
				{
					// Clear the remainder in reverse (using the decrementing eax as our indexer).
					// Note: eax is decremented *before* the store; starting the store at
					// [ecx+remainder] would write 16 bytes past the block and skip [ecx].
					__asm
					{
						mov eax, remainder

					_loop_5:
						sub eax,16;
						movups [ecx+eax],xmm0;
						jnz _loop_5;
					}
				}
				__asm
				{
					movups xmm0,[_xmm_backup];
				}
				return;
			}
		}
		else if( bytes128 > 48 )
		{
			// ALIGNED COPY MODE
			// Data is aligned and the size of data is large enough to merit a nice
			// fancy chunk of unrolled goodness:

			__asm
			{
				movups _xmm_backup,xmm0;
				mov ecx,dest
				pxor xmm0,xmm0
				mov eax,bytes128

				align 16

			_loop_8:
				movaps [ecx],xmm0;
				movaps [ecx+0x10],xmm0;
				movaps [ecx+0x20],xmm0;
				movaps [ecx+0x30],xmm0;
				movaps [ecx+0x40],xmm0;
				movaps [ecx+0x50],xmm0;
				movaps [ecx+0x60],xmm0;
				movaps [ecx+0x70],xmm0;
				sub ecx,-128
				dec eax;
				jnz _loop_8;
			}
			if( remainder != 0 )
			{
				// Clear the remainder in reverse (using the decrementing eax as our indexer).
				// As above, eax is decremented before the store so the loop covers
				// [ecx] through [ecx+remainder-16] without overrunning the block.
				__asm
				{
					mov eax, remainder

				_loop_10:
					sub eax,16;
					movaps [ecx+eax],xmm0;
					jnz _loop_10;
				}
			}
			__asm
			{
				movups xmm0,[_xmm_backup];
			}
			return;
		}
	}

	// This function only works on 32-bit alignments.
	jASSUME( (MZFbytes & 0x3) == 0 );
	jASSUME( ((uptr)dest & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2
	};

	// This case statement handles 5 special-case sizes (small blocks)
	// in addition to the generic large block that uses rep stosd.

	switch( remdat )
	{
		case 1:
			*(u32*)dest = 0;
		return;

		case 2:
			*(u64*)dest = 0;
		return;

		case 3:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
				stosd
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat
				mov edi, dest
				xor eax, eax
				rep stosd
			}
		return;
	}
}
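
// (Hypothetical usage sketch, not part of the original header -- the length must be
// a compile-time constant:
//
//		u8 scratch[512];
//		memzero_ptr<sizeof(scratch)>( scratch );	// remdat == 128, so the default
//													// case emits "rep stosd".
// )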
// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
static __forceinline void memset_8( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x3) != 0 )
	{
		// unaligned data length. No point in doing an optimized inline version (too complicated!)
		// So fall back on the compiler implementation:

		memset( dest, data, MZFbytes );
		return;
	}

	//u64 _xmm_backup[2];

	/*static const size_t remainder = MZFbytes & 127;
	static const size_t bytes128 = MZFbytes / 128;
	if( bytes128 > 32 )
	{
		// This function only works on 128-bit alignments.
		jASSUME( (MZFbytes & 0xf) == 0 );
		jASSUME( ((uptr)dest & 0xf) == 0 );

		__asm
		{
			movups _xmm_backup,xmm0;
			mov eax,bytes128
			mov ecx,dest
			movss xmm0,data

			align 16

		_loop_8:
			movaps [ecx],xmm0;
			movaps [ecx+0x10],xmm0;
			movaps [ecx+0x20],xmm0;
			movaps [ecx+0x30],xmm0;
			movaps [ecx+0x40],xmm0;
			movaps [ecx+0x50],xmm0;
			movaps [ecx+0x60],xmm0;
			movaps [ecx+0x70],xmm0;
			sub ecx,-128
			dec eax;
			jnz _loop_8;
		}
		if( remainder != 0 )
		{
			// Copy the remainder in reverse (using the decrementing eax as our indexer)
			__asm
			{
				mov eax, remainder

			_loop_10:
				movaps [ecx+eax],xmm0;
				sub eax,16;
				jnz _loop_10;
			}
		}
		__asm
		{
			movups xmm0,[_xmm_backup];
		}
	}*/

	// This function only works on 32-bit alignments of data copied.
	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<8) + (data<<16) + (data<<24)
	};
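
	// (Worked example, not in the original: data = 0xAB replicates into
	// data32 = 0xABABABAB, so each 32-bit store below writes four copies of the byte.)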

	// Select an optimal x86/32 "stosd" strategy for this block size.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

template< u16 data, size_t _bytes >
static __forceinline void memset_16( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x1) != 0 )
		throw Exception::LogicError( "Invalid parameter passed to memset_16 - data length is not a multiple of 16 or 32 bits." );

	if( (MZFbytes & 0x3) != 0 )
	{
		// Unaligned data length. No point in doing an optimized inline version (too complicated with
		// remainders and such).

		_memset16_unaligned( dest, data, MZFbytes );
		return;
	}

	//u64 _xmm_backup[2];

	// This function only works on 32-bit alignments of data copied.
	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<16)
	};
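
	// (Worked example, not in the original: data = 0x1234 replicates into
	// data32 = 0x12341234, two copies of the 16-bit pattern per 32-bit store.)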

	// Select an optimal x86/32 "stosd" strategy for this block size.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

template< u32 data, size_t _bytes >
static __forceinline void memset_32( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x3) != 0 )
		throw Exception::LogicError( "Invalid parameter passed to memset_32 - data length is not a multiple of 32 bits." );

	//u64 _xmm_backup[2];

	// This function only works on 32-bit alignments of data copied.
	// If the data length is not a factor of 32 bits, the C++ optimizing compiler will
	// probably just generate mysteriously broken code in Release builds. ;)

	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes>>2,
		data32 = data
	};

	// Select an optimal x86/32 "stosd" strategy for this block size.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc. No need to include sizeof() crap, this does it automatically
// for you!
template< typename T >
static __forceinline void memzero( T& object )
{
	memzero_ptr<sizeof(T)>( &object );
}

// This method clears an object with the given 8 bit value.
template< u8 data, typename T >
static __forceinline void memset8( T& object )
{
	memset_8<data, sizeof(T)>( &object );
}

// This method clears an object with the given 16 bit value.
template< u16 data, typename T >
static __forceinline void memset16( T& object )
{
	memset_16<data, sizeof(T)>( &object );
}

// This method clears an object with the given 32 bit value.
template< u32 data, typename T >
static __forceinline void memset32( T& object )
{
	memset_32<data, sizeof(T)>( &object );
}

#undef MZFbytes
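
// (Hypothetical usage sketch, not part of the original header. The wrappers deduce
// the object's size automatically; fill values must be compile-time constants:
//
//		struct GprRegs { u32 raw[32]; };	// GprRegs is a made-up example type
//		GprRegs regs;
//
//		memzero( regs );					// zero-clear via memzero_ptr<sizeof(GprRegs)>
//		memset8<0xCD>( regs );				// fill every byte with 0xCD
//		memset32<0xDEADBEEF>( regs );		// fill every u32 with 0xDEADBEEF
// )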
