/[pcsx2_0.9.7]/trunk/common/include/Utilities/win_memzero.h
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef _MSC_VER
#   pragma warning(disable:4063)    // case '1' is not a valid value for switch()
#endif

// These functions are meant for memset operations of constant length only.
// For dynamic length clears, use the C-compiler provided memset instead.

// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize its asm code better. The compiler
// won't optimize local variables very well because it insists on storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)
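//
// A minimal sketch of the trick (hypothetical function, shown for illustration only):
//
//   template< size_t _bytes >
//   static __fi void example_clear( void *dest )
//   {
//       enum { byteCount = (_bytes) };  // enum constant is visible to inline asm
//       __asm
//       {
//           mov edi, dest
//           xor eax, eax
//           mov ecx, byteCount          // the template parameter, via the enum
//           rep stosb                   // store al (zero) ecx times
//       }
//   }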

// All methods defined in this header use templates in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal compile-
// time code for each use of the function.

// Use of CLD (Clear Direction Flag):
// On Windows platforms the ABI declares that the direction flag should be cleared upon
// entry of *any* function. Therefore there is no need to have CLD prior to our use of
// rep stosd here.

// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(align(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.

// MSVC Template Issue:
// MSVC treats int template parameters like macro insertions. That is, if you have
// a template parameter in the form of "func<10-5>()", MSVC inserts 10-5 into the
// templated function, causing order-of-operation problems (sigh). The normal fix would
// be to assign the template parameter to a static const int inside each function, but
// that won't fly with the enums optimization. So in order to fix the problem I define
// a macro that encapsulates the template parameter inside parentheses for us:

#define MZFbytes (_bytes)
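
// For example, with memzero_ptr<10+6>() the parameter is inserted textually, so a raw
// "_bytes / 128" would become "10+6 / 128" (== 10), while "(_bytes) / 128" correctly
// gives "(10+6) / 128" (== 0).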

// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
static __fi void memzero_ptr( void *dest )
{
    if( MZFbytes == 0 ) return;

    // This function only works on 32-bit alignments. For anything else we just fall back
    // on the compiler-provided implementation of memset...

    if( (MZFbytes & 0x3) != 0 )
    {
        memset( dest, 0, MZFbytes );
        return;
    }

#if 0
    // SSE-based memory clear. Currently disabled so as to avoid an unnecessary dependence
    // on SSE cpu instruction sets. (memzero typically isn't used in any performance
    // critical situations anyway)
    enum
    {
        remainder = MZFbytes & 127,
        bytes128 = MZFbytes / 128
    };

    // Initial check -- if the length is not a multiple of 16 then fall back on
    // using rep stosd methods. Handling these unaligned clears in a more efficient
    // manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
    // scenarios).

    if( (MZFbytes & 0xf) == 0 )
    {
        if( ((uptr)dest & 0xf) != 0 )
        {
            // UNALIGNED COPY MODE.
            // For unaligned copies we have a threshold of at least 128 vectors. Anything
            // less and it's probably better off just falling back on the rep stosd.
            if( bytes128 > 128 )
            {
                __asm
                {
                    mov ecx,dest
                    pxor xmm0,xmm0
                    mov eax,bytes128

                _loop_6:
                    movups [ecx],xmm0
                    movups [ecx+0x10],xmm0
                    movups [ecx+0x20],xmm0
                    movups [ecx+0x30],xmm0
                    movups [ecx+0x40],xmm0
                    movups [ecx+0x50],xmm0
                    movups [ecx+0x60],xmm0
                    movups [ecx+0x70],xmm0
                    sub ecx,-128
                    sub eax,1
                    jnz _loop_6;
                }
                if( remainder != 0 )
                {
                    // Copy the remainder in reverse (using the decrementing eax as our indexer)
                    __asm
                    {
                        mov eax, remainder

                    _loop_5:
                        movups [ecx+eax],xmm0;
                        sub eax,16;
                        jnz _loop_5;
                    }
                }
                return;
            }
        }
        else if( bytes128 > 48 )
        {
            // ALIGNED COPY MODE
            // Data is aligned and the size of data is large enough to merit a nice
            // fancy chunk of unrolled goodness:

            __asm
            {
                mov ecx,dest
                pxor xmm0,xmm0
                mov eax,bytes128

            _loop_8:
                movaps [ecx],xmm0
                movaps [ecx+0x10],xmm0
                movaps [ecx+0x20],xmm0
                movaps [ecx+0x30],xmm0
                movaps [ecx+0x40],xmm0
                movaps [ecx+0x50],xmm0
                movaps [ecx+0x60],xmm0
                movaps [ecx+0x70],xmm0
                sub ecx,-128
                sub eax,1
                jnz _loop_8;
            }
            if( remainder != 0 )
            {
                // Copy the remainder in reverse (using the decrementing eax as our indexer)
                __asm
                {
                    mov eax, remainder

                _loop_10:
                    movaps [ecx+eax],xmm0
                    sub eax,16;
                    jnz _loop_10;
                }
            }
            return;
        }
    }
#endif

    // This function only works on 32-bit alignments.
    pxAssume( (MZFbytes & 0x3) == 0 );
    pxAssume( ((uptr)dest & 0x3) == 0 );

    enum
    {
        remdat = MZFbytes >> 2
    };

    // This case statement handles 5 special-case sizes (small blocks)
    // in addition to the generic large block that uses rep stosd.

    switch( remdat )
    {
        case 1:
            *(u32*)dest = 0;
            return;

        case 2:
            *(u64*)dest = 0;
            return;

        case 3:
            __asm
            {
                mov edi, dest
                xor eax, eax
                stosd
                stosd
                stosd
            }
            return;

        case 4:
            __asm
            {
                mov edi, dest
                xor eax, eax
                stosd
                stosd
                stosd
                stosd
            }
            return;

        case 5:
            __asm
            {
                mov edi, dest
                xor eax, eax
                stosd
                stosd
                stosd
                stosd
                stosd
            }
            return;

        default:
            __asm
            {
                mov ecx, remdat
                mov edi, dest
                xor eax, eax
                rep stosd
            }
            return;
    }
}
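
// Usage sketch (names and sizes are illustrative only):
//   u32 table[64];
//   memzero_ptr<sizeof(table)>( table );  // length is a multiple of 4 -> inlined stosd path
//   memzero_ptr<7>( buf );                // length not a multiple of 4 -> plain memset fallback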

// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
static __fi void memset_8( void *dest )
{
    if( MZFbytes == 0 ) return;

    if( (MZFbytes & 0x3) != 0 )
    {
        // Unaligned data length. No point in doing an optimized inline version (too complicated!)
        // So fall back on the compiler implementation:

        memset( dest, data, MZFbytes );
        return;
    }

    /*static const size_t remainder = MZFbytes & 127;
    static const size_t bytes128 = MZFbytes / 128;
    if( bytes128 > 32 )
    {
        // This function only works on 128-bit alignments.
        pxAssume( (MZFbytes & 0xf) == 0 );
        pxAssume( ((uptr)dest & 0xf) == 0 );

        __asm
        {
            mov eax,bytes128
            mov ecx,dest
            movss xmm0,data

            align 16

        _loop_8:
            movaps [ecx],xmm0;
            movaps [ecx+0x10],xmm0;
            movaps [ecx+0x20],xmm0;
            movaps [ecx+0x30],xmm0;
            movaps [ecx+0x40],xmm0;
            movaps [ecx+0x50],xmm0;
            movaps [ecx+0x60],xmm0;
            movaps [ecx+0x70],xmm0;
            sub ecx,-128
            dec eax;
            jnz _loop_8;
        }
        if( remainder != 0 )
        {
            // Copy the remainder in reverse (using the decrementing eax as our indexer)
            __asm
            {
                mov eax, remainder

            _loop_10:
                movaps [ecx+eax],xmm0;
                sub eax,16;
                jnz _loop_10;
            }
        }
    }*/

    // This function only works on 32-bit alignments of data copied.
    pxAssume( (MZFbytes & 0x3) == 0 );

    enum
    {
        remdat = MZFbytes >> 2,
        data32 = data + (data<<8) + (data<<16) + (data<<24)
    };
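
    // Example: for data == 0xAB, data32 above evaluates to 0xABABABAB -- the 8-bit
    // value replicated across all four bytes of each 32-bit store.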

    // Switch statement to execute the x86/32 "stosd" stores.
    switch( remdat )
    {
        case 1:
            *(u32*)dest = data32;
            return;

        case 2:
            ((u32*)dest)[0] = data32;
            ((u32*)dest)[1] = data32;
            return;

        case 3:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 4:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 5:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        default:
            __asm
            {
                mov ecx, remdat;
                mov edi, dest;
                mov eax, data32;
                rep stosd;
            }
            return;
    }
}

template< u16 data, size_t _bytes >
static __fi void memset_16( void *dest )
{
    if( MZFbytes == 0 ) return;

    // Assertion: data length must be a multiple of 16 or 32 bits
    pxAssume( (MZFbytes & 0x1) == 0 );

    if( (MZFbytes & 0x3) != 0 )
    {
        // Unaligned data length. No point in doing an optimized inline version (too complicated with
        // remainders and such).

        _memset16_unaligned( dest, data, MZFbytes );
        return;
    }

    //u64 _xmm_backup[2];

    // This function only works on 32-bit alignments of data copied.
    pxAssume( (MZFbytes & 0x3) == 0 );

    enum
    {
        remdat = MZFbytes >> 2,
        data32 = data + (data<<16)
    };
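
    // Example: for data == 0x1234, data32 above evaluates to 0x12341234 (the 16-bit
    // value replicated into both halves of each 32-bit store).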

    // Switch statement to execute the x86/32 "stosd" stores.
    switch( remdat )
    {
        case 1:
            *(u32*)dest = data32;
            return;

        case 2:
            ((u32*)dest)[0] = data32;
            ((u32*)dest)[1] = data32;
            return;

        case 3:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 4:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 5:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        default:
            __asm
            {
                mov ecx, remdat;
                mov edi, dest;
                mov eax, data32;
                rep stosd;
            }
            return;
    }
}

template< u32 data, size_t _bytes >
static __fi void memset_32( void *dest )
{
    if( MZFbytes == 0 ) return;

    // Assertion: data length must be a multiple of 32 bits
    pxAssume( (MZFbytes & 0x3) == 0 );

    //u64 _xmm_backup[2];

    // This function only works on 32-bit alignments of data copied.
    // If the data length is not a multiple of 32 bits, the C++ optimizing compiler will
    // probably just generate mysteriously broken code in Release builds. ;)

    pxAssume( (MZFbytes & 0x3) == 0 );

    enum
    {
        remdat = MZFbytes>>2,
        data32 = data
    };

    // Switch statement to execute the x86/32 "stosd" stores.
    switch( remdat )
    {
        case 1:
            *(u32*)dest = data32;
            return;

        case 2:
            ((u32*)dest)[0] = data32;
            ((u32*)dest)[1] = data32;
            return;

        case 3:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 4:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        case 5:
            __asm
            {
                mov edi, dest;
                mov eax, data32;
                stosd;
                stosd;
                stosd;
                stosd;
                stosd;
            }
            return;

        default:
            __asm
            {
                mov ecx, remdat;
                mov edi, dest;
                mov eax, data32;
                rep stosd;
            }
            return;
    }
}

// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc. No need to include sizeof() crap, this does it automatically
// for you!
template< typename T >
static __fi void memzero( T& object )
{
    memzero_ptr<sizeof(T)>( &object );
}

// This method fills an object with the given 8 bit value.
template< u8 data, typename T >
static __fi void memset8( T& object )
{
    memset_8<data, sizeof(T)>( &object );
}

// This method fills an object with the given 16 bit value.
template< u16 data, typename T >
static __fi void memset16( T& object )
{
    memset_16<data, sizeof(T)>( &object );
}

// This method fills an object with the given 32 bit value.
template< u32 data, typename T >
static __fi void memset32( T& object )
{
    memset_32<data, sizeof(T)>( &object );
}
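
// Usage sketch for the object-based helpers (hypothetical struct, for illustration only):
//   struct Regs { u32 pc, sp, gpr[32]; };
//   Regs r;
//   memzero( r );               // zero-clears the whole struct; no sizeof() needed
//   memset32<0xDEADBEEF>( r );  // fills it with a repeating 32-bit pattern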

#undef MZFbytes
