/[pcsx2_0.9.7]/trunk/common/include/Utilities/win_memzero.h
/* PCSX2 - PS2 Emulator for PCs
 * Copyright (C) 2002-2010 PCSX2 Dev Team
 *
 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Found-
 * ation, either version 3 of the License, or (at your option) any later version.
 *
 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with PCSX2.
 * If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#ifdef _MSC_VER
#	pragma warning(disable:4063)	// case '1' is not a valid value for switch()
#endif

// These functions are meant for memset operations of constant length only.
// For dynamic length clears, use the C-compiler provided memset instead.

// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize its asm code better. The compiler
// won't optimize local variables very well because it insists on storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)
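
// A minimal sketch of the trick described above (illustrative only -- the
// names "enum_trick_demo" and "byteCount" are hypothetical and not part of
// this header):
//
//    template< size_t _bytes >
//    static __forceinline void enum_trick_demo( void* dest )
//    {
//        enum { byteCount = _bytes };   // bind the template parameter to an enum
//        __asm
//        {
//            mov edi, dest
//            xor eax, eax
//            mov ecx, byteCount         // ok: enums act as compile-time immediates;
//                                       // referencing _bytes here would not compile
//            rep stosb
//        }
//    }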

// All methods defined in this header use templates in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal
// compile-time code for each use of the function.

// Use of CLD (Clear Direction Flag):
// On Windows platforms the ABI declares that the direction flag should be cleared upon
// entry of *any* function. Therefore there is no need to issue a CLD prior to our use of
// rep stosd here.

// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(align(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.

// MSVC Template Issue:
// MSVC treats int template parameters like macro insertions. That is, if you have
// a template parameter in the form of "func<10-5>()", MSVC inserts 10-5 into the
// templated function, causing order-of-operation problems (sigh). The normal fix would
// be to assign the template parameter to a static const int inside each function, but that
// won't fly with the enums optimization. So in order to fix the problem I define a macro
// that encapsulates the template parameter inside parentheses for us:

#define MZFbytes (_bytes)
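
// For example, given "memzero_ptr<10+5>()", an unparenthesized use of the
// parameter such as "_bytes / 128" would be inserted as "10+5/128", which
// evaluates to 10 rather than the intended (10+5)/128 == 0. The enclosing
// parentheses in MZFbytes restore the intended order of operations.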

// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t _bytes >
static __forceinline void memzero_ptr( void *dest )
{
	if( MZFbytes == 0 ) return;

	// This function only works on 32-bit alignments. For anything else we just fall back
	// on the compiler-provided implementation of memset...

	if( (MZFbytes & 0x3) != 0 )
	{
		memset( dest, 0, MZFbytes );
		return;
	}

	enum
	{
		remainder = MZFbytes & 127,
		bytes128  = MZFbytes / 128		// number of unrolled 128-byte blocks
	};

	// Initial check -- if the length is not a multiple of 16 then fall back on
	// using rep stosd methods. Handling these unaligned clears in a more efficient
	// manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
	// scenarios).

	if( (MZFbytes & 0xf) == 0 )
	{
		u64 _xmm_backup[2];

		if( ((uptr)dest & 0xf) != 0 )
		{
			// UNALIGNED CLEAR MODE.
			// For unaligned clears the threshold is 128 of the 128-byte blocks (16 KB).
			// Anything less and it's probably better off just falling back on rep stosd.
			if( bytes128 > 128 )
			{
				__asm
				{
					movups _xmm_backup,xmm0;
					mov ecx,dest
					pxor xmm0,xmm0
					mov eax,bytes128

					align 16

				_loop_6:
					movups [ecx],xmm0;
					movups [ecx+0x10],xmm0;
					movups [ecx+0x20],xmm0;
					movups [ecx+0x30],xmm0;
					movups [ecx+0x40],xmm0;
					movups [ecx+0x50],xmm0;
					movups [ecx+0x60],xmm0;
					movups [ecx+0x70],xmm0;
					sub ecx,-128		// advance by 128 (-128 fits a sign-extended 8-bit immediate; +128 doesn't)
					dec eax;
					jnz _loop_6;
				}
				if( remainder != 0 )
				{
					// Clear the remainder in reverse (using the decrementing eax as our indexer),
					// decrementing before the store so the final write lands at [ecx+0].
					__asm
					{
						mov eax, remainder

					_loop_5:
						sub eax,16;
						movups [ecx+eax],xmm0;
						jnz _loop_5;
					}
				}
				__asm
				{
					movups xmm0,[_xmm_backup];
				}
				return;
			}
		}
		else if( bytes128 > 48 )
		{
			// ALIGNED CLEAR MODE
			// Data is aligned and the size of data is large enough (more than 48 of the
			// 128-byte blocks, i.e. 6 KB) to merit a nice fancy chunk of unrolled goodness:

			__asm
			{
				movups _xmm_backup,xmm0;
				mov ecx,dest
				pxor xmm0,xmm0
				mov eax,bytes128

				align 16

			_loop_8:
				movaps [ecx],xmm0;
				movaps [ecx+0x10],xmm0;
				movaps [ecx+0x20],xmm0;
				movaps [ecx+0x30],xmm0;
				movaps [ecx+0x40],xmm0;
				movaps [ecx+0x50],xmm0;
				movaps [ecx+0x60],xmm0;
				movaps [ecx+0x70],xmm0;
				sub ecx,-128		// advance by 128 (-128 fits a sign-extended 8-bit immediate; +128 doesn't)
				dec eax;
				jnz _loop_8;
			}
			if( remainder != 0 )
			{
				// Clear the remainder in reverse (using the decrementing eax as our indexer),
				// decrementing before the store so the final write lands at [ecx+0].
				__asm
				{
					mov eax, remainder

				_loop_10:
					sub eax,16;
					movaps [ecx+eax],xmm0;
					jnz _loop_10;
				}
			}
			__asm
			{
				movups xmm0,[_xmm_backup];
			}
			return;
		}
	}

	// This function only works on 32-bit alignments.
	jASSUME( (MZFbytes & 0x3) == 0 );
	jASSUME( ((uptr)dest & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2		// the clear length in 32-bit dwords
	};

	// This case statement handles 5 special-case sizes (small blocks)
	// in addition to the generic large block that uses rep stosd.

	switch( remdat )
	{
		case 1:
			*(u32*)dest = 0;
		return;

		case 2:
			*(u64*)dest = 0;
		return;

		case 3:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest
				xor eax, eax
				stosd
				stosd
				stosd
				stosd
				stosd
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat
				mov edi, dest
				xor eax, eax
				rep stosd
			}
		return;
	}
}
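
// Usage sketch (illustrative only -- "buf" is a hypothetical buffer):
//
//    u8 buf[256];
//    memzero_ptr<sizeof(buf)>( buf );   // 256/4 == 64 dwords -> generic rep stosd case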

// An optimized memset for 8 bit destination data.
template< u8 data, size_t _bytes >
static __forceinline void memset_8( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x3) != 0 )
	{
		// Unaligned data length. No point in doing an optimized inline version (too complicated!)
		// So fall back on the compiler implementation:

		memset( dest, data, MZFbytes );
		return;
	}

	//u64 _xmm_backup[2];

	/*static const size_t remainder = MZFbytes & 127;
	static const size_t bytes128 = MZFbytes / 128;
	if( bytes128 > 32 )
	{
		// This function only works on 128-bit alignments.
		jASSUME( (MZFbytes & 0xf) == 0 );
		jASSUME( ((uptr)dest & 0xf) == 0 );

		__asm
		{
			movups _xmm_backup,xmm0;
			mov eax,bytes128
			mov ecx,dest
			movss xmm0,data

			align 16

		_loop_8:
			movaps [ecx],xmm0;
			movaps [ecx+0x10],xmm0;
			movaps [ecx+0x20],xmm0;
			movaps [ecx+0x30],xmm0;
			movaps [ecx+0x40],xmm0;
			movaps [ecx+0x50],xmm0;
			movaps [ecx+0x60],xmm0;
			movaps [ecx+0x70],xmm0;
			sub ecx,-128
			dec eax;
			jnz _loop_8;
		}
		if( remainder != 0 )
		{
			// Copy the remainder in reverse (using the decrementing eax as our indexer)
			__asm
			{
				mov eax, remainder

			_loop_10:
				movaps [ecx+eax],xmm0;
				sub eax,16;
				jnz _loop_10;
			}
		}
		__asm
		{
			movups xmm0,[_xmm_backup];
		}
	}*/

	// This function only works on data lengths that are multiples of 32 bits.
	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<8) + (data<<16) + (data<<24)
	};
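
	// For example, data == 0xAB replicates to data32 == 0xABABABAB, so each
	// 32-bit "stosd" write fills four destination bytes with the 8-bit pattern.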

	// Unrolled "stosd" stores handle the small special cases; the generic case uses rep stosd.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

template< u16 data, size_t _bytes >
static __forceinline void memset_16( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x1) != 0 )
		throw Exception::LogicError( "Invalid parameter passed to memset_16 - data length is not a multiple of 16 or 32 bits." );

	if( (MZFbytes & 0x3) != 0 )
	{
		// Unaligned data length. No point in doing an optimized inline version (too complicated with
		// remainders and such).

		_memset16_unaligned( dest, data, MZFbytes );
		return;
	}

	//u64 _xmm_backup[2];

	// This function only works on data lengths that are multiples of 32 bits.
	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data + (data<<16)
	};
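
	// For example, data == 0x1234 replicates to data32 == 0x12341234, writing
	// two 16-bit values per "stosd".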

	// Unrolled "stosd" stores handle the small special cases; the generic case uses rep stosd.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

template< u32 data, size_t _bytes >
static __forceinline void memset_32( void *dest )
{
	if( MZFbytes == 0 ) return;

	if( (MZFbytes & 0x3) != 0 )
		throw Exception::LogicError( "Invalid parameter passed to memset_32 - data length is not a multiple of 32 bits." );

	//u64 _xmm_backup[2];

	// This function only works on data lengths that are multiples of 32 bits.
	// If the data length is not a multiple of 32 bits, the C++ optimizing compiler will
	// probably just generate mysteriously broken code in Release builds. ;)

	jASSUME( (MZFbytes & 0x3) == 0 );

	enum
	{
		remdat = MZFbytes >> 2,
		data32 = data
	};

	// Unrolled "stosd" stores handle the small special cases; the generic case uses rep stosd.
	switch( remdat )
	{
		case 1:
			*(u32*)dest = data32;
		return;

		case 2:
			((u32*)dest)[0] = data32;
			((u32*)dest)[1] = data32;
		return;

		case 3:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 4:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		case 5:
			__asm
			{
				mov edi, dest;
				mov eax, data32;
				stosd;
				stosd;
				stosd;
				stosd;
				stosd;
			}
		return;

		default:
			__asm
			{
				mov ecx, remdat;
				mov edi, dest;
				mov eax, data32;
				rep stosd;
			}
		return;
	}
}

// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc. No need to include the sizeof() crap -- this does it
// automatically for you!
template< typename T >
static __forceinline void memzero( T& object )
{
	memzero_ptr<sizeof(T)>( &object );
}

// This method fills an object with the given 8 bit value.
template< u8 data, typename T >
static __forceinline void memset8( T& object )
{
	memset_8<data, sizeof(T)>( &object );
}

// This method fills an object with the given 16 bit value.
template< u16 data, typename T >
static __forceinline void memset16( T& object )
{
	memset_16<data, sizeof(T)>( &object );
}

// This method fills an object with the given 32 bit value.
template< u32 data, typename T >
static __forceinline void memset32( T& object )
{
	memset_32<data, sizeof(T)>( &object );
}
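
// Usage sketch for the object-based wrappers (illustrative only -- "Foo" is a
// hypothetical POD type, not part of this header):
//
//    struct Foo { u32 regs[4]; u8 flags[16]; };
//    Foo foo;
//    memzero( foo );               // zero-clears all sizeof(Foo) == 32 bytes
//    memset8<0xCC>( foo );         // fills the object with the byte 0xCC
//    memset32<0xDEADBEEF>( foo );  // fills the object with a 32-bit pattern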

#undef MZFbytes
