/[pcsx2_0.9.7]/trunk/common/src/Utilities/x86/MemcpyFast.cpp

Revision 10 - Mon Sep 6 11:40:06 2010 UTC - by william
File size: 23530 byte(s)
Log: exported r3113 from ./upstream/trunk
1 /******************************************************************************
2
3 Copyright (c) 2001 Advanced Micro Devices, Inc.
4
5 LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
6 EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
7 NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
8 PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
9 DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
10 BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
11 INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
12 OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
13 OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
14 NOT APPLY TO YOU.
15
16 AMD does not assume any responsibility for any errors which may appear in the
17 Materials nor any responsibility to support or update the Materials. AMD retains
18 the right to make changes to its test specifications at any time, without notice.
19
20 NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
21 further information, software, technical information, know-how, or show-how
22 available to you.
23
24 So that all may benefit from your experience, please report any problems
25 or suggestions about this software to 3dsdk.support@amd.com
26
27 AMD Developer Technologies, M/S 585
28 Advanced Micro Devices, Inc.
29 5900 E. Ben White Blvd.
30 Austin, TX 78741
31 3dsdk.support@amd.com
32 ******************************************************************************/
33
34 #include "PrecompiledHeader.h"
35
36 #ifdef _MSC_VER
37 #pragma warning(disable:4414)
38 #endif
39
40 /*****************************************************************************
41 MEMCPY_AMD.CPP
42 ******************************************************************************/
43
44 // Very optimized memcpy() routine for AMD Athlon and Duron family.
45 // This code uses any of FOUR different basic copy methods, depending
46 // on the transfer size.
47 // NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
48 // "Streaming Store"), and also uses the software prefetch instructions,
49 // be sure you're running on Athlon/Duron or other recent CPU before calling!
50
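// The note above assumes the caller has checked CPU support. A minimal sketch of such a
// check follows (not part of the original file; HasMmxAndSse is a hypothetical helper):
// movq needs MMX, and the SSE feature bit is a conservative proxy for prefetchnta/movntq
// support (some pre-SSE AMD parts expose these via AMD's extended-MMX bit instead).
#ifdef _MSC_VER
#include <intrin.h>
static inline bool HasMmxAndSse()
{
	int regs[4];
	__cpuid( regs, 1 );                                  // CPUID leaf 1: feature flags
	const bool hasMMX = ( regs[3] & (1 << 23) ) != 0;    // EDX bit 23 = MMX
	const bool hasSSE = ( regs[3] & (1 << 25) ) != 0;    // EDX bit 25 = SSE
	return hasMMX && hasSSE;
}
#endif
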
51 #define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
52 // The smallest copy uses the X86 "movsd" instruction, in an optimized
53 // form which is an "unrolled loop".
54
55 #define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
56 // Next is a copy that uses the MMX registers to copy 8 bytes at a time,
57 // also using the "unrolled loop" optimization. This code uses
58 // the software prefetch instruction to get the data into the cache.
59
60 #define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
61 // For larger blocks, which will spill beyond the cache, it's faster to
62 // use the Streaming Store instruction MOVNTQ. This write instruction
63 // bypasses the cache and writes straight to main memory. This code also
64 // uses the software prefetch instruction to pre-read the data.
65 // USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
66
67 #define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
68 #define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
69 // For the largest size blocks, a special technique called Block Prefetch
70 // can be used to accelerate the read operations. Block Prefetch reads
71 // one address per cache line, for a series of cache lines, in a short loop.
72 // This is faster than using software prefetch. The technique is great for
73 // getting maximum read bandwidth, especially in DDR memory systems.
74
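// Illustrative sketch (not part of the original file): how the thresholds above map a
// transfer size onto a copy strategy, as described in the comments. The enum and helper
// are hypothetical; note that memcpy_amd_ below only acts on the first three tiers,
// since its block-prefetch path has been disabled.
enum CopyStrategy
{
	Copy_MovsdUnrolled,   // < TINY_BLOCK_COPY : unrolled movsd
	Copy_MmxInCache,      // < IN_CACHE_COPY   : movq + software prefetch
	Copy_MmxStreaming,    // < UNCACHED_COPY   : movq + movntq streaming stores
	Copy_BlockPrefetch    // >= UNCACHED_COPY  : block prefetch (disabled in this file)
};

static inline CopyStrategy SelectCopyStrategy( size_t bytes )
{
	if( bytes < TINY_BLOCK_COPY ) return Copy_MovsdUnrolled;
	if( bytes < IN_CACHE_COPY )   return Copy_MmxInCache;
	if( bytes < UNCACHED_COPY )   return Copy_MmxStreaming;
	return Copy_BlockPrefetch;
}
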
75 // Inline assembly syntax for use with Visual C++
76
77 #if defined(_MSC_VER)
78
79 #ifdef PCSX2_DEBUG
80 extern u8 g_globalMMXSaved;
81
82 #endif
83
84
85 static __aligned16 u8 _xmm_backup[16*2];
86 static __aligned16 u8 _mmx_backup[8*4];
87
88 static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
89 {
90 // MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
91 #define MOVSRC movdqu
92 #define MOVDST movdqa
93
94 __asm
95 {
96 //Reads before writes, to avoid stalls
97 mov eax,[esp+4];
98 //Make sure to save xmm0, it must be preserved ...
99 movaps [_xmm_backup],xmm0;
100
101 //if >=128 bytes use 128 byte unrolled loop
102 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
103 cmp eax,127;
104 jna _loop_1;
105
106 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
107 align 16
108
109 //128 byte unrolled loop
110 _loop_8:
111
112 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
113 MOVDST [ecx+0x00],xmm0; //then write :p
114 MOVSRC xmm0,[edx+0x10];
115 MOVDST [ecx+0x10],xmm0;
116 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
117 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
118
119 MOVSRC xmm0,[edx+0x20-128];
120 MOVDST [ecx+0x20-128],xmm0;
121 MOVSRC xmm0,[edx+0x30-128];
122 MOVDST [ecx+0x30-128],xmm0;
123 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
124
125 MOVSRC xmm0,[edx+0x40-128];
126 MOVDST [ecx+0x40-128],xmm0;
127 MOVSRC xmm0,[edx+0x50-128];
128 MOVDST [ecx+0x50-128],xmm0;
129
130 MOVSRC xmm0,[edx+0x60-128];
131 MOVDST [ecx+0x60-128],xmm0;
132 MOVSRC xmm0,[edx+0x70-128];
133 MOVDST [ecx+0x70-128],xmm0;
134
135 //127~ja, 127 is encodable as simm8 :)
136 cmp eax,127;
137 ja _loop_8;
138
139 //direct copy for 0~7 qwords
140 //in order to avoid the inc/dec of all 3 registers
141 //i use negative relative addressing from the top of the buffers
142 //[top-current index]
143
144 _loop_1:
145 //prepare the regs for 'negative relative addressing'
146 add edx,eax;
147 add ecx,eax;
148 neg eax;
149 jz cleanup; //exit if nothing to do
150
151 _loop_1_inner:
152 MOVSRC xmm0,[edx+eax];
153 MOVDST [ecx+eax],xmm0;
154
155 add eax,16; //while the offset is still negative we have data to copy
156 js _loop_1_inner;
157
158 //done !
159 cleanup:
160 //restore xmm and exit ~)
161 movaps xmm0,[_xmm_backup];
162 ret 4;
163 }
164 #undef MOVSRC
165 #undef MOVDST
166 }
167
168
169 static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
170 {
171 // MOVDST = opcode used to write. I use the same code for the aligned version, with a different define :)
172 #define MOVSRC movaps
173 #define MOVDST movups
174 __asm
175 {
177 //Reads before writes, to avoid stalls
177 mov eax,[esp+4];
178 //Make sure to save xmm0, it must be preserved ...
179 movaps [_xmm_backup],xmm0;
180
181 //if >=128 bytes use 128 byte unrolled loop
182 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
183 cmp eax,127;
184 jna _loop_1;
185
186 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
187 align 16
188
189 //128 byte unrolled loop
190 _loop_8:
191
192 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
193 MOVDST [ecx+0x00],xmm0; //then write :p
194 MOVSRC xmm0,[edx+0x10];
195 MOVDST [ecx+0x10],xmm0;
196 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
197 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
198
199 MOVSRC xmm0,[edx+0x20-128];
200 MOVDST [ecx+0x20-128],xmm0;
201 MOVSRC xmm0,[edx+0x30-128];
202 MOVDST [ecx+0x30-128],xmm0;
203 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
204
205 MOVSRC xmm0,[edx+0x40-128];
206 MOVDST [ecx+0x40-128],xmm0;
207 MOVSRC xmm0,[edx+0x50-128];
208 MOVDST [ecx+0x50-128],xmm0;
209
210 MOVSRC xmm0,[edx+0x60-128];
211 MOVDST [ecx+0x60-128],xmm0;
212 MOVSRC xmm0,[edx+0x70-128];
213 MOVDST [ecx+0x70-128],xmm0;
214
215 //127~ja, 127 is encodable as simm8 :)
216 cmp eax,127;
217 ja _loop_8;
218
219 //direct copy for 0~7 qwords
220 //in order to avoid the inc/dec of all 3 registers
221 //i use negative relative addressing from the top of the buffers
222 //[top-current index]
223
224 _loop_1:
225 //prepare the regs for 'negative relative addressing'
226 add edx,eax;
227 add ecx,eax;
228 neg eax;
229 jz cleanup; //exit if nothing to do
230
231 _loop_1_inner:
232 MOVSRC xmm0,[edx+eax];
233 MOVDST [ecx+eax],xmm0; //use the unaligned store here too, since dest may not be 16-byte aligned
234
235 add eax,16; //while the offset is still negative we have data to copy
236 js _loop_1_inner;
237
238 //done !
239 cleanup:
240 //restore xmm and exit ~)
241 movaps xmm0,[_xmm_backup];
242 ret 4;
243 }
244 #undef MOVSRC
245 #undef MOVDST
246 }
247
248 // Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
249 // This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
250 // used since the reads are linear and the cache logic can predict em :)
251 // *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
252 __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
253 {
254 // Code Implementation Notes:
255 // Uses a forward copy, in 128 byte blocks, and then does the remaining in 16 byte blocks :)
256
257 // MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
258 #define MOVSRC movaps
259 #define MOVDST movaps
260 __asm
261 {
263 //Reads before writes, to avoid stalls
263 mov eax,[esp+4];
264 //Make sure to save xmm0, it must be preserved ...
265 movaps [_xmm_backup],xmm0;
266
267 //if >=128 bytes use 128 byte unrolled loop
268 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
269 cmp eax,127;
270 jna _loop_1;
271
272 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
273 align 16
274
275 //128 byte unrolled loop
276 _loop_8:
277
278 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
279 MOVDST [ecx+0x00],xmm0; //then write :p
280 MOVSRC xmm0,[edx+0x10];
281 MOVDST [ecx+0x10],xmm0;
282 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
283 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
284
285 MOVSRC xmm0,[edx+0x20-128];
286 MOVDST [ecx+0x20-128],xmm0;
287 MOVSRC xmm0,[edx+0x30-128];
288 MOVDST [ecx+0x30-128],xmm0;
289 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
290
291 MOVSRC xmm0,[edx+0x40-128];
292 MOVDST [ecx+0x40-128],xmm0;
293 MOVSRC xmm0,[edx+0x50-128];
294 MOVDST [ecx+0x50-128],xmm0;
295
296 MOVSRC xmm0,[edx+0x60-128];
297 MOVDST [ecx+0x60-128],xmm0;
298 MOVSRC xmm0,[edx+0x70-128];
299 MOVDST [ecx+0x70-128],xmm0;
300
301 //127~ja, 127 is encodable as simm8 :)
302 cmp eax,127;
303 ja _loop_8;
304
305 //direct copy for 0~7 qwords
306 //in order to avoid the inc/dec of all 3 registers
307 //i use negative relative addressing from the top of the buffers
308 //[top-current index]
309
310 _loop_1:
311 //prepare the regs for 'negative relative addressing'
312 add edx,eax;
313 add ecx,eax;
314 neg eax;
315 jz cleanup; //exit if nothing to do
316
317 _loop_1_inner:
318 MOVSRC xmm0,[edx+eax];
319 MOVDST [ecx+eax],xmm0;
320
321 add eax,16; //while the offset is still negative we have data to copy
322 js _loop_1_inner;
323
324 //done !
325 cleanup:
326 //restore xmm and exit ~)
327 movaps xmm0,[_xmm_backup];
328 ret 4;
329 }
330 #undef MOVSRC
331 #undef MOVDST
332 }
333
334 // This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
335 __forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
336 {
337 if( ((uptr)src & 0xf) == 0 )
338 memcpy_raz_( dest, src, bytes );
339 else
340 _memcpy_raz_usrc( dest, src, bytes );
341 }
342
343 // This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
344 __forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
345 {
346 if( ((uptr)dest & 0xf) == 0 )
347 memcpy_raz_( dest, src, bytes );
348 else
349 _memcpy_raz_udst( dest, src, bytes );
350 }
351
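// Usage sketch (illustrative only; the helper and buffer names are hypothetical): the
// two dispatchers above exist so callers only pay for movdqu/movups when the pointer
// whose alignment is unknown really is misaligned. Here the ring buffer is assumed to
// be 16-byte aligned while the packet source may not be; the copy proceeds in 16-byte
// chunks, so 'bytes' is assumed to be a multiple of 16.
static __forceinline void CopyPacketToRing( void* ringpos, const u8* packet, size_t bytes )
{
	memcpy_raz_usrc( ringpos, packet, bytes );   // source alignment indeterminate, dest aligned
}
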
352
353 //////////////////////////////////////////////////////////////////////////
354 // Fast memcpy as coded by AMD, and then improved by air.
355 //
356 // This routine preserves mmx registers! It's the complete real deal!
357 __declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
358 {
359 __asm
360 {
361 push edi
362 push esi
363
364 mov edi, ecx ; destination
365 mov esi, edx ; source
366 mov ecx, [esp+12] ; number of bytes to copy
367 mov eax, ecx ; keep a copy of count
368
369 cld
370 cmp eax, TINY_BLOCK_COPY
371 jb $memcpy_ic_3 ; tiny? skip mmx copy
372
373 cmp eax, 32*1024 ; don't align between 32k-64k because
374 jbe $memcpy_do_align ; it appears to be slower
375 cmp eax, 64*1024
376 jbe $memcpy_align_done
377 $memcpy_do_align:
378 mov eax, 8 ; a trick that's faster than rep movsb...
379 sub eax, edi ; align destination to qword
380 and eax, 111b ; get the low bits
381 sub ecx, eax ; update copy count
382 neg eax ; set up to jump into the array
383 add eax, offset $memcpy_align_done
384 jmp eax ; jump to array of movsb's
385
386 align 4
387 movsb
388 movsb
389 movsb
390 movsb
391 movsb
392 movsb
393 movsb
394 movsb
395
396 $memcpy_align_done: ; destination is dword aligned
397 mov eax, ecx ; number of bytes left to copy
398 shr eax, 6 ; get 64-byte block count
399 jz $memcpy_ic_2 ; finish the last few bytes
400
401 mov edx, offset _mmx_backup ; will probably need this to save/restore mmx
402 cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
403 jae $memcpy_uc_test
404
405 movq [edx+0x00],mm0
406 movq [edx+0x08],mm1
407 movq [edx+0x10],mm2
408 movq [edx+0x18],mm3
409
410 // This is small block copy that uses the MMX registers to copy 8 bytes
411 // at a time. It uses the "unrolled loop" optimization, and also uses
412 // the software prefetch instruction to get the data into the cache.
413 align 16
414 $memcpy_ic_1: ; 64-byte block copies, in-cache copy
415
416 prefetchnta [esi + (200*64/34+192)] ; start reading ahead
417
418 movq mm0, [esi+0] ; read 64 bits
419 movq mm1, [esi+8]
420 movq [edi+0], mm0 ; write 64 bits
421 movq [edi+8], mm1 ; note: the normal movq writes the
422 movq mm2, [esi+16] ; data to cache; a cache line will be
423 movq mm3, [esi+24] ; allocated as needed, to store the data
424 movq [edi+16], mm2
425 movq [edi+24], mm3
426 movq mm0, [esi+32]
427 movq mm1, [esi+40]
428 movq [edi+32], mm0
429 movq [edi+40], mm1
430 movq mm2, [esi+48]
431 movq mm3, [esi+56]
432 movq [edi+48], mm2
433 movq [edi+56], mm3
434
435 add esi, 64 ; update source pointer
436 add edi, 64 ; update destination pointer
437 dec eax ; count down
438 jnz $memcpy_ic_1 ; last 64-byte block?
439
440 movq mm0,[edx+0x00]
441 movq mm1,[edx+0x08]
442 movq mm2,[edx+0x10]
443 movq mm3,[edx+0x18]
444
445 $memcpy_ic_2:
446 mov eax, ecx ; has valid low 6 bits of the byte count
447 $memcpy_ic_3:
448 shr eax, 2 ; dword count
449 and eax, 1111b ; only look at the "remainder" bits
450 neg eax ; set up to jump into the array
451 add eax, offset $memcpy_last_few
452 jmp eax ; jump to array of movsd's
453
454 $memcpy_uc_test:
455 or eax, eax ; tail end of block prefetch will jump here
456 jz $memcpy_ic_2 ; no more 64-byte blocks left
457
458 // For larger blocks, which will spill beyond the cache, it's faster to
459 // use the Streaming Store instruction MOVNTQ. This write instruction
460 // bypasses the cache and writes straight to main memory. This code also
461 // uses the software prefetch instruction to pre-read the data.
462
463 movq [edx+0x00],mm0
464 movq [edx+0x08],mm1
465 movq [edx+0x10],mm2
466
467 align 16
468 $memcpy_uc_1: ; 64-byte blocks, uncached copy
469
470 prefetchnta [esi + (200*64/34+192)] ; start reading ahead
471
472 movq mm0,[esi+0] ; read 64 bits
473 add edi,64 ; update destination pointer
474 movq mm1,[esi+8]
475 add esi,64 ; update source pointer
476 movq mm2,[esi-48]
477 movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
478 movq mm0,[esi-40] ; note: movntq also prevents the CPU
479 movntq [edi-56], mm1 ; from READING the destination address
480 movq mm1,[esi-32] ; into the cache, only to be over-written
481 movntq [edi-48], mm2 ; so that also helps performance
482 movq mm2,[esi-24]
483 movntq [edi-40], mm0
484 movq mm0,[esi-16]
485 movntq [edi-32], mm1
486 movq mm1,[esi-8]
487 movntq [edi-24], mm2
488 movntq [edi-16], mm0
489 dec eax
490 movntq [edi-8], mm1
491 jnz $memcpy_uc_1 ; last 64-byte block?
492
493 movq mm0,[edx+0x00]
494 movq mm1,[edx+0x08]
495 movq mm2,[edx+0x10]
496
497 jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
498
499 // For the largest size blocks, a special technique called Block Prefetch
500 // can be used to accelerate the read operations. Block Prefetch reads
501 // one address per cache line, for a series of cache lines, in a short loop.
502 // This is faster than using software prefetch. The technique is great for
503 // getting maximum read bandwidth, especially in DDR memory systems.
504
505 // Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
506 // help keep the code cache footprint of memcpy_fast to a minimum.
507 /*
508 $memcpy_bp_1: ; large blocks, block prefetch copy
509
510 cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
511 jl $memcpy_64_test ; no, back to regular uncached copy
512
513 mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
514 add esi, CACHEBLOCK * 64 ; move to the top of the block
515 align 16
516 $memcpy_bp_2:
517 mov edx, [esi-64] ; grab one address per cache line
518 mov edx, [esi-128] ; grab one address per cache line
519 sub esi, 128 ; go reverse order to suppress HW prefetcher
520 dec eax ; count down the cache lines
521 jnz $memcpy_bp_2 ; keep grabbing more lines into cache
522
523 mov eax, CACHEBLOCK ; now that it's in cache, do the copy
524 align 16
525 $memcpy_bp_3:
526 movq mm0, [esi ] ; read 64 bits
527 movq mm1, [esi+ 8]
528 movq mm2, [esi+16]
529 movq mm3, [esi+24]
530 movq mm4, [esi+32]
531 movq mm5, [esi+40]
532 movq mm6, [esi+48]
533 movq mm7, [esi+56]
534 add esi, 64 ; update source pointer
535 movntq [edi ], mm0 ; write 64 bits, bypassing cache
536 movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
537 movntq [edi+16], mm2 ; from READING the destination address
538 movntq [edi+24], mm3 ; into the cache, only to be over-written,
539 movntq [edi+32], mm4 ; so that also helps performance
540 movntq [edi+40], mm5
541 movntq [edi+48], mm6
542 movntq [edi+56], mm7
543 add edi, 64 ; update dest pointer
544
545 dec eax ; count down
546
547 jnz $memcpy_bp_3 ; keep copying
548 sub ecx, CACHEBLOCK ; update the 64-byte block count
549 jmp $memcpy_bp_1 ; keep processing chunks
550 */
551
552 // The smallest copy uses the X86 "movsd" instruction, in an optimized
553 // form which is an "unrolled loop". Then it handles the last few bytes.
554 align 16
555 movsd
556 movsd ; perform last 1-15 dword copies
557 movsd
558 movsd
559 movsd
560 movsd
561 movsd
562 movsd
563 movsd
564 movsd ; perform last 1-7 dword copies
565 movsd
566 movsd
567 movsd
568 movsd
569 movsd
570 movsd
571
572 $memcpy_last_few: ; dword aligned from before movsd's
573 and ecx, 11b ; the last few cows must come home
574 jz $memcpy_final ; no more, let's leave
575 rep movsb ; the last 1, 2, or 3 bytes
576
577 $memcpy_final:
578 emms ; clean up the MMX state
579 sfence ; flush the write buffer
580 //mov eax, [dest] ; ret value = destination pointer
581
582 pop esi
583 pop edi
584
585 ret 4
586 }
587 }
588
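// Usage sketch (illustrative only; the helper is hypothetical): memcpy_amd_ is a
// __fastcall drop-in for memcpy on this code path. Per the notes above it backs up and
// restores the MMX registers it touches, so it does not silently trash them for the
// caller, though it still finishes with emms.
static __forceinline void CopyQwc( void* dest, const void* src, size_t qwc )
{
	memcpy_amd_( dest, src, qwc * 16 );   // qwc = count of 128-bit quadwords
}
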
589 // mmx mem-compare implementation, size has to be a multiple of 8
590 // returns 0 if equal, a nonzero value if not equal
591 // ~10 times faster than standard memcmp
592 // (zerofrog)
593 u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
594 {
595 pxAssert( (cmpsize&7) == 0 );
596
597 __asm {
598 push esi
599 mov ecx, cmpsize
600 mov edx, src1
601 mov esi, src2
602
603 cmp ecx, 32
604 jl Done4
605
606 // custom test first 8 to make sure things are ok
607 movq mm0, [esi]
608 movq mm1, [esi+8]
609 pcmpeqd mm0, [edx]
610 pcmpeqd mm1, [edx+8]
611 pand mm0, mm1
612 movq mm2, [esi+16]
613 pmovmskb eax, mm0
614 movq mm3, [esi+24]
615
616 // check if eq
617 cmp eax, 0xff
618 je NextComp
619 mov eax, 1
620 jmp End
621
622 NextComp:
623 pcmpeqd mm2, [edx+16]
624 pcmpeqd mm3, [edx+24]
625 pand mm2, mm3
626 pmovmskb eax, mm2
627
628 sub ecx, 32
629 add esi, 32
630 add edx, 32
631
632 // check if eq
633 cmp eax, 0xff
634 je ContinueTest
635 mov eax, 1
636 jmp End
637
638 cmp ecx, 64
639 jl Done8
640
641 Cmp8:
642 movq mm0, [esi]
643 movq mm1, [esi+8]
644 movq mm2, [esi+16]
645 movq mm3, [esi+24]
646 movq mm4, [esi+32]
647 movq mm5, [esi+40]
648 movq mm6, [esi+48]
649 movq mm7, [esi+56]
650 pcmpeqd mm0, [edx]
651 pcmpeqd mm1, [edx+8]
652 pcmpeqd mm2, [edx+16]
653 pcmpeqd mm3, [edx+24]
654 pand mm0, mm1
655 pcmpeqd mm4, [edx+32]
656 pand mm0, mm2
657 pcmpeqd mm5, [edx+40]
658 pand mm0, mm3
659 pcmpeqd mm6, [edx+48]
660 pand mm0, mm4
661 pcmpeqd mm7, [edx+56]
662 pand mm0, mm5
663 pand mm0, mm6
664 pand mm0, mm7
665 pmovmskb eax, mm0
666
667 // check if eq
668 cmp eax, 0xff
669 je Continue
670 mov eax, 1
671 jmp End
672
673 Continue:
674 sub ecx, 64
675 add esi, 64
676 add edx, 64
677 ContinueTest:
678 cmp ecx, 64
679 jge Cmp8
680
681 Done8:
682 test ecx, 0x20
683 jz Done4
684 movq mm0, [esi]
685 movq mm1, [esi+8]
686 movq mm2, [esi+16]
687 movq mm3, [esi+24]
688 pcmpeqd mm0, [edx]
689 pcmpeqd mm1, [edx+8]
690 pcmpeqd mm2, [edx+16]
691 pcmpeqd mm3, [edx+24]
692 pand mm0, mm1
693 pand mm0, mm2
694 pand mm0, mm3
695 pmovmskb eax, mm0
696 sub ecx, 32
697 add esi, 32
698 add edx, 32
699
700 // check if eq
701 cmp eax, 0xff
702 je Done4
703 mov eax, 1
704 jmp End
705
706 Done4:
707 cmp ecx, 24
708 jne Done2
709 movq mm0, [esi]
710 movq mm1, [esi+8]
711 movq mm2, [esi+16]
712 pcmpeqd mm0, [edx]
713 pcmpeqd mm1, [edx+8]
714 pcmpeqd mm2, [edx+16]
715 pand mm0, mm1
716 pand mm0, mm2
717 pmovmskb eax, mm0
718
719 // check if eq
720 cmp eax, 0xff
721 setne al
722 jmp End
723
724 Done2:
725 cmp ecx, 16
726 jne Done1
727
728 movq mm0, [esi]
729 movq mm1, [esi+8]
730 pcmpeqd mm0, [edx]
731 pcmpeqd mm1, [edx+8]
732 pand mm0, mm1
733 pmovmskb eax, mm0
734
735 // check if eq
736 cmp eax, 0xff
737 setne al
738 jmp End
739
740 Done1:
741 cmp ecx, 8
742 jne Done
743
744 mov eax, [esi]
745 mov esi, [esi+4]
746 cmp eax, [edx]
747 je Next
748 mov eax, 1
749 jmp End
750
751 Next:
752 cmp esi, [edx+4]
753 setne al
754 jmp End
755
756 Done:
757 xor eax, eax
758
759 End:
760 pop esi
761 emms
762 }
763 }
764
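// Reference sketch (illustrative only, not used by the code above): a plain C
// equivalent of the contract documented for memcmp_mmx. Unlike the standard memcmp,
// the nonzero result carries no ordering information.
static inline u8 memcmp_qwords_reference( const void* src1, const void* src2, int cmpsize )
{
	const u64* a = (const u64*)src1;
	const u64* b = (const u64*)src2;
	for( int i = 0; i < cmpsize / 8; ++i )
		if( a[i] != b[i] ) return 1;
	return 0;
}
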
765
766 // stores at dst the xor of all 8-byte elements of src1; cmpsize has to be a multiple of 8
767 void memxor_mmx(void* dst, const void* src1, int cmpsize)
768 {
769 pxAssert( (cmpsize&7) == 0 );
770
771 __asm {
772 mov ecx, cmpsize
773 mov eax, src1
774 mov edx, dst
775
776 cmp ecx, 64
777 jl Setup4
778
779 movq mm0, [eax]
780 movq mm1, [eax+8]
781 movq mm2, [eax+16]
782 movq mm3, [eax+24]
783 movq mm4, [eax+32]
784 movq mm5, [eax+40]
785 movq mm6, [eax+48]
786 movq mm7, [eax+56]
787 sub ecx, 64
788 add eax, 64
789 cmp ecx, 64
790 jl End8
791
792 Cmp8:
793 pxor mm0, [eax]
794 pxor mm1, [eax+8]
795 pxor mm2, [eax+16]
796 pxor mm3, [eax+24]
797 pxor mm4, [eax+32]
798 pxor mm5, [eax+40]
799 pxor mm6, [eax+48]
800 pxor mm7, [eax+56]
801
802 sub ecx, 64
803 add eax, 64
804 cmp ecx, 64
805 jge Cmp8
806
807 End8:
808 pxor mm0, mm4
809 pxor mm1, mm5
810 pxor mm2, mm6
811 pxor mm3, mm7
812
813 cmp ecx, 32
814 jl End4
815 pxor mm0, [eax]
816 pxor mm1, [eax+8]
817 pxor mm2, [eax+16]
818 pxor mm3, [eax+24]
819 sub ecx, 32
820 add eax, 32
821 jmp End4
822
823 Setup4:
824 cmp ecx, 32
825 jl Setup2
826
827 movq mm0, [eax]
828 movq mm1, [eax+8]
829 movq mm2, [eax+16]
830 movq mm3, [eax+24]
831 sub ecx, 32
832 add eax, 32
833
834 End4:
835 pxor mm0, mm2
836 pxor mm1, mm3
837
838 cmp ecx, 16
839 jl End2
840 pxor mm0, [eax]
841 pxor mm1, [eax+8]
842 sub ecx, 16
843 add eax, 16
844 jmp End2
845
846 Setup2:
847 cmp ecx, 16
848 jl Setup1
849
850 movq mm0, [eax]
851 movq mm1, [eax+8]
852 sub ecx, 16
853 add eax, 16
854
855 End2:
856 pxor mm0, mm1
857
858 cmp ecx, 8
859 jl End1
860 pxor mm0, [eax]
861 End1:
862 movq [edx], mm0
863 jmp End
864
865 Setup1:
866 movq mm0, [eax]
867 movq [edx], mm0
868 End:
869 emms
870 }
871 }
872
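// Reference sketch (illustrative only, not used by the code above): a plain C
// equivalent of memxor_mmx, handy for sanity-checking the assembly. It folds every
// 8-byte element of src together with XOR and stores the single 8-byte result at dst.
static inline void memxor_reference( void* dst, const void* src, int cmpsize )
{
	u64 acc = 0;
	const u64* p = (const u64*)src;
	for( int i = 0; i < cmpsize / 8; ++i )
		acc ^= p[i];
	*(u64*)dst = acc;
}
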
873 #endif
