Diff of /trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp


--- trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp	2010/12/23 11:48:33	279
+++ trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp	2010/12/23 12:02:12	280
@@ -24,109 +24,54 @@
 #include <emmintrin.h>
 #endif
 
+// WARNING: an sfence instruction must be called after the SwizzleBlock sse2 functions (they write with non-temporal stores)
+
 // Current port of the ASM function to intrinsic
-#define INTRINSIC_PORT_32
 #define INTRINSIC_PORT_16
 #define INTRINSIC_PORT_8
 #define INTRINSIC_PORT_4
 #ifdef ZEROGS_SSE2
+static const __aligned16 u32 mask_24b_H[4] = {0xFF000000, 0x0000FFFF, 0xFF000000, 0x0000FFFF};
+static const __aligned16 u32 mask_24b_L[4] = {0x00FFFFFF, 0x00000000, 0x00FFFFFF, 0x00000000};
+
 template<bool aligned>
-__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch)
 {
     __m128i src_0;
     __m128i src_1;
     __m128i src_2;
     __m128i src_3;
 
-	if (WriteMask == 0xffffffff) {
-        for (int i=3 ; i >= 0 ; --i) {
-            // load
-            if (aligned) {
-                src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
-                src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
-                src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
-                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
-            } else {
-                src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
-                src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
-                src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
-                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
-            }
-
-            //  Reorder
-            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
-            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
-            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
-            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12
-
-            // store
-            _mm_stream_si128((__m128i*)dst, dst_0);
-            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
-            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
-            _mm_stream_si128(((__m128i*)dst)+3, dst_3);
-
-            // update the pointer
-            dst += 64;
-            src += 2*pitch;
+    for (int i=3 ; i >= 0 ; --i) {
+        // load
+        if (aligned) {
+            src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
+            src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
+            src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
+            src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
+        } else {
+            src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
+            src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
+            src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
+            src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
         }
-	}
-	else
-	{
-        // Build the mask (tranform a u32 to a 4 packets u32)
-        __m128i mask = _mm_cvtsi32_si128(WriteMask);
-        mask = _mm_shuffle_epi32(mask, 0);
-
-        for (int i=3 ; i >= 0 ; --i) {
-            // load
-            if (aligned) {
-                src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
-                src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
-                src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
-                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
-            } else {
-                src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
-                src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
-                src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
-                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
-            }
 
-            // Apply the WriteMask before reordering
-            src_0 = _mm_and_si128(src_0, mask);
-            src_1 = _mm_and_si128(src_1, mask);
-            src_2 = _mm_and_si128(src_2, mask);
-            src_3 = _mm_and_si128(src_3, mask);
-
-            //  Reorder
-            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
-            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
-            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
-            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12
-
-            // Load previous value and apply the ~mask
-            __m128i old_dst_0 = _mm_andnot_si128(mask, _mm_load_si128((__m128i*)dst));
-            __m128i old_dst_1 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+1));
-            __m128i old_dst_2 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+2));
-            __m128i old_dst_3 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+3));
-
-            // Build the final value
-            dst_0 = _mm_or_si128(dst_0, old_dst_0);
-            dst_1 = _mm_or_si128(dst_1, old_dst_1);
-            dst_2 = _mm_or_si128(dst_2, old_dst_2);
-            dst_3 = _mm_or_si128(dst_3, old_dst_3);
-
-            // store
-            _mm_stream_si128((__m128i*)dst, dst_0);
-            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
-            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
-            _mm_stream_si128(((__m128i*)dst)+3, dst_3);
-
-            // update the pointer
-            dst += 64;
-            src += 2*pitch;
-        }
-	}
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
+        //  Reorder
+        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
+        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
+        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
+        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12
+
+        // store
+        _mm_stream_si128((__m128i*)dst, dst_0);
+        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
+        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
+        _mm_stream_si128(((__m128i*)dst)+3, dst_3);
+
+        // update the pointer
+        dst += 64;
+        src += 2*pitch;
+    }
 }
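
Note: each iteration above swizzles a 2x8 strip of 32 bit pixels into one
64 byte column of the destination block; the epi64 unpacks interleave the
8 byte halves of the two source rows. A scalar sketch of the same mapping
(hypothetical helper, shown for reference only):

    // reference reorder for one 2x8 strip of u32 pixels
    static void SwizzleStrip32_ref(u32 *dst, const u32 *row0, const u32 *row1)
    {
        for (int q = 0; q < 4; q++) {      // four 16 byte output groups
            dst[4*q + 0] = row0[2*q + 0];  // 8 bytes from the even row
            dst[4*q + 1] = row0[2*q + 1];
            dst[4*q + 2] = row1[2*q + 0];  // 8 bytes from the odd row
            dst[4*q + 3] = row1[2*q + 1];
        }
    }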
 
 template<bool aligned>
@@ -173,8 +118,6 @@
         dst += 64;
         src += 2*pitch;
     }
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
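
Note: these functions store with _mm_stream_si128 (non-temporal stores) and,
per the WARNING added at the top of the file, no longer fence internally: the
caller issues one sfence after the whole batch instead of one per block. A
minimal caller sketch (hypothetical loop, assuming <emmintrin.h> is included):

    for (int b = 0; b < nblocks; b++)   // swizzle a batch of blocks...
        SwizzleBlock32(dst + 64*b, src + b*src_stride, pitch);
    _mm_sfence();                       // ...then flush the streamed stores once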
 
 // Template the code to improve reuse of code
@@ -256,9 +199,6 @@
     dst += 64;
     src += 4*pitch;
     SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);
-
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
 
 // Template the code to improve reuse of code
@@ -372,130 +312,310 @@
     dst += 64;
     src += 4*pitch;
     SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);
+}
 
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
+template<bool FOUR_BIT, bool UPPER>
+__forceinline void SwizzleBlock8H_4H(u8 *dst, u8 *src, int pitch)
+{
+    __m128i zero_128 = _mm_setzero_si128();
+    __m128i src_0;
+    __m128i src_1;
+    __m128i src_2;
+    __m128i src_3;
+    __m128i src_0_init_H;
+    __m128i src_0_init_L;
+    __m128i src_2_init_H;
+    __m128i src_2_init_L;
+    __m128i src_0_init;
+    __m128i src_2_init;
+
+    __m128i upper_mask = _mm_cvtsi32_si128(0xF0F0F0F0);
+    // Build the write_mask (broadcast the u32 to all 4 packed u32 lanes)
+    __m128i write_mask;
+    if (FOUR_BIT) {
+        if (UPPER) write_mask = _mm_cvtsi32_si128(0xF0000000);
+        else write_mask = _mm_cvtsi32_si128(0x0F000000);
+    } else {
+        write_mask = _mm_cvtsi32_si128(0xFF000000);
+    }
+    write_mask = _mm_shuffle_epi32(write_mask, 0);
+
+    for (int i=3 ; i >= 0 ; --i) {
+        if (FOUR_BIT) {
+            src_0_init = _mm_cvtsi32_si128(*(u32*)src);
+            src_2_init = _mm_cvtsi32_si128(*(u32*)(src + pitch));
+        } else {
+            src_0_init = _mm_loadl_epi64((__m128i*)src);
+            src_2_init = _mm_loadl_epi64((__m128i*)(src + pitch));
+        }
+
+        // Convert to 8 bits
+        if (FOUR_BIT) {
+            src_0_init_H = _mm_and_si128(upper_mask, src_0_init);
+            src_0_init_L = _mm_andnot_si128(upper_mask, src_0_init);
+            src_2_init_H = _mm_and_si128(upper_mask, src_2_init);
+            src_2_init_L = _mm_andnot_si128(upper_mask, src_2_init);
+
+            if (UPPER) {
+                src_0_init_L = _mm_slli_epi32(src_0_init_L, 4);
+                src_2_init_L = _mm_slli_epi32(src_2_init_L, 4);
+            } else {
+                src_0_init_H = _mm_srli_epi32(src_0_init_H, 4);
+                src_2_init_H = _mm_srli_epi32(src_2_init_H, 4);
+            }
+
+            // Repack the src so the texels keep their original order
+            src_0_init = _mm_unpacklo_epi8(src_0_init_L, src_0_init_H);
+            src_2_init = _mm_unpacklo_epi8(src_2_init_L, src_2_init_H);
+        }
+
+        // transform to 16 bits (add 0 in low bits)
+        src_0_init = _mm_unpacklo_epi8(zero_128, src_0_init);
+        src_2_init = _mm_unpacklo_epi8(zero_128, src_2_init);
+
+        // transform to 32 bits (add 0 in low bits)
+        src_0 = _mm_unpacklo_epi16(zero_128, src_0_init);
+        src_1 = _mm_unpackhi_epi16(zero_128, src_0_init);
+        src_2 = _mm_unpacklo_epi16(zero_128, src_2_init);
+        src_3 = _mm_unpackhi_epi16(zero_128, src_2_init);
+
+        // Reorder the data (same as 32 bits format)
+        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
+        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
+        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
+        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);
+
+        // Load previous value and apply the ~write_mask
+        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
+        dst_0 = _mm_or_si128(dst_0, old_dst_0);
+
+        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
+        dst_1 = _mm_or_si128(dst_1, old_dst_1);
+
+        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
+        dst_2 = _mm_or_si128(dst_2, old_dst_2);
+
+        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
+        dst_3 = _mm_or_si128(dst_3, old_dst_3);
+
+        // store
+        _mm_stream_si128((__m128i*)dst, dst_0);
+        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
+        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
+        _mm_stream_si128(((__m128i*)dst)+3, dst_3);
+
+        // update the pointer
+        dst += 64;
+        src += 2*pitch;
+    }
 }
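
Note: the 8H/4H variants expand each 8 bit (or 4 bit) texel into the high byte
(or nibble) of a 32 bit word, reorder it exactly like the 32 bit path, then
merge with the previous destination contents. Per u32 the merge amounts to (a
scalar sketch; the masks are the ones built above):

    // write_mask is 0xFF000000 (8H), 0xF0000000 (4HH) or 0x0F000000 (4HL)
    u32 merged = (swizzled & write_mask) | (old_dst & ~write_mask);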
-#endif
 
 // special swizzle macros - which I converted to functions.
-#ifdef ZEROGS_SSE2
 
-__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_32
-    SwizzleBlock32_sse2_I<true>(dst, src, pitch, WriteMask);
-#else
-	SwizzleBlock32_sse2(dst, src, pitch, WriteMask);
-#endif
+    SwizzleBlock32_sse2_I<true>(dst, src, pitch);
+}
+
+__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
+{
+    __m128i mask_H = _mm_load_si128((__m128i*)mask_24b_H);
+    __m128i mask_L = _mm_load_si128((__m128i*)mask_24b_L);
+    // Build the write_mask (broadcast the u32 to all 4 packed u32 lanes)
+    __m128i write_mask = _mm_cvtsi32_si128(0x00FFFFFF);
+    write_mask = _mm_shuffle_epi32(write_mask, 0);
+
+    for (int i=3 ; i >= 0 ; --i) {
+        //  Note: src can read out of bounds of GS memory (but there is some spare
+        //  allocation to avoid this tricky corner case)
+        __m128i src_0 = _mm_loadu_si128((__m128i*)src);
+        __m128i src_1 = _mm_loadu_si128((__m128i*)(src+12));
+        __m128i src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
+        __m128i src_3 = _mm_loadu_si128((__m128i*)(src+pitch+12));
+
+        // transform 24 bit values into 32 bit ones
+        // 1/ Roughly align the data
+        src_0 = _mm_slli_si128(src_0, 2);
+        src_0 = _mm_shufflelo_epi16(src_0, 0x39);
+
+        src_1 = _mm_slli_si128(src_1, 2);
+        src_1 = _mm_shufflelo_epi16(src_1, 0x39);
+
+        src_2 = _mm_slli_si128(src_2, 2);
+        src_2 = _mm_shufflelo_epi16(src_2, 0x39);
+
+        src_3 = _mm_slli_si128(src_3, 2);
+        src_3 = _mm_shufflelo_epi16(src_3, 0x39);
+
+        // 2/ Mask the 24 bit pixels & finish the conversion
+        __m128i src_0_H = _mm_and_si128(src_0, mask_H);
+        __m128i src_0_L = _mm_and_si128(src_0, mask_L);
+        src_0_H = _mm_slli_si128(src_0_H, 1);
+        src_0 = _mm_or_si128(src_0_H, src_0_L);
+
+        __m128i src_1_H = _mm_and_si128(src_1, mask_H);
+        __m128i src_1_L = _mm_and_si128(src_1, mask_L);
+        src_1_H = _mm_slli_si128(src_1_H, 1);
+        src_1 = _mm_or_si128(src_1_H, src_1_L);
+
+        __m128i src_2_H = _mm_and_si128(src_2, mask_H);
+        __m128i src_2_L = _mm_and_si128(src_2, mask_L);
+        src_2_H = _mm_slli_si128(src_2_H, 1);
+        src_2 = _mm_or_si128(src_2_H, src_2_L);
+
+        __m128i src_3_H = _mm_and_si128(src_3, mask_H);
+        __m128i src_3_L = _mm_and_si128(src_3, mask_L);
+        src_3_H = _mm_slli_si128(src_3_H, 1);
+        src_3 = _mm_or_si128(src_3_H, src_3_L);
+
+        // Reorder the data (same as 32 bits format)
+        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
+        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
+        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
+        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);
+
+        // Load previous value and apply the ~write_mask
+        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
+        dst_0 = _mm_or_si128(dst_0, old_dst_0);
+
+        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
+        dst_1 = _mm_or_si128(dst_1, old_dst_1);
+
+        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
+        dst_2 = _mm_or_si128(dst_2, old_dst_2);
+
+        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
+        dst_3 = _mm_or_si128(dst_3, old_dst_3);
+
+        // store
+        _mm_stream_si128((__m128i*)dst, dst_0);
+        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
+        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
+        _mm_stream_si128(((__m128i*)dst)+3, dst_3);
+
+        // update the pointer
+        dst += 64;
+        src += 2*pitch;
+    }
 }
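
Note: each unaligned load above fetches 12 useful bytes (four 24 bit pixels)
plus 4 bytes of slack, and the shift/shuffle/mask sequence spreads them into
four u32 lanes. Per pixel the net effect is (a scalar sketch with hypothetical
names):

    for (int i = 0; i < 4; i++) {
        u32 rgb = src[3*i] | (src[3*i+1] << 8) | (src[3*i+2] << 16);
        dst[i] = (dst[i] & 0xFF000000) | rgb;  // write_mask = 0x00FFFFFF
    }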
 
-__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_16
-	SwizzleBlock16_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16_sse2_I<true>(dst, src, pitch);
 #else
-	SwizzleBlock16_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16_sse2(dst, src, pitch);
 #endif
 }
 
-__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_8
-	SwizzleBlock8_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8_sse2_I<true>(dst, src, pitch);
 #else
-	SwizzleBlock8_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8_sse2(dst, src, pitch);
 #endif
 }
 
-__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_4
-	SwizzleBlock4_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4_sse2_I<true>(dst, src, pitch);
 #else
-	SwizzleBlock4_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4_sse2(dst, src, pitch);
 #endif
 }
 
-__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
 {
-#ifdef INTRINSIC_PORT_32
-    SwizzleBlock32_sse2_I<false>(dst, src, pitch, WriteMask);
-#else
-	SwizzleBlock32u_sse2(dst, src, pitch, WriteMask);
-#endif
+    SwizzleBlock32_sse2_I<false>(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_16
-	SwizzleBlock16_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16_sse2_I<false>(dst, src, pitch);
 #else
-	SwizzleBlock16u_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16u_sse2(dst, src, pitch);
 #endif
 }
 
-__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_8
-	SwizzleBlock8_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8_sse2_I<false>(dst, src, pitch);
 #else
-	SwizzleBlock8u_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8u_sse2(dst, src, pitch);
 #endif
 }
 
-__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
 {
 #ifdef INTRINSIC_PORT_4
-	SwizzleBlock4_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4_sse2_I<false>(dst, src, pitch);
 #else
-	SwizzleBlock4u_sse2(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4u_sse2(dst, src, pitch);
 #endif
 }
 
+__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
+{
+    SwizzleBlock8H_4H<false, false>(dst, src, pitch);
+}
+
+__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
+{
+    SwizzleBlock8H_4H<true, true>(dst, src, pitch);
+}
+
+__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
+{
+    SwizzleBlock8H_4H<true, false>(dst, src, pitch);
+}
+
 #else
 
-__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock32_c(dst, src, pitch, WriteMask);
+	SwizzleBlock32_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock16_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock8_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock4_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock32_c(dst, src, pitch, WriteMask);
+	SwizzleBlock32_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock16_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock16_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock8_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock8_c(dst, src, pitch);
 }
 
-__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
 {
-	SwizzleBlock4_c(dst, src, pitch/*, WriteMask*/);
+	SwizzleBlock4_c(dst, src, pitch);
 }
 
-__forceinline void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
+__forceinline void __fastcall SwizzleBlock32_mask(u8* dst, u8* src, int srcpitch, u32 WriteMask)
 {
 	u32* d = &g_columnTable32[0][0];
 
@@ -513,26 +633,12 @@
 	}
 }
 
-
-__forceinline void __fastcall SwizzleBlock24_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
+__forceinline void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch)
 {
-	u32* d = &g_columnTable32[0][0];
-
-	if (WriteMask == 0x00ffffff)
-	{
-		for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
-			for (int i = 0; i < 8; i++)
-				((u32*)dst)[d[i]] = ((u32*)src)[i];
-	}
-	else
-	{
-		for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
-			for (int i = 0; i < 8; i++)
-				((u32*)dst)[d[i]] = (((u32*)dst)[d[i]] & ~WriteMask) | (((u32*)src)[i] & WriteMask);
-	}
+    SwizzleBlock32_mask(dst, src, srcpitch, 0xffffffff);
 }
 
-__forceinline void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
+__forceinline void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch)
 {
 	u32* d = &g_columnTable16[0][0];
 
@@ -541,7 +647,7 @@
 			((u16*)dst)[d[i]] = ((u16*)src)[i];
 }
 
-__forceinline void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
+__forceinline void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch)
 {
 	u32* d = &g_columnTable8[0][0];
 
@@ -550,7 +656,7 @@
 			dst[d[i]] = src[i];
 }
 
-__forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch, u32 WriteMask)
+__forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch)
 {
 	u32* d = &g_columnTable4[0][0];
 
@@ -566,13 +672,14 @@
 	}
 }
 
-#endif
-__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
 {
 	u8* pnewsrc = src;
 	u32* pblock = tempblock;
 
-	for (int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch - 24)
+    //  Note: src can read out of bounds of GS memory (but there is some spare
+    //  allocation to avoid this tricky corner case)
+	for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch - 24)
 	{
 		for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
 		{
@@ -580,23 +687,10 @@
 		}
 	}
 
-	for (int bx = 0; bx < 7; ++bx, pnewsrc += 3)
-	{
-		/* might be 1 byte out of bounds of GS memory */
-		pblock[bx] = *(u32*)pnewsrc;
-	}
-
-	/* do 3 bytes for the last copy */
-	*((u8*)pblock + 28) = pnewsrc[0];
-
-	*((u8*)pblock + 29) = pnewsrc[1];
-
-	*((u8*)pblock + 30) = pnewsrc[2];
-
-	SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
+	SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
 }
 
-__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
 {
 	u8* pnewsrc = src;
 	u32* pblock = tempblock;
@@ -615,10 +709,10 @@
 		pblock[7] = u;
 	}
 
-	SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000);
+	SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xff000000);
 }
 
-__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
 {
 	u8* pnewsrc = src;
 	u32* pblock = tempblock;
@@ -636,10 +730,10 @@
 		pblock[7] = u;
 	}
 
-	SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000);
+	SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xf0000000);
 }
 
-__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
 {
 	u8* pnewsrc = src;
 	u32* pblock = tempblock;
@@ -657,6 +751,6 @@
 		pblock[7] = u >> 4;
 	}
 
-	SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
+	SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
 }
-
+#endif

 
