Diff of /trunk/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp

revision 62 by william, Tue Sep 7 11:08:22 2010 UTC → revision 273 by william, Fri Nov 12 01:10:22 2010 UTC
# Line 20 (v.62) / Line 20 (v.273)
#include "GS.h"
#include "Mem.h"
#include "Mem_Swizzle.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif

// Current port of the ASM functions to intrinsics
#define INTRINSIC_PORT_32
#define INTRINSIC_PORT_16
#define INTRINSIC_PORT_8
#define INTRINSIC_PORT_4
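// Note: each INTRINSIC_PORT_* define routes the matching SwizzleBlock*
// wrapper at the bottom of this file to the SwizzleBlockN_sse2_I<> template;
// commenting a define out falls back to the original SwizzleBlockN_sse2 asm
// routine.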
#ifdef ZEROGS_SSE2
template<bool aligned>
__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    if (WriteMask == 0xffffffff) {
        for (int i = 3; i >= 0; --i) {
            // load
            if (aligned) {
                src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
                src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
                src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            } else {
                src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
                src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
                src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            }

            // Reorder
            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

            // store
            _mm_stream_si128((__m128i*)dst, dst_0);
            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
            _mm_stream_si128(((__m128i*)dst)+3, dst_3);

            // update the pointers
            dst += 64;
            src += 2*pitch;
        }
    } else {
        // Build the mask (transform a u32 into four packed u32s)
        __m128i mask = _mm_cvtsi32_si128(WriteMask);
        mask = _mm_shuffle_epi32(mask, 0);
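        // Illustrative example (not from the original source): WriteMask =
        // 0x00ffffff broadcasts to mask = { 0x00ffffff, 0x00ffffff,
        // 0x00ffffff, 0x00ffffff }, so the and/andnot/or sequence below
        // rewrites the low three bytes of each pixel while preserving the old
        // top (alpha) byte already in dst.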

        for (int i = 3; i >= 0; --i) {
            // load
            if (aligned) {
                src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
                src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
                src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
                src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            } else {
                src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
                src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
                src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
                src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
            }

            // Apply the WriteMask before reordering
            src_0 = _mm_and_si128(src_0, mask);
            src_1 = _mm_and_si128(src_1, mask);
            src_2 = _mm_and_si128(src_2, mask);
            src_3 = _mm_and_si128(src_3, mask);

            // Reorder
            __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
            __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
            __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
            __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

            // Load the previous dst value and apply ~mask to it
            __m128i old_dst_0 = _mm_andnot_si128(mask, _mm_load_si128((__m128i*)dst));
            __m128i old_dst_1 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+1));
            __m128i old_dst_2 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+2));
            __m128i old_dst_3 = _mm_andnot_si128(mask, _mm_load_si128(((__m128i*)dst)+3));

            // Build the final value
            dst_0 = _mm_or_si128(dst_0, old_dst_0);
            dst_1 = _mm_or_si128(dst_1, old_dst_1);
            dst_2 = _mm_or_si128(dst_2, old_dst_2);
            dst_3 = _mm_or_si128(dst_3, old_dst_3);

            // store
            _mm_stream_si128((__m128i*)dst, dst_0);
            _mm_stream_si128(((__m128i*)dst)+1, dst_1);
            _mm_stream_si128(((__m128i*)dst)+2, dst_2);
            _mm_stream_si128(((__m128i*)dst)+3, dst_3);

            // update the pointers
            dst += 64;
            src += 2*pitch;
        }
    }
    // FIXME: strictly an sfence is required here, but it would hurt
    // performance; this function is called in a loop, so the fence belongs
    // after that loop instead.
}
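
// A possible caller-side fix, as a sketch (the loop shown is hypothetical):
//     for (int block = 0; block < nblocks; ++block)
//         SwizzleBlock32_sse2_I<true>(dst, src, pitch, mask);
//     _mm_sfence(); // drain all non-temporal stores once, after the loop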

template<bool aligned>
__forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0_L;
    __m128i src_0_H;
    __m128i src_2_L;
    __m128i src_2_H;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0_L = _mm_load_si128((__m128i*)src); // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_load_si128((__m128i*)(src+16)); // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_load_si128((__m128i*)(src+pitch)); // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_load_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        } else {
            src_0_L = _mm_loadu_si128((__m128i*)src); // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_loadu_si128((__m128i*)(src+16)); // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_loadu_si128((__m128i*)(src+pitch)); // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        }

        // Interleave L and H to obtain 32-bit packets
        __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5H 5L 4H 4L 1H 1L 0H 0L
        __m128i dst_1_tmp = _mm_unpacklo_epi16(src_2_L, src_2_H); // 7H 7L 6H 6L 3H 3L 2H 2L
        __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13H 13L 12H 12L 9H 9L 8H 8L
        __m128i dst_3_tmp = _mm_unpackhi_epi16(src_2_L, src_2_H); // 15H 15L 14H 14L 11H 11L 10H 10L

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
    // FIXME: strictly an sfence is required here, but it would hurt
    // performance; this function is called in a loop, so the fence belongs
    // after that loop instead.
}

// Templated to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // load 4 lines of 16 8-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
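    // (0xB1 == _MM_SHUFFLE(2, 3, 0, 1): swaps each pair of adjacent 32-bit words.)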
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shuffle_epi32(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 3 & 1)
        src_3 = _mm_shuffle_epi32(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shuffle_epi32(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 2 & 0)
        src_2 = _mm_shuffle_epi32(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8  5 4 1 0 ... (bytes 2 & 0)
    // src_1 = 13 12 9 8  5 4 1 0 ... (bytes 3 & 1)
    // src_2 = 15 14 11 10  7 6 3 2 ... (bytes 2 & 0)
    // src_3 = 15 14 11 10  7 6 3 2 ... (bytes 3 & 1)

    // Interleave bytes 1 & 0 to obtain 16-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0, src_1); // 13L 12L 9L 8L 5L 4L 1L 0L
    __m128i src_1_L = _mm_unpacklo_epi8(src_2, src_3); // 15L 14L 11L 10L 7L 6L 3L 2L
    // Interleave bytes 3 & 2 to obtain 16-bit packets
    __m128i src_0_H = _mm_unpackhi_epi8(src_0, src_1); // 13H 12H 9H 8H 5H 4H 1H 0H
    __m128i src_1_H = _mm_unpackhi_epi8(src_2, src_3); // 15H 14H 11H 10H 7H 6H 3H 2H

    // Interleave H and L to obtain 32-bit packets
    __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi16(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi16(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

template<bool aligned>
__forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn8_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: strictly an sfence is required here, but it would hurt
    // performance; this function is called in a loop, so the fence belongs
    // after that loop instead.
}

// Templated to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // Build the mask (transform a u32 into four packed u32s)
    const u32 mask_template = 0x0f0f0f0f;
    __m128i mask = _mm_cvtsi32_si128(mask_template);
    mask = _mm_shuffle_epi32(mask, 0);
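    // mask = { 0x0f0f0f0f, ... } selects the low half-byte of every byte;
    // ~mask (via andnot) selects the high half-byte.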

    // load 4 lines of 32 4-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shufflelo_epi16(src_1, 0xB1);
        src_1 = _mm_shufflehi_epi16(src_1, 0xB1); // 13 12 9 8  5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
        src_3 = _mm_shufflelo_epi16(src_3, 0xB1);
        src_3 = _mm_shufflehi_epi16(src_3, 0xB1); // 15 14 11 10  7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shufflelo_epi16(src_0, 0xB1);
        src_0 = _mm_shufflehi_epi16(src_0, 0xB1); // 13 12 9 8  5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
        src_2 = _mm_shufflelo_epi16(src_2, 0xB1);
        src_2 = _mm_shufflehi_epi16(src_2, 0xB1); // 15 14 11 10  7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8  5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
    // src_1 = 13 12 9 8  5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
    // src_2 = 15 14 11 10  7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    // src_3 = 15 14 11 10  7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)

    // ** Interleave half-bytes to obtain 8-bit packets
    // Shift the values to ease the 4-bit filtering.
    // Note: a packed 64-bit shift is used, since it allows a 4-bit shift.
    __m128i src_0_shift = _mm_srli_epi64(src_0, 4); // ? 13 12 9   8 5 4 1 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_1_shift = _mm_slli_epi64(src_1, 4); // 12 9 8 5    4 1 0 ? ... (half-bytes 7 & 5 & 3 & 1)
    __m128i src_2_shift = _mm_srli_epi64(src_2, 4); // ? 15 14 11  10 7 6 3 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_3_shift = _mm_slli_epi64(src_3, 4); // 14 11 10 7  6 3 2 ? ... (half-bytes 7 & 5 & 3 & 1)
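    // SSE2 has no per-byte shift, so the 64-bit lane shifts above move data
    // by 4 bits across byte boundaries; the 0x0f mask filters the stray bits
    // back out in the merges below.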

    // 12 - 8 - 4 - 0 - (HB odd) || - 12 - 8 - 4 - 0 (HB even) => 12 8 4 0 (bytes 3 & 2 & 1 & 0)
    src_0 = _mm_or_si128(_mm_andnot_si128(mask, src_1_shift), _mm_and_si128(mask, src_0));
    // - 13 - 9 - 5 - 1 (HB even) || 13 - 9 - 5 - 1 - (HB odd) => 13 9 5 1 (bytes 3 & 2 & 1 & 0)
    src_1 = _mm_or_si128(_mm_and_si128(mask, src_0_shift), _mm_andnot_si128(mask, src_1));

    // 14 - 10 - 6 - 2 - (HB odd) || - 14 - 10 - 6 - 2 (HB even) => 14 10 6 2 (bytes 3 & 2 & 1 & 0)
    src_2 = _mm_or_si128(_mm_andnot_si128(mask, src_3_shift), _mm_and_si128(mask, src_2));
    // - 15 - 11 - 7 - 3 (HB even) || 15 - 11 - 7 - 3 - (HB odd) => 15 11 7 3 (bytes 3 & 2 & 1 & 0)
    src_3 = _mm_or_si128(_mm_and_si128(mask, src_2_shift), _mm_andnot_si128(mask, src_3));

    // reorder the 8-bit packets
    __m128i src_0_tmp = _mm_unpacklo_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 1 & 0)
    __m128i src_1_tmp = _mm_unpackhi_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 3 & 2)
    __m128i src_2_tmp = _mm_unpacklo_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 1 & 0)
    __m128i src_3_tmp = _mm_unpackhi_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 3 & 2)

    // interleave bytes to obtain 32-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0_tmp, src_1_tmp); // 2.13 0.13 2.12 0.12 2.9 0.9 2.8 0.8 2.5 0.5 2.4 0.4 2.1 0.1 2.0 0.0
    __m128i src_0_H = _mm_unpackhi_epi8(src_0_tmp, src_1_tmp); // 3.13 1.13 3.12 1.12 3.9 1.9 3.8 1.8 3.5 1.5 3.4 1.4 3.1 1.1 3.0 1.0
    __m128i src_1_L = _mm_unpacklo_epi8(src_2_tmp, src_3_tmp); // 2.15 0.15 2.14 0.14 2.11 0.11 2.10 0.10 2.7 0.7 2.6 0.6 2.3 0.3 2.2 0.2
    __m128i src_1_H = _mm_unpackhi_epi8(src_2_tmp, src_3_tmp); // 3.15 1.15 3.14 1.14 3.11 1.11 3.10 1.10 3.7 1.7 3.6 1.6 3.3 1.3 3.2 1.2

    __m128i dst_0_tmp = _mm_unpacklo_epi8(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi8(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi8(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi8(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

template<bool aligned>
__forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn4_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: strictly an sfence is required here, but it would hurt
    // performance; this function is called in a loop, so the fence belongs
    // after that loop instead.
}
#endif

// special swizzle macros - which I converted to functions.
#ifdef ZEROGS_SSE2

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_32
    SwizzleBlock32_sse2_I<true>(dst, src, pitch, WriteMask);
#else
    SwizzleBlock32_sse2(dst, src, pitch, WriteMask);
#endif
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock16_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock8_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<true>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock4_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

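// The *u wrappers below differ only in instantiating the templates with
// aligned == false (unaligned loads); the non-temporal stores still require
// an aligned dst.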
__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_32
    SwizzleBlock32_sse2_I<false>(dst, src, pitch, WriteMask);
#else
    SwizzleBlock32u_sse2(dst, src, pitch, WriteMask);
#endif
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock16u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock8u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<false>(dst, src, pitch/*, WriteMask*/);
#else
    SwizzleBlock4u_sse2(dst, src, pitch/*, WriteMask*/);
#endif
}

#else
# Line 270 (v.62) / Line 659 (v.273), in SwizzleBlock4HL

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
}
