/* ZZ Open GL graphics plugin
 * Copyright (c)2009-2010 zeydlitz@gmail.com, arcum42@gmail.com
 * Based on Zerofrog's ZeroGS KOSMOS (c)2005-2008
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "GS.h"
#include "Mem.h"
#include "Mem_Swizzle.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif

// WARNING: an sfence instruction must be issued after calling the SwizzleBlock
// SSE2 functions below (they write through non-temporal stores).

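// Illustrative caller-side sketch (an assumption about intended usage, derived
// from the warning above; dst/src/pitch are hypothetical):
//
//     SwizzleBlock32(dst, src, pitch); // any SwizzleBlock* SSE2 variant
//     _mm_sfence();                    // drain the non-temporal stores
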
// Current port of the ASM functions to intrinsics. The INTRINSIC_PORT_* defines
// select these ports instead of the external SSE2 assembly implementations.
#define INTRINSIC_PORT_16
#define INTRINSIC_PORT_8
#define INTRINSIC_PORT_4
#ifdef ZEROGS_SSE2
static const __aligned16 u32 mask_24b_H[4] = {0xFF000000, 0x0000FFFF, 0xFF000000, 0x0000FFFF};
static const __aligned16 u32 mask_24b_L[4] = {0x00FFFFFF, 0x00000000, 0x00FFFFFF, 0x00000000};

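// Reading aid (our interpretation of the index comments below, not authoritative
// GS documentation): the numbers name pixel positions within the current group
// of source rows, highest SSE lane first; e.g. "5 4 1 0" means the register
// holds pixels 5, 4, 1 and 0.
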
template<bool aligned>
__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0 = _mm_load_si128((__m128i*)src); // 5 4 1 0
            src_1 = _mm_load_si128((__m128i*)(src+16)); // 13 12 9 8
            src_2 = _mm_load_si128((__m128i*)(src+pitch)); // 7 6 3 2
            src_3 = _mm_load_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
        } else {
            src_0 = _mm_loadu_si128((__m128i*)src); // 5 4 1 0
            src_1 = _mm_loadu_si128((__m128i*)(src+16)); // 13 12 9 8
            src_2 = _mm_loadu_si128((__m128i*)(src+pitch)); // 7 6 3 2
            src_3 = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15 14 11 10
        }

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

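// Sketch of the reorder above, using the same pixel indices: with
//     src_0 = | 5 4 | 1 0 |   and   src_2 = | 7 6 | 3 2 |,
// _mm_unpacklo_epi64(src_0, src_2) yields | 3 2 | 1 0 | and
// _mm_unpackhi_epi64(src_0, src_2) yields | 7 6 | 5 4 |,
// i.e. the two rows of a swizzled column end up sequential in memory.
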
template<bool aligned>
__forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0_L;
    __m128i src_0_H;
    __m128i src_2_L;
    __m128i src_2_H;

    for (int i = 3; i >= 0; --i) {
        // load
        if (aligned) {
            src_0_L = _mm_load_si128((__m128i*)src); // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_load_si128((__m128i*)(src+16)); // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_load_si128((__m128i*)(src+pitch)); // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_load_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        } else {
            src_0_L = _mm_loadu_si128((__m128i*)src); // 13L 12L 9L 8L 5L 4L 1L 0L
            src_0_H = _mm_loadu_si128((__m128i*)(src+16)); // 13H 12H 9H 8H 5H 4H 1H 0H
            src_2_L = _mm_loadu_si128((__m128i*)(src+pitch)); // 15L 14L 11L 10L 7L 6L 3L 2L
            src_2_H = _mm_loadu_si128((__m128i*)(src+16+pitch)); // 15H 14H 11H 10H 7H 6H 3H 2H
        }

        // Interleave L and H to obtain 32-bit packets
        __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5H 5L 4H 4L 1H 1L 0H 0L
        __m128i dst_1_tmp = _mm_unpacklo_epi16(src_2_L, src_2_H); // 7H 7L 6H 6L 3H 3L 2H 2L
        __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13H 13L 12H 12L 9H 9L 8H 8L
        __m128i dst_3_tmp = _mm_unpackhi_epi16(src_2_L, src_2_H); // 15H 15L 14H 14L 11H 11L 10H 10L

        // Reorder
        __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
        __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
        __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
        __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

// Template the code to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // load 4 lines of 16 8-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shuffle_epi32(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 3 & 1)
        src_3 = _mm_shuffle_epi32(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shuffle_epi32(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (bytes 2 & 0)
        src_2 = _mm_shuffle_epi32(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (bytes 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (bytes 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (bytes 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (bytes 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (bytes 3 & 1)

    // Interleave bytes 1 & 0 to obtain 16-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0, src_1); // 13L 12L 9L 8L 5L 4L 1L 0L
    __m128i src_1_L = _mm_unpacklo_epi8(src_2, src_3); // 15L 14L 11L 10L 7L 6L 3L 2L
    // Interleave bytes 3 & 2 to obtain 16-bit packets
    __m128i src_0_H = _mm_unpackhi_epi8(src_0, src_1); // 13H 12H 9H 8H 5H 4H 1H 0H
    __m128i src_1_H = _mm_unpackhi_epi8(src_2, src_3); // 15H 14H 11H 10H 7H 6H 3H 2H

    // Interleave H and L to obtain 32-bit packets
    __m128i dst_0_tmp = _mm_unpacklo_epi16(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi16(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi16(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi16(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

template<bool aligned>
__forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn8_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);
}

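// The 8-bit block is handled as four columns: each SwizzleColumn8_sse2_I call
// above consumes 4 source lines (4*pitch) and fills 64 bytes of destination.
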
// Template the code to improve code reuse
template<bool aligned, u32 INDEX>
__forceinline void SwizzleColumn4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;

    // Build a mask (broadcast a u32 to 4 packed u32)
    const u32 mask_template = 0x0f0f0f0f;
    __m128i mask = _mm_cvtsi32_si128(mask_template);
    mask = _mm_shuffle_epi32(mask, 0);
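    // mask now holds 0x0f0f0f0f in every lane: it selects the low (even) nibble
    // of each byte in the and/andnot filtering below.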

    // load 4 lines of 32 4-bit packets
    if (aligned) {
        src_0 = _mm_load_si128((__m128i*)src);
        src_2 = _mm_load_si128((__m128i*)(src+pitch));
        src_1 = _mm_load_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_load_si128((__m128i*)(src+3*pitch));
    } else {
        src_0 = _mm_loadu_si128((__m128i*)src);
        src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        src_1 = _mm_loadu_si128((__m128i*)(src+2*pitch));
        src_3 = _mm_loadu_si128((__m128i*)(src+3*pitch));
    }

    // shuffle 2 lines to align pixels
    if (INDEX == 0 || INDEX == 2) {
        src_1 = _mm_shufflelo_epi16(src_1, 0xB1);
        src_1 = _mm_shufflehi_epi16(src_1, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
        src_3 = _mm_shufflelo_epi16(src_3, 0xB1);
        src_3 = _mm_shufflehi_epi16(src_3, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)
    } else if (INDEX == 1 || INDEX == 3) {
        src_0 = _mm_shufflelo_epi16(src_0, 0xB1);
        src_0 = _mm_shufflehi_epi16(src_0, 0xB1); // 13 12 9 8 5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
        src_2 = _mm_shufflelo_epi16(src_2, 0xB1);
        src_2 = _mm_shufflehi_epi16(src_2, 0xB1); // 15 14 11 10 7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    } else {
        assert(0);
    }
    // src_0 = 13 12 9 8 5 4 1 0 ... (half-bytes 6 & 4 & 2 & 0)
    // src_1 = 13 12 9 8 5 4 1 0 ... (half-bytes 7 & 5 & 3 & 1)
    // src_2 = 15 14 11 10 7 6 3 2 ... (half-bytes 6 & 4 & 2 & 0)
    // src_3 = 15 14 11 10 7 6 3 2 ... (half-bytes 7 & 5 & 3 & 1)

    // ** Interleave half-bytes to obtain 8-bit packets
    // Shift the values to ease the 4-bit filtering.
    // Note: a 64-bit packed shift is used because it allows a 4-bit shift
    __m128i src_0_shift = _mm_srli_epi64(src_0, 4); // ? 13 12 9 8 5 4 1 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_1_shift = _mm_slli_epi64(src_1, 4); // 12 9 8 5 4 1 0 ? ... (half-bytes 7 & 5 & 3 & 1)
    __m128i src_2_shift = _mm_srli_epi64(src_2, 4); // ? 15 14 11 10 7 6 3 ... (half-bytes 6 & 4 & 2 & 0)
    __m128i src_3_shift = _mm_slli_epi64(src_3, 4); // 14 11 10 7 6 3 2 ? ... (half-bytes 7 & 5 & 3 & 1)

    // 12 - 8 - 4 - 0 - (HB odd) || - 12 - 8 - 4 - 0 (HB even) => 12 8 4 0 (bytes 3 & 2 & 1 & 0)
    src_0 = _mm_or_si128(_mm_andnot_si128(mask, src_1_shift), _mm_and_si128(mask, src_0));
    // - 13 - 9 - 5 - 1 (HB even) || 13 - 9 - 5 - 1 - (HB odd) => 13 9 5 1 (bytes 3 & 2 & 1 & 0)
    src_1 = _mm_or_si128(_mm_and_si128(mask, src_0_shift), _mm_andnot_si128(mask, src_1));

    // 14 - 10 - 6 - 2 - (HB odd) || - 14 - 10 - 6 - 2 (HB even) => 14 10 6 2 (bytes 3 & 2 & 1 & 0)
    src_2 = _mm_or_si128(_mm_andnot_si128(mask, src_3_shift), _mm_and_si128(mask, src_2));
    // - 15 - 11 - 7 - 3 (HB even) || 15 - 11 - 7 - 3 - (HB odd) => 15 11 7 3 (bytes 3 & 2 & 1 & 0)
    src_3 = _mm_or_si128(_mm_and_si128(mask, src_2_shift), _mm_andnot_si128(mask, src_3));

    // reorder the 8-bit packets
    __m128i src_0_tmp = _mm_unpacklo_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 1 & 0)
    __m128i src_1_tmp = _mm_unpackhi_epi8(src_0, src_1); // 13 12 9 8 5 4 1 0 (bytes 3 & 2)
    __m128i src_2_tmp = _mm_unpacklo_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 1 & 0)
    __m128i src_3_tmp = _mm_unpackhi_epi8(src_2, src_3); // 15 14 11 10 7 6 3 2 (bytes 3 & 2)

    // interleave bytes to obtain 32-bit packets
    __m128i src_0_L = _mm_unpacklo_epi8(src_0_tmp, src_1_tmp); // 2.13 0.13 2.12 0.12 2.9 0.9 2.8 0.8 2.5 0.5 2.4 0.4 2.1 0.1 2.0 0.0
    __m128i src_0_H = _mm_unpackhi_epi8(src_0_tmp, src_1_tmp); // 3.13 1.13 3.12 1.12 3.9 1.9 3.8 1.8 3.5 1.5 3.4 1.4 3.1 1.1 3.0 1.0
    __m128i src_1_L = _mm_unpacklo_epi8(src_2_tmp, src_3_tmp); // 2.15 0.15 2.14 0.14 2.11 0.11 2.10 0.10 2.7 0.7 2.6 0.6 2.3 0.3 2.2 0.2
    __m128i src_1_H = _mm_unpackhi_epi8(src_2_tmp, src_3_tmp); // 3.15 1.15 3.14 1.14 3.11 1.11 3.10 1.10 3.7 1.7 3.6 1.6 3.3 1.3 3.2 1.2

    __m128i dst_0_tmp = _mm_unpacklo_epi8(src_0_L, src_0_H); // 5 4 1 0
    __m128i dst_1_tmp = _mm_unpacklo_epi8(src_1_L, src_1_H); // 7 6 3 2
    __m128i dst_2_tmp = _mm_unpackhi_epi8(src_0_L, src_0_H); // 13 12 9 8
    __m128i dst_3_tmp = _mm_unpackhi_epi8(src_1_L, src_1_H); // 15 14 11 10

    // Reorder the 32-bit packets
    __m128i dst_0 = _mm_unpacklo_epi64(dst_0_tmp, dst_1_tmp); // 3 2 1 0
    __m128i dst_1 = _mm_unpackhi_epi64(dst_0_tmp, dst_1_tmp); // 7 6 5 4
    __m128i dst_2 = _mm_unpacklo_epi64(dst_2_tmp, dst_3_tmp); // 11 10 9 8
    __m128i dst_3 = _mm_unpackhi_epi64(dst_2_tmp, dst_3_tmp); // 15 14 13 12

    // store
    _mm_stream_si128((__m128i*)dst, dst_0);
    _mm_stream_si128(((__m128i*)dst)+1, dst_1);
    _mm_stream_si128(((__m128i*)dst)+2, dst_2);
    _mm_stream_si128(((__m128i*)dst)+3, dst_3);
}

template<bool aligned>
__forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
{
    SwizzleColumn4_sse2_I<aligned, 0>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 1>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 2>(dst, src, pitch);

    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);
}

template<bool FOUR_BIT, bool UPPER>
__forceinline void SwizzleBlock8H_4H(u8 *dst, u8 *src, int pitch)
{
    __m128i zero_128 = _mm_setzero_si128();
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;
    __m128i src_0_init_H;
    __m128i src_0_init_L;
    __m128i src_2_init_H;
    __m128i src_2_init_L;
    __m128i src_0_init;
    __m128i src_2_init;

    __m128i upper_mask = _mm_cvtsi32_si128(0xF0F0F0F0);
    // Build the write_mask (broadcast a u32 to 4 packed u32)
    __m128i write_mask;
    if (FOUR_BIT) {
        if (UPPER) write_mask = _mm_cvtsi32_si128(0xF0000000);
        else write_mask = _mm_cvtsi32_si128(0x0F000000);
    } else {
        write_mask = _mm_cvtsi32_si128(0xFF000000);
    }
    write_mask = _mm_shuffle_epi32(write_mask, 0);
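    // write_mask now selects, in every 32-bit texel, the destination lane this
    // format updates: 0xFF000000 for 8H, 0xF0000000 for 4HH, 0x0F000000 for 4HL.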

    for (int i = 3; i >= 0; --i) {
        if (FOUR_BIT) {
            src_0_init = _mm_cvtsi32_si128(*(u32*)src);
            src_2_init = _mm_cvtsi32_si128(*(u32*)(src + pitch));
        } else {
            src_0_init = _mm_loadl_epi64((__m128i*)src);
            src_2_init = _mm_loadl_epi64((__m128i*)(src + pitch));
        }

        // Convert to 8 bits
        if (FOUR_BIT) {
            src_0_init_H = _mm_and_si128(upper_mask, src_0_init);
            src_0_init_L = _mm_andnot_si128(upper_mask, src_0_init);
            src_2_init_H = _mm_and_si128(upper_mask, src_2_init);
            src_2_init_L = _mm_andnot_si128(upper_mask, src_2_init);

            if (UPPER) {
                src_0_init_L = _mm_slli_epi32(src_0_init_L, 4);
                src_2_init_L = _mm_slli_epi32(src_2_init_L, 4);
            } else {
                src_0_init_H = _mm_srli_epi32(src_0_init_H, 4);
                src_2_init_H = _mm_srli_epi32(src_2_init_H, 4);
            }

            // Repack the source, keeping the half-byte order
            src_0_init = _mm_unpacklo_epi8(src_0_init_L, src_0_init_H);
            src_2_init = _mm_unpacklo_epi8(src_2_init_L, src_2_init_H);
        }

        // transform to 16 bits (zeros in the low bits)
        src_0_init = _mm_unpacklo_epi8(zero_128, src_0_init);
        src_2_init = _mm_unpacklo_epi8(zero_128, src_2_init);

        // transform to 32 bits (zeros in the low bits)
        src_0 = _mm_unpacklo_epi16(zero_128, src_0_init);
        src_1 = _mm_unpackhi_epi16(zero_128, src_0_init);
        src_2 = _mm_unpacklo_epi16(zero_128, src_2_init);
        src_3 = _mm_unpackhi_epi16(zero_128, src_2_init);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

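// The wrappers below map the GS formats onto this template: 8H instantiates
// <false, false>, 4HH <true, true> and 4HL <true, false>, matching the write
// masks built above.
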
// special swizzle macros - which I converted to functions.

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_sse2_I<true>(dst, src, pitch);
}

__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
{
    __m128i mask_H = _mm_load_si128((__m128i*)mask_24b_H);
    __m128i mask_L = _mm_load_si128((__m128i*)mask_24b_L);
    // Build the write_mask (broadcast a u32 to 4 packed u32)
    __m128i write_mask = _mm_cvtsi32_si128(0x00FFFFFF);
    write_mask = _mm_shuffle_epi32(write_mask, 0);

    for (int i = 3; i >= 0; --i) {
        // Note: src can read out of bounds of GS memory (but there is some spare
        // allocation to avoid a tricky corner case)
        __m128i src_0 = _mm_loadu_si128((__m128i*)src);
        __m128i src_1 = _mm_loadu_si128((__m128i*)(src+12));
        __m128i src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        __m128i src_3 = _mm_loadu_si128((__m128i*)(src+pitch+12));

        // transform the 24-bit values into 32-bit ones
        // 1/ Roughly align the data
        src_0 = _mm_slli_si128(src_0, 2);
        src_0 = _mm_shufflelo_epi16(src_0, 0x39);

        src_1 = _mm_slli_si128(src_1, 2);
        src_1 = _mm_shufflelo_epi16(src_1, 0x39);

        src_2 = _mm_slli_si128(src_2, 2);
        src_2 = _mm_shufflelo_epi16(src_2, 0x39);

        src_3 = _mm_slli_si128(src_3, 2);
        src_3 = _mm_shufflelo_epi16(src_3, 0x39);
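        // After the shift/shuffle, each 64-bit half of src_N holds two
        // consecutive 24-bit pixels in its low 6 bytes; step 2 spreads them so
        // that every 32-bit lane carries one pixel in its low 24 bits.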

        // 2/ Filter the 24-bit pixels and do the conversion
        __m128i src_0_H = _mm_and_si128(src_0, mask_H);
        __m128i src_0_L = _mm_and_si128(src_0, mask_L);
        src_0_H = _mm_slli_si128(src_0_H, 1);
        src_0 = _mm_or_si128(src_0_H, src_0_L);

        __m128i src_1_H = _mm_and_si128(src_1, mask_H);
        __m128i src_1_L = _mm_and_si128(src_1, mask_L);
        src_1_H = _mm_slli_si128(src_1_H, 1);
        src_1 = _mm_or_si128(src_1_H, src_1_L);

        __m128i src_2_H = _mm_and_si128(src_2, mask_H);
        __m128i src_2_L = _mm_and_si128(src_2, mask_L);
        src_2_H = _mm_slli_si128(src_2_H, 1);
        src_2 = _mm_or_si128(src_2_H, src_2_L);

        __m128i src_3_H = _mm_and_si128(src_3, mask_H);
        __m128i src_3_L = _mm_and_si128(src_3, mask_L);
        src_3_H = _mm_slli_si128(src_3_H, 1);
        src_3 = _mm_or_si128(src_3_H, src_3_L);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock16_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock8_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<true>(dst, src, pitch);
#else
    SwizzleBlock4_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_sse2_I<false>(dst, src, pitch);
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_16
    SwizzleBlock16_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock16u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_8
    SwizzleBlock8_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock8u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
{
#ifdef INTRINSIC_PORT_4
    SwizzleBlock4_sse2_I<false>(dst, src, pitch);
#else
    SwizzleBlock4u_sse2(dst, src, pitch);
#endif
}

__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<false, false>(dst, src, pitch);
}

__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<true, true>(dst, src, pitch);
}

__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8H_4H<true, false>(dst, src, pitch);
}

#else

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_c(dst, src, pitch);
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock16_c(dst, src, pitch);
}

__forceinline void SwizzleBlock8(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8_c(dst, src, pitch);
}

__forceinline void SwizzleBlock4(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock4_c(dst, src, pitch);
}

__forceinline void SwizzleBlock32u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock32_c(dst, src, pitch);
}

__forceinline void SwizzleBlock16u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock16_c(dst, src, pitch);
}

__forceinline void SwizzleBlock8u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock8_c(dst, src, pitch);
}

__forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch)
{
    SwizzleBlock4_c(dst, src, pitch);
}

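// g_columnTable32[j][i] gives the swizzled u32 offset of the linear pixel at
// (row j, column i) inside one 8x8 block; the other g_columnTable* arrays play
// the same role for the narrower formats.
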
__forceinline void __fastcall SwizzleBlock32_mask(u8* dst, u8* src, int srcpitch, u32 WriteMask)
{
    u32* d = &g_columnTable32[0][0];

    if (WriteMask == 0xffffffff)
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = ((u32*)src)[i];
    }
    else
    {
        for (int j = 0; j < 8; j++, d += 8, src += srcpitch)
            for (int i = 0; i < 8; i++)
                ((u32*)dst)[d[i]] = (((u32*)dst)[d[i]] & ~WriteMask) | (((u32*)src)[i] & WriteMask);
    }
}

__forceinline void __fastcall SwizzleBlock32_c(u8* dst, u8* src, int srcpitch)
{
    SwizzleBlock32_mask(dst, src, srcpitch, 0xffffffff);
}

__forceinline void __fastcall SwizzleBlock16_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable16[0][0];

    for (int j = 0; j < 8; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            ((u16*)dst)[d[i]] = ((u16*)src)[i];
}

__forceinline void __fastcall SwizzleBlock8_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable8[0][0];

    for (int j = 0; j < 16; j++, d += 16, src += srcpitch)
        for (int i = 0; i < 16; i++)
            dst[d[i]] = src[i];
}

__forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch)
{
    u32* d = &g_columnTable4[0][0];

    for (int j = 0; j < 16; j++, d += 32, src += srcpitch)
    {
        for (int i = 0; i < 32; i++)
        {
            u32 addr = d[i];
            u8 c = (src[i>>1] >> ((i & 1) << 2)) & 0x0f;
            u32 shift = (addr & 1) << 2;
            dst[addr >> 1] = (dst[addr >> 1] & (0xf0 >> shift)) | (c << shift);
        }
    }
}

__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    // Note: src can read out of bounds of GS memory (but there is some spare
    // allocation to avoid a tricky corner case)
    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch - 24)
    {
        for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
        {
            pblock[bx] = *(u32*)pnewsrc;
        }
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
}

__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 16;
        pblock[2] = u << 8;
        pblock[3] = u;
        u = *(u32*)(pnewsrc + 4);
        pblock[4] = u << 24;
        pblock[5] = u << 16;
        pblock[6] = u << 8;
        pblock[7] = u;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xff000000);
}

__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 28;
        pblock[1] = u << 24;
        pblock[2] = u << 20;
        pblock[3] = u << 16;
        pblock[4] = u << 12;
        pblock[5] = u << 8;
        pblock[6] = u << 4;
        pblock[7] = u;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0xf0000000);
}

__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch)
    {
        u32 u = *(u32*)pnewsrc;
        pblock[0] = u << 24;
        pblock[1] = u << 20;
        pblock[2] = u << 16;
        pblock[3] = u << 12;
        pblock[4] = u << 8;
        pblock[5] = u << 4;
        pblock[6] = u;
        pblock[7] = u >> 4;
    }

    SwizzleBlock32_mask((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
}
#endif
