/[pcsx2_0.9.7]/trunk/pcsx2/IPU/yuv2rgb.cpp
ViewVC logotype

Contents of /trunk/pcsx2/IPU/yuv2rgb.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 62 - (show annotations) (download)
Tue Sep 7 11:08:22 2010 UTC (9 years, 10 months ago) by william
File size: 12306 byte(s)
Auto Committed Import of: pcsx2-0.9.7-r3738-debug in ./trunk
1 /* PCSX2 - PS2 Emulator for PCs
2 * Copyright (C) 2002-2010 PCSX2 Dev Team
3 *
4 * PCSX2 is free software: you can redistribute it and/or modify it under the terms
5 * of the GNU Lesser General Public License as published by the Free Software Found-
6 * ation, either version 3 of the License, or (at your option) any later version.
7 *
8 * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
10 * PURPOSE. See the GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License along with PCSX2.
13 * If not, see <http://www.gnu.org/licenses/>.
14 */
15
16
17 // IPU-correct yuv conversions by Pseudonym
18 // SSE2 Implementation by Pseudonym
19
20 #include "PrecompiledHeader.h"
21
22 #include "Common.h"
23 #include "IPU.h"
24 #include "yuv2rgb.h"
25 #include "mpeg2lib/Mpeg.h"
26
27 // The IPU's colour space conversion conforms to ITU-R Recommendation BT.601 if anyone wants to make a
28 // faster or "more accurate" implementation, but this is the precise documented integer method used by
29 // the hardware and is fast enough with SSE2.
30
// Fixed-point constants from the IPU hardware specification. Coefficients
// are scaled by 128 (e.g. 0x95 == 149, 149/128 == 1.1640625, shown in the
// trailing comments); the reference code applies them with >>6 followed by
// a final rounding >>1.
#define IPU_Y_BIAS 16
#define IPU_C_BIAS 128
#define IPU_Y_COEFF 0x95 // 1.1640625
#define IPU_GCR_COEFF -0x68 // -0.8125
#define IPU_GCB_COEFF -0x32 // -0.390625
#define IPU_RCR_COEFF 0xcc // 1.59375
#define IPU_BCB_COEFF 0x102 // 2.015625
38
39 // conforming implementation for reference, do not optimise
40 void yuv2rgb_reference(void)
41 {
42 const macroblock_8& mb8 = decoder.mb8;
43 macroblock_rgb32& rgb32 = decoder.rgb32;
44
45 for (int y = 0; y < 16; y++)
46 for (int x = 0; x < 16; x++)
47 {
48 s32 lum = (IPU_Y_COEFF * (max(0, (s32)mb8.Y[y][x] - IPU_Y_BIAS))) >> 6;
49 s32 rcr = (IPU_RCR_COEFF * ((s32)mb8.Cr[y>>1][x>>1] - 128)) >> 6;
50 s32 gcr = (IPU_GCR_COEFF * ((s32)mb8.Cr[y>>1][x>>1] - 128)) >> 6;
51 s32 gcb = (IPU_GCB_COEFF * ((s32)mb8.Cb[y>>1][x>>1] - 128)) >> 6;
52 s32 bcb = (IPU_BCB_COEFF * ((s32)mb8.Cb[y>>1][x>>1] - 128)) >> 6;
53
54 rgb32.c[y][x].r = max(0, min(255, (lum + rcr + 1) >> 1));
55 rgb32.c[y][x].g = max(0, min(255, (lum + gcr + gcb + 1) >> 1));
56 rgb32.c[y][x].b = max(0, min(255, (lum + bcb + 1) >> 1));
57 rgb32.c[y][x].a = 0x80; // the norm to save doing this on the alpha pass
58 }
59 }
60
61 // Everything below is bit accurate to the IPU specification (except maybe rounding).
62 // Know the specification before you touch it.
// Helpers to replicate a constant across a full XMM register's worth of
// bytes/words. SSE_COEFFICIENTS pre-shifts the coefficient left by 2 so
// that pmulhw's implicit >>16, applied to (value<<8) inputs, matches the
// reference implementation's >>6 scaling.
#define SSE_BYTES(x) {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}
#define SSE_WORDS(x) {x, x, x, x, x, x, x, x}
#define SSE_COEFFICIENTS(x) SSE_WORDS((x)<<2)

// Constant tables for the SSE2 path. The field order matters: the asm keeps
// a base pointer at (&sse2_tables + 64) so that every field is reachable
// with a one-byte signed displacement (offsets noted here match the
// C_BIAS..BCb_COEFF enum).
struct SSE2_Tables
{
	u16 C_bias[8];          // offset -64
	u8 Y_bias[16];          // offset -48
	u16 Y_mask[8];          // offset -32
	u16 round_1bit[8];      // offset -16

	s16 Y_coefficients[8];  // offset 0
	s16 GCr_coefficients[8];// offset 16
	s16 GCb_coefficients[8];// offset 32
	s16 RCr_coefficients[8];// offset 48
	s16 BCb_coefficients[8];// offset 64
};
80
// Byte offsets of each sse2_tables field relative to (&sse2_tables + 64),
// the base pointer the asm holds in edx (MSVC) / %[sse2_tables] (GCC).
enum
{
	C_BIAS = -0x40,
	Y_BIAS = -0x30,
	Y_MASK = -0x20,
	ROUND_1BIT = -0x10,

	Y_COEFF = 0x00,
	GCr_COEFF = 0x10,
	GCb_COEFF = 0x20,
	RCr_COEFF = 0x30,
	BCb_COEFF = 0x40
};
94
static const __aligned16 SSE2_Tables sse2_tables =
{
	// c_bias: xor-ing (chroma<<8) with 0x8000 flips the sign bit, which is
	// equivalent to subtracting 128<<8 (see the pxor in the asm below).
	SSE_WORDS(0x8000),           // c_bias
	SSE_BYTES(IPU_Y_BIAS),       // y_bias
	// y_mask selects the odd pixels' luma bytes (already in the high byte).
	SSE_WORDS(0xff00),           // y_mask

	// Specifying round off instead of round down as everywhere else
	// implies that this is right
	SSE_WORDS(1),                // round_1bit

	SSE_COEFFICIENTS(IPU_Y_COEFF),
	SSE_COEFFICIENTS(IPU_GCR_COEFF),
	SSE_COEFFICIENTS(IPU_GCB_COEFF),
	SSE_COEFFICIENTS(IPU_RCR_COEFF),
	SSE_COEFFICIENTS(IPU_BCB_COEFF),
};

// Scratch space for the three chroma product rows (RCr, GCr+GCb, BCb)
// computed on the "tworows" pass and re-read by the "onerow" pass.
static __aligned16 u16 yuv2rgb_temp[3][8];
113
114 // This could potentially be improved for SSE4
115 __ri void yuv2rgb_sse2(void)
116 {
117 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
118 __asm {
119 mov eax, 1
120 xor esi, esi
121 xor edi, edi
122
123 // Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
124 // This saves 2-3 bytes per instruction where these are used. :)
125 mov ecx, offset yuv2rgb_temp
126 mov edx, offset sse2_tables+64;
127
128 align 16
129 tworows:
130 movq xmm3, qword ptr [decoder.mb8+256+esi]
131 movq xmm1, qword ptr [decoder.mb8+320+esi]
132 pxor xmm2, xmm2
133 pxor xmm0, xmm0
134 // could skip the movq but punpck requires 128-bit alignment
135 // for some reason, so two versions would be needed,
136 // bloating the function (further)
137 punpcklbw xmm2, xmm3
138 punpcklbw xmm0, xmm1
139 // unfortunately I don't think this will matter despite being
140 // technically potentially a little faster, but this is
141 // equivalent to an add or sub
142 pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
143 pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
144
145 movaps xmm1, xmm0
146 movaps xmm3, xmm2
147 pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
148 pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
149 pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
150 pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
151 paddsw xmm1, xmm3
152 // store for the next line; looking at the code above
153 // compared to the code below, I have to wonder whether
154 // this was worth the hassle
155 movaps xmmword ptr [ecx], xmm0
156 movaps xmmword ptr [ecx+16], xmm1
157 movaps xmmword ptr [ecx+32], xmm2
158 jmp ihatemsvc
159
160 align 16
161 onerow:
162 movaps xmm0, xmmword ptr [ecx]
163 movaps xmm1, xmmword ptr [ecx+16]
164 movaps xmm2, xmmword ptr [ecx+32]
165
166 // If masm directives worked properly in inline asm, I'd be using them,
167 // but I'm not inclined to write ~70 line #defines to simulate them.
168 // Maybe the function's faster like this anyway because it's smaller?
169 // I'd have to write a 70 line #define to benchmark it.
170
171 ihatemsvc:
172 movaps xmm3, xmm0
173 movaps xmm4, xmm1
174 movaps xmm5, xmm2
175
176 movaps xmm6, xmmword ptr [decoder.mb8+edi]
177 psubusb xmm6, xmmword ptr [edx+Y_BIAS]
178 movaps xmm7, xmm6
179 psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
180 pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
181
182 pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
183 pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
184
185 paddsw xmm0, xmm6
186 paddsw xmm3, xmm7
187 paddsw xmm1, xmm6
188 paddsw xmm4, xmm7
189 paddsw xmm2, xmm6
190 paddsw xmm5, xmm7
191
192 // 0x80; a constant is probably so much better
193 pcmpeqb xmm7, xmm7
194 psllw xmm7, 15
195 psrlw xmm7, 8
196 packuswb xmm7, xmm7
197
198 // round
199 movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
200 paddw xmm0, xmm6
201 paddw xmm1, xmm6
202 paddw xmm2, xmm6
203 paddw xmm3, xmm6
204 paddw xmm4, xmm6
205 paddw xmm5, xmm6
206 psraw xmm0, 1
207 psraw xmm1, 1
208 psraw xmm2, 1
209 psraw xmm3, 1
210 psraw xmm4, 1
211 psraw xmm5, 1
212
213 // combine even and odd bytes
214 packuswb xmm0, xmm3
215 packuswb xmm1, xmm4
216 packuswb xmm2, xmm5
217 movhlps xmm3, xmm0
218 movhlps xmm4, xmm1
219 movhlps xmm5, xmm2
220 punpcklbw xmm0, xmm3 // Red bytes, back in order
221 punpcklbw xmm1, xmm4 // Green ""
222 punpcklbw xmm2, xmm5 // Blue ""
223 movaps xmm3, xmm0
224 movaps xmm4, xmm1
225 movaps xmm5, xmm2
226
227 // Create RGBA (we could generate A here, but we don't) quads
228 punpcklbw xmm0, xmm1
229 punpcklbw xmm2, xmm7
230 movaps xmm1, xmm0
231 punpcklwd xmm0, xmm2
232 punpckhwd xmm1, xmm2
233
234 punpckhbw xmm3, xmm4
235 punpckhbw xmm5, xmm7
236 movaps xmm4, xmm3
237 punpcklwd xmm3, xmm5
238 punpckhwd xmm4, xmm5
239
240 // at last
241 movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
242 movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
243 movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
244 movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
245
246 add edi, 16
247
248 neg eax
249 jl onerow // run twice
250
251 add esi, 8
252 cmp esi, 64
253 jne tworows
254 }
255
256 #elif defined(__GNUC__)
257
258 // offset to the middle of the sse2 table, so that we can use 1-byte address displacement
259 // to access all fields:
260 static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
261 static const u8* mb8 = (u8*)&decoder.mb8;
262 static u8* rgb32 = (u8*)&decoder.rgb32;
263
264 __asm__ __volatile__ (
265 ".intel_syntax noprefix\n"
266 "xor esi, esi\n"
267 "xor edi, edi\n"
268
269 ".align 16\n"
270 "tworows_%=:\n"
271 "movq xmm3, qword ptr [%[mb8]+256+esi]\n"
272 "movq xmm1, qword ptr [%[mb8]+320+esi]\n"
273 "pxor xmm2, xmm2\n"
274 "pxor xmm0, xmm0\n"
275 // could skip the movq but punpck requires 128-bit alignment
276 // for some reason, so two versions would be needed,
277 // bloating the function (further)
278 "punpcklbw xmm2, xmm3\n"
279 "punpcklbw xmm0, xmm1\n"
280 // unfortunately I don't think this will matter despite being
281 // technically potentially a little faster, but this is
282 // equivalent to an add or sub
283 "pxor xmm2, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
284 "pxor xmm0, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
285
286 "movaps xmm1, xmm0\n"
287 "movaps xmm3, xmm2\n"
288 "pmulhw xmm1, xmmword ptr [%[sse2_tables]+%c[GCr_COEFF]]\n"
289 "pmulhw xmm3, xmmword ptr [%[sse2_tables]+%c[GCb_COEFF]]\n"
290 "pmulhw xmm0, xmmword ptr [%[sse2_tables]+%c[RCr_COEFF]]\n"
291 "pmulhw xmm2, xmmword ptr [%[sse2_tables]+%c[BCb_COEFF]]\n"
292 "paddsw xmm1, xmm3\n"
293 // store for the next line; looking at the code above
294 // compared to the code below, I have to wonder whether
295 // this was worth the hassle
296 "movaps xmmword ptr [%[yuv2rgb_temp]], xmm0\n"
297 "movaps xmmword ptr [%[yuv2rgb_temp]+16], xmm1\n"
298 "movaps xmmword ptr [%[yuv2rgb_temp]+32], xmm2\n"
299 "jmp ihategcctoo_%=\n"
300
301 ".align 16\n"
302 "onerow_%=:\n"
303 "movaps xmm0, xmmword ptr [%[yuv2rgb_temp]]\n"
304 "movaps xmm1, xmmword ptr [%[yuv2rgb_temp]+16]\n"
305 "movaps xmm2, xmmword ptr [%[yuv2rgb_temp]+32]\n"
306
307 "ihategcctoo_%=:\n"
308 "movaps xmm3, xmm0\n"
309 "movaps xmm4, xmm1\n"
310 "movaps xmm5, xmm2\n"
311
312 "movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
313 "psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
314 "movaps xmm7, xmm6\n"
315 "psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
316 "pand xmm7, xmmword ptr [%[sse2_tables]+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
317
318 "pmulhuw xmm6, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
319 "pmulhuw xmm7, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
320
321 "paddsw xmm0, xmm6\n"
322 "paddsw xmm3, xmm7\n"
323 "paddsw xmm1, xmm6\n"
324 "paddsw xmm4, xmm7\n"
325 "paddsw xmm2, xmm6\n"
326 "paddsw xmm5, xmm7\n"
327
328 // 0x80; a constant is probably so much better
329 "pcmpeqb xmm7, xmm7\n"
330 "psllw xmm7, 15\n"
331 "psrlw xmm7, 8\n"
332 "packuswb xmm7, xmm7\n"
333
334 // round
335 "movaps xmm6, xmmword ptr [%[sse2_tables]+%c[ROUND_1BIT]]\n"
336 "paddw xmm0, xmm6\n"
337 "paddw xmm1, xmm6\n"
338 "paddw xmm2, xmm6\n"
339 "paddw xmm3, xmm6\n"
340 "paddw xmm4, xmm6\n"
341 "paddw xmm5, xmm6\n"
342 "psraw xmm0, 1\n"
343 "psraw xmm1, 1\n"
344 "psraw xmm2, 1\n"
345 "psraw xmm3, 1\n"
346 "psraw xmm4, 1\n"
347 "psraw xmm5, 1\n"
348
349 // combine even and odd bytes
350 "packuswb xmm0, xmm3\n"
351 "packuswb xmm1, xmm4\n"
352 "packuswb xmm2, xmm5\n"
353 "movhlps xmm3, xmm0\n"
354 "movhlps xmm4, xmm1\n"
355 "movhlps xmm5, xmm2\n"
356 "punpcklbw xmm0, xmm3\n" // Red bytes, back in order
357 "punpcklbw xmm1, xmm4\n" // Green ""
358 "punpcklbw xmm2, xmm5\n" // Blue ""
359 "movaps xmm3, xmm0\n"
360 "movaps xmm4, xmm1\n"
361 "movaps xmm5, xmm2\n"
362
363 // Create RGBA (we could generate A here, but we don't) quads
364 "punpcklbw xmm0, xmm1\n"
365 "punpcklbw xmm2, xmm7\n"
366 "movaps xmm1, xmm0\n"
367 "punpcklwd xmm0, xmm2\n"
368 "punpckhwd xmm1, xmm2\n"
369
370 "punpckhbw xmm3, xmm4\n"
371 "punpckhbw xmm5, xmm7\n"
372 "movaps xmm4, xmm3\n"
373 "punpcklwd xmm3, xmm5\n"
374 "punpckhwd xmm4, xmm5\n"
375
376 // at last
377 "movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
378 "movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
379 "movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
380 "movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
381
382 "add edi, 16\n"
383
384 // run twice the onerow <=> edi = 16 or 48 or 80 etc... <=> check bit 5
385 "test edi, 16\n"
386 "jnz onerow_%=\n"
387
388 "add esi, 8\n"
389 "cmp esi, 64\n"
390 "jne tworows_%=\n"
391 ".att_syntax\n"
392 :
393 :[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
394 [ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
395 [GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
396 // Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
397 // This saves 2-3 bytes per instruction where these are used. :)
398 [yuv2rgb_temp]"c"(yuv2rgb_temp), [sse2_tables]"d"(sse2_tableoffset),
399 [mb8]"r"(mb8), [rgb32]"r"(rgb32)
400 : "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
401 );
402 #else
403 # error Unsupported compiler
404 #endif
405 }

  ViewVC Help
Powered by ViewVC 1.1.22