/[pcsx2_0.9.7]/trunk/pcsx2/IPU/mpeg2lib/Idct.cpp
ViewVC logotype

Contents of /trunk/pcsx2/IPU/mpeg2lib/Idct.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 62 - (show annotations) (download)
Tue Sep 7 11:08:22 2010 UTC (9 years, 10 months ago) by william
File size: 7052 byte(s)
Auto Commited Import of: pcsx2-0.9.7-r3738-debug in ./trunk
1 /*
2 * idct.c
3 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5 * Modified by Florin for PCSX2 emu
6 *
7 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
8 * See http://libmpeg2.sourceforge.net/ for updates.
9 *
10 * mpeg2dec is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * mpeg2dec is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 */
24
25 // [TODO] : There are modern SSE versions of idct (idct_mmx.c) in the mpeg2 libs that we
26 // should probably upgrade to. They use their own raw-style intrinsics and not the intel
27 // compiler-integrated ones.
28
29 #include "PrecompiledHeader.h"
30
31 #include "Common.h"
32 #include "IPU/IPU.h"
33 #include "Mpeg.h"
34
/* Fixed-point IDCT coefficients: round(2048 * sqrt(2) * cos(k*pi/16)). */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */

/*
 * clp  : res = val saturated to [0, 255]
 * clp2 : res = val saturated to [-255, 255]
 *
 * Every use of an argument is parenthesized so that compound expressions
 * (e.g. "c ? a : b" or "x + y") are clamped as a whole.  The trailing
 * semicolon is part of the expansion and is kept for compatibility with
 * existing call sites.  Note that val is evaluated more than once.
 */
#define clp(val,res) (res) = ((val) < 0) ? 0 : (((val) > 255) ? 255 : (val));
#define clp2(val,res) (res) = ((val) < -255) ? -255 : (((val) > 255) ? 255 : (val));
43
44 /*
45 * In legal streams, the IDCT output should be between -384 and +384.
46 * In corrupted streams, it is possible to force the IDCT output to go
47 * to +-3826 - this is the worst case for a column IDCT where the
48 * column inputs are 16-bit values.
49 */
50 static __aligned16 u8 clip_lut[1024];
51
52 #define CLIP(i) ((clip_lut+384)[(i)])
53
/*
 * Rotation butterfly:
 *
 *     out0 = c0*in0 + c1*in1
 *     out1 = c0*in1 - c1*in0
 *
 * evaluated with three multiplications instead of four by sharing the
 * product c0*(in0 + in1).
 *
 * All inputs are read into locals before either output is written, so each
 * argument is evaluated exactly once and an output may safely name the same
 * variable as an input.  Arguments are converted to int.
 */
#define BUTTERFLY(out0,out1,c0,c1,in0,in1) \
do { \
    const int bf_c0_ = (c0); \
    const int bf_c1_ = (c1); \
    const int bf_in0_ = (in0); \
    const int bf_in1_ = (in1); \
    const int bf_shared_ = bf_c0_ * (bf_in0_ + bf_in1_); \
    (out0) = bf_shared_ + (bf_c1_ - bf_c0_) * bf_in1_; \
    (out1) = bf_shared_ - (bf_c1_ + bf_c0_) * bf_in0_; \
} while (0)
68
69 static __fi void idct_row (s16 * const block)
70 {
71 int d0, d1, d2, d3;
72 int a0, a1, a2, a3, b0, b1, b2, b3;
73 int t0, t1, t2, t3;
74
75 /* shortcut */
76 if (!(block[1] | ((s32 *)block)[1] | ((s32 *)block)[2] |
77 ((s32 *)block)[3])) {
78 u32 tmp = (u16) (block[0] << 3);
79 tmp |= tmp << 16;
80 ((s32 *)block)[0] = tmp;
81 ((s32 *)block)[1] = tmp;
82 ((s32 *)block)[2] = tmp;
83 ((s32 *)block)[3] = tmp;
84 return;
85 }
86
87 d0 = (block[0] << 11) + 128;
88 d1 = block[1];
89 d2 = block[2] << 11;
90 d3 = block[3];
91 t0 = d0 + d2;
92 t1 = d0 - d2;
93 BUTTERFLY (t2, t3, W6, W2, d3, d1);
94 a0 = t0 + t2;
95 a1 = t1 + t3;
96 a2 = t1 - t3;
97 a3 = t0 - t2;
98
99 d0 = block[4];
100 d1 = block[5];
101 d2 = block[6];
102 d3 = block[7];
103 BUTTERFLY (t0, t1, W7, W1, d3, d0);
104 BUTTERFLY (t2, t3, W3, W5, d1, d2);
105 b0 = t0 + t2;
106 b3 = t1 + t3;
107 t0 -= t2;
108 t1 -= t3;
109 b1 = ((t0 + t1) * 181) >> 8;
110 b2 = ((t0 - t1) * 181) >> 8;
111
112 block[0] = (a0 + b0) >> 8;
113 block[1] = (a1 + b1) >> 8;
114 block[2] = (a2 + b2) >> 8;
115 block[3] = (a3 + b3) >> 8;
116 block[4] = (a3 - b3) >> 8;
117 block[5] = (a2 - b2) >> 8;
118 block[6] = (a1 - b1) >> 8;
119 block[7] = (a0 - b0) >> 8;
120 }
121
122 static __fi void idct_col (s16 * const block)
123 {
124 int d0, d1, d2, d3;
125 int a0, a1, a2, a3, b0, b1, b2, b3;
126 int t0, t1, t2, t3;
127
128 d0 = (block[8*0] << 11) + 65536;
129 d1 = block[8*1];
130 d2 = block[8*2] << 11;
131 d3 = block[8*3];
132 t0 = d0 + d2;
133 t1 = d0 - d2;
134 BUTTERFLY (t2, t3, W6, W2, d3, d1);
135 a0 = t0 + t2;
136 a1 = t1 + t3;
137 a2 = t1 - t3;
138 a3 = t0 - t2;
139
140 d0 = block[8*4];
141 d1 = block[8*5];
142 d2 = block[8*6];
143 d3 = block[8*7];
144 BUTTERFLY (t0, t1, W7, W1, d3, d0);
145 BUTTERFLY (t2, t3, W3, W5, d1, d2);
146 b0 = t0 + t2;
147 b3 = t1 + t3;
148 t0 = (t0 - t2) >> 8;
149 t1 = (t1 - t3) >> 8;
150 b1 = (t0 + t1) * 181;
151 b2 = (t0 - t1) * 181;
152
153 block[8*0] = (a0 + b0) >> 17;
154 block[8*1] = (a1 + b1) >> 17;
155 block[8*2] = (a2 + b2) >> 17;
156 block[8*3] = (a3 + b3) >> 17;
157 block[8*4] = (a3 - b3) >> 17;
158 block[8*5] = (a2 - b2) >> 17;
159 block[8*6] = (a1 - b1) >> 17;
160 block[8*7] = (a0 - b0) >> 17;
161 }
162
163 __ri void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)
164 {
165 int i;
166
167 for (i = 0; i < 8; i++)
168 idct_row (block + 8 * i);
169 for (i = 0; i < 8; i++)
170 idct_col (block + i);
171
172 __m128 zero = _mm_setzero_ps();
173 do {
174 dest[0] = CLIP (block[0]);
175 dest[1] = CLIP (block[1]);
176 dest[2] = CLIP (block[2]);
177 dest[3] = CLIP (block[3]);
178 dest[4] = CLIP (block[4]);
179 dest[5] = CLIP (block[5]);
180 dest[6] = CLIP (block[6]);
181 dest[7] = CLIP (block[7]);
182
183 _mm_store_ps((float*)block, zero);
184
185 dest += stride;
186 block += 8;
187 } while (--i);
188 }
189
190
191 // stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
192 __ri void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride)
193 {
194 // on the IPU, stride is always assured to be multiples of QWC (bottom 3 bits are 0).
195
196 if (last != 129 || (block[0] & 7) == 4)
197 {
198 int i;
199 for (i = 0; i < 8; i++)
200 idct_row (block + 8 * i);
201 for (i = 0; i < 8; i++)
202 idct_col (block + i);
203
204 __m128 zero = _mm_setzero_ps();
205 do {
206 _mm_store_ps((float*)dest, _mm_load_ps((float*)block));
207 _mm_store_ps((float*)block, zero);
208
209 dest += stride;
210 block += 8;
211 } while (--i);
212
213 }
214 else
215 {
216 int DC = (block[0] + 4) >> 3;
217 s16 dcf[2] = { DC, DC };
218 block[0] = block[63] = 0;
219
220 __m128 dc128 = _mm_set_ps1(*(float*)dcf);
221
222 for(int i=0; i<8; ++i)
223 _mm_store_ps((float*)(dest+(stride*i)), dc128);
224 }
225 }
226
227 mpeg2_scan_pack::mpeg2_scan_pack()
228 {
229 static const u8 mpeg2_scan_norm[64] = {
230 /* Zig-Zag scan pattern */
231 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
232 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
233 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
234 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
235 };
236
237 static const u8 mpeg2_scan_alt[64] = {
238 /* Alternate scan pattern */
239 0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
240 41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
241 51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
242 53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
243 };
244
245 for (int i = -384; i < 640; i++)
246 clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
247
248 for (int i = 0; i < 64; i++) {
249 int j = mpeg2_scan_norm[i];
250 norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
251 j = mpeg2_scan_alt[i];
252 alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
253 }
254 }
255
256 const __aligned16 mpeg2_scan_pack mpeg2_scan;

  ViewVC Help
Powered by ViewVC 1.1.22