/[pcsx2_0.9.7]/trunk/3rdparty/SoundTouch/3dnow_win.cpp
ViewVC logotype

Contents of /trunk/3rdparty/SoundTouch/3dnow_win.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (show annotations) (download)
Mon Sep 6 11:19:43 2010 UTC (9 years, 4 months ago) by william
File size: 11302 byte(s)
Exported ./upsream/trunk @r3730 from http://pcsx2.googlecode.com/svn/trunk/
1 ////////////////////////////////////////////////////////////////////////////////
2 ///
3 /// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
4 /// processors. All 3DNow! optimized functions have been gathered into this
5 /// single source code file, regardless of their class or original source code
6 /// file, in order to ease porting the library to other compiler and processor
7 /// platforms.
8 ///
9 /// By the way; the performance gain depends heavily on the CPU generation: On
10 /// K6-2 these routines provided speed-up of even 2.4 times, while on Athlon the
11 /// difference to the original routines stayed at unremarkable 8%! Such a small
12 /// improvement on Athlon is because 3DNow! can perform only two operations in
13 /// parallel, and obviously also the Athlon FPU is doing a very good job with
14 /// the standard C floating point routines! Here these routines are anyway,
15 /// although it might not be worth the effort to convert these to GCC platform,
16 /// for Athlon CPU at least. The situation is different regarding the SSE
17 /// optimizations though, thanks to the four parallel operations of SSE that
18 /// already make a difference.
19 ///
20 /// This file is to be compiled in Windows platform with Microsoft Visual C++
21 /// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
22 /// GNU platforms (if file supplied).
23 ///
24 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
25 /// 6.0 processor pack" update to support 3DNow! instruction set. The update is
26 /// available for download at Microsoft Developers Network, see here:
27 /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
28 ///
29 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
30 /// perform a search with keywords "processor pack".
31 ///
32 /// Author : Copyright (c) Olli Parviainen
33 /// Author e-mail : oparviai 'at' iki.fi
34 /// SoundTouch WWW: http://www.surina.net/soundtouch
35 ///
36 ////////////////////////////////////////////////////////////////////////////////
37 //
38 // Last changed : $Date: 2009-02-21 18:00:14 +0200 (Sat, 21 Feb 2009) $
39 // File revision : $Revision: 4 $
40 //
41 // $Id: 3dnow_win.cpp 63 2009-02-21 16:00:14Z oparviai $
42 //
43 ////////////////////////////////////////////////////////////////////////////////
44 //
45 // License :
46 //
47 // SoundTouch audio processing library
48 // Copyright (c) Olli Parviainen
49 //
50 // This library is free software; you can redistribute it and/or
51 // modify it under the terms of the GNU Lesser General Public
52 // License as published by the Free Software Foundation; either
53 // version 2.1 of the License, or (at your option) any later version.
54 //
55 // This library is distributed in the hope that it will be useful,
56 // but WITHOUT ANY WARRANTY; without even the implied warranty of
57 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
58 // Lesser General Public License for more details.
59 //
60 // You should have received a copy of the GNU Lesser General Public
61 // License along with this library; if not, write to the Free Software
62 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
63 //
64 ////////////////////////////////////////////////////////////////////////////////
65
66 #include "cpu_detect.h"
67 #include "STTypes.h"
68
69 #ifndef WIN32
70 #error "wrong platform - this source code file is exclusively for Win32 platform"
71 #endif
72
73 using namespace soundtouch;
74
75 #ifdef ALLOW_3DNOW
76 // 3DNow! routines available only with float sample type
77
78 //////////////////////////////////////////////////////////////////////////////
79 //
80 // implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
81 //
82 //////////////////////////////////////////////////////////////////////////////
83
84 #include "TDStretch.h"
85
86
87 // Calculates cross correlation of two buffers
88 double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
89 {
90 int overlapLengthLocal = overlapLength;
91 float corr = 0;
92
93 // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
94 /*
95 c-pseudocode:
96
97 corr = 0;
98 for (i = 0; i < overlapLength / 4; i ++)
99 {
100 corr += pV1[0] * pV2[0];
101 pV1[1] * pV2[1];
102 pV1[2] * pV2[2];
103 pV1[3] * pV2[3];
104 pV1[4] * pV2[4];
105 pV1[5] * pV2[5];
106 pV1[6] * pV2[6];
107 pV1[7] * pV2[7];
108
109 pV1 += 8;
110 pV2 += 8;
111 }
112 */
113
114 _asm
115 {
116 // give prefetch hints to CPU of what data are to be needed soonish.
117 // give more aggressive hints on pV1 as that changes more between different calls
118 // while pV2 stays the same.
119 prefetch [pV1]
120 prefetch [pV2]
121 prefetch [pV1 + 32]
122
123 mov eax, dword ptr pV2
124 mov ebx, dword ptr pV1
125
126 pxor mm0, mm0
127
128 mov ecx, overlapLengthLocal
129 shr ecx, 2 // div by four
130
131 loop1:
132 movq mm1, [eax]
133 prefetch [eax + 32] // give a prefetch hint to CPU what data are to be needed soonish
134 pfmul mm1, [ebx]
135 prefetch [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish
136
137 movq mm2, [eax + 8]
138 pfadd mm0, mm1
139 pfmul mm2, [ebx + 8]
140
141 movq mm3, [eax + 16]
142 pfadd mm0, mm2
143 pfmul mm3, [ebx + 16]
144
145 movq mm4, [eax + 24]
146 pfadd mm0, mm3
147 pfmul mm4, [ebx + 24]
148
149 add eax, 32
150 pfadd mm0, mm4
151 add ebx, 32
152
153 dec ecx
154 jnz loop1
155
156 // add halfs of mm0 together and return the result.
157 // note: mm1 is used as a dummy parameter only, we actually don't care about it's value
158 pfacc mm0, mm1
159 movd corr, mm0
160 femms
161 }
162
163 return corr;
164 }
165
166
167
168
169 //////////////////////////////////////////////////////////////////////////////
170 //
171 // implementation of 3DNow! optimized functions of class 'FIRFilter'
172 //
173 //////////////////////////////////////////////////////////////////////////////
174
175 #include "FIRFilter.h"
176
177 FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
178 {
179 filterCoeffsUnalign = NULL;
180 filterCoeffsAlign = NULL;
181 }
182
183
184 FIRFilter3DNow::~FIRFilter3DNow()
185 {
186 delete[] filterCoeffsUnalign;
187 filterCoeffsUnalign = NULL;
188 filterCoeffsAlign = NULL;
189 }
190
191
192 // (overloaded) Calculates filter coefficients for 3DNow! routine
193 void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
194 {
195 uint i;
196 float fDivider;
197
198 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
199
200 // Scale the filter coefficients so that it won't be necessary to scale the filtering result
201 // also rearrange coefficients suitably for 3DNow!
202 // Ensure that filter coeffs array is aligned to 16-byte boundary
203 delete[] filterCoeffsUnalign;
204 filterCoeffsUnalign = new float[2 * newLength + 4];
205 filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & (uint)-16);
206
207 fDivider = (float)resultDivider;
208
209 // rearrange the filter coefficients for mmx routines
210 for (i = 0; i < newLength; i ++)
211 {
212 filterCoeffsAlign[2 * i + 0] =
213 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
214 }
215 }
216
217
218 // 3DNow!-optimized version of the filter routine for stereo sound
219 uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, uint numSamples) const
220 {
221 float *filterCoeffsLocal = filterCoeffsAlign;
222 uint count = (numSamples - length) & (uint)-2;
223 uint lengthLocal = length / 4;
224
225 assert(length != 0);
226 assert(count % 2 == 0);
227
228 /* original code:
229
230 double suml1, suml2;
231 double sumr1, sumr2;
232 uint i, j;
233
234 for (j = 0; j < count; j += 2)
235 {
236 const float *ptr;
237
238 suml1 = sumr1 = 0.0;
239 suml2 = sumr2 = 0.0;
240 ptr = src;
241 filterCoeffsLocal = filterCoeffs;
242 for (i = 0; i < lengthLocal; i ++)
243 {
244 // unroll loop for efficiency.
245
246 suml1 += ptr[0] * filterCoeffsLocal[0] +
247 ptr[2] * filterCoeffsLocal[2] +
248 ptr[4] * filterCoeffsLocal[4] +
249 ptr[6] * filterCoeffsLocal[6];
250
251 sumr1 += ptr[1] * filterCoeffsLocal[1] +
252 ptr[3] * filterCoeffsLocal[3] +
253 ptr[5] * filterCoeffsLocal[5] +
254 ptr[7] * filterCoeffsLocal[7];
255
256 suml2 += ptr[8] * filterCoeffsLocal[0] +
257 ptr[10] * filterCoeffsLocal[2] +
258 ptr[12] * filterCoeffsLocal[4] +
259 ptr[14] * filterCoeffsLocal[6];
260
261 sumr2 += ptr[9] * filterCoeffsLocal[1] +
262 ptr[11] * filterCoeffsLocal[3] +
263 ptr[13] * filterCoeffsLocal[5] +
264 ptr[15] * filterCoeffsLocal[7];
265
266 ptr += 16;
267 filterCoeffsLocal += 8;
268 }
269 dest[0] = (float)suml1;
270 dest[1] = (float)sumr1;
271 dest[2] = (float)suml2;
272 dest[3] = (float)sumr2;
273
274 src += 4;
275 dest += 4;
276 }
277
278 */
279 _asm
280 {
281 mov eax, dword ptr dest
282 mov ebx, dword ptr src
283 mov edx, count
284 shr edx, 1
285
286 loop1:
287 // "outer loop" : during each round 2*2 output samples are calculated
288 prefetch [ebx] // give a prefetch hint to CPU what data are to be needed soonish
289 prefetch [filterCoeffsLocal] // give a prefetch hint to CPU what data are to be needed soonish
290
291 mov esi, ebx
292 mov edi, filterCoeffsLocal
293 pxor mm0, mm0
294 pxor mm1, mm1
295 mov ecx, lengthLocal
296
297 loop2:
298 // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
299 movq mm2, [edi]
300 movq mm3, mm2
301 prefetch [edi + 32] // give a prefetch hint to CPU what data are to be needed soonish
302 pfmul mm2, [esi]
303 prefetch [esi + 32] // give a prefetch hint to CPU what data are to be needed soonish
304 pfmul mm3, [esi + 8]
305
306 movq mm4, [edi + 8]
307 movq mm5, mm4
308 pfadd mm0, mm2
309 pfmul mm4, [esi + 8]
310 pfadd mm1, mm3
311 pfmul mm5, [esi + 16]
312
313 movq mm2, [edi + 16]
314 movq mm6, mm2
315 pfadd mm0, mm4
316 pfmul mm2, [esi + 16]
317 pfadd mm1, mm5
318 pfmul mm6, [esi + 24]
319
320 movq mm3, [edi + 24]
321 movq mm7, mm3
322 pfadd mm0, mm2
323 pfmul mm3, [esi + 24]
324 pfadd mm1, mm6
325 pfmul mm7, [esi + 32]
326 add esi, 32
327 pfadd mm0, mm3
328 add edi, 32
329 pfadd mm1, mm7
330
331 dec ecx
332 jnz loop2
333
334 movq [eax], mm0
335 add ebx, 16
336 movq [eax + 8], mm1
337 add eax, 16
338
339 dec edx
340 jnz loop1
341
342 femms
343 }
344
345 return count;
346 }
347
348
349 #endif // ALLOW_3DNOW

  ViewVC Help
Powered by ViewVC 1.1.22