/[pcsx2_0.9.7]/trunk/3rdparty/SoundTouch/3dnow_win.cpp
ViewVC logotype

Annotation of /trunk/3rdparty/SoundTouch/3dnow_win.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations) (download)
Mon Sep 6 11:19:43 2010 UTC (9 years, 5 months ago) by william
File size: 11302 byte(s)
Exported ./upsream/trunk @r3730 from http://pcsx2.googlecode.com/svn/trunk/
1 william 8 ////////////////////////////////////////////////////////////////////////////////
2     ///
3     /// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
4     /// processors. All 3DNow! optimized functions have been gathered into this
5     /// single source code file, regardless of their class or original source code
6     /// file, in order to ease porting the library to other compiler and processor
7     /// platforms.
8     ///
9     /// By the way; the performance gain depends heavily on the CPU generation: On
10     /// K6-2 these routines provided speed-up of even 2.4 times, while on Athlon the
11     /// difference to the original routines stayed at unremarkable 8%! Such a small
12     /// improvement on Athlon is because 3DNow! can perform only two operations in
13     /// parallel, and obviously also the Athlon FPU is doing a very good job with
14     /// the standard C floating point routines! Here these routines are anyway,
15     /// although it might not be worth the effort to convert these to GCC platform,
16     /// for Athlon CPU at least. The situation is different regarding the SSE
17     /// optimizations though, thanks to the four parallel operations of SSE that
18     /// already make a difference.
19     ///
20     /// This file is to be compiled in Windows platform with Microsoft Visual C++
21     /// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
22     /// GNU platforms (if file supplied).
23     ///
24     /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
25     /// 6.0 processor pack" update to support 3DNow! instruction set. The update is
26     /// available for download at Microsoft Developers Network, see here:
27     /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
28     ///
29     /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
30     /// perform a search with keywords "processor pack".
31     ///
32     /// Author : Copyright (c) Olli Parviainen
33     /// Author e-mail : oparviai 'at' iki.fi
34     /// SoundTouch WWW: http://www.surina.net/soundtouch
35     ///
36     ////////////////////////////////////////////////////////////////////////////////
37     //
38     // Last changed : $Date: 2009-02-21 18:00:14 +0200 (Sat, 21 Feb 2009) $
39     // File revision : $Revision: 4 $
40     //
41     // $Id: 3dnow_win.cpp 63 2009-02-21 16:00:14Z oparviai $
42     //
43     ////////////////////////////////////////////////////////////////////////////////
44     //
45     // License :
46     //
47     // SoundTouch audio processing library
48     // Copyright (c) Olli Parviainen
49     //
50     // This library is free software; you can redistribute it and/or
51     // modify it under the terms of the GNU Lesser General Public
52     // License as published by the Free Software Foundation; either
53     // version 2.1 of the License, or (at your option) any later version.
54     //
55     // This library is distributed in the hope that it will be useful,
56     // but WITHOUT ANY WARRANTY; without even the implied warranty of
57     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
58     // Lesser General Public License for more details.
59     //
60     // You should have received a copy of the GNU Lesser General Public
61     // License along with this library; if not, write to the Free Software
62     // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
63     //
64     ////////////////////////////////////////////////////////////////////////////////
65    
66     #include "cpu_detect.h"
67     #include "STTypes.h"
68    
69     #ifndef WIN32
70     #error "wrong platform - this source code file is exclusively for Win32 platform"
71     #endif
72    
73     using namespace soundtouch;
74    
75     #ifdef ALLOW_3DNOW
76     // 3DNow! routines available only with float sample type
77    
78     //////////////////////////////////////////////////////////////////////////////
79     //
80     // implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
81     //
82     //////////////////////////////////////////////////////////////////////////////
83    
84     #include "TDStretch.h"
85    
86    
87     // Calculates cross correlation of two buffers
88     double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
89     {
90     int overlapLengthLocal = overlapLength;
91     float corr = 0;
92    
93     // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
94     /*
95     c-pseudocode:
96    
97     corr = 0;
98     for (i = 0; i < overlapLength / 4; i ++)
99     {
100     corr += pV1[0] * pV2[0];
101     pV1[1] * pV2[1];
102     pV1[2] * pV2[2];
103     pV1[3] * pV2[3];
104     pV1[4] * pV2[4];
105     pV1[5] * pV2[5];
106     pV1[6] * pV2[6];
107     pV1[7] * pV2[7];
108    
109     pV1 += 8;
110     pV2 += 8;
111     }
112     */
113    
114     _asm
115     {
116     // give prefetch hints to CPU of what data are to be needed soonish.
117     // give more aggressive hints on pV1 as that changes more between different calls
118     // while pV2 stays the same.
119     prefetch [pV1]
120     prefetch [pV2]
121     prefetch [pV1 + 32]
122    
123     mov eax, dword ptr pV2
124     mov ebx, dword ptr pV1
125    
126     pxor mm0, mm0
127    
128     mov ecx, overlapLengthLocal
129     shr ecx, 2 // div by four
130    
131     loop1:
132     movq mm1, [eax]
133     prefetch [eax + 32] // give a prefetch hint to CPU what data are to be needed soonish
134     pfmul mm1, [ebx]
135     prefetch [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish
136    
137     movq mm2, [eax + 8]
138     pfadd mm0, mm1
139     pfmul mm2, [ebx + 8]
140    
141     movq mm3, [eax + 16]
142     pfadd mm0, mm2
143     pfmul mm3, [ebx + 16]
144    
145     movq mm4, [eax + 24]
146     pfadd mm0, mm3
147     pfmul mm4, [ebx + 24]
148    
149     add eax, 32
150     pfadd mm0, mm4
151     add ebx, 32
152    
153     dec ecx
154     jnz loop1
155    
156     // add halfs of mm0 together and return the result.
157     // note: mm1 is used as a dummy parameter only, we actually don't care about it's value
158     pfacc mm0, mm1
159     movd corr, mm0
160     femms
161     }
162    
163     return corr;
164     }
165    
166    
167    
168    
169     //////////////////////////////////////////////////////////////////////////////
170     //
171     // implementation of 3DNow! optimized functions of class 'FIRFilter'
172     //
173     //////////////////////////////////////////////////////////////////////////////
174    
175     #include "FIRFilter.h"
176    
177     FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
178     {
179     filterCoeffsUnalign = NULL;
180     filterCoeffsAlign = NULL;
181     }
182    
183    
184     FIRFilter3DNow::~FIRFilter3DNow()
185     {
186     delete[] filterCoeffsUnalign;
187     filterCoeffsUnalign = NULL;
188     filterCoeffsAlign = NULL;
189     }
190    
191    
192     // (overloaded) Calculates filter coefficients for 3DNow! routine
193     void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
194     {
195     uint i;
196     float fDivider;
197    
198     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
199    
200     // Scale the filter coefficients so that it won't be necessary to scale the filtering result
201     // also rearrange coefficients suitably for 3DNow!
202     // Ensure that filter coeffs array is aligned to 16-byte boundary
203     delete[] filterCoeffsUnalign;
204     filterCoeffsUnalign = new float[2 * newLength + 4];
205     filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & (uint)-16);
206    
207     fDivider = (float)resultDivider;
208    
209     // rearrange the filter coefficients for mmx routines
210     for (i = 0; i < newLength; i ++)
211     {
212     filterCoeffsAlign[2 * i + 0] =
213     filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
214     }
215     }
216    
217    
218     // 3DNow!-optimized version of the filter routine for stereo sound
219     uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, uint numSamples) const
220     {
221     float *filterCoeffsLocal = filterCoeffsAlign;
222     uint count = (numSamples - length) & (uint)-2;
223     uint lengthLocal = length / 4;
224    
225     assert(length != 0);
226     assert(count % 2 == 0);
227    
228     /* original code:
229    
230     double suml1, suml2;
231     double sumr1, sumr2;
232     uint i, j;
233    
234     for (j = 0; j < count; j += 2)
235     {
236     const float *ptr;
237    
238     suml1 = sumr1 = 0.0;
239     suml2 = sumr2 = 0.0;
240     ptr = src;
241     filterCoeffsLocal = filterCoeffs;
242     for (i = 0; i < lengthLocal; i ++)
243     {
244     // unroll loop for efficiency.
245    
246     suml1 += ptr[0] * filterCoeffsLocal[0] +
247     ptr[2] * filterCoeffsLocal[2] +
248     ptr[4] * filterCoeffsLocal[4] +
249     ptr[6] * filterCoeffsLocal[6];
250    
251     sumr1 += ptr[1] * filterCoeffsLocal[1] +
252     ptr[3] * filterCoeffsLocal[3] +
253     ptr[5] * filterCoeffsLocal[5] +
254     ptr[7] * filterCoeffsLocal[7];
255    
256     suml2 += ptr[8] * filterCoeffsLocal[0] +
257     ptr[10] * filterCoeffsLocal[2] +
258     ptr[12] * filterCoeffsLocal[4] +
259     ptr[14] * filterCoeffsLocal[6];
260    
261     sumr2 += ptr[9] * filterCoeffsLocal[1] +
262     ptr[11] * filterCoeffsLocal[3] +
263     ptr[13] * filterCoeffsLocal[5] +
264     ptr[15] * filterCoeffsLocal[7];
265    
266     ptr += 16;
267     filterCoeffsLocal += 8;
268     }
269     dest[0] = (float)suml1;
270     dest[1] = (float)sumr1;
271     dest[2] = (float)suml2;
272     dest[3] = (float)sumr2;
273    
274     src += 4;
275     dest += 4;
276     }
277    
278     */
279     _asm
280     {
281     mov eax, dword ptr dest
282     mov ebx, dword ptr src
283     mov edx, count
284     shr edx, 1
285    
286     loop1:
287     // "outer loop" : during each round 2*2 output samples are calculated
288     prefetch [ebx] // give a prefetch hint to CPU what data are to be needed soonish
289     prefetch [filterCoeffsLocal] // give a prefetch hint to CPU what data are to be needed soonish
290    
291     mov esi, ebx
292     mov edi, filterCoeffsLocal
293     pxor mm0, mm0
294     pxor mm1, mm1
295     mov ecx, lengthLocal
296    
297     loop2:
298     // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
299     movq mm2, [edi]
300     movq mm3, mm2
301     prefetch [edi + 32] // give a prefetch hint to CPU what data are to be needed soonish
302     pfmul mm2, [esi]
303     prefetch [esi + 32] // give a prefetch hint to CPU what data are to be needed soonish
304     pfmul mm3, [esi + 8]
305    
306     movq mm4, [edi + 8]
307     movq mm5, mm4
308     pfadd mm0, mm2
309     pfmul mm4, [esi + 8]
310     pfadd mm1, mm3
311     pfmul mm5, [esi + 16]
312    
313     movq mm2, [edi + 16]
314     movq mm6, mm2
315     pfadd mm0, mm4
316     pfmul mm2, [esi + 16]
317     pfadd mm1, mm5
318     pfmul mm6, [esi + 24]
319    
320     movq mm3, [edi + 24]
321     movq mm7, mm3
322     pfadd mm0, mm2
323     pfmul mm3, [esi + 24]
324     pfadd mm1, mm6
325     pfmul mm7, [esi + 32]
326     add esi, 32
327     pfadd mm0, mm3
328     add edi, 32
329     pfadd mm1, mm7
330    
331     dec ecx
332     jnz loop2
333    
334     movq [eax], mm0
335     add ebx, 16
336     movq [eax + 8], mm1
337     add eax, 16
338    
339     dec edx
340     jnz loop1
341    
342     femms
343     }
344    
345     return count;
346     }
347    
348    
349     #endif // ALLOW_3DNOW

  ViewVC Help
Powered by ViewVC 1.1.22