trunk/3rdparty/SoundTouch/sse_optimized.cpp
////////////////////////////////////////////////////////////////////////////////
///
/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE
/// optimized functions have been gathered into this single source code file,
/// regardless of their class or original source code file, in order to ease
/// porting the library to other compiler and processor platforms.
///
/// The SSE optimizations are programmed using SSE compiler intrinsics that
/// are supported both by the Microsoft Visual C++ and GCC compilers, so this
/// file should compile with both toolsets.
///
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
/// 6.0 processor pack" update to support the SSE instruction set. The update is
/// available for download at the Microsoft Developer Network, see here:
/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
/// perform a search with the keywords "processor pack".
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date: 2009-12-28 22:32:57 +0200 (Mon, 28 Dec 2009) $
// File revision : $Revision: 4 $
//
// $Id: sse_optimized.cpp 80 2009-12-28 20:32:57Z oparviai $
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////

#include "cpu_detect.h"
#include "STTypes.h"

using namespace soundtouch;

#ifdef ALLOW_SSE

// SSE routines available only with float sample type

//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'TDStretchSSE'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <xmmintrin.h>
#include <math.h>

// Calculates cross correlation of two buffers
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
{
    int i;
    const float *pVec1;
    const __m128 *pVec2;
    __m128 vSum, vNorm;

    // Note: it means a major slow-down if the routine needs to tolerate
    // unaligned __m128 memory accesses. It's way faster if we can skip
    // unaligned slots and use the _mm_load_ps instruction instead of _mm_loadu_ps.
    // This can mean up to a ~10-fold difference (although part of that is due
    // to skipping every second round for stereo sound).
    //
    // The compile-time define ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
    // for choosing whether this little cheating is allowed.

#ifdef ALLOW_NONEXACT_SIMD_OPTIMIZATION
    // Little cheating allowed: return a valid correlation only for
    // aligned locations, meaning every second round for stereo sound.

    #define _MM_LOAD    _mm_load_ps

    if (((ulong)pV1) & 15) return -1e50;    // skip unaligned locations

#else
    // No cheating allowed: use unaligned loads and take the resulting
    // performance hit.
    #define _MM_LOAD    _mm_loadu_ps
#endif

    // ensure overlapLength is divisible by 8
    assert((overlapLength % 8) == 0);

    // Calculate the cross-correlation value between the 'pV1' and 'pV2' vectors.
    // Note: pV2 _must_ be aligned to a 16-byte boundary; pV1 need not be.
    pVec1 = (const float*)pV1;
    pVec2 = (const __m128*)pV2;
    vSum = vNorm = _mm_setzero_ps();

    // Unroll the loop by a factor of 4 * 4 operations
    for (i = 0; i < overlapLength / 8; i ++)
    {
        __m128 vTemp;
        // vSum += pV1[0..3] * pV2[0..3]
        vTemp = _MM_LOAD(pVec1);
        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp, pVec2[0]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp, vTemp));

        // vSum += pV1[4..7] * pV2[4..7]
        vTemp = _MM_LOAD(pVec1 + 4);
        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp, pVec2[1]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp, vTemp));

        // vSum += pV1[8..11] * pV2[8..11]
        vTemp = _MM_LOAD(pVec1 + 8);
        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp, pVec2[2]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp, vTemp));

        // vSum += pV1[12..15] * pV2[12..15]
        vTemp = _MM_LOAD(pVec1 + 12);
        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp, pVec2[3]));
        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp, vTemp));

        pVec1 += 16;
        pVec2 += 4;
    }

    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
    float *pvNorm = (float*)&vNorm;
    double norm = sqrt(pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
    if (norm < 1e-9) norm = 1.0;    // to avoid division by zero

    float *pvSum = (float*)&vSum;
    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / norm;

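    // Note (an illustrative aside, not part of the original source): the same horizontal
    // sum could also be done with SSE shuffles instead of casting to float*, e.g.:
    //
    //     __m128 hi = _mm_movehl_ps(vSum, vSum);                   // lanes 2,3 -> 0,1
    //     __m128 lo = _mm_add_ps(vSum, hi);                        // lane0+lane2, lane1+lane3
    //     __m128 sh = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(1,1,1,1));
    //     float total = _mm_cvtss_f32(_mm_add_ss(lo, sh));         // lane0 + lane1
    //
    // which is essentially what the commented-out assembler version below does with
    // movhlps / shufps / addss.
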
    /* This is approximately the corresponding routine in plain C, though without normalization:
    double corr, norm;
    uint i, j;

    // Calculate the cross-correlation value between the 'pV1' and 'pV2' vectors
    corr = norm = 0.0;
    for (i = 0; i < overlapLength / 8; i ++)
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
                pV1[2] * pV2[2] +
                pV1[3] * pV2[3] +
                pV1[4] * pV2[4] +
                pV1[5] * pV2[5] +
                pV1[6] * pV2[6] +
                pV1[7] * pV2[7] +
                pV1[8] * pV2[8] +
                pV1[9] * pV2[9] +
                pV1[10] * pV2[10] +
                pV1[11] * pV2[11] +
                pV1[12] * pV2[12] +
                pV1[13] * pV2[13] +
                pV1[14] * pV2[14] +
                pV1[15] * pV2[15];

        for (j = 0; j < 16; j ++) norm += pV1[j] * pV1[j];

        pV1 += 16;
        pV2 += 16;
    }
    return corr / sqrt(norm);
    */

    /* This is the somewhat outdated corresponding routine in assembler. It may be a tiny
    bit faster than the intrinsic version, but it is more difficult to maintain and to get
    compiled on multiple platforms.

    uint overlapLengthLocal = overlapLength;
    float corr;

    _asm
    {
        // Very important note: data in 'pV2' _must_ be aligned to a
        // 16-byte boundary!

        // give the CPU prefetch hints about which data will be needed soon;
        // give more aggressive hints on pV1 as that changes between runs while
        // pV2 stays the same
        prefetcht0 [pV1]
        prefetcht0 [pV2]
        prefetcht0 [pV1 + 32]

        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2

        xorps   xmm0, xmm0

        mov     ecx, overlapLengthLocal
        shr     ecx, 3  // div by eight

    loop1:
        prefetcht0 [eax + 64]   // give a prefetch hint to the CPU about data needed soon
        prefetcht0 [ebx + 32]   // give a prefetch hint to the CPU about data needed soon
        movups  xmm1, [eax]
        mulps   xmm1, [ebx]
        addps   xmm0, xmm1

        movups  xmm2, [eax + 16]
        mulps   xmm2, [ebx + 16]
        addps   xmm0, xmm2

        prefetcht0 [eax + 96]   // give a prefetch hint to the CPU about data needed soon
        prefetcht0 [ebx + 64]   // give a prefetch hint to the CPU about data needed soon

        movups  xmm3, [eax + 32]
        mulps   xmm3, [ebx + 32]
        addps   xmm0, xmm3

        movups  xmm4, [eax + 48]
        mulps   xmm4, [ebx + 48]
        addps   xmm0, xmm4

        add     eax, 64
        add     ebx, 64

        dec     ecx
        jnz     loop1

        // add the four floats of xmm0 together and return the result

        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
        addps   xmm1, xmm0
        movaps  xmm2, xmm1
        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
        addss   xmm2, xmm1
        movss   corr, xmm2
    }

    return (double)corr;
    */
}
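
// A minimal scalar sketch (not part of the original source) of the same computation,
// including the normalization step that the commented C routine above omits; it assumes
// the same 'overlapLength' member and stereo (2 floats per sample) data as the SSE code:
//
//     double corr = 0.0, norm = 0.0;
//     for (int k = 0; k < overlapLength * 2; k ++)
//     {
//         corr += pV1[k] * pV2[k];    // cross-correlation accumulator
//         norm += pV1[k] * pV1[k];    // energy of pV1 for normalization
//     }
//     double n = sqrt(norm);
//     if (n < 1e-9) n = 1.0;          // avoid division by zero, as above
//     return corr / n;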


//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

FIRFilterSSE::FIRFilterSSE() : FIRFilter()
{
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}


FIRFilterSSE::~FIRFilterSSE()
{
    // only the unaligned pointer owns the allocation; the aligned pointer points into it
    delete[] filterCoeffsUnalign;
    filterCoeffsAlign = NULL;
    filterCoeffsUnalign = NULL;
}


// (overloaded) Calculates filter coefficients for the SSE routine
void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    float fDivider;

    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Scale the filter coefficients so that it won't be necessary to scale the filtering result;
    // also rearrange the coefficients suitably for SSE.
    // Ensure that the filter coeffs array is aligned to a 16-byte boundary.
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = new float[2 * newLength + 4];
    filterCoeffsAlign = (float *)(((unsigned long)filterCoeffsUnalign + 15) & (ulong)-16);

    fDivider = (float)resultDivider;

    // rearrange the filter coefficients for the SSE routine: duplicate each
    // coefficient for the left & right channels and pre-divide by the result divider
    for (i = 0; i < newLength; i ++)
    {
        filterCoeffsAlign[2 * i + 0] =
        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
    }
}
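
// An illustrative sketch (not part of the original source) of what the code above sets up.
// The "(address + 15) & -16" expression rounds a pointer up to the next 16-byte boundary,
// e.g. 0x1004 -> 0x1010, 0x1010 -> 0x1010; the 4 extra floats allocated above (16 bytes)
// leave room for that adjustment. For a hypothetical 4-tap example (real filters here have
// a multiple of 8 taps) with resultDivider == 2:
//
//     float taps[4] = { 8.0f, 4.0f, 2.0f, 1.0f };
//     // after the rearrangement, the aligned buffer holds each tap twice
//     // (once per stereo channel) and already divided by the divider:
//     // filterCoeffsAlign = { 4,4,  2,2,  1,1,  0.5f,0.5f }
//     // so pFil[0] = { 4, 4, 2, 2 } can multiply an interleaved L,R,L,R sample group
//     // directly, as the filter routine below does.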



// SSE-optimized version of the filter routine for stereo sound
uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
{
    int count = (int)((numSamples - length) & (uint)-2);
    int j;

    assert(count % 2 == 0);

    if (count < 2) return 0;

    assert(source != NULL);
    assert(dest != NULL);
    assert((length % 8) == 0);
    assert(filterCoeffsAlign != NULL);
    assert(((ulong)filterCoeffsAlign) % 16 == 0);

    // the filter is evaluated for two stereo samples on each iteration, thus the 'j += 2'
    for (j = 0; j < count; j += 2)
    {
        const float *pSrc;
        const __m128 *pFil;
        __m128 sum1, sum2;
        uint i;

        pSrc = (const float*)source;              // source audio data
        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: assumes the
                                                  // coefficients are aligned to a 16-byte boundary
        sum1 = sum2 = _mm_setzero_ps();

        for (i = 0; i < length / 8; i ++)
        {
            // Unroll the loop for efficiency & calculate the filter for 2*2 stereo samples
            // on each pass

            // sum1 is the accumulator for 2*2 filtered stereo samples at the primary sample offset
            // sum2 is the accumulator for 2*2 filtered stereo samples at the next sample offset

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc     ), pFil[0]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2 ), pFil[0]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4 ), pFil[1]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6 ), pFil[1]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8 ), pFil[2]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));

            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));

            pSrc += 16;
            pFil += 4;
        }

        // Now sum1 and sum2 each hold a filtered 2-channel sample, but we still need
        // to sum the hi- and lo-floats of these registers together.

        // post-shuffle & add the filtered values and store to dest
        _mm_storeu_ps(dest, _mm_add_ps(
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
                    ));
        source += 4;
        dest += 4;
    }
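
    // In scalar terms (an illustrative note, not part of the original source), the
    // shuffle/add store above writes, on each pass of the loop:
    //     dest[0] = sum1[0] + sum1[2];   // left channel,  first output sample
    //     dest[1] = sum1[1] + sum1[3];   // right channel, first output sample
    //     dest[2] = sum2[0] + sum2[2];   // left channel,  second output sample
    //     dest[3] = sum2[1] + sum2[3];   // right channel, second output sample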

    // Ideas for further improvement:
    // 1. If it could be guaranteed that 'source' were always aligned to a 16-byte
    //    boundary, the faster aligned '_mm_load_ps' instruction could be used.
    // 2. If it could be guaranteed that 'dest' were always aligned to a 16-byte
    //    boundary, the faster '_mm_store_ps' instruction could be used.

    return (uint)count;

    /* The original routine in plain C. Please note that the C version organizes the
    coefficients differently.
    double suml1, suml2;
    double sumr1, sumr2;
    uint i, j;

    for (j = 0; j < count; j += 2)
    {
        const float *ptr;
        const float *pFil;

        suml1 = sumr1 = 0.0;
        suml2 = sumr2 = 0.0;
        ptr = src;
        pFil = filterCoeffs;
        for (i = 0; i < lengthLocal; i ++)
        {
            // unroll the loop for efficiency

            suml1 += ptr[0] * pFil[0] +
                     ptr[2] * pFil[2] +
                     ptr[4] * pFil[4] +
                     ptr[6] * pFil[6];

            sumr1 += ptr[1] * pFil[1] +
                     ptr[3] * pFil[3] +
                     ptr[5] * pFil[5] +
                     ptr[7] * pFil[7];

            suml2 += ptr[8] * pFil[0] +
                     ptr[10] * pFil[2] +
                     ptr[12] * pFil[4] +
                     ptr[14] * pFil[6];

            sumr2 += ptr[9] * pFil[1] +
                     ptr[11] * pFil[3] +
                     ptr[13] * pFil[5] +
                     ptr[15] * pFil[7];

            ptr += 16;
            pFil += 8;
        }
        dest[0] = (float)suml1;
        dest[1] = (float)sumr1;
        dest[2] = (float)suml2;
        dest[3] = (float)sumr2;

        src += 4;
        dest += 4;
    }
    */


    /* A similar routine in assembly, again obsoleted due to maintainability
    _asm
    {
        // Very important note: data in 'src' _must_ be aligned to a
        // 16-byte boundary!
        mov     edx, count
        mov     ebx, dword ptr src
        mov     eax, dword ptr dest
        shr     edx, 1

    loop1:
        // "outer loop": during each round 2*2 output samples are calculated

        // give the CPU prefetch hints about which data will be needed soon
        prefetcht0 [ebx]
        prefetcht0 [filterCoeffsLocal]

        mov     esi, ebx
        mov     edi, filterCoeffsLocal
        xorps   xmm0, xmm0
        xorps   xmm1, xmm1
        mov     ecx, lengthLocal

    loop2:
        // "inner loop": during each round eight FIR filter taps are evaluated for 2*2 samples
        prefetcht0 [esi + 32]   // give a prefetch hint to the CPU about data needed soon
        prefetcht0 [edi + 32]   // give a prefetch hint to the CPU about data needed soon

        movups  xmm2, [esi]         // possibly unaligned load
        movups  xmm3, [esi + 8]     // possibly unaligned load
        mulps   xmm2, [edi]
        mulps   xmm3, [edi]
        addps   xmm0, xmm2
        addps   xmm1, xmm3

        movups  xmm4, [esi + 16]    // possibly unaligned load
        movups  xmm5, [esi + 24]    // possibly unaligned load
        mulps   xmm4, [edi + 16]
        mulps   xmm5, [edi + 16]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        prefetcht0 [esi + 64]   // give a prefetch hint to the CPU about data needed soon
        prefetcht0 [edi + 64]   // give a prefetch hint to the CPU about data needed soon

        movups  xmm6, [esi + 32]    // possibly unaligned load
        movups  xmm7, [esi + 40]    // possibly unaligned load
        mulps   xmm6, [edi + 32]
        mulps   xmm7, [edi + 32]
        addps   xmm0, xmm6
        addps   xmm1, xmm7

        movups  xmm4, [esi + 48]    // possibly unaligned load
        movups  xmm5, [esi + 56]    // possibly unaligned load
        mulps   xmm4, [edi + 48]
        mulps   xmm5, [edi + 48]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loop2

        // Now xmm0 and xmm1 each hold a filtered 2-channel sample, but we still need
        // to sum the hi- and lo-floats of these registers together.

        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
        addps   xmm0, xmm2

        movaps  [eax], xmm0
        add     ebx, 16
        add     eax, 16

        dec     edx
        jnz     loop1
    }
    */
}
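
// A minimal usage sketch (not part of the original source; variable names are hypothetical).
// FIRFilterSSE is normally instantiated by the library's own CPU-dispatch code, but in
// isolation the two routines above would be exercised roughly like this:
//
//     FIRFilterSSE filter;
//     filter.setCoefficients(coeffTable, 32, 0);   // 32 taps (must be divisible by 8);
//                                                  // the last argument is the result divider
//                                                  // factor passed on to the base class
//     // 'input' holds interleaved stereo floats; the call filters the signal, writes
//     // (numSamples - 32) & ~1 stereo output samples to 'output', and returns that count:
//     uint outSamples = filter.evaluateFilterStereo(output, input, numSamples);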

#endif  // ALLOW_SSE
