1 |
//////////////////////////////////////////////////////////////////////////////// |
2 |
/// |
3 |
/// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
/// processors. All 3DNow! optimized functions have been gathered into this
/// single source code file, regardless of their class or original source code
/// file, in order to ease porting the library to other compiler and processor
/// platforms.
8 |
/// |
9 |
/// By the way, the performance gain depends heavily on the CPU generation: on
/// K6-2 these routines provided a speed-up of as much as 2.4 times, while on
/// Athlon the difference to the original routines stayed at an unremarkable 8%!
/// Such a small improvement on Athlon is because 3DNow! can perform only two
/// operations in parallel, and obviously the Athlon FPU is also doing a very
/// good job with the standard C floating point routines! Here these routines
/// are anyway, although it might not be worth the effort to convert these to
/// the GCC platform, for Athlon CPUs at least. The situation is different
/// regarding the SSE optimizations though, thanks to the four parallel
/// operations of SSE that already make a difference.
19 |
/// |
20 |
/// This file is to be compiled in Windows platform with Microsoft Visual C++ |
21 |
/// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all |
22 |
/// GNU platforms (if file supplied). |
23 |
/// |
24 |
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ |
25 |
/// 6.0 processor pack" update to support 3DNow! instruction set. The update is |
26 |
/// available for download at Microsoft Developers Network, see here: |
27 |
/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx |
28 |
/// |
29 |
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and |
30 |
/// perform a search with keywords "processor pack". |
31 |
/// |
32 |
/// Author : Copyright (c) Olli Parviainen |
33 |
/// Author e-mail : oparviai 'at' iki.fi |
34 |
/// SoundTouch WWW: http://www.surina.net/soundtouch |
35 |
/// |
36 |
//////////////////////////////////////////////////////////////////////////////// |
37 |
// |
38 |
// Last changed : $Date: 2009-02-21 18:00:14 +0200 (Sat, 21 Feb 2009) $ |
39 |
// File revision : $Revision: 4 $ |
40 |
// |
41 |
// $Id: 3dnow_win.cpp 63 2009-02-21 16:00:14Z oparviai $ |
42 |
// |
43 |
//////////////////////////////////////////////////////////////////////////////// |
44 |
// |
45 |
// License : |
46 |
// |
47 |
// SoundTouch audio processing library |
48 |
// Copyright (c) Olli Parviainen |
49 |
// |
50 |
// This library is free software; you can redistribute it and/or |
51 |
// modify it under the terms of the GNU Lesser General Public |
52 |
// License as published by the Free Software Foundation; either |
53 |
// version 2.1 of the License, or (at your option) any later version. |
54 |
// |
55 |
// This library is distributed in the hope that it will be useful, |
56 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
57 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
58 |
// Lesser General Public License for more details. |
59 |
// |
60 |
// You should have received a copy of the GNU Lesser General Public |
61 |
// License along with this library; if not, write to the Free Software |
62 |
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
63 |
// |
64 |
//////////////////////////////////////////////////////////////////////////////// |
65 |
|
66 |
#include "cpu_detect.h" |
67 |
#include "STTypes.h" |
68 |
|
69 |
#ifndef WIN32 |
70 |
#error "wrong platform - this source code file is exclusively for Win32 platform" |
71 |
#endif |
72 |
|
73 |
using namespace soundtouch; |
74 |
|
75 |
#ifdef ALLOW_3DNOW |
76 |
// 3DNow! routines available only with float sample type |
77 |
|
78 |
////////////////////////////////////////////////////////////////////////////// |
79 |
// |
80 |
// implementation of 3DNow! optimized functions of class 'TDStretch3DNow' |
81 |
// |
82 |
////////////////////////////////////////////////////////////////////////////// |
83 |
|
84 |
#include "TDStretch.h" |
85 |
|
86 |
|
87 |
// Calculates cross correlation of two buffers |
88 |
double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const |
89 |
{ |
90 |
int overlapLengthLocal = overlapLength; |
91 |
float corr = 0; |
92 |
|
93 |
// Calculates the cross-correlation value between 'pV1' and 'pV2' vectors |
94 |
/* |
95 |
c-pseudocode: |
96 |
|
97 |
corr = 0; |
98 |
for (i = 0; i < overlapLength / 4; i ++) |
99 |
{ |
100 |
corr += pV1[0] * pV2[0]; |
101 |
pV1[1] * pV2[1]; |
102 |
pV1[2] * pV2[2]; |
103 |
pV1[3] * pV2[3]; |
104 |
pV1[4] * pV2[4]; |
105 |
pV1[5] * pV2[5]; |
106 |
pV1[6] * pV2[6]; |
107 |
pV1[7] * pV2[7]; |
108 |
|
109 |
pV1 += 8; |
110 |
pV2 += 8; |
111 |
} |
112 |
*/ |
113 |
|
114 |
_asm |
115 |
{ |
116 |
// give prefetch hints to CPU of what data are to be needed soonish. |
117 |
// give more aggressive hints on pV1 as that changes more between different calls |
118 |
// while pV2 stays the same. |
119 |
prefetch [pV1] |
120 |
prefetch [pV2] |
121 |
prefetch [pV1 + 32] |
122 |
|
123 |
mov eax, dword ptr pV2 |
124 |
mov ebx, dword ptr pV1 |
125 |
|
126 |
pxor mm0, mm0 |
127 |
|
128 |
mov ecx, overlapLengthLocal |
129 |
shr ecx, 2 // div by four |
130 |
|
131 |
loop1: |
132 |
movq mm1, [eax] |
133 |
prefetch [eax + 32] // give a prefetch hint to CPU what data are to be needed soonish |
134 |
pfmul mm1, [ebx] |
135 |
prefetch [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish |
136 |
|
137 |
movq mm2, [eax + 8] |
138 |
pfadd mm0, mm1 |
139 |
pfmul mm2, [ebx + 8] |
140 |
|
141 |
movq mm3, [eax + 16] |
142 |
pfadd mm0, mm2 |
143 |
pfmul mm3, [ebx + 16] |
144 |
|
145 |
movq mm4, [eax + 24] |
146 |
pfadd mm0, mm3 |
147 |
pfmul mm4, [ebx + 24] |
148 |
|
149 |
add eax, 32 |
150 |
pfadd mm0, mm4 |
151 |
add ebx, 32 |
152 |
|
153 |
dec ecx |
154 |
jnz loop1 |
155 |
|
156 |
// add halfs of mm0 together and return the result. |
157 |
// note: mm1 is used as a dummy parameter only, we actually don't care about it's value |
158 |
pfacc mm0, mm1 |
159 |
movd corr, mm0 |
160 |
femms |
161 |
} |
162 |
|
163 |
return corr; |
164 |
} |
165 |
|
166 |
|
167 |
|
168 |
|
169 |
////////////////////////////////////////////////////////////////////////////// |
170 |
// |
171 |
// implementation of 3DNow! optimized functions of class 'FIRFilter' |
172 |
// |
173 |
////////////////////////////////////////////////////////////////////////////// |
174 |
|
175 |
#include "FIRFilter.h" |
176 |
|
177 |
// Constructor: starts without any coefficient buffer; both the raw allocation
// pointer and the aligned pointer derived from it are cleared.
FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
{
    filterCoeffsAlign = filterCoeffsUnalign = NULL;
}
182 |
|
183 |
|
184 |
// Destructor: frees the coefficient buffer. Only the unaligned pointer owns
// memory; the aligned pointer merely points into the same allocation, so it is
// just cleared.
FIRFilter3DNow::~FIRFilter3DNow()
{
    delete[] filterCoeffsUnalign;
    filterCoeffsAlign = filterCoeffsUnalign = NULL;
}
190 |
|
191 |
|
192 |
// (overloaded) Calculates filter coefficients for 3DNow! routine |
193 |
void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor) |
194 |
{ |
195 |
uint i; |
196 |
float fDivider; |
197 |
|
198 |
FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); |
199 |
|
200 |
// Scale the filter coefficients so that it won't be necessary to scale the filtering result |
201 |
// also rearrange coefficients suitably for 3DNow! |
202 |
// Ensure that filter coeffs array is aligned to 16-byte boundary |
203 |
delete[] filterCoeffsUnalign; |
204 |
filterCoeffsUnalign = new float[2 * newLength + 4]; |
205 |
filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & (uint)-16); |
206 |
|
207 |
fDivider = (float)resultDivider; |
208 |
|
209 |
// rearrange the filter coefficients for mmx routines |
210 |
for (i = 0; i < newLength; i ++) |
211 |
{ |
212 |
filterCoeffsAlign[2 * i + 0] = |
213 |
filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider; |
214 |
} |
215 |
} |
216 |
|
217 |
|
218 |
// 3DNow!-optimized version of the filter routine for stereo sound |
219 |
uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, uint numSamples) const |
220 |
{ |
221 |
float *filterCoeffsLocal = filterCoeffsAlign; |
222 |
uint count = (numSamples - length) & (uint)-2; |
223 |
uint lengthLocal = length / 4; |
224 |
|
225 |
assert(length != 0); |
226 |
assert(count % 2 == 0); |
227 |
|
228 |
/* original code: |
229 |
|
230 |
double suml1, suml2; |
231 |
double sumr1, sumr2; |
232 |
uint i, j; |
233 |
|
234 |
for (j = 0; j < count; j += 2) |
235 |
{ |
236 |
const float *ptr; |
237 |
|
238 |
suml1 = sumr1 = 0.0; |
239 |
suml2 = sumr2 = 0.0; |
240 |
ptr = src; |
241 |
filterCoeffsLocal = filterCoeffs; |
242 |
for (i = 0; i < lengthLocal; i ++) |
243 |
{ |
244 |
// unroll loop for efficiency. |
245 |
|
246 |
suml1 += ptr[0] * filterCoeffsLocal[0] + |
247 |
ptr[2] * filterCoeffsLocal[2] + |
248 |
ptr[4] * filterCoeffsLocal[4] + |
249 |
ptr[6] * filterCoeffsLocal[6]; |
250 |
|
251 |
sumr1 += ptr[1] * filterCoeffsLocal[1] + |
252 |
ptr[3] * filterCoeffsLocal[3] + |
253 |
ptr[5] * filterCoeffsLocal[5] + |
254 |
ptr[7] * filterCoeffsLocal[7]; |
255 |
|
256 |
suml2 += ptr[8] * filterCoeffsLocal[0] + |
257 |
ptr[10] * filterCoeffsLocal[2] + |
258 |
ptr[12] * filterCoeffsLocal[4] + |
259 |
ptr[14] * filterCoeffsLocal[6]; |
260 |
|
261 |
sumr2 += ptr[9] * filterCoeffsLocal[1] + |
262 |
ptr[11] * filterCoeffsLocal[3] + |
263 |
ptr[13] * filterCoeffsLocal[5] + |
264 |
ptr[15] * filterCoeffsLocal[7]; |
265 |
|
266 |
ptr += 16; |
267 |
filterCoeffsLocal += 8; |
268 |
} |
269 |
dest[0] = (float)suml1; |
270 |
dest[1] = (float)sumr1; |
271 |
dest[2] = (float)suml2; |
272 |
dest[3] = (float)sumr2; |
273 |
|
274 |
src += 4; |
275 |
dest += 4; |
276 |
} |
277 |
|
278 |
*/ |
279 |
_asm |
280 |
{ |
281 |
mov eax, dword ptr dest |
282 |
mov ebx, dword ptr src |
283 |
mov edx, count |
284 |
shr edx, 1 |
285 |
|
286 |
loop1: |
287 |
// "outer loop" : during each round 2*2 output samples are calculated |
288 |
prefetch [ebx] // give a prefetch hint to CPU what data are to be needed soonish |
289 |
prefetch [filterCoeffsLocal] // give a prefetch hint to CPU what data are to be needed soonish |
290 |
|
291 |
mov esi, ebx |
292 |
mov edi, filterCoeffsLocal |
293 |
pxor mm0, mm0 |
294 |
pxor mm1, mm1 |
295 |
mov ecx, lengthLocal |
296 |
|
297 |
loop2: |
298 |
// "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples |
299 |
movq mm2, [edi] |
300 |
movq mm3, mm2 |
301 |
prefetch [edi + 32] // give a prefetch hint to CPU what data are to be needed soonish |
302 |
pfmul mm2, [esi] |
303 |
prefetch [esi + 32] // give a prefetch hint to CPU what data are to be needed soonish |
304 |
pfmul mm3, [esi + 8] |
305 |
|
306 |
movq mm4, [edi + 8] |
307 |
movq mm5, mm4 |
308 |
pfadd mm0, mm2 |
309 |
pfmul mm4, [esi + 8] |
310 |
pfadd mm1, mm3 |
311 |
pfmul mm5, [esi + 16] |
312 |
|
313 |
movq mm2, [edi + 16] |
314 |
movq mm6, mm2 |
315 |
pfadd mm0, mm4 |
316 |
pfmul mm2, [esi + 16] |
317 |
pfadd mm1, mm5 |
318 |
pfmul mm6, [esi + 24] |
319 |
|
320 |
movq mm3, [edi + 24] |
321 |
movq mm7, mm3 |
322 |
pfadd mm0, mm2 |
323 |
pfmul mm3, [esi + 24] |
324 |
pfadd mm1, mm6 |
325 |
pfmul mm7, [esi + 32] |
326 |
add esi, 32 |
327 |
pfadd mm0, mm3 |
328 |
add edi, 32 |
329 |
pfadd mm1, mm7 |
330 |
|
331 |
dec ecx |
332 |
jnz loop2 |
333 |
|
334 |
movq [eax], mm0 |
335 |
add ebx, 16 |
336 |
movq [eax + 8], mm1 |
337 |
add eax, 16 |
338 |
|
339 |
dec edx |
340 |
jnz loop1 |
341 |
|
342 |
femms |
343 |
} |
344 |
|
345 |
return count; |
346 |
} |
347 |
|
348 |
|
349 |
#endif // ALLOW_3DNOW |