OgreSIMDHelper.h
Go to the documentation of this file.
00001 /*
00002 -----------------------------------------------------------------------------
00003 This source file is part of OGRE
00004     (Object-oriented Graphics Rendering Engine)
00005 For the latest info, see http://www.ogre3d.org/
00006 
00007 Copyright (c) 2000-2012 Torus Knot Software Ltd
00008 
00009 Permission is hereby granted, free of charge, to any person obtaining a copy
00010 of this software and associated documentation files (the "Software"), to deal
00011 in the Software without restriction, including without limitation the rights
00012 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00013 copies of the Software, and to permit persons to whom the Software is
00014 furnished to do so, subject to the following conditions:
00015 
00016 The above copyright notice and this permission notice shall be included in
00017 all copies or substantial portions of the Software.
00018 
00019 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00020 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00021 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00022 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00023 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00024 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00025 THE SOFTWARE.
00026 -----------------------------------------------------------------------------
00027 */
00028 #ifndef __SIMDHelper_H__
00029 #define __SIMDHelper_H__
00030 
00031 #include "OgrePrerequisites.h"
00032 #include "OgrePlatformInformation.h"
00033 
// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is
// required to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee the callee stack pointer (esp) is
// aligned to a 16-byte boundary, not the frame pointer (ebp). Because
// most compilers may use the frame pointer to access stack variables,
// you need to wrap alignment-required functions with an extra
// function call.
//
#if defined(__INTEL_COMPILER)
// For intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
// Mark functions with a GCC attribute to force stack alignment to 16 bytes.
// Only needed for 32-bit x86 (the condition above excludes 64-bit builds).
#define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#else
// Unknown toolchain: assume no special handling is required (or available).
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#endif
00064 
00065 
00066 // Additional platform-dependent header files and declares.
00067 //
00068 // NOTE: Should be sync with __OGRE_HAVE_SSE macro.
00069 //
00070 
00071 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
00072 
00073 // GCC version 4.0 upwards should be reliable for official SSE now,
00074 // so no longer define SSE macros ourselves
00075 // We don't support gcc 3.x anymore anyway, although that had SSE it was a bit flaky?
00076 #include <xmmintrin.h>
00077 
00078 
00079 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
00080 
00081 
00082 
00083 //---------------------------------------------------------------------
00084 // SIMD macros and helpers
00085 //---------------------------------------------------------------------
00086 
00087 
00088 namespace Ogre {
00096 #if __OGRE_HAVE_SSE
00097 
// Reciprocal square root: by default use the raw hardware approximation
// (_mm_rsqrt_ps); flip the "#if" to 0 to use the Newton-Raphson refined
// version (__mm_rsqrt_nr_ps, implemented below) when more precision is
// needed at the cost of a few extra multiplies.
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif
00113 
/** Transpose a 4x4 matrix held in four __m128 rows, in place.
    Wrapped in do { } while (0) so the macro expands to a single
    statement and stays safe inside an unbraced if/else followed
    by a ';' (the original bare { } block broke there).
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
    do {                                                                                \
        __m128 tmp3, tmp2, tmp1, tmp0;                                                  \
                                                                                        \
                                                            /* r00 r01 r02 r03 */       \
                                                            /* r10 r11 r12 r13 */       \
                                                            /* r20 r21 r22 r23 */       \
                                                            /* r30 r31 r32 r33 */       \
                                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */     \
        tmp2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */     \
        tmp1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */     \
        tmp3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */     \
                                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);                         /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(tmp1, tmp0);                         /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(tmp2, tmp3);                         /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(tmp3, tmp2);                         /* r03 r13 r23 r33 */   \
    } while (0)
00141 
/** Transpose four packed 3-vectors (12 floats in three __m128) into
    three 4-lane column vectors, in place.
    Wrapped in do { } while (0) so the macro expands to a single
    statement and stays safe inside an unbraced if/else followed
    by a ';' (the original bare { } block broke there).
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
    do {                                                                                \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r01 r02 r10 */       \
                                                            /* r11 r12 r20 r21 */       \
                                                            /* r22 r30 r31 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */     \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */     \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */     \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    } while (0)
00166 
/** Transpose three 4-lane column vectors back into four packed
    3-vectors (the inverse of __MM_TRANSPOSE4x3_PS), in place.
    Wrapped in do { } while (0) so the macro expands to a single
    statement and stays safe inside an unbraced if/else followed
    by a ';' (the original bare { } block broke there).
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    do {                                                                            \
        __m128 tmp0, tmp1, tmp2;                                                    \
                                                                                    \
                                                            /* r00 r10 r20 r30 */   \
                                                            /* r01 r11 r21 r31 */   \
                                                            /* r02 r12 r22 r32 */   \
                                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    } while (0)
00190 
/** Broadcast a single element of v (selected by fp, 0..3) to all lanes.
    All macro parameters below are parenthesised in the expansion so the
    macros stay correct for compound argument expressions.
*/
#define __MM_SELECT(v, fp)                                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/** Accumulate four vectors: a + b + c + d. */
#define __MM_ACCUM4_PS(a, b, c, d)                                                  \
    _mm_add_ps(_mm_add_ps((a), (b)), _mm_add_ps((c), (d)))

/** Per-lane sum of four products: a0*b0 + a1*b1 + a2*b2 + a3*b3. */
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
    __MM_ACCUM4_PS(_mm_mul_ps((a0), (b0)), _mm_mul_ps((a1), (b1)), _mm_mul_ps((a2), (b2)), _mm_mul_ps((a3), (b3)))

/** Per-lane r0*v0 + r1*v1 + r2*v2 + r3 (r3 added as-is, i.e. implicit w = 1). */
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps((r0), (v0)), _mm_mul_ps((r1), (v1)), _mm_mul_ps((r2), (v2)), (r3))

/** Accumulate three vectors: a + b + c. */
#define __MM_ACCUM3_PS(a, b, c)                                                     \
    _mm_add_ps(_mm_add_ps((a), (b)), (c))

/** Per-lane r0*v0 + r1*v1 + r2*v2. */
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __MM_ACCUM3_PS(_mm_mul_ps((r0), (v0)), _mm_mul_ps((r1), (v1)), _mm_mul_ps((r2), (v2)))

/** Packed multiply-add: a * b + c. */
#define __MM_MADD_PS(a, b, c)                                                       \
    _mm_add_ps(_mm_mul_ps((a), (b)), (c))

/** Packed linear interpolation: a + t * (b - a). */
#define __MM_LERP_PS(t, a, b)                                                       \
    __MM_MADD_PS(_mm_sub_ps((b), (a)), (t), (a))

/** Scalar (lowest-lane) multiply-add: a * b + c. */
#define __MM_MADD_SS(a, b, c)                                                       \
    _mm_add_ss(_mm_mul_ss((a), (b)), (c))

/** Scalar (lowest-lane) linear interpolation: a + t * (b - a). */
#define __MM_LERP_SS(t, a, b)                                                       \
    __MM_MADD_SS(_mm_sub_ss((b), (a)), (t), (a))

/** Load four floats from a 16-byte aligned address.
    NOTE: deliberately an lvalue expression (not _mm_load_ps) — the aligned
    SSEMemoryAccessor binds a const reference to this expansion, so keep it
    as a dereference.  p MUST be 16-byte aligned.
*/
#define __MM_LOAD_PS(p)                                                             \
    (*(__m128*)(p))

/** Store a vector to a 16-byte aligned address (p MUST be 16-byte aligned). */
#define __MM_STORE_PS(p, v)                                                         \
    (*(__m128*)(p) = (v))
00247 
    /** Helper for loading / storing SSE data, selected at compile time by
        whether the pointed-to memory is known to be 16-byte aligned.
        This default (aligned = false) variant uses the unaligned
        load/store intrinsics, which work for any address.
    */
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        // Load four floats from a possibly unaligned address (by value).
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        // Store four floats to a possibly unaligned address.
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
00262     // Special aligned accessor
00263     template <>
00264     struct SSEMemoryAccessor<true>
00265     {
00266         static FORCEINLINE const __m128& load(const float *p)
00267         {
00268             return __MM_LOAD_PS(p);
00269         }
00270         static FORCEINLINE void store(float *p, const __m128& v)
00271         {
00272             __MM_STORE_PS(p, v);
00273         }
00274     };
00275 
00278     static FORCEINLINE bool _isAlignedForSSE(const void *p)
00279     {
00280         return (((size_t)p) & 15) == 0;
00281     }
00282 
00286     static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
00287     {
00288         static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
00289         static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
00290         __m128 t = _mm_rsqrt_ps(x);
00291         return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
00292             _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
00293     }
00294 
// Macro to check the stack is aligned for SSE (debug builds only):
// the compiler must place a __m128 local on a 16-byte boundary, so a
// misaligned address for it indicates a misaligned stack.
// Wrapped in do { } while (0) so the debug expansion behaves as a
// single statement inside unbraced if/else (the bare { } form broke
// there); the release expansion is empty.
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    do {                                            \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    } while (0)

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE
00307 
00308 
00309 #endif  // __OGRE_HAVE_SSE
00310 
00313 }
00314 
00315 #endif // __SIMDHelper_H__

Copyright © 2012 Torus Knot Software Ltd
Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Last modified Sun Sep 2 2012 07:27:24