| /* |
| A C-program for MT19937, with initialization improved 2002/1/26. |
| Coded by Takuji Nishimura and Makoto Matsumoto. |
| |
| Before using, initialize the state by using init_genrand(seed) |
| or init_by_array(init_key, key_length). |
| |
| Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| |
| 1. Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| |
| 2. Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| 3. The names of its contributors may not be used to endorse or promote |
| products derived from this software without specific prior written |
| permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
| OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| |
| Any feedback is very welcome. |
| http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html |
| email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) |
| |
| Modifications for use in OpenCL by Ian Ollmann, Apple Inc. |
| |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include "mt19937.h" |
| #include "mingw_compat.h" |
| #include "harness/alloc.h" |
| |
| #ifdef __SSE2__ |
| #include <mutex> |
| #include <emmintrin.h> |
| #endif |
| |
| /* Period parameters */ |
| #define N 624 /* vector code requires multiple of 4 here */ |
| #define M 397 |
| #define MATRIX_A (cl_uint)0x9908b0dfUL /* constant vector a */ |
| #define UPPER_MASK (cl_uint)0x80000000UL /* most significant w-r bits */ |
| #define LOWER_MASK (cl_uint)0x7fffffffUL /* least significant r bits */ |
| |
| typedef struct _MTdata |
| { |
| cl_uint mt[N]; |
| #ifdef __SSE2__ |
| cl_uint cache[N]; |
| #endif |
| cl_int mti; |
| } _MTdata; |
| |
| /* initializes mt[N] with a seed */ |
| MTdata init_genrand(cl_uint s) |
| { |
| MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16); |
| if (NULL != r) |
| { |
| cl_uint *mt = r->mt; |
| int mti = 0; |
| mt[0] = s; // & 0xffffffffUL; |
| for (mti = 1; mti < N; mti++) |
| { |
| mt[mti] = (cl_uint)( |
| 1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti); |
| /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ |
| /* In the previous versions, MSBs of the seed affect */ |
| /* only MSBs of the array mt[]. */ |
| /* 2002/01/09 modified by Makoto Matsumoto */ |
| // mt[mti] &= 0xffffffffUL; |
| /* for >32 bit machines */ |
| } |
| r->mti = mti; |
| } |
| |
| return r; |
| } |
| |
| void free_mtdata(MTdata d) |
| { |
| if (d) align_free(d); |
| } |
| |
| /* generates a random number on [0,0xffffffff]-interval */ |
| cl_uint genrand_int32(MTdata d) |
| { |
| /* mag01[x] = x * MATRIX_A for x=0,1 */ |
| static const cl_uint mag01[2] = { 0x0UL, MATRIX_A }; |
| #ifdef __SSE2__ |
| static std::once_flag init_flag; |
| static union { |
| __m128i v; |
| cl_uint s[4]; |
| } upper_mask, lower_mask, one, matrix_a, c0, c1; |
| #endif |
| |
| |
| cl_uint *mt = d->mt; |
| cl_uint y; |
| |
| if (d->mti == N) |
| { /* generate N words at one time */ |
| int kk; |
| |
| #ifdef __SSE2__ |
| auto init_fn = []() { |
| upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] = |
| upper_mask.s[3] = UPPER_MASK; |
| lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] = |
| lower_mask.s[3] = LOWER_MASK; |
| one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1; |
| matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] = |
| MATRIX_A; |
| c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL; |
| c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL; |
| }; |
| std::call_once(init_flag, init_fn); |
| #endif |
| |
| kk = 0; |
| #ifdef __SSE2__ |
| // vector loop |
| for (; kk + 4 <= N - M; kk += 4) |
| { |
| // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK)) |
| __m128i vy = _mm_or_si128( |
| _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)), |
| upper_mask.v), |
| _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)), |
| lower_mask.v)); |
| |
| // y & 1 ? -1 : 0 |
| __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v); |
| // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL] |
| __m128i vmag01 = _mm_and_si128(mask, matrix_a.v); |
| // mt[kk+M] ^ (y >> 1) |
| __m128i vr = |
| _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)), |
| (__m128i)_mm_srli_epi32(vy, 1)); |
| // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL] |
| vr = _mm_xor_si128(vr, vmag01); |
| _mm_store_si128((__m128i *)(mt + kk), vr); |
| } |
| #endif |
| for (; kk < N - M; kk++) |
| { |
| y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK)); |
| mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL]; |
| } |
| |
| #ifdef __SSE2__ |
| // advance to next aligned location |
| for (; kk < N - 1 && (kk & 3); kk++) |
| { |
| y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK)); |
| mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL]; |
| } |
| |
| // vector loop |
| for (; kk + 4 <= N - 1; kk += 4) |
| { |
| __m128i vy = _mm_or_si128( |
| _mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)), |
| upper_mask.v), |
| // ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK)) |
| _mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)), |
| lower_mask.v)); |
| |
| // y & 1 ? -1 : 0 |
| __m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v); |
| // y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL] |
| __m128i vmag01 = _mm_and_si128(mask, matrix_a.v); |
| // mt[kk+M-N] ^ (y >> 1) |
| __m128i vr = |
| _mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)), |
| _mm_srli_epi32(vy, 1)); |
| // mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL] |
| vr = _mm_xor_si128(vr, vmag01); |
| _mm_store_si128((__m128i *)(mt + kk), vr); |
| } |
| #endif |
| |
| for (; kk < N - 1; kk++) |
| { |
| y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK)); |
| mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL]; |
| } |
| y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK)); |
| mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL]; |
| |
| #ifdef __SSE2__ |
| // Do the tempering ahead of time in vector code |
| for (kk = 0; kk + 4 <= N; kk += 4) |
| { |
| // y = mt[k]; |
| __m128i vy = _mm_load_si128((__m128i *)(mt + kk)); |
| // y ^= (y >> 11); |
| vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11)); |
| // y ^= (y << 7) & (cl_uint) 0x9d2c5680UL; |
| vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v)); |
| // y ^= (y << 15) & (cl_uint) 0xefc60000UL; |
| vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v)); |
| // y ^= (y >> 18); |
| vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18)); |
| _mm_store_si128((__m128i *)(d->cache + kk), vy); |
| } |
| #endif |
| |
| d->mti = 0; |
| } |
| #ifdef __SSE2__ |
| y = d->cache[d->mti++]; |
| #else |
| y = mt[d->mti++]; |
| |
| /* Tempering */ |
| y ^= (y >> 11); |
| y ^= (y << 7) & (cl_uint)0x9d2c5680UL; |
| y ^= (y << 15) & (cl_uint)0xefc60000UL; |
| y ^= (y >> 18); |
| #endif |
| |
| |
| return y; |
| } |
| |
| cl_ulong genrand_int64(MTdata d) |
| { |
| return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d); |
| } |
| |
| /* generates a random number on [0,1]-real-interval */ |
| double genrand_real1(MTdata d) |
| { |
| return genrand_int32(d) * (1.0 / 4294967295.0); |
| /* divided by 2^32-1 */ |
| } |
| |
| /* generates a random number on [0,1)-real-interval */ |
| double genrand_real2(MTdata d) |
| { |
| return genrand_int32(d) * (1.0 / 4294967296.0); |
| /* divided by 2^32 */ |
| } |
| |
| /* generates a random number on (0,1)-real-interval */ |
| double genrand_real3(MTdata d) |
| { |
| return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0); |
| /* divided by 2^32 */ |
| } |
| |
| /* generates a random number on [0,1) with 53-bit resolution*/ |
| double genrand_res53(MTdata d) |
| { |
| unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6; |
| return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); |
| } |
| |
| bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); } |