patx/mrhttp-asgi

Rand gbench

Commit ca88a0e · Mark Reed · 2024-04-13T15:05:55-07:00
Changeset: ca88a0ea28b236af3e4e4672986d3929e2183bc4
Parents: 6826f12c65b10e79739fa860d7e4bc8e589acf7a
Comments

No comments yet.
Diff

diff --git a/gbench/bld b/gbench/bld
index 72ba4a5..d0e360a 100755
--- a/gbench/bld
+++ b/gbench/bld
@@ -1,7 +1,8 @@
 #
 #g++ t.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o t
 #g++ tst.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o tst
-g++ parse.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o parse
+#g++ parse.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o parse
 #g++ string.cpp -O0 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o string
 #g++ query.cpp -O0 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o query
+g++ rand.cpp -O3 -msse4.2 -mavx2 -std=c++11 -lbenchmark -lpthread -o orand
 
diff --git a/gbench/fastPRNG.h b/gbench/fastPRNG.h
new file mode 100755
index 0000000..bb09e53
--- /dev/null
+++ b/gbench/fastPRNG.h
@@ -0,0 +1,690 @@
+//------------------------------------------------------------------------------
+//  Copyright (c) 2018-2020 Michele Morrone
+//  All rights reserved.
+//
+//  https://michelemorrone.eu - https://BrutPitt.com
+//
+//  twitter: https://twitter.com/BrutPitt - github: https://github.com/BrutPitt
+//
+//  mailto:[email protected] - mailto:[email protected]
+//  
+//  This software is distributed under the terms of the BSD 2-Clause license
+//------------------------------------------------------------------------------
+#pragma once
+
+#include <stdint.h>
+#include <chrono>
+#include <type_traits>
+#include <cfloat>
+
+namespace fastPRNG {
+#define UNI_32BIT_INV 2.3283064365386962890625e-10
+#define VNI_32BIT_INV 4.6566128730773925781250e-10   // UNI_32BIT_INV * 2
+
+#define UNI_64BIT_INV 5.42101086242752217003726400434970e-20
+#define VNI_64BIT_INV 1.08420217248550443400745280086994e-19 // UNI_64BIT_INV * 2
+
+#define FPRNG_SEED_INIT64 std::chrono::system_clock::now().time_since_epoch().count()
+#define FPRNG_SEED_INIT32 FPRNG_SEED_INIT64
+
+inline static uint32_t splitMix32(const uint32_t val) {
+    uint32_t z = val + 0x9e3779b9;
+    z ^= z >> 15; // 16 for murmur3
+    z *= 0x85ebca6b;
+    z ^= z >> 13;
+    z *= 0xc2b2ae35;
+    return z ^ (z >> 16);
+}
+
+inline static uint64_t splitMix64(const uint64_t val) {
+    uint64_t z = val    + 0x9e3779b97f4a7c15;
+    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
+    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
+    return z ^ (z >> 31);
+}
+
+// 32/64 bit rotation func
+template <typename T> inline static T rotl(const T x, const int k) { return (x << k) | (x >> (sizeof(T)*8 - k)); } // sizeof*8 is resolved to compile-time
+
+/*--------------------------------------------------------------------------
+ 32bit PRNG Algorithms: xoshiro / xoroshiro
+
+ xoshiro256+ / xoshiro256++ / xoshiro256**
+ xoroshiro128+ / xoroshiro128++ / xoroshiro128**
+
+ Algorithms by David Blackman and Sebastiano Vigna
+ http://prng.di.unimi.it/
+
+ To the extent possible under law, the author has dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+
+ See <http://creativecommons.org/publicdomain/zero/1.0/>.
+-------------------------------------------------------------------------- */
+#define XOSHIRO128\
+    const uint32_t t = s1 << 9;\
+    s2 ^= s0;\
+    s3 ^= s1;\
+    s1 ^= s2;\
+    s0 ^= s3;\
+    s2 ^= t;\
+    s3 = rotl<uint32_t>(s3, 11);\
+    return result;
+
+#define XOROSHIRO64\
+    s1 ^= s0;\
+    s0 = rotl<uint32_t>(s0, 26) ^ s1 ^ (s1 << 9);\
+    s1 = rotl<uint32_t>(s1, 13);\
+    return result;
+
+#define XORSHIFT32\
+    s0 ^= s0 << 13;\
+    s0 ^= s0 >> 17;\
+    s0 ^= s0 << 5;\
+    return s0;
+
+#define XOSHIRO128_STATIC(FUNC)\
+    static const uint32_t seed = uint32_t(FPRNG_SEED_INIT32);\
+    static uint32_t s0 = splitMix32(seed), s1 = splitMix32(s0), s2 = splitMix32(s1), s3 = splitMix32(s2);\
+    FUNC; XOSHIRO128
+
+#define XOROSHIRO64_STATIC(FUNC)\
+    static const uint32_t seed = uint32_t(FPRNG_SEED_INIT32);\
+    static uint32_t s0 = splitMix32(seed), s1 = splitMix32(s0);\
+    FUNC; XOROSHIRO64
+
+#define XORSHIFT32_STATIC\
+    static uint32_t s0 = uint32_t(FPRNG_SEED_INIT32);\
+    XORSHIFT32
+
+// fastXS32
+//
+// 32bit pseudo-random generator
+// All integer values are returned in interval [0, UINT32_MAX]
+// to get values between [INT32_MIN, INT32_MAX] just cast result to int32_t
+///////////////////////////////////////////////////////////////////////////////
+class fastXS32
+{
+public:
+    fastXS32(const uint32_t seedVal = uint32_t(FPRNG_SEED_INIT32)) { seed(seedVal); }
+
+    inline uint32_t xoshiro128p()  { return xoshiro128(s0 + s3); }
+    inline uint32_t xoshiro128pp() { return xoshiro128(rotl<uint32_t>(s0 + s3, 7) + s0); }
+    inline uint32_t xoshiro128xx() { return xoshiro128(rotl<uint32_t>(s1 * 5, 7) * 9); }
+
+    template <typename T> inline T xoshiro128p_UNI() { return T(        xoshiro128p() ) * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xoshiro128p_VNI() { return T(int32_t(xoshiro128p())) * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xoshiro128p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoshiro128p_UNI<T>(); }
+
+    inline uint32_t xoroshiro64x()  { return xoroshiro64(               s0 * 0x9E3779BB); }
+    inline uint32_t xoroshiro64xx() { return xoroshiro64(rotl<uint32_t>(s0 * 0x9E3779BB, 5) * 5); }
+
+    template <typename T> inline T xoroshiro64x_UNI() { return T(         xoroshiro64x() ) * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xoroshiro64x_VNI() { return T(int32_t(xoroshiro64x()))  * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xoroshiro64x_Range(T min, T max)                                           // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoroshiro64x_UNI<T>(); }
+
+    inline uint32_t xorShift() { XORSHIFT32 } // Marsaglia xorShift: period 2^32-1
+
+    template <typename T> inline T xorShift_UNI() { return         xorShift()  * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xorShift_VNI() { return int32_t(xorShift()) * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xorShift_Range(T min, T max)                                   // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xorShift_UNI<T>(); }
+
+    void seed(const uint32_t seedVal = uint32_t(FPRNG_SEED_INIT32)) {
+        s0 = splitMix32(seedVal);
+        s1 = splitMix32(s0);
+        s2 = splitMix32(s1);
+        s3 = splitMix32(s2);
+    }
+    
+private:
+    inline uint32_t xoroshiro64(const uint32_t result)  { XOROSHIRO64 }
+    inline uint32_t xoshiro128(const uint32_t result)   { XOSHIRO128  }
+
+    uint32_t s0, s1, s2, s3;
+};
+
+// fastXS32s - static members
+//      you can call directly w/o declaration, but..
+//      N.B. all members/functions share same seed, and subsequents xor & shift
+//           operations on it, if you need different seeds declare more
+//           fastXS32 (non static) objects
+//
+// 32bit pseudo-random generator
+// All integer values are returned in interval [0, UINT32_MAX]
+// to get values between [INT32_MIN, INT32_MAX] just cast result to int32_t
+///////////////////////////////////////////////////////////////////////////////
+class fastXS32s
+{
+public:
+    fastXS32s() = default;
+
+    inline static uint32_t xoshiro128p()  { XOSHIRO128_STATIC(const uint32_t result = s0 + s3)               }
+    inline static uint32_t xoshiro128pp() { XOSHIRO128_STATIC(const uint32_t result = rotl<uint32_t>(s0 + s3, 7) + s0) }
+    inline static uint32_t xoshiro128xx() { XOSHIRO128_STATIC(const uint32_t result = rotl<uint32_t>(s1 * 5, 7) * 9)   }
+
+    template <typename T> inline static T xoshiro128p_UNI() { return T(        xoshiro128p() ) * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xoshiro128p_VNI() { return T(int32_t(xoshiro128p())) * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xoshiro128p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoshiro128p_UNI<T>(); }
+
+    inline static uint32_t xoroshiro64x()  { XOROSHIRO64_STATIC(const uint32_t result =                s0 * 0x9E3779BB)         }
+    inline static uint32_t xoroshiro64xx() { XOROSHIRO64_STATIC(const uint32_t result = rotl<uint32_t>(s0 * 0x9E3779BB, 5) * 5) }
+
+    template <typename T> inline static T xoroshiro64x_UNI() { return T(         xoroshiro64x() ) * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xoroshiro64x_VNI() { return T(int32_t(xoroshiro64x()))  * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xoroshiro64x_Range(T min, T max)                                           // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoroshiro64x_UNI<T>(); }
+
+    inline static uint32_t xorShift() { XORSHIFT32_STATIC } //Marsaglia xorShift: period 2^32-1
+
+    template <typename T> inline static T xorShift_UNI() { return         xorShift()  * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xorShift_VNI() { return int32_t(xorShift()) * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xorShift_Range(T min, T max)                                   // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xorShift_UNI<T>(); }
+};
+
+#undef XOSHIRO128
+#undef XOROSHIRO64
+#undef XORSHIFT32
+#undef XOSHIRO128_STATIC
+#undef XOROSHIRO64_STATIC
+#undef XORSHIFT32_STATIC
+
+/*--------------------------------------------------------------------------
+ 64bit PRNG Algorithms: xoshiro / xoroshiro
+
+ xoshiro256+ / xoshiro256++ / xoshiro256**
+ xoroshiro128+ / xoroshiro128++ / xoroshiro128**
+
+ Algorithms by David Blackman and Sebastiano Vigna
+ http://prng.di.unimi.it/
+
+ To the extent possible under law, the author has dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+
+ See <http://creativecommons.org/publicdomain/zero/1.0/>.
+-------------------------------------------------------------------------- */
+#define XOSHIRO256\
+    const uint64_t t = s1 << 17;\
+    s2 ^= s0;\
+    s3 ^= s1;\
+    s1 ^= s2;\
+    s0 ^= s3;\
+    s2 ^= t;\
+    s3 = rotl<uint64_t>(s3, 45);\
+    return result;
+
+#define XOROSHIRO128(A,B,C)\
+    s1 ^= s0;\
+    s0 = rotl<uint64_t>(s0, A) ^ s1 ^ (s1 << B);\
+    s1 = rotl<uint64_t>(s1, C);\
+    return result;
+
+#define XORSHIFT64\
+    s0 ^= s0 << 13;\
+    s0 ^= s0 >> 7;\
+    s0 ^= s0 << 17;\
+    return s0;
+
+#define XOSHIRO256_STATIC(FUNC)\
+    static const uint64_t seed = uint64_t(FPRNG_SEED_INIT64);\
+    static uint64_t s0 = splitMix64(seed), s1 = splitMix64(s0), s2 = splitMix64(s1), s3 = splitMix64(s2);\
+    FUNC; XOSHIRO256
+
+#define XOROSHIRO128_STATIC(FUNC, A, B, C)\
+    static const uint64_t seed = uint64_t(FPRNG_SEED_INIT64);\
+    static uint64_t s0 = splitMix64(seed), s1 = splitMix64(s0);\
+    FUNC; XOROSHIRO128(A,B,C)
+
+#define XORSHIFT64_STATIC\
+    static uint64_t s0 = uint64_t(FPRNG_SEED_INIT64);\
+    XORSHIFT64
+
+// fastXS32
+//
+// 64bit pseudo-random generator
+// All integer values are returned in interval [0, UINT64_MAX]
+// to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t
+///////////////////////////////////////////////////////////////////////////////
+class fastXS64
+{
+public:
+    fastXS64(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) { seed(seedVal); }
+
+    inline uint64_t xoshiro256p()  { return xoshiro256(s0 + s3); }
+    inline uint64_t xoshiro256pp() { return xoshiro256(rotl<uint64_t>(s0 + s3, 23) + s0); }
+    inline uint64_t xoshiro256xx() { return xoshiro256(rotl<uint64_t>(s1 * 5, 7) * 9); }
+
+    template <typename T> inline T xoshiro256p_UNI() { return T(        xoshiro256p())  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xoshiro256p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xoshiro256p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoshiro256p_UNI<T>(); }
+
+    inline uint64_t xoroshiro128p()  { return xoroshiro128(     s0 + s1); }
+    inline uint64_t xoroshiro128pp() { return xoroshiro128(rotl<uint64_t>(s0 + s1, 17) + s0, 49, 21, 28); }
+    inline uint64_t xoroshiro128xx() { return xoroshiro128(rotl<uint64_t>(s0 * 5, 7) * 9); }
+
+    template <typename T> inline T xoroshiro128p_UNI() { return T(        xoshiro256p())  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xoroshiro128p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xoroshiro128p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoroshiro128p_UNI<T>(); }
+
+    inline uint64_t xorShift() { XORSHIFT64 } // Marsaglia xorShift: period 2^64-1
+
+    template <typename T> inline T xorShift_UNI() { return         xorShift()  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T xorShift_VNI() { return int64_t(xorShift()) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T xorShift_Range(T min, T max)                                   // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xorShift_UNI<T>(); }
+
+    void seed(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) {
+        s0 = splitMix64(seedVal);
+        s1 = splitMix64(s0);
+        s2 = splitMix64(s1);
+        s3 = splitMix64(s2);
+    }
+private:
+    inline uint64_t xoshiro256(const uint64_t result)   { XOSHIRO256 }
+    inline uint64_t xoroshiro128(const uint64_t result, const int A = 24, const int B = 16, const int C = 37) { XOROSHIRO128(A,B,C) }
+
+    uint64_t s0, s1, s2, s3;
+};
+
+// fastXS64s - static members
+//      you can call directly w/o declaration, but..
+//      N.B. all members/functions share same seed, and subsequents xor & shift
+//           operations on it, if you need different seeds declare more
+//           fastXS32 (non static) objects
+//
+// 64bit pseudo-random generator
+// All integer values are returned in interval [0, UINT64_MAX]
+// to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t
+///////////////////////////////////////////////////////////////////////////////
+class fastXS64s
+{
+public:
+    fastXS64s() = default;
+
+    inline static uint64_t xoshiro256p()  { XOSHIRO256_STATIC(const uint64_t result = s0 + s3) }
+    inline static uint64_t xoshiro256pp() { XOSHIRO256_STATIC(const uint64_t result = rotl<uint64_t>(s0 + s3, 23) + s0) }
+    inline static uint64_t xoshiro256xx() { XOSHIRO256_STATIC(const uint64_t result = rotl<uint64_t>(s1 * 5, 7) * 9) }
+
+    template <typename T> inline static T xoshiro256p_UNI() { return T(        xoshiro256p())  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xoshiro256p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xoshiro256p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoshiro256p_UNI<T>(); }
+
+    inline static uint64_t xoroshiro128p()  { XOROSHIRO128_STATIC(const uint64_t result =      s0 + s1,           24, 13, 27) }
+    inline static uint64_t xoroshiro128pp() { XOROSHIRO128_STATIC(const uint64_t result = rotl<uint64_t>(s0 + s1, 17) + s0, 49, 21, 28) }
+    inline static uint64_t xoroshiro128xx() { XOROSHIRO128_STATIC(const uint64_t result = rotl<uint64_t>(s0 * 5, 7) * 9,    24, 13, 27) }
+
+    template <typename T> inline static T xoroshiro128p_UNI() { return T(        xoshiro256p())  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xoroshiro128p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xoroshiro128p_Range(T min, T max)                                         // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xoroshiro128p_UNI<T>(); }
+
+    inline static uint64_t xorShift() { XORSHIFT64_STATIC } // Marsaglia xorShift: period 2^64-1
+
+    template <typename T> inline static T xorShift_UNI() { return         xorShift()  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline static T xorShift_VNI() { return int64_t(xorShift()) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline static T xorShift_Range(T min, T max)                                   // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * xorShift_UNI<T>(); }
+};
+
+#undef XOSHIRO256
+#undef XOROSHIRO128
+#undef XORSHIFT64
+#undef XOSHIRO256_STATIC
+#undef XOROSHIRO128_STATIC
+#undef XORSHIFT64_STATIC
+
+/*--------------------------------------------------------------------------
+ 32bit PRNG Algorithms:
+
+    znew / wnew / MWC / CNG / FIB / XSH / KISS
+
+    LFIB4 / SWB (uncomment: #define FSTRND_USES_BUILT_TABLE below)
+
+ Originally written from George Marsaglia
+-------------------------------------------------------------------------- */
+//#define FSTRND_USES_BUILT_TABLE   // uncomment to use 32Bit algorithms: LFI84 & SWB
+
+// fastRandom32Class
+//
+// 32bit pseudo-random generator
+// All integer values are returned in interval [0, UINT32_MAX]
+// to get values between [INT32_MIN, INT32_MAX] just cast result to int32_t
+///////////////////////////////////////////////////////////////////////////////
+class fastRandom32Class
+{
+public:
+
+    // no vaule, seed from system clock, or same seed for same sequence of numbers
+    fastRandom32Class(const uint32_t seedVal = uint32_t(FPRNG_SEED_INIT32))
+        { reset(); seed(seedVal); }
+
+    // re-seed the current state/values with a new random values
+    void seed(const uint32_t seed = uint32_t(FPRNG_SEED_INIT32)) {
+        uint32_t s[6];
+        s[0] = splitMix32(seed);
+        for(int i=1; i<6; i++) s[i] = splitMix32(s[i-1]);
+        initialize(s);
+    }
+
+    // reset to initial state
+    void reset() {
+        z   = 362436069; w     = 521288629;
+        jsr = 123456789; jcong = 380116160;
+        a   = 224466889; b     = 7584631;
+    }
+
+    inline uint32_t znew() { return z=36969*(z&65535)+(z>>16); }
+    inline uint32_t wnew() { return w=18000*(w&65535)+(w>>16); }
+    inline uint32_t MWC()  { return (znew()<<16)+wnew()      ; }
+    inline uint32_t CNG()  { return jcong=69069*jcong+1234567; }
+    inline uint32_t FIB()  { return (b=a+b),(a=b-a)          ; }
+    inline uint32_t XSH()  { return jsr^=(jsr<<17), jsr^=(jsr>>13), jsr^=(jsr<<5); }
+
+    inline uint32_t KISS() { return (MWC()^CNG())+XSH(); } // period 2^123
+
+    template <typename T> inline T KISS_UNI() { return         KISS()  * UNI_32BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T KISS_VNI() { return int32_t(KISS()) * VNI_32BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T KISS_Range(T min, T max)                               // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * KISS_UNI<T>(); }
+
+#ifdef FSTRND_USES_BUILT_TABLE
+    inline uint32_t LFIB4() { return c++,t[c]=t[c]+t[uint8_t(c+58)]+t[uint8_t(c+119)]+t[uint8_t(c+178)];}
+    inline uint32_t SWB()   { uint32_t bro; return c++,bro=(x<y),t[c]=(x=t[uint8_t(c+34)])-(y=t[uint8_t(c+19)]+bro); }
+#endif
+private:
+    void initialize(const uint32_t *i) { z+=i[0]; w+=i[1]; jsr+=i[2]; jcong+=i[3]; a=+i[4]; b=+i[5]; }
+
+    uint32_t z, w, jsr, jcong;
+    uint32_t a, b;
+
+#ifdef FSTRND_USES_BUILT_TABLE
+    uint32_t t[256];
+    unsigned char c=0;
+#endif
+};
+
+#undef FSTRND_USES_BUILT_TABLE
+
+/*--------------------------------------------------------------------------
+ 64bit PRNG Algorithms:
+
+    znew / wnew / MWC / CNG / FIB / XSH / KISS
+
+ Originally written from George Marsaglia
+-------------------------------------------------------------------------- */
+
+
+// fastRandom64Class
+//
+// 64bit pseudo-random generator
+// All values are returned in interval [0, UINT64_MAX]
+// to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t
+///////////////////////////////////////////////////////////////////////////////
+class fastRandom64Class
+{
+public:
+    // no vaule, seed from system clock, or same seed for same sequence of numbers
+    fastRandom64Class(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) { reset(); seed(seedVal);  }
+
+    // re-seed the current state/values with a new random values
+    void seed(const uint64_t seed = uint64_t(FPRNG_SEED_INIT64)) {
+        uint64_t s[6];
+        s[0] = splitMix64(seed);
+        for(int i=1; i<6; i++) s[i] = splitMix64(s[i-1]);
+        initialize(s);
+    }
+    // reset to initial state
+    void reset() {
+        x=uint64_t(1234567890987654321ULL); c=uint64_t(123456123456123456ULL);
+        y=uint64_t(362436362436362436ULL ); z=uint64_t(1066149217761810ULL  );
+        a=uint64_t(224466889);              b=uint64_t(7584631);
+    }
+
+    inline uint64_t MWC() { uint64_t t; return t=(x<<58)+c, c=(x>>6), x+=t, c+=(x<t), x; }
+    inline uint64_t CNG() { return z=6906969069LL*z+1234567;            }
+    inline uint64_t XSH() { return y^=(y<<13), y^=(y>>17), y^=(y<<43);  }
+    inline uint64_t FIB() { return (b=a+b),(a=b-a);                     }
+
+    inline uint64_t KISS () { return MWC()+XSH()+CNG(); } //period 2^250
+
+    template <typename T> inline T KISS_UNI() { return         KISS()  * UNI_64BIT_INV; } // _UNI<T>   returns value in [ 0, 1] with T ==> float/double
+    template <typename T> inline T KISS_VNI() { return int64_t(KISS()) * VNI_64BIT_INV; } // _VNI<T>   returns value in [-1, 1] with T ==> float/double
+    template <typename T> inline T KISS_Range(T min, T max)                               // _Range<T> returns value in [min, max] with T ==> float/double
+            { return min + (max-min) * KISS_UNI<T>(); }
+
+private:
+    void initialize(const uint64_t *i){ x+=i[0]; y+=i[1]; z+=i[2]; c+=i[3]; a=+i[4]; b=+i[5]; }
+
+    uint64_t x, c, y, z;
+    uint64_t a, b;
+};
+
+using fastRand32 = fastRandom32Class;
+using fastRand64 = fastRandom64Class;
+
+} // end of namespace FstRnd
+
+#undef UNI_32BIT_INV
+#undef VNI_32BIT_INV
+#undef UNI_64BIT_INV
+#undef VNI_64BIT_INV
+#undef FPRNG_SEED_INIT32
+#undef FPRNG_SEED_INIT64
+
+/*-----------------------------------------------------
+    32bit Marsaglia algorithms description
+-------------------------------------------------------
+Write your own calling program and try one or more of
+the above, singly or in combination, when you run a
+simulation. You may want to change the simple 1-letter
+names, to avoid conflict with your own choices. */
+
+/* All that follows is comment, mostly from the initial
+   post. You may want to remove it */
+
+
+/* Any one of KISS, MWC, FIB, LFIB4, SWB, SHR3, or CONG
+   can be used in an expression to provide a random 32-bit
+   integer.
+
+
+   The KISS generator, (Keep It Simple Stupid), is
+   designed to combine the two multiply-with-carry
+   generators in MWC with the 3-shift register SHR3 and
+   the congruential generator CONG, using addition and
+   exclusive-or. Period about 2^123.
+   It is one of my favorite generators.
+
+
+   The MWC generator concatenates two 16-bit multiply-
+   with-carry generators, x(n)=36969x(n-1)+carry,
+   y(n)=18000y(n-1)+carry mod 2^16, has period about
+   2^60 and seems to pass all tests of randomness. A
+   favorite stand-alone generator---faster than KISS,
+   which contains it.
+
+
+   FIB is the classical Fibonacci sequence
+   x(n)=x(n-1)+x(n-2),but taken modulo 2^32.
+   Its period is 3*2^31 if one of its two seeds is odd
+   and not 1 mod 8. It has little worth as a RNG by
+   itself, but provides a simple and fast component for
+   use in combination generators.
+
+
+   SHR3 is a 3-shift-register generator with period
+   2^32-1. It uses y(n)=y(n-1)(I+L^17)(I+R^13)(I+L^5),
+   with the y's viewed as binary vectors, L the 32x32
+   binary matrix that shifts a vector left 1, and R its
+   transpose. SHR3 seems to pass all except those
+   related to the binary rank test, since 32 successive
+   values, as binary vectors, must be linearly
+   independent, while 32 successive truly random 32-bit
+   integers, viewed as binary vectors, will be linearly
+   independent only about 29% of the time.
+
+
+   CONG is a congruential generator with the widely used 69069
+   multiplier: x(n)=69069x(n-1)+1234567. It has period
+   2^32. The leading half of its 32 bits seem to pass
+   tests, but bits in the last half are too regular.
+
+
+   LFIB4 is an extension of what I have previously
+   defined as a lagged Fibonacci generator:
+   x(n)=x(n-r) op x(n-s), with the x's in a finite
+   set over which there is a binary operation op, such
+   as +,- on integers mod 2^32, * on odd such integers,
+   exclusive-or(xor) on binary vectors. Except for
+   those using multiplication, lagged Fibonacci
+   generators fail various tests of randomness, unless
+   the lags are very long. (See SWB below).
+   To see if more than two lags would serve to overcome
+   the problems of 2-lag generators using +,- or xor, I
+   have developed the 4-lag generator LFIB4 using
+   addition: x(n)=x(n-256)+x(n-179)+x(n-119)+x(n-55)
+   mod 2^32. Its period is 2^31*(2^256-1), about 2^287,
+   and it seems to pass all tests---in particular,
+   those of the kind for which 2-lag generators using
+   +,-,xor seem to fail. For even more confidence in
+   its suitability, LFIB4 can be combined with KISS,
+   with a resulting period of about 2^410: just use
+   (KISS+LFIB4) in any C expression.
+
+
+   SWB is a subtract-with-borrow generator that I
+   developed to give a simple method for producing
+   extremely long periods:
+   x(n)=x(n-222)-x(n-237)- borrow mod 2^32.
+   The 'borrow' is 0, or set to 1 if computing x(n-1)
+   caused overflow in 32-bit integer arithmetic. This
+   generator has a very long period, 2^7098(2^480-1),
+   about 2^7578. It seems to pass all tests of
+   randomness, except for the Birthday Spacings test,
+   which it fails badly, as do all lagged Fibonacci
+   generators using +,- or xor. I would suggest
+   combining SWB with KISS, MWC, SHR3, or CONG.
+   KISS+SWB has period >2^7700 and is highly
+   recommended.
+   Subtract-with-borrow has the same local behaviour
+   as lagged Fibonacci using +,-,xor---the borrow
+   merely provides a much longer period.
+   SWB fails the birthday spacings test, as do all
+   lagged Fibonacci and other generators that merely
+   combine two previous values by means of =,- or xor.
+   Those failures are for a particular case: m=512
+   birthdays in a year of n=2^24 days. There are
+   choices of m and n for which lags >1000 will also
+   fail the test. A reasonable precaution is to always
+   combine a 2-lag Fibonacci or SWB generator with
+   another kind of generator, unless the generator uses
+   *, for which a very satisfactory sequence of odd
+   32-bit integers results.
+
+
+   The classical Fibonacci sequence mod 2^32 from FIB
+   fails several tests. It is not suitable for use by
+   itself, but is quite suitable for combining with
+   other generators.
+
+
+   The last half of the bits of CONG are too regular,
+   and it fails tests for which those bits play a
+   significant role. CONG+FIB will also have too much
+   regularity in trailing bits, as each does. But keep
+   in mind that it is a rare application for which
+   the trailing bits play a significant role. CONG
+   is one of the most widely used generators of the
+   last 30 years, as it was the system generator for
+   VAX and was incorporated in several popular
+   software packages, all seemingly without complaint.
+
+
+   Finally, because many simulations call for uniform
+   random variables in 0<x<1 or -1<x<1, I use #define
+   statements that permit inclusion of such variates
+   directly in expressions: using UNI will provide a
+   uniform random real (float) in (0,1), while VNI will
+   provide one in (-1,1).
+
+
+   All of these: MWC, SHR3, CONG, KISS, LFIB4, SWB, FIB
+   UNI and VNI, permit direct insertion of the desired
+   random quantity into an expression, avoiding the
+   time and space costs of a function call. I call
+   these in-line-define functions. To use them, static
+   variables z,w,jsr,jcong,a and b should be assigned
+   seed values other than their initial values. If
+   LFIB4 or SWB are used, the static table t[256] must
+   be initialized.
+
+
+   A note on timing: It is difficult to provide exact
+   time costs for inclusion of one of these in-line-
+   define functions in an expression. Times may differ
+   widely for different compilers, as the C operations
+   may be deeply nested and tricky. I suggest these
+   rough comparisons, based on averaging ten runs of a
+   routine that is essentially a long loop:
+   for(i=1;i<10000000;i++) L=KISS; then with KISS
+   replaced with SHR3, CONG,... or KISS+SWB, etc. The
+   times on my home PC, a Pentium 300MHz, in nanoseconds:
+   FIB 49;LFIB4 77;SWB 80;CONG 80;SHR3 84;MWC 93;KISS 157;
+   VNI 417;UNI 450;
+*/
+
+
+/*-----------------------------------------------------
+    64bit Marsaglia algorithms description
+-------------------------------------------------------
+
+64-bit KISS RNGs
+https://www.thecodingforums.com/threads/64-bit-kiss-rngs.673657/
+
+Consistent with the Keep It Simple Stupid (KISS) principle,
+I have previously suggested 32-bit KISS Random Number
+Generators (RNGs) that seem to have been frequently adopted.
+
+Having had requests for 64-bit KISSes, and now that
+64-bit integers are becoming more available, I will
+describe here a 64-bit KISS RNG, with comments on
+implementation for various languages, speed, periods
+and performance after extensive tests of randomness.
+
+This 64-bit KISS RNG has three components, each nearly
+good enough to serve alone. The components are:
+Multiply-With-Carry (MWC), period (2^121+2^63-1)
+Xorshift (XSH), period 2^64-1
+Congruential (CNG), period 2^64
+
+Compact C and Fortran listings are given below. They
+can be cut, pasted, compiled and run to see if, after
+100 million calls, results agree with that provided
+by theory, assuming the default seeds.
+
+Users may want to put the content in other forms, and,
+for general use, provide means to set the 250 seed bits
+required in the variables x,y,z (64 bits) and c (58 bits)
+that have been given default values in the test versions.
+
+The C version uses #define macros to enumerate the few
+instructions that MWC, XSH and CNG require. The KISS
+macro adds MWC+XSH+CNG mod 2^64, so that KISS can be
+inserted at any place in a C program where a random 64-bit
+integer is required.
+Fortran's requirement that integers be signed makes the
+necessary code more complicated, hence a function KISS().
+
+
+64-bit xorShift
+https://en.wikipedia.org/wiki/Xorshift
+
+*/
\ No newline at end of file
diff --git a/gbench/rand.cpp b/gbench/rand.cpp
new file mode 100644
index 0000000..4d5954c
--- /dev/null
+++ b/gbench/rand.cpp
@@ -0,0 +1,65 @@
+
+#include <time.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string>
+#include <string.h>
+#include <x86intrin.h>
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+#include "fastPRNG.h"
+
+#include <benchmark/benchmark.h>
+
+static char buf[8196];
+
+void rand_bytes (int num_bytes)
+{
+  for (int i = 0; i < num_bytes; i++) { buf[i] = rand (); }
+}
+void rand_bytes2 (int num_bytes)
+{
+  uint64_t *p = (uint64_t*)buf;
+  for (int i = 0; i < num_bytes; p++, i+=8 ) { 
+    *p = fastPRNG::fastXS64s::xoshiro256p(); 
+  } 
+}
+
+
+static void BM_rand1(benchmark::State& state) {
+  for (auto _ : state) { 
+    long x;
+    benchmark::DoNotOptimize(x);
+    x = fastPRNG::fastXS64s::xoshiro256p();
+  }
+}
+static void BM_rand_bytes(benchmark::State& state) {
+  srand ((unsigned int) time (NULL));
+  for (auto _ : state) { 
+    rand_bytes(1024);
+  }
+}
+static void BM_rand_bytes2(benchmark::State& state) {
+  srand ((unsigned int) time (NULL));
+  for (auto _ : state) { 
+    rand_bytes2(1024);
+  }
+}
+
+
+
+BENCHMARK(BM_rand1);
+BENCHMARK(BM_rand_bytes);
+BENCHMARK(BM_rand_bytes2);
+BENCHMARK_MAIN();
+
+/*
+int main() {
+  char buf[8096] = "123z4 ";
+  //strcpy(buf,"942312");
+
+  printf(" my strtol %d\n", my_strtol(buf, 4));
+
+}
+*/