diff options
Diffstat (limited to 'external/meow_hash/meow_hash_x64_aesni.h')
| -rw-r--r-- | external/meow_hash/meow_hash_x64_aesni.h | 744 | 
1 files changed, 744 insertions, 0 deletions
diff --git a/external/meow_hash/meow_hash_x64_aesni.h b/external/meow_hash/meow_hash_x64_aesni.h new file mode 100644 index 0000000..966557d --- /dev/null +++ b/external/meow_hash/meow_hash_x64_aesni.h @@ -0,0 +1,744 @@ +/* ======================================================================== + +   Meow - A Fast Non-cryptographic Hash +   (C) Copyright 2018-2019 by Molly Rocket, Inc. (https://mollyrocket.com) +    +   See https://mollyrocket.com/meowhash for details. +    +   ======================================================================== +    +   zlib License +    +   (C) Copyright 2018-2019 Molly Rocket, Inc. +    +   This software is provided 'as-is', without any express or implied +   warranty.  In no event will the authors be held liable for any damages +   arising from the use of this software. +    +   Permission is granted to anyone to use this software for any purpose, +   including commercial applications, and to alter it and redistribute it +   freely, subject to the following restrictions: +    +   1. The origin of this software must not be misrepresented; you must not +      claim that you wrote the original software. If you use this software +      in a product, an acknowledgment in the product documentation would be +      appreciated but is not required. +   2. Altered source versions must be plainly marked as such, and must not be +      misrepresented as being the original software. +   3. This notice may not be removed or altered from any source distribution. +    +   ======================================================================== +    +   FAQ +    +   Q: What is it? +    +   A: Meow is a 128-bit Level 3 hash taking 128 bytes of seed.  It operates +      at very high speeds on x64 processors, and potentially other processors +      that provide accelerated AES instructions. +       +   Q: What is it GOOD for? +    +   A: Quickly hashing any amount of data for comparison purposes such as +      block deduplication or change detection.  It is fast on all buffer +      sizes, and can generally be used anywhere you need fast Level 3 +      hashing without worrying about how big or small the inputs tend to be. +       +      However, substantial speed improvements could be made over Meow +      if you either a) know you are always hashing an exact, small number of bytes, +      or b) can always supply a small number of bytes in a buffer padded to some +      fixed multiple of 16. +       +   Q: What is it BAD for? +    +   A: Anything requiring Level 4 or Level 5 security guarantees (see +      http://nohatcoder.dk/2019-05-19-1.html#level3).  Also, note that +      Meow is a new hash and has not had the extensive community +      cryptanalysis necessary to ensure that it is not breakable down to +      a lower level of hash, so you must do your due diligence in +      deciding when and where to use Meow instead of a slower but +      more extensively studied existing hash.  We have tried to design +      it to provide Level 3 security, but the possibility of the hash +      being broken in the future always exists. +       +   Q: Why is it called the "Meow hash"? +    +   A: It is named after a character in Meow the Infinite +      (https://meowtheinfinite.com) +       +   Q: Who wrote it? +    +   A: The final Meow Hash was created as a collaboration between +      JACOB CHRISTIAN MUNCH-ANDERSEN (https://twitter.com/nohatcoder) and +      CASEY MURATORI (https://caseymuratori.com).  Casey wrote the original +      implementation for use in processing large-footprint assets for the +      game 1935 (https://molly1935.com).  Jacob was the first to analyze +      that implementation and determine the adversarial bit strength, which +      was weaker than they would have liked. +       +      Following that, the two collaborated to figure out how the hash +      could be strengthened without reducing Meow's 16 bytes/cycle +      maximum theoretical throughput.  Jacob created the hash candidates +      and Casey did the performance validation.  After a long and +      exhaustive effort, Jacob found the unaligned aes/add/xor formulation +      that forms the current Meow hash core. +       +      A number of valuable additions to Meow Hash were also contributed +      by other great folks along the way: +       +      JEFF ROBERTS (https://radgametools.com) provided a super slick +      way to handle the residual end-of-buffer bytes that dramatically +      improved Meow's small hash performance. +       +      MARTINS MOZEIKO (https://matrins.ninja) ported Meow to ARM and +      ANSI-C, and added the proper preprocessor dressing for clean +      compilation on a variety of compiler configurations. +       +      FABIAN GIESEN (https://fgiesen.wordpress.com) analyzed many +      performance oddities that came up during development, and +      helped get the benchmarking working properly across a number +      of platforms. +       +      ARAS PRANCKEVICIUS (https://aras-p.info) provided the allocation +      shim for compilation on Mac OS X. +       +   ======================================================================== */ + +// +// IMPORTANT(casey): We are currently evaluating this hash construction as +// the final one for Meow Hash.  If you find a way to produce collisions +// that should not be possible with a Level 3 hash, find significant performance +// problems, or see any bugs in this version, please be sure to report them +// to the Meow Hash GitHub as soon as possible.  We would like to know as +// much as we can about the robustness and performance before committing to +// it as the final construction. +// + +#if !defined(MEOW_HASH_X64_AESNI_H) + +#define MEOW_HASH_VERSION 5 +#define MEOW_HASH_VERSION_NAME "0.5/calico" + +#if !defined(meow_u8) + +#if _MSC_VER +#if !defined(__clang__) +#define INSTRUCTION_REORDER_BARRIER _ReadWriteBarrier() +#else +#endif +#include <intrin.h> +#else +#include <x86intrin.h> +#endif + +#define meow_u8 char unsigned +#define meow_u64 long long unsigned +#define meow_u128 __m128i + +#if __x86_64__ || _M_AMD64 +#define meow_umm long long unsigned +#define MeowU64From(A, I) (_mm_extract_epi64((A), (I))) +#elif __i386__  || _M_IX86 +#define meow_umm int unsigned +#define MeowU64From(A, I) (*(meow_u64 *)&(A)) +#else +#error Cannot determine architecture to use! +#endif + +#define MeowU32From(A, I) (_mm_extract_epi32((A), (I))) +#define MeowHashesAreEqual(A, B) (_mm_movemask_epi8(_mm_cmpeq_epi8((A), (B))) == 0xFFFF) + +#if !defined INSTRUCTION_REORDER_BARRIER +#define INSTRUCTION_REORDER_BARRIER +#endif + +#if !defined MEOW_PAGESIZE +#define MEOW_PAGESIZE 4096 +#endif + +#if !defined MEOW_PREFETCH +#define MEOW_PREFETCH 4096 +#endif + +#if !defined MEOW_PREFETCH_LIMIT +#define MEOW_PREFETCH_LIMIT 0x3ff +#endif + +#endif + +#define prefetcht0(A) _mm_prefetch((char *)(A), _MM_HINT_T0) +#define movdqu(A, B)  A = _mm_loadu_si128((__m128i *)(B)) +#define movdqu_mem(A, B)  _mm_storeu_si128((__m128i *)(A), B) +#define movq(A, B) A = _mm_set_epi64x(0, B); +#define aesdec(A, B)  A = _mm_aesdec_si128(A, B) +#define pshufb(A, B)  A = _mm_shuffle_epi8(A, B) +#define pxor(A, B)    A = _mm_xor_si128(A, B) +#define paddq(A, B) A = _mm_add_epi64(A, B) +#define pand(A, B)    A = _mm_and_si128(A, B) +#define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i) +#define pxor_clear(A, B)    A = _mm_setzero_si128(); // NOTE(casey): pxor_clear is a nonsense thing that is only here because compilers don't detect xor(a, a) is clearing a :( + +#define MEOW_MIX_REG(r1, r2, r3, r4, r5,  i1, i2, i3, i4) \ +aesdec(r1, r2); \ +INSTRUCTION_REORDER_BARRIER; \ +paddq(r3, i1); \ +pxor(r2, i2); \ +aesdec(r2, r4); \ +INSTRUCTION_REORDER_BARRIER; \ +paddq(r5, i3); \ +pxor(r4, i4); + +#define MEOW_MIX(r1, r2, r3, r4, r5,  ptr) \ +MEOW_MIX_REG(r1, r2, r3, r4, r5, _mm_loadu_si128( (__m128i *) ((ptr) + 15) ), _mm_loadu_si128( (__m128i *) ((ptr) + 0)  ), _mm_loadu_si128( (__m128i *) ((ptr) + 1)  ), _mm_loadu_si128( (__m128i *) ((ptr) + 16) )) + +#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \ +aesdec(r1, r4); \ +paddq(r2, r5); \ +pxor(r4, r6); \ +aesdec(r4, r2); \ +paddq(r5, r6); \ +pxor(r2, r3) + +#if MEOW_DUMP +struct meow_dump +{ +    meow_u128 xmm[8]; +    void *Ptr; +    char const *Title; +}; +extern "C" meow_dump *MeowDumpTo; +meow_dump *MeowDumpTo; +#define MEOW_DUMP_STATE(T, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ptr) \ +if(MeowDumpTo) \ +{ \ +    MeowDumpTo->xmm[0] = xmm0; \ +    MeowDumpTo->xmm[1] = xmm1; \ +    MeowDumpTo->xmm[2] = xmm2; \ +    MeowDumpTo->xmm[3] = xmm3; \ +    MeowDumpTo->xmm[4] = xmm4; \ +    MeowDumpTo->xmm[5] = xmm5; \ +    MeowDumpTo->xmm[6] = xmm6; \ +    MeowDumpTo->xmm[7] = xmm7; \ +    MeowDumpTo->Ptr = ptr; \ +    MeowDumpTo->Title = T; \ +    ++MeowDumpTo; \ +} +#else +#define MEOW_DUMP_STATE(...) +#endif + +static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; +static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; + +// NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure.  You may verify that it is just an encoding of Pi. +static meow_u8 MeowDefaultSeed[128] = +{ +    0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D, +	0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34, +	0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D, +	0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8, +	0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37, +	0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6, +	0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D, +	0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91, +	0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1, +	0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A, +	0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB, +	0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9, +	0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9, +	0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF, +	0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1, +	0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6 +}; + +// +// NOTE(casey): Single block version +// + +static meow_u128 +MeowHash(void *Seed128Init, meow_umm Len, void *SourceInit) +{ +    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes +    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length) +     +    meow_u8 *rax = (meow_u8 *)SourceInit; +    meow_u8 *rcx = (meow_u8 *)Seed128Init; +     +    // +	// NOTE(casey): Seed the eight hash registers +    // +     +    movdqu(xmm0, rcx + 0x00); +    movdqu(xmm1, rcx + 0x10); +    movdqu(xmm2, rcx + 0x20); +    movdqu(xmm3, rcx + 0x30); +     +    movdqu(xmm4, rcx + 0x40); +    movdqu(xmm5, rcx + 0x50); +    movdqu(xmm6, rcx + 0x60); +    movdqu(xmm7, rcx + 0x70); +     +    MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    // +    // NOTE(casey): Hash all full 256-byte blocks +    // +     +    meow_umm BlockCount = (Len >> 8); +    if(BlockCount > MEOW_PREFETCH_LIMIT) +    { +        // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop +        while(BlockCount--) +        { +            prefetcht0(rax + MEOW_PREFETCH + 0x00); +            prefetcht0(rax + MEOW_PREFETCH + 0x40); +            prefetcht0(rax + MEOW_PREFETCH + 0x80); +            prefetcht0(rax + MEOW_PREFETCH + 0xc0); +             +            MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); +            MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); +            MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); +            MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); +            MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); +            MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); +            MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); +            MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); +             +            rax += 0x100; +        } +    } +    else +    { +        // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop. +        while(BlockCount--) +        { +            MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); +            MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); +            MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); +            MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); +            MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); +            MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); +            MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); +            MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); +             +            rax += 0x100; +        } +    } +     +    MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    // +    // NOTE(casey): Load any less-than-32-byte residual +    // +     +    pxor_clear(xmm9, xmm9); +    pxor_clear(xmm11, xmm11); +     +    // +    // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here, +    // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the +    // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due +    // to the & 0xf on the align computation. +    // +     +    // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned +    meow_u8 *Last = (meow_u8 *)SourceInit + (Len & ~0xf); +    int unsigned Len8 = (Len & 0xf); +    if(Len8) +    { +        // NOTE(casey): Load the mask early +        movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); +         +        meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16); +        int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0; +        movdqu(xmm10, &MeowShiftAdjust[Align]); +        movdqu(xmm9, Last - Align); +        pshufb(xmm9, xmm10); +         +        // NOTE(jeffr): and off the extra bytes +        pand(xmm9, xmm8); +    } +     +    // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned +    if(Len & 0x10) +    { +        xmm11 = xmm9; +        movdqu(xmm9, Last - 0x10); +    } +     +    // +    // NOTE(casey): Construct the residual and length injests +    // +     +    xmm8 = xmm9; +    xmm10 = xmm9; +    palignr(xmm8, xmm11, 15); +    palignr(xmm10, xmm11, 1); +     +    // NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but +    // the decision was made to leave them zero'd so as not to confuse people +    // about hwo to use them or what security implications they had. +    pxor_clear(xmm12, xmm12); +    pxor_clear(xmm13, xmm13); +    pxor_clear(xmm14, xmm14); +    movq(xmm15, Len); +    palignr(xmm12, xmm15, 15); +    palignr(xmm14, xmm15, 1); +     +    MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); +     +    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty +    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2,  xmm8, xmm9, xmm10, xmm11); +     +    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding +    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3,  xmm12, xmm13, xmm14, xmm15); +     +    MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    // +    // NOTE(casey): Hash all full 32-byte blocks +    // +    int unsigned LaneCount = (Len >> 5) & 0x7; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; +     +    // +    // NOTE(casey): Mix the eight lanes down to one 128-bit hash +    // +     +    MixDown: +     +    MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); +    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); +    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); +    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); +    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); +    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); +    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); +    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); +    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); +    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); +    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); +    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); +     +    MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    paddq(xmm0, xmm2); +    paddq(xmm1, xmm3); +    paddq(xmm4, xmm6); +    paddq(xmm5, xmm7); +    pxor(xmm0, xmm1); +    pxor(xmm4, xmm5); +    paddq(xmm0, xmm4); +     +    MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    return(xmm0); +} + +// +// NOTE(casey): Streaming construction +// + +typedef struct meow_state +{ +    meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; +    meow_u64 TotalLengthInBytes; +     +    int unsigned BufferLen; +     +    meow_u8 Buffer[256]; +    meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary +} meow_state; + +static void +MeowBegin(meow_state *State, void *Seed128) +{ +    meow_u8 *rcx = (meow_u8 *)Seed128; +     +    movdqu(State->xmm0, rcx + 0x00); +    movdqu(State->xmm1, rcx + 0x10); +    movdqu(State->xmm2, rcx + 0x20); +    movdqu(State->xmm3, rcx + 0x30); +    movdqu(State->xmm4, rcx + 0x40); +    movdqu(State->xmm5, rcx + 0x50); +    movdqu(State->xmm6, rcx + 0x60); +    movdqu(State->xmm7, rcx + 0x70); +     +    MEOW_DUMP_STATE("Seed", State->xmm0, State->xmm1, State->xmm2, State->xmm3, State->xmm4, State->xmm5, State->xmm6, State->xmm7, 0); +     +    State->BufferLen = 0; +    State->TotalLengthInBytes = 0; +} + +static void +MeowAbsorbBlocks(meow_state *State, meow_umm BlockCount, meow_u8 *rax) +{ +    meow_u128 xmm0 = State->xmm0; +    meow_u128 xmm1 = State->xmm1; +    meow_u128 xmm2 = State->xmm2; +    meow_u128 xmm3 = State->xmm3; +    meow_u128 xmm4 = State->xmm4; +    meow_u128 xmm5 = State->xmm5; +    meow_u128 xmm6 = State->xmm6; +    meow_u128 xmm7 = State->xmm7; +     +    if(BlockCount > MEOW_PREFETCH_LIMIT) +    { +        while(BlockCount--) +        { +            prefetcht0(rax + MEOW_PREFETCH + 0x00); +            prefetcht0(rax + MEOW_PREFETCH + 0x40); +            prefetcht0(rax + MEOW_PREFETCH + 0x80); +            prefetcht0(rax + MEOW_PREFETCH + 0xc0); +             +            MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); +            MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); +            MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); +            MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); +            MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); +            MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); +            MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); +            MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); +             +            rax += 0x100; +        } +    } +    else +    { +        while(BlockCount--) +        { +            MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00); +            MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20); +            MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40); +            MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60); +            MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80); +            MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0); +            MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0); +            MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0); +             +            rax += 0x100; +        } +    } +     +    State->xmm0 = xmm0; +    State->xmm1 = xmm1; +    State->xmm2 = xmm2; +    State->xmm3 = xmm3; +    State->xmm4 = xmm4; +    State->xmm5 = xmm5; +    State->xmm6 = xmm6; +    State->xmm7 = xmm7; +} + +static void +MeowAbsorb(meow_state *State, meow_umm Len, void *SourceInit) +{ +    State->TotalLengthInBytes += Len; +    meow_u8 *Source = (meow_u8 *)SourceInit; +     +    // NOTE(casey): Handle any buffered residual +    if(State->BufferLen) +    { +        int unsigned Fill = (sizeof(State->Buffer) - State->BufferLen); +        if(Fill > Len) +        { +            Fill = (int unsigned)Len; +        } +         +        Len -= Fill; +        while(Fill--) +        { +            State->Buffer[State->BufferLen++] = *Source++; +        } +         +        if(State->BufferLen == sizeof(State->Buffer)) +        { +            MeowAbsorbBlocks(State, 1, State->Buffer); +            State->BufferLen = 0; +        } +    } +     +    // NOTE(casey): Handle any full blocks +    meow_u64 BlockCount = (Len >> 8); +    meow_u64 Advance = (BlockCount << 8); +    MeowAbsorbBlocks(State, BlockCount, Source); +     +    Len -= Advance; +    Source += Advance; +     +    // NOTE(casey): Store residual +    while(Len--) +    { +        State->Buffer[State->BufferLen++] = *Source++; +    } +} + +static meow_u128 +MeowEnd(meow_state *State, meow_u8 *Store128) +{ +    meow_umm Len = State->TotalLengthInBytes; +     +    meow_u128 xmm0 = State->xmm0; +    meow_u128 xmm1 = State->xmm1; +    meow_u128 xmm2 = State->xmm2; +    meow_u128 xmm3 = State->xmm3; +    meow_u128 xmm4 = State->xmm4; +    meow_u128 xmm5 = State->xmm5; +    meow_u128 xmm6 = State->xmm6; +    meow_u128 xmm7 = State->xmm7; +     +    meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; +     +    meow_u8 *rax = State->Buffer; +     +    pxor_clear(xmm9, xmm9); +    pxor_clear(xmm11, xmm11); +     +    meow_u8 *Last = (meow_u8 *)rax + (Len & 0xf0); +    int unsigned Len8 = (Len & 0xf); +    if(Len8) +    { +        movdqu(xmm8, &MeowMaskLen[0x10 - Len8]); +        movdqu(xmm9, Last); +        pand(xmm9, xmm8); +    } +     +    if(Len & 0x10) +    { +        xmm11 = xmm9; +        movdqu(xmm9, Last - 0x10); +    } +     +    xmm8 = xmm9; +    xmm10 = xmm9; +    palignr(xmm8, xmm11, 15); +    palignr(xmm10, xmm11, 1); +     +    pxor_clear(xmm12, xmm12); +    pxor_clear(xmm13, xmm13); +    pxor_clear(xmm14, xmm14); +    movq(xmm15, Len); +    palignr(xmm12, xmm15, 15); +    palignr(xmm14, xmm15, 1); +     +    MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +    MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0); +     +    // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty +    MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2,  xmm8, xmm9, xmm10, xmm11); +     +    // NOTE(casey): Append the length, to avoid problems with our 32-byte padding +    MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3,  xmm12, xmm13, xmm14, xmm15); +     +    MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    // +    // NOTE(casey): Hash all full 32-byte blocks +    // +    int unsigned LaneCount = (Len >> 5) & 0x7; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount; +    if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount; +     +    // +    // NOTE(casey): Mix the eight lanes down to one 128-bit hash +    // +     +    MixDown: +     +    MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); +    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); +    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); +    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); +    MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2); +    MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3); +    MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4); +    MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5); +    MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6); +    MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7); +    MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0); +    MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1); +     +    MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    if(Store128) +    { +        movdqu_mem(Store128 + 0x00, xmm0); +        movdqu_mem(Store128 + 0x10, xmm1); +        movdqu_mem(Store128 + 0x20, xmm2); +        movdqu_mem(Store128 + 0x30, xmm3); +        movdqu_mem(Store128 + 0x40, xmm4); +        movdqu_mem(Store128 + 0x50, xmm5); +        movdqu_mem(Store128 + 0x60, xmm6); +        movdqu_mem(Store128 + 0x70, xmm7); +    } +     +    paddq(xmm0, xmm2); +    paddq(xmm1, xmm3); +    paddq(xmm4, xmm6); +    paddq(xmm5, xmm7); +    pxor(xmm0, xmm1); +    pxor(xmm4, xmm5); +    paddq(xmm0, xmm4); +     +    MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0); +     +    return(xmm0); +} + +#undef INSTRUCTION_REORDER_BARRIER +#undef prefetcht0 +#undef movdqu +#undef movdqu_mem +#undef movq +#undef aesdec +#undef pshufb +#undef pxor +#undef paddq +#undef pand +#undef palignr +#undef pxor_clear +#undef MEOW_MIX +#undef MEOW_MIX_REG +#undef MEOW_SHUFFLE +#undef MEOW_DUMP_STATE + +// +// NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed +// to create a seed which you then store for repeated use.  It is _expensive_ to generate the seed, +// so you do not want to do this every time you hash.  You _only_ want to do it when you actually +// need to create a new seed. +// + +static void +MeowExpandSeed(meow_umm InputLen, void *Input, meow_u8 *SeedResult) +{ +    meow_state State; +    meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results +    meow_umm InjestCount = (256 / InputLen) + 2; +     +    MeowBegin(&State, MeowDefaultSeed); +    MeowAbsorb(&State, sizeof(LengthTab), &LengthTab); +    while(InjestCount--) +    { +        MeowAbsorb(&State, InputLen, Input); +    } +    MeowEnd(&State, SeedResult); +} + +#define MEOW_HASH_X64_AESNI_H +#endif  | 
