/**************************************************************************/ /* */ /* OCaml */ /* */ /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ /* */ /* Copyright 1996 Institut National de Recherche en Informatique et */ /* en Automatique. */ /* */ /* All rights reserved. This file is distributed under the terms of */ /* the GNU Lesser General Public License version 2.1, with the */ /* special exception on linking described in the file LICENSE. */ /* */ /**************************************************************************/ #define CAML_INTERNALS /* The generic hashing primitive */ /* The interface of this file is in "mlvalues.h" (for [caml_hash_variant]) and in "hash.h" (for the other exported functions). */ #include "caml/mlvalues.h" #include "caml/custom.h" #include "caml/memory.h" #include "caml/hash.h" /* The new implementation, based on MurmurHash 3, http://code.google.com/p/smhasher/ */ #define ROTL32(x,n) ((x) << n | (x) >> (32-n)) #define MIX(h,d) \ d *= 0xcc9e2d51; \ d = ROTL32(d, 15); \ d *= 0x1b873593; \ h ^= d; \ h = ROTL32(h, 13); \ h = h * 5 + 0xe6546b64; #define FINAL_MIX(h) \ h ^= h >> 16; \ h *= 0x85ebca6b; \ h ^= h >> 13; \ h *= 0xc2b2ae35; \ h ^= h >> 16; CAMLexport uint32_t caml_hash_mix_uint32(uint32_t h, uint32_t d) { MIX(h, d); return h; } /* Mix a platform-native integer. */ CAMLexport uint32_t caml_hash_mix_intnat(uint32_t h, intnat d) { uint32_t n; #ifdef ARCH_SIXTYFOUR /* Mix the low 32 bits and the high 32 bits, in a way that preserves 32/64 compatibility: we want n = (uint32_t) d if d is in the range [-2^31, 2^31-1]. */ n = (d >> 32) ^ (d >> 63) ^ d; /* If 0 <= d < 2^31: d >> 32 = 0 d >> 63 = 0 If -2^31 <= d < 0: d >> 32 = -1 d >> 63 = -1 In both cases, n = (uint32_t) d. */ #else n = d; #endif MIX(h, n); return h; } /* Mix a 64-bit integer. */ CAMLexport uint32_t caml_hash_mix_int64(uint32_t h, int64_t d) { uint32_t hi = (uint32_t) (d >> 32), lo = (uint32_t) d; MIX(h, lo); MIX(h, hi); return h; } /* Mix a double-precision float. Treats +0.0 and -0.0 identically. Treats all NaNs identically. */ CAMLexport uint32_t caml_hash_mix_double(uint32_t hash, double d) { union { double d; #if defined(ARCH_BIG_ENDIAN) || (defined(__arm__) && !defined(__ARM_EABI__)) struct { uint32_t h; uint32_t l; } i; #else struct { uint32_t l; uint32_t h; } i; #endif } u; uint32_t h, l; /* Convert to two 32-bit halves */ u.d = d; h = u.i.h; l = u.i.l; /* Normalize NaNs */ if ((h & 0x7FF00000) == 0x7FF00000 && (l | (h & 0xFFFFF)) != 0) { h = 0x7FF00000; l = 0x00000001; } /* Normalize -0 into +0 */ else if (h == 0x80000000 && l == 0) { h = 0; } MIX(hash, l); MIX(hash, h); return hash; } /* Mix a single-precision float. Treats +0.0 and -0.0 identically. Treats all NaNs identically. */ CAMLexport uint32_t caml_hash_mix_float(uint32_t hash, float d) { union { float f; uint32_t i; } u; uint32_t n; /* Convert to int32_t */ u.f = d; n = u.i; /* Normalize NaNs */ if ((n & 0x7F800000) == 0x7F800000 && (n & 0x007FFFFF) != 0) { n = 0x7F800001; } /* Normalize -0 into +0 */ else if (n == 0x80000000) { n = 0; } MIX(hash, n); return hash; } /* Mix an OCaml string */ CAMLexport uint32_t caml_hash_mix_string(uint32_t h, value s) { mlsize_t len = caml_string_length(s); mlsize_t i; uint32_t w; /* Mix by 32-bit blocks (little-endian) */ for (i = 0; i + 4 <= len; i += 4) { #ifdef ARCH_BIG_ENDIAN w = Byte_u(s, i) | (Byte_u(s, i+1) << 8) | (Byte_u(s, i+2) << 16) | (Byte_u(s, i+3) << 24); #else w = *((uint32_t *) &Byte_u(s, i)); #endif MIX(h, w); } /* Finish with up to 3 bytes */ w = 0; switch (len & 3) { case 3: w = Byte_u(s, i+2) << 16; /* fallthrough */ case 2: w |= Byte_u(s, i+1) << 8; /* fallthrough */ case 1: w |= Byte_u(s, i); MIX(h, w); default: /*skip*/; /* len & 3 == 0, no extra bytes, do nothing */ } /* Finally, mix in the length. Ignore the upper 32 bits, generally 0. */ h ^= (uint32_t) len; return h; } /* Maximal size of the queue used for breadth-first traversal. */ #define HASH_QUEUE_SIZE 256 /* Maximal number of Forward_tag links followed in one step */ #define MAX_FORWARD_DEREFERENCE 1000 /* The generic hash function */ CAMLprim value caml_hash(value count, value limit, value seed, value obj) { value queue[HASH_QUEUE_SIZE]; /* Queue of values to examine */ intnat rd; /* Position of first value in queue */ intnat wr; /* One past position of last value in queue */ intnat sz; /* Max number of values to put in queue */ intnat num; /* Max number of meaningful values to see */ uint32_t h; /* Rolling hash */ value v; mlsize_t i, len; sz = Long_val(limit); if (sz < 0 || sz > HASH_QUEUE_SIZE) sz = HASH_QUEUE_SIZE; num = Long_val(count); h = Int_val(seed); queue[0] = obj; rd = 0; wr = 1; while (rd < wr && num > 0) { v = queue[rd++]; again: if (Is_long(v)) { h = caml_hash_mix_intnat(h, v); num--; } else if (Is_in_value_area(v)) { switch (Tag_val(v)) { case String_tag: h = caml_hash_mix_string(h, v); num--; break; case Double_tag: h = caml_hash_mix_double(h, Double_val(v)); num--; break; case Double_array_tag: for (i = 0, len = Wosize_val(v) / Double_wosize; i < len; i++) { h = caml_hash_mix_double(h, Double_flat_field(v, i)); num--; if (num <= 0) break; } break; case Abstract_tag: /* Block contents unknown. Do nothing. */ break; case Infix_tag: /* Mix in the offset to distinguish different functions from the same mutually-recursive definition */ h = caml_hash_mix_uint32(h, Infix_offset_val(v)); v = v - Infix_offset_val(v); goto again; case Forward_tag: /* PR#6361: we can have a loop here, so limit the number of Forward_tag links being followed */ for (i = MAX_FORWARD_DEREFERENCE; i > 0; i--) { v = Forward_val(v); if (Is_long(v) || !Is_in_value_area(v) || Tag_val(v) != Forward_tag) goto again; } /* Give up on this object and move to the next */ break; case Object_tag: h = caml_hash_mix_intnat(h, Oid_val(v)); num--; break; case Custom_tag: /* If no hashing function provided, do nothing. */ /* Only use low 32 bits of custom hash, for 32/64 compatibility */ if (Custom_ops_val(v)->hash != NULL) { uint32_t n = (uint32_t) Custom_ops_val(v)->hash(v); h = caml_hash_mix_uint32(h, n); num--; } break; default: /* Mix in the tag and size, but do not count this towards [num] */ h = caml_hash_mix_uint32(h, Whitehd_hd(Hd_val(v))); /* Copy fields into queue, not exceeding the total size [sz] */ for (i = 0, len = Wosize_val(v); i < len; i++) { if (wr >= sz) break; queue[wr++] = Field(v, i); } break; } } else { /* v is a pointer outside the heap, probably a code pointer. Shall we count it? Let's say yes by compatibility with old code. */ h = caml_hash_mix_intnat(h, v); num--; } } /* Final mixing of bits */ FINAL_MIX(h); /* Fold result to the range [0, 2^30-1] so that it is a nonnegative OCaml integer both on 32 and 64-bit platforms. */ return Val_int(h & 0x3FFFFFFFU); } /* The old implementation */ struct hash_state { uintnat accu; intnat univ_limit, univ_count; }; static void hash_aux(struct hash_state*, value obj); CAMLprim value caml_hash_univ_param(value count, value limit, value obj) { struct hash_state h; h.univ_limit = Long_val(limit); h.univ_count = Long_val(count); h.accu = 0; hash_aux(&h, obj); return Val_long(h.accu & 0x3FFFFFFF); /* The & has two purposes: ensure that the return value is positive and give the same result on 32 bit and 64 bit architectures. */ } #define Alpha 65599 #define Beta 19 #define Combine(new) (h->accu = h->accu * Alpha + (new)) #define Combine_small(new) (h->accu = h->accu * Beta + (new)) static void hash_aux(struct hash_state* h, value obj) { unsigned char * p; mlsize_t i, j; tag_t tag; h->univ_limit--; if (h->univ_count < 0 || h->univ_limit < 0) return; again: if (Is_long(obj)) { h->univ_count--; Combine(Long_val(obj)); return; } /* Pointers into the heap are well-structured blocks. So are atoms. We can inspect the block contents. */ CAMLassert (Is_block (obj)); if (Is_in_value_area(obj)) { tag = Tag_val(obj); switch (tag) { case String_tag: h->univ_count--; i = caml_string_length(obj); for (p = &Byte_u(obj, 0); i > 0; i--, p++) Combine_small(*p); break; case Double_tag: /* For doubles, we inspect their binary representation, LSB first. The results are consistent among all platforms with IEEE floats. */ h->univ_count--; #ifdef ARCH_BIG_ENDIAN for (p = &Byte_u(obj, sizeof(double) - 1), i = sizeof(double); i > 0; p--, i--) #else for (p = &Byte_u(obj, 0), i = sizeof(double); i > 0; p++, i--) #endif Combine_small(*p); break; case Double_array_tag: h->univ_count--; for (j = 0; j < Bosize_val(obj); j += sizeof(double)) { #ifdef ARCH_BIG_ENDIAN for (p = &Byte_u(obj, j + sizeof(double) - 1), i = sizeof(double); i > 0; p--, i--) #else for (p = &Byte_u(obj, j), i = sizeof(double); i > 0; p++, i--) #endif Combine_small(*p); } break; case Abstract_tag: /* We don't know anything about the contents of the block. Better do nothing. */ break; case Infix_tag: hash_aux(h, obj - Infix_offset_val(obj)); break; case Forward_tag: obj = Forward_val (obj); goto again; case Object_tag: h->univ_count--; Combine(Oid_val(obj)); break; case Custom_tag: /* If no hashing function provided, do nothing */ if (Custom_ops_val(obj)->hash != NULL) { h->univ_count--; Combine(Custom_ops_val(obj)->hash(obj)); } break; default: h->univ_count--; Combine_small(tag); i = Wosize_val(obj); while (i != 0) { i--; hash_aux(h, Field(obj, i)); } break; } return; } /* Otherwise, obj is a pointer outside the heap, to an object with a priori unknown structure. Use its physical address as hash key. */ Combine((intnat) obj); } /* Hashing variant tags */ CAMLexport value caml_hash_variant(char const * tag) { value accu; /* Same hashing algorithm as in ../typing/btype.ml, function hash_variant */ for (accu = Val_int(0); *tag != 0; tag++) accu = Val_int(223 * Int_val(accu) + *((unsigned char *) tag)); #ifdef ARCH_SIXTYFOUR accu = accu & Val_long(0x7FFFFFFFL); #endif /* Force sign extension of bit 31 for compatibility between 32 and 64-bit platforms */ return (int32_t) accu; }