javascript algorithm counting loglog hyperloglog

javascript - Algoritmos LogLog y HyperLogLog para contar grandes cardinalidades



algorithm counting (6)

Aquí hay una versión ligeramente modificada que agrega la operación de fusión.

Merge le permite tomar los contadores de varias instancias de HyperLogLog y determinar los contadores únicos en general.

Por ejemplo, si tiene visitantes únicos que se recogen los lunes, martes y miércoles, puede combinar los segmentos y contar el número de visitantes únicos en el lapso de tres días:

var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function merge(other) { for (var i = 0; i < m; i++) M[i] = Math.max(M[i], other.buckets[i]); } function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count, merge: merge, buckets: M}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; }

Entonces puedes hacer algo como esto:

// initialize one counter per day var ll_monday = HyperLogLog(0.01); var ll_tuesday = HyperLogLog(0.01); var ll_wednesday = HyperLogLog(0.01); // add 5000 unique values in each day for(var i=0; i<5000; i++) ll_monday.count(fnv1a('''' + Math.random())); for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('''' + Math.random())); for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('''' + Math.random())); // add 5000 values which appear every day for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''''+i)); ll_tuesday.count(fnv1a('''' + i)); ll_wednesday.count(fnv1a('''' + i));} // merge three days together together = HyperLogLog(0.01); together.merge(ll_monday); together.merge(ll_tuesday); together.merge(ll_wednesday); // report console.log(''unique per day: '' + Math.round(ll_monday.count()) + '' '' + Math.round(ll_tuesday.count()) + '' '' + Math.round(ll_wednesday.count())); console.log(''unique numbers overall: '' + Math.round(together.count()));

¿Dónde puedo encontrar una implementación válida del algoritmo LogLog ? He tratado de implementarlo por mí mismo, pero mi proyecto de implementación arroja resultados extraños.

Here está:

function LogLog(max_error, max_count) { function log2(x) { return Math.log(x) / Math.LN2; } var m = 1.30 / max_error; var k = Math.ceil(log2(m * m)); m = Math.pow(2, k); var k_comp = 32 - k; var l = log2(log2(max_count / m)); if (isNaN(l)) l = 1; else l = Math.ceil(l); var l_mask = ((1 << l) - 1) >>> 0; var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; var rank = 0; for (var i = 0; i < k_comp; ++i) { if ((hash >>> i) & 1) { rank = i + 1; break; } } M[j] = Math.max(M[j], rank & l_mask); } else { var c = 0; for (var i = 0; i < m; ++i) c += M[i]; return 0.79402 * m * Math.pow(2, c / m); } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = [''aardvark'', ''abyssinian'', ... ,''zoology'']; // about 2 300 words var log_log = LogLog(0.01, 100000); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i])); alert(log_log.count());

Por razones desconocidas, la implementación es muy sensible al parámetro max_error , es el factor principal que determina la magnitud del resultado. Estoy seguro, hay un error estúpido :)

ACTUALIZACIÓN: este problema se resuelve en la versión más nueva de algoritmo. Publicaré su implementación más tarde.




Sé que esta es una publicación anterior, pero la implementación de @buryat se ha movido, y en cualquier caso es incompleta, y un poco lenta (lo siento o_o).

Tomé la implementación utilizada por la nueva versión de Redis que se puede encontrar here y se la transfirió a PHP. El informe está aquí https://github.com/joegreen0991/HyperLogLog

<?php class HyperLogLog { private $HLL_P_MASK; private $HLL_REGISTERS; private $ALPHA; private $registers; public function __construct($HLL_P = 14) { $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */ $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */ $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS); $this->registers = new SplFixedArray($this->HLL_REGISTERS); for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = 0; } } public function add($v) { $h = crc32(md5($v)); $h |= 1 << 63; /* Make sure the loop terminates. */ $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */ $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */ while(($h & $bit) == 0) { $count++; $bit <<= 1; } /* Update the register if this element produced a longer run of zeroes. */ $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */ if ($this->registers[$index] < $count) { $this->registers[$index] = $count; } } public function export() { $str = ''''; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $str .= chr($this->registers[$i]); } return $str; } public function import($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0; } } public function merge($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if(isset($str[$i])) { $ord = ord($str[$i]); if ($this->registers[$i] < $ord) { $this->registers[$i] = $ord; } } } } /** * @static * @param $arr * @return int Number of unique items in $arr */ public function count() { $E = 0; $ez = 0; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if ($this->registers[$i] !== 0) { $E += (1.0 / pow(2, $this->registers[$i])); } else { $ez++; $E += 1.0; } } $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS; /* Use the LINEARCOUNTING algorithm for small cardinalities. * For larger values but up to 72000 HyperLogLog raw approximation is * used since linear counting error starts to increase. However HyperLogLog * shows a strong bias in the range 2.5*16384 - 72000, so we try to * compensate for it. */ if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) { $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez); } else if ($this->HLL_REGISTERS == 16384 && $E < 72000) { // We did polynomial regression of the bias for this range, this // way we can compute the bias for a given cardinality and correct // according to it. Only apply the correction for P=14 that''s what // we use and the value the correction was verified with. $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E) -1.4253 * 1.0e-12 * ($E*$E*$E)+ 1.2940 * 1.0e-7 * ($E*$E) -5.2921 * 1.0e-3 * $E+ 83.3216; $E -= $E * ($bias/100); } return floor($E); } }


Utilizando la versión js @actual proporcionada, traté de implementar lo mismo en C #, que parece lo suficientemente cerca. Acabo de cambiar la función fnv1a un poco y le cambié el nombre a getHashCode. (El crédito va a la función hash de Jenkins, http://en.wikipedia.org/wiki/Jenkins_hash_function )

public class HyperLogLog { private double mapSize, alpha_m, k; private int kComplement; private Dictionary<int, int> Lookup = new Dictionary<int, int>(); private const double pow_2_32 = 4294967297; public HyperLogLog(double stdError) { mapSize = (double)1.04 / stdError; k = (long)Math.Ceiling(log2(mapSize * mapSize)); kComplement = 32 - (int)k; mapSize = (long)Math.Pow(2, k); alpha_m = mapSize == 16 ? (double)0.673 : mapSize == 32 ? (double)0.697 : mapSize == 64 ? (double)0.709 : (double)0.7213 / (double)(1 + 1.079 / mapSize); for (int i = 0; i < mapSize; i++) Lookup[i] = 0; } private static double log2(double x) { return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2 } private static int getRank(uint hash, int max) { int r = 1; uint one = 1; while ((hash & one) == 0 && r <= max) { ++r; hash >>= 1; } return r; } public static uint getHashCode(string text) { uint hash = 0; for (int i = 0, l = text.Length; i < l; i++) { hash += (uint)text[i]; hash += hash << 10; hash ^= hash >> 6; } hash += hash << 3; hash ^= hash >> 6; hash += hash << 16; return hash; } public int Count() { double c = 0, E; for (var i = 0; i < mapSize; i++) c += 1d / Math.Pow(2, (double)Lookup[i]); E = alpha_m * mapSize * mapSize / c; // Make corrections & smoothen things. if (E <= (5 / 2) * mapSize) { double V = 0; for (var i = 0; i < mapSize; i++) if (Lookup[i] == 0) V++; if (V > 0) E = mapSize * Math.Log(mapSize / V); } else if (E > (1 / 30) * pow_2_32) E = -pow_2_32 * Math.Log(1 - E / pow_2_32); // Made corrections & smoothen things, or not. return (int)E; } public void Add(object val) { uint hashCode = getHashCode(val.ToString()); int j = (int)(hashCode >> kComplement); Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement)); } }


Here está la versión actualizada del algoritmo basada en el documento más nuevo :

var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = [''aardvark'', ''abyssinian'', ..., ''zoology'']; // 2336 words var seed = Math.floor(Math.random() * pow_2_32); // make more fun var log_log = HyperLogLog(0.065); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed); var count = log_log.count(); alert(count + '', error '' + (count - words.length) / (words.length / 100.0) + ''%'');