Ejemplo n.º 1
0
        private void CompareHLL(CardinalityEstimator hll1, CardinalityEstimator hll2)
        {
            CardinalityEstimatorState data  = hll1.GetState();
            CardinalityEstimatorState data2 = hll2.GetState();

            Assert.AreEqual(data.BitsPerIndex, data2.BitsPerIndex);
            Assert.AreEqual(data.IsSparse, data2.IsSparse);

            Assert.IsTrue((data.DirectCount != null && data2.DirectCount != null) || (data.DirectCount == null && data2.DirectCount == null));
            Assert.IsTrue((data.LookupSparse != null && data2.LookupSparse != null) ||
                          (data.LookupSparse == null && data2.LookupSparse == null));
            Assert.IsTrue((data.LookupDense != null && data2.LookupDense != null) || (data.LookupDense == null && data2.LookupDense == null));

            if (data.DirectCount != null)
            {
                // DirectCount are subsets of each-other => they are the same set
                Assert.IsTrue(data.DirectCount.IsSubsetOf(data2.DirectCount) && data2.DirectCount.IsSubsetOf(data.DirectCount));
            }
            if (data.LookupSparse != null)
            {
                Assert.IsTrue(data.LookupSparse.DictionaryEqual(data2.LookupSparse));
            }
            if (data.LookupDense != null)
            {
                Assert.IsTrue(data.LookupDense.SequenceEqual(data2.LookupDense));
            }
        }
Ejemplo n.º 2
0
        private void TestDeserializer2(int cardinality)
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(cardinality);
            CardinalityEstimator hll2;

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                using (var bw = new BinaryWriter(memoryStream))
                {
                    serializer.Write(bw, hll);
                }

                results = memoryStream.ToArray();
            }

            using (var memoryStream = new MemoryStream(results))
            {
                using (var br = new BinaryReader(memoryStream))
                {
                    hll2 = serializer.Read(br);
                }
            }

            CompareHLL(hll, hll2);
        }
Ejemplo n.º 3
0
        public void TestSerializerCardinality1000()
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(1000);

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                serializer.Serialize(memoryStream, hll, false);

                results = memoryStream.ToArray();
            }

            CardinalityEstimatorState data = hll.GetState();

            // Expected length is:
            // 4 bytes for the major and minor versions
            // 1 byte for the HashFunctionId
            // 4 bytes for the Bits in Index
            // 1 byte for the IsSparse and IsDirectCount flags
            // 4 bytes for the number of elements in lookupSparse
            // 2+1 bytes for each element (ulong) in lookupSparse
            // 8 bytes for CountAdded
            Assert.AreEqual(22 + 3 * data.LookupSparse.Count, results.Length);

            Assert.AreEqual((byte)HashFunctionId.Murmur3, results.Skip(4).Take(1).First());
            Assert.AreEqual(14, BitConverter.ToInt32(results.Skip(5).Take(4).ToArray(), 0));                                     // Bits in Index = 14
            Assert.AreEqual(2, results[9]);                                                                                      // IsSparse = true AND IsDirectCount = false
            Assert.AreEqual(data.LookupSparse.Count, BitConverter.ToInt32(results.Skip(10).Take(4).ToArray(), 0));
            Assert.AreEqual(1000UL, BitConverter.ToUInt64(results.Skip(14 + 3 * data.LookupSparse.Count).Take(8).ToArray(), 0)); // CountAdditions = 1000
        }
        public void ReportAccuracy()
        {
            var    hll         = new CardinalityEstimator();
            double maxError    = 0;
            var    worstMember = 0;
            var    nextMember  = new byte[ElementSizeInBytes];

            for (var i = 0; i < 10000000; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);

                if (i % 1007 == 0) // just some interval to sample error at, can be any number
                {
                    double error = (hll.Count() - (double)(i + 1)) / ((double)i + 1);
                    if (error > maxError)
                    {
                        maxError    = error;
                        worstMember = i + 1;
                    }
                }
            }

            Console.WriteLine("Worst: {0}", worstMember);
            Console.WriteLine("Max error: {0}", maxError);

            Assert.True(true);
        }
Ejemplo n.º 5
0
        public void TestSerializerCardinality1000()
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(1000);

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                serializer.Serialize(memoryStream, hll);

                results = memoryStream.ToArray();
            }

            CardinalityEstimatorState data = hll.GetState();

            // Expected length is 2908:
            // 4 bytes for the major and minor versions
            // 4 bytes for the Bits in Index
            // 1 byte for the IsSparse and IsDirectCount flags
            // 4 bytes for the number of elements in lookupSparse
            // 2+1 bytes for each element (ulong) in lookupSparse
            Assert.AreEqual(13 + 3 * data.LookupSparse.Count, results.Length);


            Assert.AreEqual(14, BitConverter.ToInt32(results.Skip(4).Take(4).ToArray(), 0)); // Bits in Index = 14
            Assert.AreEqual(2, results[8]);                                                  // IsSparse = true AND IsDirectCount = false
            Assert.AreEqual(data.LookupSparse.Count, BitConverter.ToInt32(results.Skip(9).Take(4).ToArray(), 0));
        }
        public void DirectCountingIsResetWhenMergingAlmostFullEstimators()
        {
            var addedEstimator  = new CardinalityEstimator();
            var mergedEstimator = new CardinalityEstimator();

            for (int i = 0; i < 10_000; i++)
            {
                var guid = Guid.NewGuid().ToString();

                addedEstimator.Add(guid);

                // Simulate some intermediate estimators being merged together
                var temporaryEstimator = new CardinalityEstimator();
                temporaryEstimator.Add(guid);
                mergedEstimator.Merge(temporaryEstimator);
            }

            var serializer = new CardinalityEstimatorSerializer();

            var stream1 = new MemoryStream();

            serializer.Serialize(stream1, addedEstimator, true);

            var stream2 = new MemoryStream();

            serializer.Serialize(stream2, mergedEstimator, true);

            Assert.Equal(stream1.Length, stream2.Length);
        }
Ejemplo n.º 7
0
        public void TestSerializerCardinality10()
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(10);

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                serializer.Serialize(memoryStream, hll);

                results = memoryStream.ToArray();
            }


            // Expected length is 93:
            // 4 bytes for the major and minor versions
            // 4 bytes for the Bits in Index
            // 1 byte for the IsSparse and IsDirectCount flags
            // 4 bytes for the number of elements in DirectCount
            // 8 bytes for each element (ulong) in DirectCount
            Assert.AreEqual(93, results.Length);


            Assert.AreEqual(14, BitConverter.ToInt32(results.Skip(4).Take(4).ToArray(), 0)); // Bits in Index = 14
            Assert.AreEqual(3, results[8]);                                                  // IsSparse = true AND IsDirectCount = true
            Assert.AreEqual(10, BitConverter.ToInt32(results.Skip(9).Take(4).ToArray(), 0)); // Count = 10
        }
        /// <summary>
        ///     Gets the number of indexing bits required to produce a given standard error
        /// </summary>
        /// <param name="stdError">
        ///     Standard error, which determines accuracy and memory consumption. For large cardinalities, the observed error is usually less than
        ///     3 * <paramref name="stdError" />.
        /// </param>
        /// <returns></returns>
        private static int GetAccuracyInBits(double stdError)
        {
            double sqrtm = 1.04 / stdError;
            var    b     = (int)Math.Ceiling(CardinalityEstimator.Log2(sqrtm * sqrtm));

            return(b);
        }
Ejemplo n.º 9
0
        private void TestSerializerCreatesSmallerData(int cardinality, out int customSize, out int defaultSize)
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(cardinality);

            var customSerializer = new CardinalityEstimatorSerializer();

            byte[] customSerializerResults;
            using (var memoryStream = new MemoryStream())
            {
                customSerializer.Serialize(memoryStream, hll, false);
                customSerializerResults = memoryStream.ToArray();
                customSize = customSerializerResults.Length;
            }

            var binaryFormatter = new BinaryFormatter();

            byte[] defaultSerializerResults;
            using (var memoryStream = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStream, hll);
                defaultSerializerResults = memoryStream.ToArray();
                defaultSize = defaultSerializerResults.Length;
            }

            Assert.IsTrue(customSerializerResults.Length <= defaultSerializerResults.Length);
        }
Ejemplo n.º 10
0
        public static byte[] SerializeCardinality(this CardinalityEstimator estimator, CardinalityEstimatorSerializer serializer)
        {
            using (var stream = new MemoryStream())
            {
                serializer.Serialize(stream, estimator);

                return(stream.ToArray());
            }
        }
Ejemplo n.º 11
0
        public JsonResult Get()
        {
            ICardinalityEstimator <string> estimator = new CardinalityEstimator();

            foreach (Fingerprint finger in _counterRepo.GetFingerprints())
            {
                estimator.Add(finger.Hash);
            }
            return(new JsonResult(new { Clicks = estimator.Count() }));
        }
        public void StaticMergeHandlesNullElements()
        {
            const int expectedBitsPerIndex = 11;
            var       estimators           = new List <CardinalityEstimator> {
                null, new CardinalityEstimator(expectedBitsPerIndex, HashFunctionId.Fnv1A), null
            };
            CardinalityEstimator result = CardinalityEstimator.Merge(estimators);

            Assert.NotNull(result);
            Assert.Equal(expectedBitsPerIndex, result.GetState().BitsPerIndex);
        }
        public void TestGetSigma()
        {
            // simulate a 64 bit hash and 14 bits for indexing
            const int bitsToCount = 64 - 14;

            Assert.Equal(51, CardinalityEstimator.GetSigma(0, bitsToCount));
            Assert.Equal(50, CardinalityEstimator.GetSigma(1, bitsToCount));
            Assert.Equal(47, CardinalityEstimator.GetSigma(8, bitsToCount));
            Assert.Equal(1, CardinalityEstimator.GetSigma((ulong)(Math.Pow(2, bitsToCount) - 1), bitsToCount));
            Assert.Equal(51, CardinalityEstimator.GetSigma((ulong)(Math.Pow(2, bitsToCount + 1)), bitsToCount));
        }
Ejemplo n.º 14
0
        public void SerializerCanDeserializeVersion2Point0()
        {
            var serializer = new CardinalityEstimatorSerializer();

            CardinalityEstimator hllDirect = serializer.Deserialize(new MemoryStream(Resources.serializedDirect_v2_0));
            CardinalityEstimator hllSparse = serializer.Deserialize(new MemoryStream(Resources.serializedSparse_v2_0));
            CardinalityEstimator hllDense  = serializer.Deserialize(new MemoryStream(Resources.serializedDense_v2_0));

            Assert.AreEqual(50UL, hllDirect.Count());
            Assert.AreEqual(151UL, hllSparse.Count());
            Assert.AreEqual(5009UL, hllDense.Count());
        }
Ejemplo n.º 15
0
        private CardinalityEstimator CreateAndFillCardinalityEstimator(int cardinality = 1000000, int bits = 14)
        {
            var hll = new CardinalityEstimator(bits);

            var nextMember = new byte[ElementSizeInBytes];

            for (var i = 0; i < cardinality; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);
            }

            return(hll);
        }
        /// <summary>
        ///     Generates <paramref name="expectedCount" /> random (or sequential) elements and adds them to CardinalityEstimators, then asserts that
        ///     the observed error rate is no more than <paramref name="maxAcceptedError" />
        /// </summary>
        /// <param name="stdError">Expected standard error of the estimators (upper bound)</param>
        /// <param name="expectedCount">number of elements to generate in total</param>
        /// <param name="maxAcceptedError">Maximum allowed error rate. Default is 4 times <paramref name="stdError" /></param>
        /// <param name="numHllInstances">Number of estimators to create. Generated elements will be assigned to one of the estimators at random</param>
        /// <param name="sequential">When false, elements will be generated at random. When true, elements will be 0,1,2...</param>
        private void RunTest(double stdError, long expectedCount, double?maxAcceptedError = null, int numHllInstances = 1,
                             bool sequential = false)
        {
            maxAcceptedError = maxAcceptedError ?? 4 * stdError; // should fail once in A LOT of runs
            int b = GetAccuracyInBits(stdError);

            var  runStopwatch    = new Stopwatch();
            long gcMemoryAtStart = GetGcMemory();

            // init HLLs
            var hlls = new CardinalityEstimator[numHllInstances];

            for (var i = 0; i < numHllInstances; i++)
            {
                hlls[i] = new CardinalityEstimator(b);
            }

            var nextMember = new byte[ElementSizeInBytes];

            runStopwatch.Start();
            for (long i = 0; i < expectedCount; i++)
            {
                // pick random hll, add member
                int chosenHll = Rand.Next(numHllInstances);
                if (sequential)
                {
                    hlls[chosenHll].Add(i);
                }
                else
                {
                    Rand.NextBytes(nextMember);
                    hlls[chosenHll].Add(nextMember);
                }
            }

            runStopwatch.Stop();
            ReportMemoryCost(gcMemoryAtStart); // done here so references can't be GC'ed yet

            // Merge
            CardinalityEstimator mergedHll = CardinalityEstimator.Merge(hlls);

            Console.WriteLine("Run time: {0}", runStopwatch.Elapsed);
            Console.WriteLine("Expected {0}, got {1}", expectedCount, mergedHll.Count());

            double obsError = Math.Abs(mergedHll.Count() / (double)(expectedCount) - 1.0);

            Console.WriteLine("StdErr: {0}.  Observed error: {1}", stdError, obsError);
            Assert.True(obsError <= maxAcceptedError, string.Format("Observed error was over {0}", maxAcceptedError));
            Console.WriteLine();
        }
        public void StaticMergeTest()
        {
            const int expectedBitsPerIndex = 11;
            var       estimators           = new CardinalityEstimator[10];

            for (var i = 0; i < estimators.Length; i++)
            {
                estimators[i] = new CardinalityEstimator(expectedBitsPerIndex);
                estimators[i].Add(Rand.Next());
            }

            CardinalityEstimator merged = CardinalityEstimator.Merge(estimators);

            Assert.Equal(10UL, merged.Count());
            Assert.Equal(expectedBitsPerIndex, merged.GetState().BitsPerIndex);
        }
        public void TestSize()
        {
            var estimator = new CardinalityEstimator();

            Assert.AreEqual(0UL, estimator.CountElementsAdded);

            estimator.Add(0);
            estimator.Add(0);

            Assert.AreEqual(2UL, estimator.CountElementsAdded);

            var estimator2 = new CardinalityEstimator();
            estimator2.Add(0);
            estimator.Merge(estimator2);

            Assert.AreEqual(3UL, estimator.CountElementsAdded);
        }
Ejemplo n.º 19
0
        public void DeserializedEstimatorUsesSameHashAsOriginal()
        {
            // Prepare some elements
            IList <int> elements = new List <int>();

            for (int i = 0; i < 150; i++)
            {
                elements.Add(Rand.Next());
            }

            foreach (HashFunctionId hashFunctionId in Enum.GetValues(typeof(HashFunctionId)))
            {
                // Add elements to an estimator using the given hashFunctionId
                CardinalityEstimator original = new CardinalityEstimator(hashFunctionId: hashFunctionId);
                foreach (int element in elements)
                {
                    original.Add(element);
                }

                // Serialize
                var    serializer = new CardinalityEstimatorSerializer();
                byte[] results;

                using (var memoryStream = new MemoryStream())
                {
                    serializer.Serialize(memoryStream, original, false);
                    results = memoryStream.ToArray();
                }

                // Deserialize
                CardinalityEstimator deserialized;
                using (var memoryStream = new MemoryStream(results))
                {
                    deserialized = serializer.Deserialize(memoryStream, false);
                }

                // Add the elements again, should have no effect on state
                foreach (int element in elements)
                {
                    deserialized.Add(element);
                }

                Assert.AreEqual(original.Count(), deserialized.Count());
            }
        }
        private void TestDeserializer(int cardinality)
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(cardinality);
            CardinalityEstimator hll2;

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                serializer.Serialize(memoryStream, hll);

                results = memoryStream.ToArray();
            }

            using (var memoryStream = new MemoryStream(results))
            {
                hll2 = serializer.Deserialize(memoryStream);
            }

            CardinalityEstimatorState data  = hll.GetState();
            CardinalityEstimatorState data2 = hll2.GetState();

            Assert.AreEqual(data.BitsPerIndex, data2.BitsPerIndex);
            Assert.AreEqual(data.IsSparse, data2.IsSparse);

            Assert.IsTrue((data.DirectCount != null && data2.DirectCount != null) || (data.DirectCount == null && data2.DirectCount == null));
            Assert.IsTrue((data.LookupSparse != null && data2.LookupSparse != null) ||
                          (data.LookupSparse == null && data2.LookupSparse == null));
            Assert.IsTrue((data.LookupDense != null && data2.LookupDense != null) || (data.LookupDense == null && data2.LookupDense == null));

            if (data.DirectCount != null)
            {
                // DirectCount are subsets of each-other => they are the same set
                Assert.IsTrue(data.DirectCount.IsSubsetOf(data2.DirectCount) && data2.DirectCount.IsSubsetOf(data.DirectCount));
            }
            if (data.LookupSparse != null)
            {
                Assert.IsTrue(data.LookupSparse.DictionaryEqual(data2.LookupSparse));
            }
            if (data.LookupDense != null)
            {
                Assert.IsTrue(data.LookupDense.SequenceEqual(data2.LookupDense));
            }
        }
        public void TestCountAdditions()
        {
            var estimator = new CardinalityEstimator();

            Assert.Equal(0UL, estimator.CountAdditions);

            estimator.Add(0);
            estimator.Add(0);

            Assert.Equal(2UL, estimator.CountAdditions);

            var estimator2 = new CardinalityEstimator();

            estimator2.Add(0);
            estimator.Merge(estimator2);

            Assert.Equal(3UL, estimator.CountAdditions);
        }
Ejemplo n.º 22
0
        private void TestSerializerCardinality100000Parameterized(bool useBinWriter)
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(100000);

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                if (useBinWriter)
                {
                    using (var bw = new BinaryWriter(memoryStream))
                    {
                        serializer.Write(bw, hll);
                    }
                }
                else
                {
                    serializer.Serialize(memoryStream, hll, false);
                }

                results = memoryStream.ToArray();
            }

            CardinalityEstimatorState data = hll.GetState();

            // Expected length is:
            // 4 bytes for the major and minor versions
            // 1 byte for the HashFunctionId
            // 4 bytes for the Bits in Index
            // 1 byte for the IsSparse and IsDirectCount flags
            // 4 bytes for the number of elements in lookupDense
            // 1 bytes for each element (ulong) in lookupDense
            // 8 bytes for CountAdded
            Assert.Equal(22 + data.LookupDense.Length, results.Length);

            Assert.Equal((byte)HashFunctionId.Murmur3, results.Skip(4).Take(1).First());
            Assert.Equal(14, BitConverter.ToInt32(results.Skip(5).Take(4).ToArray(), 0));                                   // Bits in Index = 14
            Assert.Equal(0, results[9]);                                                                                    // IsSparse = false AND IsDirectCount = false
            Assert.Equal(data.LookupDense.Length, BitConverter.ToInt32(results.Skip(10).Take(4).ToArray(), 0));
            Assert.Equal(100000UL, BitConverter.ToUInt64(results.Skip(14 + data.LookupDense.Length).Take(8).ToArray(), 0)); // CountAdditions = 100000
        }
        /// <summary>
        ///     Serialize the given <paramref name="cardinalityEstimator" /> to <paramref name="stream" />
        /// </summary>
        public void Serialize(Stream stream, CardinalityEstimator cardinalityEstimator)
        {
            using (var bw = new BinaryWriter(stream))
            {
                bw.Write(DataFormatMajorVersion);
                bw.Write(DataFormatMinorVersion);

                CardinalityEstimatorState data = cardinalityEstimator.GetState();

                bw.Write((byte)data.HashFunctionId);
                bw.Write(data.BitsPerIndex);
                bw.Write((byte)(((data.IsSparse ? 1 : 0) << 1) + (data.DirectCount != null ? 1 : 0)));
                if (data.DirectCount != null)
                {
                    bw.Write(data.DirectCount.Count);
                    foreach (ulong element in data.DirectCount)
                    {
                        bw.Write(element);
                    }
                }
                else if (data.IsSparse)
                {
                    bw.Write(data.LookupSparse.Count);
                    foreach (KeyValuePair<ushort, byte> element in data.LookupSparse)
                    {
                        bw.Write(element.Key);
                        bw.Write(element.Value);
                    }
                }
                else
                {
                    bw.Write(data.LookupDense.Length);
                    foreach (byte element in data.LookupDense)
                    {
                        bw.Write(element);
                    }
                }

                bw.Write(data.CountAdditions);
                bw.Flush();
            }
        }
        private void RunRecreationFromData(int cardinality = 1000000)
        {
            var hll = new CardinalityEstimator();

            var nextMember = new byte[ElementSizeInBytes];

            for (var i = 0; i < cardinality; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);
            }

            CardinalityEstimatorState data = hll.GetState();

            var hll2 = new CardinalityEstimator(data);
            CardinalityEstimatorState data2 = hll2.GetState();

            Assert.Equal(data.BitsPerIndex, data2.BitsPerIndex);
            Assert.Equal(data.IsSparse, data2.IsSparse);

            Assert.True((data.DirectCount != null && data2.DirectCount != null) || (data.DirectCount == null && data2.DirectCount == null));
            Assert.True((data.LookupSparse != null && data2.LookupSparse != null) ||
                        (data.LookupSparse == null && data2.LookupSparse == null));
            Assert.True((data.LookupDense != null && data2.LookupDense != null) || (data.LookupDense == null && data2.LookupDense == null));

            if (data.DirectCount != null)
            {
                // DirectCount are subsets of each-other => they are the same set
                Assert.True(data.DirectCount.IsSubsetOf(data2.DirectCount) && data2.DirectCount.IsSubsetOf(data.DirectCount));
            }
            if (data.LookupSparse != null)
            {
                Assert.True(data.LookupSparse.DictionaryEqual(data2.LookupSparse));
            }
            if (data.LookupDense != null)
            {
                Assert.True(data.LookupDense.SequenceEqual(data2.LookupDense));
            }
        }
        public void EstimatorWorksAfterDeserialization()
        {
            ICardinalityEstimator <int> original = new CardinalityEstimator();

            original.Add(5);
            original.Add(7);
            Assert.Equal(2UL, original.Count());

            var binaryFormatter = new BinaryFormatter();

            using (var memoryStream = new MemoryStream())
            {
                binaryFormatter.Serialize(memoryStream, original);
                memoryStream.Seek(0, SeekOrigin.Begin);
                CardinalityEstimator copy = (CardinalityEstimator)binaryFormatter.Deserialize(memoryStream);

                Assert.Equal(2UL, copy.Count());
                copy.Add(5);
                copy.Add(7);
                Assert.Equal(2UL, copy.Count());
            }
        }
Ejemplo n.º 26
0
        private void TestDeserializer(int cardinality)
        {
            CardinalityEstimator hll = CreateAndFillCardinalityEstimator(cardinality);
            CardinalityEstimator hll2;

            var serializer = new CardinalityEstimatorSerializer();

            byte[] results;
            using (var memoryStream = new MemoryStream())
            {
                serializer.Serialize(memoryStream, hll, false);

                results = memoryStream.ToArray();
            }

            using (var memoryStream = new MemoryStream(results))
            {
                hll2 = serializer.Deserialize(memoryStream, false);
            }

            CompareHLL(hll, hll2);
        }
        /// <summary>
        ///     Deserialize a <see cref="CardinalityEstimator" /> from the given <paramref name="stream" />
        /// </summary>
        public CardinalityEstimator Deserialize(Stream stream)
        {
            using (var br = new BinaryReader(stream))
            {
                int dataFormatMajorVersion = br.ReadUInt16();
                int dataFormatMinorVersion = br.ReadUInt16();

                AssertDataVersionCanBeRead(dataFormatMajorVersion, dataFormatMinorVersion);

                HashFunctionId hashFunctionId;
                if (dataFormatMajorVersion >= 2)
                {
                    // Starting with version 2.0, the serializer writes the hash function ID
                    hashFunctionId = (HashFunctionId)br.ReadByte();
                }
                else
                {
                    // Versions before 2.0 all used FNV-1a
                    hashFunctionId = HashFunctionId.Fnv1A;
                }

                int bitsPerIndex = br.ReadInt32();
                byte flags = br.ReadByte();
                bool isSparse = ((flags & 2) == 2);
                bool isDirectCount = ((flags & 1) == 1);

                HashSet<ulong> directCount = null;
                IDictionary<ushort, byte> lookupSparse = isSparse ? new Dictionary<ushort, byte>() : null;
                byte[] lookupDense = null;

                if (isDirectCount)
                {
                    int count = br.ReadInt32();
                    directCount = new HashSet<ulong>();

                    for (var i = 0; i < count; i++)
                    {
                        ulong element = br.ReadUInt64();
                        directCount.Add(element);
                    }
                }
                else if (isSparse)
                {
                    int count = br.ReadInt32();

                    for (var i = 0; i < count; i++)
                    {
                        ushort elementKey = br.ReadUInt16();
                        byte elementValue = br.ReadByte();
                        lookupSparse.Add(elementKey, elementValue);
                    }
                }
                else
                {
                    int count = br.ReadInt32();
                    lookupDense = br.ReadBytes(count);
                }

                // Starting with version 2.1, the serializer writes CountAdditions
                ulong countAdditions = 0UL;
                if (dataFormatMajorVersion >= 2 && dataFormatMinorVersion >= 1)
                {
                    countAdditions = br.ReadUInt64();
                }

                var data = new CardinalityEstimatorState
                {
                    HashFunctionId = hashFunctionId,
                    BitsPerIndex = bitsPerIndex,
                    DirectCount = directCount,
                    IsSparse = isSparse,
                    LookupDense = lookupDense,
                    LookupSparse = lookupSparse,
                    CountAdditions = countAdditions,
                };

                var result = new CardinalityEstimator(data);

                return result;
            }
        }
        private CardinalityEstimator CreateAndFillCardinalityEstimator(int cardinality = 1000000)
        {
            var hll = new CardinalityEstimator();

            var nextMember = new byte[ElementSizeInBytes];
            for (var i = 0; i < cardinality; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);
            }

            return hll;
        }
        private void RunTest(double stdError, long expectedCount, double? maxAcceptedError = null, int numHllInstances = 1)
        {
            maxAcceptedError = maxAcceptedError ?? 5*stdError; // should fail appx once in 1.7 million runs
            int b = GetAccuracyInBits(stdError);

            var runStopwatch = new Stopwatch();
            long gcMemoryAtStart = GetGcMemory();

            // init HLLs
            var hlls = new CardinalityEstimator[numHllInstances];
            for (var i = 0; i < numHllInstances; i++)
            {
                hlls[i] = new CardinalityEstimator(b);
            }

            var nextMember = new byte[ElementSizeInBytes];
            runStopwatch.Start();
            for (long i = 0; i < expectedCount; i++)
            {
                // pick random hll, add member
                int chosenHll = Rand.Next(numHllInstances);
                Rand.NextBytes(nextMember);
                hlls[chosenHll].Add(nextMember);
            }

            runStopwatch.Stop();
            ReportMemoryCost(gcMemoryAtStart); // done here so references can't be GC'ed yet

            // Merge
            CardinalityEstimator mergedHll = CardinalityEstimator.Merge(hlls);
            Console.WriteLine("Run time: {0}", runStopwatch.Elapsed);
            Console.WriteLine("Expected {0}, got {1}", expectedCount, mergedHll.Count());

            double obsError = Math.Abs(mergedHll.Count()/(double) (expectedCount) - 1.0);
            Console.WriteLine("StdErr: {0}.  Observed error: {1}", stdError, obsError);
            Assert.IsTrue(obsError <= maxAcceptedError, string.Format("Observed error was over {0}", maxAcceptedError));
            Console.WriteLine();
        }
        public void StaticMergeHandlesNullParameter()
        {
            CardinalityEstimator result = CardinalityEstimator.Merge(null as IEnumerable <CardinalityEstimator>);

            Assert.Null(result);
        }
Ejemplo n.º 31
0
 public HyperLogLog(Stream stream)
 {
     _hyperLogLog = Serializer.Deserialize(stream);
 }
        private void RunRecreationFromData(int cardinality = 1000000)
        {
            var hll = new CardinalityEstimator();

            var nextMember = new byte[ElementSizeInBytes];
            for (var i = 0; i < cardinality; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);
            }

            CardinalityEstimatorState data = hll.GetState();

            var hll2 = new CardinalityEstimator(data);
            CardinalityEstimatorState data2 = hll2.GetState();

            Assert.AreEqual(data.BitsPerIndex, data2.BitsPerIndex);
            Assert.AreEqual(data.IsSparse, data2.IsSparse);

            Assert.IsTrue((data.DirectCount != null && data2.DirectCount != null) || (data.DirectCount == null && data2.DirectCount == null));
            Assert.IsTrue((data.LookupSparse != null && data2.LookupSparse != null) ||
                          (data.LookupSparse == null && data2.LookupSparse == null));
            Assert.IsTrue((data.LookupDense != null && data2.LookupDense != null) || (data.LookupDense == null && data2.LookupDense == null));

            if (data.DirectCount != null)
            {
                // DirectCount are subsets of each-other => they are the same set
                Assert.IsTrue(data.DirectCount.IsSubsetOf(data2.DirectCount) && data2.DirectCount.IsSubsetOf(data.DirectCount));
            }
            if (data.LookupSparse != null)
            {
                Assert.IsTrue(data.LookupSparse.DictionaryEqual(data2.LookupSparse));
            }
            if (data.LookupDense != null)
            {
                Assert.IsTrue(data.LookupDense.SequenceEqual(data2.LookupDense));
            }
        }
        [Ignore] // Test runtime is long
        public void ReportAccuracy()
        {
            var hll = new CardinalityEstimator();
            double maxError = 0;
            var worstMember = 0;
            var nextMember = new byte[ElementSizeInBytes];
            for (var i = 0; i < 10000000; i++)
            {
                Rand.NextBytes(nextMember);
                hll.Add(nextMember);

                if (i%1007 == 0) // just some interval to sample error at, can be any number
                {
                    double error = (hll.Count() - (double) (i + 1))/((double) i + 1);
                    if (error > maxError)
                    {
                        maxError = error;
                        worstMember = i + 1;
                    }
                }
            }

            Console.WriteLine("Worst: {0}", worstMember);
            Console.WriteLine("Max error: {0}", maxError);

            Assert.IsTrue(true);
        }
Ejemplo n.º 34
0
 public HyperLogLog()
 {
     _hyperLogLog = new CardinalityEstimator();
 }
        /// <summary>
        ///     Deserialize a <see cref="CardinalityEstimator" /> from the given <paramref name="stream" />
        /// </summary>
        public CardinalityEstimator Deserialize(Stream stream)
        {
            using (var br = new BinaryReader(stream))
            {
                int dataFormatMajorVersion = br.ReadUInt16();
                int dataFormatMinorVersion = br.ReadUInt16();

                AssertDataVersionCanBeRead(dataFormatMajorVersion, dataFormatMinorVersion);

                int bitsPerIndex = br.ReadInt32();
                byte flags = br.ReadByte();
                bool isSparse = ((flags & 2) == 2);
                bool isDirectCount = ((flags & 1) == 1);

                HashSet<ulong> directCount = null;
                IDictionary<ushort, byte> lookupSparse = isSparse ? new Dictionary<ushort, byte>() : null;
                byte[] lookupDense = null;

                if (isDirectCount)
                {
                    int count = br.ReadInt32();
                    directCount = new HashSet<ulong>();

                    for (var i = 0; i < count; i++)
                    {
                        ulong element = br.ReadUInt64();
                        directCount.Add(element);
                    }
                }
                else if (isSparse)
                {
                    int count = br.ReadInt32();

                    for (var i = 0; i < count; i++)
                    {
                        ushort elementKey = br.ReadUInt16();
                        byte elementValue = br.ReadByte();
                        lookupSparse.Add(elementKey, elementValue);
                    }
                }
                else
                {
                    int count = br.ReadInt32();
                    lookupDense = br.ReadBytes(count);
                }

                var data = new CardinalityEstimatorState
                {
                    BitsPerIndex = bitsPerIndex,
                    DirectCount = directCount,
                    IsSparse = isSparse,
                    LookupDense = lookupDense,
                    LookupSparse = lookupSparse
                };

                var result = new CardinalityEstimator(data);

                return result;
            }
        }