// Hashes `runs` random 128x32 fingerprints with the default hashing configuration and verifies
// that every one of the 25 hash tables ends up with more than 90% distinct hash bins.
public void ShouldNotGenerateAudioCollisions()
{
    const int runs = 10000;
    const int tablesCount = 25;
    const int topWavelets = 200;
    const int width = 128, height = 32;

    var lshAlgorithm = LocalitySensitiveHashingAlgorithm.Instance;
    var random = new Random(1); // fixed seed keeps the generated fingerprints reproducible
    var hashingConfig = new DefaultHashingConfig();
    var tables = Enumerable.Range(0, tablesCount).Select(index => new ConcurrentDictionary<int, int>()).ToArray();

    // The loop bound and the assertion threshold below share the same `runs` constant,
    // so changing the number of simulations cannot silently skew the 90% check.
    for (int run = 0; run < runs; ++run)
    {
        var schema = TestUtilities.GenerateRandomFingerprint(random, topWavelets, width, height);
        var fingerprint = new Fingerprint(schema, 0, 0, Array.Empty<byte>());
        var hashed = lshAlgorithm.Hash(fingerprint, hashingConfig);
        for (int table = 0; table < tables.Length; ++table)
        {
            // Occurrences are counted per hash bin; only the number of distinct keys is asserted.
            int key = hashed.HashBins[table];
            tables[table].AddOrUpdate(key, 1, (_, count) => count + 1);
        }
    }

    for (int table = 0; table < tables.Length; ++table)
    {
        int distinct = tables[table].Count;
        Assert.IsTrue(distinct > runs * 0.9, $"Table {table} holds only {distinct} distinct hashes out of {runs} fingerprints");
    }
}
// Hashes 100K random fingerprints, stores them in RAM storage and checks, for every entry of
// storage.HashCountsPerTable, that (total - count) / total does not exceed 1%.
// NOTE(review): HashCountsPerTable is presumably the number of distinct hashes per table - confirm.
public void DistributionOfHashesHasToBeUniform()
{
    const int numberOfTables = 25;
    const int fingerprintsCount = 100000;

    var lshAlgorithm = LocalitySensitiveHashingAlgorithm.Instance;
    // Fixed seed: the generated fingerprints are identical on every run, so a failure is reproducible.
    var random = new Random(1);
    var storage = new RAMStorage(numberOfTables);
    float step = 8192f / 5512; // multiplied by the index to space out the second Fingerprint argument
    var config = new DefaultHashingConfig
    {
        NumberOfLSHTables = numberOfTables,
        NumberOfMinHashesPerTable = 4,
        HashBuckets = 0
    };
    var track = new ModelReference<int>(1);

    for (int i = 0; i < fingerprintsCount; ++i)
    {
        var schema = TestUtilities.GenerateRandomFingerprint(random, 200, 128, 32);
        var hash = lshAlgorithm.Hash(new Fingerprint(schema, i * step, (uint)i, Array.Empty<byte>()), config);
        var subFingerprint = new SubFingerprintData(hash.HashBins, hash.SequenceNumber, hash.StartsAt, new ModelReference<uint>((uint)i), track);
        storage.AddSubFingerprint(subFingerprint);
    }

    foreach (var hashPerTable in storage.HashCountsPerTable)
    {
        double collisions = (double)(fingerprintsCount - hashPerTable) / fingerprintsCount;
        Assert.IsTrue(collisions <= 0.01d, $"Less than 1% of collisions across 100K hashes: {collisions}");
    }
}
// Stores the hashes of 100 random fingerprints, then hashes 10 further random fingerprints and
// verifies that none of their hash bins returns a stored sub-fingerprint from any of the 25 tables.
public void FingerprintsCantMatchUniformlyAtRandom()
{
    const int numberOfTables = 25;
    const int storedCount = 100;
    const int queryCount = 10;

    var lshAlgorithm = LocalitySensitiveHashingAlgorithm.Instance;
    // Fixed seed: the assertion tolerates zero matches, so the inputs must be reproducible;
    // otherwise a chance collision would fail the test with no way to replay it.
    var random = new Random(1);
    var storage = new RAMStorage(numberOfTables);
    float step = 8192f / 5512; // multiplied by the index to space out the second Fingerprint argument
    var config = new DefaultHashingConfig
    {
        NumberOfLSHTables = numberOfTables,
        NumberOfMinHashesPerTable = 4,
        HashBuckets = 0
    };
    var track = new ModelReference<int>(1);

    for (int i = 0; i < storedCount; ++i)
    {
        var schema = TestUtilities.GenerateRandomFingerprint(random, 200, 128, 32);
        var hash = lshAlgorithm.Hash(new Fingerprint(schema, i * step, (uint)i, Array.Empty<byte>()), config);
        var subFingerprint = new SubFingerprintData(hash.HashBins, hash.SequenceNumber, hash.StartsAt, new ModelReference<uint>((uint)i), track);
        storage.AddSubFingerprint(subFingerprint);
    }

    for (int i = 0; i < queryCount; ++i)
    {
        var schema = TestUtilities.GenerateRandomFingerprint(random, 200, 128, 32);
        var query = lshAlgorithm.Hash(new Fingerprint(schema, i * step, (uint)i, Array.Empty<byte>()), config);
        for (int table = 0; table < numberOfTables; ++table)
        {
            var ids = storage.GetSubFingerprintsByHashTableAndHash(table, query.HashBins[table]);
            Assert.IsFalse(ids.Any(), $"Random query {i} unexpectedly matched a stored fingerprint in table {table}");
        }
    }
}
// For each target similarity, simulates 10,000 pairs of similar fingerprints, hashes both with the
// default configuration and measures how often at least one hash bin agrees. The measured rate is
// compared (tolerance 0.05) with the theoretical value 1 - (1 - s^rows)^bands, where s is the
// Jaccard similarity derived from the requested similarity and the number of top wavelets.
public void ShouldMatchAccordingToTheTheory()
{
    var lsh = LocalitySensitiveHashingAlgorithm.Instance;
    int bands = 25; // segments
    int rows = 4;
    int topWavelets = 200;
    int vectorLength = 128 * 32 * 2;
    var hashingConfig = new DefaultHashingConfig();

    double[] howSimilars = { 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9 };
    double[] avgCandidatesFound = new double[howSimilars.Length];
    double[] probabilityOfAMatch = new double[howSimilars.Length];
    double[] atLeastOneCandidateFounds = new double[howSimilars.Length];

    // Each iteration writes only to its own index of the result arrays and uses its own seeded
    // Random, so the parallel body shares no mutable state between similarities.
    Parallel.For(0, howSimilars.Length, i =>
    {
        var random = new Random(i);
        double howSimilar = howSimilars[i];
        double jaccardSimilarity = howSimilar * topWavelets / (2 * topWavelets - howSimilar * topWavelets);
        probabilityOfAMatch[i] = Math.Round(1 - Math.Pow(1 - Math.Pow(jaccardSimilarity, rows), bands), 4);

        int simulationRuns = 10000;
        int agreeOn = 0;
        int atLeastOneCandidateFound = 0;
        for (int j = 0; j < simulationRuns; ++j)
        {
            var arrays = TestUtilities.GenerateSimilarFingerprints(random, howSimilar, topWavelets, vectorLength);
            var hashed1 = lsh.Hash(new Fingerprint(arrays.Item1, 0, 0), hashingConfig);
            var hashed2 = lsh.Hash(new Fingerprint(arrays.Item2, 0, 0), hashingConfig);
            int agreeCount = AgreeOn(hashed1.HashBins, hashed2.HashBins);
            if (agreeCount > 0)
            {
                atLeastOneCandidateFound++;
            }

            agreeOn += agreeCount;
        }

        avgCandidatesFound[i] = Math.Round((double)agreeOn / simulationRuns, 4);
        atLeastOneCandidateFounds[i] = Math.Round((double)atLeastOneCandidateFound / simulationRuns, 4);
    });

    // Print the whole comparison table before asserting so it is available when a row fails.
    Console.WriteLine("Bands {0}, Rows {1}, Top Wavelets {2}", bands, rows, topWavelets);
    string header = $"{"Actual Similarity",5}{"Th. At Least One",19}{"Pr. At Least One",18}{"Avg. Candidates Found",25}";
    Console.WriteLine(header);
    for (int i = 0; i < howSimilars.Length; ++i)
    {
        Console.WriteLine("{0,5:0.0000}{1,20:0.0000}{2,18:0.0000}{3,20:0.0000}", howSimilars[i], probabilityOfAMatch[i], atLeastOneCandidateFounds[i], avgCandidatesFound[i]);
    }

    for (int i = 0; i < howSimilars.Length; ++i)
    {
        Assert.AreEqual(probabilityOfAMatch[i], atLeastOneCandidateFounds[i], 0.05);
    }
}
// Stores the Solr client, the two hash converters and the query builder, then sets the
// fingerprint length to (LSH tables x min-hashes per table) of a default hashing configuration.
internal SubFingerprintDao(
    ISolrOperations<SubFingerprintDTO> solr,
    IDictionaryToHashConverter dictionaryToHashConverter,
    IHashConverter hashConverter,
    ISolrQueryBuilder solrQueryBuilder)
{
    this.solr = solr;
    this.dictionaryToHashConverter = dictionaryToHashConverter;
    this.hashConverter = hashConverter;
    this.solrQueryBuilder = solrQueryBuilder;

    var defaultConfig = new DefaultHashingConfig();
    int tables = defaultConfig.NumberOfLSHTables;
    fingerprintLength = tables * defaultConfig.NumberOfMinHashesPerTable;
}
// For each target similarity, simulates 10,000 pairs of similar 128x72 fingerprints and records
// the average Hamming distance between the pair and the average number of hash tables on which
// their image hashes agree. The averages are then checked against the requested distance
// (tolerance 1) and the expected per-similarity threshold (floor of the average agreement).
public void ShouldBeAbleToControlReturnedCandidatesWithThresholdParameter()
{
    const int simulations = 10000;
    int l = 25, k = 4, width = 128, height = 72;
    var hashingConfig = new DefaultHashingConfig
    {
        Width = width,
        Height = height,
        NumberOfLSHTables = l,
        NumberOfMinHashesPerTable = k
    };
    var lsh = LocalitySensitiveHashingAlgorithm.Instance;

    double[] howSimilarly = { 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9 };
    int[] expectedThresholds = { 0, 0, 0, 2, 3, 5, 7, 11 };
    int topWavelets = (int)(0.035 * width * height); // same for every similarity, so computed once

    double[] avgHammingDistances = new double[howSimilarly.Length];
    double[] avgTableMatches = new double[howSimilarly.Length];

    // The parallel body only gathers statistics into its own array slot. Assertions are made
    // afterwards on the calling thread, so a failure surfaces as a plain assertion failure
    // instead of being wrapped in the AggregateException thrown by Parallel.For.
    Parallel.For(0, howSimilarly.Length, r =>
    {
        var random = new Random((r + 1) * 100);
        double howSimilar = howSimilarly[r];
        long hammingSum = 0;
        long agreeSum = 0;
        for (int i = 0; i < simulations; ++i)
        {
            var fingerprints = TestUtilities.GenerateSimilarFingerprints(random, howSimilar, topWavelets, width * height * 2);
            int hammingDistance = similarity.CalculateHammingDistance(fingerprints.Item1.ToBools(), fingerprints.Item2.ToBools());
            hammingSum += hammingDistance;

            var hashed1 = lsh.HashImage(new Fingerprint(fingerprints.Item1, 0, 0, Array.Empty<byte>()), hashingConfig);
            var hashed2 = lsh.HashImage(new Fingerprint(fingerprints.Item2, 0, 0, Array.Empty<byte>()), hashingConfig);
            int agreeCount = AgreeOn(hashed1.HashBins, hashed2.HashBins);
            agreeSum += agreeCount;
        }

        avgHammingDistances[r] = (double)hammingSum / simulations;
        avgTableMatches[r] = (double)agreeSum / simulations;
    });

    for (int r = 0; r < howSimilarly.Length; ++r)
    {
        double howSimilar = howSimilarly[r];
        double avgMatches = avgTableMatches[r];

        // Printed in index order and before the assertions, so the numbers are visible on failure.
        Console.WriteLine($"Similarity: {howSimilar: 0.00}, Avg. Table Matches {avgMatches: 0.000}");

        int requested = (int)((1 - howSimilar) * topWavelets * 2);
        Assert.AreEqual(requested, avgHammingDistances[r], 1);
        Assert.AreEqual(expectedThresholds[r], Math.Floor(avgMatches));
    }
}
// Image-hashes `runs` random 128x72 fingerprints (top-wavelet count = 4% of width x height) and
// verifies that every one of the 25 hash tables ends up with more than 90% distinct hash bins.
public void ShouldNotGenerateVideoCollisions()
{
    const int runs = 10000;
    const int tablesCount = 25;

    var lshAlgorithm = LocalitySensitiveHashingAlgorithm.Instance;
    var random = new Random(1); // fixed seed keeps the generated fingerprints reproducible
    int width = 128, height = 72;
    double topWaveletsPercentage = 0.04;
    int topWavelets = (int)(width * height * topWaveletsPercentage); // loop-invariant, computed once
    var hashingConfig = new DefaultHashingConfig() { Width = width, Height = height };
    var tables = Enumerable.Range(0, tablesCount).Select(index => new ConcurrentDictionary<int, int>()).ToArray();

    // The loop bound and the assertion threshold below share the same `runs` constant,
    // so changing the number of simulations cannot silently skew the 90% check.
    for (int run = 0; run < runs; ++run)
    {
        var schema = TestUtilities.GenerateRandomFingerprint(random, topWavelets, width, height);
        var fingerprint = new Fingerprint(schema, 0, 0, Array.Empty<byte>());
        var hashed = lshAlgorithm.HashImage(fingerprint, hashingConfig);
        for (int table = 0; table < tables.Length; ++table)
        {
            // Occurrences are counted per hash bin; only the number of distinct keys is asserted.
            int key = hashed.HashBins[table];
            tables[table].AddOrUpdate(key, 1, (_, count) => count + 1);
        }
    }

    Console.WriteLine(string.Join(",", tables.Select(t => t.Count)));
    for (int table = 0; table < tables.Length; ++table)
    {
        int distinct = tables[table].Count;
        Assert.IsTrue(distinct > runs * 0.9, $"Table {table} holds only {distinct} distinct hashes out of {runs} fingerprints");
    }
}