/// <summary>
/// Exercises <c>MinHash.Similarity</c> on three input pairs: two identical bags (expects 1.0),
/// a 1000-word dictionary against the numbers 0..999 as strings (expects 0.0), and a
/// 1000-word dictionary against a 500-word dictionary (expects a value within [0.5, 0.7]).
/// </summary>
public void TestMinHashSimilarity()
{
    // Identical content, passed as two separate array instances.
    var names = new List<string> { "bob", "alice", "frank", "tyler", "sara" };
    string[] left = names.ToArray();
    string[] right = names.ToArray();
    var simRatio = MinHash.Similarity(left, right);
    Assert.AreEqual(1.0, simRatio);

    // Dictionary words versus the decimal strings "0".."999".
    var dict = Words.Dictionary(1000);
    var numbers = new string[1000];
    for (int k = 0; k < numbers.Length; k++)
    {
        numbers[k] = k.ToString();
    }

    simRatio = MinHash.Similarity(dict, numbers);
    Assert.AreEqual(0.0, simRatio);

    // 1000-word dictionary versus 500-word dictionary.
    var halfDict = Words.Dictionary(500);
    simRatio = MinHash.Similarity(dict, halfDict);
    if (simRatio < 0.5 || simRatio > 0.7)
    {
        Assert.Fail(string.Format("Expected between 0.5 and 0.7, got {0}", simRatio));
    }
}
// The number of signatures to reduce to.
private static int MAX_SIGNATURE_COUNT = 5;

/// <summary>
/// Creates a repository that keeps the supplied permutations, database service and
/// fingerprint service, and builds a <c>MinHash</c> from the stored permutations.
/// </summary>
/// <param name="permutations">Permutations handed to the MinHash instance</param>
/// <param name="dbService">Database service stored on the repository</param>
/// <param name="fingerprintService">Fingerprint service stored on the repository</param>
public Repository(IPermutations permutations, DatabaseService dbService, FingerprintService fingerprintService)
{
    this.dbService = dbService;
    this.fingerprintService = fingerprintService;

    // The MinHash is built from the stored member, so assign it first.
    this.permutations = permutations;
    this.minHash = new MinHash(this.permutations);
}
/// <summary>
/// Builds a repository on top of a track/fingerprint store and a permutation store.
/// A new <c>FingerprintManager</c> and a <c>MinHash</c> over the permutations are created as well.
/// </summary>
/// <param name="storage">Track/Signatures storage</param>
/// <param name="permutations">Permutations storage</param>
public Repository(IStorage storage, IPermutations permutations)
{
    _storage = storage;
    _permutations = permutations;

    _manager = new FingerprintManager();
    _hasher = new MinHash(_permutations);
}
/// <summary>
/// Verifies that <c>GetShingles</c> on "Hello world I am a string" yields the expected
/// lower-cased shingles, compared as space-joined strings.
/// </summary>
public void GetShinglesTest()
{
    const string input = "Hello world I am a string";
    var expected = new List<string>
    {
        "hello world i",
        "world i am",
        "i am a",
        "am a string",
        "a string",
        "string"
    };

    var minHash = new MinHash(10);
    List<string> result = minHash.GetShingles(input);

    string expectedJoined = string.Join(" ", expected);
    string resultJoined = string.Join(" ", result);
    Assert.AreEqual(expectedJoined, resultJoined);
}
/// <summary>
/// Builds n-gram profiles and MinHash signatures for the description of every listing.
/// </summary>
/// <remarks>
/// Pass 1 computes the n-gram profile of each listing that does not have one yet and assigns
/// every distinct n-gram a sequential id. Pass 2 maps each listing's n-grams to those ids and
/// computes its MinHash, skipping listings that already have one. Job progress is advanced
/// once per listing in both passes, including listings whose MinHash already exists.
/// Despite the method name, only the <c>description</c> fields are read and written here.
/// </remarks>
/// <param name="listings">Listings to process; their n-gram, word-id and MinHash members are updated in place</param>
/// <param name="job_id">Job identifier passed to the progress manager</param>
private static void BuildTitleMinHashes(List<Listing> listings, string job_id)
{
    var universe = new Dictionary<string, int>();
    var wordId = 0;

    // This instance is only used to build n-gram profiles in pass 1.
    var profiler = new MinHash(200000, minHashCount);
    var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

    // Pass 1: build n-grams for each listing (if not done already) and collect the universe of n-grams.
    foreach (var listing in listings)
    {
        if (!listing.ngrams_description.Any())
        {
            listing.ngrams_description = profiler.GetProfile(listing.description, nGramLength);
        }

        foreach (var ngram in listing.ngrams_description.Keys)
        {
            if (!universe.ContainsKey(ngram))
            {
                universe.Add(ngram, wordId++);
            }
        }

        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
    }

    singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

    // Pass 2: the hasher is sized to the number of distinct n-grams found above.
    var hasher = new MinHash(universe.Count, minHashCount);
    foreach (var listing in listings)
    {
        if (!listing.minhash_description.Any())
        {
            // Set word ID in each document.
            foreach (var ngram in listing.ngrams_description.Keys)
            {
                listing.word_ids_description.Add(universe[ngram]);
            }

            // Calculate min hash for each listing.
            listing.minhash_description = hasher.GetMinHash(listing.word_ids_description);
        }

        // Advance progress for skipped listings too, so this pass reports its full share.
        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
    }
}
/// <summary>
/// Verifies that two different shingle lists produce different MinHash signatures
/// (compared as space-joined strings).
/// </summary>
public void GetMinHashTest_ForDifferentInputs()
{
    var firstShingles = new List<string>
    {
        "hello world i",
        "world i am",
        "i am a",
        "am a string",
        "a string",
        "string"
    };
    var secondShingles = new List<string> { "not the same", "the same", "same" };

    var minHash = new MinHash(10);
    List<uint> firstSignature = minHash.GetMinHash(firstShingles);
    List<uint> secondSignature = minHash.GetMinHash(secondShingles);

    string secondJoined = string.Join(" ", secondSignature);
    string firstJoined = string.Join(" ", firstSignature);
    Assert.AreNotEqual(secondJoined, firstJoined);
}
/// <summary>
/// Returns the similarity between the set at <paramref name="setIndex"/> and every other set
/// reported by <c>getCloseMembers</c> for it.
/// </summary>
/// <param name="setIndex">Index of the set to find neighbours for</param>
/// <param name="minHasher">MinHash instance used to compute the similarity between two sets</param>
/// <returns>Map from neighbour set index to its similarity with the queried set; the queried set itself is excluded</returns>
public Dictionary<int, double> closestSimilarItems(int setIndex, MinHash<T> minHasher)
{
    var similarityByIndex = new Dictionary<int, double>();

    foreach (int candidate in getCloseMembers(setIndex, minHasher))
    {
        // A set is never reported as its own neighbour.
        if (candidate == setIndex)
        {
            continue;
        }

        similarityByIndex.Add(candidate, minHasher.ComputeSimilarity(minHashValues, setIndex, candidate));
    }

    return similarityByIndex;
}
/// <summary>
/// Computes and stores a MinHash for every person row read from the connection.
/// </summary>
/// <remarks>
/// For each row, the first column is cast to <c>long</c> and used as the key passed to
/// <c>SetMinHash</c>. Every remaining column that is not <c>DBNull</c> is converted to a string,
/// hashed with Farmhash 64-bit and added to a fresh <c>MinHash</c>, whose min-hashes are
/// serialised to a byte array and written back.
/// </remarks>
/// <param name="connection">Connection used to read the persons and to store the result</param>
/// <returns>Always 0</returns>
private static int ComputeHashMain(DatabaseConnection connection)
{
    foreach (var row in connection.ReadPersons())
    {
        var hashes = row.AsEnumerable()
            .Skip(1)
            .Where(value => !(value is System.DBNull))
            .Select(value => value.ToString())
            .Select(Farmhash.Sharp.Farmhash.Hash64);

        var sketch = new MinHash();
        sketch.Add(hashes);

        byte[] serialized = MinHash.ToByteArray(sketch.MinHashes);
        connection.SetMinHash((long)row[0], serialized);
    }

    return 0;
}
/// <summary>
/// Verifies that <c>Similarity</c> returns exactly 1.0 for two lists with identical content.
/// </summary>
public void SimilarityTest()
{
    var left = new List<uint> { 1, 2, 3 };
    var right = new List<uint> { 1, 2, 3 };
    var minHash = new MinHash(10);

    double similarity = minHash.Similarity(left, right);

    Assert.AreEqual(1.0, similarity);
}
/// <summary>
/// Compares the MinHash estimate of the Jaccard index of two similar word lists against the
/// exact value from <c>JaccardIndex.GetJaccardIndex</c>; they must differ by less than 0.01.
/// </summary>
public void Test1()
{
    string[] dataSet1 =
    {
        "To", "compute", "the", "resemblance", "and/or", "the", "containment", "of", "two", "documents",
        "it", "suffices", "to", "keep", "for", "each", "document", "a", "relatively", "small",
        "sketch.", "The", "sketches", "can", "be", "computed", "fairly", "fast", "(linear", "in",
        "the", "size", "of", "the", "documents)", "and", "given", "two", "sketches", "the",
        "resemblance", "or", "the", "containment", "of", "the", "corresponding", "documents", "can", "be",
        "computed", "in", "linear", "time", "in", "the", "size", "of", "the", "sketches.",
        "For", "computing", "resemblance,", "it", "suffices", "to", "keep", "a", "fixed", "size",
        "sketch.", "For", "computing", "containment,", "we", "need", "a", "sketch", "proportional", "to",
        "the", "size", "of", "the", "underlying", "document;", "however", "as", "it", "will",
        "be", "explained", "this", "problem", "can", "be", "finessed", "at", "the", "cost",
        "of", "a", "loss", "of", "precision."
    };

    string[] dataSet2 =
    {
        "In", "order", "to", "compute", "the", "resemblance", "or/and", "the", "containment", "of",
        "two", "documents", "it", "suffice", "to", "keep", "a", "relatively", "small", "sketch.",
        "The", "sketches", "can", "be", "computed", "fairly", "quick", "and", "given", "two",
        "sketches", "the", "resemblance", "or", "the", "containment", "of", "the", "corresponding", "document",
        "can", "be", "in", "linear", "time", "in", "the", "size", "of", "the",
        "sketches.", "For", "computing", "resemblance,", "it", "suffices", "to", "keep", "a", "fixed",
        "size", "sketch.", "For", "computing", "containment,", "we", "need", "a", "sketch", "proportional",
        "to", "the", "size", "of", "the", "underlying", "document", "however", "as", "it",
        "will", "be", "explained", "this", "problem", "can", "be", "finessed", "at", "the",
        "cost", "of", "the", "loss", "precision"
    };

    // Sketch of the first word list.
    var sketch1 = new MinHash();
    sketch1.Add(dataSet1.Select(Farmhash.Sharp.Farmhash.Hash64));

    // Sketch of the second word list.
    var sketch2 = new MinHash();
    sketch2.Add(dataSet2.Select(Farmhash.Sharp.Farmhash.Hash64));

    var estimate = sketch1.GetJaccardIndex(sketch2);
    var exact = JaccardIndex.GetJaccardIndex(dataSet1, dataSet2);

    Assert.True(Math.Abs(exact - estimate) < 0.01);
}
/// <summary>
/// Collects the indices of all sets that share at least one LSH bucket with the given set.
/// </summary>
/// <remarks>
/// For every band, the <c>ROWSINBAND</c> min-hash values of the set in that band are summed,
/// the sum is reduced modulo <c>PROCENT</c>, and every member of the resulting bucket in
/// <c>m_lshBuckets</c> is added to the result.
/// </remarks>
/// <param name="setIndex">Index of the set whose bucket neighbours are wanted</param>
/// <param name="minHasher">Not used by this method</param>
/// <returns>Distinct indices found in the visited buckets</returns>
private HashSet<int> getCloseMembers(int setIndex, MinHash<T> minHasher)
{
    var members = new HashSet<int>();

    for (int band = 0; band < m_numBands; band++)
    {
        // Sum the rows that make up this band of the signature.
        int bandSum = 0;
        for (int row = 0; row < ROWSINBAND; row++)
        {
            bandSum += minHashValues[setIndex][band * ROWSINBAND + row];
        }

        int bucket = bandSum % PROCENT;
        foreach (int member in m_lshBuckets[bucket])
        {
            members.Add(member);
        }
    }

    return members;
}
/// <summary>
/// Accumulates per-track query statistics by comparing the queried signature against every
/// candidate fingerprint.
/// </summary>
/// <remarks>
/// For each candidate, the Hamming distance to <paramref name="f"/> is computed once. Its
/// vote-weighted value (distance * table votes) is added to <c>HammingDistance</c>, and the
/// unweighted value feeds <c>HammingDistanceByTrack</c> and <c>MinHammingDistance</c>.
/// <c>Similarity</c> keeps the smallest Jaccard similarity seen for the track, because it
/// starts at <c>Double.MaxValue</c> and is replaced only by smaller values.
/// The unweighted distance is used directly instead of dividing the weighted one by the
/// votes again, so a candidate with zero table votes no longer raises a DivideByZeroException.
/// </remarks>
/// <param name="f">Actual signature gathered from the song</param>
/// <param name="potentialCandidates">Potential fingerprints returned from the database, mapped to their table votes</param>
/// <param name="lHashTables">Number of L Hash tables (not used by this method)</param>
/// <param name="kKeys">Number of keys per table (not used by this method)</param>
/// <param name="trackIdQueryStats">Result set, updated in place</param>
/// <returns>The same <paramref name="trackIdQueryStats"/> instance</returns>
private static Dictionary<Int32, QueryStats> ArrangeCandidatesAccordingToFingerprints(bool[] f, Dictionary<Fingerprint, int> potentialCandidates, int lHashTables, int kKeys, Dictionary<Int32, QueryStats> trackIdQueryStats)
{
    // Most time consuming method while performing the necessary calculation
    foreach (KeyValuePair<Fingerprint, int> pair in potentialCandidates)
    {
        Fingerprint fingerprint = pair.Key;
        int tableVotes = pair.Value;

        // Hamming distance of actual and read signature, plain and weighted by table votes.
        int distance = MinHash.CalculateHammingDistance(f, fingerprint.Signature);
        int weightedDistance = distance * tableVotes;
        double jaqSimilarity = MinHash.CalculateJaqSimilarity(f, fingerprint.Signature);

        // Fetch the statistics of this track, creating them on first sight.
        Int32 trackId = fingerprint.TrackId;
        QueryStats stats;
        if (!trackIdQueryStats.TryGetValue(trackId, out stats))
        {
            stats = new QueryStats(0, 0, 0, -1, -1, 0, Int32.MinValue, 0, Int32.MaxValue, Int32.MinValue, Int32.MinValue, Double.MaxValue);
            trackIdQueryStats.Add(trackId, stats);
        }

        stats.HammingDistance += weightedDistance;   // Sum hamming distance of each potential candidate
        stats.NumberOfTrackIdOccurences++;           // Increment occurrence count
        stats.NumberOfTotalTableVotes += tableVotes; // Find total table votes
        stats.HammingDistanceByTrack += distance;    // Find hamming distance by track id occurrence

        // Find minimal hamming distance over the entire set
        if (stats.MinHammingDistance > distance)
        {
            stats.MinHammingDistance = distance;
        }

        // Find maximal table vote
        if (stats.MaxTableVote < tableVotes)
        {
            stats.MaxTableVote = tableVotes;
        }

        // Keep the smallest similarity seen for this track
        if (stats.Similarity > jaqSimilarity)
        {
            stats.Similarity = jaqSimilarity;
        }
    }

    return trackIdQueryStats;
}
/// <summary>
/// Finds up to <paramref name="k"/> users whose ordered items are most similar to those of
/// <paramref name="id_user"/>, using MinHash signatures and LSH.
/// </summary>
/// <remarks>
/// The number of hash functions equals the number of distinct items ordered across all users.
/// If the user has no entry in the data read from the database, the result is an all-zero
/// array instead of the neighbours of whichever user happens to be first.
/// </remarks>
/// <param name="k">Maximum number of neighbours to return; also the length of the returned array</param>
/// <param name="id_user">Id of the user to find neighbours for</param>
/// <returns>
/// Array of length <paramref name="k"/> holding neighbour user ids in descending order of
/// similarity; unused trailing slots stay 0.
/// </returns>
private static int[] Calculeaza_vecini_LSH(int k, int id_user)
{
    // Fetch from the DB a dictionary mapping every user id to the list of items that user ordered.
    Dictionary<int, List<int>> toatePrep = DatabaseFunctions.preparateComandateDupaUtilizator();

    int[] vecini = new int[k];

    // Without an entry for this user there is no signature to compare against.
    if (!toatePrep.ContainsKey(id_user))
    {
        return vecini;
    }

    // Parallel lists: position i holds the user id and the item set of the i-th signature.
    List<int> userIds = new List<int>(toatePrep.Count);
    List<HashSet<int>> list_signatures = new List<HashSet<int>>(toatePrep.Count);
    HashSet<int> single_signatures = new HashSet<int>();
    foreach (KeyValuePair<int, List<int>> entry in toatePrep)
    {
        userIds.Add(entry.Key);
        list_signatures.Add(new HashSet<int>(entry.Value));
        single_signatures.UnionWith(entry.Value);
    }

    int numSets = userIds.Count;
    int numHashFunctions = single_signatures.Count;
    MinHash<int> minHash = new MinHash<int>(numHashFunctions);
    int[][] minHashValues = minHash.initializeHashBuckets(numSets, numHashFunctions);
    for (int index = 0; index < numSets; index++)
    {
        minHash.computeMinHashForSet(list_signatures[index], index, minHashValues, single_signatures);
    }

    int index_cautat = userIds.IndexOf(id_user);
    LSH<int> lsh = new LSH<int>(minHashValues, list_signatures);
    Dictionary<int, double> closeSimilarItems = lsh.closestSimilarItems(index_cautat, minHash);

    // Map signature positions back to user ids and keep the k most similar.
    // OrderByDescending is stable, so ties keep the enumeration order of closeSimilarItems.
    var ranked = closeSimilarItems
        .Where(item => item.Key >= 0 && item.Key < userIds.Count)
        .OrderByDescending(item => item.Value)
        .Take(k);

    int poz = 0;
    foreach (KeyValuePair<int, double> item in ranked)
    {
        vecini[poz++] = userIds[item.Key];
    }

    return vecini;
}
/// <summary>
/// Builds one 10000-slot table per phrase from the MinHash signatures of the phrase's bigrams,
/// prints both tables side by side, and prints how many slots hold 1 in both tables.
/// </summary>
/// <remarks>
/// This method only writes to the console; it makes no assertions. Both tables are filled with
/// the same bounds check, so they are built symmetrically.
/// </remarks>
public void TestIntMinHash()
{
    MinHash _mh = new MinHash(1000, 100);
    double[] bloom = new double[10000];
    int[] count = new int[10000];
    double[] bloom1 = new double[10000];
    int[] count1 = new int[10000];

    List<string> stemmedDoc;
    var stemSet = SchemeProcess.GetVocabulary("my name is zjw", out stemmedDoc, 0);
    var stemSet1 = SchemeProcess.GetVocabulary("my name is wrm", out stemmedDoc, 0);

    AccumulateSignatureSlots(_mh, stemSet, bloom, count);
    AccumulateSignatureSlots(_mh, stemSet1, bloom1, count1);

    // Count the slots that hold 1 in both tables, printing every pair of slots along the way.
    int len = 0;
    for (int i = 0; i < bloom.Length; i++)
    {
        if (bloom[i] == bloom1[i] && bloom[i].Equals(1))
        {
            len++;
        }

        Console.Write($"{bloom[i]} ");
        Console.WriteLine(bloom1[i]);
    }

    Console.WriteLine(len);
}

// Records, for every bigram of every stem, each MinHash signature value as a slot index in
// bloom/count; values outside the table are ignored.
private static void AccumulateSignatureSlots(MinHash hasher, System.Collections.IEnumerable stems, double[] bloom, int[] count)
{
    foreach (string stem in stems)
    {
        var biList = MyScheme.TransformKeywordsToBiGram(stem);
        foreach (string s in biList)
        {
            foreach (int i in hasher.getMinHashSignatures(s))
            {
                if (i < 0 || i >= bloom.Length)
                {
                    continue;
                }

                if (bloom[i] == 0)
                {
                    bloom[i] = 1;
                    count[i]++;
                }
                else
                {
                    // The numerator uses the old count; the divisor uses the incremented one.
                    bloom[i] = (bloom[i] * count[i] + 1) / (++count[i]);
                }
            }
        }
    }
}
/// <summary>
/// Builds the similarity graph for a set of different documents (comparative-sum variant)
/// using MinHash signatures and LSH, and prints the elapsed time in seconds.
/// </summary>
/// <remarks>
/// The graph is written to the members <c>nodes</c>, <c>NB</c> (neighbour list), <c>OFF</c>
/// (running neighbour count, starting with 0 and appended once per document) and <c>SIGN</c>
/// (1 when <c>Helpers.AreFromSameGroup</c> reports the pair as same-group, otherwise -0.5).
/// Every document, including the last one, is queried for its nearest neighbours.
/// </remarks>
/// <param name="docs">Documents to compare; each is also stored in <c>senteceNames</c> under its index</param>
/// <param name="offsets">Group offsets passed to <c>Helpers.AreFromSameGroup</c></param>
public void GenerateSSGraphForComparativeSum(List<string> docs, List<int> offsets)
{
    Stopwatch sw = new Stopwatch();
    sw.Start();

    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count; // number of documents
    int n = 100;        // signature columns per document; also the second MinHash argument
    int rows = 5;       // second argument of the LSH constructor

    // Shingle vector of every document.
    int[][] shingleVectors = new int[r][];
    for (int i = 0; i < r; i++)
    {
        shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
    }

    // MinHash signature matrix, one row per document.
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = shingleVectors[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    OFF.Add(0);
    int conCount = 0;
    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    int idx = 0;

    // GetLength(0) is the row count; GetUpperBound(0) is the last index and would skip the final document.
    int documentCount = minhashes.GetLength(0);
    for (int k = 0; k < documentCount; k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }

        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }

            // NOTE(review): idx counts accepted neighbours across all documents, not the current
            // document index; if the intent is to skip self-loops this should probably compare
            // against k. Left unchanged; confirm intent.
            if (i == idx)
            {
                continue;
            }

            NB.Add(i);
            if (Helpers.AreFromSameGroup(k, i, offsets))
            {
                SIGN.Add(1);
            }
            else
            {
                SIGN.Add(-0.5f);
            }

            conCount++;
            ++idx;
        }

        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}
/// <summary>
/// Query one specific song using MinHash algorithm.
/// </summary>
/// <remarks>
/// <paramref name="splashScreen"/> may be null: cancellation is only checked and progress is
/// only reported when a splash screen is supplied. <paramref name="signatures"/> is
/// materialised once so that a lazily evaluated sequence is not enumerated twice.
/// </remarks>
/// <param name="signatures">Signature signatures from a song; null entries are skipped</param>
/// <param name="dbService">DatabaseService used to query the underlying database</param>
/// <param name="minHash">MinHash used to compute the signature of each fingerprint and to group it into LSH buckets</param>
/// <param name="lshHashTables">Number of hash tables from the database</param>
/// <param name="lshGroupsPerKey">Number of groups per hash table</param>
/// <param name="thresholdTables">Minimum number of hash tables that must be found for one signature to be considered a candidate (0 = return all candidates, 2+ = return only exact matches)</param>
/// <param name="queryTime">Set by the method, representing the query length in milliseconds</param>
/// <param name="doSearchEverything">disregard the local sensitivity hashes and search the whole database</param>
/// <param name="splashScreen">The "please wait" splash screen (or null)</param>
/// <returns>Dictionary with Tracks ID's and the Query Statistics</returns>
public static Dictionary<Int32, QueryStats> QueryOneSongMinHash(IEnumerable<bool[]> signatures, DatabaseService dbService, MinHash minHash, int lshHashTables, int lshGroupsPerKey, int thresholdTables, ref long queryTime, bool doSearchEverything = false, SplashSceenWaitingForm splashScreen = null)
{
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();

    // Enumerate the input exactly once; it is needed for both the total count and the loop.
    IList<bool[]> signatureList = signatures as IList<bool[]> ?? signatures.ToList();
    int signatureCounter = 0;
    int signatureTotalCount = signatureList.Count;

    Dictionary<int, QueryStats> stats = new Dictionary<int, QueryStats>();
    foreach (bool[] signature in signatureList)
    {
        // Check if the user clicked cancel (only possible when a splash screen is shown).
        if (splashScreen != null && splashScreen.CancellationPending)
        {
            break;
        }

        if (signature == null)
        {
            continue;
        }

        IDictionary<int, IList<HashBinMinHash>> candidates = null;
        if (doSearchEverything)
        {
            candidates = dbService.ReadAllFingerprints();
        }
        else
        {
            // Compute Min Hash on randomly selected fingerprint
            int[] bin = minHash.ComputeMinHashSignature(signature);

            // Find all hashbuckets to care about
            Dictionary<int, long> hashes = minHash.GroupMinHashToLSHBuckets(bin, lshHashTables, lshGroupsPerKey);
            long[] hashbuckets = hashes.Values.ToArray();

            // Find all candidates by querying the database for those hashbuckets
            candidates = dbService.ReadFingerprintsByHashBucketLsh(hashbuckets);
        }

        // Reduce the potential candidates list if the number of hash tables found for each signature are less than the threshold
        Dictionary<int, IList<HashBinMinHash>> potentialCandidates = SelectPotentialMatchesOutOfEntireDataset(candidates, thresholdTables);

        // Get the final candidate list by only using the potential candidate list
        if (potentialCandidates.Count > 0)
        {
            IList<Fingerprint> fingerprints = dbService.ReadFingerprintById(potentialCandidates.Keys);
            Dictionary<Fingerprint, int> finalCandidates = fingerprints.ToDictionary(finger => finger, finger => potentialCandidates[finger.Id].Count);
            ArrangeCandidatesAccordingToFingerprints(signature, finalCandidates, lshHashTables, lshGroupsPerKey, stats);
        }

        // Calculate a percentage between 5 and 90 and report it on the splash screen
        int percentage = (int)((float)(signatureCounter) / (float)signatureTotalCount * 85) + 5;
        if (splashScreen != null)
        {
            splashScreen.SetProgress(percentage, String.Format("Searching for similar fingerprints.\n(Signature {0} of {1})", signatureCounter + 1, signatureTotalCount));
        }

        signatureCounter++;
    }

    stopWatch.Stop();
    queryTime = stopWatch.ElapsedMilliseconds; // Set the query time parameter
    return stats;
}
/// <summary>
/// Generates the sentence similarity graph for the given sentences via MinHashing and LSH,
/// and prints the elapsed time in seconds.
/// </summary>
/// <remarks>
/// The graph is described by the members <c>nodes</c>, the neighbourhood list <c>NB</c> and
/// the offset list <c>OFF</c> (running neighbour count, starting with 0 and appended once per
/// sentence). Every sentence, including the last one, is queried for its nearest neighbours.
/// </remarks>
/// <param name="docs">Sentences to compare; each is also stored in <c>senteceNames</c> under its index</param>
public void GenerateSSGraph(List<string> docs)
{
    Stopwatch sw = new Stopwatch();
    sw.Start();

    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count; // number of sentences
    int n = 40;         // signature columns per sentence; also the second MinHash argument
    int rows = 2;       // second argument of the LSH constructor

    // Shingle vector of every sentence.
    int[][] shingleVectors = new int[r][];
    for (int i = 0; i < r; i++)
    {
        shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
    }

    // MinHash signature matrix, one row per sentence.
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = shingleVectors[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    OFF.Add(0);
    int conCount = 0;
    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    int idx = 0;

    // GetLength(0) is the row count; GetUpperBound(0) is the last index and would skip the final sentence.
    int sentenceCount = minhashes.GetLength(0);
    for (int k = 0; k < sentenceCount; k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }

        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }

            // NOTE(review): idx counts accepted neighbours across all sentences, not the current
            // sentence index; if the intent is to skip self-loops this should probably compare
            // against k. Left unchanged; confirm intent.
            if (i == idx)
            {
                continue;
            }

            NB.Add(i);
            conCount++;
            ++idx;
        }

        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}
/// <summary>
/// Creates the form, fingerprints two hard-coded audio files, shows their spectrogram and
/// fingerprint images, and displays the Hamming distance and Jaccard similarity between the
/// first fingerprint of each file.
/// </summary>
/// <remarks>
/// The similarity label is only set when both files produced at least one non-null
/// fingerprint; a file that could not be loaded or yields no fingerprints is tolerated.
/// </remarks>
public CompareAudioForm()
{
    //
    // The InitializeComponent() call is required for Windows Forms designer support.
    //
    InitializeComponent();

    // Instantiate the Soundfingerprinting repository
    FingerprintService fingerprintService = Analyzer.GetSoundfingerprintingService();
    this.databaseService = DatabaseService.Instance;

    IPermutations permutations = new LocalPermutations("Soundfingerprinting\\perms.csv", ",");

    IFingerprintingConfiguration fingerprintingConfigCreation = new FullFrequencyFingerprintingConfiguration();
    repository = new Repository(permutations, databaseService, fingerprintService);
    ImageService imageService = new ImageService(fingerprintService.SpectrumService, fingerprintService.WaveletService);

    // NOTE(review): absolute, machine-specific paths; the form only works where these files exist.
    FileInfo filePathAudio1 = new FileInfo(@"C:\Users\perivar.nerseth\Music\Test Samples Database\VDUB1 Snare 004.wav");
    FileInfo filePathAudio2 = new FileInfo(@"C:\Users\perivar.nerseth\Music\Test Samples Search\VDUB1 Snare 004 - Start.wav");

    int fingerprintsPerRow = 2;

    double[][] logSpectrogram1 = null;
    double[][] logSpectrogram2 = null;
    List<bool[]> fingerprints1 = null;
    List<bool[]> fingerprints2 = null;

    WorkUnitParameterObject file1Param = Analyzer.GetWorkUnitParameterObjectFromAudioFile(filePathAudio1);
    if (file1Param != null)
    {
        file1Param.FingerprintingConfiguration = fingerprintingConfigCreation;

        // Get fingerprints
        fingerprints1 = fingerprintService.CreateFingerprintsFromAudioSamples(file1Param.AudioSamples, file1Param, out logSpectrogram1);

        pictureBox1.Image = imageService.GetSpectrogramImage(logSpectrogram1, logSpectrogram1.Length, logSpectrogram1[0].Length);
        pictureBoxWithInterpolationMode1.Image = imageService.GetImageForFingerprints(fingerprints1, file1Param.FingerprintingConfiguration.FingerprintLength, file1Param.FingerprintingConfiguration.LogBins, fingerprintsPerRow);
    }

    WorkUnitParameterObject file2Param = Analyzer.GetWorkUnitParameterObjectFromAudioFile(filePathAudio2);
    if (file2Param != null)
    {
        file2Param.FingerprintingConfiguration = fingerprintingConfigCreation;

        // Get fingerprints
        fingerprints2 = fingerprintService.CreateFingerprintsFromAudioSamples(file2Param.AudioSamples, file2Param, out logSpectrogram2);

        pictureBox2.Image = imageService.GetSpectrogramImage(logSpectrogram2, logSpectrogram2.Length, logSpectrogram2[0].Length);
        pictureBoxWithInterpolationMode2.Image = imageService.GetImageForFingerprints(fingerprints2, file2Param.FingerprintingConfiguration.FingerprintLength, file2Param.FingerprintingConfiguration.LogBins, fingerprintsPerRow);
    }

    // Only use the first signature of each file; either list may be null or empty.
    bool[] signature1 = (fingerprints1 != null && fingerprints1.Count > 0) ? fingerprints1[0] : null;
    bool[] signature2 = (fingerprints2 != null && fingerprints2.Count > 0) ? fingerprints2[0] : null;

    if (signature1 != null && signature2 != null)
    {
        int hammingDistance = MinHash.CalculateHammingDistance(signature1, signature2);
        double jaqSimilarity = MinHash.CalculateJaqSimilarity(signature1, signature2);
        lblSimilarity.Text = String.Format("Hamming: {0} JAQ: {1}", hammingDistance, jaqSimilarity);
    }
}