/// <summary>
/// Runs LSH over the minhash signatures of <paramref name="listings"/> and, for each listing,
/// records its nearest neighbour as a likely duplicate when the two prices are within 20% of
/// each other. Duplicate pairs are written (in both directions) into <paramref name="duplicates"/>.
/// Progress is reported against <paramref name="job_id"/> via <see cref="ProgressManager"/>.
/// </summary>
/// <param name="listings">Listings to de-duplicate; each must carry a populated minhash_description.</param>
/// <param name="job_id">Job identifier used for progress reporting.</param>
/// <param name="duplicates">Output map of listing id -> duplicate listing id (symmetric).</param>
private static void CalculateLshForListingSet(List<Listing> listings, string job_id, Dictionary<long, long> duplicates)
{
    // Roughly one similarity bucket per 100 listings (rounded up, so never zero for a non-empty set).
    var numSimilarityBuckets = (int)Math.Ceiling(listings.Count / 100M);

    // First make a 2-dimensional array (docs by min-hashes).
    var matrix = new int[listings.Count, minHashCount];
    for (int listing = 0; listing < listings.Count; listing++)
    {
        for (int hash = 0; hash < listings[listing].minhash_description.Count; hash++)
        {
            matrix[listing, hash] = (int)listings[listing].minhash_description[hash];
        }
    }

    // Now run LSH over the signature matrix.
    var lsh = new LSH(matrix, numSimilarityBuckets);
    lsh.Calc();

    // Set closest duplicate on each listing.
    var duplicatesFound = new Dictionary<long, long>();
    // Use the Count property rather than the LINQ Count() extension.
    var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);
    for (int listing = 0; listing < listings.Count; listing++)
    {
        ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);

        var nearest = lsh.GetNearest(listing);
        if (!nearest.Any())
        {
            continue;
        }

        var thisListing = listings[listing];
        var nearestListing = listings[nearest[0]];

        // BUG FIX: decimal division throws DivideByZeroException; skip listings with no price.
        if (thisListing.buy_now_price == 0)
        {
            continue;
        }

        // Only treat as duplicates when prices are within +/-20% of each other.
        var priceRatio = nearestListing.buy_now_price / thisListing.buy_now_price;
        if (priceRatio < 0.8M || priceRatio > 1.2M)
        {
            continue;
        }

        // Skip if the neighbour is already part of a recorded duplicate pair.
        if (duplicatesFound.ContainsKey(nearestListing.id))
        {
            continue;
        }

        listings[listing].likely_duplicate_id_by_description = nearestListing.id;

        // BUG FIX: the original compared this listing's minhash row against `nearest`
        // (a list of neighbour INDICES, not hashes). Compare the two signature rows instead.
        listings[listing].similarity_description = Jaccard.Calc(
            ArrayHelpers.GetRow<int>(matrix, listing).ToList(),
            ArrayHelpers.GetRow<int>(matrix, nearest[0]).ToList());

        duplicates[nearestListing.id] = thisListing.id;
        duplicates[thisListing.id] = nearestListing.id;

        // BUG FIX: duplicatesFound was declared and checked but never populated,
        // making the ContainsKey guard above dead code. Record both endpoints.
        duplicatesFound[nearestListing.id] = thisListing.id;
        duplicatesFound[thisListing.id] = nearestListing.id;
    }
}
public void GenerateSSGraph(List<string> docs)
{
    /* 1) decompose the document represented by each entry of docs into sentences
     * 2) generate the sentence-similarity graph via minhashing and LSH
     * 3) describe the graph by neighbourhood list NB and offset list OFF
     */
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Record each sentence under its index so it can be looked up later.
    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count;   // number of sentences (rows)
    int n = 40;           // minhash signature length
    int rows = 2;         // LSH band width; bands = n / rows

    // Shingle vector per sentence.
    int[][] minHashes = new int[r][];
    for (int i = 0; i < r; i++)
    {
        minHashes[i] = GetShingleVec(docs[i]).ToArray();
    }

    // Minhash each shingle vector into an r x n signature matrix.
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = minHashes[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    OFF.Add(0);
    int conCount = 0;

    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    // BUG FIX: GetUpperBound(0) is Length-1, so the original loop skipped the
    // last sentence entirely. Iterate over the full first dimension instead.
    for (int k = 0; k < minhashes.GetLength(0); k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }
        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }
            // BUG FIX: the original compared i against a globally-incrementing edge
            // counter (idx), which only coincidentally skipped anything. The intent
            // is to skip self-loops, i.e. the node's own index.
            if (i == k)
            {
                continue;
            }
            NB.Add(i);
            conCount++;
        }
        // OFF holds the cumulative neighbour count after each node.
        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}
public void GenerateSSGraphForComparativeSum(List<string> docs, List<int> offsets)
{
    /* Same as GenerateSSGraph, but applied to comparative summarisation:
     * here the sentences come from a set of different documents, and each
     * edge is signed (+1 intra-group, -0.5 inter-group) using `offsets`
     * to decide which document group a sentence belongs to.
     */
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // Record each sentence under its index so it can be looked up later.
    for (int i = 0; i < docs.Count; i++)
    {
        senteceNames[i] = docs[i];
    }

    int r = docs.Count;   // number of sentences (rows)
    int n = 100;          // minhash signature length
    int rows = 5;         // LSH band width; bands = n / rows

    // Shingle vector per sentence.
    int[][] minHashes = new int[r][];
    for (int i = 0; i < r; i++)
    {
        minHashes[i] = GetShingleVec(docs[i]).ToArray();
    }

    // Minhash each shingle vector into an r x n signature matrix.
    MinHash mh = new MinHash(r, n);
    int[,] minhashes = new int[r, n];
    for (int i = 0; i < r; i++)
    {
        List<int> doc = minHashes[i].ToList();
        List<uint> hvs = mh.GetMinHash(doc).ToList();
        for (int j = 0; j < hvs.Count; j++)
        {
            minhashes[i, j] = (int)hvs[j];
        }
    }

    OFF.Add(0);
    int conCount = 0;

    LSH lsh = new LSH(minhashes, rows);
    lsh.Calc();

    // BUG FIX: GetUpperBound(0) is Length-1, so the original loop skipped the
    // last sentence entirely. Iterate over the full first dimension instead.
    for (int k = 0; k < minhashes.GetLength(0); k++)
    {
        List<int> nearest = lsh.GetNearest(k);
        if (!nodes.Contains(k))
        {
            nodes.Add(k);
        }
        foreach (int i in nearest)
        {
            if (!nodes.Contains(i))
            {
                nodes.Add(i);
            }
            // BUG FIX: the original compared i against a globally-incrementing edge
            // counter (idx), which only coincidentally skipped anything. The intent
            // is to skip self-loops, i.e. the node's own index.
            if (i == k)
            {
                continue;
            }
            NB.Add(i);
            // Sign the edge: +1 when both sentences come from the same document
            // group, -0.5 otherwise.
            if (Helpers.AreFromSameGroup(k, i, offsets))
            {
                SIGN.Add(1);
            }
            else
            {
                SIGN.Add(-0.5f);
            }
            conCount++;
        }
        // OFF holds the cumulative neighbour count after each node.
        OFF.Add(conCount);
    }

    sw.Stop();
    Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
}