Ejemplo n.º 1
0
        /// <summary>
        /// Builds an LSH index over the description min-hashes of <paramref name="listings"/> and, for every
        /// listing whose first LSH neighbour has a buy-now price within 0.8x-1.2x of its own, records the
        /// two listings as likely duplicates of each other.
        /// </summary>
        /// <param name="listings">
        /// Listings to compare. Each listing's <c>minhash_description</c> fills one row of the signature matrix;
        /// matched listings get <c>likely_duplicate_id_by_description</c> and <c>similarity_description</c> set.
        /// </param>
        /// <param name="job_id">Job whose progress is advanced once per processed listing.</param>
        /// <param name="duplicates">Receives both directions (a -> b and b -> a) of every pair that is found.</param>
        private static void CalculateLshForListingSet(List<Listing> listings, string job_id, Dictionary<long, long> duplicates)
        {
            // An empty set has nothing to pair up and would also produce zero similarity buckets.
            if (listings.Count == 0)
            {
                return;
            }

            var numSimilarityBuckets = (int)Math.Ceiling(listings.Count / 100M);

            // Signature matrix: one row per listing, one column per min-hash value.
            var matrix = new int[listings.Count, minHashCount];
            for (int row = 0; row < listings.Count; row++)
            {
                var signature = listings[row].minhash_description;
                for (int col = 0; col < signature.Count; col++)
                {
                    matrix[row, col] = (int)signature[col];
                }
            }

            var lsh = new LSH(matrix, numSimilarityBuckets);
            lsh.Calc();

            var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            for (int index = 0; index < listings.Count; index++)
            {
                // Progress advances for every listing, including the ones skipped below.
                ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);

                var nearest = lsh.GetNearest(index);
                if (!nearest.Any())
                {
                    continue;
                }

                var thisListing    = listings[index];
                var nearestListing = listings[nearest[0]];

                // A zero price makes the ratio undefined (the division would throw), so the pair
                // cannot pass the price-similarity test and is skipped.
                if (thisListing.buy_now_price == 0)
                {
                    continue;
                }

                var priceRatio = nearestListing.buy_now_price / thisListing.buy_now_price;
                if (priceRatio < 0.8M || priceRatio > 1.2M)
                {
                    continue;
                }

                // NOTE(review): a lookup against a local, never-populated dictionary used to sit here and
                // could never skip anything. If listings that are already paired should be skipped, the
                // check probably belongs on `duplicates` - confirm the intended behaviour.

                thisListing.likely_duplicate_id_by_description = nearestListing.id;

                // NOTE(review): the second argument is the list of neighbour row indexes returned by
                // GetNearest, not the neighbour's min-hash row - confirm this is what Jaccard.Calc expects.
                thisListing.similarity_description = Jaccard.Calc(ArrayHelpers.GetRow<int>(matrix, index).ToList(), nearest);

                duplicates[nearestListing.id] = thisListing.id;
                duplicates[thisListing.id]    = nearestListing.id;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the sentence-similarity graph for <paramref name="docs"/> using min-hashing and LSH.
        /// The graph is appended to the neighbour list <c>NB</c> and the offset list <c>OFF</c>:
        /// after sentence <c>k</c> is processed, the running neighbour count is pushed onto <c>OFF</c>.
        /// The elapsed time in seconds is written to the console.
        /// </summary>
        /// <param name="docs">Sentences to link; element <c>i</c> is stored under index <c>i</c> in <c>senteceNames</c>.</param>
        public void GenerateSSGraph(List<string> docs)
        {
            Stopwatch sw = Stopwatch.StartNew();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of sentences = rows of the signature matrix
            int n    = 40;         // min-hash values per sentence = columns of the signature matrix
            int rows = 2;          // second LSH constructor argument (presumably rows per band - confirm)

            // Shingle vector of every sentence.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Min-hash signature of every sentence, one matrix row each.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List<int>  doc = shingleVectors[i].ToList();
                List<uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);
            lsh.Calc();

            // NOTE(review): idx is incremented together with conCount, so the `i == idx` test below
            // compares a neighbour index with the number of edges added so far. It may have been
            // meant to skip self-links (i == k) - confirm before changing.
            int idx = 0;

            // Visit every row. GetUpperBound(0) is the last valid index (r - 1), so using it as an
            // exclusive bound would leave the final sentence without an adjacency row and OFF entry.
            for (int k = 0; k < r; k++)
            {
                List<int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }

                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }
                    if (i == idx)
                    {
                        continue;
                    }
                    NB.Add(i);
                    conCount++;
                    ++idx;
                }
                OFF.Add(conCount);
            }

            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds the sentence-similarity graph for sentences drawn from several documents, using
        /// min-hashing and LSH. The graph is appended to the neighbour list <c>NB</c> and the offset
        /// list <c>OFF</c>; for every neighbour added to <c>NB</c> a weight is added to <c>SIGN</c>:
        /// <c>1</c> when <c>Helpers.AreFromSameGroup</c> reports the two sentences as belonging to the
        /// same group, <c>-0.5</c> otherwise. The elapsed time in seconds is written to the console.
        /// </summary>
        /// <param name="docs">Sentences to link; element <c>i</c> is stored under index <c>i</c> in <c>senteceNames</c>.</param>
        /// <param name="offsets">Group boundaries, passed unchanged to <c>Helpers.AreFromSameGroup</c>.</param>
        public void GenerateSSGraphForComparativeSum(List<string> docs, List<int> offsets)
        {
            Stopwatch sw = Stopwatch.StartNew();

            for (int i = 0; i < docs.Count; i++)
            {
                senteceNames[i] = docs[i];
            }

            int r    = docs.Count; // number of sentences = rows of the signature matrix
            int n    = 100;        // min-hash values per sentence = columns of the signature matrix
            int rows = 5;          // second LSH constructor argument (presumably rows per band - confirm)

            // Shingle vector of every sentence.
            int[][] shingleVectors = new int[r][];
            for (int i = 0; i < r; i++)
            {
                shingleVectors[i] = GetShingleVec(docs[i]).ToArray();
            }

            MinHash mh = new MinHash(r, n);

            // Min-hash signature of every sentence, one matrix row each.
            int[,] minhashes = new int[r, n];
            for (int i = 0; i < r; i++)
            {
                List<int>  doc = shingleVectors[i].ToList();
                List<uint> hvs = mh.GetMinHash(doc).ToList();
                for (int j = 0; j < hvs.Count; j++)
                {
                    minhashes[i, j] = (int)hvs[j];
                }
            }

            OFF.Add(0);
            int conCount = 0;

            LSH lsh = new LSH(minhashes, rows);
            lsh.Calc();

            // NOTE(review): idx is incremented together with conCount, so the `i == idx` test below
            // compares a neighbour index with the number of edges added so far. It may have been
            // meant to skip self-links (i == k) - confirm before changing.
            int idx = 0;

            // Visit every row. GetUpperBound(0) is the last valid index (r - 1), so using it as an
            // exclusive bound would leave the final sentence without an adjacency row and OFF entry.
            for (int k = 0; k < r; k++)
            {
                List<int> nearest = lsh.GetNearest(k);
                if (!nodes.Contains(k))
                {
                    nodes.Add(k);
                }

                foreach (int i in nearest)
                {
                    if (!nodes.Contains(i))
                    {
                        nodes.Add(i);
                    }
                    if (i == idx)
                    {
                        continue;
                    }
                    NB.Add(i);

                    // Edge weight: positive within a group, negative across groups.
                    if (Helpers.AreFromSameGroup(k, i, offsets))
                    {
                        SIGN.Add(1);
                    }
                    else
                    {
                        SIGN.Add(-0.5f);
                    }
                    conCount++;
                    ++idx;
                }
                OFF.Add(conCount);
            }

            sw.Stop();
            Console.WriteLine(sw.ElapsedMilliseconds / (double)1000);
        }