Пример #1
0
        /// <summary>
        /// Builds a shared n-gram "universe" (n-gram -> integer id) across all listings and then
        /// computes a MinHash signature for every listing that does not have one yet.
        /// </summary>
        /// <remarks>
        /// NOTE(review): despite "Title" in the method name, only the <c>*_description</c> members
        /// of <c>Listing</c> are read and written here - confirm this is intended and not a
        /// copy/paste slip.
        /// </remarks>
        /// <param name="listings">
        /// Listings to process. Their n-gram profile, word-id list and MinHash members are
        /// updated in place.
        /// </param>
        /// <param name="job_id">Job identifier forwarded to <c>ProgressManager</c> for progress reporting.</param>
        private static void BuildTitleMinHashes(List <Listing> listings, string job_id)
        {
            // Nothing to hash: avoid computing a per-item increment for zero items and
            // constructing a MinHash over an empty universe.
            if (listings.Count == 0)
            {
                return;
            }

            // Both passes visit every listing exactly once, so they share one per-item increment.
            var singleItemProgress = ProgressManager.CalculateLoopIncrement(listings.Count, 0.2M);

            var universe = new Dictionary<string, int>();

            // This instance is only used to build n-gram profiles; the instance used for hashing
            // is created after the first pass, once the final universe size is known.
            var profiler = new MinHash(200000, minHashCount);

            // Pass 1: build n-grams for each listing (if not done already) and register them
            // in the universe of n-grams.
            foreach (var listing in listings)
            {
                if (!listing.ngrams_description.Any())
                {
                    listing.ngrams_description = profiler.GetProfile(listing.description, nGramLength);
                }

                foreach (var ngram in listing.ngrams_description.Keys)
                {
                    // Ids are assigned densely in first-seen order, so the next free id is
                    // always the current size of the universe.
                    if (!universe.ContainsKey(ngram))
                    {
                        universe.Add(ngram, universe.Count);
                    }
                }

                ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
            }

            var hasher = new MinHash(universe.Count, minHashCount);

            // Pass 2: translate each listing's n-grams to word ids and compute its MinHash.
            foreach (var listing in listings)
            {
                // Listings that already carry a MinHash are left untouched.
                if (!listing.minhash_description.Any())
                {
                    // Set word ID in each document
                    foreach (var ngram in listing.ngrams_description.Keys)
                    {
                        listing.word_ids_description.Add(universe[ngram]);
                    }

                    // Calculate min hash for each listing
                    listing.minhash_description = hasher.GetMinHash(listing.word_ids_description);
                }

                // Report progress for skipped listings as well: the increment was derived from
                // the full listing count, so skipping the report would leave the job short of
                // this pass's share of the progress.
                ProgressManager.IncrementJobPercentBy(job_id, singleItemProgress);
            }
        }