/// <summary>
        /// Calculates token frequencies for the document and persists them in the cache (for faster access).
        /// Calling this method adds {document} to our similarity model.
        /// </summary>
        private async Task SaveTokenFrequenciesAsync(Document document)
        {
            // Walk every token of the document and register the document's id
            // in the context stored under that token's key.
            foreach (var term in document.Tokens)
            {
                var storageKey = _tokenContextCacheKey(term);

                // Use the stored context when the storage returns one; otherwise start a new one.
                var stored  = await _keyValueStorage.GetAsync<TokenContext>(storageKey);
                var context = stored ?? new TokenContext();

                // List the document id at most once per token context.
                var alreadyListed = context.DocumentIds.Contains(document.Id);
                if (!alreadyListed)
                {
                    context.DocumentIds.Add(document.Id);
                }

                // The context is written back on every iteration, whether or not it was modified.
                await _keyValueStorage.SetAsync(storageKey, context);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Finds duplicates of article
        /// </summary>
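        /// <param name="article">Article to scan for duplicates</param>
        /// <param name="saveInCache">Whether to save freshly computed results to the Redis cache (it can be useful to pass false if the article has not been added to the database)</param>
        /// <returns>Ids of the results, other than the article itself, whose score is at least the configured matching threshold</returns>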
        public async Task<List<int>> FindDuplicatesAsync(Article article, bool saveInCache)
        {
            // The same key is used for the cache lookup and for the optional write-back.
            var cacheKey = _articleSimilarityCacheKey(article);

            var scores = await _cache.GetAsync<List<SimilarityResult>>(cacheKey);

            if (scores == null)
            {
                // Nothing cached: compute the similarity scores from the article's document form.
                scores = await _similarityScoring.GetSimilarityScoresAsync(article.ToDocument());

                if (saveInCache)
                {
                    // Store the unfiltered scores: the threshold is applied after caching,
                    // so a later change of MatchingThreshold does not invalidate the entry.
                    await _cache.SetAsync(cacheKey, scores);
                }
            }

            // Keep every result that is not the article itself and reaches the threshold,
            // projecting straight to its id.
            var duplicateIds =
                from candidate in scores
                where candidate.Id != article.Id
                      && candidate.Score >= _articleCheckConfiguration.MatchingThreshold
                select candidate.Id;

            return duplicateIds.ToList();
        }