/// <summary>
/// Records the document's id under every token of the document in the key-value storage,
/// so that each token's stored context lists the documents it appears in.
/// Calling this effectively adds <paramref name="document"/> to the similarity model.
/// </summary>
/// <param name="document">Document whose tokens are registered in the storage.</param>
private async Task SaveTokenFrequenciesAsync(Document document)
{
    foreach (var token in document.Tokens)
    {
        var storageKey = _tokenContextCacheKey(token);

        // A token that has never been seen has no stored context yet; start from an empty one.
        var stored = await _keyValueStorage.GetAsync<TokenContext>(storageKey);
        var context = stored ?? new TokenContext();

        // Keep the id list free of duplicates when the same document is processed again.
        var alreadyRegistered = context.DocumentIds.Contains(document.Id);
        if (!alreadyRegistered)
        {
            context.DocumentIds.Add(document.Id);
        }

        // The context is written back on every iteration, whether or not it changed.
        await _keyValueStorage.SetAsync(storageKey, context);
    }
}
/// <summary>
/// Finds the duplicates of an article.
/// </summary>
/// <remarks>
/// Similarity results are taken from the cache when an entry exists for the article;
/// otherwise they are computed from the article's document representation.
/// </remarks>
/// <param name="article">Article to scan for duplicates.</param>
/// <param name="saveInCache">
/// When <c>true</c>, freshly computed similarity results are written to the cache.
/// Passing <c>false</c> can be useful if the article was not added to the database.
/// </param>
/// <returns>
/// Ids of the similarity results, other than the article itself, whose score is at least
/// the configured matching threshold.
/// </returns>
public async Task<List<int>> FindDuplicatesAsync(Article article, bool saveInCache)
{
    var cacheKey = _articleSimilarityCacheKey(article);
    var scores = await _cache.GetAsync<List<SimilarityResult>>(cacheKey);

    if (scores == null)
    {
        scores = await _similarityScoring.GetSimilarityScoresAsync(article.ToDocument());

        if (saveInCache)
        {
            // Store the unfiltered results: the matching threshold may change later,
            // so it is applied on read rather than baked into the cached data.
            await _cache.SetAsync(cacheKey, scores);
        }
    }

    var duplicateIds =
        from candidate in scores
        where candidate.Id != article.Id
              && candidate.Score >= _articleCheckConfiguration.MatchingThreshold
        select candidate.Id;

    return duplicateIds.ToList();
}