/// <summary>
/// Builds the full PRC index of a service. For every tag in
/// <c>prcSettings.IndexSettings.FilterTagIdList</c> each document is scored against all other
/// documents of the same tag, and the positively scored siblings are handed to
/// <c>redisHandler.AddDocuments</c>.
/// </summary>
/// <param name="processId">Id of the process that is notified about progress, cancellation, completion or failure.</param>
/// <param name="prcSettings">Settings of the PRC service (service id, data set name, index settings, fields for recommendation).</param>
/// <param name="token">
/// Cancellation token. It is checked before every tag, at the start of every document and after
/// each tag's parallel loop; when cancellation is detected at tag level <c>CleanPrcIndex</c> is called
/// and the process is marked as cancelled.
/// </param>
/// <remarks>
/// Any exception is caught, logged, followed by <c>CleanPrcIndex</c>, and reported through
/// <c>processHandler.Interrupted</c>; nothing is rethrown to the caller.
/// </remarks>
public void Index(string processId, PrcSettingsElastic prcSettings, CancellationToken token)
{
    var logPrefix = $"Prc Index {processId}";
    try
    {
        const int parallelMultiplier = 2;

        var service = serviceQuery.Get(prcSettings.ServiceId);
        redisHandler.Clean(PrcIndexRedisKey.ServiceDeleteKey(prcSettings.ServiceId));

        var wordQuery = queryFactory.GetWordQuery(prcSettings.DataSetName);
        var lockObject = new object();
        var documentQuery = queryFactory.GetDocumentQuery(prcSettings.DataSetName);

        // The sum of the per-tag document counts is the denominator of the overall progress.
        var tagsWithDocumentCounts = documentQuery.CountForTags(
            prcSettings.IndexSettings.FilterTagIdList,
            GlobalStore.DataSets.Get(prcSettings.DataSetName).DataSet.TagField);
        var allDocumentsCount = tagsWithDocumentCounts.Sum(d => d.Value);

        logger.LogInformation($"{logPrefix} starts with ParallelLimit: {parallelService.ParallelLimit * parallelMultiplier}, Tags Count: {prcSettings.IndexSettings.FilterTagIdList.Count}, All documents count: {allDocumentsCount}");
        var allDocProgress = new Progress(allDocumentsCount);

        // Fetch the documents belonging to the current tag (and to the Filter) from Elastic.
        foreach (var tagId in prcSettings.IndexSettings.FilterTagIdList)
        {
            if (token.IsCancellationRequested)
            {
                CleanPrcIndex(prcSettings.ServiceId);
                processHandler.Cancelled(processId);
                logger.LogInformation($"{logPrefix} cancelled Tag: `{tagId}`");
                return;
            }

            logger.LogTrace($"{logPrefix} preparing Tag: `{tagId}`");

            var globalSubset = GlobalStore.ActivatedPrcs.Get(prcSettings.ServiceId).PrcSubsets[tagId];
            if (globalSubset.WordsWithOccurences == null)
            {
                continue;
            }

            var documentElasticIds = GetDocumentIds(
                    prcSettings.DataSetName,
                    prcSettings.IndexSettings.FilterQuery,
                    new List<string> { tagId },
                    null,
                    prcSettings.IndexSettings.IndexDate)
                .OrderBy(o => o)
                .ToList();
            if (documentElasticIds.Count == 0)
            {
                continue;
            }

            var wwoDocuments = wordQuery.GetWordsWithOccurencesByDocuments(
                documentElasticIds,
                prcSettings.FieldsForRecommendation.Select(DocumentQuery.MapDocumentObjectName),
                1,
                parallelLimit: parallelService.ParallelLimit);
            var cleanedTextDocuments = wwoDocuments.ToDictionary(w => w.Key, w => GetCleanedText(w.Value));
            var docProgress = new Progress(documentElasticIds.Count);

            Parallel.ForEach(documentElasticIds, parallelService.ParallelOptions(parallelMultiplier), (documentId, loopState) =>
            {
                if (token.IsCancellationRequested)
                {
                    loopState.Stop();
                    return;
                }

                try
                {
                    logger.LogTrace($"{logPrefix} preparing Document: `{documentId}`/`{tagId}`");

                    // Compute the base dictionary (scorer) of the current document.
                    var scorer = GetScorer(
                        globalSubset,
                        wwoDocuments[documentId],
                        cleanedTextDocuments[documentId],
                        GlobalStore.ActivatedPrcs.Get(prcSettings.ServiceId).PrcScorers[tagId]);
                    if (scorer == null)
                    {
                        return;
                    }

                    var similarDocuments = new List<KeyValuePair<string, double>>();

                    // Every document of the tag except the current one.
                    foreach (var siblingDocumentId in documentElasticIds)
                    {
                        if (siblingDocumentId == documentId)
                        {
                            continue;
                        }

                        // Skip siblings sharing no word with the scorer's base dictionary.
                        // Any() stops at the first shared word instead of materializing the
                        // whole intersection for every document pair.
                        var wwoSibling = wwoDocuments[siblingDocumentId];
                        if (!wwoSibling.Keys.Any(word => scorer.BaseDic.Keys.Contains(word)))
                        {
                            continue;
                        }

                        var finalScore = GetPrcScore(scorer, cleanedTextDocuments[siblingDocumentId]);
                        if (finalScore > 0)
                        {
                            similarDocuments.Add(new KeyValuePair<string, double>(siblingDocumentId, finalScore));
                        }
                    }

                    var redisKey = new PrcIndexRedisKey(prcSettings.ServiceId, tagId, documentId);
                    redisHandler.AddDocuments(redisKey, similarDocuments);
                }
                finally
                {
                    // Runs for skipped (scorer == null) and failed documents too, so both
                    // progress counters advance once per document that entered the try block.
                    logger.LogTrace($"{logPrefix} prepared Document: `{documentId}`/`{tagId}`");
                    allDocProgress.Step();
                    var value = docProgress.Step();

                    // Report only every 50th document of the tag.
                    if (value % 50 == 0)
                    {
                        lock (lockObject)
                        {
                            processHandler.Changed(processId, allDocProgress.Percent.Round(6));
                        }
                        logger.LogTrace($"{logPrefix} progress {docProgress} in `{tagId}`");
                        logger.LogTrace($"{logPrefix} total progress is {allDocProgress}");
                    }

                    if (value % 1000 == 0)
                    {
                        GC.Collect();
                    }
                }
            });

            // The parallel loop only stops itself on cancellation; the clean-up happens here.
            if (token.IsCancellationRequested)
            {
                CleanPrcIndex(prcSettings.ServiceId);
                processHandler.Cancelled(processId);
                logger.LogInformation($"{logPrefix} cancelled Tag: `{tagId}`");
                return;
            }

            logger.LogInformation($"{logPrefix} prepared Tag: `{tagId}`");
            logger.LogInformation($"{logPrefix} total progress is {allDocProgress}");
            GC.Collect();
        }

        processHandler.Finished(
            processId,
            string.Format(ServiceResources.SuccessfullyIndexed_0_Service_1, ServiceTypeEnum.Prc, service.Name));
        logger.LogInformation($"{logPrefix} finished");
    }
    catch (Exception ex)
    {
        logger.LogError($"{logPrefix} failed. {ex.Message} {ex.StackTrace}");
        CleanPrcIndex(prcSettings.ServiceId);
        processHandler.Interrupted(processId, ex);
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// Partially re-indexes a PRC service: for every tag in
/// <c>prcSettings.IndexSettings.FilterTagIdList</c> only the documents returned by
/// <c>GetDocumentIds</c> for the window between <c>prcSettings.IndexSettings.IndexDate</c> and the
/// start of this run are re-scored against all documents of the tag, and the entries of the
/// affected counterpart documents are adjusted as well.
/// </summary>
/// <param name="processId">Id of the process that is notified about progress, cancellation, completion or failure.</param>
/// <param name="prcSettings">Settings of the PRC service; on success its <c>IndexSettings.IndexDate</c> is set to the start time of this run and saved via <c>serviceQuery.IndexSettings</c>.</param>
/// <param name="token">Cancellation token, checked before every tag, at the start of every document and after each tag's parallel loop.</param>
/// <remarks>
/// Any exception is caught, logged and reported through <c>processHandler.Interrupted</c>;
/// nothing is rethrown to the caller.
/// </remarks>
public void IndexPartial(string processId, PrcSettingsElastic prcSettings, CancellationToken token)
{
    var logPrefix = $"Prc Partial Index {processId}";
    try
    {
        const int parallelMultiplier = 2;

        // Upper bound of this run; it becomes the new IndexDate when the run succeeds.
        var partialIndexDate = DateTime.UtcNow;
        var service = serviceQuery.Get(prcSettings.ServiceId);
        var wordQuery = queryFactory.GetWordQuery(prcSettings.DataSetName);
        var tagProgress = new Progress(prcSettings.IndexSettings.FilterTagIdList.Count);

        logger.LogInformation($"{logPrefix} starts with ParallelLimit: {parallelService.ParallelLimit * parallelMultiplier}, Tags Count: {prcSettings.IndexSettings.FilterTagIdList.Count}");

        // Fetch the documents modified or created since the last indexing (honoring the Filter).
        // TODO get IndexFilterTagIds which has changed only
        foreach (var tagId in prcSettings.IndexSettings.FilterTagIdList)
        {
            if (token.IsCancellationRequested)
            {
                processHandler.Cancelled(processId);
                logger.LogInformation($"{logPrefix} cancelled Tag: `{tagId}`");
                return;
            }

            // Materialized once: the sequence is checked for emptiness and then iterated by the
            // parallel loop, and a deferred OrderBy would be re-evaluated each time.
            var changedDocumentElasticIds = GetDocumentIds(
                    prcSettings.DataSetName,
                    prcSettings.IndexSettings.FilterQuery,
                    new List<string> { tagId },
                    prcSettings.IndexSettings.IndexDate,
                    partialIndexDate)
                .OrderBy(o => o)
                .ToList();

            // No changed document found since the last index.
            if (changedDocumentElasticIds.Count == 0)
            {
                continue;
            }

            // Compute the base dictionary of the current document.
            var globalSubset = GlobalStore.ActivatedPrcs.Get(prcSettings.ServiceId).PrcSubsets[tagId];
            if (globalSubset.WordsWithOccurences == null)
            {
                continue;
            }

            // The unchanged documents are needed for the indexing as well. Materialized because
            // the list is walked once per changed document inside the parallel loop.
            var documentElasticIds = GetDocumentIds(
                    prcSettings.DataSetName,
                    prcSettings.IndexSettings.FilterQuery,
                    new List<string> { tagId },
                    dateEnd: partialIndexDate)
                .ToList();
            if (documentElasticIds.Count == 0)
            {
                continue;
            }

            var wwoDocuments = wordQuery.GetWordsWithOccurencesByDocuments(
                documentElasticIds,
                prcSettings.FieldsForRecommendation.Select(DocumentQuery.MapDocumentObjectName),
                1,
                parallelLimit: parallelService.ParallelLimit);
            var cleanedTextDocuments = wwoDocuments.ToDictionary(w => w.Key, w => GetCleanedText(w.Value));

            Parallel.ForEach(changedDocumentElasticIds, parallelService.ParallelOptions(parallelMultiplier), (documentId, loopState) =>
            {
                if (token.IsCancellationRequested)
                {
                    loopState.Stop();
                    return;
                }

                try
                {
                    logger.LogTrace($"{logPrefix} preparing Document: `{documentId}`/`{tagId}`");

                    var scorer = GetScorer(
                        globalSubset,
                        wwoDocuments[documentId],
                        cleanedTextDocuments[documentId],
                        GlobalStore.ActivatedPrcs.Get(prcSettings.ServiceId).PrcScorers[tagId]);
                    if (scorer == null)
                    {
                        return;
                    }

                    var similarDocuments = new List<KeyValuePair<string, double>>();

                    // Calculate the scores of all documents of the tag except the current one.
                    foreach (var siblingDocumentId in documentElasticIds)
                    {
                        if (siblingDocumentId == documentId)
                        {
                            continue;
                        }

                        var wwoSibling = wwoDocuments[siblingDocumentId];
                        if (!wwoSibling.Any(w => scorer.BaseDic.Keys.Contains(w.Key)))
                        {
                            continue;
                        }

                        var finalScore = GetPrcScore(scorer, cleanedTextDocuments[siblingDocumentId]);
                        if (finalScore > 0)
                        {
                            similarDocuments.Add(new KeyValuePair<string, double>(siblingDocumentId, finalScore));
                        }
                    }

                    var redisKey = new PrcIndexRedisKey(prcSettings.ServiceId, tagId, documentId);
                    var indexedDocumentIds = redisHandler.GetDocuments(redisKey);

                    // Entries whose id and score are both the same as in the freshly computed list.
                    var unchangedDocumentIds = indexedDocumentIds
                        .Where(idx => similarDocuments.Any(sim => sim.Key == idx.Element && sim.Value == idx.Score))
                        .Select(s => s.Element)
                        .ToList();

                    // Previously indexed or newly found counterparts whose entry is not unchanged.
                    var documentIdsToAdjust = indexedDocumentIds.Select(s => s.Element.ToString())
                        .Union(similarDocuments.Select(s => s.Key))
                        .Where(w => !unchangedDocumentIds.Contains(w))
                        .Distinct()
                        .ToList();

                    redisHandler.ReplaceDocuments(redisKey, similarDocuments);

                    // For these documents (as they are already at hand) the PRC score is
                    // calculated in the reverse direction too.
                    foreach (var reverseDocumentId in documentIdsToAdjust)
                    {
                        var reverseRedisKey = new PrcIndexRedisKey(prcSettings.ServiceId, tagId, reverseDocumentId);
                        redisHandler.RemoveDocument(reverseRedisKey, documentId);

                        // Ids read back from Redis may not be part of the currently loaded
                        // document set; without loaded words/text the reverse score cannot be
                        // computed, and indexing the dictionaries would throw and abort the run.
                        if (!cleanedTextDocuments.ContainsKey(reverseDocumentId))
                        {
                            continue;
                        }

                        // If there is an overlap, calculate the PRC score and insert it into Redis.
                        var reverseScorer = GetScorer(
                            globalSubset,
                            wwoDocuments[reverseDocumentId],
                            cleanedTextDocuments[reverseDocumentId],
                            GlobalStore.ActivatedPrcs.Get(prcSettings.ServiceId).PrcScorers[tagId]);
                        if (reverseScorer == null)
                        {
                            continue;
                        }

                        var wwoReverse = wwoDocuments[documentId];
                        if (wwoReverse.Keys.Intersect(reverseScorer.BaseDic.Keys).Any())
                        {
                            var finalScore = GetPrcScore(reverseScorer, cleanedTextDocuments[documentId]);
                            if (finalScore > 0)
                            {
                                redisHandler.AddDocument(reverseRedisKey, documentId, finalScore);
                            }
                        }

                        // Cut off the items above the maximum list size.
                        redisHandler.TrimDocuments(reverseRedisKey);
                    }
                }
                finally
                {
                    logger.LogTrace($"{logPrefix} prepared Document: `{documentId}`/`{tagId}`");
                }
            });

            if (token.IsCancellationRequested)
            {
                processHandler.Cancelled(processId);
                logger.LogInformation($"{logPrefix} cancelled Tag: `{tagId}`");
                return;
            }

            tagProgress.Step();
            processHandler.Changed(processId, tagProgress.Percent.Round(6));
            GC.Collect();
        }

        // Persist the new index date only after every tag has been processed.
        prcSettings.IndexSettings.IndexDate = partialIndexDate;
        serviceQuery.IndexSettings(prcSettings);

        logger.LogInformation($"{logPrefix} finished");
        processHandler.Finished(
            processId,
            string.Format(ServiceResources.SuccessfullyPartialIndexed_0_Service_1, ServiceTypeEnum.Prc, service.Name));
    }
    catch (Exception ex)
    {
        // Log the failure the same way the full Index does before reporting it to the process.
        logger.LogError($"{logPrefix} failed. {ex.Message} {ex.StackTrace}");
        processHandler.Interrupted(processId, ex);
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// Returns the documents recommended for <c>request.DocumentId</c> based on the scores read through
/// <c>redisHandler.GetTopNDocuments</c>, optionally restricted by <c>request.Query</c> and re-scored
/// with <c>request.Weights</c>.
/// </summary>
/// <param name="id">Service id used to build the <c>PrcIndexRedisKey</c>.</param>
/// <param name="prcSettings">Settings of the PRC service (data set name, fields for recommendation).</param>
/// <param name="request">Recommendation request (document id, optional tag id, query, weights, count, whether documents are needed).</param>
/// <returns>
/// At most <c>request.Count</c> results ordered by descending score. An empty list is returned when
/// no tag id is given and the source document or its tag cannot be resolved.
/// </returns>
public IEnumerable<PrcRecommendationResult> RecommendById(string id, PrcSettingsElastic prcSettings, PrcRecommendationByIdRequest request)
{
    var result = new List<PrcRecommendationResult>();
    var globalStoreDataSet = GlobalStore.DataSets.Get(prcSettings.DataSetName);
    var dataSet = globalStoreDataSet.DataSet;
    var documentQuery = queryFactory.GetDocumentQuery(dataSet.Name);
    var fieldsForRecommendation = prcSettings.FieldsForRecommendation;

    var hasWeights = request.Weights?.Any() == true;
    var filterOrWeight = !string.IsNullOrEmpty(request.Query) || hasWeights;

    // Use the tag id of the request when given, otherwise read it from the document itself.
    string tagId;
    if (string.IsNullOrEmpty(request.TagId))
    {
        var documentElastic = documentQuery.Get(request.DocumentId);
        if (documentElastic == null)
        {
            return result;
        }

        var tagToken = JTokenHelper.GetToken(documentElastic.DocumentObject).GetPathToken(dataSet.TagField);
        tagId = JTokenHelper.GetUnderlyingToken(tagToken)?.ToString();
        if (tagId == null)
        {
            return result;
        }
    }
    else
    {
        tagId = request.TagId;
    }

    // With a filter or weights every indexed entry is requested (-1), because the final
    // top list can only be cut after filtering/re-scoring.
    var similarDocIdsWithScore = redisHandler.GetTopNDocuments(
        new PrcIndexRedisKey(id, tagId, request.DocumentId),
        filterOrWeight ? -1 : request.Count - 1);
    Dictionary<string, double> resultDictionary = similarDocIdsWithScore;

    var documentElastics = (filterOrWeight || request.NeedDocumentInResult)
        ? GetDocuments(dataSet.Name, request.Query, null, fieldsForRecommendation, similarDocIdsWithScore.Keys, request.NeedDocumentInResult)
        : null;

    // If both the Filter and the Weights are empty, the TOP Count document ids are returned
    // with their scores (if the document is needed too, it is additionally fetched from Elastic).
    if (filterOrWeight)
    {
        // The PRC score of these documents is modified with the Weights.
        var docIdsWithScore = documentElastics.ToDictionary(k => k.Id, v => similarDocIdsWithScore[v.Id]);

        // Weighting
        if (hasWeights)
        {
            var weightsDic = request.Weights.ToDictionary(key => Guid.NewGuid().ToString(), value => value);
            var docIds = docIdsWithScore.Keys.ToList();
            var queries = weightsDic.ToDictionary(
                key => key.Key,
                value => documentQuery.PrefixQueryFields(value.Value.Query, globalStoreDataSet.DocumentFields));
            var ids = documentQuery.GetExistsForQueries(queries, docIds)
                .ToDictionary(k => k.Key, v => v.Value.ToDictionary(ke => ke, va => va));
            var allWeightsCount = request.Weights.Count;

            foreach (var docId in docIds)
            {
                // Sum the weights whose query matched the document, then raise (score + 1)
                // to the power 1 + weightsSum / allWeightsCount and subtract 1 again.
                var weightsSum = weightsDic.Where(w => ids[w.Key].ContainsKey(docId)).Sum(w => w.Value.Value);
                var pow = 1 + (weightsSum / allWeightsCount);
                var score = Math.Pow(docIdsWithScore[docId] + 1, pow) - 1;
                docIdsWithScore[docId] = score;
            }
        }

        resultDictionary = docIdsWithScore;
    }

    var recommendation = resultDictionary
        .OrderByDescending(o => o.Value)
        .Take(request.Count)
        .Select(s => new PrcRecommendationResult()
        {
            DocumentId = s.Key,
            Score = s.Value,
            // An indexed id may be absent from the fetched documents; in that case the result
            // carries a null Document instead of failing with a NullReferenceException.
            Document = request.NeedDocumentInResult
                ? documentElastics.SingleOrDefault(d => d.Id == s.Key)?.DocumentObject
                : null
        });

    return recommendation;
}
/// <summary>
/// Validates a PRC prepare request, saves the resulting service settings and starts the
/// prepare process in the background.
/// </summary>
/// <param name="id">Id of the service to prepare.</param>
/// <param name="prcPrepareSettings">Prepare settings from the request body (data set name, optional tag id list, compress settings).</param>
/// <returns>
/// 404 when the service does not exist; 400 when the service is not a Prc service, is not in
/// New status, the data set does not exist or some of the requested tag ids are missing;
/// otherwise 202 with the model of the created process.
/// </returns>
public IActionResult Prepare(string id, [FromBody] PrcPrepareSettings prcPrepareSettings)
{
    //SERVICE VALIDATION
    var service = serviceQuery.Get(id);
    if (service == null)
    {
        return new HttpStatusCodeWithErrorResult(StatusCodes.Status404NotFound, ServiceResources.InvalidIdNotExistingService);
    }
    if (service.Type != (int)ServiceTypeEnum.Prc)
    {
        return new HttpStatusCodeWithErrorResult(
            StatusCodes.Status400BadRequest,
            string.Format(ServiceResources.InvalidServiceTypeOnly_0_ServicesAreValidForThisRequest, "Prc"));
    }
    if (service.Status != (int)ServiceStatusEnum.New)
    {
        return new HttpStatusCodeWithErrorResult(
            StatusCodes.Status400BadRequest,
            ServiceResources.InvalidStatusOnlyTheServicesWithNewStatusCanBePrepared);
    }

    //DATASET VALIDATION
    if (!GlobalStore.DataSets.IsExist(prcPrepareSettings.DataSetName))
    {
        return new HttpStatusCodeWithErrorResult(
            StatusCodes.Status400BadRequest,
            string.Format(ServiceResources.DataSet_0_NotFound, prcPrepareSettings.DataSetName));
    }

    var globalStoreDataSet = GlobalStore.DataSets.Get(prcPrepareSettings.DataSetName);
    var dataSet = globalStoreDataSet.DataSet;

    //TAGS VALIDATION
    var tagQuery = queryFactory.GetTagQuery(dataSet.Name);
    List<TagElastic> tags;
    if (prcPrepareSettings.TagIdList?.Any() == true)
    {
        tags = tagQuery.Get(prcPrepareSettings.TagIdList).ToList();

        // Decide on the actual set difference rather than on the counts: a request that
        // repeats an existing tag id would otherwise be rejected with an empty "missing" list.
        var missingTagIds = prcPrepareSettings.TagIdList.Except(tags.Select(t => t.Id)).ToList();
        if (missingTagIds.Any())
        {
            return new HttpStatusCodeWithErrorResult(
                StatusCodes.Status400BadRequest,
                string.Format(ServiceResources.TheFollowingTagIdsNotExistInTheDataSet_0, string.Join(", ", missingTagIds)));
        }
    }
    else
    {
        // No explicit tag list: fall back to every leaf tag of the data set.
        tags = tagQuery.GetAll().Items.Where(i => i.IsLeaf).ToList();
    }

    //SAVE SETTINGS TO ELASTIC
    var serviceSettings = new PrcSettingsElastic
    {
        DataSetName = globalStoreDataSet.IndexName,
        ServiceId = service.Id,
        Tags = tags,
        CompressSettings = CompressHelper.ToCompressSettingsElastic(prcPrepareSettings.CompressSettings, prcPrepareSettings.CompressLevel)
    };
    serviceQuery.IndexSettings(serviceSettings);

    var process = processHandler.Create(
        ProcessTypeEnum.PrcPrepare,
        service.Id,
        prcPrepareSettings,
        string.Format(ServiceResources.Preparing_0_Service_1, ServiceTypeEnum.Prc, service.Name));

    // Register the process on the service before starting it.
    service.ProcessIdList.Add(process.Id);
    serviceQuery.Update(service.Id, service);

    processHandler.Start(process, (tokenSource) => prcHandler.Prepare(process.Id, serviceSettings, tokenSource.Token));

    return new HttpStatusCodeWithObjectResult(StatusCodes.Status202Accepted, process.ToProcessModel());
}