/// <summary>
/// Schedules similarity analysis for every dump created at or after <paramref name="timeFrom"/>.
/// If force==true, analysis is run again, even if it's already there.
/// If force==false, only relationships that have not been analyzed yet are run again.
/// </summary>
/// <param name="force">Passed through unchanged to ScheduleSimilarityAnalysis for each dump.</param>
/// <param name="timeFrom">Dumps whose Created timestamp is earlier than this are skipped.</param>
public void TriggerSimilarityAnalysisForAllDumps(bool force, DateTime timeFrom)
{
    Console.WriteLine($"Triggering similarity analysis for all dumps new thatn {timeFrom}. force={force}");

    // Start with the newest dump and work backwards in time.
    var newestFirst = dumpRepo.GetAll()
        .Where(candidate => candidate.Created >= timeFrom)
        .OrderByDescending(candidate => candidate.Created);

    foreach (var current in newestFirst)
    {
        // Each dump's own creation time is handed on as the time bound, so a dump is
        // only analyzed against newer ones. That way the newest dumps are compared
        // with each other first.
        ScheduleSimilarityAnalysis(current, force, current.Created);
    }
}
/// <summary>
/// Reads the stored relationships of every dump in the dump repository into the
/// in-memory relationship map, one task per dump.
/// Returns immediately without loading anything when similarity detection is
/// disabled in the settings.
/// </summary>
/// <remarks>
/// IsPopulated is set to true once loading has been attempted, even if it failed.
/// </remarks>
public async Task Populate()
{
    if (!settings.Value.SimilarityDetectionEnabled)
    {
        return;
    }

    await BlockIfBundleRepoNotReady("RelationshipRepository.Populate");
    await semaphoreSlim.WaitAsync().ConfigureAwait(false);

    var stopwatch = Stopwatch.StartNew();
    try
    {
        var loadTasks = dumpRepo.GetAll().Select(dump => Task.Run(async () =>
        {
            var dumpId = dump.Id;
            try
            {
                var loaded = await relationshipStorage.ReadRelationships(dumpId);
                relationShips[dumpId] = loaded;
            }
            catch (FileNotFoundException)
            {
                // No relationship file for this dump; nothing to load.
            }
            catch (Exception e)
            {
                // Any other failure is logged and Wipe is called for that dump.
                Console.WriteLine($"RelationshipRepository.Populate: Error reading relationship file for dump {dumpId}: " + e.Message);
                relationshipStorage.Wipe(dumpId);
            }
        }));

        await Task.WhenAll(loadTasks);
    }
    finally
    {
        IsPopulated = true;
        semaphoreSlim.Release();
    }

    stopwatch.Stop();
    Console.WriteLine($"Finished populating RelationshipRepository in {stopwatch.Elapsed}");
}
/// <summary>
/// Builds a view model for every dump in the repository, applies the simple filter
/// to them and returns the matches ordered by creation time, newest first.
/// </summary>
/// <param name="searchFilter">Filter text handed to SimpleFilter.</param>
/// <param name="includeSimilarities">
/// When true, the similarity service is passed to ToDumpViewModel; when false, null is passed instead.
/// </param>
/// <returns>The filtered view models, ordered by DumpInfo.Created descending.</returns>
public async Task<IOrderedEnumerable<DumpViewModel>> SearchBySimpleFilter(string searchFilter, bool includeSimilarities = true)
{
    var viewModelTasks = dumpRepo.GetAll().Select(
        dumpInfo => ToDumpViewModel(dumpInfo, dumpRepo, bundleRepo, includeSimilarities ? similarityService : null));

    var viewModels = await Task.WhenAll(viewModelTasks);

    return SimpleFilter(searchFilter, viewModels)
        .OrderByDescending(viewModel => viewModel.DumpInfo.Created);
}
/// <summary>
/// Reads the stored relationships of every dump in the dump repository, one dump
/// after another, into the in-memory relationship map.
/// </summary>
public async Task Populate()
{
    await semaphoreSlim.WaitAsync().ConfigureAwait(false);
    try
    {
        foreach (var dump in dumpRepo.GetAll())
        {
            var dumpId = dump.Id;
            try
            {
                var loaded = await relationshipStorage.ReadRelationships(dumpId);
                relationShips[dumpId] = loaded;
            }
            catch (FileNotFoundException)
            {
                // No relationship file for this dump; nothing to load.
            }
            catch (Exception e)
            {
                // Any other failure is logged and Wipe is called for that dump.
                Console.WriteLine("error reading relationship file: " + e);
                relationshipStorage.Wipe(dumpId);
            }
        }
    }
    finally
    {
        semaphoreSlim.Release();
    }
}
/// <summary>
/// Pushes the results of all dumps into Elasticsearch.
/// </summary>
/// <param name="clean">
/// When true, the index is deleted and recreated and every available result is pushed
/// in bulk, in batches of 100 dumps. When false, only dumps whose document id
/// ("bundleId/dumpId") is not already returned by GetAllDocumentIds are pushed, one at a time.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown when the elastic client has not been initialized, or when the bundle
/// repository returns null for its list of bundles.
/// </exception>
public async Task PushAllResultsAsync(bool clean)
{
    if (elasticClient == null)
    {
        throw new InvalidOperationException("ElasticSearch has not been initialized! Please verify that the settings specify a correct elastic search host.");
    }

    await BlockIfBundleRepoNotReady("ElasticSearchService.PushAllResultsAsync");

    if (clean)
    {
        DeleteIndex();
        CreateIndex();

        // since we are clean, we can do everything in one bulk
        var dumps = dumpRepo.GetAll().OrderByDescending(x => x.Created);
        foreach (var dumpsBatch in dumps.Batch(100))
        {
            var tasks = dumpsBatch.Select(x => Task.Run(async () => new
            {
                res = await dumpRepo.GetResult(x.Id),
                bundleInfo = bundleRepo.Get(x.BundleId),
                dumpInfo = x
            }));

            // Materialize once: the filtered batch is both counted and projected below.
            var results = (await Task.WhenAll(tasks)).Where(x => x.res != null).ToList();
            Console.WriteLine($"pushing {results.Count} results into elasticsearch");

            var sdResults = results
                .Select(x => ElasticSDResult.FromResultOrDefault(x.res, x.bundleInfo, x.dumpInfo, pathHelper))
                .Where(x => x != null);
            await PushBulk(sdResults);
        }
        return;
    }

    // Snapshot the existing document ids into a set. The membership test below runs once
    // per dump; doing it against a plain IEnumerable would rescan (and, for a lazy
    // sequence, re-evaluate) the whole id list every time.
    var existingDocumentIds = new HashSet<string>(GetAllDocumentIds());
    int nErrorsLogged = 0;

    var bundles = bundleRepo.GetAll();
    if (bundles == null)
    {
        throw new InvalidOperationException("Bundle repository must be populated before pushing data into ES.");
    }

    // In order to check if a dump has already been added, we go through them all and add one at the time
    // There is potential to optimize this and still do a bulk add.
    foreach (BundleMetainfo bundle in bundles)
    {
        var dumps = dumpRepo.Get(bundle.BundleId);
        if (dumps == null)
        {
            continue;
        }

        foreach (DumpMetainfo dump in dumps)
        {
            if (existingDocumentIds.Contains(bundle.BundleId + "/" + dump.DumpId))
            {
                continue;
            }

            SDResult result = await dumpRepo.GetResult(dump.Id);
            if (result == null)
            {
                continue;
            }

            bool success = await PushResultAsync(result, bundle, dump);
            if (!success && nErrorsLogged < 20)
            {
                // Cap the failure log at 20 lines.
                Console.WriteLine($"Failed to create document for {dump.BundleId}/{dump.DumpId}");
                nErrorsLogged++;
            }
        }
    }
}