public void Index(string index, AbstractViewGenerator viewGenerator, IndexingBatch batch, WorkContext context, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    Index value;
    if (indexes.TryGetValue(index, out value) == false)
    {
        log.Debug("Tried to index on a non-existent index {0}, ignoring", index);
        return;
    }
    using (EnsureInvariantCulture())
    using (DocumentCacher.SkipSettingDocumentsInDocumentCache())
    {
        value.IndexDocuments(viewGenerator, batch, context, actions, minimumTimestamp);
        context.RaiseIndexChangeNotification(new IndexChangeNotification
        {
            Name = index,
            Type = IndexChangeTypes.MapCompleted
        });
    }
}
private void IndexDocuments(IStorageActionsAccessor actions, string index, IndexingBatch batch)
{
    var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(index);
    if (viewGenerator == null)
        return; // index was deleted, probably
    try
    {
        if (log.IsDebugEnabled)
        {
            string ids;
            if (batch.Ids.Count < 256)
            {
                ids = string.Join(",", batch.Ids);
            }
            else
            {
                ids = string.Join(", ", batch.Ids.Take(128)) + " ... " + string.Join(", ", batch.Ids.Skip(batch.Ids.Count - 128));
            }
            log.Debug("Indexing {0} documents for index: {1}. ({2})", batch.Docs.Count, index, ids);
        }
        context.CancellationToken.ThrowIfCancellationRequested();
        context.IndexStorage.Index(index, viewGenerator, batch.Docs, context, actions, batch.DateTime ?? DateTime.MinValue);
    }
    catch (OperationCanceledException)
    {
        throw;
    }
    catch (Exception e)
    {
        if (actions.IsWriteConflict(e))
            return;
        log.WarnException(string.Format("Failed to index documents for index: {0}", index), e);
    }
}
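// The debug logging above keeps huge batches readable by truncating the ID list:
// short lists are printed whole, longer ones keep only the first and the last 128
// entries. Isolated, the same truncation looks like this (a minimal sketch;
// FormatIdsForLog is an illustrative name, not part of the codebase; requires
// System.Collections.Generic and System.Linq):
private static string FormatIdsForLog(IReadOnlyList<string> ids)
{
    if (ids.Count < 256)
        return string.Join(",", ids);
    // keep the head and tail of the list, elide the middle
    return string.Join(", ", ids.Take(128)) + " ... " + string.Join(", ", ids.Skip(ids.Count - 128));
}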
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    var changed = new HashSet<ReduceKeyAndBucket>();
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        sourceCount++;
        var documentId = doc.__document_id;
        actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, name, changed);
        return doc;
    })
    .Where(x => x is FilteredDocument == false);
    var items = new List<MapResultItem>();
    var stats = new IndexingWorkStats();
    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    using (CurrentIndexingScope.Current = new CurrentIndexingScope(LoadDocument, allReferencedDocs.Enqueue))
    {
        var mapResults = RobustEnumerationIndex(documentsWrapped.GetEnumerator(), viewGenerator.MapDefinitions, actions, stats).ToList();
        actions.MapReduce.UpdateRemovedMapReduceStats(name, changed);
        foreach (var mappedResultFromDocument in mapResults.GroupBy(GetDocumentId))
        {
            var dynamicResults = mappedResultFromDocument.Select(x => (object)new DynamicJsonObject(RavenJObject.FromObject(x, jsonSerializer))).ToList();
            foreach (var doc in RobustEnumerationReduceDuringMapPhase(dynamicResults.GetEnumerator(), viewGenerator.ReduceDefinition, actions, context))
            {
                count++;
                var reduceValue = viewGenerator.GroupByExtraction(doc);
                if (reduceValue == null)
                {
                    logIndexing.Debug("Field {0} is used as the reduce key and cannot be null, skipping document {1}",
                        viewGenerator.GroupByExtraction, mappedResultFromDocument.Key);
                    continue;
                }
                var reduceKey = ReduceKeyToString(reduceValue);
                var docId = mappedResultFromDocument.Key.ToString();
                var data = GetMappedData(doc);
                items.Add(new MapResultItem
                {
                    Data = data,
                    DocId = docId,
                    ReduceKey = reduceKey
                });
                changed.Add(new ReduceKeyAndBucket(IndexingUtil.MapBucket(docId), reduceKey));
            }
        }
    }
    IDictionary<string, HashSet<string>> result;
    while (allReferencedDocs.TryDequeue(out result))
    {
        foreach (var referencedDocument in result)
        {
            actions.Indexing.UpdateDocumentReferences(name, referencedDocument.Key, referencedDocument.Value);
            actions.General.MaybePulseTransaction();
        }
    }
    foreach (var mapResultItem in items)
    {
        actions.MapReduce.PutMappedResult(name, mapResultItem.DocId, mapResultItem.ReduceKey, mapResultItem.Data);
        actions.General.MaybePulseTransaction();
    }
    UpdateIndexingStats(context, stats);
    actions.MapReduce.ScheduleReductions(name, 0, changed);
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Operation = "Map",
        Duration = sw.Elapsed,
        Started = start
    });
    logIndexing.Debug("Mapped {0} documents for {1}", count, name);
}
private IEnumerable<Tuple<IndexToWorkOn, IndexingBatch>> FilterIndexes(IList<IndexToWorkOn> indexesToWorkOn, JsonDocument[] jsonDocs)
{
    var last = jsonDocs.Last();
    Debug.Assert(last.Etag != null);
    Debug.Assert(last.LastModified != null);
    var lastEtag = last.Etag.Value;
    var lastModified = last.LastModified.Value;
    var lastIndexedEtag = new ComparableByteArray(lastEtag.ToByteArray());
    var documentRetriever = new DocumentRetriever(null, context.ReadTriggers);
    var filteredDocs = BackgroundTaskExecuter.Instance.Apply(jsonDocs, doc =>
    {
        doc = documentRetriever.ExecuteReadTriggers(doc, null, ReadOperation.Index);
        return doc == null ? null : new { Doc = doc, Json = JsonToExpando.Convert(doc.ToJson()) };
    });
    log.Debug("After read triggers executed, {0} documents remained", filteredDocs.Count);
    var results = new Tuple<IndexToWorkOn, IndexingBatch>[indexesToWorkOn.Count];
    var actions = new Action<IStorageActionsAccessor>[indexesToWorkOn.Count];
    BackgroundTaskExecuter.Instance.ExecuteAll(context.Configuration, scheduler, indexesToWorkOn, (indexToWorkOn, i) =>
    {
        var indexLastIndexedEtag = new ComparableByteArray(indexToWorkOn.LastIndexedEtag.ToByteArray());
        if (indexLastIndexedEtag.CompareTo(lastIndexedEtag) >= 0)
            return;
        var indexName = indexToWorkOn.IndexName;
        var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(indexName);
        if (viewGenerator == null)
            return; // probably deleted
        var batch = new IndexingBatch();
        foreach (var item in filteredDocs)
        {
            // did we already index this document in this index?
            if (indexLastIndexedEtag.CompareTo(new ComparableByteArray(item.Doc.Etag.Value.ToByteArray())) >= 0)
                continue;
            // is the Raven-Entity-Name a match for the things the index executes on?
            if (viewGenerator.ForEntityNames.Count != 0 &&
                viewGenerator.ForEntityNames.Contains(item.Doc.Metadata.Value<string>(Constants.RavenEntityName)) == false)
            {
                continue;
            }
            batch.Add(item.Doc, item.Json);
            if (batch.DateTime == null)
                batch.DateTime = item.Doc.LastModified;
            else
                batch.DateTime = batch.DateTime > item.Doc.LastModified ? item.Doc.LastModified : batch.DateTime;
        }
        if (batch.Docs.Count == 0)
        {
            log.Debug("All documents have been filtered for {0}, no indexing will be performed, updating to {1}, {2}", indexName, lastEtag, lastModified);
            // we use it this way to batch all the updates together
            actions[i] = accessor => accessor.Indexing.UpdateLastIndexed(indexName, lastEtag, lastModified);
            return;
        }
        log.Debug("Going to index {0} documents in {1}", batch.Ids.Count, indexToWorkOn);
        results[i] = Tuple.Create(indexToWorkOn, batch);
    });
    transactionalStorage.Batch(actionsAccessor =>
    {
        foreach (var action in actions)
        {
            if (action != null)
                action(actionsAccessor);
        }
    });
    return results.Where(x => x != null);
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, WorkContext context, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    var changed = new HashSet<ReduceKeyAndBucket>();
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        sourceCount++;
        var documentId = doc.__document_id;
        actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, name, changed);
        return doc;
    })
    .Where(x => x is FilteredDocument == false);
    var items = new List<MapResultItem>();
    var stats = new IndexingWorkStats();
    foreach (var mappedResultFromDocument in GroupByDocumentId(context, RobustEnumerationIndex(documentsWrapped.GetEnumerator(), viewGenerator.MapDefinitions, actions, stats)))
    {
        var dynamicResults = mappedResultFromDocument.Select(x => (object)new DynamicJsonObject(RavenJObject.FromObject(x, jsonSerializer))).ToList();
        foreach (var doc in RobustEnumerationReduceDuringMapPhase(dynamicResults.GetEnumerator(), viewGenerator.ReduceDefinition, actions, context))
        {
            count++;
            var reduceValue = viewGenerator.GroupByExtraction(doc);
            if (reduceValue == null)
            {
                logIndexing.Debug("Field {0} is used as the reduce key and cannot be null, skipping document {1}",
                    viewGenerator.GroupByExtraction, mappedResultFromDocument.Key);
                continue;
            }
            var reduceKey = ReduceKeyToString(reduceValue);
            var docId = mappedResultFromDocument.Key.ToString();
            var data = GetMappedData(doc);
            logIndexing.Debug("Mapped result for index '{0}' doc '{1}': '{2}'", name, docId, data);
            items.Add(new MapResultItem
            {
                Data = data,
                DocId = docId,
                ReduceKey = reduceKey
            });
            changed.Add(new ReduceKeyAndBucket(IndexingUtil.MapBucket(docId), reduceKey));
        }
    }
    int mapCount = 0;
    foreach (var mapResultItem in items)
    {
        actions.MapReduce.PutMappedResult(name, mapResultItem.DocId, mapResultItem.ReduceKey, mapResultItem.Data);
        if (mapCount++ % 50000 == 0)
        {
            // The reason this is here is to protect us from Version Store Out Of Memory error during indexing
            // this can happen if we have indexes that output a VERY large number of items per doc.
            actions.General.PulseTransaction();
        }
    }
    UpdateIndexingStats(context, stats);
    actions.MapReduce.ScheduleReductions(name, 0, changed);
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Operation = "Map",
        Duration = sw.Elapsed,
        Started = start
    });
    logIndexing.Debug("Mapped {0} documents for {1}", count, name);
}
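// The PulseTransaction call above guards against the Version Store out-of-memory
// error when an index emits a very large number of items per document: instead of
// holding one write transaction open for the whole batch, the transaction is
// pulsed every 50,000 mapped results. The bare pattern (a sketch under assumed
// delegates; writeItem and pulseTransaction stand in for PutMappedResult and
// actions.General.PulseTransaction):
private static void WriteWithPulse<T>(IEnumerable<T> items, Action<T> writeItem, Action pulseTransaction)
{
    int written = 0;
    foreach (var item in items)
    {
        writeItem(item);
        // post-increment modulo: like the loop above, this also pulses right
        // after the very first write
        if (written++ % 50000 == 0)
            pulseTransaction();
    }
}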
public IndexingPerformanceStats Index(int index, AbstractViewGenerator viewGenerator, IndexingBatch batch, WorkContext context, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
{
    Index value;
    if (indexes.TryGetValue(index, out value) == false)
    {
        log.Debug("Tried to index on a non-existent index {0}, ignoring", index);
        return null;
    }
    using (EnsureInvariantCulture())
    using (DocumentCacher.SkipSettingDocumentsInDocumentCache())
    {
        var performance = value.IndexDocuments(viewGenerator, batch, actions, minimumTimestamp, token);
        context.RaiseIndexChangeNotification(new IndexChangeNotification
        {
            Name = value.PublicName,
            Type = IndexChangeTypes.MapCompleted
        });
        return performance;
    }
}
private IEnumerable<IndexingBatchForIndex> FilterIndexes(IList<IndexToWorkOn> indexesToWorkOn, List<JsonDocument> jsonDocs, Etag highestETagInBatch)
{
    var last = jsonDocs.Last();
    Debug.Assert(last.Etag != null);
    Debug.Assert(last.LastModified != null);
    var lastEtag = last.Etag;
    var lastModified = last.LastModified.Value;
    var documentRetriever = new DocumentRetriever(null, context.ReadTriggers, context.Database.InFlightTransactionalState);
    var filteredDocs = BackgroundTaskExecuter.Instance.Apply(context, jsonDocs, doc =>
    {
        var filteredDoc = documentRetriever.ExecuteReadTriggers(doc, null, ReadOperation.Index);
        return filteredDoc == null
            ? new { Doc = doc, Json = (object)new FilteredDocument(doc) }
            : new { Doc = filteredDoc, Json = JsonToExpando.Convert(doc.ToJson()) };
    });
    Log.Debug("After read triggers executed, {0} documents remained", filteredDocs.Count);
    var results = new IndexingBatchForIndex[indexesToWorkOn.Count];
    var actions = new Action<IStorageActionsAccessor>[indexesToWorkOn.Count];
    BackgroundTaskExecuter.Instance.ExecuteAll(context, indexesToWorkOn, (indexToWorkOn, i) =>
    {
        var indexName = indexToWorkOn.IndexName;
        var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(indexName);
        if (viewGenerator == null)
            return; // probably deleted
        var batch = new IndexingBatch(highestETagInBatch);
        foreach (var item in filteredDocs)
        {
            if (prefetchingBehavior.FilterDocuments(item.Doc) == false)
                continue;
            // did we already index this document in this index?
            var etag = item.Doc.Etag;
            if (etag == null)
                continue;
            // is the Raven-Entity-Name a match for the things the index executes on?
            if (viewGenerator.ForEntityNames.Count != 0 &&
                viewGenerator.ForEntityNames.Contains(item.Doc.Metadata.Value<string>(Constants.RavenEntityName)) == false)
            {
                continue;
            }
            batch.Add(item.Doc, item.Json, prefetchingBehavior.ShouldSkipDeleteFromIndex(item.Doc));
            if (batch.DateTime == null)
                batch.DateTime = item.Doc.LastModified;
            else
                batch.DateTime = batch.DateTime > item.Doc.LastModified ? item.Doc.LastModified : batch.DateTime;
        }
        if (batch.Docs.Count == 0)
        {
            Log.Debug("All documents have been filtered for {0}, no indexing will be performed, updating to {1}, {2}", indexName, lastEtag, lastModified);
            // we use it this way to batch all the updates together
            actions[i] = accessor => accessor.Indexing.UpdateLastIndexed(indexName, lastEtag, lastModified);
            return;
        }
        if (Log.IsDebugEnabled)
        {
            Log.Debug("Going to index {0} documents in {1}: ({2})", batch.Ids.Count, indexToWorkOn, string.Join(", ", batch.Ids));
        }
        results[i] = new IndexingBatchForIndex
        {
            Batch = batch,
            IndexName = indexToWorkOn.IndexName,
            Index = indexToWorkOn.Index,
            LastIndexedEtag = indexToWorkOn.LastIndexedEtag
        };
    });
    transactionalStorage.Batch(actionsAccessor =>
    {
        foreach (var action in actions)
        {
            if (action != null)
                action(actionsAccessor);
        }
    });
    return results.Where(x => x != null);
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    var deleted = new Dictionary<ReduceKeyAndBucket, int>();
    var indexPerfStats = RecordCurrentBatch("Current Map", batch.Docs.Count);
    batch.SetIndexingPerformance(indexPerfStats);
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        sourceCount++;
        var documentId = doc.__document_id;
        actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
        return doc;
    })
    .Where(x => x is FilteredDocument == false)
    .ToList();
    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
    var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();
    int loadDocumentCount = 0;
    long loadDocumentDuration = 0;
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
    {
        var localStats = new IndexingWorkStats();
        var localChanges = new HashSet<ReduceKeyAndBucket>();
        var statsPerKey = new Dictionary<string, int>();
        allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));
        using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
        {
            // we are writing to the transactional store from multiple threads here, and in a streaming fashion
            // should result in less memory and better perf
            context.TransactionalStorage.Batch(accessor =>
            {
                var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats);
                var currentDocumentResults = new List<object>();
                string currentKey = null;
                bool skipDocument = false;
                foreach (var currentDoc in mapResults)
                {
                    var documentId = GetDocumentId(currentDoc);
                    if (documentId != currentKey)
                    {
                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey);
                        currentDocumentResults.Clear();
                        currentKey = documentId;
                    }
                    else if (skipDocument)
                    {
                        continue;
                    }
                    currentDocumentResults.Add(new DynamicJsonObject(RavenJObject.FromObject(currentDoc, jsonSerializer)));
                    if (EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count) == false)
                    {
                        skipDocument = true;
                        currentDocumentResults.Clear();
                        continue;
                    }
                    Interlocked.Increment(ref localStats.IndexingSuccesses);
                }
                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey);
            });
            allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
            allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
            Interlocked.Add(ref loadDocumentCount, CurrentIndexingScope.Current.LoadDocumentCount);
            Interlocked.Add(ref loadDocumentDuration, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds);
        }
    });
    UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
    var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
        .Distinct()
        .ToList();
    var stats = new IndexingWorkStats(allState.Select(x => x.Item2));
    var reduceKeyStats = allState.SelectMany(x => x.Item3)
        .GroupBy(x => x.Key)
        .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
        .ToList();
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            var reduceKeyStat = enumerator.Current;
            accessor.MapReduce.IncrementReduceKeyCounter(indexId, reduceKeyStat.Key, reduceKeyStat.Count);
        }
    }));
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, changed, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
        }
    }));
    UpdateIndexingStats(context, stats);
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        ItemsCount = sourceCount,
        InputCount = documentsWrapped.Count,
        Operation = "Map",
        Duration = sw.Elapsed,
        Started = start,
        LoadDocumentCount = loadDocumentCount,
        LoadDocumentDurationMs = loadDocumentDuration
    });
    BatchCompleted("Current Map");
    logIndexing.Debug("Mapped {0} documents for {1}", count, indexId);
}
private IEnumerable<IndexingBatchForIndex> FilterIndexes(IList<IndexToWorkOn> indexesToWorkOn, List<JsonDocument> jsonDocs, Etag highestETagInBatch, out List<IndexToWorkOn> filteredOutIndexes)
{
    var innerFilteredOutIndexes = new ConcurrentStack<IndexToWorkOn>();
    var last = jsonDocs.Last();
    Debug.Assert(last.Etag != null);
    Debug.Assert(last.LastModified != null);
    var lastEtag = last.Etag;
    var lastModified = last.LastModified.Value;
    var documentRetriever = new DocumentRetriever(null, null, context.ReadTriggers);
    var filteredDocs = BackgroundTaskExecuter.Instance.Apply(context, jsonDocs, doc =>
    {
        var filteredDoc = documentRetriever.ExecuteReadTriggers(doc, ReadOperation.Index);
        return filteredDoc == null
            ? new { Doc = doc, Json = (object)new FilteredDocument(doc) }
            : new { Doc = filteredDoc, Json = JsonToExpando.Convert(doc.ToJson()) };
    });
    if (Log.IsDebugEnabled)
        Log.Debug("After read triggers executed, {0} documents remained", filteredDocs.Count);
    var results = new ConcurrentQueue<IndexingBatchForIndex>();
    var actions = new ConcurrentQueue<Action<IStorageActionsAccessor>>();
    context.Database.MappingThreadPool.ExecuteBatch(indexesToWorkOn, indexToWorkOn =>
    {
        var indexName = indexToWorkOn.Index.PublicName;
        var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(indexName);
        if (viewGenerator == null)
            return; // probably deleted
        var batch = new IndexingBatch(highestETagInBatch);
        foreach (var filteredDoc in filteredDocs)
        {
            var doc = filteredDoc.Doc;
            var json = filteredDoc.Json;
            if (defaultPrefetchingBehavior.FilterDocuments(doc) == false || doc.Etag.CompareTo(indexToWorkOn.LastIndexedEtag) <= 0)
                continue;
            // did we already index this document in this index?
            var etag = doc.Etag;
            if (etag == null)
                continue;
            // is the Raven-Entity-Name a match for the things the index executes on?
            if (viewGenerator.ForEntityNames.Count != 0 &&
                viewGenerator.ForEntityNames.Contains(doc.Metadata.Value<string>(Constants.RavenEntityName)) == false)
            {
                continue;
            }
            batch.Add(doc, json, defaultPrefetchingBehavior.ShouldSkipDeleteFromIndex(doc));
            if (batch.DateTime == null)
                batch.DateTime = doc.LastModified;
            else
                batch.DateTime = batch.DateTime > doc.LastModified ? doc.LastModified : batch.DateTime;
        }
        if (batch.Docs.Count == 0)
        {
            if (Log.IsDebugEnabled)
                Log.Debug("All documents have been filtered for {0}, no indexing will be performed, updating to {1}, {2}", indexName, lastEtag, lastModified);
            // we use it this way to batch all the updates together
            if (indexToWorkOn.LastIndexedEtag.CompareTo(lastEtag) < 0)
                actions.Enqueue(accessor =>
                {
                    accessor.Indexing.UpdateLastIndexed(indexToWorkOn.Index.indexId, lastEtag, lastModified);
                    accessor.AfterStorageCommit += () =>
                    {
                        indexToWorkOn.Index.EnsureIndexWriter();
                        indexToWorkOn.Index.Flush(lastEtag);
                    };
                });
            innerFilteredOutIndexes.Push(indexToWorkOn);
            context.MarkIndexFilteredOut(indexName);
            return;
        }
        if (Log.IsDebugEnabled)
            Log.Debug("Going to index {0} documents in {1}: ({2})", batch.Ids.Count, indexToWorkOn, string.Join(", ", batch.Ids));
        results.Enqueue(new IndexingBatchForIndex
        {
            Batch = batch,
            IndexId = indexToWorkOn.IndexId,
            Index = indexToWorkOn.Index,
            LastIndexedEtag = indexToWorkOn.LastIndexedEtag
        });
    }, description: string.Format("Filtering documents for {0} indexes", indexesToWorkOn.Count));
    filteredOutIndexes = innerFilteredOutIndexes.ToList();
    foreach (var action in actions)
    {
        bool keepTrying = true;
        for (int i = 0; i < 10 && keepTrying; i++)
        {
            keepTrying = false;
            transactionalStorage.Batch(actionsAccessor =>
            {
                if (action != null)
                {
                    try
                    {
                        action(actionsAccessor);
                    }
                    catch (Exception e)
                    {
                        if (actionsAccessor.IsWriteConflict(e))
                        {
                            keepTrying = true;
                            return;
                        }
                        throw;
                    }
                }
            });
            if (keepTrying)
                Thread.Sleep(11);
        }
    }
    return results.Where(x => x != null);
}
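// The tail of the method above applies each deferred storage action with a small
// retry loop: up to 10 attempts, sleeping 11 ms after a write conflict, and giving
// up silently once the attempts are exhausted. The same shape as a standalone
// helper (a sketch; the conflict test is injected because IsWriteConflict lives on
// the storage accessor, and a C# 6 exception filter replaces the keepTrying flag):
private static void RetryOnWriteConflict(Action attempt, Func<Exception, bool> isWriteConflict, int maxAttempts = 10, int delayMs = 11)
{
    for (int i = 0; i < maxAttempts; i++)
    {
        try
        {
            attempt();
            return;
        }
        catch (Exception e) when (isWriteConflict(e))
        {
            Thread.Sleep(delayMs); // conflicting writer; back off briefly and retry
        }
    }
    // like the loop above, exhausting the attempts drops the action silently
}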
public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
{
    token.ThrowIfCancellationRequested();
    var count = 0;
    var sourceCount = 0;
    var writeToIndexStats = new List<PerformanceStats>();
    IndexingPerformanceStats performance = null;
    var performanceStats = new List<BasePerformanceStats>();
    var storageCommitDuration = new Stopwatch();
    actions.BeforeStorageCommit += storageCommitDuration.Start;
    actions.AfterStorageCommit += () =>
    {
        storageCommitDuration.Stop();
        performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
    };
    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
            .Where(x => x != null)
            .ToList();
        try
        {
            performance = RecordCurrentBatch("Current", "Index", batch.Docs.Count);
            var deleteExistingDocumentsDuration = new Stopwatch();
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                token.ThrowIfCancellationRequested();
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                    throw new ArgumentException(string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                    return doc;
                InvokeOnIndexEntryDeletedOnAllBatchers(batchers, docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                if (batch.SkipDeleteFromIndex[i] == false || context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                {
                    using (StopwatchScope.For(deleteExistingDocumentsDuration))
                    {
                        indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                    }
                }
                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();
            performanceStats.Add(new PerformanceStats
            {
                Name = IndexingOperation.Lucene_DeleteExistingDocument,
                DurationMs = deleteExistingDocumentsDuration.ElapsedMilliseconds
            });
            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
            var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();
            var parallelProcessingStart = SystemTime.UtcNow;
            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
            {
                token.ThrowIfCancellationRequested();
                var parallelStats = new ParallelBatchStats
                {
                    StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
                };
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(context.Database, indexDefinition, viewGenerator, logIndexing);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
                {
                    string currentDocId = null;
                    int outputPerDocId = 0;
                    Action<Exception, object> onErrorFunc;
                    bool skipDocument = false;
                    var linqExecutionDuration = new Stopwatch();
                    var addDocumentDuration = new Stopwatch();
                    var convertToLuceneDocumentDuration = new Stopwatch();
                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats, out onErrorFunc, linqExecutionDuration))
                    {
                        token.ThrowIfCancellationRequested();
                        float boost;
                        IndexingResult indexingResult;
                        using (StopwatchScope.For(convertToLuceneDocumentDuration))
                        {
                            try
                            {
                                indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                            }
                            catch (Exception e)
                            {
                                onErrorFunc(e, doc);
                                continue;
                            }
                        }
                        // ReSharper disable once RedundantBoolCompare --> code clarity
                        if (indexingResult.NewDocId == null || indexingResult.ShouldSkip != false)
                        {
                            continue;
                        }
                        if (currentDocId != indexingResult.NewDocId)
                        {
                            currentDocId = indexingResult.NewDocId;
                            outputPerDocId = 0;
                            skipDocument = false;
                        }
                        if (skipDocument)
                            continue;
                        outputPerDocId++;
                        if (EnsureValidNumberOfOutputsForDocument(currentDocId, outputPerDocId) == false)
                        {
                            skipDocument = true;
                            continue;
                        }
                        Interlocked.Increment(ref count);
                        using (StopwatchScope.For(convertToLuceneDocumentDuration))
                        {
                            luceneDoc.GetFields().Clear();
                            luceneDoc.Boost = boost;
                            documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                            luceneDoc.Add(documentIdField);
                            foreach (var field in indexingResult.Fields)
                            {
                                luceneDoc.Add(field);
                            }
                        }
                        batchers.ApplyAndIgnoreAllErrors(
                            exception =>
                            {
                                logIndexing.WarnException(
                                    string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'", PublicName, indexingResult.NewDocId),
                                    exception);
                                context.AddError(indexId, PublicName, indexingResult.NewDocId, exception, "OnIndexEntryCreated Trigger");
                            },
                            trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));
                        LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                        using (StopwatchScope.For(addDocumentDuration))
                        {
                            AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        }
                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }
                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Lucene_ConvertToLuceneDocument, convertToLuceneDocumentDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Lucene_AddDocument, addDocumentDuration.ElapsedMilliseconds));
                    parallelOperations.Enqueue(parallelStats);
                }
            });
            performanceStats.Add(new ParallelPerformanceStats
            {
                NumberOfThreads = parallelOperations.Count,
                DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
                BatchedOperations = parallelOperations.ToList()
            });
            var updateDocumentReferencesDuration = new Stopwatch();
            using (StopwatchScope.For(updateDocumentReferencesDuration))
            {
                UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
            }
            performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error in " + PublicName, ex);
                    context.AddError(indexId, PublicName, null, ex, "AnErrorOccured Trigger");
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger in " + PublicName, e);
                    context.AddError(indexId, PublicName, null, e, "Dispose Trigger");
                },
                x => x.Dispose());
        }
        return new IndexedItemsInfo(batch.HighestEtagBeforeFiltering)
        {
            ChangedDocs = sourceCount
        };
    }, writeToIndexStats);
    performanceStats.AddRange(writeToIndexStats);
    performance.OnCompleted = () => BatchCompleted("Current", "Index", sourceCount, count, performanceStats);
    logIndexing.Debug("Indexed {0} documents for {1}", count, PublicName);
    return performance;
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    var deleted = new Dictionary<ReduceKeyAndBucket, int>();
    RecordCurrentBatch("Current Map", batch.Docs.Count);
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        sourceCount++;
        var documentId = doc.__document_id;
        actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
        return doc;
    })
    .Where(x => x is FilteredDocument == false)
    .ToList();
    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
    var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
    {
        var localStats = new IndexingWorkStats();
        var localChanges = new HashSet<ReduceKeyAndBucket>();
        var statsPerKey = new Dictionary<string, int>();
        allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));
        using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
        {
            // we are writing to the transactional store from multiple threads here, and in a streaming fashion
            // should result in less memory and better perf
            context.TransactionalStorage.Batch(accessor =>
            {
                var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats);
                var currentDocumentResults = new List<object>();
                string currentKey = null;
                foreach (var currentDoc in mapResults)
                {
                    var documentId = GetDocumentId(currentDoc);
                    if (documentId != currentKey)
                    {
                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey);
                        currentDocumentResults.Clear();
                        currentKey = documentId;
                    }
                    currentDocumentResults.Add(new DynamicJsonObject(RavenJObject.FromObject(currentDoc, jsonSerializer)));
                    EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count);
                    Interlocked.Increment(ref localStats.IndexingSuccesses);
                }
                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey);
            });
            allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
            allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
        }
    });
    UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
    var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
        .Distinct()
        .ToList();
    var stats = new IndexingWorkStats(allState.Select(x => x.Item2));
    var reduceKeyStats = allState.SelectMany(x => x.Item3)
        .GroupBy(x => x.Key)
        .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
        .ToList();
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            var reduceKeyStat = enumerator.Current;
            accessor.MapReduce.IncrementReduceKeyCounter(indexId, reduceKeyStat.Key, reduceKeyStat.Count);
        }
    }));
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, changed, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
        }
    }));
    UpdateIndexingStats(context, stats);
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        ItemsCount = sourceCount,
        InputCount = documentsWrapped.Count,
        Operation = "Map",
        Duration = sw.Elapsed,
        Started = start
    });
    BatchCompleted("Current Map");
    logIndexing.Debug("Mapped {0} documents for {1}", count, indexId);
}
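// The map methods above all share one streaming group-by: map results arrive
// ordered by source document, so the per-document result batch is flushed whenever
// the document id changes, plus once more after the loop for the trailing
// document. Stripped of the storage and stats bookkeeping, the pattern is
// (a sketch; the names are illustrative, not from the codebase):
private static int DrainByDocumentId<T>(IEnumerable<T> mapResults, Func<T, string> getDocumentId, Action<string, List<T>> processBatch)
{
    var currentDocumentResults = new List<T>();
    string currentKey = null;
    var flushes = 0;
    foreach (var item in mapResults)
    {
        var documentId = getDocumentId(item);
        if (documentId != currentKey)
        {
            // the key changed: flush everything gathered for the previous document
            if (currentDocumentResults.Count > 0)
            {
                processBatch(currentKey, currentDocumentResults);
                flushes++;
            }
            currentDocumentResults.Clear();
            currentKey = documentId;
        }
        currentDocumentResults.Add(item);
    }
    // flush the trailing document, exactly like the extra ProcessBatch call above
    if (currentDocumentResults.Count > 0)
    {
        processBatch(currentKey, currentDocumentResults);
        flushes++;
    }
    return flushes;
}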
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, WorkContext context, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    Write(context, (indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(name))
            .Where(x => x != null)
            .ToList();
        try
        {
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                    throw new ArgumentException(string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                    return doc;
                batchers.ApplyAndIgnoreAllErrors(
                    exception =>
                    {
                        logIndexing.WarnException(
                            string.Format("Error when executed OnIndexEntryDeleted trigger for index '{0}', key: '{1}'", name, documentId),
                            exception);
                        context.AddError(name, documentId, exception.Message);
                    },
                    trigger => trigger.OnIndexEntryDeleted(documentId));
                if (batch.SkipDeleteFromIndex[i] == false)
                    indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();
            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
            {
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(indexDefinition);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, actions, stats))
                {
                    float boost;
                    var indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                    if (indexingResult.NewDocId != null && indexingResult.ShouldSkip == false)
                    {
                        Interlocked.Increment(ref count);
                        luceneDoc.GetFields().Clear();
                        luceneDoc.Boost = boost;
                        documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                        luceneDoc.Add(documentIdField);
                        foreach (var field in indexingResult.Fields)
                        {
                            luceneDoc.Add(field);
                        }
                        batchers.ApplyAndIgnoreAllErrors(
                            exception =>
                            {
                                logIndexing.WarnException(
                                    string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'", name, indexingResult.NewDocId),
                                    exception);
                                context.AddError(name, indexingResult.NewDocId, exception.Message);
                            },
                            trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));
                        LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                        AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                    }
                    Interlocked.Increment(ref stats.IndexingSuccesses);
                }
            });
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error", ex);
                    context.AddError(name, null, ex.Message);
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger", e);
                    context.AddError(name, null, e.Message);
                },
                x => x.Dispose());
        }
        return sourceCount;
    });
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start
    });
    logIndexing.Debug("Indexed {0} documents for {1}", count, name);
}
private IEnumerable<Tuple<IndexToWorkOn, IndexingBatch>> FilterIndexes(IList<IndexToWorkOn> indexesToWorkOn, JsonDocument[] jsonDocs)
{
    var last = jsonDocs.Last();
    Debug.Assert(last.Etag != null);
    Debug.Assert(last.LastModified != null);
    var lastEtag = last.Etag.Value;
    var lastModified = last.LastModified.Value;
    var lastIndexedEtag = new ComparableByteArray(lastEtag.ToByteArray());
    var documentRetriever = new DocumentRetriever(null, context.ReadTriggers);
    var filteredDocs = BackgroundTaskExecuter.Instance.Apply(jsonDocs, doc =>
    {
        doc = documentRetriever.ExecuteReadTriggers(doc, null, ReadOperation.Index);
        return (doc == null ? null : new { Doc = doc, Json = JsonToExpando.Convert(doc.ToJson()) });
    });
    log.Debug("After read triggers executed, {0} documents remained", filteredDocs.Count);
    var results = new Tuple<IndexToWorkOn, IndexingBatch>[indexesToWorkOn.Count];
    var actions = new Action<IStorageActionsAccessor>[indexesToWorkOn.Count];
    BackgroundTaskExecuter.Instance.ExecuteAll(context.Configuration, scheduler, indexesToWorkOn, (indexToWorkOn, i) =>
    {
        var indexLastIndexedEtag = new ComparableByteArray(indexToWorkOn.LastIndexedEtag.ToByteArray());
        if (indexLastIndexedEtag.CompareTo(lastIndexedEtag) >= 0)
        {
            return;
        }
        var indexName = indexToWorkOn.IndexName;
        var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(indexName);
        if (viewGenerator == null)
        {
            return; // probably deleted
        }
        var batch = new IndexingBatch();
        foreach (var item in filteredDocs)
        {
            // did we already index this document in this index?
            if (indexLastIndexedEtag.CompareTo(new ComparableByteArray(item.Doc.Etag.Value.ToByteArray())) >= 0)
            {
                continue;
            }
            // is the Raven-Entity-Name a match for the things the index executes on?
            if (viewGenerator.ForEntityNames.Count != 0 &&
                viewGenerator.ForEntityNames.Contains(item.Doc.Metadata.Value<string>(Constants.RavenEntityName)) == false)
            {
                continue;
            }
            batch.Add(item.Doc, item.Json);
            if (batch.DateTime == null)
            {
                batch.DateTime = item.Doc.LastModified;
            }
            else
            {
                batch.DateTime = batch.DateTime > item.Doc.LastModified ? item.Doc.LastModified : batch.DateTime;
            }
        }
        if (batch.Docs.Count == 0)
        {
            log.Debug("All documents have been filtered for {0}, no indexing will be performed, updating to {1}, {2}", indexName, lastEtag, lastModified);
            // we use it this way to batch all the updates together
            actions[i] = accessor => accessor.Indexing.UpdateLastIndexed(indexName, lastEtag, lastModified);
            return;
        }
        log.Debug("Going to index {0} documents in {1}", batch.Ids.Count, indexToWorkOn);
        results[i] = Tuple.Create(indexToWorkOn, batch);
    });
    transactionalStorage.Batch(actionsAccessor =>
    {
        foreach (var action in actions)
        {
            if (action != null)
            {
                action(actionsAccessor);
            }
        }
    });
    return (results.Where(x => x != null));
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    var changed = new HashSet<ReduceKeyAndBucket>();
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        sourceCount++;
        var documentId = doc.__document_id;
        actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, name, changed);
        return doc;
    })
    .Where(x => x is FilteredDocument == false)
    .ToList();
    var items = new ConcurrentQueue<MapResultItem>();
    var stats = new IndexingWorkStats();
    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    if (documentsWrapped.Count > 0)
        actions.MapReduce.UpdateRemovedMapReduceStats(name, changed);
    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
    {
        using (CurrentIndexingScope.Current = new CurrentIndexingScope(LoadDocument, allReferencedDocs.Enqueue))
        {
            var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats);
            var currentDocumentResults = new List<object>();
            string currentKey = null;
            foreach (var currentDoc in mapResults)
            {
                var documentId = GetDocumentId(currentDoc);
                if (documentId != currentKey)
                {
                    count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, items);
                    currentDocumentResults.Clear();
                    currentKey = documentId;
                }
                currentDocumentResults.Add(new DynamicJsonObject(RavenJObject.FromObject(currentDoc, jsonSerializer)));
            }
            count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, items);
        }
    });
    IDictionary<string, HashSet<string>> result;
    while (allReferencedDocs.TryDequeue(out result))
    {
        foreach (var referencedDocument in result)
        {
            actions.Indexing.UpdateDocumentReferences(name, referencedDocument.Key, referencedDocument.Value);
            actions.General.MaybePulseTransaction();
        }
    }
    foreach (var mapResultItem in items)
    {
        changed.Add(new ReduceKeyAndBucket(mapResultItem.Bucket, mapResultItem.ReduceKey));
        actions.MapReduce.PutMappedResult(name, mapResultItem.DocId, mapResultItem.ReduceKey, mapResultItem.Data);
        actions.General.MaybePulseTransaction();
    }
    UpdateIndexingStats(context, stats);
    actions.MapReduce.ScheduleReductions(name, 0, changed);
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Operation = "Map",
        Duration = sw.Elapsed,
        Started = start
    });
    logIndexing.Debug("Mapped {0} documents for {1}", count, name);
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(name))
            .Where(x => x != null)
            .ToList();
        try
        {
            RecordCurrentBatch("Current", batch.Docs.Count);
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                    throw new ArgumentException(string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                    return doc;
                batchers.ApplyAndIgnoreAllErrors(
                    exception =>
                    {
                        logIndexing.WarnException(
                            string.Format("Error when executed OnIndexEntryDeleted trigger for index '{0}', key: '{1}'", name, documentId),
                            exception);
                        context.AddError(name, documentId, exception.Message, "OnIndexEntryDeleted Trigger");
                    },
                    trigger => trigger.OnIndexEntryDeleted(documentId));
                if (batch.SkipDeleteFromIndex[i] == false || context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                    indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();
            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
            {
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(context.Database, indexDefinition, viewGenerator);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                using (CurrentIndexingScope.Current = new CurrentIndexingScope(LoadDocument, allReferencedDocs.Enqueue))
                {
                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats))
                    {
                        float boost;
                        var indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                        if (indexingResult.NewDocId != null && indexingResult.ShouldSkip == false)
                        {
                            Interlocked.Increment(ref count);
                            luceneDoc.GetFields().Clear();
                            luceneDoc.Boost = boost;
                            documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                            luceneDoc.Add(documentIdField);
                            foreach (var field in indexingResult.Fields)
                            {
                                luceneDoc.Add(field);
                            }
                            batchers.ApplyAndIgnoreAllErrors(
                                exception =>
                                {
                                    logIndexing.WarnException(
                                        string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'", name, indexingResult.NewDocId),
                                        exception);
                                    context.AddError(name, indexingResult.NewDocId, exception.Message, "OnIndexEntryCreated Trigger");
                                },
                                trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));
                            LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                            AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        }
                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }
                }
            });
            var dic = context.ReferencingDocumentsByChildKeysWhichMightNeedReindexing_SimpleIndex;
            IDictionary<string, HashSet<string>> result;
            while (allReferencedDocs.TryDequeue(out result))
            {
                foreach (var referencedDocument in result)
                {
                    actions.Indexing.UpdateDocumentReferences(name, referencedDocument.Key, referencedDocument.Value);
                    foreach (var childDocumentKey in referencedDocument.Value)
                    {
                        dic.GetOrAdd(childDocumentKey, k => new ConcurrentBag<string>()).Add(referencedDocument.Key);
                    }
                }
            }
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error", ex);
                    context.AddError(name, null, ex.Message, "AnErrorOccured Trigger");
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger", e);
                    context.AddError(name, null, e.Message, "Dispose Trigger");
                },
                x => x.Dispose());
            BatchCompleted("Current");
        }
        return new IndexedItemsInfo
        {
            ChangedDocs = sourceCount,
            HighestETag = batch.HighestEtagInBatch
        };
    });
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        ItemsCount = sourceCount,
        InputCount = batch.Docs.Count,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start
    });
    logIndexing.Debug("Indexed {0} documents for {1}", count, name);
}
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
            .Where(x => x != null)
            .ToList();
        try
        {
            RecordCurrentBatch("Current", batch.Docs.Count);
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                {
                    throw new ArgumentException(string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                }
                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                {
                    return doc;
                }
                InvokeOnIndexEntryDeletedOnAllBatchers(batchers, docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                if (batch.SkipDeleteFromIndex[i] == false || context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                {
                    indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                }
                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();
            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
            {
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(context.Database, indexDefinition, viewGenerator, logIndexing);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
                {
                    string currentDocId = null;
                    int outputPerDocId = 0;
                    Action<Exception, object> onErrorFunc;
                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats, out onErrorFunc))
                    {
                        float boost;
                        IndexingResult indexingResult;
                        try
                        {
                            indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                        }
                        catch (InvalidSpatialShapeException e)
                        {
                            onErrorFunc(e, doc);
                            continue;
                        }
                        catch (Exception e)
                        {
                            onErrorFunc(e, doc);
                            continue;
                        }
                        // ReSharper disable once RedundantBoolCompare --> code clarity
                        if (indexingResult.NewDocId == null || indexingResult.ShouldSkip != false)
                        {
                            continue;
                        }
                        if (currentDocId != indexingResult.NewDocId)
                        {
                            currentDocId = indexingResult.NewDocId;
                            outputPerDocId = 0;
                        }
                        outputPerDocId++;
                        EnsureValidNumberOfOutputsForDocument(currentDocId, outputPerDocId);
                        Interlocked.Increment(ref count);
                        luceneDoc.GetFields().Clear();
                        luceneDoc.Boost = boost;
                        documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                        luceneDoc.Add(documentIdField);
                        foreach (var field in indexingResult.Fields)
                        {
                            luceneDoc.Add(field);
                        }
                        batchers.ApplyAndIgnoreAllErrors(
                            exception =>
                            {
                                logIndexing.WarnException(
                                    string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'", indexId, indexingResult.NewDocId),
                                    exception);
                                context.AddError(indexId, indexingResult.NewDocId, exception.Message, "OnIndexEntryCreated Trigger");
                            },
                            trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));
                        LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                        AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }
                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
                }
            });
            UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error", ex);
                    context.AddError(indexId, null, ex.Message, "AnErrorOccured Trigger");
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger", e);
                    context.AddError(indexId, null, e.Message, "Dispose Trigger");
                },
                x => x.Dispose());
            BatchCompleted("Current");
        }
        return new IndexedItemsInfo(batch.HighestEtagBeforeFiltering)
        {
            ChangedDocs = sourceCount
        };
    });
    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        ItemsCount = sourceCount,
        InputCount = batch.Docs.Count,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start
    });
    logIndexing.Debug("Indexed {0} documents for {1}", count, indexId);
}
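// EnsureValidNumberOfOutputsForDocument above caps the fan-out of a single
// document; the later variants additionally set a skipDocument flag so that, once
// the cap is exceeded, the rest of that document's outputs are dropped instead of
// failing the batch. The guard in isolation (a sketch; the cap value and names
// are illustrative, in RavenDB the limit is configurable per index):
private static IEnumerable<T> CapOutputsPerDocument<T>(IEnumerable<T> results, Func<T, string> getDocId, int maxOutputsPerDocument)
{
    string currentDocId = null;
    var outputPerDocId = 0;
    var skipDocument = false;
    foreach (var result in results)
    {
        var docId = getDocId(result);
        if (docId != currentDocId)
        {
            // new document: reset the counter and the skip flag
            currentDocId = docId;
            outputPerDocId = 0;
            skipDocument = false;
        }
        if (skipDocument)
            continue;
        if (++outputPerDocId > maxOutputsPerDocument)
        {
            skipDocument = true; // too many outputs for one document; skip the rest
            continue;
        }
        yield return result;
    }
}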
public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    var count = 0;
    var sourceCount = 0;
    var deleted = new Dictionary<ReduceKeyAndBucket, int>();
    var performance = RecordCurrentBatch("Current Map", "Map", batch.Docs.Count);
    var performanceStats = new List<BasePerformanceStats>();

    var usedStorageAccessors = new ConcurrentSet<IStorageActionsAccessor>();

    if (usedStorageAccessors.TryAdd(actions))
    {
        var storageCommitDuration = new Stopwatch();

        actions.BeforeStorageCommit += storageCommitDuration.Start;

        actions.AfterStorageCommit += () =>
        {
            storageCommitDuration.Stop();
            performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
        };
    }

    var deleteMappedResultsDuration = new Stopwatch();
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        token.ThrowIfCancellationRequested();

        sourceCount++;
        var documentId = doc.__document_id;

        using (StopwatchScope.For(deleteMappedResultsDuration))
        {
            actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
        }

        return doc;
    })
    .Where(x => x is FilteredDocument == false)
    .ToList();

    performanceStats.Add(new PerformanceStats
    {
        Name = IndexingOperation.Map_DeleteMappedResults,
        DurationMs = deleteMappedResultsDuration.ElapsedMilliseconds
    });

    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
    var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();

    var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();
    var parallelProcessingStart = SystemTime.UtcNow;

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
    {
        token.ThrowIfCancellationRequested();
        var parallelStats = new ParallelBatchStats
        {
            StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
        };

        var localStats = new IndexingWorkStats();
        var localChanges = new HashSet<ReduceKeyAndBucket>();
        var statsPerKey = new Dictionary<string, int>();

        var linqExecutionDuration = new Stopwatch();
        var reduceInMapLinqExecutionDuration = new Stopwatch();
        var putMappedResultsDuration = new Stopwatch();
        var convertToRavenJObjectDuration = new Stopwatch();

        allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));

        using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
        {
            // we are writing to the transactional store from multiple threads here, and in a streaming fashion
            // should result in less memory and better perf
            context.TransactionalStorage.Batch(accessor =>
            {
                if (usedStorageAccessors.TryAdd(accessor))
                {
                    var storageCommitDuration = new Stopwatch();

                    accessor.BeforeStorageCommit += storageCommitDuration.Start;

                    accessor.AfterStorageCommit += () =>
                    {
                        storageCommitDuration.Stop();
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
                    };
                }

                var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats, linqExecutionDuration);
                var currentDocumentResults = new List<object>();
                string currentKey = null;
                bool skipDocument = false;

                foreach (var currentDoc in mapResults)
                {
                    token.ThrowIfCancellationRequested();

                    var documentId = GetDocumentId(currentDoc);
                    if (documentId != currentKey)
                    {
                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);
                        currentDocumentResults.Clear();
                        currentKey = documentId;
                    }
                    else if (skipDocument)
                    {
                        continue;
                    }

                    RavenJObject currentDocJObject;
                    using (StopwatchScope.For(convertToRavenJObjectDuration))
                    {
                        currentDocJObject = RavenJObject.FromObject(currentDoc, jsonSerializer);
                    }

                    currentDocumentResults.Add(new DynamicJsonObject(currentDocJObject));

                    if (EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count) == false)
                    {
                        skipDocument = true;
                        currentDocumentResults.Clear();
                        continue;
                    }

                    Interlocked.Increment(ref localStats.IndexingSuccesses);
                }

                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);

                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, reduceInMapLinqExecutionDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_PutMappedResults, putMappedResultsDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ConvertToRavenJObject, convertToRavenJObjectDuration.ElapsedMilliseconds));

                parallelOperations.Enqueue(parallelStats);
            });

            allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
            allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
        }
    });

    performanceStats.Add(new ParallelPerformanceStats
    {
        NumberOfThreads = parallelOperations.Count,
        DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
        BatchedOperations = parallelOperations.ToList()
    });

    var updateDocumentReferencesDuration = new Stopwatch();
    using (StopwatchScope.For(updateDocumentReferencesDuration))
    {
        UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
    }
    performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));

    var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
        .Distinct()
        .ToList();

    var stats = new IndexingWorkStats(allState.Select(x => x.Item2));

    var reduceKeyStats = allState.SelectMany(x => x.Item3)
        .GroupBy(x => x.Key)
        .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
        .ToList();

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            var reduceKeyStat = enumerator.Current;
            accessor.MapReduce.IncrementReduceKeyCounter(indexId, reduceKeyStat.Key, reduceKeyStat.Count);
        }
    }));

    var parallelReductionOperations = new ConcurrentQueue<ParallelBatchStats>();
    var parallelReductionStart = SystemTime.UtcNow;

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, changed, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        var parallelStats = new ParallelBatchStats
        {
            StartDelay = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds
        };

        var scheduleReductionsDuration = new Stopwatch();

        using (StopwatchScope.For(scheduleReductionsDuration))
        {
            while (enumerator.MoveNext())
            {
                accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
            }
        }

        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ScheduleReductions, scheduleReductionsDuration.ElapsedMilliseconds));
        parallelReductionOperations.Enqueue(parallelStats);
    }));

    performanceStats.Add(new ParallelPerformanceStats
    {
        NumberOfThreads = parallelReductionOperations.Count,
        DurationMs = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds,
        BatchedOperations = parallelReductionOperations.ToList()
    });

    UpdateIndexingStats(context, stats);

    performance.OnCompleted = () => BatchCompleted("Current Map", "Map", sourceCount, count, performanceStats);

    logIndexing.Debug("Mapped {0} documents for {1}", count, indexId);

    return performance;
}
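The StopwatchScope helper that the method above leans on is not part of this listing. A minimal sketch of what such a helper might look like, assuming it simply resumes a shared Stopwatch on entry and pauses it on Dispose so many small regions accumulate into one duration:

using System;
using System.Diagnostics;

// Hedged sketch, not the RavenDB implementation: StopwatchScope.For(sw)
// resumes the stopwatch; Dispose pauses it, so the elapsed time keeps
// accumulating across many short using-blocks.
public struct StopwatchScope : IDisposable
{
    private readonly Stopwatch stopwatch;

    private StopwatchScope(Stopwatch stopwatch)
    {
        this.stopwatch = stopwatch;
        stopwatch.Start(); // Start() after Stop() continues the elapsed total
    }

    public static StopwatchScope For(Stopwatch stopwatch)
    {
        return new StopwatchScope(stopwatch);
    }

    public void Dispose()
    {
        stopwatch.Stop();
    }
}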
public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    var count = 0;
    var sourceCount = 0;
    var writeToIndexStats = new List<PerformanceStats>();

    IndexingPerformanceStats performance = null;
    var performanceStats = new List<BasePerformanceStats>();

    var storageCommitDuration = new Stopwatch();

    actions.BeforeStorageCommit += storageCommitDuration.Start;

    actions.AfterStorageCommit += () =>
    {
        storageCommitDuration.Stop();
        performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
    };

    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
            .Where(x => x != null)
            .ToList();

        try
        {
            performance = RecordCurrentBatch("Current", "Index", batch.Docs.Count);

            var deleteExistingDocumentsDuration = new Stopwatch();
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                token.ThrowIfCancellationRequested();

                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                {
                    throw new ArgumentException(
                        string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                }

                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                {
                    return doc;
                }

                InvokeOnIndexEntryDeletedOnAllBatchers(batchers, docIdTerm.CreateTerm(documentId.ToLowerInvariant()));

                if (batch.SkipDeleteFromIndex[i] == false ||
                    context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                {
                    using (StopwatchScope.For(deleteExistingDocumentsDuration))
                    {
                        indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                    }
                }

                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();

            performanceStats.Add(new PerformanceStats
            {
                Name = IndexingOperation.Lucene_DeleteExistingDocument,
                DurationMs = deleteExistingDocumentsDuration.ElapsedMilliseconds
            });

            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
            var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();

            var parallelProcessingStart = SystemTime.UtcNow;

            context.Database.MappingThreadPool.ExecuteBatch(documentsWrapped, (IEnumerator<dynamic> partition) =>
            {
                token.ThrowIfCancellationRequested();
                var parallelStats = new ParallelBatchStats
                {
                    StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
                };

                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(context.Database, indexDefinition, viewGenerator, logIndexing);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);

                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
                {
                    string currentDocId = null;
                    int outputPerDocId = 0;
                    Action<Exception, object> onErrorFunc;
                    bool skipDocument = false;

                    var linqExecutionDuration = new Stopwatch();
                    var addDocumentDuration = new Stopwatch();
                    var convertToLuceneDocumentDuration = new Stopwatch();

                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats, out onErrorFunc, linqExecutionDuration))
                    {
                        token.ThrowIfCancellationRequested();

                        float boost;
                        IndexingResult indexingResult;
                        using (StopwatchScope.For(convertToLuceneDocumentDuration))
                        {
                            try
                            {
                                indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                            }
                            catch (Exception e)
                            {
                                onErrorFunc(e, doc);
                                continue;
                            }
                        }

                        // ReSharper disable once RedundantBoolCompare --> code clarity
                        if (indexingResult.NewDocId == null || indexingResult.ShouldSkip != false)
                        {
                            continue;
                        }

                        if (currentDocId != indexingResult.NewDocId)
                        {
                            currentDocId = indexingResult.NewDocId;
                            outputPerDocId = 0;
                            skipDocument = false;
                        }

                        if (skipDocument)
                        {
                            continue;
                        }

                        outputPerDocId++;

                        if (EnsureValidNumberOfOutputsForDocument(currentDocId, outputPerDocId) == false)
                        {
                            skipDocument = true;
                            continue;
                        }

                        Interlocked.Increment(ref count);

                        using (StopwatchScope.For(convertToLuceneDocumentDuration))
                        {
                            luceneDoc.GetFields().Clear();
                            luceneDoc.Boost = boost;
                            documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                            luceneDoc.Add(documentIdField);
                            foreach (var field in indexingResult.Fields)
                            {
                                luceneDoc.Add(field);
                            }
                        }

                        batchers.ApplyAndIgnoreAllErrors(
                            exception =>
                            {
                                logIndexing.WarnException(
                                    string.Format(
                                        "Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'",
                                        PublicName, indexingResult.NewDocId),
                                    exception);
                                context.AddError(
                                    indexId,
                                    PublicName,
                                    indexingResult.NewDocId,
                                    exception,
                                    "OnIndexEntryCreated Trigger");
                            },
                            trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));

                        LogIndexedDocument(indexingResult.NewDocId, luceneDoc);

                        using (StopwatchScope.For(addDocumentDuration))
                        {
                            AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        }

                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }

                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);

                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Lucene_ConvertToLuceneDocument, convertToLuceneDocumentDuration.ElapsedMilliseconds));
                    parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Lucene_AddDocument, addDocumentDuration.ElapsedMilliseconds));

                    parallelOperations.Enqueue(parallelStats);
                }
            }, description: string.Format("Mapping index {0} from Etag {1} to Etag {2}", this.PublicName, this.GetLastEtagFromStats(), batch.HighestEtagBeforeFiltering));

            performanceStats.Add(new ParallelPerformanceStats
            {
                NumberOfThreads = parallelOperations.Count,
                DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
                BatchedOperations = parallelOperations.ToList()
            });

            var updateDocumentReferencesDuration = new Stopwatch();
            using (StopwatchScope.For(updateDocumentReferencesDuration))
            {
                UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
            }
            performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error in " + PublicName, ex);
                    context.AddError(indexId, PublicName, null, ex, "AnErrorOccured Trigger");
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger in " + PublicName, e);
                    context.AddError(indexId, PublicName, null, e, "Dispose Trigger");
                },
                x => x.Dispose());
        }

        return new IndexedItemsInfo(batch.HighestEtagBeforeFiltering)
        {
            ChangedDocs = sourceCount
        };
    }, writeToIndexStats);

    performanceStats.AddRange(writeToIndexStats);

    InitializeIndexingPerformanceCompleteDelegate(performance, sourceCount, count, performanceStats);

    if (logIndexing.IsDebugEnabled)
    {
        logIndexing.Debug("Indexed {0} documents for {1}", count, PublicName);
    }

    return performance;
}
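The error handling above funnels every trigger call through ApplyAndIgnoreAllErrors so that one faulty index-update trigger cannot abort the whole batch. That extension method is not included in this listing; a hedged sketch of the pattern it implies (the real implementation may differ):

using System;
using System.Collections.Generic;

// Sketch only: invoke an action per trigger batcher, route any exception to
// the supplied error handler, and keep iterating instead of rethrowing.
public static class BatcherExtensions
{
    public static void ApplyAndIgnoreAllErrors<T>(this IEnumerable<T> items, Action<Exception> onError, Action<T> action)
    {
        foreach (var item in items)
        {
            try
            {
                action(item);
            }
            catch (Exception e)
            {
                onError(e); // report, but never propagate
            }
        }
    }
}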
public abstract void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp);
public override IndexingPerformanceStats IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp, CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    var count = 0;
    var sourceCount = 0;
    var deleted = new Dictionary<ReduceKeyAndBucket, int>();
    var performance = RecordCurrentBatch("Current Map", "Map", batch.Docs.Count);
    var performanceStats = new List<BasePerformanceStats>();

    var usedStorageAccessors = new ConcurrentSet<IStorageActionsAccessor>();

    if (usedStorageAccessors.TryAdd(actions))
    {
        var storageCommitDuration = new Stopwatch();

        actions.BeforeStorageCommit += storageCommitDuration.Start;

        actions.AfterStorageCommit += () =>
        {
            storageCommitDuration.Stop();
            performanceStats.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
        };
    }

    var deleteMappedResultsDuration = new Stopwatch();
    var documentsWrapped = batch.Docs.Select(doc =>
    {
        token.ThrowIfCancellationRequested();

        sourceCount++;
        var documentId = doc.__document_id;

        using (StopwatchScope.For(deleteMappedResultsDuration))
        {
            actions.MapReduce.DeleteMappedResultsForDocumentId((string)documentId, indexId, deleted);
        }

        return doc;
    })
    .Where(x => x is FilteredDocument == false)
    .ToList();

    performanceStats.Add(new PerformanceStats
    {
        Name = IndexingOperation.Map_DeleteMappedResults,
        DurationMs = deleteMappedResultsDuration.ElapsedMilliseconds
    });

    var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
    var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();
    var allState = new ConcurrentQueue<Tuple<HashSet<ReduceKeyAndBucket>, IndexingWorkStats, Dictionary<string, int>>>();

    var parallelOperations = new ConcurrentQueue<ParallelBatchStats>();
    var parallelProcessingStart = SystemTime.UtcNow;

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, partition =>
    {
        token.ThrowIfCancellationRequested();
        var parallelStats = new ParallelBatchStats
        {
            StartDelay = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds
        };

        var localStats = new IndexingWorkStats();
        var localChanges = new HashSet<ReduceKeyAndBucket>();
        var statsPerKey = new Dictionary<string, int>();

        var linqExecutionDuration = new Stopwatch();
        var reduceInMapLinqExecutionDuration = new Stopwatch();
        var putMappedResultsDuration = new Stopwatch();
        var convertToRavenJObjectDuration = new Stopwatch();

        allState.Enqueue(Tuple.Create(localChanges, localStats, statsPerKey));

        using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
        {
            // we are writing to the transactional store from multiple threads here, and in a streaming fashion
            // should result in less memory and better perf
            context.TransactionalStorage.Batch(accessor =>
            {
                if (usedStorageAccessors.TryAdd(accessor))
                {
                    var storageCommitDuration = new Stopwatch();

                    accessor.BeforeStorageCommit += storageCommitDuration.Start;

                    accessor.AfterStorageCommit += () =>
                    {
                        storageCommitDuration.Stop();
                        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.StorageCommit, storageCommitDuration.ElapsedMilliseconds));
                    };
                }

                var mapResults = RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, localStats, linqExecutionDuration);
                var currentDocumentResults = new List<object>();
                string currentKey = null;
                bool skipDocument = false;

                foreach (var currentDoc in mapResults)
                {
                    token.ThrowIfCancellationRequested();

                    var documentId = GetDocumentId(currentDoc);
                    if (documentId != currentKey)
                    {
                        count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);
                        currentDocumentResults.Clear();
                        currentKey = documentId;
                    }
                    else if (skipDocument)
                    {
                        continue;
                    }

                    RavenJObject currentDocJObject;
                    using (StopwatchScope.For(convertToRavenJObjectDuration))
                    {
                        currentDocJObject = RavenJObject.FromObject(currentDoc, jsonSerializer);
                    }

                    currentDocumentResults.Add(new DynamicJsonObject(currentDocJObject));

                    if (EnsureValidNumberOfOutputsForDocument(documentId, currentDocumentResults.Count) == false)
                    {
                        skipDocument = true;
                        currentDocumentResults.Clear();
                        continue;
                    }

                    Interlocked.Increment(ref localStats.IndexingSuccesses);
                }

                count += ProcessBatch(viewGenerator, currentDocumentResults, currentKey, localChanges, accessor, statsPerKey, reduceInMapLinqExecutionDuration, putMappedResultsDuration, convertToRavenJObjectDuration);

                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.LoadDocument, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_MapExecution, linqExecutionDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Linq_ReduceLinqExecution, reduceInMapLinqExecutionDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_PutMappedResults, putMappedResultsDuration.ElapsedMilliseconds));
                parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ConvertToRavenJObject, convertToRavenJObjectDuration.ElapsedMilliseconds));

                parallelOperations.Enqueue(parallelStats);
            });

            allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
            allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
        }
    });

    performanceStats.Add(new ParallelPerformanceStats
    {
        NumberOfThreads = parallelOperations.Count,
        DurationMs = (long)(SystemTime.UtcNow - parallelProcessingStart).TotalMilliseconds,
        BatchedOperations = parallelOperations.ToList()
    });

    var updateDocumentReferencesDuration = new Stopwatch();
    using (StopwatchScope.For(updateDocumentReferencesDuration))
    {
        UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
    }
    performanceStats.Add(PerformanceStats.From(IndexingOperation.UpdateDocumentReferences, updateDocumentReferencesDuration.ElapsedMilliseconds));

    var changed = allState.SelectMany(x => x.Item1).Concat(deleted.Keys)
        .Distinct()
        .ToList();

    var stats = new IndexingWorkStats(allState.Select(x => x.Item2));

    var reduceKeyStats = allState.SelectMany(x => x.Item3)
        .GroupBy(x => x.Key)
        .Select(g => new { g.Key, Count = g.Sum(x => x.Value) })
        .ToList();

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, reduceKeyStats, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        while (enumerator.MoveNext())
        {
            var reduceKeyStat = enumerator.Current;
            accessor.MapReduce.IncrementReduceKeyCounter(indexId, reduceKeyStat.Key, reduceKeyStat.Count);
        }
    }));

    actions.General.MaybePulseTransaction();

    var parallelReductionOperations = new ConcurrentQueue<ParallelBatchStats>();
    var parallelReductionStart = SystemTime.UtcNow;

    BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, changed, enumerator => context.TransactionalStorage.Batch(accessor =>
    {
        var parallelStats = new ParallelBatchStats
        {
            StartDelay = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds
        };

        var scheduleReductionsDuration = new Stopwatch();

        using (StopwatchScope.For(scheduleReductionsDuration))
        {
            while (enumerator.MoveNext())
            {
                accessor.MapReduce.ScheduleReductions(indexId, 0, enumerator.Current);
                accessor.General.MaybePulseTransaction();
            }
        }

        parallelStats.Operations.Add(PerformanceStats.From(IndexingOperation.Map_ScheduleReductions, scheduleReductionsDuration.ElapsedMilliseconds));
        parallelReductionOperations.Enqueue(parallelStats);
    }));

    performanceStats.Add(new ParallelPerformanceStats
    {
        NumberOfThreads = parallelReductionOperations.Count,
        DurationMs = (long)(SystemTime.UtcNow - parallelReductionStart).TotalMilliseconds,
        BatchedOperations = parallelReductionOperations.ToList()
    });

    UpdateIndexingStats(context, stats);

    performance.OnCompleted = () => BatchCompleted("Current Map", "Map", sourceCount, count, performanceStats);

    logIndexing.Debug("Mapped {0} documents for {1}", count, indexId);

    return performance;
}
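The only substantive difference between this variant and the earlier map-phase method is the MaybePulseTransaction calls. The general idea is to commit and reopen a long-running write transaction once enough work has accumulated, so a large batch never holds one huge transaction. A hedged sketch of that pattern (the class name and threshold are our assumptions, not RavenDB's implementation):

using System;

// Sketch only: commit every N operations instead of once at the very end.
public class PulsingBatch
{
    private readonly int threshold;
    private int operationsSinceCommit;

    public PulsingBatch(int threshold = 1024)
    {
        this.threshold = threshold;
    }

    public void MaybePulse(Action commitAndReopen)
    {
        if (++operationsSinceCommit < threshold)
            return;

        commitAndReopen(); // flush work done so far, start a fresh transaction
        operationsSinceCommit = 0;
    }
}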
private void IndexDocuments(IStorageActionsAccessor actions, string index, IndexingBatch batch)
{
    var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(index);
    if (viewGenerator == null)
        return; // index was deleted, probably

    try
    {
        if (Log.IsDebugEnabled)
        {
            string ids;
            if (batch.Ids.Count < 256)
            {
                ids = string.Join(",", batch.Ids);
            }
            else
            {
                ids = string.Join(", ", batch.Ids.Take(128)) + " ... " + string.Join(", ", batch.Ids.Skip(batch.Ids.Count - 128));
            }

            Log.Debug("Indexing {0} documents for index: {1}. ({2})", batch.Docs.Count, index, ids);
        }

        context.CancellationToken.ThrowIfCancellationRequested();

        context.IndexStorage.Index(index, viewGenerator, batch, context, actions, batch.DateTime ?? DateTime.MinValue);
    }
    catch (OperationCanceledException)
    {
        throw;
    }
    catch (Exception e)
    {
        if (actions.IsWriteConflict(e))
            return;

        Log.WarnException(string.Format("Failed to index documents for index: {0}", index), e);
    }
}
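The debug-logging branch above truncates very long id lists, keeping only the first and last 128 ids. The same formatting rule extracted into a small stand-alone helper (the helper name is ours, for illustration only):

using System.Collections.Generic;
using System.Linq;

// Sketch: below 256 ids, join them all; otherwise elide the middle.
public static class IdListFormatter
{
    public static string Format(IList<string> ids)
    {
        if (ids.Count < 256)
            return string.Join(",", ids);

        return string.Join(", ", ids.Take(128)) +
               " ... " +
               string.Join(", ", ids.Skip(ids.Count - 128));
    }
}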
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;
    int loadDocumentCount = 0;
    long loadDocumentDuration = 0;

    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(indexId))
            .Where(x => x != null)
            .ToList();

        try
        {
            var indexingPerfStats = RecordCurrentBatch("Current", batch.Docs.Count);
            batch.SetIndexingPerformance(indexingPerfStats);

            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                    throw new ArgumentException(
                        string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));

                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                    return doc;

                InvokeOnIndexEntryDeletedOnAllBatchers(batchers, docIdTerm.CreateTerm(documentId.ToLowerInvariant()));

                if (batch.SkipDeleteFromIndex[i] == false ||
                    context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                    indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));

                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();

            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();
            var allReferenceEtags = new ConcurrentQueue<IDictionary<string, Etag>>();

            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, (partition) =>
            {
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(context.Database, indexDefinition, viewGenerator, logIndexing);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES,
                                                Field.Index.NOT_ANALYZED_NO_NORMS);

                using (CurrentIndexingScope.Current = new CurrentIndexingScope(context.Database, PublicName))
                {
                    string currentDocId = null;
                    int outputPerDocId = 0;
                    Action<Exception, object> onErrorFunc;
                    bool skipDocument = false;

                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats, out onErrorFunc))
                    {
                        float boost;
                        IndexingResult indexingResult;
                        try
                        {
                            indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);
                        }
                        catch (Exception e)
                        {
                            onErrorFunc(e, doc);
                            continue;
                        }

                        // ReSharper disable once RedundantBoolCompare --> code clarity
                        if (indexingResult.NewDocId == null || indexingResult.ShouldSkip != false)
                        {
                            continue;
                        }

                        if (currentDocId != indexingResult.NewDocId)
                        {
                            currentDocId = indexingResult.NewDocId;
                            outputPerDocId = 0;
                            skipDocument = false;
                        }

                        if (skipDocument)
                            continue;

                        outputPerDocId++;

                        if (EnsureValidNumberOfOutputsForDocument(currentDocId, outputPerDocId) == false)
                        {
                            skipDocument = true;
                            continue;
                        }

                        Interlocked.Increment(ref count);
                        luceneDoc.GetFields().Clear();
                        luceneDoc.Boost = boost;
                        documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                        luceneDoc.Add(documentIdField);
                        foreach (var field in indexingResult.Fields)
                        {
                            luceneDoc.Add(field);
                        }

                        batchers.ApplyAndIgnoreAllErrors(
                            exception =>
                            {
                                logIndexing.WarnException(
                                    string.Format(
                                        "Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'",
                                        indexId, indexingResult.NewDocId),
                                    exception);
                                context.AddError(indexId, indexingResult.NewDocId, exception.Message, "OnIndexEntryCreated Trigger");
                            },
                            trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));

                        LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                        AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }

                    allReferenceEtags.Enqueue(CurrentIndexingScope.Current.ReferencesEtags);
                    allReferencedDocs.Enqueue(CurrentIndexingScope.Current.ReferencedDocuments);
                    Interlocked.Add(ref loadDocumentCount, CurrentIndexingScope.Current.LoadDocumentCount);
                    Interlocked.Add(ref loadDocumentDuration, CurrentIndexingScope.Current.LoadDocumentDuration.ElapsedMilliseconds);
                }
            });

            UpdateDocumentReferences(actions, allReferencedDocs, allReferenceEtags);
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error", ex);
                    context.AddError(indexId, null, ex.Message, "AnErrorOccured Trigger");
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger", e);
                    context.AddError(indexId, null, e.Message, "Dispose Trigger");
                },
                x => x.Dispose());
            BatchCompleted("Current");
        }

        return new IndexedItemsInfo(batch.HighestEtagBeforeFiltering)
        {
            ChangedDocs = sourceCount
        };
    });

    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        ItemsCount = sourceCount,
        InputCount = batch.Docs.Count,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start,
        LoadDocumentCount = loadDocumentCount,
        LoadDocumentDurationMs = loadDocumentDuration
    });

    logIndexing.Debug("Indexed {0} documents for {1}", count, indexId);
}
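All of the variants above guard fan-out with EnsureValidNumberOfOutputsForDocument: once one source document has produced too many index entries, the rest of its outputs are skipped. A hedged sketch of what such a guard amounts to (the helper name and the limit of 15 are our assumptions, not RavenDB's exact implementation, where the limit is configurable):

// Sketch only: callers react to false by setting skipDocument = true
// and discarding any partial results for that document id.
static bool WithinOutputLimit(string docId, int outputsSoFar, int maxOutputsPerDocument = 15)
{
    return outputsSoFar <= maxOutputsPerDocument;
}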
public override void IndexDocuments(AbstractViewGenerator viewGenerator, IndexingBatch batch, IStorageActionsAccessor actions, DateTime minimumTimestamp)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = SystemTime.UtcNow;

    Write((indexWriter, analyzer, stats) =>
    {
        var processedKeys = new HashSet<string>();
        var batchers = context.IndexUpdateTriggers.Select(x => x.CreateBatcher(name))
            .Where(x => x != null)
            .ToList();

        try
        {
            var docIdTerm = new Term(Constants.DocumentIdFieldName);
            var documentsWrapped = batch.Docs.Select((doc, i) =>
            {
                Interlocked.Increment(ref sourceCount);
                if (doc.__document_id == null)
                {
                    throw new ArgumentException(
                        string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
                }

                string documentId = doc.__document_id.ToString();
                if (processedKeys.Add(documentId) == false)
                {
                    return doc;
                }

                batchers.ApplyAndIgnoreAllErrors(
                    exception =>
                    {
                        logIndexing.WarnException(
                            string.Format("Error when executed OnIndexEntryDeleted trigger for index '{0}', key: '{1}'",
                                          name, documentId),
                            exception);
                        context.AddError(name, documentId, exception.Message);
                    },
                    trigger => trigger.OnIndexEntryDeleted(documentId));

                if (batch.SkipDeleteFromIndex[i] == false ||
                    context.ShouldRemoveFromIndex(documentId)) // maybe it is recently deleted?
                {
                    indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
                }

                return doc;
            })
            .Where(x => x is FilteredDocument == false)
            .ToList();

            var allReferencedDocs = new ConcurrentQueue<IDictionary<string, HashSet<string>>>();

            BackgroundTaskExecuter.Instance.ExecuteAllBuffered(context, documentsWrapped, (partition) =>
            {
                var anonymousObjectToLuceneDocumentConverter = new AnonymousObjectToLuceneDocumentConverter(indexDefinition);
                var luceneDoc = new Document();
                var documentIdField = new Field(Constants.DocumentIdFieldName, "dummy", Field.Store.YES,
                                                Field.Index.NOT_ANALYZED_NO_NORMS);

                using (CurrentIndexingScope.Current = new CurrentIndexingScope(LoadDocument, allReferencedDocs.Enqueue))
                {
                    foreach (var doc in RobustEnumerationIndex(partition, viewGenerator.MapDefinitions, stats))
                    {
                        float boost;
                        var indexingResult = GetIndexingResult(doc, anonymousObjectToLuceneDocumentConverter, out boost);

                        if (indexingResult.NewDocId != null && indexingResult.ShouldSkip == false)
                        {
                            Interlocked.Increment(ref count);
                            luceneDoc.GetFields().Clear();
                            luceneDoc.Boost = boost;
                            documentIdField.SetValue(indexingResult.NewDocId.ToLowerInvariant());
                            luceneDoc.Add(documentIdField);
                            foreach (var field in indexingResult.Fields)
                            {
                                luceneDoc.Add(field);
                            }

                            batchers.ApplyAndIgnoreAllErrors(
                                exception =>
                                {
                                    logIndexing.WarnException(
                                        string.Format("Error when executed OnIndexEntryCreated trigger for index '{0}', key: '{1}'",
                                                      name, indexingResult.NewDocId),
                                        exception);
                                    context.AddError(name, indexingResult.NewDocId, exception.Message);
                                },
                                trigger => trigger.OnIndexEntryCreated(indexingResult.NewDocId, luceneDoc));

                            LogIndexedDocument(indexingResult.NewDocId, luceneDoc);
                            AddDocumentToIndex(indexWriter, luceneDoc, analyzer);
                        }

                        Interlocked.Increment(ref stats.IndexingSuccesses);
                    }
                }
            });

            IDictionary<string, HashSet<string>> result;
            while (allReferencedDocs.TryDequeue(out result))
            {
                foreach (var referencedDocument in result)
                {
                    actions.Indexing.UpdateDocumentReferences(name, referencedDocument.Key, referencedDocument.Value);
                }
            }
        }
        catch (Exception e)
        {
            batchers.ApplyAndIgnoreAllErrors(
                ex =>
                {
                    logIndexing.WarnException("Failed to notify index update trigger batcher about an error", ex);
                    context.AddError(name, null, ex.Message);
                },
                x => x.AnErrorOccured(e));
            throw;
        }
        finally
        {
            batchers.ApplyAndIgnoreAllErrors(
                e =>
                {
                    logIndexing.WarnException("Failed to dispose on index update trigger", e);
                    context.AddError(name, null, e.Message);
                },
                x => x.Dispose());
        }

        return sourceCount;
    });

    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start
    });

    logIndexing.Debug("Indexed {0} documents for {1}", count, name);
}
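The reference-update loop at the end of the method above drains a ConcurrentQueue that the parallel map partitions filled. The same drain pattern in isolation (the process delegate stands in for the per-entry storage call; names are ours):

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;

// Sketch: TryDequeue loops until the queue is empty, which is safe even if
// producers enqueued concurrently while the partitions were still running.
static void Drain(ConcurrentQueue<IDictionary<string, HashSet<string>>> queue,
                  Action<string, HashSet<string>> process)
{
    IDictionary<string, HashSet<string>> result;
    while (queue.TryDequeue(out result))
    {
        foreach (var pair in result)
            process(pair.Key, pair.Value);
    }
}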
private IEnumerable<IndexingBatchForIndex> FilterIndexes(IList<IndexToWorkOn> indexesToWorkOn, List<JsonDocument> jsonDocs, Etag highestETagInBatch)
{
    var last = jsonDocs.Last();

    Debug.Assert(last.Etag != null);
    Debug.Assert(last.LastModified != null);

    var lastEtag = last.Etag;
    var lastModified = last.LastModified.Value;

    var documentRetriever = new DocumentRetriever(null, null, context.ReadTriggers, context.Database.InFlightTransactionalState);

    var filteredDocs =
        BackgroundTaskExecuter.Instance.Apply(context, jsonDocs, doc =>
        {
            var filteredDoc = documentRetriever.ExecuteReadTriggers(doc, null, ReadOperation.Index);
            return filteredDoc == null
                       ? new { Doc = doc, Json = (object)new FilteredDocument(doc) }
                       : new { Doc = filteredDoc, Json = JsonToExpando.Convert(doc.ToJson()) };
        });

    Log.Debug("After read triggers executed, {0} documents remained", filteredDocs.Count);

    var results = new IndexingBatchForIndex[indexesToWorkOn.Count];
    var actions = new Action<IStorageActionsAccessor>[indexesToWorkOn.Count];

    BackgroundTaskExecuter.Instance.ExecuteAll(context, indexesToWorkOn, (indexToWorkOn, i) =>
    {
        var indexName = indexToWorkOn.Index.PublicName;
        var viewGenerator = context.IndexDefinitionStorage.GetViewGenerator(indexName);
        if (viewGenerator == null)
            return; // probably deleted

        var batch = new IndexingBatch(highestETagInBatch);

        foreach (var item in filteredDocs)
        {
            if (defaultPrefetchingBehavior.FilterDocuments(item.Doc) == false)
                continue;

            // did we already index this document in this index?
            var etag = item.Doc.Etag;
            if (etag == null)
                continue;

            // is the Raven-Entity-Name a match for the things the index executes on?
            if (viewGenerator.ForEntityNames.Count != 0 &&
                viewGenerator.ForEntityNames.Contains(item.Doc.Metadata.Value<string>(Constants.RavenEntityName)) == false)
            {
                continue;
            }

            batch.Add(item.Doc, item.Json, defaultPrefetchingBehavior.ShouldSkipDeleteFromIndex(item.Doc));

            if (batch.DateTime == null)
                batch.DateTime = item.Doc.LastModified;
            else
                batch.DateTime = batch.DateTime > item.Doc.LastModified
                                     ? item.Doc.LastModified
                                     : batch.DateTime;
        }

        if (batch.Docs.Count == 0)
        {
            Log.Debug("All documents have been filtered for {0}, no indexing will be performed, updating to {1}, {2}", indexName,
                      lastEtag, lastModified);
            // we use it this way to batch all the updates together
            actions[i] = accessor =>
            {
                accessor.Indexing.UpdateLastIndexed(indexToWorkOn.Index.indexId, lastEtag, lastModified);
                accessor.AfterStorageCommit += () =>
                {
                    indexToWorkOn.Index.EnsureIndexWriter();
                    indexToWorkOn.Index.Flush(lastEtag);
                };
            };
            return;
        }

        if (Log.IsDebugEnabled)
        {
            Log.Debug("Going to index {0} documents in {1}: ({2})", batch.Ids.Count, indexToWorkOn, string.Join(", ", batch.Ids));
        }

        results[i] = new IndexingBatchForIndex
        {
            Batch = batch,
            IndexId = indexToWorkOn.IndexId,
            Index = indexToWorkOn.Index,
            LastIndexedEtag = indexToWorkOn.LastIndexedEtag
        };
    });

    transactionalStorage.Batch(actionsAccessor =>
    {
        foreach (var action in actions)
        {
            if (action != null)
                action(actionsAccessor);
        }
    });

    return results.Where(x => x != null);
}
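FilterIndexes stores the per-index "nothing to index" bookkeeping writes in actions[i] and runs them later in a single transactionalStorage.Batch call, so all of those updates share one commit. Reduced to its core, the deferred-actions pattern looks like this (names are ours, for illustration):

using System;

// Sketch: slots stay null when an index produced a real batch; only the
// deferred bookkeeping writes run, all against the same accessor/commit.
static void RunDeferred<TAccessor>(Action<TAccessor>[] deferred, TAccessor accessor)
{
    foreach (var action in deferred)
    {
        if (action != null)
            action(accessor);
    }
}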