/// <summary>
/// Indexes a single document against the index described by <paramref name="indexDefinition"/>.
/// Any failure is surfaced to the service client as a <see cref="FaultException"/> carrying the message.
/// </summary>
/// <param name="indexDefinition">Definition identifying the target index.</param>
/// <param name="documentId">Id under which the document is indexed; must be non-blank.</param>
/// <param name="document">The document to index; must not be null.</param>
public void IndexDocument(BaristaIndexDefinition indexDefinition, string documentId, DocumentDto document)
{
    try
    {
        if (documentId.IsNullOrWhiteSpace())
        {
            throw new ArgumentNullException("documentId", @"A document id must be specified.");
        }

        if (document == null)
        {
            throw new ArgumentNullException("document", @"A document must be specified.");
        }

        var index = GetOrAddIndex(indexDefinition, true);

        try
        {
            // Convert the DTO to a Lucene document and submit it as a single-item batch.
            var luceneDocument = DocumentDto.ConvertToLuceneDocument(document);

            var singleDocumentBatch = new IndexingBatch();
            singleDocumentBatch.Add(new BatchedDocument
            {
                DocumentId = documentId,
                Document = luceneDocument,
                SkipDeleteFromIndex = false,
            });

            index.IndexDocuments(singleDocumentBatch);
        }
        catch (OutOfMemoryException)
        {
            // The writer's state is suspect after an OOM; release it so it gets recreated.
            // NOTE(review): the OOM is swallowed here, so the caller observes success even
            // though the document was not indexed — confirm this is intentional.
            CloseIndexWriter(indexDefinition, false);
        }
    }
    catch (Exception ex)
    {
        // Service boundary: wrap everything as a FaultException for WCF clients.
        throw new FaultException(ex.Message);
    }
}
/// <summary>
/// Indexes the documents in the given batch: deletes any existing entry for each document id
/// (unless <c>SkipDeleteFromIndex</c> is set), adds the new documents, and commits once.
/// Records an "Index" performance stat for the batch.
/// </summary>
/// <param name="batch">The batch of documents to index. Each document must have a non-null id.</param>
/// <exception cref="ArgumentException">Thrown when a document in the batch has no document id.</exception>
public override void IndexDocuments(IndexingBatch batch)
{
    var count = 0;
    var sourceCount = 0;
    var sw = Stopwatch.StartNew();
    var start = DateTime.UtcNow;

    Write((indexWriter, analyzer) =>
    {
        //TODO: The following would be a perfect candidate for a TPL DataFlow impl. Too bad we're currently on .Net 3.5
        var processedKeys = new HashSet<string>();
        var docIdTerm = new Lucene.Net.Index.Term(Constants.DocumentIdFieldName);

        var documentsWrapped = batch.Documents.Select((doc, i) =>
        {
            Interlocked.Increment(ref sourceCount);

            if (doc.DocumentId == null)
            {
                throw new ArgumentException(
                    string.Format("Cannot index something which doesn't have a document id, but got: '{0}'", doc));
            }

            var documentId = doc.DocumentId.ToString(CultureInfo.InvariantCulture);

            // Only the first occurrence of a given id triggers the delete; duplicates
            // within the batch are still indexed but skip the redundant delete pass.
            if (processedKeys.Add(documentId) == false)
            {
                return doc;
            }

            if (doc.SkipDeleteFromIndex == false)
            {
                indexWriter.DeleteDocuments(docIdTerm.CreateTerm(documentId.ToLowerInvariant()));
            }

            return doc;
        })
        .ToList();

        foreach (var document in documentsWrapped)
        {
            Interlocked.Increment(ref count);
            LogIndexedDocument(document.DocumentId, document.Document);
            AddDocumentToIndex(indexWriter, document.Document, analyzer);
        }

        // FIX: commit once per batch rather than once per document. The original called
        // indexWriter.Commit() inside the foreach, forcing a full flush/fsync for every
        // single document — extremely expensive for large batches. Guarded so an empty
        // batch still performs no commit, matching the original behavior.
        if (documentsWrapped.Count > 0)
        {
            indexWriter.Commit();
        }

        return sourceCount;
    });

    AddindexingPerformanceStat(new IndexingPerformanceStats
    {
        OutputCount = count,
        InputCount = sourceCount,
        Duration = sw.Elapsed,
        Operation = "Index",
        Started = start
    });

    LogIndexing.Debug("Indexed {0} documents for {1}", count, Name);
}
/// <summary>
/// Indexes all documents contained in the specified batch.
/// Implementations are expected to handle delete-before-add semantics per document id.
/// </summary>
/// <param name="batch">The batch of documents to index.</param>
public abstract void IndexDocuments(IndexingBatch batch);
/// <summary>
/// Indexes a collection of JSON documents against the index described by
/// <paramref name="indexDefinition"/>. Field options on each document update the index
/// definition before conversion. Any failure is surfaced to the service client as a
/// <see cref="FaultException"/> carrying the message.
/// </summary>
/// <param name="indexDefinition">Definition identifying the target index.</param>
/// <param name="documents">The documents to index; must be non-null and non-empty.</param>
public void IndexJsonDocuments(BaristaIndexDefinition indexDefinition, IEnumerable<JsonDocumentDto> documents)
{
    try
    {
        if (documents == null)
        {
            throw new ArgumentNullException("documents", @"A collection of documents must be specified.");
        }

        // Materialize once to avoid multiple enumeration of the caller's sequence.
        var jsonDocuments = documents as IList<JsonDocumentDto> ?? documents.ToList();

        if (jsonDocuments.Any() == false)
        {
            // FIX: an empty (non-null) collection is an invalid argument, not a null one —
            // the original threw ArgumentNullException here (FDG: use ArgumentException).
            throw new ArgumentException(@"At least one document must be contained within the collection.", "documents");
        }

        var index = GetOrAddIndex(indexDefinition, true);

        try
        {
            //Add it to the index.
            var batch = new IndexingBatch();

            //Update the indexDefinition for the index based on the options specified.
            foreach (var document in jsonDocuments)
            {
                UpdateIndexDefinitionFromFieldOptions(index.IndexDefinition, document.FieldOptions);
            }

            //Attempt to create a new Search.JsonDocument from the document
            var searchJsonDocuments = jsonDocuments.Select(document => new Search.JsonDocument
            {
                DocumentId = document.DocumentId,
                Metadata = document.MetadataAsJson.IsNullOrWhiteSpace() == false
                    ? JObject.Parse(document.MetadataAsJson)
                    : new JObject(),
                DataAsJson = JObject.Parse(document.DataAsJson)
            });

            var luceneDocuments =
                JsonDocumentToLuceneDocumentConverter.ConvertJsonDocumentToLuceneDocument(index.IndexDefinition, searchJsonDocuments);

            foreach (var luceneDocument in luceneDocuments)
            {
                batch.Add(luceneDocument);
            }

            //TODO: Add the batch to a BlockingCollection<IndexingBatch> and run a thread that consumes the batches
            //See http://www.codethinked.com/blockingcollection-and-iproducerconsumercollection
            index.IndexDocuments(batch);
        }
        catch (OutOfMemoryException)
        {
            // Writer state is suspect after an OOM; release it so it gets recreated.
            // NOTE(review): the OOM is swallowed, so the caller observes success — confirm intended.
            CloseIndexWriter(indexDefinition, false);
        }
    }
    catch (Exception ex)
    {
        // Service boundary: wrap everything as a FaultException for WCF clients.
        throw new FaultException(ex.Message);
    }
}