/// <summary> Copies the stored fields of every document in <c>reader</c>
/// (doc ids 0 .. MaxDoc()-1) into <c>fieldsWriter</c>.
/// </summary>
/// <param name="fieldSelectorMerge">selector handed to <c>reader.Document</c> on the per-document path</param>
/// <param name="fieldsWriter">destination for the copied documents</param>
/// <param name="reader">source reader</param>
/// <param name="matchingFieldsReader">when non-null, documents are copied as raw
/// bytes in chunks; when null, each document is loaded and re-added one at a time
/// </param>
/// <returns> the number of documents copied </returns>
private int CopyFieldsNoDeletions(FieldSelector fieldSelectorMerge, FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
{
    int total = reader.MaxDoc();
    int copied = 0;

    if (matchingFieldsReader == null)
    {
        // Slow path: materialize each document and write it back out.
        while (copied < total)
        {
            // NOTE: it's very important to first assign to a local and then
            // pass it on (see LUCENE-1282), so keep the two statements separate.
            Document current = reader.Document(copied, fieldSelectorMerge);
            fieldsWriter.AddDocument(current);
            checkAbort.Work(300);
            copied++;
        }
        return copied;
    }

    // We can bulk-copy because the fieldInfos are "congruent"
    while (copied < total)
    {
        // Never request more than MAX_RAW_MERGE_DOCS documents per chunk.
        int chunk = System.Math.Min(MAX_RAW_MERGE_DOCS, total - copied);
        IndexInput raw = matchingFieldsReader.RawDocs(rawDocLengths, copied, chunk);
        fieldsWriter.AddRawDocuments(raw, rawDocLengths, chunk);
        copied += chunk;
        checkAbort.Work(300 * chunk);
    }
    return copied;
}
/// <summary> Writes one document: records the current fields-stream position in
/// the index stream, writes the number of stored fields, then writes each
/// stored field in document order.
/// </summary>
/// <param name="doc">the document whose stored fields are written</param>
internal void AddDocument(Document doc)
{
    // Remember where this document's data starts.
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    // Pass 1: the stored-field count precedes the fields, so count first.
    int numStored = 0;
    foreach (Fieldable candidate in doc.GetFields())
    {
        if (candidate.IsStored())
        {
            numStored++;
        }
    }
    fieldsStream.WriteVInt(numStored);

    // Pass 2: emit every stored field.
    foreach (Fieldable candidate in doc.GetFields())
    {
        if (!candidate.IsStored())
        {
            continue;
        }
        WriteField(fieldInfos.FieldInfo(candidate.Name()), candidate);
    }
}
/// <summary> Builds document <c>n</c> by appending the fields returned by each
/// reader in <c>storedFieldReaders</c>. With a non-null selector, a reader is
/// consulted only if the selector answers something other than
/// <c>FieldSelectorResult.NO_LOAD</c> for at least one of that reader's field names.
/// </summary>
/// <param name="n">document number</param>
/// <param name="fieldSelector">selector passed to each sub-reader; null includes every reader</param>
/// <returns> a new Document holding the collected fields </returns>
public override Document Document(int n, FieldSelector fieldSelector)
{
    EnsureOpen();
    Document merged = new Document();

    for (int idx = 0; idx < storedFieldReaders.Count; idx++)
    {
        IndexReader subReader = (IndexReader) storedFieldReaders[idx];

        // No selector means every reader contributes.
        bool wanted = fieldSelector == null;
        if (!wanted)
        {
            foreach (System.String fieldName in (System.Collections.ICollection) readerToFields[subReader])
            {
                if (fieldSelector.Accept(fieldName) != FieldSelectorResult.NO_LOAD)
                {
                    wanted = true;
                    break;
                }
            }
        }

        if (!wanted)
        {
            continue;
        }

        // append fields from this stored-field reader
        foreach (Fieldable stored in subReader.Document(n, fieldSelector).GetFields())
        {
            merged.Add(stored);
        }
    }

    return merged;
}
/// <summary> Copies the stored fields of every non-deleted document in
/// <c>reader</c> into <c>fieldsWriter</c>.
/// </summary>
/// <param name="fieldSelectorMerge">selector handed to <c>reader.Document</c> on the per-document path</param>
/// <param name="fieldsWriter">destination for the copied documents</param>
/// <param name="reader">source reader; <c>IsDeleted</c> decides which docs are skipped</param>
/// <param name="matchingFieldsReader">when non-null, runs of live documents are
/// copied as raw bytes; when null, each live document is loaded and re-added
/// </param>
/// <returns> the number of documents copied </returns>
private int CopyFieldsWithDeletions(FieldSelector fieldSelectorMerge, FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader)
{
    int copied = 0;
    int limit = reader.MaxDoc();

    if (matchingFieldsReader == null)
    {
        for (int docId = 0; docId < limit; docId++)
        {
            if (!reader.IsDeleted(docId))
            {
                // NOTE: it's very important to first assign to a local and then
                // pass it on (see LUCENE-1282), so keep the two statements separate.
                Document live = reader.Document(docId, fieldSelectorMerge);
                fieldsWriter.AddDocument(live);
                copied++;
                checkAbort.Work(300);
            }
        }
        return copied;
    }

    // We can bulk-copy because the fieldInfos are "congruent"
    int pos = 0;
    while (pos < limit)
    {
        if (reader.IsDeleted(pos))
        {
            // skip deleted docs
            pos++;
            continue;
        }

        // Gather a run of consecutive live docs starting at pos; the run ends at
        // the end of the reader, at the next deleted doc, or at MAX_RAW_MERGE_DOCS.
        int runStart = pos;
        int runLength = 0;
        while (true)
        {
            pos++;
            runLength++;
            if (pos >= limit)
            {
                break;
            }
            if (reader.IsDeleted(pos))
            {
                // Step over the deleted doc so the outer loop resumes after it.
                pos++;
                break;
            }
            if (runLength >= MAX_RAW_MERGE_DOCS)
            {
                break;
            }
        }

        IndexInput raw = matchingFieldsReader.RawDocs(rawDocLengths, runStart, runLength);
        fieldsWriter.AddRawDocuments(raw, rawDocLengths, runLength);
        copied += runLength;
        checkAbort.Work(300 * runLength);
    }
    return copied;
}
/// <summary>Adds field info for every field of a Document. </summary>
/// <param name="doc">document whose fields are registered, in <c>GetFields()</c> order</param>
public void Add(Document doc)
{
    lock (this)
    {
        foreach (Fieldable docField in doc.GetFields())
        {
            // The seventh argument is always false on this path
            // (presumably the store-payloads flag -- confirm against the Add overload).
            Add(
                docField.Name(),
                docField.IsIndexed(),
                docField.IsTermVectorStored(),
                docField.IsStorePositionWithTermVector(),
                docField.IsStoreOffsetWithTermVector(),
                docField.GetOmitNorms(),
                false,
                docField.GetOmitTf());
        }
    }
}
/// <summary> Adds a document to this index, analyzing it with the supplied
/// analyzer rather than the one given to the constructor. If the document
/// contains more than {@link #SetMaxFieldLength(int)} terms for a given field,
/// the remainder are discarded. A null <c>docAnalyzer</c> falls back to the
/// single-argument <c>indexWriter.AddDocument(doc)</c> call.
/// </summary>
/// <seealso cref="IndexWriter.AddDocument(Document, Analyzer)">
/// </seealso>
/// <throws> IllegalStateException if the index is closed </throws>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> LockObtainFailedException if another writer has this index open
/// (<code>write.lock</code> could not be obtained) </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void AddDocument(Document doc, Analyzer docAnalyzer)
{
    lock (directory)
    {
        AssureOpen();
        CreateIndexWriter();

        if (docAnalyzer == null)
        {
            indexWriter.AddDocument(doc);
        }
        else
        {
            indexWriter.AddDocument(doc, docAnalyzer);
        }
    }
}
/// <summary> Test stored fields for a segment: loads every non-deleted document
/// below <c>info.docCount</c>, tallies documents and fields, and checks the
/// document tally against <c>reader.NumDocs()</c>.
/// </summary>
/// <param name="info">segment whose <c>docCount</c> bounds the scan</param>
/// <param name="reader">reader used to load documents and report deletions</param>
/// <param name="format">format provider for the summary message</param>
/// <returns> the populated status; on failure its <c>error</c> member holds the
/// exception (exceptions are caught and reported here, not rethrown)
/// </returns>
private Status.StoredFieldStatus TestStoredFields(SegmentInfo info, SegmentReader reader, System.Globalization.NumberFormatInfo format)
{
    Status.StoredFieldStatus status = new Status.StoredFieldStatus();

    try
    {
        if (infoStream != null)
        {
            infoStream.Write(" test: stored fields.......");
        }

        // Scan stored fields for all documents
        for (int j = 0; j < info.docCount; ++j)
        {
            if (!reader.IsDeleted(j))
            {
                status.docCount++;
                Document doc = reader.Document(j);
                status.totFields += doc.GetFields().Count;
            }
        }

        // Validate docCount
        int expectedDocs = reader.NumDocs();
        if (status.docCount != expectedDocs)
        {
            // Report the reader's count against the count actually seen; the
            // message previously printed status.docCount for both values.
            throw new System.SystemException("docCount=" + expectedDocs + " but saw " + status.docCount + " undeleted docs");
        }

        Msg(string.Format(format, "OK [{0:d} total field count; avg {1:f} fields per doc]", new object[] { status.totFields, (((float)status.totFields) / status.docCount) }));
    }
    catch (System.Exception e)
    {
        Msg("ERROR [" + System.Convert.ToString(e.Message) + "]");
        status.error = e;
        if (infoStream != null)
        {
            infoStream.WriteLine(e.StackTrace);
        }
    }

    return(status);
}
/// <summary> Loads the document for <c>hitNumber</c> from <c>hits</c> into
/// <c>doc</c> and then sets <c>resolved</c>. The flag is assigned only after
/// <c>hits.Doc</c> has returned, so it stays unchanged if that call throws.
/// </summary>
private void FetchTheHit() { doc = hits.Doc(hitNumber); resolved = true; }
/// <summary> Processes the document held in <c>docState.doc</c>: groups its
/// fields by name into per-field processors, hands stored fields to
/// <c>fieldsWriter</c>, sorts the per-field processors, runs each processor's
/// consumer over its fields, and returns the combined per-document writer(s).
/// </summary>
/// <returns> the result of <c>consumer.FinishDocument()</c> if the fields writer
/// produced nothing, the fields writer's result if the consumer produced
/// nothing, otherwise a <c>PerDoc</c> wrapping both
/// </returns>
public override DocumentsWriter.DocWriter ProcessDocument()
{
    consumer.StartDocument();
    fieldsWriter.StartDocument();
    Document doc = docState.doc;
    System.Diagnostics.Debug.Assert(docFieldProcessor.docWriter.writer.TestPoint("DocumentsWriter.ThreadState.init start"));
    // Reset the count of distinct fields seen in this document.
    fieldCount = 0;
    // Each document gets its own generation number; comparing it with
    // fp.lastGen below tells whether a field name was already seen in this doc.
    int thisFieldGen = fieldGen++;
    System.Collections.IList docFields = doc.GetFields();
    int numDocFields = docFields.Count;
    // Absorb any new fields first seen in this document.
    // Also absorb any changes to fields we had already
    // seen before (eg suddenly turning on norms or
    // vectors, etc.):
    for (int i = 0; i < numDocFields; i++)
    {
        Fieldable field = (Fieldable)docFields[i];
        System.String fieldName = field.Name();
        // Make sure we have a PerField allocated
        int hashPos = fieldName.GetHashCode() & hashMask;
        DocFieldProcessorPerField fp = fieldHash[hashPos];
        // Walk the collision chain for this bucket until the name matches.
        while (fp != null && !fp.fieldInfo.name.Equals(fieldName))
        {
            fp = fp.next;
        }
        if (fp == null)
        {
            // TODO FI: we need to genericize the "flags" that a
            // field holds, and, how these flags are merged; it
            // needs to be more "pluggable" such that if I want
            // to have a new "thing" my Fields can do, I can
            // easily add it
            FieldInfo fi = fieldInfos.Add(fieldName, field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf());
            // New entry is pushed onto the head of the bucket's chain.
            fp = new DocFieldProcessorPerField(this, fi);
            fp.next = fieldHash[hashPos];
            fieldHash[hashPos] = fp;
            totalFieldCount++;
            // Rehash once the table is at least half full.
            if (totalFieldCount >= fieldHash.Length / 2)
            {
                Rehash();
            }
        }
        else
        {
            fp.fieldInfo.Update(field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false, field.GetOmitTf());
        }
        if (thisFieldGen != fp.lastGen)
        {
            // First time we're seeing this field for this doc
            fp.fieldCount = 0;
            // Double the per-document field array when it is full.
            if (fieldCount == fields.Length)
            {
                int newSize = fields.Length * 2;
                DocFieldProcessorPerField[] newArray = new DocFieldProcessorPerField[newSize];
                Array.Copy(fields, 0, newArray, 0, fieldCount);
                fields = newArray;
            }
            fields[fieldCount++] = fp;
            fp.lastGen = thisFieldGen;
        }
        // Double this field's instance array when it is full.
        if (fp.fieldCount == fp.fields.Length)
        {
            Fieldable[] newArray = new Fieldable[fp.fields.Length * 2];
            Array.Copy(fp.fields, 0, newArray, 0, fp.fieldCount);
            fp.fields = newArray;
        }
        fp.fields[fp.fieldCount++] = field;
        if (field.IsStored())
        {
            fieldsWriter.AddField(field, fp.fieldInfo);
        }
    }
    // If we are writing vectors then we must visit
    // fields in sorted order so they are written in
    // sorted order.  TODO: we actually only need to
    // sort the subset of fields that have vectors
    // enabled; we could save [small amount of] CPU
    // here.
    QuickSort(fields, 0, fieldCount - 1);
    for (int i = 0; i < fieldCount; i++)
    {
        fields[i].consumer.ProcessFields(fields[i].fields, fields[i].fieldCount);
    }
    // Emit the immense-term warning at most once per recorded prefix, then clear it.
    if (docState.maxTermPrefix != null && docState.infoStream != null)
    {
        docState.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
        docState.maxTermPrefix = null;
    }
    DocumentsWriter.DocWriter one = fieldsWriter.FinishDocument();
    DocumentsWriter.DocWriter two = consumer.FinishDocument();
    if (one == null)
    {
        return(two);
    }
    else if (two == null)
    {
        return(one);
    }
    else
    {
        // Both writers produced output: bundle them in one PerDoc for this docID.
        PerDoc both = GetPerDoc();
        both.docID = docState.docID;
        System.Diagnostics.Debug.Assert(one.docID == docState.docID);
        System.Diagnostics.Debug.Assert(two.docID == docState.docID);
        both.one = one;
        both.two = two;
        return(both);
    }
}
/// <summary> Builds document <c>n</c> by appending the fields returned by each
/// reader in <c>storedFieldReaders</c>. With a non-null selector, a reader is
/// consulted only if the selector answers something other than
/// <c>FieldSelectorResult.NO_LOAD</c> for at least one of that reader's field names.
/// </summary>
/// <param name="n">document number</param>
/// <param name="fieldSelector">selector passed to each sub-reader; null includes every reader</param>
/// <returns> a new Document holding the collected fields </returns>
public override Document Document(int n, FieldSelector fieldSelector)
{
    EnsureOpen();
    Document merged = new Document();

    for (int idx = 0; idx < storedFieldReaders.Count; idx++)
    {
        IndexReader subReader = (IndexReader) storedFieldReaders[idx];

        // No selector means every reader contributes.
        bool wanted = fieldSelector == null;
        if (!wanted)
        {
            foreach (System.String fieldName in (System.Collections.ICollection) readerToFields[subReader])
            {
                if (fieldSelector.Accept(fieldName) != FieldSelectorResult.NO_LOAD)
                {
                    wanted = true;
                    break;
                }
            }
        }

        if (!wanted)
        {
            continue;
        }

        // append fields from this stored-field reader
        foreach (Fieldable stored in subReader.Document(n, fieldSelector).GetFields())
        {
            merged.Add(stored);
        }
    }

    return merged;
}
/// <summary> Writes one document: records the current fields-stream position in
/// the index stream, writes the number of stored fields, then writes each
/// stored field in document order.
/// </summary>
/// <param name="doc">the document whose stored fields are written</param>
internal void AddDocument(Document doc)
{
    // Remember where this document's data starts.
    indexStream.WriteLong(fieldsStream.GetFilePointer());

    // Pass 1: the stored-field count precedes the fields, so count first.
    int numStored = 0;
    foreach (Fieldable candidate in doc.GetFields())
    {
        if (candidate.IsStored())
        {
            numStored++;
        }
    }
    fieldsStream.WriteVInt(numStored);

    // Pass 2: emit every stored field.
    foreach (Fieldable candidate in doc.GetFields())
    {
        if (!candidate.IsStored())
        {
            continue;
        }
        WriteField(fieldInfos.FieldInfo(candidate.Name()), candidate);
    }
}
/// <summary> Loads the document for <c>hitNumber</c> from <c>hits</c> into
/// <c>doc</c> and then sets <c>resolved</c>. The flag is assigned only after
/// <c>hits.Doc</c> has returned, so it stays unchanged if that call throws.
/// </summary>
private void FetchTheHit() { doc = hits.Doc(hitNumber); resolved = true; }
/// <summary>Returns a free (idle) ThreadState that may be used for
/// indexing this one document.  This call also pauses if a
/// flush is pending.  If delTerm is non-null then we
/// buffer this deleted term after the thread state has
/// been acquired.
/// </summary>
/// <param name="doc">the document about to be indexed (not referenced in this method body)</param>
/// <param name="delTerm">optional delete term buffered against the newly assigned docID; may be null</param>
/// <returns> the thread state bound to the calling thread, marked non-idle and
/// with <c>docState.docID</c> assigned
/// </returns>
internal DocumentsWriterThreadState GetThreadState(Document doc, Term delTerm)
{
    lock (this)
    {
        // First, find a thread state.  If this thread already
        // has affinity to a specific ThreadState, use that one
        // again.
        DocumentsWriterThreadState state = (DocumentsWriterThreadState) threadBindings[SupportClass.ThreadClass.Current()];
        if (state == null)
        {
            // First time this thread has called us since last
            // flush.  Find the least loaded thread state:
            DocumentsWriterThreadState minThreadState = null;
            for (int i = 0; i < threadStates.Length; i++)
            {
                DocumentsWriterThreadState ts = threadStates[i];
                if (minThreadState == null || ts.numThreads < minThreadState.numThreads)
                    minThreadState = ts;
            }
            // Reuse the least-loaded state if it is unused, or if the number of
            // states has already reached MAX_THREAD_STATE.
            if (minThreadState != null && (minThreadState.numThreads == 0 || threadStates.Length >= MAX_THREAD_STATE))
            {
                state = minThreadState;
                state.numThreads++;
            }
            else
            {
                // Just create a new "private" thread state
                DocumentsWriterThreadState[] newArray = new DocumentsWriterThreadState[1 + threadStates.Length];
                if (threadStates.Length > 0)
                    Array.Copy(threadStates, 0, newArray, 0, threadStates.Length);
                state = newArray[threadStates.Length] = new DocumentsWriterThreadState(this);
                threadStates = newArray;
            }
            // Remember the binding so later calls from this thread take the fast path above.
            threadBindings[SupportClass.ThreadClass.Current()] = state;
        }
        // Next, wait until my thread state is idle (in case
        // it's shared with other threads) and for threads to
        // not be paused nor a flush pending:
        WaitReady(state);
        // Allocate segment name if this is the first doc since
        // last flush:
        InitSegmentName(false);
        state.isIdle = false;
        bool success = false;
        try
        {
            state.docState.docID = nextDocID;
            System.Diagnostics.Debug.Assert(writer.TestPoint("DocumentsWriter.ThreadState.init start"));
            if (delTerm != null)
            {
                AddDeleteTerm(delTerm, state.docState.docID);
                state.doFlushAfter = TimeToFlushDeletes();
            }
            System.Diagnostics.Debug.Assert(writer.TestPoint("DocumentsWriter.ThreadState.init after delTerm"));
            // The docID is consumed only after the delete term was buffered successfully.
            nextDocID++;
            numDocsInRAM++;
            // We must at this point commit to flushing to ensure we
            // always get N docs when we flush by doc count, even if
            // > 1 thread is adding documents:
            if (!flushPending && maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH && numDocsInRAM >= maxBufferedDocs)
            {
                flushPending = true;
                state.doFlushAfter = true;
            }
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Forcefully idle this ThreadState:
                state.isIdle = true;
                System.Threading.Monitor.PulseAll(this);
                // Undo a flush decision made above so another thread can flush.
                if (state.doFlushAfter)
                {
                    state.doFlushAfter = false;
                    flushPending = false;
                }
            }
        }
        return state;
    }
}
/// <summary> Drops the references to the current document and analyzer so
/// that neither is kept alive by this object (either may be largish).
/// </summary>
public void Clear()
{
    analyzer = null;
    doc = null;
}
/// <summary> Adds a document to this index. If the document contains more than
/// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are
/// discarded. Delegates to the two-argument overload with a null analyzer.
/// </summary>
/// <param name="doc">the document to add</param>
/// <seealso cref="IndexWriter.AddDocument(Document)">
/// </seealso>
/// <throws> IllegalStateException if the index is closed </throws>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> LockObtainFailedException if another writer has this index open
/// (<code>write.lock</code> could not be obtained) </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void AddDocument(Document doc)
{
    AddDocument(doc, null);
}
/// <summary> Adds a document to this index. If the document contains more than
/// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are
/// discarded. Delegates to the two-argument overload with a null analyzer.
/// </summary>
/// <param name="doc">the document to add</param>
/// <seealso cref="IndexWriter.AddDocument(Document)">
/// </seealso>
/// <throws> IllegalStateException if the index is closed </throws>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> LockObtainFailedException if another writer has this index open
/// (<code>write.lock</code> could not be obtained) </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void AddDocument(Document doc)
{
    AddDocument(doc, null);
}
/// <summary>Adds a document without a delete term by forwarding to
/// <c>UpdateDocument(doc, analyzer, null)</c>. Returns true if the caller
/// (IndexWriter) should now flush.
/// </summary>
/// <param name="doc">the document to add</param>
/// <param name="analyzer">the analyzer to use for this document</param>
internal bool AddDocument(Document doc, Analyzer analyzer)
{
    bool flushNow = UpdateDocument(doc, analyzer, null);
    return flushNow;
}
/// <summary> Adds a document to this index. If the document contains more than
/// {@link #SetMaxFieldLength(int)} terms for a given field, the remainder are
/// discarded. This overload forwards to <c>AddDocument(doc, analyzer)</c>
/// using this instance's <c>analyzer</c> member.
///
/// <p/> Note that if an Exception is hit (for example disk full)
/// then the index will be consistent, but this document
/// may not have been added.  Furthermore, it's possible
/// the index will have one segment in non-compound format
/// even when using compound files (when a merge has
/// partially succeeded).<p/>
///
/// <p/> This method periodically flushes pending documents
/// to the Directory (see <a href="#flush">above</a>), and
/// also periodically triggers segment merges in the index
/// according to the {@link MergePolicy} in use.<p/>
///
/// <p/>Merges temporarily consume space in the
/// directory. The amount of space required is up to 1X the
/// size of all segments being merged, when no
/// readers/searchers are open against the index, and up to
/// 2X the size of all segments being merged when
/// readers/searchers are open against the index (see
/// {@link #Optimize()} for details). The sequence of
/// primitive merge operations performed is governed by the
/// merge policy.
///
/// <p/>Note that each term in the document can be no longer
/// than 16383 characters, otherwise an
/// IllegalArgumentException will be thrown.<p/>
///
/// <p/>Note that it's possible to create an invalid Unicode
/// string in java if a UTF16 surrogate pair is malformed.
/// In this case, the invalid characters are silently
/// replaced with the Unicode replacement character
/// U+FFFD.<p/>
///
/// <p/><b>NOTE</b>: if this method hits an OutOfMemoryError
/// you should immediately close the writer.  See <a
/// href="#OOME">above</a> for details.<p/>
///
/// </summary>
/// <param name="doc">the document to be added</param>
/// <throws>  CorruptIndexException if the index is corrupt </throws>
/// <throws>  IOException if there is a low-level IO error </throws>
public virtual void AddDocument(Document doc)
{
    AddDocument(doc, analyzer);
}
/// <summary> Updates a document by first deleting the document(s)
/// containing <code>term</code> and then adding the new
/// document. The delete and then add are atomic as seen
/// by a reader on the same index (flush may happen only after
/// the add).
///
/// <p/>If <c>docWriter.UpdateDocument</c> throws, any files reported by
/// <c>docWriter.AbortedFiles()</c> are handed to the deleter before the
/// exception continues; an OutOfMemoryException is routed to <c>HandleOOM</c>.<p/>
///
/// <p/><b>NOTE</b>: if this method hits an OutOfMemoryError
/// you should immediately close the writer. See <a
/// href="#OOME">above</a> for details.<p/>
///
/// </summary>
/// <param name="term">the term to identify the document(s) to be
/// deleted
/// </param>
/// <param name="doc">the document to be added
/// </param>
/// <param name="analyzer">the analyzer to use when analyzing the document
/// </param>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void UpdateDocument(Term term, Document doc, Analyzer analyzer)
{
    EnsureOpen();
    try
    {
        bool shouldFlush = false;
        bool completed = false;
        try
        {
            shouldFlush = docWriter.UpdateDocument(term, doc, analyzer);
            completed = true;
        }
        finally
        {
            if (!completed)
            {
                if (infoStream != null)
                {
                    Message("hit exception updating document");
                }

                lock (this)
                {
                    // If docWriter has some aborted files that were
                    // never incref'd, then we clean them up here
                    System.Collections.Generic.ICollection<string> abortedFiles = docWriter.AbortedFiles();
                    if (abortedFiles != null)
                    {
                        deleter.DeleteNewFiles(abortedFiles);
                    }
                }
            }
        }

        if (shouldFlush)
        {
            Flush(true, false, false);
        }
    }
    catch (System.OutOfMemoryException oom)
    {
        HandleOOM(oom, "updateDocument");
    }
}
/// <summary> Updates a document by first deleting the document(s)
/// containing <code>term</code> and then adding the new
/// document, using the analyzer returned by <c>GetAnalyzer()</c>.
/// The delete and then add are atomic as seen
/// by a reader on the same index (flush may happen only after
/// the add).
///
/// <p/><b>NOTE</b>: if this method hits an OutOfMemoryError
/// you should immediately close the writer. See <a
/// href="#OOME">above</a> for details.<p/>
///
/// </summary>
/// <param name="term">the term to identify the document(s) to be
/// deleted
/// </param>
/// <param name="doc">the document to be added
/// </param>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void UpdateDocument(Term term, Document doc)
{
    // Check the writer is open before asking for its analyzer.
    EnsureOpen();
    Analyzer defaultAnalyzer = GetAnalyzer();
    UpdateDocument(term, doc, defaultAnalyzer);
}
/// <summary> Convenience overload taking the delete term first; forwards to
/// <c>UpdateDocument(doc, analyzer, t)</c> and returns its result unchanged.
/// </summary>
/// <param name="t">term passed on as the delete term</param>
/// <param name="doc">the document to add</param>
/// <param name="analyzer">the analyzer to use for this document</param>
internal bool UpdateDocument(Term t, Document doc, Analyzer analyzer)
{
    bool flushNow = UpdateDocument(doc, analyzer, t);
    return flushNow;
}
/// <summary> Adds a document to this index, analyzing it with the supplied
/// analyzer rather than the one given to the constructor. If the document
/// contains more than {@link #SetMaxFieldLength(int)} terms for a given field,
/// the remainder are discarded. A null <c>docAnalyzer</c> falls back to the
/// single-argument <c>indexWriter.AddDocument(doc)</c> call.
/// </summary>
/// <seealso cref="IndexWriter.AddDocument(Document, Analyzer)">
/// </seealso>
/// <throws> IllegalStateException if the index is closed </throws>
/// <throws> CorruptIndexException if the index is corrupt </throws>
/// <throws> LockObtainFailedException if another writer has this index open
/// (<code>write.lock</code> could not be obtained) </throws>
/// <throws> IOException if there is a low-level IO error </throws>
public virtual void AddDocument(Document doc, Analyzer docAnalyzer)
{
    lock (directory)
    {
        AssureOpen();
        CreateIndexWriter();

        if (docAnalyzer == null)
        {
            indexWriter.AddDocument(doc);
        }
        else
        {
            indexWriter.AddDocument(doc, docAnalyzer);
        }
    }
}
/// <summary>Adds field info for every field of a Document. </summary>
/// <param name="doc">document whose fields are registered, in <c>GetFields()</c> order</param>
public void Add(Document doc)
{
    lock (this)
    {
        foreach (Fieldable docField in doc.GetFields())
        {
            // The seventh argument is always false on this path
            // (presumably the store-payloads flag -- confirm against the Add overload).
            Add(
                docField.Name(),
                docField.IsIndexed(),
                docField.IsTermVectorStored(),
                docField.IsStorePositionWithTermVector(),
                docField.IsStoreOffsetWithTermVector(),
                docField.GetOmitNorms(),
                false,
                docField.GetOmitTf());
        }
    }
}
/// <summary> Indexes one document, optionally buffering a delete term first.
/// Acquires a thread state, runs its consumer over the document, and finishes
/// the document. If processing fails, the failure path either aborts or queues
/// a skip entry and marks the docID deleted before the exception continues.
/// </summary>
/// <param name="doc">the document to index</param>
/// <param name="analyzer">the analyzer stored on the doc state for this document</param>
/// <param name="delTerm">optional delete term passed to <c>GetThreadState</c>; may be null</param>
/// <returns> <c>state.doFlushAfter || TimeToFlushDeletes()</c> on the normal path </returns>
internal bool UpdateDocument(Document doc, Analyzer analyzer, Term delTerm)
{
    // This call is synchronized but fast
    DocumentsWriterThreadState state = GetThreadState(doc, delTerm);
    DocState docState = state.docState;
    docState.doc = doc;
    docState.analyzer = analyzer;
    bool doReturnFalse = false; // {{Aroush-2.9}} to handle return from finally clause
    bool success = false;
    try
    {
        // This call is not synchronized and does all the
        // work
        DocWriter perDoc;
        try
        {
            perDoc = state.consumer.ProcessDocument();
        }
        finally
        {
            // Always release the doc/analyzer references, even on failure.
            docState.Clear();
        }
        // This call is synchronized but fast
        FinishDocument(state, perDoc);
        success = true;
    }
    finally
    {
        if (!success)
        {
            lock (this)
            {
                if (aborting)
                {
                    state.isIdle = true;
                    System.Threading.Monitor.PulseAll(this);
                    Abort();
                }
                else
                {
                    // Queue a placeholder for this docID in the wait queue.
                    skipDocWriter.docID = docState.docID;
                    bool success2 = false;
                    try
                    {
                        waitQueue.Add(skipDocWriter);
                        success2 = true;
                    }
                    finally
                    {
                        if (!success2)
                        {
                            state.isIdle = true;
                            System.Threading.Monitor.PulseAll(this);
                            Abort();
                            // return false; // {{Aroush-2.9}} this 'return false' is move to outside finally
                            // NOTE(review): this flag is only set while an exception from
                            // waitQueue.Add is propagating, and a C# finally block does not
                            // swallow it, so the 'return false' at the end of the method
                            // looks unreachable on this path -- confirm the intended behavior.
                            doReturnFalse = true;
                        }
                    }
                    if (!doReturnFalse) // {{Aroush-2.9}} added because of the above 'return false' removal
                    {
                        state.isIdle = true;
                        System.Threading.Monitor.PulseAll(this);
                        // If this thread state had decided to flush, we
                        // must clear it so another thread can flush
                        if (state.doFlushAfter)
                        {
                            state.doFlushAfter = false;
                            flushPending = false;
                            System.Threading.Monitor.PulseAll(this);
                        }
                        // Immediately mark this document as deleted
                        // since likely it was partially added.  This
                        // keeps indexing as "all or none" (atomic) when
                        // adding a document:
                        AddDeleteDocID(state.docState.docID);
                    }
                }
            }
        }
    }
    if (doReturnFalse) // {{Aroush-2.9}} see comment above
    {
        return false;
    }
    return state.doFlushAfter || TimeToFlushDeletes();
}