private int CopyFieldsNoDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int maxDoc = reader.MaxDoc;
    int docCount = 0;
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        while (docCount < maxDoc)
        {
            int len = Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
            AddRawDocuments(stream, rawDocLengths, len);
            docCount += len;
            mergeState.CheckAbort.Work(300 * len);
        }
    }
    else
    {
        for (; docCount < maxDoc; docCount++)
        {
            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.addDocument; see LUCENE-1282
            Document doc = reader.Document(docCount);
            AddDocument(doc, mergeState.FieldInfos);
            mergeState.CheckAbort.Work(300);
        }
    }
    return docCount;
}
public override void Warm(AtomicReader reader)
{
    if (VERBOSE)
    {
        Console.WriteLine("TEST: now warm merged reader=" + reader);
    }
    outerInstance.warmed[((SegmentReader)reader).core] = true;
    int maxDoc = reader.MaxDoc;
    IBits liveDocs = reader.LiveDocs;
    int sum = 0;
    int inc = Math.Max(1, maxDoc / 50);
    for (int docID = 0; docID < maxDoc; docID += inc)
    {
        if (liveDocs == null || liveDocs.Get(docID))
        {
            Document doc = reader.Document(docID);
            sum += doc.Fields.Count;
        }
    }

    IndexSearcher searcher =
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        outerInstance.
#endif
        NewSearcher(reader);

    sum += searcher.Search(new TermQuery(new Term("body", "united")), 10).TotalHits;

    if (VERBOSE)
    {
        Console.WriteLine("TEST: warm visited " + sum + " fields");
    }
}
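The Warm override above is the body of a merged-segment warmer. For context, here is a minimal sketch of how such a warmer might be wired into an IndexWriter. This is an illustration, not code from the snippets above: the SimpleWarmer and WarmerSetup names are hypothetical, and the MergedSegmentWarmer property is assumed to be the Lucene.NET counterpart of Java's IndexWriterConfig.setMergedSegmentWarmer.

```csharp
using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

// Hypothetical warmer: touches stored fields of the merged segment so its data
// is hot before the reader is published to searchers.
internal sealed class SimpleWarmer : IndexWriter.IndexReaderWarmer
{
    public override void Warm(AtomicReader reader)
    {
        IBits liveDocs = reader.LiveDocs;
        int maxDoc = reader.MaxDoc;
        int inc = Math.Max(1, maxDoc / 50);
        for (int docID = 0; docID < maxDoc; docID += inc)
        {
            if (liveDocs == null || liveDocs.Get(docID))
            {
                reader.Document(docID); // loads stored fields for this doc
            }
        }
    }
}

public static class WarmerSetup
{
    public static IndexWriter CreateWriter(Directory dir)
    {
        Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        IndexWriterConfig conf = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);

        // Assumed property name; corresponds to setMergedSegmentWarmer in Java Lucene.
        conf.MergedSegmentWarmer = new SimpleWarmer();

        return new IndexWriter(dir, conf);
    }
}
```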
private int CopyFieldsWithDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int docCount = 0;
    int maxDoc = reader.MaxDoc;
    IBits liveDocs = reader.LiveDocs;
    Debug.Assert(liveDocs != null);
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        for (int j = 0; j < maxDoc;)
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                ++j;
                continue;
            }
            // We can optimize this case (doing a bulk byte copy) since the field
            // numbers are identical
            int start = j, numDocs = 0;
            do
            {
                j++;
                numDocs++;
                if (j >= maxDoc)
                {
                    break;
                }
                if (!liveDocs.Get(j))
                {
                    j++;
                    break;
                }
            } while (numDocs < MAX_RAW_MERGE_DOCS);

            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
            AddRawDocuments(stream, rawDocLengths, numDocs);
            docCount += numDocs;
            mergeState.CheckAbort.Work(300 * numDocs);
        }
    }
    else
    {
        for (int j = 0; j < maxDoc; j++)
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                continue;
            }
            // TODO: this could be more efficient using
            // FieldVisitor instead of loading/writing entire
            // doc; ie we just have to renumber the field number
            // on the fly?
            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.addDocument; see LUCENE-1282
            Document doc = reader.Document(j);
            AddDocument(doc, mergeState.FieldInfos);
            docCount++;
            mergeState.CheckAbort.Work(300);
        }
    }
    return docCount;
}
public virtual void TestDocsEnum()
{
    IBits mappedLiveDocs = RandomLiveDocs(reader.MaxDoc);
    TermsEnum termsEnum = reader.GetTerms(DOCS_ENUM_FIELD).GetIterator(null);
    assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOCS_ENUM_TERM)));
    DocsEnum docs = termsEnum.Docs(mappedLiveDocs, null);

    int doc;
    int prev = -1;
    while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
    {
        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(doc));
        assertEquals("incorrect value; doc " + doc, sortedValues[doc], int.Parse(reader.Document(doc).Get(ID_FIELD)));
        while (++prev < doc)
        {
            assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
        }
    }
    while (++prev < reader.MaxDoc)
    {
        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
    }

    DocsEnum reuse = docs;
    docs = termsEnum.Docs(mappedLiveDocs, reuse);
    if (docs is SortingAtomicReader.SortingDocsEnum)
    {
        assertTrue(((SortingAtomicReader.SortingDocsEnum)docs).Reused(reuse)); // make sure reuse worked
    }

    doc = -1;
    prev = -1;
    while ((doc = docs.Advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS)
    {
        assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(doc));
        assertEquals("incorrect value; doc " + doc, sortedValues[doc], int.Parse(reader.Document(doc).Get(ID_FIELD)));
        while (++prev < doc)
        {
            assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
        }
    }
    while (++prev < reader.MaxDoc)
    {
        assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
    }
}
/// <summary>
/// Split a given index into 3 indexes for training, test and cross validation tasks respectively
/// </summary>
/// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
/// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
/// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
/// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
/// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
/// <param name="fieldNames">names of fields that need to be put in the new indexes or <c>null</c> if all should be used</param>
/// <exception cref="IOException">if any writing operation fails on any of the indexes</exception>
public virtual void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
{
#pragma warning disable 612, 618
    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
#pragma warning restore 612, 618

    try
    {
        int size = originalIndex.MaxDoc;
        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.Search(new MatchAllDocsQuery(), int.MaxValue);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = true;
        ft.StoreTermVectorPositions = true;

        int b = 0;

        // iterate over existing documents
        foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
        {
            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.Length > 0)
            {
                foreach (string fieldName in fieldNames)
                {
                    doc.Add(new Field(fieldName, originalIndex.Document(scoreDoc.Doc).GetField(fieldName).ToString(), ft));
                }
            }
            else
            {
                foreach (IIndexableField storableField in originalIndex.Document(scoreDoc.Doc).Fields)
                {
                    if (storableField.GetReaderValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetReaderValue(), ft));
                    }
                    else if (storableField.GetBinaryValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetBinaryValue(), ft));
                    }
                    else if (storableField.GetStringValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetStringValue(), ft));
                    }
                    else if (storableField.NumericType != NumericFieldType.NONE) // LUCENENET specific - checking the NumericType property is quicker than the type conversion
                    {
                        // LUCENENET specific - need to pass invariant culture here (we are assuming the Field will be stored)
                        // and we need to round-trip floating point numbers so we don't lose precision.
                        if (storableField.NumericType == NumericFieldType.SINGLE || storableField.NumericType == NumericFieldType.DOUBLE)
                        {
                            // LUCENENET: Need to specify the "R" for round-trip: http://stackoverflow.com/a/611564
                            doc.Add(new Field(storableField.Name, storableField.GetStringValue("R", CultureInfo.InvariantCulture), ft));
                        }
                        else
                        {
                            doc.Add(new Field(storableField.Name, storableField.GetStringValue(CultureInfo.InvariantCulture), ft));
                        }
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.MaxDoc < size * testRatio)
            {
                testWriter.AddDocument(doc);
            }
            else if (cvWriter.MaxDoc < size * crossValidationRatio)
            {
                cvWriter.AddDocument(doc);
            }
            else
            {
                trainingWriter.AddDocument(doc);
            }
            b++;
        }
    }
    catch (Exception e) when (e.IsException())
    {
        throw new IOException("Exception in DatasetSplitter", e);
    }
    finally
    {
        testWriter.Commit();
        cvWriter.Commit();
        trainingWriter.Commit();

        // close IWs
        testWriter.Dispose();
        cvWriter.Dispose();
        trainingWriter.Dispose();
    }
}
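The Split method above expects an AtomicReader for the source index plus three output Directory instances. Here is a minimal, hypothetical usage sketch: the DatasetSplitter(testRatio, crossValidationRatio) constructor and the Lucene.Net.Classification.Utils namespace are assumed from the Java original, and the paths and the "body" field name are placeholders.

```csharp
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Classification.Utils;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;

public static class SplitExample
{
    public static void Run()
    {
        // Open the source index and expose it as a single AtomicReader, which is
        // the reader type Split expects.
        using Directory sourceDir = FSDirectory.Open("/path/to/source-index");
        using DirectoryReader dirReader = DirectoryReader.Open(sourceDir);
        AtomicReader original = SlowCompositeReaderWrapper.Wrap(dirReader);

        // Target directories for the three output indexes (paths are placeholders).
        using Directory trainingDir = FSDirectory.Open("/path/to/train-index");
        using Directory testDir = FSDirectory.Open("/path/to/test-index");
        using Directory cvDir = FSDirectory.Open("/path/to/cv-index");

        Analyzer analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);

        // Assumed constructor: 20% test, 10% cross validation, the rest training
        // (mirrors the Java DatasetSplitter(testRatio, crossValidationRatio)).
        DatasetSplitter splitter = new DatasetSplitter(0.2, 0.1);

        // Copy only the "body" field into the new indexes; omit the field names
        // to copy every stored field instead.
        splitter.Split(original, trainingDir, testDir, cvDir, analyzer, "body");
    }
}
```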
/// <summary>
/// Split a given index into 3 indexes for training, test and cross validation tasks respectively
/// </summary>
/// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
/// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
/// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
/// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
/// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
/// <param name="fieldNames">names of fields that need to be put in the new indexes or <c>null</c> if all should be used</param>
/// <exception cref="IOException">if any writing operation fails on any of the indexes</exception>
public virtual void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
{
#pragma warning disable 612, 618
    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
#pragma warning restore 612, 618

    try
    {
        int size = originalIndex.MaxDoc;
        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.Search(new MatchAllDocsQuery(), Int32.MaxValue);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = true;
        ft.StoreTermVectorPositions = true;

        int b = 0;

        // iterate over existing documents
        foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
        {
            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.Length > 0)
            {
                foreach (string fieldName in fieldNames)
                {
                    doc.Add(new Field(fieldName, originalIndex.Document(scoreDoc.Doc).GetField(fieldName).ToString(), ft));
                }
            }
            else
            {
                foreach (IIndexableField storableField in originalIndex.Document(scoreDoc.Doc).Fields)
                {
                    if (storableField.GetReaderValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetReaderValue(), ft));
                    }
                    else if (storableField.GetBinaryValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetBinaryValue(), ft));
                    }
                    else if (storableField.GetStringValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetStringValue(), ft));
                    }
                    else if (storableField.GetNumericValue() != null)
                    {
                        doc.Add(new Field(storableField.Name, storableField.GetNumericValue().ToString(), ft));
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.MaxDoc < size * _testRatio)
            {
                testWriter.AddDocument(doc);
            }
            else if (cvWriter.MaxDoc < size * _crossValidationRatio)
            {
                cvWriter.AddDocument(doc);
            }
            else
            {
                trainingWriter.AddDocument(doc);
            }
            b++;
        }
    }
    catch (Exception e)
    {
        throw new IOException("Exception in DatasetSplitter", e);
    }
    finally
    {
        testWriter.Commit();
        cvWriter.Commit();
        trainingWriter.Commit();

        // close IWs
        testWriter.Dispose();
        cvWriter.Dispose();
        trainingWriter.Dispose();
    }
}
private int CopyFieldsNoDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int maxDoc = reader.MaxDoc();
    int docCount = 0;
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        while (docCount < maxDoc)
        {
            int len = Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
            AddRawDocuments(stream, rawDocLengths, len);
            docCount += len;
            mergeState.checkAbort.Work(300 * len);
        }
    }
    else
    {
        for (; docCount < maxDoc; docCount++)
        {
            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.addDocument; see LUCENE-1282
            Document doc = reader.Document(docCount);
            AddDocument(doc, mergeState.FieldInfos);
            mergeState.checkAbort.Work(300);
        }
    }
    return docCount;
}
private int CopyFieldsWithDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
{
    int docCount = 0;
    int maxDoc = reader.MaxDoc();
    Bits liveDocs = reader.LiveDocs;
    Debug.Assert(liveDocs != null);
    if (matchingFieldsReader != null)
    {
        // We can bulk-copy because the fieldInfos are "congruent"
        for (int j = 0; j < maxDoc;)
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                ++j;
                continue;
            }
            // We can optimize this case (doing a bulk byte copy) since the field
            // numbers are identical
            int start = j, numDocs = 0;
            do
            {
                j++;
                numDocs++;
                if (j >= maxDoc)
                {
                    break;
                }
                if (!liveDocs.Get(j))
                {
                    j++;
                    break;
                }
            } while (numDocs < MAX_RAW_MERGE_DOCS);

            IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
            AddRawDocuments(stream, rawDocLengths, numDocs);
            docCount += numDocs;
            mergeState.checkAbort.Work(300 * numDocs);
        }
    }
    else
    {
        for (int j = 0; j < maxDoc; j++)
        {
            if (!liveDocs.Get(j))
            {
                // skip deleted docs
                continue;
            }
            // TODO: this could be more efficient using
            // FieldVisitor instead of loading/writing entire
            // doc; ie we just have to renumber the field number
            // on the fly?
            // NOTE: it's very important to first assign to doc then pass it to
            // fieldsWriter.addDocument; see LUCENE-1282
            Document doc = reader.Document(j);
            AddDocument(doc, mergeState.FieldInfos);
            docCount++;
            mergeState.checkAbort.Work(300);
        }
    }
    return docCount;
}