Example #1
        private int CopyFieldsNoDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
        {
            int maxDoc   = reader.MaxDoc;
            int docCount = 0;

            if (matchingFieldsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                while (docCount < maxDoc)
                {
                    int        len    = Math.Min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, docCount, len);
                    AddRawDocuments(stream, rawDocLengths, len);
                    docCount += len;
                    mergeState.CheckAbort.Work(300 * len);
                }
            }
            else
            {
                for (; docCount < maxDoc; docCount++)
                {
                    // NOTE: it's very important to first assign to doc then pass it to
                    // fieldsWriter.addDocument; see LUCENE-1282
                    Document doc = reader.Document(docCount);
                    AddDocument(doc, mergeState.FieldInfos);
                    mergeState.CheckAbort.Work(300);
                }
            }
            return docCount;
        }
Example #2
            public override void Warm(AtomicReader reader)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: now warm merged reader=" + reader);
                }
                outerInstance.warmed[((SegmentReader)reader).core] = true;
                int   maxDoc   = reader.MaxDoc;
                IBits liveDocs = reader.LiveDocs;
                int   sum      = 0;
                int   inc      = Math.Max(1, maxDoc / 50);

                for (int docID = 0; docID < maxDoc; docID += inc)
                {
                    if (liveDocs == null || liveDocs.Get(docID))
                    {
                        Document doc = reader.Document(docID);
                        sum += doc.Fields.Count;
                    }
                }
                IndexSearcher searcher =
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                    outerInstance.
#endif
                    NewSearcher(reader);

                sum += searcher.Search(new TermQuery(new Term("body", "united")), 10).TotalHits;

                if (VERBOSE)
                {
                    Console.WriteLine("TEST: warm visited " + sum + " fields");
                }
            }
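
For context, a Warm override like the one above lives in an IndexWriter.IndexReaderWarmer subclass that the test installs on its writer configuration. The sketch below shows one hypothetical way to wire it up; MyWarmer, dir, and the analyzer choice are illustrative, and the exact configuration property may vary across Lucene.NET versions.

        // Hypothetical wiring for a merged-segment warmer like the Warm override above.
        // "MyWarmer" and "dir" are illustrative names; Warm(AtomicReader) is invoked on each
        // newly merged segment before it becomes visible to searches.
        var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, new StandardAnalyzer(LuceneVersion.LUCENE_48))
        {
            MergedSegmentWarmer = new MyWarmer() // subclass of IndexWriter.IndexReaderWarmer
        };
        using (var writer = new IndexWriter(dir, config))
        {
            // index and merge as usual; the warmer runs after each merge completes
        }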
Example #3
        private int CopyFieldsWithDeletions(MergeState mergeState, AtomicReader reader, Lucene40StoredFieldsReader matchingFieldsReader, int[] rawDocLengths)
        {
            int   docCount = 0;
            int   maxDoc   = reader.MaxDoc;
            IBits liveDocs = reader.LiveDocs;

            Debug.Assert(liveDocs != null);
            if (matchingFieldsReader != null)
            {
                // We can bulk-copy because the fieldInfos are "congruent"
                for (int j = 0; j < maxDoc;)
                {
                    if (!liveDocs.Get(j))
                    {
                        // skip deleted docs
                        ++j;
                        continue;
                    }
                    // We can optimize this case (doing a bulk byte copy) since the field
                    // numbers are identical
                    int start = j, numDocs = 0;
                    do
                    {
                        j++;
                        numDocs++;
                        if (j >= maxDoc)
                        {
                            break;
                        }
                        if (!liveDocs.Get(j))
                        {
                            j++;
                            break;
                        }
                    } while (numDocs < MAX_RAW_MERGE_DOCS);

                    IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
                    AddRawDocuments(stream, rawDocLengths, numDocs);
                    docCount += numDocs;
                    mergeState.CheckAbort.Work(300 * numDocs);
                }
            }
            else
            {
                for (int j = 0; j < maxDoc; j++)
                {
                    if (!liveDocs.Get(j))
                    {
                        // skip deleted docs
                        continue;
                    }
                    // TODO: this could be more efficient using
                    // FieldVisitor instead of loading/writing entire
                    // doc; ie we just have to renumber the field number
                    // on the fly?
                    // NOTE: it's very important to first assign to doc then pass it to
                    // fieldsWriter.addDocument; see LUCENE-1282
                    Document doc = reader.Document(j);
                    AddDocument(doc, mergeState.FieldInfos);
                    docCount++;
                    mergeState.CheckAbort.Work(300);
                }
            }
            return docCount;
        }
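
Examples #1 and #3 are the two copy paths of a stored-fields merge: a raw bulk copy when a matching reader with congruent field numbers is available, and a per-document copy otherwise, with Example #3 additionally filtering out deleted documents. The sketch below is a simplified, hypothetical Merge override showing how the two paths could be selected; resolving the matching reader is codec-specific and elided here, and the exact MergeState members may differ by version.

        // Hypothetical, simplified dispatch between the two copy paths shown above.
        // How "matchingFieldsReader" is resolved (same codec, congruent field numbers) is elided.
        public override int Merge(MergeState mergeState)
        {
            int docCount = 0;
            int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS]; // scratch buffer reused per reader

            foreach (AtomicReader reader in mergeState.Readers)
            {
                Lucene40StoredFieldsReader matchingFieldsReader = null; // assume resolved elsewhere

                if (reader.LiveDocs != null)
                {
                    // segment has deletions: copy only live documents
                    docCount += CopyFieldsWithDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
                }
                else
                {
                    // no deletions: every document can be bulk-copied or re-added
                    docCount += CopyFieldsNoDeletions(mergeState, reader, matchingFieldsReader, rawDocLengths);
                }
            }
            Finish(mergeState.FieldInfos, docCount);
            return docCount;
        }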
Example #4
        public virtual void TestDocsEnum()
        {
            IBits     mappedLiveDocs = RandomLiveDocs(reader.MaxDoc);
            TermsEnum termsEnum      = reader.GetTerms(DOCS_ENUM_FIELD).GetIterator(null);

            assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOCS_ENUM_TERM)));
            DocsEnum docs = termsEnum.Docs(mappedLiveDocs, null);

            int doc;
            int prev = -1;

            while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(doc));
                assertEquals("incorrect value; doc " + doc, sortedValues[doc], int.Parse(reader.Document(doc).Get(ID_FIELD)));
                while (++prev < doc)
                {
                    assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
                }
            }
            while (++prev < reader.MaxDoc)
            {
                assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
            }

            DocsEnum reuse = docs;

            docs = termsEnum.Docs(mappedLiveDocs, reuse);
            if (docs is SortingAtomicReader.SortingDocsEnum)
            {
                assertTrue(((SortingAtomicReader.SortingDocsEnum)docs).Reused(reuse)); // make sure reuse worked
            }
            doc  = -1;
            prev = -1;
            while ((doc = docs.Advance(doc + 1)) != DocIdSetIterator.NO_MORE_DOCS)
            {
                assertTrue("document " + doc + " marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(doc));
                assertEquals("incorrect value; doc " + doc, sortedValues[doc], int.Parse(reader.Document(doc).Get(ID_FIELD)));
                while (++prev < doc)
                {
                    assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
                }
            }
            while (++prev < reader.MaxDoc)
            {
                assertFalse("document " + prev + " not marked as deleted", mappedLiveDocs == null || mappedLiveDocs.Get(prev));
            }
        }
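
The test above depends on a RandomLiveDocs helper supplied by its base class to simulate deletions (and, given the null checks, that helper apparently may also return null to exercise the no-deletions path). A hypothetical equivalent, which may differ from the real SorterTestBase implementation, is sketched below.

        // Hypothetical stand-in for the RandomLiveDocs helper used above: marks a random
        // ~90% of documents as live; the real helper may also return null for "no deletions".
        private static IBits RandomLiveDocs(int maxDoc)
        {
            var liveDocs = new FixedBitSet(maxDoc); // Lucene.Net.Util.FixedBitSet implements IBits
            var random = new Random();
            for (int docID = 0; docID < maxDoc; docID++)
            {
                if (random.NextDouble() < 0.9)
                {
                    liveDocs.Set(docID);
                }
            }
            return liveDocs;
        }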
Example #5
        /// <summary>
        /// Split a given index into 3 indexes for training, test and cross validation tasks respectively
        /// </summary>
        /// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
        /// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
        /// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
        /// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
        /// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
        /// <param name="fieldNames">names of fields that need to be put in the new indexes or <c>null</c> if all should be used</param>
        /// <exception cref="IOException">if any writing operation fails on any of the indexes</exception>
        public virtual void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
        {
#pragma warning disable 612, 618
            // create IWs for train / test / cv IDXs
            IndexWriter testWriter     = new IndexWriter(testIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
            IndexWriter cvWriter       = new IndexWriter(crossValidationIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
            IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
#pragma warning restore 612, 618

            try
            {
                int size = originalIndex.MaxDoc;

                IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
                TopDocs       topDocs       = indexSearcher.Search(new MatchAllDocsQuery(), int.MaxValue);

                // set the type to be indexed, stored, with term vectors
                FieldType ft = new FieldType(TextField.TYPE_STORED);
                ft.StoreTermVectors         = true;
                ft.StoreTermVectorOffsets   = true;
                ft.StoreTermVectorPositions = true;

                int b = 0;

                // iterate over existing documents
                foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
                {
                    // create a new document for indexing
                    Document doc = new Document();
                    if (fieldNames != null && fieldNames.Length > 0)
                    {
                        foreach (string fieldName in fieldNames)
                        {
                            doc.Add(new Field(fieldName, originalIndex.Document(scoreDoc.Doc).GetField(fieldName).ToString(), ft));
                        }
                    }
                    else
                    {
                        foreach (IIndexableField storableField in originalIndex.Document(scoreDoc.Doc).Fields)
                        {
                            if (storableField.GetReaderValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetReaderValue(), ft));
                            }
                            else if (storableField.GetBinaryValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetBinaryValue(), ft));
                            }
                            else if (storableField.GetStringValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetStringValue(), ft));
                            }
                            else if (storableField.NumericType != NumericFieldType.NONE) // LUCENENET specific - checking the NumericType property is quicker than the type conversion
                            {
                                // LUCENENET specific - need to pass invariant culture here (we are assuming the Field will be stored)
                                // and we need to round-trip floating point numbers so we don't lose precision.
                                if (storableField.NumericType == NumericFieldType.SINGLE || storableField.NumericType == NumericFieldType.DOUBLE)
                                {
                                    // LUCENENET: Need to specify the "R" for round-trip: http://stackoverflow.com/a/611564
                                    doc.Add(new Field(storableField.Name, storableField.GetStringValue("R", CultureInfo.InvariantCulture), ft));
                                }
                                else
                                {
                                    doc.Add(new Field(storableField.Name, storableField.GetStringValue(CultureInfo.InvariantCulture), ft));
                                }
                            }
                        }
                    }

                    // add it to one of the IDXs
                    if (b % 2 == 0 && testWriter.MaxDoc < size * testRatio)
                    {
                        testWriter.AddDocument(doc);
                    }
                    else if (cvWriter.MaxDoc < size * crossValidationRatio)
                    {
                        cvWriter.AddDocument(doc);
                    }
                    else
                    {
                        trainingWriter.AddDocument(doc);
                    }
                    b++;
                }
            }
            catch (Exception e) when(e.IsException())
            {
                throw new IOException("Exceptio in DatasetSplitter", e);
            }
            finally
            {
                testWriter.Commit();
                cvWriter.Commit();
                trainingWriter.Commit();
                // close IWs
                testWriter.Dispose();
                cvWriter.Dispose();
                trainingWriter.Dispose();
            }
        }
Example #6
        /// <summary>
        /// Split a given index into 3 indexes for training, test and cross validation tasks respectively
        /// </summary>
        /// <param name="originalIndex">an <see cref="AtomicReader"/> on the source index</param>
        /// <param name="trainingIndex">a <see cref="Directory"/> used to write the training index</param>
        /// <param name="testIndex">a <see cref="Directory"/> used to write the test index</param>
        /// <param name="crossValidationIndex">a <see cref="Directory"/> used to write the cross validation index</param>
        /// <param name="analyzer"><see cref="Analyzer"/> used to create the new docs</param>
        /// <param name="fieldNames">names of fields that need to be put in the new indexes or <c>null</c> if all should be used</param>
        /// <exception cref="IOException">if any writing operation fails on any of the indexes</exception>
        public virtual void Split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex, Analyzer analyzer, params string[] fieldNames)
        {
#pragma warning disable 612, 618
            // create IWs for train / test / cv IDXs
            IndexWriter testWriter     = new IndexWriter(testIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
            IndexWriter cvWriter       = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
            IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Util.LuceneVersion.LUCENE_CURRENT, analyzer));
#pragma warning restore 612, 618

            try
            {
                int size = originalIndex.MaxDoc;

                IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
                TopDocs       topDocs       = indexSearcher.Search(new MatchAllDocsQuery(), Int32.MaxValue);

                // set the type to be indexed, stored, with term vectors
                FieldType ft = new FieldType(TextField.TYPE_STORED);
                ft.StoreTermVectors         = true;
                ft.StoreTermVectorOffsets   = true;
                ft.StoreTermVectorPositions = true;

                int b = 0;

                // iterate over existing documents
                foreach (ScoreDoc scoreDoc in topDocs.ScoreDocs)
                {
                    // create a new document for indexing
                    Document doc = new Document();
                    if (fieldNames != null && fieldNames.Length > 0)
                    {
                        foreach (string fieldName in fieldNames)
                        {
                            doc.Add(new Field(fieldName, originalIndex.Document(scoreDoc.Doc).GetField(fieldName).ToString(), ft));
                        }
                    }
                    else
                    {
                        foreach (IIndexableField storableField in originalIndex.Document(scoreDoc.Doc).Fields)
                        {
                            if (storableField.GetReaderValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetReaderValue(), ft));
                            }
                            else if (storableField.GetBinaryValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetBinaryValue(), ft));
                            }
                            else if (storableField.GetStringValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetStringValue(), ft));
                            }
                            else if (storableField.GetNumericValue() != null)
                            {
                                doc.Add(new Field(storableField.Name, storableField.GetNumericValue().ToString(), ft));
                            }
                        }
                    }

                    // add it to one of the IDXs
                    if (b % 2 == 0 && testWriter.MaxDoc < size * _testRatio)
                    {
                        testWriter.AddDocument(doc);
                    }
                    else if (cvWriter.MaxDoc < size * _crossValidationRatio)
                    {
                        cvWriter.AddDocument(doc);
                    }
                    else
                    {
                        trainingWriter.AddDocument(doc);
                    }
                    b++;
                }
            }
            catch (Exception e)
            {
                throw new IOException("Exceptio in DatasetSplitter", e);
            }
            finally
            {
                testWriter.Commit();
                cvWriter.Commit();
                trainingWriter.Commit();
                // close IWs
                testWriter.Dispose();
                cvWriter.Dispose();
                trainingWriter.Dispose();
            }
        }
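
A hypothetical call to the Split method shown in Examples #5 and #6 might look like the following. sourceDir, trainingDir, testDir, and crossValidationDir are illustrative Directory instances, and the DatasetSplitter constructor arguments are assumed to be the test and cross-validation fractions of the source index.

        // Hypothetical usage of DatasetSplitter.Split; directory names and the constructor
        // ratios are assumptions, not taken from the examples above.
        using (var originalReader = SlowCompositeReaderWrapper.Wrap(DirectoryReader.Open(sourceDir)))
        {
            var splitter = new DatasetSplitter(0.1, 0.1); // 10% test, 10% cross validation, rest training
            splitter.Split(
                originalReader,
                trainingDir, testDir, crossValidationDir,
                new StandardAnalyzer(LuceneVersion.LUCENE_48),
                "body"); // limit to the "body" field; omit field names to copy all stored fields
        }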