private int CopyFieldsWithDeletions(FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader) { int docCount = 0; int maxDoc = reader.MaxDoc; if (matchingFieldsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int j = 0; j < maxDoc;) { if (reader.IsDeleted(j)) { // skip deleted docs ++j; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = j, numDocs = 0; do { j++; numDocs++; if (j >= maxDoc) { break; } if (reader.IsDeleted(j)) { j++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs); fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs); docCount += numDocs; checkAbort.Work(300 * numDocs); } } else { for (int j = 0; j < maxDoc; j++) { if (reader.IsDeleted(j)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to doc then pass it to // fieldsWriter.AddDocument; see LUCENE-1282 Document doc = reader.Document(j); fieldsWriter.AddDocument(doc); docCount++; checkAbort.Work(300); } } return(docCount); }
private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader) { int maxDoc = reader.MaxDoc; if (matchingVectorsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int docNum = 0; docNum < maxDoc;) { if (reader.IsDeleted(docNum)) { // skip deleted docs ++docNum; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = docNum, numDocs = 0; do { docNum++; numDocs++; if (docNum >= maxDoc) { break; } if (reader.IsDeleted(docNum)) { docNum++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs); termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs); checkAbort.Work(300 * numDocs); } } else { for (int docNum = 0; docNum < maxDoc; docNum++) { if (reader.IsDeleted(docNum)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to vectors then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 ITermFreqVector[] vectors = reader.GetTermFreqVectors(docNum); termVectorsWriter.AddAllDocVectors(vectors); checkAbort.Work(300); } } }
private void MergeNorms() { for (int i = 0; i < fieldInfos.Size(); i++) { FieldInfo fi = fieldInfos.FieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { IndexOutput output = directory.CreateOutput(segment + ".f" + i); try { for (int j = 0; j < readers.Count; j++) { IndexReader reader = (IndexReader)readers[j]; int maxDoc = reader.MaxDoc(); byte[] input = new byte[maxDoc]; reader.Norms(fi.name, input, 0); for (int k = 0; k < maxDoc; k++) { if (!reader.IsDeleted(k)) { output.WriteByte(input[k]); } } } } finally { output.Close(); } } } }
/// <summary> 1. Get a norm from the original reader 2. Clone the original reader 3. /// Delete a document and set the norm of the cloned reader 4. Verify the norms /// are not the same on each reader 5. Verify the doc deleted is only in the /// cloned reader 6. Try to delete a document in the original reader, an /// exception should be thrown /// /// </summary> /// <param name="r1">IndexReader to perform tests on /// </param> /// <throws> Exception </throws> private void PerformDefaultTests(IndexReader r1) { float norm1 = Similarity.DecodeNorm(r1.Norms("field1")[4]); IndexReader pr1Clone = (IndexReader)r1.Clone(); pr1Clone.DeleteDocument(10); pr1Clone.SetNorm(4, "field1", 0.5f); Assert.IsTrue(Similarity.DecodeNorm(r1.Norms("field1")[4]) == norm1); Assert.IsTrue(Similarity.DecodeNorm(pr1Clone.Norms("field1")[4]) != norm1); Assert.IsTrue(!r1.IsDeleted(10)); Assert.IsTrue(pr1Clone.IsDeleted(10)); // try to update the original reader, which should throw an exception try { r1.DeleteDocument(11); Assert.Fail("Tried to delete doc 11 and an exception should have been thrown"); } catch (System.Exception) { // expected } pr1Clone.Close(); }
private void MergeNorms() { byte[] normBuffer = null; IndexOutput output = null; try { for (int i = 0; i < fieldInfos.Size(); i++) { FieldInfo fi = fieldInfos.FieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { if (output == null) { output = directory.CreateOutput(segment + "." + IndexFileNames.NORMS_EXTENSION); output.WriteBytes(NORMS_HEADER, NORMS_HEADER.Length); } for (int j = 0; j < readers.Count; j++) { IndexReader reader = (IndexReader)readers[j]; int maxDoc = reader.MaxDoc(); if (normBuffer == null || normBuffer.Length < maxDoc) { // the buffer is too small for the current segment normBuffer = new byte[maxDoc]; } reader.Norms(fi.name, normBuffer, 0); if (!reader.HasDeletions()) { //optimized case for segments without deleted docs output.WriteBytes(normBuffer, maxDoc); } else { // this segment has deleted docs, so we have to // check for every doc if it is deleted or not for (int k = 0; k < maxDoc; k++) { if (!reader.IsDeleted(k)) { output.WriteByte(normBuffer[k]); } } } if (checkAbort != null) { checkAbort.Work(maxDoc); } } } } } finally { if (output != null) { output.Close(); } } }
/// <summary> Merge the TermVectors from each of the segments into the new one.</summary> /// <throws> IOException </throws> private void MergeVectors() { TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos); try { for (int r = 0; r < readers.Count; r++) { IndexReader reader = (IndexReader)readers[r]; int maxDoc = reader.MaxDoc(); for (int docNum = 0; docNum < maxDoc; docNum++) { // skip deleted docs if (reader.IsDeleted(docNum)) { continue; } termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum)); if (checkAbort != null) { checkAbort.Work(300); } } } } finally { termVectorsWriter.Close(); } }
// maps around deleted docs internal int[] GetDocMap() { if (docMap == null) { delCount = 0; // build array which maps document numbers around deletions if (reader.HasDeletions) { int maxDoc = reader.MaxDoc; docMap = new int[maxDoc]; int j = 0; for (int i = 0; i < maxDoc; i++) { if (reader.IsDeleted(i)) { delCount++; docMap[i] = -1; } else { docMap[i] = j++; } } } } return(docMap); }
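// Hedged usage sketch (not from the sources above): GetDocMap() returns null when the reader
// has no deletions, and otherwise maps each original doc id either to -1 (deleted) or to its
// new, compacted doc number. A helper applying that contract might look like this.
internal static int RemapDocId(int[] docMap, int oldDocId)
{
    if (docMap == null)
    {
        return oldDocId; // no deletions: doc numbers are unchanged
    }
    return docMap[oldDocId]; // -1 signals that the document was deleted
}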
static void MakeLatestVersionLookupPerReader(IDictionary<string, Tuple<NuGetVersion, string, int>> lookup, IndexReader reader, string readerName, bool includePrerelease, bool includeUnlisted) { for (int n = 0; n < reader.MaxDoc; n++) { if (reader.IsDeleted(n)) { continue; } Document document = reader.Document(n); NuGetVersion version = GetVersion(document); if (version == null) { continue; } bool isListed = GetListed(document); if (isListed || includeUnlisted) { if (!version.IsPrerelease || includePrerelease) { string id = GetId(document); if (id == null) { continue; } Tuple<NuGetVersion, string, int> existingVersion; if (lookup.TryGetValue(id, out existingVersion)) { if (version > existingVersion.Item1) { lookup[id] = Tuple.Create(version, readerName, n); } } else { lookup.Add(id, Tuple.Create(version, readerName, n)); } } } } }
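// Hedged helper sketch (an assumption, not part of the NuGet indexing code above): the loop in
// MakeLatestVersionLookupPerReader follows the standard Lucene.NET 3.x pattern of scanning doc
// ids 0..MaxDoc-1 and skipping deleted slots. The same pattern could be factored out like this.
static IEnumerable<int> LiveDocIds(IndexReader reader)
{
    for (int n = 0; n < reader.MaxDoc; n++)
    {
        if (!reader.IsDeleted(n)) // deleted slots keep their doc id but hold no live document
        {
            yield return n;
        }
    }
}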
protected override CustomScoreProvider GetCustomScoreProvider(IndexReader reader) { int maxDoc = reader.MaxDoc; long[] daysAgo = new long[maxDoc]; long[] publishDate = FieldCache_Fields.DEFAULT.GetLongs(reader, SearchDocument.TimeStampFieldName); long currentDate = DateTime.UtcNow.ToFileTime(); long timeSpanFromDaysTicks = TimeSpan.FromDays(36500).Ticks; for (int i = 0; i < maxDoc; i++) { if (!reader.IsDeleted(i)) daysAgo[i] = currentDate - publishDate[i]; } var recencyData = new RecencyOptions(daysAgo, 5, timeSpanFromDaysTicks); return new RecencyScoreProvider(reader, recencyData); }
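// Hedged sketch (hypothetical: RecencyScoreProvider's body and RecencyOptions are not shown
// above). In Lucene.NET 3.x a CustomScoreProvider typically combines the sub-query score with a
// per-document factor; here, a linear recency decay over the precomputed daysAgo array (which,
// despite its name, holds file-time tick differences).
public class ExampleRecencyScoreProvider : CustomScoreProvider
{
    private readonly long[] _ticksAgo;
    private readonly long _maxTicks;

    public ExampleRecencyScoreProvider(IndexReader reader, long[] ticksAgo, long maxTicks)
        : base(reader)
    {
        _ticksAgo = ticksAgo;
        _maxTicks = maxTicks;
    }

    public override float CustomScore(int doc, float subQueryScore, float valSrcScore)
    {
        // Newer documents (small _ticksAgo) get up to a 2x boost; documents older than
        // _maxTicks keep their original score.
        float recency = Math.Max(0f, 1f - (float)_ticksAgo[doc] / _maxTicks);
        return subQueryScore * (1f + recency);
    }
}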
/// <summary> </summary> /// <returns> The number of documents in all of the readers /// </returns> /// <throws> IOException </throws> private int MergeFields() { fieldInfos = new FieldInfos(); // merge field names int docCount = 0; for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false); fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false); } fieldInfos.Write(directory, segment + ".fnm"); FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're // in merge mode, we use this FieldSelector FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this); try { for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; int maxDoc = reader.MaxDoc(); for (int j = 0; j < maxDoc; j++) { if (!reader.IsDeleted(j)) { // skip deleted docs fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge)); docCount++; } } } } finally { fieldsWriter.Close(); } return(docCount); }
void ProcessReader(IndexReader indexReader, string readerName, ref int perIndexDocumentNumber) { for (int perReaderDocumentNumber = 0; perReaderDocumentNumber < indexReader.MaxDoc; perReaderDocumentNumber++) { if (indexReader.IsDeleted(perReaderDocumentNumber)) { ProcessDocument(indexReader, readerName, perReaderDocumentNumber, perIndexDocumentNumber, null, isDelete: true); } else { Document document = indexReader.Document(perReaderDocumentNumber); ProcessDocument(indexReader, readerName, perReaderDocumentNumber, perIndexDocumentNumber, document, isDelete: false); } perIndexDocumentNumber++; } }
public PackageVersions(IndexReader reader) { _reader = reader; _registrations = new Dictionary<string, List<NuGetVersion>>(); for (int i = 0; i < reader.MaxDoc; i++) { if (reader.IsDeleted(i)) { continue; } Document document = reader[i]; NuGetVersion currentVersion = GetVersion(document); if (currentVersion == null) { continue; } string id = GetId(document); if (id == null) { continue; } List<NuGetVersion> versions; if (!_registrations.TryGetValue(id, out versions)) { versions = new List<NuGetVersion>(); _registrations.Add(id, versions); } versions.Add(currentVersion); } foreach (List<NuGetVersion> values in _registrations.Values) { values.Sort(); } }
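// Hedged usage sketch (hypothetical accessor; the original class above only shows its
// constructor): one plausible way to expose the sorted version lists it builds. Assumes
// System.Linq for Enumerable.Empty.
public IEnumerable<NuGetVersion> GetVersions(string id)
{
    List<NuGetVersion> versions;
    // Returns the ascending version list built in the constructor, or an empty sequence.
    return _registrations.TryGetValue(id, out versions)
        ? (IEnumerable<NuGetVersion>)versions
        : Enumerable.Empty<NuGetVersion>();
}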
/// <summary> 1. Get a norm from the original reader 2. Clone the original reader 3. /// Delete a document and set the norm of the cloned reader 4. Verify the norms /// are not the same on each reader 5. Verify the doc deleted is only in the /// cloned reader 6. Try to delete a document in the original reader, an /// exception should be thrown /// /// </summary> /// <param name="r1">IndexReader to perform tests on /// </param> /// <throws> Exception </throws> private void PerformDefaultTests(IndexReader r1) { float norm1 = Similarity.DecodeNorm(r1.Norms("field1")[4]); IndexReader pr1Clone = (IndexReader)r1.Clone(); pr1Clone.DeleteDocument(10); pr1Clone.SetNorm(4, "field1", 0.5f); Assert.IsTrue(Similarity.DecodeNorm(r1.Norms("field1")[4]) == norm1); Assert.IsTrue(Similarity.DecodeNorm(pr1Clone.Norms("field1")[4]) != norm1); Assert.IsTrue(!r1.IsDeleted(10)); Assert.IsTrue(pr1Clone.IsDeleted(10)); // try to update the original reader, which should throw an exception Assert.Throws<LockObtainFailedException>(() => r1.DeleteDocument(11), "Tried to delete doc 11 and an exception should have been thrown"); pr1Clone.Close(); }
/// <summary> </summary> /// <returns> The number of documents in all of the readers /// </returns> /// <throws> IOException </throws> private int MergeFields() { fieldInfos = new FieldInfos(); // merge field names int docCount = 0; for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false); fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false); } fieldInfos.Write(directory, segment + ".fnm"); FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); try { for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; int maxDoc = reader.MaxDoc(); for (int j = 0; j < maxDoc; j++) { if (!reader.IsDeleted(j)) { // skip deleted docs fieldsWriter.AddDocument(reader.Document(j)); docCount++; } } } } finally { fieldsWriter.Close(); } return(docCount); }
public virtual void TestCloneWithDeletes() { Directory dir1 = new MockRAMDirectory(); TestIndexReaderReopen.CreateIndex(dir1, false); IndexReader origReader = IndexReader.Open(dir1, false); origReader.DeleteDocument(1); IndexReader clonedReader = (IndexReader)origReader.Clone(); origReader.Close(); clonedReader.Close(); IndexReader r = IndexReader.Open(dir1, false); Assert.IsTrue(r.IsDeleted(1)); r.Close(); dir1.Close(); }
private void CheckExpecteds(System.Collections.BitArray expecteds) { IndexReader r = IndexReader.Open(dir); //Perhaps not the most efficient approach but meets our needs here. for (int i = 0; i < r.MaxDoc(); i++) { if (!r.IsDeleted(i)) { System.String sval = r.Document(i).Get(FIELD_RECORD_ID); if (sval != null) { int val = System.Int32.Parse(sval); Assert.IsTrue(expecteds.Get(val), "Did not expect document #" + val); expecteds.Set(val, false); } } } r.Close(); Assert.AreEqual(0, SupportClass.BitSetSupport.Cardinality(expecteds), "Should have 0 docs remaining "); }
public override IEnumerable<Row> Execute(IEnumerable<Row> rows) { if (_indexDirectory == null) yield break; try { _reader = IndexReader.Open(_indexDirectory, true); } catch (Exception) { Warn("Failed to open lucene index in {0}.", _indexDirectory.Directory.FullName); yield break; } var docCount = _reader.NumDocs(); Info("Found {0} documents in lucene index.", docCount); // NumDocs() excludes deleted documents, so it is the wrong scan bound; iterate over the full doc id space (MaxDoc) and skip deleted slots instead for (var i = 0; i < _reader.MaxDoc; i++) { if (_reader.IsDeleted(i)) continue; var doc = _reader.Document(i); var row = new Row(); foreach (var field in doc.GetFields().Where(field => field.IsStored)) { switch (field.Name) { case "dropped": row[field.Name] = Convert.ToBoolean(field.StringValue); break; default: row[field.Name] = field.StringValue; break; } } yield return row; } }
public static IEnumerable<string> GetDistinctTenantId(IndexReader reader) { HashSet<string> result = new HashSet<string>(); for (int i = 0; i < reader.MaxDoc; i++) { if (reader.IsDeleted(i)) { continue; } Document document = reader[i]; string tenantId = document.Get("TenantId"); if (tenantId != null) { result.Add(tenantId); } } return result; }
private static Dictionary<string, int[]> FillCache(IndexReader reader, int docBase, string field) { using (var termDocs = reader.TermDocs()) { var items = new Dictionary<string, int[]>(); var docsForTerm = new List<int>(); using (var termEnum = reader.Terms(new Term(field))) { do { if (termEnum.Term == null || field != termEnum.Term.Field) break; Term term = termEnum.Term; if (LowPrecisionNumber(term.Field, term.Text)) continue; var totalDocCountIncludedDeletes = termEnum.DocFreq(); termDocs.Seek(termEnum.Term); while (termDocs.Next() && totalDocCountIncludedDeletes > 0) { var curDoc = termDocs.Doc; totalDocCountIncludedDeletes -= 1; if (reader.IsDeleted(curDoc)) continue; docsForTerm.Add(curDoc + docBase); } docsForTerm.Sort(); items[term.Text] = docsForTerm.ToArray(); docsForTerm.Clear(); } while (termEnum.Next()); } return items; } }
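// Hedged sketch (an assumption about the caller, which is not shown): the docBase parameter of
// FillCache above is the offset of a sub-reader's doc ids within the composite index. In
// Lucene.NET 3.x it can be computed by summing MaxDoc over the preceding sub-readers.
static int[] ComputeDocBases(IndexReader[] subReaders)
{
    var bases = new int[subReaders.Length];
    int cumulative = 0;
    for (int i = 0; i < subReaders.Length; i++)
    {
        bases[i] = cumulative;              // first composite doc id of sub-reader i
        cumulative += subReaders[i].MaxDoc; // deleted docs still occupy id space
    }
    return bases;
}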
public override bool IsDeleted(int n) { // Don't call ensureOpen() here (it could affect performance) return(in_Renamed.IsDeleted(n)); }
private int CopyFieldsWithDeletions(FieldSelector fieldSelectorMerge, FieldsWriter fieldsWriter, IndexReader reader, FieldsReader matchingFieldsReader) { int docCount = 0; int maxDoc = reader.MaxDoc(); if (matchingFieldsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int j = 0; j < maxDoc; ) { if (reader.IsDeleted(j)) { // skip deleted docs ++j; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = j, numDocs = 0; do { j++; numDocs++; if (j >= maxDoc) break; if (reader.IsDeleted(j)) { j++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs); fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs); docCount += numDocs; checkAbort.Work(300 * numDocs); } } else { for (int j = 0; j < maxDoc; j++) { if (reader.IsDeleted(j)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to doc then pass it to // fieldsWriter.AddDocument; see LUCENE-1282 Document doc = reader.Document(j, fieldSelectorMerge); fieldsWriter.AddDocument(doc); docCount++; checkAbort.Work(300); } } return docCount; }
private void CopyVectorsWithDeletions(TermVectorsWriter termVectorsWriter, TermVectorsReader matchingVectorsReader, IndexReader reader) { int maxDoc = reader.MaxDoc(); if (matchingVectorsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int docNum = 0; docNum < maxDoc; ) { if (reader.IsDeleted(docNum)) { // skip deleted docs ++docNum; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = docNum, numDocs = 0; do { docNum++; numDocs++; if (docNum >= maxDoc) break; if (reader.IsDeleted(docNum)) { docNum++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs); termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs); checkAbort.Work(300 * numDocs); } } else { for (int docNum = 0; docNum < maxDoc; docNum++) { if (reader.IsDeleted(docNum)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to vectors then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum); termVectorsWriter.AddAllDocVectors(vectors); checkAbort.Work(300); } } }
private void assertCompressedFields29(Directory dir, bool shouldStillBeCompressed) { int count = 0; int TEXT_PLAIN_LENGTH = TEXT_TO_COMPRESS.Length * 2; // FieldSelectorResult.SIZE returns 2*number_of_chars for String fields: int BINARY_PLAIN_LENGTH = BINARY_TO_COMPRESS.Length; IndexReader reader = IndexReader.Open(dir, true); try { // look into sub readers and check if raw merge is on/off var readers = new System.Collections.Generic.List <IndexReader>(); ReaderUtil.GatherSubReaders(readers, reader); foreach (IndexReader ir in readers) { FieldsReader fr = ((SegmentReader)ir).GetFieldsReader(); Assert.IsTrue(shouldStillBeCompressed != fr.CanReadRawDocs(), "for a 2.9 index, FieldsReader.canReadRawDocs() must be false and other way round for a trunk index"); } // test that decompression works correctly for (int i = 0; i < reader.MaxDoc; i++) { if (!reader.IsDeleted(i)) { Document d = reader.Document(i); if (d.Get("content3") != null) { continue; } count++; IFieldable compressed = d.GetFieldable("compressed"); if (int.Parse(d.Get("id")) % 2 == 0) { Assert.IsFalse(compressed.IsBinary); Assert.AreEqual(TEXT_TO_COMPRESS, compressed.StringValue, "incorrectly decompressed string"); } else { Assert.IsTrue(compressed.IsBinary); Assert.IsTrue(BINARY_TO_COMPRESS.SequenceEqual(compressed.GetBinaryValue()), "incorrectly decompressed binary"); } } } //check if field was decompressed after optimize for (int i = 0; i < reader.MaxDoc; i++) { if (!reader.IsDeleted(i)) { Document d = reader.Document(i, new AnonymousFieldSelector()); if (d.Get("content3") != null) { continue; } count++; // read the size from the binary value using BinaryReader (this prevents us from doing the shift ops ourselves): // ugh, Java uses Big-Endian streams, so we need to do it manually. byte[] encodedSize = d.GetFieldable("compressed").GetBinaryValue().Take(4).Reverse().ToArray(); int actualSize = BitConverter.ToInt32(encodedSize, 0); int compressedSize = int.Parse(d.Get("compressedSize")); bool binary = int.Parse(d.Get("id")) % 2 > 0; int shouldSize = shouldStillBeCompressed ? compressedSize : (binary ? BINARY_PLAIN_LENGTH : TEXT_PLAIN_LENGTH); Assert.AreEqual(shouldSize, actualSize, "size incorrect"); if (!shouldStillBeCompressed) { Assert.IsFalse(compressedSize == actualSize, "uncompressed field should have another size than recorded in index"); } } } Assert.AreEqual(34 * 2, count, "correct number of tests"); } finally { reader.Dispose(); } }
public virtual void searchIndex(System.String dirName, System.String oldName) { //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer()); //Query query = parser.parse("handle:1"); dirName = FullDir(dirName); Directory dir = FSDirectory.Open(new System.IO.DirectoryInfo(dirName)); IndexSearcher searcher = new IndexSearcher(dir, true); IndexReader reader = searcher.IndexReader; _TestUtil.CheckIndex(dir); for (int i = 0; i < 35; i++) { if (!reader.IsDeleted(i)) { Document d = reader.Document(i); var fields = d.GetFields(); if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22.")) { if (d.GetField("content3") == null) { int numFields = oldName.StartsWith("29.") ? 7 : 5; Assert.AreEqual(numFields, fields.Count); Field f = d.GetField("id"); Assert.AreEqual("" + i, f.StringValue); f = (Field)d.GetField("utf8"); Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue); f = (Field)d.GetField("autf8"); Assert.AreEqual("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.StringValue); f = (Field)d.GetField("content2"); Assert.AreEqual("here is more content with aaa aaa aaa", f.StringValue); f = (Field)d.GetField("fie\u2C77ld"); Assert.AreEqual("field with non-ascii name", f.StringValue); } } } // Only ID 7 is deleted else { Assert.AreEqual(7, i); } } ScoreDoc[] hits = searcher.Search(new TermQuery(new Term("content", "aaa")), null, 1000).ScoreDocs; // First document should be #21 since its norm was // increased: Document d2 = searcher.Doc(hits[0].Doc); Assert.AreEqual("21", d2.Get("id"), "didn't get the right document first"); TestHits(hits, 34, searcher.IndexReader); if (!oldName.StartsWith("19.") && !oldName.StartsWith("20.") && !oldName.StartsWith("21.") && !oldName.StartsWith("22.")) { // Test on indices >= 2.3 hits = searcher.Search(new TermQuery(new Term("utf8", "\u0000")), null, 1000).ScoreDocs; Assert.AreEqual(34, hits.Length); hits = searcher.Search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")), null, 1000).ScoreDocs; Assert.AreEqual(34, hits.Length); hits = searcher.Search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")), null, 1000).ScoreDocs; Assert.AreEqual(34, hits.Length); } searcher.Close(); dir.Close(); }
private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, List<string> fieldsToRead,IndexReader reader) { foreach (var field in fieldsToRead) { using (var termDocs = reader.TermDocs()) { using (var termEnum = reader.Terms(new Term(field))) { do { if (termEnum.Term == null || field != termEnum.Term.Field) break; if (LowPrecisionNumber(termEnum.Term)) continue; var totalDocCountIncludedDeletes = termEnum.DocFreq(); termDocs.Seek(termEnum.Term); while (termDocs.Next() && totalDocCountIncludedDeletes > 0) { totalDocCountIncludedDeletes -= 1; if (reader.IsDeleted(termDocs.Doc)) continue; state.SetInCache(field, termDocs.Doc, termEnum.Term); } } while (termEnum.Next()); } } } }
/// <summary> Merge the TermVectors from each of the segments into the new one.</summary> /// <throws> IOException </throws> private void MergeVectors() { TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos); try { for (int r = 0; r < readers.Count; r++) { SegmentReader matchingSegmentReader = matchingSegmentReaders[r]; TermVectorsReader matchingVectorsReader; bool hasMatchingReader; if (matchingSegmentReader != null) { matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig; // If the TV* files are an older format then they // cannot read raw docs: if (matchingVectorsReader != null && !matchingVectorsReader.CanReadRawDocs()) { matchingVectorsReader = null; hasMatchingReader = false; } else { hasMatchingReader = matchingVectorsReader != null; } } else { hasMatchingReader = false; matchingVectorsReader = null; } IndexReader reader = (IndexReader)readers[r]; bool hasDeletions = reader.HasDeletions(); int maxDoc = reader.MaxDoc(); for (int docNum = 0; docNum < maxDoc;) { // skip deleted docs if (!hasDeletions || !reader.IsDeleted(docNum)) { if (hasMatchingReader) { // We can optimize this case (doing a bulk // byte copy) since the field numbers are // identical int start = docNum; int numDocs = 0; do { docNum++; numDocs++; if (docNum >= maxDoc) { break; } if (hasDeletions && matchingSegmentReader.IsDeleted(docNum)) { docNum++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs); termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs); if (checkAbort != null) { checkAbort.Work(300 * numDocs); } } else { // NOTE: it's very important to first assign // to vectors then pass it to // termVectorsWriter.addAllDocVectors; see // LUCENE-1282 TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum); termVectorsWriter.AddAllDocVectors(vectors); docNum++; if (checkAbort != null) { checkAbort.Work(300); } } } else { docNum++; } } } } finally { termVectorsWriter.Close(); } long tvxSize = directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); // {{dougsale-2.4.0} // this shouldn't be a problem for us - if it is, // then it's not a JRE bug //if (4 + mergedDocs * 16 != tvxSize) // // This is most likely a bug in Sun JRE 1.6.0_04/_05; // // we detect that the bug has struck, here, and // // throw an exception to prevent the corruption from // // entering the index. See LUCENE-1282 for // // details. // throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption"); }
/// <summary> 1. Get a norm from the original reader 2. Clone the original reader 3. /// Delete a document and set the norm of the cloned reader 4. Verify the norms /// are not the same on each reader 5. Verify the doc deleted is only in the /// cloned reader 6. Try to delete a document in the original reader, an /// exception should be thrown /// /// </summary> /// <param name="r1">IndexReader to perform tests on /// </param> /// <throws> Exception </throws> private void PerformDefaultTests(IndexReader r1) { float norm1 = Similarity.DecodeNorm(r1.Norms("field1")[4]); IndexReader pr1Clone = (IndexReader) r1.Clone(); pr1Clone.DeleteDocument(10); pr1Clone.SetNorm(4, "field1", 0.5f); Assert.IsTrue(Similarity.DecodeNorm(r1.Norms("field1")[4]) == norm1); Assert.IsTrue(Similarity.DecodeNorm(pr1Clone.Norms("field1")[4]) != norm1); Assert.IsTrue(!r1.IsDeleted(10)); Assert.IsTrue(pr1Clone.IsDeleted(10)); // try to update the original reader, which should throw an exception Assert.Throws<LockObtainFailedException>(() => r1.DeleteDocument(11), "Tried to delete doc 11 and an exception should have been thrown"); pr1Clone.Close(); }
/// <summary> 1. Get a norm from the original reader 2. Clone the original reader 3. /// Delete a document and set the norm of the cloned reader 4. Verify the norms /// are not the same on each reader 5. Verify the doc deleted is only in the /// cloned reader 6. Try to delete a document in the original reader, an /// exception should be thrown /// /// </summary> /// <param name="r1">IndexReader to perform tests on /// </param> /// <throws> Exception </throws> private void PerformDefaultTests(IndexReader r1) { float norm1 = Similarity.DecodeNorm(r1.Norms("field1")[4]); IndexReader pr1Clone = (IndexReader) r1.Clone(); pr1Clone.DeleteDocument(10); pr1Clone.SetNorm(4, "field1", 0.5f); Assert.IsTrue(Similarity.DecodeNorm(r1.Norms("field1")[4]) == norm1); Assert.IsTrue(Similarity.DecodeNorm(pr1Clone.Norms("field1")[4]) != norm1); Assert.IsTrue(!r1.IsDeleted(10)); Assert.IsTrue(pr1Clone.IsDeleted(10)); // try to update the original reader, which should throw an exception try { r1.DeleteDocument(11); Assert.Fail("Tried to delete doc 11 and an exception should have been thrown"); } catch (System.Exception) { // expected } pr1Clone.Close(); }
private static void FillCache(IndexSearcherHolder.IndexSearcherHoldingState state, IEnumerable<string> fieldsToRead,IndexReader reader) { foreach (var field in fieldsToRead) { var items = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>[reader.MaxDoc]; using (var termDocs = reader.TermDocs()) { using (var termEnum = reader.Terms(new Term(field))) { do { if (termEnum.Term == null || field != termEnum.Term.Field) break; Term term = termEnum.Term; if (LowPrecisionNumber(term.Field, term.Text)) continue; var totalDocCountIncludedDeletes = termEnum.DocFreq(); termDocs.Seek(termEnum.Term); while (termDocs.Next() && totalDocCountIncludedDeletes > 0) { totalDocCountIncludedDeletes -= 1; if (reader.IsDeleted(termDocs.Doc)) continue; if(items[termDocs.Doc] == null) items[termDocs.Doc] = new LinkedList<IndexSearcherHolder.IndexSearcherHoldingState.CacheVal>(); items[termDocs.Doc].AddLast(new IndexSearcherHolder.IndexSearcherHoldingState.CacheVal { Term = termEnum.Term }); } } while (termEnum.Next()); } } state.SetInCache(field, items); } }
public override bool IsDeleted(int n) { return(in_Renamed.IsDeleted(n)); }
/// <summary> </summary> /// <returns> The number of documents in all of the readers /// </returns> /// <throws> CorruptIndexException if the index is corrupt </throws> /// <throws> IOException if there is a low-level IO error </throws> private int MergeFields() { if (!mergeDocStores) { // When we are not merging by doc stores, that means // all segments were written as part of a single // autoCommit=false IndexWriter session, so their field // name -> number mapping are the same. So, we start // with the fieldInfos of the last segment in this // case, to keep that numbering. SegmentReader sr = (SegmentReader)readers[readers.Count - 1]; fieldInfos = (FieldInfos)sr.fieldInfos.Clone(); } else { fieldInfos = new FieldInfos(); // merge field names } for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; if (reader is SegmentReader) { SegmentReader segmentReader = (SegmentReader)reader; for (int j = 0; j < segmentReader.GetFieldInfos().Size(); j++) { FieldInfo fi = segmentReader.GetFieldInfos().FieldInfo(j); fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads); } } else { AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false); fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false); } } fieldInfos.Write(directory, segment + ".fnm"); int docCount = 0; if (mergeDocStores) { // If the i'th reader is a SegmentReader and has // identical fieldName -> number mapping, then this // array will be non-null at position i: SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.Count]; // If this reader is a SegmentReader, and all of its // field name -> number mappings match the "merged" // FieldInfos, then we can do a bulk copy of the // stored fields: for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; if (reader is SegmentReader) { SegmentReader segmentReader = (SegmentReader)reader; bool same = true; FieldInfos segmentFieldInfos = segmentReader.GetFieldInfos(); for (int j = 0; same && j < segmentFieldInfos.Size(); j++) { same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j)); } if (same) { matchingSegmentReaders[i] = segmentReader; } } } // Used for bulk-reading raw bytes for stored fields int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS]; // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're // in merge mode, we use this FieldSelector FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this); // merge field values FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); try { for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; SegmentReader matchingSegmentReader = matchingSegmentReaders[i]; FieldsReader matchingFieldsReader; if (matchingSegmentReader != null) { matchingFieldsReader = matchingSegmentReader.GetFieldsReader(); } else { matchingFieldsReader = null; } int maxDoc = reader.MaxDoc(); for (int j = 0; j < maxDoc;) { if (!reader.IsDeleted(j)) { // skip deleted docs if (matchingSegmentReader != null) { // We can optimize this case (doing a bulk // byte copy) since the field numbers are // identical int start = j; int numDocs = 0; do { j++; numDocs++; } while (j < maxDoc && !matchingSegmentReader.IsDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS); IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs); fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs); docCount += numDocs; if (checkAbort != null) { checkAbort.Work(300 * numDocs); } } else { fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge)); j++; docCount++; if (checkAbort != null) { checkAbort.Work(300); } } } else { j++; } } } } finally { fieldsWriter.Close(); } } // If we are skipping the doc stores, that means there // are no deletions in any of these segments, so we // just sum numDocs() of each segment to get total docCount else { for (int i = 0; i < readers.Count; i++) { docCount += ((IndexReader)readers[i]).NumDocs(); } } return(docCount); }
public static void AssertIndexEquals(IndexReader index1, IndexReader index2) { Assert.AreEqual(index1.NumDocs(), index2.NumDocs(), "IndexReaders have different values for numDocs."); Assert.AreEqual(index1.MaxDoc, index2.MaxDoc, "IndexReaders have different values for maxDoc."); Assert.AreEqual(index1.HasDeletions, index2.HasDeletions, "Only one IndexReader has deletions."); Assert.AreEqual(index1.IsOptimized(), index2.IsOptimized(), "Only one index is optimized."); // check field names System.Collections.Generic.ICollection<string> fieldsNames1 = index1.GetFieldNames(FieldOption.ALL); System.Collections.Generic.ICollection<string> fieldsNames2 = index2.GetFieldNames(FieldOption.ALL); System.Collections.Generic.ICollection<IFieldable> fields1 = null; System.Collections.Generic.ICollection<IFieldable> fields2 = null; Assert.AreEqual(fieldsNames1.Count, fieldsNames2.Count, "IndexReaders have different numbers of fields."); System.Collections.IEnumerator it1 = fieldsNames1.GetEnumerator(); System.Collections.IEnumerator it2 = fieldsNames2.GetEnumerator(); while (it1.MoveNext() && it2.MoveNext()) { Assert.AreEqual((System.String) it1.Current, (System.String) it2.Current, "Different field names."); } // check norms it1 = fieldsNames1.GetEnumerator(); while (it1.MoveNext()) { System.String curField = (System.String) it1.Current; byte[] norms1 = index1.Norms(curField); byte[] norms2 = index2.Norms(curField); if (norms1 != null && norms2 != null) { Assert.AreEqual(norms1.Length, norms2.Length); for (int i = 0; i < norms1.Length; i++) { Assert.AreEqual(norms1[i], norms2[i], "Norm different for doc " + i + " and field '" + curField + "'."); } } else { Assert.AreSame(norms1, norms2); } } // check deletions for (int i = 0; i < index1.MaxDoc; i++) { Assert.AreEqual(index1.IsDeleted(i), index2.IsDeleted(i), "Doc " + i + " only deleted in one index."); } // check stored fields for (int i = 0; i < index1.MaxDoc; i++) { if (!index1.IsDeleted(i)) { Document doc1 = index1.Document(i); Document doc2 = index2.Document(i); fields1 = doc1.GetFields(); fields2 = doc2.GetFields(); Assert.AreEqual(fields1.Count, fields2.Count, "Different numbers of fields for doc " + i + "."); it1 = fields1.GetEnumerator(); it2 = fields2.GetEnumerator(); while (it1.MoveNext() && it2.MoveNext()) { Field curField1 = (Field) it1.Current; Field curField2 = (Field) it2.Current; Assert.AreEqual(curField1.Name, curField2.Name, "Different field names for doc " + i + "."); Assert.AreEqual(curField1.StringValue, curField2.StringValue, "Different field values for doc " + i + "."); } } } // check dictionary and posting lists TermEnum enum1 = index1.Terms(); TermEnum enum2 = index2.Terms(); TermPositions tp1 = index1.TermPositions(); TermPositions tp2 = index2.TermPositions(); while (enum1.Next()) { Assert.IsTrue(enum2.Next()); Assert.AreEqual(enum1.Term, enum2.Term, "Different term in dictionary."); tp1.Seek(enum1.Term); tp2.Seek(enum1.Term); while (tp1.Next()) { Assert.IsTrue(tp2.Next()); Assert.AreEqual(tp1.Doc, tp2.Doc, "Different doc id in postinglist of term " + enum1.Term + "."); Assert.AreEqual(tp1.Freq, tp2.Freq, "Different term frequency in postinglist of term " + enum1.Term + "."); for (int i = 0; i < tp1.Freq; i++) { Assert.AreEqual(tp1.NextPosition(), tp2.NextPosition(), "Different positions in postinglist of term " + enum1.Term + "."); } } } }
/// <summary> </summary> /// <returns> The number of documents in all of the readers /// </returns> /// <throws> CorruptIndexException if the index is corrupt </throws> /// <throws> IOException if there is a low-level IO error </throws> private int MergeFields() { if (!mergeDocStores) { // When we are not merging by doc stores, that means // all segments were written as part of a single // autoCommit=false IndexWriter session, so their field // name -> number mapping are the same. So, we start // with the fieldInfos of the last segment in this // case, to keep that numbering. SegmentReader sr = (SegmentReader)readers[readers.Count - 1]; fieldInfos = (FieldInfos)sr.fieldInfos.Clone(); } else { fieldInfos = new FieldInfos(); // merge field names } for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; if (reader is SegmentReader) { SegmentReader segmentReader = (SegmentReader)reader; for (int j = 0; j < segmentReader.GetFieldInfos().Size(); j++) { FieldInfo fi = segmentReader.GetFieldInfos().FieldInfo(j); fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads, fi.omitTf); } } else { AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.OMIT_TF), false, false, false, false, true); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true, false); AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false, false); fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false); } } fieldInfos.Write(directory, segment + ".fnm"); int docCount = 0; SetMatchingSegmentReaders(); if (mergeDocStores) { // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're // in merge mode, we use this FieldSelector FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this); // merge field values FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); try { for (int i = 0; i < readers.Count; i++) { IndexReader reader = (IndexReader)readers[i]; SegmentReader matchingSegmentReader = matchingSegmentReaders[i]; FieldsReader matchingFieldsReader; bool hasMatchingReader; if (matchingSegmentReader != null) { FieldsReader fieldsReader = matchingSegmentReader.GetFieldsReader(); if (fieldsReader != null && !fieldsReader.CanReadRawDocs()) { matchingFieldsReader = null; hasMatchingReader = false; } else { matchingFieldsReader = fieldsReader; hasMatchingReader = true; } } else { hasMatchingReader = false; matchingFieldsReader = null; } int maxDoc = reader.MaxDoc(); bool hasDeletions = reader.HasDeletions(); for (int j = 0; j < maxDoc;) { if (!hasDeletions || !reader.IsDeleted(j)) { // skip deleted docs if (hasMatchingReader) { // We can optimize this case (doing a bulk // byte copy) since the field numbers are // identical int start = j; int numDocs = 0; do { j++; numDocs++; if (j >= maxDoc) { break; } if (hasDeletions && matchingSegmentReader.IsDeleted(j)) { j++; break; } } while (numDocs < MAX_RAW_MERGE_DOCS); IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs); fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs); docCount += numDocs; if (checkAbort != null) { checkAbort.Work(300 * numDocs); } } else { // NOTE: it's very important to first assign // to doc then pass it to // fieldsWriter.AddDocument; see // LUCENE-1282 Document doc = reader.Document(j, fieldSelectorMerge); fieldsWriter.AddDocument(doc); j++; docCount++; if (checkAbort != null) { checkAbort.Work(300); } } } else { j++; } } } } finally { fieldsWriter.Close(); } long fdxFileLength = directory.FileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); // {{dougsale-2.4.0} // this shouldn't be a problem for us - if it is, // then it's not a JRE bug... //if (4+docCount*8 != fdxFileLength) // // This is most likely a bug in Sun JRE 1.6.0_04/_05; // // we detect that the bug has struck, here, and // // throw an exception to prevent the corruption from // // entering the index. See LUCENE-1282 for // // details. // throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + "; now aborting this merge to prevent index corruption"); } else { // If we are skipping the doc stores, that means there // are no deletions in any of these segments, so we // just sum numDocs() of each segment to get total docCount for (int i = 0; i < readers.Count; i++) { docCount += ((IndexReader)readers[i]).NumDocs(); } } return(docCount); }