// Test fixture setup: builds a randomized index and opens a reader plus two
// searchers over it.
//
// Each document gets an "id" field holding its ordinal and between 0 and 3
// random unicode terms in FieldName (which is randomly either "field" or the
// empty string). When the default codec supports sorted-set doc values, each
// term is also added as a SortedSetDocValuesField. After indexing, a random
// number of documents (fewer than num / 10) are deleted by id.
//
// Populates: Dir, FieldName, Reader, Searcher1, Searcher2.
public override void SetUp()
{
    base.SetUp();
    Dir = NewDirectory();
    // sometimes use an empty string as field name
    FieldName = Random().NextBoolean() ? "field" : "";
    RandomIndexWriter writer = new RandomIndexWriter(
        Random(),
        Dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false))
            .SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));
    List<string> terms = new List<string>();
    int num = AtLeast(200);
    for (int i = 0; i < num; i++)
    {
        Document doc = new Document();
        doc.Add(NewStringField("id", Convert.ToString(i), Field.Store.NO));
        int numTerms = Random().Next(4);
        for (int j = 0; j < numTerms; j++)
        {
            string s = TestUtil.RandomUnicodeString(Random());
            doc.Add(NewStringField(FieldName, s, Field.Store.NO));
            // if the default codec doesn't support sortedset, we will uninvert at search time
            if (DefaultCodecSupportsSortedSet())
            {
                doc.Add(new SortedSetDocValuesField(FieldName, new BytesRef(s)));
            }
            terms.Add(s);
        }
        writer.AddDocument(doc);
    }

    if (VERBOSE)
    {
        // utf16 order: the parameterless List<string>.Sort() uses the
        // culture-sensitive default comparer, which is NOT UTF-16 code unit
        // order; an ordinal comparer is required for the dump to match its label.
        terms.Sort(StringComparer.Ordinal);
        Console.WriteLine("UTF16 order:");
        foreach (string s in terms)
        {
            Console.WriteLine(" " + UnicodeUtil.ToHexString(s));
        }
    }

    int numDeletions = Random().Next(num / 10);
    for (int i = 0; i < numDeletions; i++)
    {
        writer.DeleteDocuments(new Term("id", Convert.ToString(Random().Next(num))));
    }

    Reader = writer.Reader;
    Searcher1 = NewSearcher(Reader);
    Searcher2 = NewSearcher(Reader);
    writer.Dispose();
}
// Single straight enum: iterates every term of every field exposed by the
// reader and checks that
//   - each field has a non-null Terms instance,
//   - terms within a field come back in strictly increasing BytesRef order,
//   - the (field, bytes) sequence matches fieldTerms position by position,
//   - the total number of terms visited equals uniqueTermCount.
private void DoTestStraightEnum(IList<Term> fieldTerms, IndexReader reader, int uniqueTermCount)
{
    if (Verbose)
    {
        Console.WriteLine("\nTEST: top now enum reader=" + reader);
    }

    Fields allFields = MultiFields.GetFields(reader);

    // Index into fieldTerms of the next expected term; doubles as the running count.
    int visited = 0;
    foreach (string field in allFields)
    {
        Terms fieldTermsDict = allFields.GetTerms(field);
        Assert.IsNotNull(fieldTermsDict);

        TermsEnum termsEnum = fieldTermsDict.GetEnumerator();
        BytesRef previous = null;
        while (termsEnum.MoveNext())
        {
            BytesRef actual = termsEnum.Term;
            Term expected = fieldTerms[visited];
            if (Verbose)
            {
                Console.WriteLine(" got term=" + field + ":" + UnicodeUtil.ToHexString(actual.Utf8ToString()));
                Console.WriteLine(" exp=" + expected.Field + ":" + UnicodeUtil.ToHexString(expected.Text));
                Console.WriteLine();
            }

            if (previous == null)
            {
                // First term of this field: nothing to compare against yet.
                previous = BytesRef.DeepCopyOf(actual);
            }
            else
            {
                Assert.IsTrue(previous.CompareTo(actual) < 0);
                previous.CopyBytes(actual);
            }

            Assert.AreEqual(expected.Field, field);
            Assert.AreEqual(expected.Bytes, actual);
            visited++;
        }

        if (Verbose)
        {
            Console.WriteLine(" no more terms for field=" + field);
        }
    }

    Assert.AreEqual(uniqueTermCount, visited);
}
// Test fixture setup: builds a randomized single-field index and opens a
// reader plus two searchers over it.
//
// A single Document/Field pair is reused for every added document; only the
// field's string value changes between AddDocument calls. FieldName is
// randomly either "field" or the empty string.
//
// Populates: Dir, FieldName, Reader, Searcher1, Searcher2.
public override void SetUp()
{
    base.SetUp();
    Dir = NewDirectory();
    // sometimes use an empty string as field name
    FieldName = Random().NextBoolean() ? "field" : "";
    RandomIndexWriter writer = new RandomIndexWriter(
        Random(),
        Dir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false))
            .SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));

    Document doc = new Document();
    Field field = NewStringField(FieldName, "", Field.Store.NO);
    doc.Add(field);

    List<string> terms = new List<string>();
    int num = AtLeast(200);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random());
        field.StringValue = s;
        terms.Add(s);
        writer.AddDocument(doc);
    }

    if (VERBOSE)
    {
        // utf16 order: the parameterless List<string>.Sort() uses the
        // culture-sensitive default comparer, which is NOT UTF-16 code unit
        // order; an ordinal comparer is required for the dump to match its label.
        terms.Sort(StringComparer.Ordinal);
        Console.WriteLine("UTF16 order:");
        foreach (string s in terms)
        {
            Console.WriteLine(" " + UnicodeUtil.ToHexString(s));
        }
    }

    Reader = writer.Reader;
    Searcher1 = NewSearcher(Reader);
    Searcher2 = NewSearcher(Reader);
    writer.Dispose();
}
// Formats a term for diagnostics as "<field>:<hex dump of the term text>".
private static string ToHexString(Term t)
{
    string fieldName = t.Field;
    string hexText = UnicodeUtil.ToHexString(t.Text);
    return fieldName + ":" + hexText;
}
// Repeatedly seeks to random terms. Iterations whose random term happens to
// exist in fieldTermsArray are skipped; for the others, SeekCeil must report
//   - END when the insertion point is past the last term or lands in a
//     different field, or
//   - NOT_FOUND positioned on the term at the insertion point,
// after which up to a random number of MoveNext() calls must follow
// fieldTerms in order until the field changes or the list ends.
//
// One TermsEnum per field is created lazily and reused across iterations.
private void DoTestSeekDoesNotExist(Random r, int numField, IList<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader)
{
    IDictionary<string, TermsEnum> enumsByField = new Dictionary<string, TermsEnum>();

    if (Verbose)
    {
        Console.WriteLine("TEST: top random seeks");
    }

    int iterations = AtLeast(100);
    for (int iter = 0; iter < iterations; iter++)
    {
        // seek to random spot
        string field = ("f" + r.Next(numField)).Intern();
        Term target = new Term(field, GetRandomString(r));

        int spot = Array.BinarySearch(fieldTermsArray, target);
        if (spot >= 0)
        {
            // The random term exists; this test only covers missing terms.
            continue;
        }

        if (Verbose)
        {
            Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(target.Text));
        }

        // term does not exist:
        if (!enumsByField.TryGetValue(field, out TermsEnum te))
        {
            te = MultiFields.GetTerms(reader, field).GetEnumerator();
            enumsByField[field] = te;
        }

        if (Verbose)
        {
            Console.WriteLine(" got enum");
        }

        // Convert the negative BinarySearch result into the insertion point.
        spot = -spot - 1;

        bool pastEndOfField = spot == fieldTerms.Count
            || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal);
        if (pastEndOfField)
        {
            Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(target.Bytes));
            continue;
        }

        Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(target.Bytes));

        if (Verbose)
        {
            Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString()));
            Console.WriteLine(" exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text));
        }

        Assert.AreEqual(fieldTerms[spot].Bytes, te.Term);

        // now .next() this many times:
        int steps = TestUtil.NextInt32(r, 5, 100);
        for (int i = 0; i < steps; i++)
        {
            if (Verbose)
            {
                Console.WriteLine("TEST: now next()");
            }

            int nextIndex = 1 + spot + i;
            if (nextIndex >= fieldTerms.Count)
            {
                break;
            }

            Term following = fieldTerms[nextIndex];
            if (!following.Field.Equals(field, StringComparison.Ordinal))
            {
                // Next expected term belongs to another field: enum must be exhausted.
                Assert.IsFalse(te.MoveNext());
                break;
            }

            Assert.IsTrue(te.MoveNext());

            BytesRef t = te.Term;
            if (Verbose)
            {
                Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(following.Text.ToString()));
            }

            Assert.AreEqual(following.Bytes, t);
        }
    }
}
// Randomly seeks to a term that is known to exist, then calls MoveNext() a
// random number of times from there, checking each returned term against
// fieldTerms until the field changes or the list ends.
//
// One TermsEnum per field is created lazily and reused across iterations.
private void DoTestSeekExists(Random r, IList<Term> fieldTerms, IndexReader reader)
{
    IDictionary<string, TermsEnum> enumsByField = new Dictionary<string, TermsEnum>();

    // Test random seek to existing term, then enum:
    if (Verbose)
    {
        Console.WriteLine("\nTEST: top now seek");
    }

    int iterations = AtLeast(100);
    for (int iter = 0; iter < iterations; iter++)
    {
        // pick random field+term
        int spot = r.Next(fieldTerms.Count);
        Term start = fieldTerms[spot];
        string field = start.Field;

        if (Verbose)
        {
            Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(start.Text));
        }

        // seek to it
        if (!enumsByField.TryGetValue(field, out TermsEnum te))
        {
            te = MultiFields.GetTerms(reader, field).GetEnumerator();
            enumsByField[field] = te;
        }

        if (Verbose)
        {
            Console.WriteLine(" done get enum");
        }

        // seek should find the term
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(start.Bytes));

        // now .next() this many times:
        int steps = TestUtil.NextInt32(r, 5, 100);
        for (int i = 0; i < steps; i++)
        {
            if (Verbose)
            {
                Console.WriteLine("TEST: now next()");
            }

            int nextIndex = 1 + spot + i;
            if (nextIndex >= fieldTerms.Count)
            {
                break;
            }

            Term following = fieldTerms[nextIndex];
            if (!following.Field.Equals(field, StringComparison.Ordinal))
            {
                // Next expected term belongs to another field: enum must be exhausted.
                Assert.IsFalse(te.MoveNext());
                break;
            }

            Assert.IsTrue(te.MoveNext());

            BytesRef t = te.Term;
            if (Verbose)
            {
                Console.WriteLine(" got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                Console.WriteLine(" exp=" + UnicodeUtil.ToHexString(following.Text.ToString()));
            }

            Assert.AreEqual(following.Bytes, t);
        }
    }
}
// Look for seek type 3 ("pop"): if the delta from prev -> current replaced
// an S (non-BMP char) with an E (high BMP char) at newSuffixStart, we must
// now seek to beyond that E. This seek "finishes" the dance at this
// character position.
//
// Returns true when a pop was performed (scratchTerm / newSuffixStart were
// updated), false when no pop applies.
private bool DoPop()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try pop");
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(newSuffixStart <= prevTerm.Length);
        Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
    }

    bool replacedSurrogateWithHighBmp =
        prevTerm.Length > newSuffixStart
        && IsNonBMPChar(prevTerm.Bytes, newSuffixStart)
        && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart);
    if (!replacedSurrogateWithHighBmp)
    {
        return false;
    }

    // Seek type 2 -- put 0xFF at this position:
    scratchTerm.Bytes[newSuffixStart] = 0xff;
    scratchTerm.Length = newSuffixStart + 1;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
    }

    // TODO: more efficient seek? can we simply swap
    // the enums?
    outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

    Term landed = termEnum.Term();

    // We could hit EOF or different field since this
    // was a seek "forward":
    if (landed != null && landed.Field == internedFieldName)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(landed.Text) + " " + landed.Bytes);
        }

        BytesRef landedBytes = landed.Bytes;
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(landedBytes.Offset == 0);
        }

        // Set newSuffixStart -- we can't use termEnum's since the above
        // seek may have done no scanning (eg, term was precisely an index
        // term, or, was in the term seek cache):
        scratchTerm.CopyBytes(landedBytes);
        SetNewSuffixStart(prevTerm, scratchTerm);
        return true;
    }

    if (newSuffixStart != 0 || scratchTerm.Length != 0)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" got term=null (or next field)");
        }

        newSuffixStart = 0;
        scratchTerm.Length = 0;
        return true;
    }

    return false;
}
// Swap in S, in place of E: temporarily overwrites the high-BMP char at
// `pos` in `term` with the 4-byte sequence F0 90 80 80, seeks `te` to that
// term, and reports whether the term found shares term's first `pos` bytes
// and has a non-BMP char at `pos`.
//
// On the normal path `term` is restored (length and the three saved bytes)
// before returning. NOTE: on the early-return path (seek hit EOF or another
// field) `term` is left modified.
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int originalLength = term.Length;

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(term.Offset == 0);
    }

    // The 3 bytes starting at pos make up 1 unicode character:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
    }

    // NOTE: we cannot assert term.Length >= pos + 3 here, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)

    // Save the bytes && length, since we need to restore this if seek
    // "back" finds no matching terms
    int requiredCapacity = 4 + pos;
    if (term.Bytes.Length < requiredCapacity)
    {
        term.Grow(requiredCapacity);
    }

    for (int k = 0; k < 3; k++)
    {
        scratch[k] = (sbyte)term.Bytes[pos + k];
    }

    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = requiredCapacity;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term found = te.Term();

    // Cannot be null (or move to next field) because at "worst" it'd seek
    // to the same term we are on now, unless we are being called from seek
    if (found is null || found.Field != internedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(found.Text));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef foundBytes = found.Bytes;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(foundBytes.Offset == 0);
    }

    bool matches = foundBytes.Length >= term.Length && IsNonBMPChar(foundBytes.Bytes, pos);
    for (int i = 0; matches && i < pos; i++)
    {
        matches = term.Bytes[i] == foundBytes.Bytes[i];
    }

    // Restore term:
    term.Length = originalLength;
    for (int k = 0; k < 3; k++)
    {
        term.Bytes[pos + k] = (byte)scratch[k];
    }

    return matches;
}
// Randomized postings test. For each iteration it indexes a random number of
// documents, each with one "field" term (either a fresh random unicode
// string or, when allowed, a re-used earlier term) and an "id" term, while
// randomly committing and deleting earlier documents by id. It then checks
// that every deleted doc is absent from live docs and that, for 100 randomly
// picked terms, the DocsEnum returns exactly the non-deleted docs recorded
// for that term, in order.
public virtual void TestRandom()
{
    int iterations = AtLeast(2);
    for (int iter = 0; iter < iterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: iter=" + iter);
        }

        Directory dir = NewDirectory();

        IndexWriter w = new IndexWriter(
            dir,
            NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
                .SetMergePolicy(NoMergePolicy.COMPOUND_FILES));
        // we can do this because we use NoMergePolicy (and dont merge to "nothing")
        w.KeepFullyDeletedSegments = true;

        // term -> ids of the docs that were indexed with it (in indexing order)
        IDictionary<BytesRef, IList<int?>> docs = new Dictionary<BytesRef, IList<int?>>();
        ISet<int?> deleted = new JCG.HashSet<int?>();
        IList<BytesRef> terms = new List<BytesRef>();

        int numDocs = TestUtil.NextInt32(Random, 1, 100 * RANDOM_MULTIPLIER);

        // A single document with two fields is reused for every AddDocument call.
        Documents.Document doc = new Documents.Document();
        Field f = NewStringField("field", "", Field.Store.NO);
        doc.Add(f);
        Field id = NewStringField("id", "", Field.Store.NO);
        doc.Add(id);

        bool onlyUniqueTerms = Random.NextBoolean();
        if (VERBOSE)
        {
            Console.WriteLine("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
        }

        ISet<BytesRef> uniqueTerms = new JCG.HashSet<BytesRef>();
        for (int i = 0; i < numDocs; i++)
        {
            if (!onlyUniqueTerms && Random.NextBoolean() && terms.Count > 0)
            {
                // re-use existing term
                BytesRef reused = terms[Random.Next(terms.Count)];
                docs[reused].Add(i);
                f.SetStringValue(reused.Utf8ToString());
            }
            else
            {
                string s = TestUtil.RandomUnicodeString(Random, 10);
                BytesRef fresh = new BytesRef(s);
                if (!docs.TryGetValue(fresh, out IList<int?> postings))
                {
                    postings = new List<int?>();
                    docs[fresh] = postings;
                }
                postings.Add(i);
                terms.Add(fresh);
                uniqueTerms.Add(fresh);
                f.SetStringValue(s);
            }

            id.SetStringValue("" + i);
            w.AddDocument(doc);

            if (Random.Next(4) == 1)
            {
                w.Commit();
            }

            if (i > 0 && Random.Next(20) == 1)
            {
                int delID = Random.Next(i);
                deleted.Add(delID);
                w.DeleteDocuments(new Term("id", "" + delID));
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: delete " + delID);
                }
            }
        }

        if (VERBOSE)
        {
            List<BytesRef> termsList = new List<BytesRef>(uniqueTerms);
#pragma warning disable 612, 618
            termsList.Sort(BytesRef.UTF8SortedAsUTF16Comparer);
#pragma warning restore 612, 618
            Console.WriteLine("TEST: terms in UTF16 order:");
            foreach (BytesRef b in termsList)
            {
                Console.WriteLine(" " + UnicodeUtil.ToHexString(b.Utf8ToString()) + " " + b);
                foreach (int docID in docs[b])
                {
                    if (!deleted.Contains(docID))
                    {
                        Console.WriteLine(" " + docID);
                    }
                    else
                    {
                        Console.WriteLine(" " + docID + " (deleted)");
                    }
                }
            }
        }

        IndexReader reader = w.GetReader();
        w.Dispose();
        if (VERBOSE)
        {
            Console.WriteLine("TEST: reader=" + reader);
        }

        IBits liveDocs = MultiFields.GetLiveDocs(reader);
        foreach (int delDoc in deleted)
        {
            Assert.IsFalse(liveDocs.Get(delDoc));
        }

        for (int probe = 0; probe < 100; probe++)
        {
            BytesRef term = terms[Random.Next(terms.Count)];
            if (VERBOSE)
            {
                Console.WriteLine("TEST: seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " " + term);
            }

            DocsEnum docsEnum = TestUtil.Docs(Random, reader, "field", term, liveDocs, null, DocsFlags.NONE);
            Assert.IsNotNull(docsEnum);

            foreach (int docID in docs[term])
            {
                if (deleted.Contains(docID))
                {
                    // Deleted docs must be skipped by the enum.
                    continue;
                }
                Assert.AreEqual(docID, docsEnum.NextDoc());
            }

            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
        }

        reader.Dispose();
        dir.Dispose();
    }
}
// Seeks to `term` or, if it does not exist, to the next term after it.
//
// Three outcomes after the underlying seek:
//   1. exact match in this field      -> FOUND, no surrogate dance needed;
//   2. EOF or a different field       -> try the end-case surrogate dance
//      (swap S in for each E, scanning backwards); NOT_FOUND if one of the
//      seeks matches, otherwise END;
//   3. a non-exact term in this field -> treat like next(), pretending the
//      requested term was the previous term, then dance.
// `current` is updated to the term the enum ends up on (null on END).
public override SeekStatus SeekCeil(BytesRef term)
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    skipNext = false;
    TermInfosReader tis = outerInstance.TermsDict;
    Term seekTarget = new Term(fieldInfo.Name, term);

    Debug.Assert(termEnum != null);

    tis.SeekEnum(termEnum, seekTarget, false);

    Term t = termEnum.Term();
    bool inThisField = t != null && t.Field == internedFieldName;

    if (inThisField && term.BytesEquals(t.Bytes))
    {
        // If we found an exact match, no need to do the
        // surrogate dance
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek exact match");
        }
        current = t.Bytes;
        return SeekStatus.FOUND;
    }

    if (!inThisField)
    {
        // TODO: maybe we can handle this like the next()
        // into null? set term as prevTerm then dance?
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek hit EOF");
        }

        // We hit EOF; try end-case surrogate dance: if we
        // find an E, try swapping in S, backwards:
        scratchTerm.CopyBytes(term);

        Debug.Assert(scratchTerm.Offset == 0);

        for (int i = scratchTerm.Length - 1; i >= 0; i--)
        {
            if (!IsHighBMPChar(scratchTerm.Bytes, i))
            {
                continue;
            }

            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" found E pos=" + i + "; try seek");
            }

            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
            {
                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                newSuffixStart = 1 + i;

                DoPushes();

                // Found a match
                // TODO: faster seek?
                current = termEnum.Term().Bytes;
                return SeekStatus.NOT_FOUND;
            }
        }

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" seek END");
        }

        current = null;
        return SeekStatus.END;
    }

    // We found a non-exact but non-null term; this one
    // is fun -- just treat it like next, by pretending
    // requested term was prev:
    prevTerm.CopyBytes(term);

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
    }

    BytesRef foundBytes = t.Bytes;
    Debug.Assert(foundBytes.Offset == 0);

    SetNewSuffixStart(term, foundBytes);

    SurrogateDance();

    Term afterDance = termEnum.Term();
    if (afterDance == null || afterDance.Field != internedFieldName)
    {
        // PreFlex codec interns field names; verify:
        Debug.Assert(afterDance == null || !afterDance.Field.Equals(internedFieldName, StringComparison.Ordinal));
        current = null;
        return SeekStatus.END;
    }

    current = afterDance.Bytes;
    Debug.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
    return SeekStatus.NOT_FOUND;
}
// Look for seek type 1 ("push"): if the newly added suffix contains any S
// (non-BMP char), we must try to seek to the corresponding E (high BMP
// char). If we find a match, we go there; else we keep looking for
// additional S's in the new suffix. This "starts" the dance, at this
// character position.
private void DoPushes()
{
    int pos = newSuffixStart;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
    }

    while (pos < scratchTerm.Length)
    {
        // A push is attempted only where the current term has a non-BMP
        // char and either we are past the start of the new suffix, or the
        // previous term had neither a non-BMP nor a high-BMP char here.
        bool shouldPush =
            IsNonBMPChar(scratchTerm.Bytes, pos)
            && (pos > newSuffixStart
                || pos >= prevTerm.Length
                || !(IsNonBMPChar(prevTerm.Bytes, pos) || IsHighBMPChar(prevTerm.Bytes, pos)));
        if (!shouldPush)
        {
            pos++;
            continue;
        }

        // A non-BMP char (4 bytes UTF8) starts here:
        Debug.Assert(scratchTerm.Length >= pos + 4);

        // Temporarily replace the first three bytes of the char with the
        // lowest high-BMP sequence and truncate the term right after it.
        int savedLength = scratchTerm.Length;
        scratch[0] = (sbyte)scratchTerm.Bytes[pos];
        scratch[1] = (sbyte)scratchTerm.Bytes[pos + 1];
        scratch[2] = (sbyte)scratchTerm.Bytes[pos + 2];

        scratchTerm.Bytes[pos] = (byte)UTF8_HIGH_BMP_LEAD;
        scratchTerm.Bytes[pos + 1] = 0x80;
        scratchTerm.Bytes[pos + 2] = 0x80;
        scratchTerm.Length = pos + 3;

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" try seek 1 pos=" + pos + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
        }

        // Seek "forward":
        // TODO: more efficient seek?
        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

        // Undo the temporary edit.
        scratchTerm.Bytes[pos] = (byte)scratch[0];
        scratchTerm.Bytes[pos + 1] = (byte)scratch[1];
        scratchTerm.Bytes[pos + 2] = (byte)scratch[2];
        scratchTerm.Length = savedLength;

        // Did we find a match?
        Term candidate = seekTermEnum.Term();

        if (DEBUG_SURROGATES)
        {
            if (candidate == null)
            {
                Console.WriteLine(" hit term=null");
            }
            else
            {
                Console.WriteLine(" hit term=" + UnicodeUtil.ToHexString(candidate.Text()) + " " + candidate.Bytes);
            }
        }

        // Since this was a seek "forward", we could hit
        // EOF or a different field:
        bool matches = false;
        if (candidate != null && candidate.Field == internedFieldName)
        {
            BytesRef candidateBytes = candidate.Bytes;
            Debug.Assert(candidateBytes.Offset == 0);

            matches = candidateBytes.Length >= pos + 3 && IsHighBMPChar(candidateBytes.Bytes, pos);
            for (int i = 0; matches && i < pos; i++)
            {
                matches = scratchTerm.Bytes[i] == candidateBytes.Bytes[i];
            }
        }

        if (!matches)
        {
            pos++;
            continue;
        }

        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" matches!");
        }

        // OK seek "back"
        // TODO: more efficient seek?
        outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

        scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

        // +3 because we don't need to check the char
        // at pos: we know it's > BMP
        pos += 3;

        // NOTE: we keep iterating, now, since this can easily "recurse".
        // Ie, after seeking forward at a certain char position, we may find
        // another surrogate in our [new] suffix and must then do another
        // seek (recurse)
    }
}
// Pre-flex indices store terms in UTF16 sort order, but certain queries
// require Unicode codepoint order; this method carefully seeks around
// surrogates to handle this impedance mismatch. It is a no-op unless
// unicodeSortOrder is set.
//
// We are invoked after TIS.next() (by UTF16 order) to possibly seek to a
// different "next" (by unicode order) term.
//
// We scan only the "delta" from the last term to the current term, in UTF8
// bytes. We look at 1) the bytes stripped from the prior term, and then
// 2) the bytes appended to that prior term's prefix.
//
// We don't care about specific UTF8 sequences, just the "category" of the
// UTF16 character:
//   S = a high/low surrogate pair (i.e. non-BMP);
//   E = any BMP char > UNI_SUR_LOW_END (and < U+FFFF);
//   A = the rest (any unicode char <= UNI_SUR_HIGH_START).
//
// The core issue is that pre-flex indices sort the characters as ASE, while
// flex must sort as AES. So when scanning, when we hit S, we must 1) seek
// forward to E and enum the terms there, then 2) seek back to S and enum
// all terms there, then 3) seek to after E. Three different seek points
// (1, 2, 3).
//
// We can easily detect S in UTF8: if a byte has prefix 11110 (0xf0), then
// that byte and the following 3 bytes encode a single unicode codepoint in
// S. Similarly, we can detect E: if a byte has prefix 1110111 (0xee), then
// that byte and the following 2 bytes encode a single unicode codepoint
// in E.
//
// Note that this is really a recursive process -- maybe the char at pos 2
// needs to dance, but at any point in its dance, suddenly pos 4 needs to
// dance so you must finish pos 4 before returning to pos 2. But then during
// pos 4's dance maybe pos 7 needs to dance, etc. However, despite being
// recursive, we don't need to hold any state because the state can always
// be derived by looking at prior term & current term.
private void SurrogateDance()
{
    if (!unicodeSortOrder)
    {
        return;
    }

    // TODO: can we avoid this copy?
    Term currentTerm = termEnum.Term();
    if (currentTerm != null && currentTerm.Field == internedFieldName)
    {
        scratchTerm.CopyBytes(currentTerm.Bytes);
    }
    else
    {
        // EOF or a different field: treat the current term as empty.
        scratchTerm.Length = 0;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" dance");
        Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
        Console.WriteLine(" " + prevTerm.ToString());
        Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
        Console.WriteLine(" " + scratchTerm.ToString());
    }

    // this code assumes TermInfosReader/SegmentTermEnum
    // always use BytesRef.offset == 0
    Debug.Assert(prevTerm.Offset == 0);
    Debug.Assert(scratchTerm.Offset == 0);

    // Need to loop here because we may need to do multiple
    // pops, and possibly a continue in the end, ie:
    //
    //   cont
    //   pop, cont
    //   pop, pop, cont
    //   <nothing>
    //
    // The loop stops as soon as a continue succeeds or a pop fails.
    while (!DoContinue() && DoPop())
    {
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" finish bmp ends");
    }

    DoPushes();
}
// Formats a term for diagnostics as "<field>:<hex dump of the term text>".
private string ToHexString(Term t)
{
    string fieldName = t.Field();
    string hexText = UnicodeUtil.ToHexString(t.Text());
    return fieldName + ":" + hexText;
}
// Look for seek type 3 ("pop"): if the delta from prev -> current replaced
// an S (non-BMP char) with an E (high BMP char) at NewSuffixStart, we must
// now seek to beyond that E. This seek "finishes" the dance at this
// character position.
//
// Returns true when a pop was performed (ScratchTerm / NewSuffixStart were
// updated), false when no pop applies.
internal virtual bool DoPop()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try pop");
    }

    Debug.Assert(NewSuffixStart <= PrevTerm.Length);
    Debug.Assert(NewSuffixStart < ScratchTerm.Length || NewSuffixStart == 0);

    bool replacedSurrogateWithHighBmp =
        PrevTerm.Length > NewSuffixStart
        && IsNonBMPChar(PrevTerm.Bytes, NewSuffixStart)
        && IsHighBMPChar(ScratchTerm.Bytes, NewSuffixStart);
    if (!replacedSurrogateWithHighBmp)
    {
        return false;
    }

    // Seek type 2 -- put 0xFF at this position:
    ScratchTerm.Bytes[NewSuffixStart] = unchecked((byte)0xff);
    ScratchTerm.Length = NewSuffixStart + 1;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(ScratchTerm.Utf8ToString()) + " " + ScratchTerm.ToString());
    }

    // TODO: more efficient seek? can we simply swap
    // the enums?
    OuterInstance.TermsDict.SeekEnum(TermEnum, new Term(fieldInfo.Name, ScratchTerm), true);

    Term landed = TermEnum.Term();

    // We could hit EOF or different field since this
    // was a seek "forward":
    if (landed != null && landed.Field() == InternedFieldName)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(landed.Text()) + " " + landed.Bytes());
        }

        BytesRef landedBytes = landed.Bytes();
        Debug.Assert(landedBytes.Offset == 0);

        // Set newSuffixStart -- we can't use termEnum's since the above
        // seek may have done no scanning (eg, term was precisely an index
        // term, or, was in the term seek cache):
        ScratchTerm.CopyBytes(landedBytes);
        SetNewSuffixStart(PrevTerm, ScratchTerm);
        return true;
    }

    if (NewSuffixStart != 0 || ScratchTerm.Length != 0)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" got term=null (or next field)");
        }

        NewSuffixStart = 0;
        ScratchTerm.Length = 0;
        return true;
    }

    return false;
}