public override void SetUp()
        {
            base.SetUp();
            Dir       = NewDirectory();
            FieldName = Random().NextBoolean() ? "field" : ""; // sometimes use an empty string as field name
            RandomIndexWriter writer = new RandomIndexWriter(Random(), Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false)).SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));
            List <string>     terms  = new List <string>();
            int num = AtLeast(200);

            for (int i = 0; i < num; i++)
            {
                Document doc = new Document();
                doc.Add(NewStringField("id", Convert.ToString(i), Field.Store.NO));
                int numTerms = Random().Next(4);
                for (int j = 0; j < numTerms; j++)
                {
                    string s = TestUtil.RandomUnicodeString(Random());
                    doc.Add(NewStringField(FieldName, s, Field.Store.NO));
                    // if the default codec doesn't support sortedset, we will uninvert at search time
                    if (DefaultCodecSupportsSortedSet())
                    {
                        doc.Add(new SortedSetDocValuesField(FieldName, new BytesRef(s)));
                    }
                    terms.Add(s);
                }
                writer.AddDocument(doc);
            }

            if (VERBOSE)
            {
                // utf16 order
                terms.Sort();
                Console.WriteLine("UTF16 order:");
                foreach (string s in terms)
                {
                    Console.WriteLine("  " + UnicodeUtil.ToHexString(s));
                }
            }

            int numDeletions = Random().Next(num / 10);

            for (int i = 0; i < numDeletions; i++)
            {
                writer.DeleteDocuments(new Term("id", Convert.ToString(Random().Next(num))));
            }

            Reader    = writer.Reader;
            Searcher1 = NewSearcher(Reader);
            Searcher2 = NewSearcher(Reader);
            writer.Dispose();
        }
Beispiel #2
0
 // single straight enum
 private void DoTestStraightEnum(IList <Term> fieldTerms, IndexReader reader, int uniqueTermCount)
 {
     if (Verbose)
     {
         Console.WriteLine("\nTEST: top now enum reader=" + reader);
     }
     Fields fields = MultiFields.GetFields(reader);
     {
         // Test straight enum:
         int termCount = 0;
         foreach (string field in fields)
         {
             Terms terms = fields.GetTerms(field);
             Assert.IsNotNull(terms);
             TermsEnum termsEnum = terms.GetEnumerator();
             BytesRef  text;
             BytesRef  lastText = null;
             while (termsEnum.MoveNext())
             {
                 text = termsEnum.Term;
                 Term exp = fieldTerms[termCount];
                 if (Verbose)
                 {
                     Console.WriteLine("  got term=" + field + ":" + UnicodeUtil.ToHexString(text.Utf8ToString()));
                     Console.WriteLine("       exp=" + exp.Field + ":" + UnicodeUtil.ToHexString(exp.Text));
                     Console.WriteLine();
                 }
                 if (lastText == null)
                 {
                     lastText = BytesRef.DeepCopyOf(text);
                 }
                 else
                 {
                     Assert.IsTrue(lastText.CompareTo(text) < 0);
                     lastText.CopyBytes(text);
                 }
                 Assert.AreEqual(exp.Field, field);
                 Assert.AreEqual(exp.Bytes, text);
                 termCount++;
             }
             if (Verbose)
             {
                 Console.WriteLine("  no more terms for field=" + field);
             }
         }
         Assert.AreEqual(uniqueTermCount, termCount);
     }
 }
Beispiel #3
0
        public override void SetUp()
        {
            base.SetUp();
            Dir       = NewDirectory();
            FieldName = Random().NextBoolean() ? "field" : ""; // sometimes use an empty string as field name
            RandomIndexWriter writer = new RandomIndexWriter(Random(), Dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false)).SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));
            Document          doc    = new Document();
            Field             field  = NewStringField(FieldName, "", Field.Store.NO);

            doc.Add(field);
            List <string> terms = new List <string>();
            int           num   = AtLeast(200);

            for (int i = 0; i < num; i++)
            {
                string s = TestUtil.RandomUnicodeString(Random());
                field.StringValue = s;
                terms.Add(s);
                writer.AddDocument(doc);
            }

            if (VERBOSE)
            {
                // utf16 order
                terms.Sort();
                Console.WriteLine("UTF16 order:");
                foreach (string s in terms)
                {
                    Console.WriteLine("  " + UnicodeUtil.ToHexString(s));
                }
            }

            Reader    = writer.Reader;
            Searcher1 = NewSearcher(Reader);
            Searcher2 = NewSearcher(Reader);
            writer.Dispose();
        }
Beispiel #4
0
 private static string ToHexString(Term t)
 {
     return(t.Field + ":" + UnicodeUtil.ToHexString(t.Text));
 }
Beispiel #5
0
        private void DoTestSeekDoesNotExist(Random r, int numField, IList <Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader)
        {
            IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>();

            if (Verbose)
            {
                Console.WriteLine("TEST: top random seeks");
            }

            {
                int num = AtLeast(100);
                for (int iter = 0; iter < num; iter++)
                {
                    // seek to random spot
                    string field = ("f" + r.Next(numField)).Intern();
                    Term   tx    = new Term(field, GetRandomString(r));

                    int spot = Array.BinarySearch(fieldTermsArray, tx);

                    if (spot < 0)
                    {
                        if (Verbose)
                        {
                            Console.WriteLine("TEST: non-exist seek to " + field + ":" + UnicodeUtil.ToHexString(tx.Text));
                        }

                        // term does not exist:
                        if (!tes.TryGetValue(field, out TermsEnum te))
                        {
                            te         = MultiFields.GetTerms(reader, field).GetEnumerator();
                            tes[field] = te;
                        }

                        if (Verbose)
                        {
                            Console.WriteLine("  got enum");
                        }

                        spot = -spot - 1;

                        if (spot == fieldTerms.Count || !fieldTerms[spot].Field.Equals(field, StringComparison.Ordinal))
                        {
                            Assert.AreEqual(TermsEnum.SeekStatus.END, te.SeekCeil(tx.Bytes));
                        }
                        else
                        {
                            Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(tx.Bytes));

                            if (Verbose)
                            {
                                Console.WriteLine("  got term=" + UnicodeUtil.ToHexString(te.Term.Utf8ToString()));
                                Console.WriteLine("  exp term=" + UnicodeUtil.ToHexString(fieldTerms[spot].Text));
                            }

                            Assert.AreEqual(fieldTerms[spot].Bytes, te.Term);

                            // now .next() this many times:
                            int ct = TestUtil.NextInt32(r, 5, 100);
                            for (int i = 0; i < ct; i++)
                            {
                                if (Verbose)
                                {
                                    Console.WriteLine("TEST: now next()");
                                }
                                if (1 + spot + i >= fieldTerms.Count)
                                {
                                    break;
                                }
                                Term term = fieldTerms[1 + spot + i];
                                if (!term.Field.Equals(field, StringComparison.Ordinal))
                                {
                                    Assert.IsFalse(te.MoveNext());
                                    break;
                                }
                                else
                                {
                                    Assert.IsTrue(te.MoveNext());
                                    BytesRef t = te.Term;

                                    if (Verbose)
                                    {
                                        Console.WriteLine("  got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                                        Console.WriteLine("       exp=" + UnicodeUtil.ToHexString(term.Text.ToString()));
                                    }

                                    Assert.AreEqual(term.Bytes, t);
                                }
                            }
                        }
                    }
                }
            }
        }
Beispiel #6
0
        // randomly seeks to term that we know exists, then next's
        // from there
        private void DoTestSeekExists(Random r, IList <Term> fieldTerms, IndexReader reader)
        {
            IDictionary <string, TermsEnum> tes = new Dictionary <string, TermsEnum>();

            // Test random seek to existing term, then enum:
            if (Verbose)
            {
                Console.WriteLine("\nTEST: top now seek");
            }

            int num = AtLeast(100);

            for (int iter = 0; iter < num; iter++)
            {
                // pick random field+term
                int    spot  = r.Next(fieldTerms.Count);
                Term   term  = fieldTerms[spot];
                string field = term.Field;

                if (Verbose)
                {
                    Console.WriteLine("TEST: exist seek field=" + field + " term=" + UnicodeUtil.ToHexString(term.Text));
                }

                // seek to it
                if (!tes.TryGetValue(field, out TermsEnum te))
                {
                    te         = MultiFields.GetTerms(reader, field).GetEnumerator();
                    tes[field] = te;
                }

                if (Verbose)
                {
                    Console.WriteLine("  done get enum");
                }

                // seek should find the term
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(term.Bytes));

                // now .next() this many times:
                int ct = TestUtil.NextInt32(r, 5, 100);
                for (int i = 0; i < ct; i++)
                {
                    if (Verbose)
                    {
                        Console.WriteLine("TEST: now next()");
                    }
                    if (1 + spot + i >= fieldTerms.Count)
                    {
                        break;
                    }
                    term = fieldTerms[1 + spot + i];
                    if (!term.Field.Equals(field, StringComparison.Ordinal))
                    {
                        Assert.IsFalse(te.MoveNext());
                        break;
                    }
                    else
                    {
                        Assert.IsTrue(te.MoveNext());
                        BytesRef t = te.Term;

                        if (Verbose)
                        {
                            Console.WriteLine("  got term=" + (t == null ? null : UnicodeUtil.ToHexString(t.Utf8ToString())));
                            Console.WriteLine("       exp=" + UnicodeUtil.ToHexString(term.Text.ToString()));
                        }

                        Assert.AreEqual(term.Bytes, t);
                    }
                }
            }
        }
Beispiel #7
0
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E.  this seek
            // "finishes" the dance at this character
            // position.
            private bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try pop");
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(newSuffixStart <= prevTerm.Length);
                    Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
                }

                if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart))
                {
                    // Seek type 2 -- put 0xFF at this position:
                    scratchTerm.Bytes[newSuffixStart] = 0xff;
                    scratchTerm.Length = newSuffixStart + 1;

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
                    }

                    // TODO: more efficient seek?  can we simply swap
                    // the enums?
                    outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

                    Term t2 = termEnum.Term();

                    // We could hit EOF or different field since this
                    // was a seek "forward":
                    if (t2 != null && t2.Field == internedFieldName)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text) + " " + t2.Bytes);
                        }

                        BytesRef b2 = t2.Bytes;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(b2.Offset == 0);
                        }

                        // Set newSuffixStart -- we can't use
                        // termEnum's since the above seek may have
                        // done no scanning (eg, term was precisely
                        // and index term, or, was in the term seek
                        // cache):
                        scratchTerm.CopyBytes(b2);
                        SetNewSuffixStart(prevTerm, scratchTerm);

                        return(true);
                    }
                    else if (newSuffixStart != 0 || scratchTerm.Length != 0)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=null (or next field)");
                        }
                        newSuffixStart     = 0;
                        scratchTerm.Length = 0;
                        return(true);
                    }
                }

                return(false);
            }
Beispiel #8
0
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);
                }

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 is null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(b2.Offset == 0);
                }

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }
Beispiel #9
0
        public virtual void TestRandom()
        {
            int num = AtLeast(2);

            for (int iter = 0; iter < num; iter++)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: iter=" + iter);
                }

                Directory dir = NewDirectory();

                IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NoMergePolicy.COMPOUND_FILES));
                // we can do this because we use NoMergePolicy (and dont merge to "nothing")
                w.KeepFullyDeletedSegments = true;

                IDictionary <BytesRef, IList <int?> > docs = new Dictionary <BytesRef, IList <int?> >();
                ISet <int?>      deleted = new JCG.HashSet <int?>();
                IList <BytesRef> terms   = new List <BytesRef>();

                int numDocs            = TestUtil.NextInt32(Random, 1, 100 * RANDOM_MULTIPLIER);
                Documents.Document doc = new Documents.Document();
                Field f = NewStringField("field", "", Field.Store.NO);
                doc.Add(f);
                Field id = NewStringField("id", "", Field.Store.NO);
                doc.Add(id);

                bool onlyUniqueTerms = Random.NextBoolean();
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
                }
                ISet <BytesRef> uniqueTerms = new JCG.HashSet <BytesRef>();
                for (int i = 0; i < numDocs; i++)
                {
                    if (!onlyUniqueTerms && Random.NextBoolean() && terms.Count > 0)
                    {
                        // re-use existing term
                        BytesRef term = terms[Random.Next(terms.Count)];
                        docs[term].Add(i);
                        f.SetStringValue(term.Utf8ToString());
                    }
                    else
                    {
                        string   s    = TestUtil.RandomUnicodeString(Random, 10);
                        BytesRef term = new BytesRef(s);
                        if (!docs.TryGetValue(term, out IList <int?> docsTerm))
                        {
                            docs[term] = docsTerm = new List <int?>();
                        }
                        docsTerm.Add(i);
                        terms.Add(term);
                        uniqueTerms.Add(term);
                        f.SetStringValue(s);
                    }
                    id.SetStringValue("" + i);
                    w.AddDocument(doc);
                    if (Random.Next(4) == 1)
                    {
                        w.Commit();
                    }
                    if (i > 0 && Random.Next(20) == 1)
                    {
                        int delID = Random.Next(i);
                        deleted.Add(delID);
                        w.DeleteDocuments(new Term("id", "" + delID));
                        if (VERBOSE)
                        {
                            Console.WriteLine("TEST: delete " + delID);
                        }
                    }
                }

                if (VERBOSE)
                {
                    List <BytesRef> termsList = new List <BytesRef>(uniqueTerms);
#pragma warning disable 612, 618
                    termsList.Sort(BytesRef.UTF8SortedAsUTF16Comparer);
#pragma warning restore 612, 618
                    Console.WriteLine("TEST: terms in UTF16 order:");
                    foreach (BytesRef b in termsList)
                    {
                        Console.WriteLine("  " + UnicodeUtil.ToHexString(b.Utf8ToString()) + " " + b);
                        foreach (int docID in docs[b])
                        {
                            if (deleted.Contains(docID))
                            {
                                Console.WriteLine("    " + docID + " (deleted)");
                            }
                            else
                            {
                                Console.WriteLine("    " + docID);
                            }
                        }
                    }
                }

                IndexReader reader = w.GetReader();
                w.Dispose();
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: reader=" + reader);
                }

                IBits liveDocs = MultiFields.GetLiveDocs(reader);
                foreach (int delDoc in deleted)
                {
                    Assert.IsFalse(liveDocs.Get(delDoc));
                }

                for (int i = 0; i < 100; i++)
                {
                    BytesRef term = terms[Random.Next(terms.Count)];
                    if (VERBOSE)
                    {
                        Console.WriteLine("TEST: seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " " + term);
                    }

                    DocsEnum docsEnum = TestUtil.Docs(Random, reader, "field", term, liveDocs, null, DocsFlags.NONE);
                    Assert.IsNotNull(docsEnum);

                    foreach (int docID in docs[term])
                    {
                        if (!deleted.Contains(docID))
                        {
                            Assert.AreEqual(docID, docsEnum.NextDoc());
                        }
                    }
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }

                reader.Dispose();
                dir.Dispose();
            }
        }
Beispiel #10
0
            public override SeekStatus SeekCeil(BytesRef term)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }
                skipNext = false;
                TermInfosReader tis = outerInstance.TermsDict;
                Term            t0  = new Term(fieldInfo.Name, term);

                Debug.Assert(termEnum != null);

                tis.SeekEnum(termEnum, t0, false);

                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
                {
                    // If we found an exact match, no need to do the
                    // surrogate dance
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek exact match");
                    }
                    current = t.Bytes;
                    return(SeekStatus.FOUND);
                }
                else if (t == null || t.Field != internedFieldName)
                {
                    // TODO: maybe we can handle this like the next()
                    // into null?  set term as prevTerm then dance?

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit EOF");
                    }

                    // We hit EOF; try end-case surrogate dance: if we
                    // find an E, try swapping in S, backwards:
                    scratchTerm.CopyBytes(term);

                    Debug.Assert(scratchTerm.Offset == 0);

                    for (int i = scratchTerm.Length - 1; i >= 0; i--)
                    {
                        if (IsHighBMPChar(scratchTerm.Bytes, i))
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("    found E pos=" + i + "; try seek");
                            }

                            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                            {
                                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                                newSuffixStart = 1 + i;

                                DoPushes();

                                // Found a match
                                // TODO: faster seek?
                                current = termEnum.Term().Bytes;
                                return(SeekStatus.NOT_FOUND);
                            }
                        }
                    }

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek END");
                    }

                    current = null;
                    return(SeekStatus.END);
                }
                else
                {
                    // We found a non-exact but non-null term; this one
                    // is fun -- just treat it like next, by pretending
                    // requested term was prev:
                    prevTerm.CopyBytes(term);

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
                    }

                    BytesRef br = t.Bytes;
                    Debug.Assert(br.Offset == 0);

                    SetNewSuffixStart(term, br);

                    SurrogateDance();

                    Term t2 = termEnum.Term();
                    if (t2 == null || t2.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        Debug.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        current = null;
                        return(SeekStatus.END);
                    }
                    else
                    {
                        current = t2.Bytes;
                        Debug.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
                        return(SeekStatus.NOT_FOUND);
                    }
                }
            }
Beispiel #11
0
            // Look for seek type 1 ("push"): if the newly added
            // suffix contains any S, we must try to seek to the
            // corresponding E.  If we find a match, we go there;
            // else we keep looking for additional S's in the new
            // suffix.  this "starts" the dance, at this character
            // position:
            private void DoPushes()
            {
                int upTo = newSuffixStart;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
                }

                while (upTo < scratchTerm.Length)
                {
                    if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
                    {
                        // A non-BMP char (4 bytes UTF8) starts here:
                        Debug.Assert(scratchTerm.Length >= upTo + 4);

                        int savLength = scratchTerm.Length;
                        scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
                        scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
                        scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

                        scratchTerm.Bytes[upTo]     = (byte)UTF8_HIGH_BMP_LEAD;
                        scratchTerm.Bytes[upTo + 1] = 0x80;
                        scratchTerm.Bytes[upTo + 2] = 0x80;
                        scratchTerm.Length          = upTo + 3;

                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                        }

                        // Seek "forward":
                        // TODO: more efficient seek?
                        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

                        scratchTerm.Bytes[upTo]     = (byte)scratch[0];
                        scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
                        scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
                        scratchTerm.Length          = savLength;

                        // Did we find a match?
                        Term t2 = seekTermEnum.Term();

                        if (DEBUG_SURROGATES)
                        {
                            if (t2 == null)
                            {
                                Console.WriteLine("      hit term=null");
                            }
                            else
                            {
                                Console.WriteLine("      hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes));
                            }
                        }

                        // Since this was a seek "forward", we could hit
                        // EOF or a different field:
                        bool matches;

                        if (t2 != null && t2.Field == internedFieldName)
                        {
                            BytesRef b2 = t2.Bytes;
                            Debug.Assert(b2.Offset == 0);
                            if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                            {
                                matches = true;
                                for (int i = 0; i < upTo; i++)
                                {
                                    if (scratchTerm.Bytes[i] != b2.Bytes[i])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }
                            }
                            else
                            {
                                matches = false;
                            }
                        }
                        else
                        {
                            matches = false;
                        }

                        if (matches)
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      matches!");
                            }

                            // OK seek "back"
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                            scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                            // +3 because we don't need to check the char
                            // at upTo: we know it's > BMP
                            upTo += 3;

                            // NOTE: we keep iterating, now, since this
                            // can easily "recurse".  Ie, after seeking
                            // forward at a certain char position, we may
                            // find another surrogate in our [new] suffix
                            // and must then do another seek (recurse)
                        }
                        else
                        {
                            upTo++;
                        }
                    }
                    else
                    {
                        upTo++;
                    }
                }
            }
Beispiel #12
0
            // Pre-flex indices store terms in UTF16 sort order, but
            // certain queries require Unicode codepoint order; this
            // method carefully seeks around surrogates to handle
            // this impedance mismatch

            private void SurrogateDance()
            {
                if (!unicodeSortOrder)
                {
                    return;
                }

                // We are invoked after TIS.next() (by UTF16 order) to
                // possibly seek to a different "next" (by unicode
                // order) term.

                // We scan only the "delta" from the last term to the
                // current term, in UTF8 bytes.  We look at 1) the bytes
                // stripped from the prior term, and then 2) the bytes
                // appended to that prior term's prefix.

                // We don't care about specific UTF8 sequences, just
                // the "category" of the UTF16 character.  Category S
                // is a high/low surrogate pair (it non-BMP).
                // Category E is any BMP char > UNI_SUR_LOW_END (and <
                // U+FFFF). Category A is the rest (any unicode char
                // <= UNI_SUR_HIGH_START).

                // The core issue is that pre-flex indices sort the
                // characters as ASE, while flex must sort as AES.  So
                // when scanning, when we hit S, we must 1) seek
                // forward to E and enum the terms there, then 2) seek
                // back to S and enum all terms there, then 3) seek to
                // after E.  Three different seek points (1, 2, 3).

                // We can easily detect S in UTF8: if a byte has
                // prefix 11110 (0xf0), then that byte and the
                // following 3 bytes encode a single unicode codepoint
                // in S.  Similarly, we can detect E: if a byte has
                // prefix 1110111 (0xee), then that byte and the
                // following 2 bytes encode a single unicode codepoint
                // in E.

                // Note that this is really a recursive process --
                // maybe the char at pos 2 needs to dance, but any
                // point in its dance, suddenly pos 4 needs to dance
                // so you must finish pos 4 before returning to pos
                // 2.  But then during pos 4's dance maybe pos 7 needs
                // to dance, etc.  However, despite being recursive,
                // we don't need to hold any state because the state
                // can always be derived by looking at prior term &
                // current term.

                // TODO: can we avoid this copy?
                if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
                {
                    scratchTerm.Length = 0;
                }
                else
                {
                    scratchTerm.CopyBytes(termEnum.Term().Bytes);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  dance");
                    Console.WriteLine("    prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
                    Console.WriteLine("         " + prevTerm.ToString());
                    Console.WriteLine("    term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
                    Console.WriteLine("         " + scratchTerm.ToString());
                }

                // this code assumes TermInfosReader/SegmentTermEnum
                // always use BytesRef.offset == 0
                Debug.Assert(prevTerm.Offset == 0);
                Debug.Assert(scratchTerm.Offset == 0);

                // Need to loop here because we may need to do multiple
                // pops, and possibly a continue in the end, ie:
                //
                //  cont
                //  pop, cont
                //  pop, pop, cont
                //  <nothing>
                //

                while (true)
                {
                    if (DoContinue())
                    {
                        break;
                    }
                    else
                    {
                        if (!DoPop())
                        {
                            break;
                        }
                    }
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  finish bmp ends");
                }

                DoPushes();
            }
Beispiel #13
0
 private string ToHexString(Term t)
 {
     return(t.Field() + ":" + UnicodeUtil.ToHexString(t.Text()));
 }
Beispiel #14
0
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E.  this seek
            // "finishes" the dance at this character
            // position.
            internal virtual bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try pop");
                }

                Debug.Assert(NewSuffixStart <= PrevTerm.Length);
                Debug.Assert(NewSuffixStart < ScratchTerm.Length || NewSuffixStart == 0);

                if (PrevTerm.Length > NewSuffixStart && IsNonBMPChar(PrevTerm.Bytes, NewSuffixStart) && IsHighBMPChar(ScratchTerm.Bytes, NewSuffixStart))
                {
                    // Seek type 2 -- put 0xFF at this position:
                    ScratchTerm.Bytes[NewSuffixStart] = unchecked ((byte)0xff);
                    ScratchTerm.Length = NewSuffixStart + 1;

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(ScratchTerm.Utf8ToString()) + " " + ScratchTerm.ToString());
                    }

                    // TODO: more efficient seek?  can we simply swap
                    // the enums?
                    OuterInstance.TermsDict.SeekEnum(TermEnum, new Term(fieldInfo.Name, ScratchTerm), true);

                    Term t2 = TermEnum.Term();

                    // We could hit EOF or different field since this
                    // was a seek "forward":
                    if (t2 != null && t2.Field() == InternedFieldName)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes());
                        }

                        BytesRef b2 = t2.Bytes();
                        Debug.Assert(b2.Offset == 0);

                        // Set newSuffixStart -- we can't use
                        // termEnum's since the above seek may have
                        // done no scanning (eg, term was precisely
                        // and index term, or, was in the term seek
                        // cache):
                        ScratchTerm.CopyBytes(b2);
                        SetNewSuffixStart(PrevTerm, ScratchTerm);

                        return(true);
                    }
                    else if (NewSuffixStart != 0 || ScratchTerm.Length != 0)
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("      got term=null (or next field)");
                        }
                        NewSuffixStart     = 0;
                        ScratchTerm.Length = 0;
                        return(true);
                    }
                }

                return(false);
            }