Exemple #1
0
        public override SeekStatus SeekCeil(BytesRef text)
        {
            long ord = values.LookupTerm(text);

            if (ord >= 0)
            {
                currentOrd  = ord;
                term.Offset = 0;
                // TODO: is there a cleaner way?
                // term.bytes may be pointing to codec-private byte[]
                // storage, so we must force new byte[] allocation:
                term.Bytes = new byte[text.Length];
                term.CopyBytes(text);
                return(SeekStatus.FOUND);
            }
            else
            {
                currentOrd = -ord - 1;
                if (currentOrd == values.ValueCount)
                {
                    return(SeekStatus.END);
                }
                else
                {
                    // TODO: hmm can we avoid this "extra" lookup?:
                    values.LookupOrd(currentOrd, term);
                    return(SeekStatus.NOT_FOUND);
                }
            }
        }
Exemple #2
0
            // Seek type 2 "continue" (back to the start of the
            // surrogates): scan the stripped suffix from the
            // prior term, backwards. If there was an E in that
            // part, then we try to seek back to S.  If that
            // seek finds a matching term, we go there.
            private bool DoContinue()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try cont");
                }

                int downTo = prevTerm.Length - 1;

                bool didSeek = false;

                int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

                while (downTo > limit)
                {
                    if (IsHighBMPChar(prevTerm.Bytes, downTo))
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    found E pos=" + downTo + " vs len=" + prevTerm.Length);
                        }

                        if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
                        {
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                            //newSuffixStart = downTo+4;
                            newSuffixStart = downTo;
                            scratchTerm.CopyBytes(termEnum.Term().Bytes);
                            didSeek = true;
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      seek!");
                            }
                            break;
                        }
                        else
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      no seek");
                            }
                        }
                    }

                    // Shorten prevTerm in place so that we don't redo
                    // this loop if we come back here:
                    if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
                    {
                        prevTerm.Length = downTo;
                    }

                    downTo--;
                }

                return(didSeek);
            }
Exemple #3
0
 public void Set(Term term)
 {
     if (term == null)
     {
         Reset();
         return;
     }
     Bytes.CopyBytes(term.Bytes);
     Field = String.Intern(term.Field);
     CurrentFieldNumber = -1;
     this.Term          = term;
 }
            public override bool CheckIndexTerm(BytesRef text, TermStats stats)
            {
                // NOTE: we must force the first term per field to be
                // indexed, in case policy doesn't:
                if (_vgtiw._policy.IsIndexTerm(text, stats) || _first)
                {
                    _first = false;
                    return(true);
                }

                _lastTerm.CopyBytes(text);
                return(false);
            }
Exemple #5
0
        private void CheckTermsOrder(IndexReader r, ISet <string> allTerms, bool isTop)
        {
            TermsEnum terms = MultiFields.GetFields(r).GetTerms("f").GetEnumerator();

            BytesRef last = new BytesRef();

            ISet <string> seenTerms = new JCG.HashSet <string>();

            while (terms.MoveNext())
            {
                BytesRef term = terms.Term;

                Assert.IsTrue(last.CompareTo(term) < 0);
                last.CopyBytes(term);

                string s = term.Utf8ToString();
                Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")");
                seenTerms.Add(s);
            }

            if (isTop)
            {
                Assert.IsTrue(allTerms.SetEquals(seenTerms));
            }

            // Test seeking:
            IEnumerator <string> it = seenTerms.GetEnumerator();

            while (it.MoveNext())
            {
                BytesRef tr = new BytesRef(it.Current);
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString()));
            }
        }
Exemple #6
0
        public override void Copy(MutableValue source)
        {
            MutableValueStr s = (MutableValueStr)source;

            Exists = s.Exists;
            Value.CopyBytes(s.Value);
        }
        protected override BytesRef NextSeekTerm(BytesRef term)
        {
            //System.out.println("ATE.nextSeekTerm term=" + term);
            if (term == null)
            {
                Debug.Assert(seekBytesRef.Length == 0);
                // return the empty term, as its valid
                if (runAutomaton.IsAccept(runAutomaton.InitialState))
                {
                    return(seekBytesRef);
                }
            }
            else
            {
                seekBytesRef.CopyBytes(term);
            }

            // seek to the next possible string;
            if (NextString())
            {
                return(seekBytesRef); // reposition
            }
            else
            {
                return(null); // no more possible strings can match
            }
        }
        private void WriteTerm(int fieldNumber, BytesRef term)
        {
            //System.out.println("  tiw.write field=" + fieldNumber + " term=" + term.utf8ToString());

            // TODO: UTF16toUTF8 could tell us this prefix
            // Compute prefix in common with last term:
            int start = 0;
            int limit = term.Length < LastTerm.Length ? term.Length : LastTerm.Length;

            while (start < limit)
            {
                if (term.Bytes[start + term.Offset] != LastTerm.Bytes[start + LastTerm.Offset])
                {
                    break;
                }
                start++;
            }

            int length = term.Length - start;

            Output.WriteVInt32(start);                                  // write shared prefix length
            Output.WriteVInt32(length);                                 // write delta length
            Output.WriteBytes(term.Bytes, start + term.Offset, length); // write delta bytes
            Output.WriteVInt32(fieldNumber);                            // write field num
            LastTerm.CopyBytes(term);
        }
        /// <summary>
        /// Builds the final automaton from a list of entries.
        /// </summary>
        private FST <object> BuildAutomaton(IBytesRefSorter sorter)
        {
            // Build the automaton.
            Outputs <object> outputs = NoOutputs.Singleton;
            object           empty   = outputs.NoOutput;
            Builder <object> builder = new Builder <object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInt32s.DEFAULT, true, 15);

            BytesRef          scratch = new BytesRef();
            BytesRef          entry;
            Int32sRef         scratchIntsRef = new Int32sRef();
            int               count          = 0;
            IBytesRefIterator iter           = sorter.GetIterator();

            while ((entry = iter.Next()) != null)
            {
                count++;
                if (scratch.CompareTo(entry) != 0)
                {
                    builder.Add(Util.Fst.Util.ToInt32sRef(entry, scratchIntsRef), empty);
                    scratch.CopyBytes(entry);
                }
            }

            return(count == 0 ? null : builder.Finish());
        }
Exemple #10
0
 public virtual void TestCopyBytes()
 {
     sbyte[] bytes = new sbyte[] { (sbyte)'a', (sbyte)'b', (sbyte)'c', (sbyte)'d' };
     BytesRef b = new BytesRef(bytes, 1, 3); // bcd
     b.CopyBytes(new BytesRef("bcde"));
     Assert.AreEqual("bcde", b.Utf8ToString());
 }
Exemple #11
0
 private void SetTerm()
 {
     // TODO: is there a cleaner way
     term.Bytes  = new byte[termBuffer.Length];
     term.Offset = 0;
     term.CopyBytes(termBuffer);
 }
 public override bool CheckIndexTerm(BytesRef text, TermStats stats)
 {
     //System.out.println("VGW: index term=" + text.utf8ToString());
     // NOTE: we must force the first term per field to be
     // indexed, in case policy doesn't:
     if (outerInstance.policy.IsIndexTerm(text, stats) || first)
     {
         first = false;
         //System.out.println("  YES");
         return(true);
     }
     else
     {
         lastTerm.CopyBytes(text);
         return(false);
     }
 }
            public override bool CheckIndexTerm(BytesRef text, TermStats stats)
            {
                // First term is first indexed term:
                //System.output.println("FGW: checkIndexTerm text=" + text.utf8ToString());
                if (0 == (_numTerms++ % _fgtiw._termIndexInterval))
                {
                    return(true);
                }

                // save last term just before next index term so we
                // can compute wasted suffix
                if (0 == _numTerms % _fgtiw._termIndexInterval)
                {
                    _lastTerm.CopyBytes(text);
                }

                return(false);
            }
Exemple #14
0
        public virtual void TestUpdateDelteSlices()
        {
            DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
            int size = 200 + Random.Next(500) * RANDOM_MULTIPLIER;

            int?[] ids = new int?[size];
            for (int i = 0; i < ids.Length; i++)
            {
                ids[i] = Random.Next();
            }
            DeleteSlice     slice1       = queue.NewSlice();
            DeleteSlice     slice2       = queue.NewSlice();
            BufferedUpdates bd1          = new BufferedUpdates();
            BufferedUpdates bd2          = new BufferedUpdates();
            int             last1        = 0;
            int             last2        = 0;
            ISet <Term>     uniqueValues = new JCG.HashSet <Term>();

            for (int j = 0; j < ids.Length; j++)
            {
                int?i = ids[j];
                // create an array here since we compare identity below against tailItem
                Term[] term = new Term[] { new Term("id", i.ToString()) };
                uniqueValues.Add(term[0]);
                queue.AddDelete(term);
                if (Random.Next(20) == 0 || j == ids.Length - 1)
                {
                    queue.UpdateSlice(slice1);
                    Assert.IsTrue(slice1.IsTailItem(term));
                    slice1.Apply(bd1, j);
                    AssertAllBetween(last1, j, bd1, ids);
                    last1 = j + 1;
                }
                if (Random.Next(10) == 5 || j == ids.Length - 1)
                {
                    queue.UpdateSlice(slice2);
                    Assert.IsTrue(slice2.IsTailItem(term));
                    slice2.Apply(bd2, j);
                    AssertAllBetween(last2, j, bd2, ids);
                    last2 = j + 1;
                }
                Assert.AreEqual(j + 1, queue.NumGlobalTermDeletes);
            }
            assertEquals(uniqueValues, new JCG.HashSet <Term>(bd1.terms.Keys));
            assertEquals(uniqueValues, new JCG.HashSet <Term>(bd2.terms.Keys));
            var frozenSet = new JCG.HashSet <Term>();

            foreach (Term t in queue.FreezeGlobalBuffer(null).GetTermsEnumerable())
            {
                BytesRef bytesRef = new BytesRef();
                bytesRef.CopyBytes(t.Bytes);
                frozenSet.Add(new Term(t.Field, bytesRef));
            }
            assertEquals(uniqueValues, frozenSet);
            Assert.AreEqual(0, queue.NumGlobalTermDeletes, "num deletes must be 0 after freeze");
        }
Exemple #15
0
        // Currently used only by assert statement
        private int CompareToLastTerm(int fieldNumber, BytesRef term)
        {
            if (lastFieldNumber != fieldNumber)
            {
                int cmp = FieldName(fieldInfos, lastFieldNumber).CompareToOrdinal(FieldName(fieldInfos, fieldNumber));
                // If there is a field named "" (empty string) then we
                // will get 0 on this comparison, yet, it's "OK".  But
                // it's not OK if two different field numbers map to
                // the same name.
                if (cmp != 0 || lastFieldNumber != -1)
                {
                    return(cmp);
                }
            }

            scratchBytes.CopyBytes(term);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(lastTerm.Offset == 0);
            }
            UnicodeUtil.UTF8toUTF16(lastTerm.Bytes, 0, lastTerm.Length, utf16Result1);

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(scratchBytes.Offset == 0);
            }
            UnicodeUtil.UTF8toUTF16(scratchBytes.Bytes, 0, scratchBytes.Length, utf16Result2);

            int len;

            if (utf16Result1.Length < utf16Result2.Length)
            {
                len = utf16Result1.Length;
            }
            else
            {
                len = utf16Result2.Length;
            }

            for (int i = 0; i < len; i++)
            {
                char ch1 = utf16Result1.Chars[i];
                char ch2 = utf16Result2.Chars[i];
                if (ch1 != ch2)
                {
                    return(ch1 - ch2);
                }
            }
            if (utf16Result1.Length == 0 && lastFieldNumber == -1)
            {
                // If there is a field named "" (empty string) with a term text of "" (empty string) then we
                // will get 0 on this comparison, yet, it's "OK".
                return(-1);
            }
            return(utf16Result1.Length - utf16Result2.Length);
        }
            internal virtual void ReadVectors()
            {
                TermAndPostings = new TermAndPostings[NumTerms];
                BytesRef lastTerm = new BytesRef();

                for (int i = 0; i < NumTerms; i++)
                {
                    TermAndPostings t    = new TermAndPostings();
                    BytesRef        term = new BytesRef();
                    term.CopyBytes(lastTerm);
                    int start    = Tvf.ReadVInt();
                    int deltaLen = Tvf.ReadVInt();
                    term.Length = start + deltaLen;
                    term.Grow(term.Length);
                    Tvf.ReadBytes(term.Bytes, start, deltaLen);
                    t.Term = term;
                    int freq = Tvf.ReadVInt();
                    t.Freq = freq;

                    if (StorePositions)
                    {
                        int[] positions = new int[freq];
                        int   pos       = 0;
                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int delta = Tvf.ReadVInt();
                            if (delta == -1)
                            {
                                delta = 0; // LUCENE-1542 correction
                            }
                            pos += delta;
                            positions[posUpto] = pos;
                        }
                        t.Positions = positions;
                    }

                    if (StoreOffsets)
                    {
                        int[] startOffsets = new int[freq];
                        int[] endOffsets   = new int[freq];
                        int   offset       = 0;
                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            startOffsets[posUpto] = offset + Tvf.ReadVInt();
                            offset = endOffsets[posUpto] = startOffsets[posUpto] + Tvf.ReadVInt();
                        }
                        t.StartOffsets = startOffsets;
                        t.EndOffsets   = endOffsets;
                    }
                    lastTerm.CopyBytes(term);
                    TermAndPostings[i] = t;
                }
            }
Exemple #17
0
        public virtual void TestStressDeleteQueue()
        {
            DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
            ISet <Term> uniqueValues         = new JCG.HashSet <Term>();
            int         size = 10000 + Random.Next(500) * RANDOM_MULTIPLIER;

            int?[] ids = new int?[size];
            for (int i = 0; i < ids.Length; i++)
            {
                ids[i] = Random.Next();
                uniqueValues.Add(new Term("id", ids[i].ToString()));
            }
            CountdownEvent latch      = new CountdownEvent(1);
            AtomicInt32    index      = new AtomicInt32(0);
            int            numThreads = 2 + Random.Next(5);

            UpdateThread[] threads = new UpdateThread[numThreads];
            for (int i = 0; i < threads.Length; i++)
            {
                threads[i] = new UpdateThread(queue, index, ids, latch);
                threads[i].Start();
            }
            latch.Signal();
            for (int i = 0; i < threads.Length; i++)
            {
                threads[i].Join();
            }

            foreach (UpdateThread updateThread in threads)
            {
                DeleteSlice slice = updateThread.Slice;
                queue.UpdateSlice(slice);
                BufferedUpdates deletes = updateThread.Deletes;
                slice.Apply(deletes, BufferedUpdates.MAX_INT32);
                assertEquals(uniqueValues, new JCG.HashSet <Term>(deletes.terms.Keys));
            }
            queue.TryApplyGlobalSlice();
            ISet <Term> frozenSet = new JCG.HashSet <Term>();

            foreach (Term t in queue.FreezeGlobalBuffer(null).GetTermsEnumerable())
            {
                BytesRef bytesRef = new BytesRef();
                bytesRef.CopyBytes(t.Bytes);
                frozenSet.Add(new Term(t.Field, bytesRef));
            }
            Assert.AreEqual(0, queue.NumGlobalTermDeletes, "num deletes must be 0 after freeze");
            Assert.AreEqual(uniqueValues.Count, frozenSet.Count);
            assertEquals(uniqueValues, frozenSet);
        }
Exemple #18
0
 // single straight enum
 private void DoTestStraightEnum(IList <Term> fieldTerms, IndexReader reader, int uniqueTermCount)
 {
     if (Verbose)
     {
         Console.WriteLine("\nTEST: top now enum reader=" + reader);
     }
     Fields fields = MultiFields.GetFields(reader);
     {
         // Test straight enum:
         int termCount = 0;
         foreach (string field in fields)
         {
             Terms terms = fields.GetTerms(field);
             Assert.IsNotNull(terms);
             TermsEnum termsEnum = terms.GetEnumerator();
             BytesRef  text;
             BytesRef  lastText = null;
             while (termsEnum.MoveNext())
             {
                 text = termsEnum.Term;
                 Term exp = fieldTerms[termCount];
                 if (Verbose)
                 {
                     Console.WriteLine("  got term=" + field + ":" + UnicodeUtil.ToHexString(text.Utf8ToString()));
                     Console.WriteLine("       exp=" + exp.Field + ":" + UnicodeUtil.ToHexString(exp.Text));
                     Console.WriteLine();
                 }
                 if (lastText == null)
                 {
                     lastText = BytesRef.DeepCopyOf(text);
                 }
                 else
                 {
                     Assert.IsTrue(lastText.CompareTo(text) < 0);
                     lastText.CopyBytes(text);
                 }
                 Assert.AreEqual(exp.Field, field);
                 Assert.AreEqual(exp.Bytes, text);
                 termCount++;
             }
             if (Verbose)
             {
                 Console.WriteLine("  no more terms for field=" + field);
             }
         }
         Assert.AreEqual(uniqueTermCount, termCount);
     }
 }
 public BytesRef Next()
 {
     if (termsEnum != null)
     {
         BytesRef next;
         while ((next = termsEnum.Next()) != null)
         {
             if (IsFrequent(termsEnum.DocFreq()))
             {
                 freq = termsEnum.DocFreq();
                 spare.CopyBytes(next);
                 return(spare);
             }
         }
     }
     return(null);
 }
Exemple #20
0
 private bool CompareToLastTerm(BytesRef t)
 {
     if (lastTerm == null && t != null)
     {
         lastTerm = BytesRef.DeepCopyOf(t);
     }
     else if (t == null)
     {
         lastTerm = null;
     }
     else
     {
         Debug.Assert(termsEnum.Comparer.Compare(lastTerm, t) < 0, "lastTerm=" + lastTerm + " t=" + t);
         lastTerm.CopyBytes(t);
     }
     return(true);
 }
Exemple #21
0
 public bool MoveNext()
 {
     if (!(termsEnum is null))
     {
         while (termsEnum.MoveNext())
         {
             if (IsFrequent(termsEnum.DocFreq))
             {
                 freq = termsEnum.DocFreq;
                 spare.CopyBytes(termsEnum.Term);
                 current = spare;
                 return(true);
             }
         }
     }
     current = null;
     return(false);
 }
Exemple #22
0
 private bool CompareToLastTerm(BytesRef t)
 {
     if (lastTerm == null && t != null)
     {
         lastTerm = BytesRef.DeepCopyOf(t);
     }
     else if (t == null)
     {
         lastTerm = null;
     }
     else
     {
         if (Debugging.AssertsEnabled)
         {
             Debugging.Assert(termsEnum.Comparer.Compare(lastTerm, t) < 0, "lastTerm={0} t={1}", lastTerm, t);
         }
         lastTerm.CopyBytes(t);
     }
     return(true);
 }
Exemple #23
0
        public BytesRef Next()
        {
            // LUCENENET NOTE: We moved the cursor when
            // the instance was created. Make sure we don't
            // move it again until the second call to Next().
            if (first && current != null)
            {
                first = false;
            }
            else if (i.MoveNext())
            {
                current = i.Current;
            }
            else
            {
                return(null);
            }

            spare.CopyBytes(current.term);
            return(spare);
        }
        public override void StartTerm(BytesRef term, int freq)
        {
            int prefix = StringHelper.BytesDifference(lastTerm, term);
            int suffix = term.Length - prefix;

            tvf.WriteVInt32(prefix);
            tvf.WriteVInt32(suffix);
            tvf.WriteBytes(term.Bytes, term.Offset + prefix, suffix);
            tvf.WriteVInt32(freq);
            lastTerm.CopyBytes(term);
            lastPosition = lastOffset = 0;

            if (offsets && positions)
            {
                // we might need to buffer if its a non-bulk merge
                offsetStartBuffer = ArrayUtil.Grow(offsetStartBuffer, freq);
                offsetEndBuffer   = ArrayUtil.Grow(offsetEndBuffer, freq);
                offsetIndex       = 0;
                offsetFreq        = freq;
            }
        }
        public override void StartTerm(BytesRef term, int freq)
        {
            int prefix = StringHelper.BytesDifference(LastTerm, term);
            int suffix = term.Length - prefix;

            Tvf.WriteVInt(prefix);
            Tvf.WriteVInt(suffix);
            Tvf.WriteBytes(term.Bytes, term.Offset + prefix, suffix);
            Tvf.WriteVInt(freq);
            LastTerm.CopyBytes(term);
            LastPosition = LastOffset = 0;

            if (Offsets && Positions)
            {
                // we might need to buffer if its a non-bulk merge
                OffsetStartBuffer = ArrayUtil.Grow(OffsetStartBuffer, freq);
                OffsetEndBuffer   = ArrayUtil.Grow(OffsetEndBuffer, freq);
            }
            BufferedIndex      = 0;
            BufferedFreq       = freq;
            PayloadData.Length = 0;
        }
 public override bool BytesVal(int doc, BytesRef target)
 {
     target.CopyBytes(outerInstance.bytesRef);
     return true;
 }
Exemple #27
0
            public override BytesRef Next()
            {
                if (nextTerm >= numTerms)
                {
                    return(null);
                }
                term.CopyBytes(lastTerm);
                int start    = tvf.ReadVInt32();
                int deltaLen = tvf.ReadVInt32();

                term.Length = start + deltaLen;
                term.Grow(term.Length);
                tvf.ReadBytes(term.Bytes, start, deltaLen);
                freq = tvf.ReadVInt32();

                if (storePayloads)
                {
                    positions      = new int[freq];
                    payloadOffsets = new int[freq];
                    int totalPayloadLength = 0;
                    int pos = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        int code = tvf.ReadVInt32();
                        pos += (int)((uint)code >> 1);
                        positions[posUpto] = pos;
                        if ((code & 1) != 0)
                        {
                            // length change
                            lastPayloadLength = tvf.ReadVInt32();
                        }
                        payloadOffsets[posUpto] = totalPayloadLength;
                        totalPayloadLength     += lastPayloadLength;
                        Debug.Assert(totalPayloadLength >= 0);
                    }
                    payloadData = new byte[totalPayloadLength];
                    tvf.ReadBytes(payloadData, 0, payloadData.Length);
                } // no payloads
                else if (storePositions)
                {
                    // TODO: we could maybe reuse last array, if we can
                    // somehow be careful about consumer never using two
                    // D&PEnums at once...
                    positions = new int[freq];
                    int pos = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        pos += tvf.ReadVInt32();
                        positions[posUpto] = pos;
                    }
                }

                if (storeOffsets)
                {
                    startOffsets = new int[freq];
                    endOffsets   = new int[freq];
                    int offset = 0;
                    for (int posUpto = 0; posUpto < freq; posUpto++)
                    {
                        startOffsets[posUpto] = offset + tvf.ReadVInt32();
                        offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.ReadVInt32();
                    }
                }

                lastTerm.CopyBytes(term);
                nextTerm++;
                return(term);
            }
Exemple #28
0
                /// <remarks>
                /// TODO: we may want an alternate mode here which is
                /// "if you are about to return NOT_FOUND I won't use
                /// the terms data from that"; eg FuzzyTermsEnum will
                /// (usually) just immediately call seek again if we
                /// return NOT_FOUND so it's a waste for us to fill in
                /// the term that was actually NOT_FOUND
                /// </remarks>
                public override SeekStatus SeekCeil(BytesRef target)
                {
                    if (indexEnum == null)
                    {
                        throw new InvalidOperationException("terms index was not loaded");
                    }

                    //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
                    if (didIndexNext)
                    {
                        if (nextIndexTerm == null)
                        {
                            //System.out.println("  nextIndexTerm=null");
                        }
                        else
                        {
                            //System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
                        }
                    }

                    bool doSeek = true;

                    // See if we can avoid seeking, because target term
                    // is after current term but before next index term:
                    if (indexIsCurrent)
                    {
                        int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target);

                        if (cmp == 0)
                        {
                            // Already at the requested term
                            return(SeekStatus.FOUND);
                        }
                        else if (cmp < 0)
                        {
                            // Target term is after current term
                            if (!didIndexNext)
                            {
                                if (indexEnum.Next() == -1)
                                {
                                    nextIndexTerm = null;
                                }
                                else
                                {
                                    nextIndexTerm = indexEnum.Term;
                                }
                                //System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                                didIndexNext = true;
                            }

                            if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0)
                            {
                                // Optimization: requested term is within the
                                // same term block we are now in; skip seeking
                                // (but do scanning):
                                doSeek = false;
                                //System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
                            }
                        }
                    }

                    if (doSeek)
                    {
                        //System.out.println("  seek");

                        // Ask terms index to find biggest indexed term (=
                        // first term in a block) that's <= our text:
                        input.Seek(indexEnum.Seek(target));
                        bool result = NextBlock();

                        // Block must exist since, at least, the indexed term
                        // is in the block:
                        Debug.Assert(result);

                        indexIsCurrent  = true;
                        didIndexNext    = false;
                        blocksSinceSeek = 0;

                        if (doOrd)
                        {
                            state.Ord = indexEnum.Ord - 1;
                        }

                        term.CopyBytes(indexEnum.Term);
                        //System.out.println("  seek: term=" + term.utf8ToString());
                    }
                    else
                    {
                        //System.out.println("  skip seek");
                        if (state.TermBlockOrd == blockTermCount && !NextBlock())
                        {
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                    }

                    seekPending = false;

                    int common = 0;

                    // Scan within block.  We could do this by calling
                    // _next() and testing the resulting term, but this
                    // is wasteful.  Instead, we first confirm the
                    // target matches the common prefix of this block,
                    // and then we scan the term bytes directly from the
                    // termSuffixesreader's byte[], saving a copy into
                    // the BytesRef term per term.  Only when we return
                    // do we then copy the bytes into the term.

                    while (true)
                    {
                        // First, see if target term matches common prefix
                        // in this block:
                        if (common < termBlockPrefix)
                        {
                            int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
                            if (cmp < 0)
                            {
                                // TODO: maybe we should store common prefix
                                // in block header?  (instead of relying on
                                // last term of previous block)

                                // Target's prefix is after the common block
                                // prefix, so term cannot be in this block
                                // but it could be in next block.  We
                                // must scan to end-of-block to set common
                                // prefix for next block:
                                if (state.TermBlockOrd < blockTermCount)
                                {
                                    while (state.TermBlockOrd < blockTermCount - 1)
                                    {
                                        state.TermBlockOrd++;
                                        state.Ord++;
                                        termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32());
                                    }
                                    int suffix = termSuffixesReader.ReadVInt32();
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                }
                                state.Ord++;

                                if (!NextBlock())
                                {
                                    indexIsCurrent = false;
                                    return(SeekStatus.END);
                                }
                                common = 0;
                            }
                            else if (cmp > 0)
                            {
                                // Target's prefix is before the common prefix
                                // of this block, so we position to start of
                                // block and return NOT_FOUND:
                                Debug.Assert(state.TermBlockOrd == 0);

                                int suffix = termSuffixesReader.ReadVInt32();
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                return(SeekStatus.NOT_FOUND);
                            }
                            else
                            {
                                common++;
                            }

                            continue;
                        }

                        // Test every term in this block
                        while (true)
                        {
                            state.TermBlockOrd++;
                            state.Ord++;

                            int suffix = termSuffixesReader.ReadVInt32();

                            // We know the prefix matches, so just compare the new suffix:
                            int termLen = termBlockPrefix + suffix;
                            int bytePos = termSuffixesReader.Position;

                            bool next      = false;
                            int  limit     = target.Offset + (termLen < target.Length ? termLen : target.Length);
                            int  targetPos = target.Offset + termBlockPrefix;
                            while (targetPos < limit)
                            {
                                int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                                if (cmp < 0)
                                {
                                    // Current term is still before the target;
                                    // keep scanning
                                    next = true;
                                    break;
                                }
                                else if (cmp > 0)
                                {
                                    // Done!  Current term is after target. Stop
                                    // here, fill in real term, return NOT_FOUND.
                                    term.Length = termBlockPrefix + suffix;
                                    if (term.Bytes.Length < term.Length)
                                    {
                                        term.Grow(term.Length);
                                    }
                                    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            if (!next && target.Length <= termLen)
                            {
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);

                                if (target.Length == termLen)
                                {
                                    // Done!  Exact match.  Stop here, fill in
                                    // real term, return FOUND.
                                    //System.out.println("  FOUND");
                                    return(SeekStatus.FOUND);
                                }
                                else
                                {
                                    //System.out.println("  NOT_FOUND");
                                    return(SeekStatus.NOT_FOUND);
                                }
                            }

                            if (state.TermBlockOrd == blockTermCount)
                            {
                                // Must pre-fill term for next block's common prefix
                                term.Length = termBlockPrefix + suffix;
                                if (term.Bytes.Length < term.Length)
                                {
                                    term.Grow(term.Length);
                                }
                                termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
                                break;
                            }
                            else
                            {
                                termSuffixesReader.SkipBytes(suffix);
                            }
                        }

                        // The purpose of the terms dict index is to seek
                        // the enum to the closest index term before the
                        // term we are looking for.  So, we should never
                        // cross another index term (besides the first
                        // one) while we are scanning:

                        Debug.Assert(indexIsCurrent);

                        if (!NextBlock())
                        {
                            //System.out.println("  END");
                            indexIsCurrent = false;
                            return(SeekStatus.END);
                        }
                        common = 0;
                    }
                }
Exemple #29
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
        /// <summary>
        /// expert: writes a value dictionary for a sorted/sortedset field </summary>
        protected internal virtual void AddTermsDict(FieldInfo field, IEnumerable<BytesRef> values)
        {
            // first check if its a "fixed-length" terms dict
            int minLength = int.MaxValue;
            int maxLength = int.MinValue;
            foreach (BytesRef v in values)
            {
                minLength = Math.Min(minLength, v.Length);
                maxLength = Math.Max(maxLength, v.Length);
            }
            if (minLength == maxLength)
            {
                // no index needed: direct addressing by mult
                AddBinaryField(field, values);
            }
            else
            {
                // header
                Meta.WriteVInt(field.Number);
                Meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
                Meta.WriteVInt(BINARY_PREFIX_COMPRESSED);
                Meta.WriteLong(-1L);
                // now write the bytes: sharing prefixes within a block
                long startFP = Data.FilePointer;
                // currently, we have to store the delta from expected for every 1/nth term
                // we could avoid this, but its not much and less overall RAM than the previous approach!
                RAMOutputStream addressBuffer = new RAMOutputStream();
                MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
                BytesRef lastTerm = new BytesRef();
                long count = 0;
                foreach (BytesRef v in values)
                {
                    if (count % ADDRESS_INTERVAL == 0)
                    {
                        termAddresses.Add(Data.FilePointer - startFP);
                        // force the first term in a block to be abs-encoded
                        lastTerm.Length = 0;
                    }

                    // prefix-code
                    int sharedPrefix = StringHelper.BytesDifference(lastTerm, v);
                    Data.WriteVInt(sharedPrefix);
                    Data.WriteVInt(v.Length - sharedPrefix);
                    Data.WriteBytes(v.Bytes, v.Offset + sharedPrefix, v.Length - sharedPrefix);
                    lastTerm.CopyBytes(v);
                    count++;
                }
                long indexStartFP = Data.FilePointer;
                // write addresses of indexed terms
                termAddresses.Finish();
                addressBuffer.WriteTo(Data);
                addressBuffer = null;
                termAddresses = null;
                Meta.WriteVInt(minLength);
                Meta.WriteVInt(maxLength);
                Meta.WriteVLong(count);
                Meta.WriteLong(startFP);
                Meta.WriteVInt(ADDRESS_INTERVAL);
                Meta.WriteLong(indexStartFP);
                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Meta.WriteVInt(BLOCK_SIZE);
            }
        }
        /// <summary>
        /// expert: writes a value dictionary for a sorted/sortedset field </summary>
        protected internal virtual void AddTermsDict(FieldInfo field, IEnumerable <BytesRef> values)
        {
            // first check if its a "fixed-length" terms dict
            int minLength = int.MaxValue;
            int maxLength = int.MinValue;

            foreach (BytesRef v in values)
            {
                minLength = Math.Min(minLength, v.Length);
                maxLength = Math.Max(maxLength, v.Length);
            }
            if (minLength == maxLength)
            {
                // no index needed: direct addressing by mult
                AddBinaryField(field, values);
            }
            else
            {
                // header
                Meta.WriteVInt(field.Number);
                Meta.WriteByte((byte)Lucene45DocValuesFormat.BINARY);
                Meta.WriteVInt(BINARY_PREFIX_COMPRESSED);
                Meta.WriteLong(-1L);
                // now write the bytes: sharing prefixes within a block
                long startFP = Data.FilePointer;
                // currently, we have to store the delta from expected for every 1/nth term
                // we could avoid this, but its not much and less overall RAM than the previous approach!
                RAMOutputStream            addressBuffer = new RAMOutputStream();
                MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
                BytesRef lastTerm = new BytesRef();
                long     count    = 0;
                foreach (BytesRef v in values)
                {
                    if (count % ADDRESS_INTERVAL == 0)
                    {
                        termAddresses.Add(Data.FilePointer - startFP);
                        // force the first term in a block to be abs-encoded
                        lastTerm.Length = 0;
                    }

                    // prefix-code
                    int sharedPrefix = StringHelper.BytesDifference(lastTerm, v);
                    Data.WriteVInt(sharedPrefix);
                    Data.WriteVInt(v.Length - sharedPrefix);
                    Data.WriteBytes(v.Bytes, v.Offset + sharedPrefix, v.Length - sharedPrefix);
                    lastTerm.CopyBytes(v);
                    count++;
                }
                long indexStartFP = Data.FilePointer;
                // write addresses of indexed terms
                termAddresses.Finish();
                addressBuffer.WriteTo(Data);
                addressBuffer = null;
                termAddresses = null;
                Meta.WriteVInt(minLength);
                Meta.WriteVInt(maxLength);
                Meta.WriteVLong(count);
                Meta.WriteLong(startFP);
                Meta.WriteVInt(ADDRESS_INTERVAL);
                Meta.WriteLong(indexStartFP);
                Meta.WriteVInt(PackedInts.VERSION_CURRENT);
                Meta.WriteVInt(BLOCK_SIZE);
            }
        }
Exemple #32
0
        public override SeekStatus SeekCeil(BytesRef term)
        {
            queue.Clear();
            numTop        = 0;
            lastSeekExact = false;

            bool seekOpt = false;

            if (lastSeek != null && termComp.Compare(lastSeek, term) <= 0)
            {
                seekOpt = true;
            }

            lastSeekScratch.CopyBytes(term);
            lastSeek = lastSeekScratch;

            for (int i = 0; i < numSubs; i++)
            {
                SeekStatus status;
                // LUCENE-2130: if we had just seek'd already, prior
                // to this seek, and the new seek term is after the
                // previous one, don't try to re-seek this sub if its
                // current term is already beyond this new seek term.
                // Doing so is a waste because this sub will simply
                // seek to the same spot.
                if (seekOpt)
                {
                    BytesRef curTerm = currentSubs[i].Current;
                    if (curTerm != null)
                    {
                        int cmp = termComp.Compare(term, curTerm);
                        if (cmp == 0)
                        {
                            status = SeekStatus.FOUND;
                        }
                        else if (cmp < 0)
                        {
                            status = SeekStatus.NOT_FOUND;
                        }
                        else
                        {
                            status = currentSubs[i].Terms.SeekCeil(term);
                        }
                    }
                    else
                    {
                        status = SeekStatus.END;
                    }
                }
                else
                {
                    status = currentSubs[i].Terms.SeekCeil(term);
                }

                if (status == SeekStatus.FOUND)
                {
                    top[numTop++] = currentSubs[i];
                    current       = currentSubs[i].Current = currentSubs[i].Terms.Term;
                }
                else
                {
                    if (status == SeekStatus.NOT_FOUND)
                    {
                        currentSubs[i].Current = currentSubs[i].Terms.Term;
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(currentSubs[i].Current != null);
                        }
                        queue.Add(currentSubs[i]);
                    }
                    else
                    {
                        // enum exhausted
                        currentSubs[i].Current = null;
                    }
                }
            }

            if (numTop > 0)
            {
                // at least one sub had exact match to the requested term
                return(SeekStatus.FOUND);
            }
            else if (queue.Count > 0)
            {
                // no sub had exact match, but at least one sub found
                // a term after the requested term -- advance to that
                // next term:
                PullTop();
                return(SeekStatus.NOT_FOUND);
            }
            else
            {
                return(SeekStatus.END);
            }
        }
        /// <summary>
        /// Builds the final automaton from a list of entries.
        /// </summary>
        private FST<object> BuildAutomaton(BytesRefSorter sorter)
        {
            // Build the automaton.
            Outputs<object> outputs = NoOutputs.Singleton;
            object empty = outputs.NoOutput;
            Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInts.DEFAULT, true, 15);

            BytesRef scratch = new BytesRef();
            BytesRef entry;
            IntsRef scratchIntsRef = new IntsRef();
            int count = 0;
            BytesRefIterator iter = sorter.GetEnumerator();
            while ((entry = iter.Next()) != null)
            {
                count++;
                if (scratch.CompareTo(entry) != 0)
                {
                    builder.Add(Util.Fst.Util.ToIntsRef(entry, scratchIntsRef), empty);
                    scratch.CopyBytes(entry);
                }
            }

            return count == 0 ? null : builder.Finish();
        }
 public virtual void TestUpdateDelteSlices()
 {
     DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
     int size = 200 + Random().Next(500) * RANDOM_MULTIPLIER;
     int?[] ids = new int?[size];
     for (int i = 0; i < ids.Length; i++)
     {
         ids[i] = Random().Next();
     }
     DeleteSlice slice1 = queue.NewSlice();
     DeleteSlice slice2 = queue.NewSlice();
     BufferedUpdates bd1 = new BufferedUpdates();
     BufferedUpdates bd2 = new BufferedUpdates();
     int last1 = 0;
     int last2 = 0;
     HashSet<Term> uniqueValues = new HashSet<Term>();
     for (int j = 0; j < ids.Length; j++)
     {
         int? i = ids[j];
         // create an array here since we compare identity below against tailItem
         Term[] term = new Term[] { new Term("id", i.ToString()) };
         uniqueValues.Add(term[0]);
         queue.AddDelete(term);
         if (Random().Next(20) == 0 || j == ids.Length - 1)
         {
             queue.UpdateSlice(slice1);
             Assert.IsTrue(slice1.IsTailItem(term));
             slice1.Apply(bd1, j);
             AssertAllBetween(last1, j, bd1, ids);
             last1 = j + 1;
         }
         if (Random().Next(10) == 5 || j == ids.Length - 1)
         {
             queue.UpdateSlice(slice2);
             Assert.IsTrue(slice2.IsTailItem(term));
             slice2.Apply(bd2, j);
             AssertAllBetween(last2, j, bd2, ids);
             last2 = j + 1;
         }
         Assert.AreEqual(j + 1, queue.NumGlobalTermDeletes());
     }
     Assert.AreEqual(uniqueValues, bd1.Terms_Nunit().Keys);
     Assert.AreEqual(uniqueValues, bd2.Terms_Nunit().Keys);
     HashSet<Term> frozenSet = new HashSet<Term>();
     foreach (Term t in queue.FreezeGlobalBuffer(null).TermsIterable())
     {
         BytesRef bytesRef = new BytesRef();
         bytesRef.CopyBytes(t.Bytes());
         frozenSet.Add(new Term(t.Field(), bytesRef));
     }
     Assert.AreEqual(uniqueValues, frozenSet);
     Assert.AreEqual(0, queue.NumGlobalTermDeletes(), "num deletes must be 0 after freeze");
 }
        private void CheckTermsOrder(IndexReader r, ISet<string> allTerms, bool isTop)
        {
            TermsEnum terms = MultiFields.GetFields(r).Terms("f").Iterator(null);

            BytesRef last = new BytesRef();

            HashSet<string> seenTerms = new HashSet<string>();

            while (true)
            {
                BytesRef term = terms.Next();
                if (term == null)
                {
                    break;
                }

                Assert.IsTrue(last.CompareTo(term) < 0);
                last.CopyBytes(term);

                string s = term.Utf8ToString();
                Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")");
                seenTerms.Add(s);
            }

            if (isTop)
            {
                Assert.IsTrue(allTerms.SetEquals(seenTerms));
            }

            // Test seeking:
            IEnumerator<string> it = seenTerms.GetEnumerator();
            while (it.MoveNext())
            {
                BytesRef tr = new BytesRef(it.Current);
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString()));
            }
        }
Exemple #36
0
 public override bool BytesVal(int doc, BytesRef target)
 {
     target.CopyBytes(outerInstance.m_bytesRef);
     return(true);
 }
        public virtual void TestStressDeleteQueue()
        {
            DocumentsWriterDeleteQueue queue = new DocumentsWriterDeleteQueue();
            HashSet<Term> uniqueValues = new HashSet<Term>();
            int size = 10000 + Random().Next(500) * RANDOM_MULTIPLIER;
            int?[] ids = new int?[size];
            for (int i = 0; i < ids.Length; i++)
            {
                ids[i] = Random().Next();
                uniqueValues.Add(new Term("id", ids[i].ToString()));
            }
            CountDownLatch latch = new CountDownLatch(1);
            AtomicInteger index = new AtomicInteger(0);
            int numThreads = 2 + Random().Next(5);
            UpdateThread[] threads = new UpdateThread[numThreads];
            for (int i = 0; i < threads.Length; i++)
            {
                threads[i] = new UpdateThread(queue, index, ids, latch);
                threads[i].Start();
            }
            latch.countDown();
            for (int i = 0; i < threads.Length; i++)
            {
                threads[i].Join();
            }

            foreach (UpdateThread updateThread in threads)
            {
                DeleteSlice slice = updateThread.Slice;
                queue.UpdateSlice(slice);
                BufferedUpdates deletes = updateThread.Deletes;
                slice.Apply(deletes, BufferedUpdates.MAX_INT);
                Assert.AreEqual(uniqueValues, deletes.Terms_Nunit().Keys);
            }
            queue.TryApplyGlobalSlice();
            HashSet<Term> frozenSet = new HashSet<Term>();
            foreach (Term t in queue.FreezeGlobalBuffer(null).TermsIterable())
            {
                BytesRef bytesRef = new BytesRef();
                bytesRef.CopyBytes(t.Bytes());
                frozenSet.Add(new Term(t.Field(), bytesRef));
            }
            Assert.AreEqual(0, queue.NumGlobalTermDeletes(), "num deletes must be 0 after freeze");
            Assert.AreEqual(uniqueValues.Count, frozenSet.Count);
            Assert.AreEqual(uniqueValues, frozenSet);
        }