Exemple #1
0
        /// <summary>
        /// Verifies a complete (un-pruned) FST against the expected input/output
        /// entries held in Pairs. The checks run in this order:
        /// every pair is accepted by Run and returned in order by
        /// IntsRefFSTEnum.Next; optional reverse lookups by output (when
        /// DoReverseLookup is set); random accepted words are all present in
        /// Pairs; SeekExact/SeekFloor/SeekCeil for missing and existing terms;
        /// and finally interleaved Next/Seek calls.
        /// </summary>
        /// <param name="inputMode">Passed through to InputToString and ToIntsRef
        /// to format and build terms.</param>
        /// <param name="fst">The FST under test; asserted to be null when Pairs
        /// is empty and non-null otherwise.</param>
        private void VerifyUnPruned(int inputMode, FST <T> fst)
        {
            FST <long>  fstLong;
            ISet <long> validOutputs;
            long        minLong = long.MaxValue;
            long        maxLong = long.MinValue;

            if (DoReverseLookup)
            {
                // NOTE(review): the 'as' casts below yield null when T is not long;
                // presumably DoReverseLookup is only enabled for long outputs -- confirm.
                FST <long> fstLong0 = fst as FST <long>;
                fstLong      = fstLong0;
                validOutputs = new HashSet <long>();
                // Collect every expected output and track the min/max output value.
                foreach (InputOutput <T> pair in Pairs)
                {
                    long?output = pair.Output as long?;
                    maxLong = Math.Max(maxLong, output.Value);
                    minLong = Math.Min(minLong, output.Value);
                    validOutputs.Add(output.Value);
                }
            }
            else
            {
                fstLong      = null;
                validOutputs = null;
            }

            // With no expected pairs the FST itself is expected to be null.
            if (Pairs.Count == 0)
            {
                Assert.IsNull(fst);
                return;
            }

            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: now verify " + Pairs.Count + " terms");
                foreach (InputOutput <T> pair in Pairs)
                {
                    Assert.IsNotNull(pair);
                    Assert.IsNotNull(pair.Input);
                    Assert.IsNotNull(pair.Output);
                    Console.WriteLine("  " + InputToString(inputMode, pair.Input) + ": " + Outputs.OutputToString(pair.Output));
                }
            }

            Assert.IsNotNull(fst);

            // visit valid pairs in order -- make sure all words
            // are accepted, and FSTEnum's next() steps through
            // them correctly
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: check valid terms/next()");
            }
            {
                IntsRefFSTEnum <T> fstEnum = new IntsRefFSTEnum <T>(fst);
                foreach (InputOutput <T> pair in Pairs)
                {
                    IntsRef term = pair.Input;
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine("TEST: check term=" + InputToString(inputMode, term) + " output=" + fst.Outputs.OutputToString(pair.Output));
                    }
                    // Direct lookup of the term must produce the expected output.
                    T output = Run(fst, term, null);
                    Assert.IsNotNull(output, "term " + InputToString(inputMode, term) + " is not accepted");
                    Assert.IsTrue(OutputsEqual(pair.Output, output));

                    // verify enum's next
                    IntsRefFSTEnum <T> .InputOutput <T> t = fstEnum.Next();
                    Assert.IsNotNull(t);
                    Assert.AreEqual(term, t.Input, "expected input=" + InputToString(inputMode, term) + " but fstEnum returned " + InputToString(inputMode, t.Input));
                    Assert.IsTrue(OutputsEqual(pair.Output, t.Output));
                }
                // After the last pair the enum must be exhausted.
                Assert.IsNull(fstEnum.Next());
            }

            // Map from input to expected output, used below to validate randomly
            // accepted words and to reject random terms that already exist.
            IDictionary <IntsRef, T> termsMap = new Dictionary <IntsRef, T>();

            foreach (InputOutput <T> pair in Pairs)
            {
                termsMap[pair.Input] = pair.Output;
            }

            // Only exercised when the expected outputs span more than one value.
            if (DoReverseLookup && maxLong > minLong)
            {
                // Do random lookups so we test null (output doesn't
                // exist) case:
                Assert.IsNull(Util.GetByOutput(fstLong, minLong - 7));
                Assert.IsNull(Util.GetByOutput(fstLong, maxLong + 7));

                int num = LuceneTestCase.AtLeast(Random, 100);
                for (int iter = 0; iter < num; iter++)
                {
                    long    v     = TestUtil.NextLong(Random, minLong, maxLong);
                    IntsRef input = Util.GetByOutput(fstLong, v);
                    // A non-null result is only acceptable for an output that was actually added.
                    Assert.IsTrue(validOutputs.Contains(v) || input == null);
                }
            }

            // find random matching word and make sure it's valid
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: verify random accepted terms");
            }
            IntsRef scratch = new IntsRef(10);
            int     num_    = LuceneTestCase.AtLeast(Random, 500);

            for (int iter = 0; iter < num_; iter++)
            {
                // RandomAcceptedWord fills 'scratch' with the accepted word and returns its output.
                T output = RandomAcceptedWord(fst, scratch);
                Assert.IsTrue(termsMap.ContainsKey(scratch), "accepted word " + InputToString(inputMode, scratch) + " is not valid");
                Assert.IsTrue(OutputsEqual(termsMap[scratch], output));

                if (DoReverseLookup)
                {
                    //System.out.println("lookup output=" + output + " outs=" + fst.Outputs);
                    // Looking the output back up must return the same word.
                    IntsRef input = Util.GetByOutput(fstLong, (output as long?).Value);
                    Assert.IsNotNull(input);
                    //System.out.println("  got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
                    Assert.AreEqual(scratch, input);
                }
            }

            // test IntsRefFSTEnum.Seek:
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: verify seek");
            }
            IntsRefFSTEnum <T> fstEnum_ = new IntsRefFSTEnum <T>(fst);

            num_ = LuceneTestCase.AtLeast(Random, 100);
            for (int iter = 0; iter < num_; iter++)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine("  iter=" + iter);
                }
                if (Random.NextBoolean())
                {
                    // seek to term that doesn't exist:
                    // (keep generating random terms until one is not found in Pairs)
                    while (true)
                    {
                        IntsRef term = ToIntsRef(GetRandomString(Random), inputMode);
                        int     pos  = Pairs.BinarySearch(new InputOutput <T>(term, default(T)));
                        if (pos < 0)
                        {
                            // Presumably BinarySearch encodes "not found" as -(insertionPoint + 1);
                            // this recovers the insertion point -- TODO confirm against Pairs' type.
                            pos = -(pos + 1);
                            // ok doesn't exist
                            //System.out.println("  seek " + inputToString(inputMode, term));
                            IntsRefFSTEnum <T> .InputOutput <T> seekResult;
                            if (Random.Next(3) == 0)
                            {
                                if (LuceneTestCase.VERBOSE)
                                {
                                    Console.WriteLine("  do non-exist seekExact term=" + InputToString(inputMode, term));
                                }
                                seekResult = fstEnum_.SeekExact(term);
                                // -1 routes to the "expected null" branch below.
                                pos        = -1;
                            }
                            else if (Random.NextBoolean())
                            {
                                if (LuceneTestCase.VERBOSE)
                                {
                                    Console.WriteLine("  do non-exist seekFloor term=" + InputToString(inputMode, term));
                                }
                                seekResult = fstEnum_.SeekFloor(term);
                                // Floor is expected at the entry just before the insertion point;
                                // this becomes -1 when the term sorts before every pair.
                                pos--;
                            }
                            else
                            {
                                if (LuceneTestCase.VERBOSE)
                                {
                                    Console.WriteLine("  do non-exist seekCeil term=" + InputToString(inputMode, term));
                                }
                                // Ceil is expected at the insertion point itself, which equals
                                // Pairs.Count when the term sorts after every pair.
                                seekResult = fstEnum_.SeekCeil(term);
                            }

                            if (pos != -1 && pos < Pairs.Count)
                            {
                                //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.Outputs.outputToString(seekResult.Output));
                                Assert.IsNotNull(seekResult, "got null but expected term=" + InputToString(inputMode, Pairs[pos].Input));
                                if (LuceneTestCase.VERBOSE)
                                {
                                    Console.WriteLine("    got " + InputToString(inputMode, seekResult.Input));
                                }
                                Assert.AreEqual(Pairs[pos].Input, seekResult.Input, "expected " + InputToString(inputMode, Pairs[pos].Input) + " but got " + InputToString(inputMode, seekResult.Input));
                                Assert.IsTrue(OutputsEqual(Pairs[pos].Output, seekResult.Output));
                            }
                            else
                            {
                                // seeked before start or beyond end
                                //System.out.println("seek=" + seekTerm);
                                Assert.IsNull(seekResult, "expected null but got " + (seekResult == null ? "null" : InputToString(inputMode, seekResult.Input)));
                                if (LuceneTestCase.VERBOSE)
                                {
                                    Console.WriteLine("    got null");
                                }
                            }

                            break;
                        }
                    }
                }
                else
                {
                    // seek to term that does exist:
                    // (exact, floor and ceil must all land on that same entry)
                    InputOutput <T> pair = Pairs[Random.Next(Pairs.Count)];
                    IntsRefFSTEnum <T> .InputOutput <T> seekResult;
                    if (Random.Next(3) == 2)
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine("  do exists seekExact term=" + InputToString(inputMode, pair.Input));
                        }
                        seekResult = fstEnum_.SeekExact(pair.Input);
                    }
                    else if (Random.NextBoolean())
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine("  do exists seekFloor " + InputToString(inputMode, pair.Input));
                        }
                        seekResult = fstEnum_.SeekFloor(pair.Input);
                    }
                    else
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine("  do exists seekCeil " + InputToString(inputMode, pair.Input));
                        }
                        seekResult = fstEnum_.SeekCeil(pair.Input);
                    }
                    Assert.IsNotNull(seekResult);
                    Assert.AreEqual(pair.Input, seekResult.Input, "got " + InputToString(inputMode, seekResult.Input) + " but expected " + InputToString(inputMode, pair.Input));
                    Assert.IsTrue(OutputsEqual(pair.Output, seekResult.Output));
                }
            }

            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: mixed next/seek");
            }

            // test mixed next/seek
            num_ = LuceneTestCase.AtLeast(Random, 100);
            for (int iter = 0; iter < num_; iter++)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine("TEST: iter " + iter);
                }
                // reset:
                fstEnum_ = new IntsRefFSTEnum <T>(fst);
                // 'upto' is the index in Pairs the enum is expected to be positioned on;
                // -1 means nothing has been visited yet, Pairs.Count means exhausted.
                int upto = -1;
                while (true)
                {
                    bool isDone = false;
                    // Always use Next() once positioned on the last pair, so the walk
                    // ends by running off the end of the enum.
                    if (upto == Pairs.Count - 1 || Random.NextBoolean())
                    {
                        // next
                        upto++;
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine("  do next");
                        }
                        isDone = fstEnum_.Next() == null;
                    }
                    else if (upto != -1 && upto < 0.75 * Pairs.Count && Random.NextBoolean())
                    {
                        // Seek forward to a term that does not exist: try up to 10 random
                        // terms looking for one that is absent and sorts after the current pair.
                        int attempt = 0;
                        for (; attempt < 10; attempt++)
                        {
                            IntsRef term = ToIntsRef(GetRandomString(Random), inputMode);
                            if (!termsMap.ContainsKey(term) && term.CompareTo(Pairs[upto].Input) > 0)
                            {
                                int pos = Pairs.BinarySearch(new InputOutput <T>(term, default(T)));
                                Debug.Assert(pos < 0);
                                // Insertion point of the missing term (same encoding as above).
                                upto = -(pos + 1);

                                if (Random.NextBoolean())
                                {
                                    // Floor is expected on the entry before the insertion point.
                                    upto--;
                                    Assert.IsTrue(upto != -1);
                                    if (LuceneTestCase.VERBOSE)
                                    {
                                        Console.WriteLine("  do non-exist seekFloor(" + InputToString(inputMode, term) + ")");
                                    }
                                    isDone = fstEnum_.SeekFloor(term) == null;
                                }
                                else
                                {
                                    if (LuceneTestCase.VERBOSE)
                                    {
                                        Console.WriteLine("  do non-exist seekCeil(" + InputToString(inputMode, term) + ")");
                                    }
                                    isDone = fstEnum_.SeekCeil(term) == null;
                                }

                                break;
                            }
                        }
                        // No usable term was found; retry the step with 'upto' unchanged.
                        if (attempt == 10)
                        {
                            continue;
                        }
                    }
                    else
                    {
                        // Seek to an existing term at or after the current position.
                        int inc = Random.Next(Pairs.Count - upto - 1);
                        upto += inc;
                        // Nothing visited yet and inc was 0: target the first pair.
                        if (upto == -1)
                        {
                            upto = 0;
                        }

                        if (Random.NextBoolean())
                        {
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine("  do seekCeil(" + InputToString(inputMode, Pairs[upto].Input) + ")");
                            }
                            isDone = fstEnum_.SeekCeil(Pairs[upto].Input) == null;
                        }
                        else
                        {
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine("  do seekFloor(" + InputToString(inputMode, Pairs[upto].Input) + ")");
                            }
                            isDone = fstEnum_.SeekFloor(Pairs[upto].Input) == null;
                        }
                    }
                    if (LuceneTestCase.VERBOSE)
                    {
                        if (!isDone)
                        {
                            Console.WriteLine("    got " + InputToString(inputMode, fstEnum_.Current().Input));
                        }
                        else
                        {
                            Console.WriteLine("    got null");
                        }
                    }

                    // Past the last pair the enum must report exhaustion; otherwise it
                    // must be positioned exactly on Pairs[upto].
                    if (upto == Pairs.Count)
                    {
                        Assert.IsTrue(isDone);
                        break;
                    }
                    else
                    {
                        Assert.IsFalse(isDone);
                        Assert.AreEqual(Pairs[upto].Input, fstEnum_.Current().Input);
                        Assert.IsTrue(OutputsEqual(Pairs[upto].Output, fstEnum_.Current().Output));

                        /*
                         * if (upto < pairs.size()-1) {
                         * int tryCount = 0;
                         * while(tryCount < 10) {
                         * final IntsRef t = toIntsRef(getRandomString(), inputMode);
                         * if (pairs.get(upto).input.compareTo(t) < 0) {
                         * final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
                         * if (LuceneTestCase.VERBOSE) {
                         * System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
                         * }
                         * Assert.AreEqual(expected, fstEnum.beforeNext(t));
                         * break;
                         * }
                         * tryCount++;
                         * }
                         * }
                         */
                    }
                }
            }
        }
Exemple #2
0
        // for debugging

        /*
         * private String toString(BytesRef b) {
         * try {
         *  return b.utf8ToString() + " " + b;
         * } catch (Throwable t) {
         *  return b.toString();
         * }
         * }
         */

        /// <summary>
        /// Appends the next input/output pair to the FST being built.
        /// Inputs must arrive in sorted order (checked by a debug assertion).
        /// The same input may be added several times in a row with different
        /// outputs, provided the outputs implementation supports merging.
        /// <para>
        /// The input is copied before this method returns, so the caller may
        /// reuse it. The output is not copied: if the output type is mutable
        /// (for example <see cref="ByteSequenceOutputs"/> or
        /// <see cref="IntSequenceOutputs"/>), do not reuse the instance across calls.
        /// </para>
        /// </summary>
        /// <param name="input">The input sequence, read from
        /// <c>input.Ints</c> starting at <c>input.Offset</c>.</param>
        /// <param name="output">The output associated with <paramref name="input"/>.</param>
        public virtual void Add(IntsRef input, T output)
        {
            // NO_OUTPUT must be a singleton, so collapse any equal value onto it.
            if (output.Equals(NO_OUTPUT))
            {
                output = NO_OUTPUT;
            }

            Debug.Assert(LastInput.Length == 0 || input.CompareTo(LastInput) >= 0, "inputs are added out of order lastInput=" + LastInput + " vs input=" + input);
            Debug.Assert(ValidOutput(output));

            if (input.Length == 0)
            {
                // The empty input is only allowed as the very first input. It is
                // special-cased because the packed FST format stores 'finalness'
                // on the incoming arc rather than on the node, so it cannot
                // represent the empty input directly.
                Frontier[0].InputCount++;
                Frontier[0].IsFinal = true;
                Fst.EmptyOutput     = output;
                return;
            }

            // Measure the prefix shared with the previous input, counting this
            // input on every frontier node along that prefix (root included).
            int sharedLen = 0;
            int maxShared = Math.Min(LastInput.Length, input.Length);
            Frontier[0].InputCount++;
            while (sharedLen < maxShared && LastInput.Ints[sharedLen] == input.Ints[input.Offset + sharedLen])
            {
                sharedLen++;
                Frontier[sharedLen].InputCount++;
            }
            int prefixLenPlus1 = sharedLen + 1;

            // Make sure the frontier has one node per input position plus the root.
            int requiredNodes = input.Length + 1;
            if (Frontier.Length < requiredNodes)
            {
                int grownSize = ArrayUtil.Oversize(requiredNodes, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                UnCompiledNode<T>[] grown = new UnCompiledNode<T>[grownSize];
                Array.Copy(Frontier, grown, Frontier.Length);
                for (int slot = Frontier.Length; slot < grown.Length; slot++)
                {
                    grown[slot] = new UnCompiledNode<T>(this, slot);
                }
                Frontier = grown;
            }

            // Minimize/compile the states of the previous input's orphaned suffix.
            DoFreezeTail(prefixLenPlus1);

            // Create the tail states for the part of this input that is not shared.
            for (int depth = prefixLenPlus1; depth <= input.Length; depth++)
            {
                int tailLabel = input.Ints[input.Offset + depth - 1];
                Frontier[depth - 1].AddArc(tailLabel, Frontier[depth]);
                Frontier[depth].InputCount++;
            }

            UnCompiledNode<T> lastNode = Frontier[input.Length];

            // True when this input is identical to the previous one.
            bool repeatsLastInput = LastInput.Length == input.Length && prefixLenPlus1 == input.Length + 1;
            if (!repeatsLastInput)
            {
                lastNode.IsFinal = true;
                lastNode.Output  = NO_OUTPUT;
            }

            // Push conflicting outputs forward along the shared prefix, only as
            // far as needed.
            for (int depth = 1; depth < prefixLenPlus1; depth++)
            {
                UnCompiledNode<T> child  = Frontier[depth];
                UnCompiledNode<T> parent = Frontier[depth - 1];
                int arcLabel = input.Ints[input.Offset + depth - 1];

                T arcOutput = parent.GetLastOutput(arcLabel);
                Debug.Assert(ValidOutput(arcOutput));

                // Identity comparison on purpose: NO_OUTPUT is a singleton.
                T sharedOutput = NO_OUTPUT;
                if (!object.ReferenceEquals(arcOutput, NO_OUTPUT))
                {
                    sharedOutput = Fst.Outputs.Common(output, arcOutput);
                    Debug.Assert(ValidOutput(sharedOutput));
                    T pushedDown = Fst.Outputs.Subtract(arcOutput, sharedOutput);
                    Debug.Assert(ValidOutput(pushedDown));
                    parent.SetLastOutput(arcLabel, sharedOutput);
                    child.PrependOutput(pushedDown);
                }

                output = Fst.Outputs.Subtract(output, sharedOutput);
                Debug.Assert(ValidOutput(output));
            }

            if (repeatsLastInput)
            {
                // Same input more than once in a row: merge the outputs.
                lastNode.Output = Fst.Outputs.Merge(lastNode.Output, output);
            }
            else
            {
                // The first non-shared arc is private to this input; it carries
                // whatever output is left over.
                int divergeAt = prefixLenPlus1 - 1;
                Frontier[divergeAt].SetLastOutput(input.Ints[input.Offset + divergeAt], output);
            }

            // Remember this input for the next call.
            LastInput.CopyInts(input);
        }
Exemple #3
0
        /// <summary>
        /// Adds the next input/output pair to the FST being built and returns
        /// this builder so calls can be chained. Inputs must be added in sorted
        /// order (checked by a debug assertion); adding the same input twice in
        /// a row merges the outputs through <c>fst.outputs.merge</c>.
        /// </summary>
        /// <param name="input">The input sequence, read from
        /// <c>input.ints</c> starting at <c>input.offset</c>; a clone is kept as
        /// the last input.</param>
        /// <param name="output">The output associated with <paramref name="input"/>.</param>
        /// <returns>This builder.</returns>
        public Builder<T> add(IntsRef input, T output)
        {
            Debug.Assert(lastInput.length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);

            if (input.length == 0)
            {
                // The empty input is only allowed as the first input. It is
                // special-cased because the packed FST format stores 'finalness'
                // on the incoming arc, not on the node, and so cannot represent
                // the empty input.
                frontier[0].inputCount++;
                frontier[0].isFinal = true;
                fst.setEmptyOutput(output);
                return this;
            }

            // Length of the prefix shared with the previous input; every frontier
            // node on that prefix (root included) counts this input.
            int common      = 0;
            int commonLimit = Math.Min(lastInput.length, input.length);
            frontier[0].inputCount++;
            while (common < commonLimit && lastInput.ints[common] == input.ints[input.offset + common])
            {
                common++;
                frontier[common].inputCount++;
            }
            int prefixLenPlus1 = common + 1;

            // Grow the frontier so there is a node for every input position.
            if (frontier.Length < input.length + 1)
            {
                UnCompiledNode<T>[] enlarged = ArrayUtil.grow(frontier, input.length + 1);
                for (int slot = frontier.Length; slot < enlarged.Length; slot++)
                {
                    enlarged[slot] = new UnCompiledNode<T>(this, slot);
                }
                frontier = enlarged;
            }

            // Minimize/compile states left over from the previous input's suffix.
            freezeTail(prefixLenPlus1);

            // Wire up the tail states for the non-shared part of this input.
            for (int level = prefixLenPlus1; level <= input.length; level++)
            {
                int tailLabel = input.ints[input.offset + level - 1];
                frontier[level - 1].addArc(tailLabel, frontier[level]);
                frontier[level].inputCount++;
            }

            UnCompiledNode<T> lastNode = frontier[input.length];

            // True when this input is exactly the previous input again.
            bool sameAsPrevious = lastInput.length == input.length && prefixLenPlus1 == input.length + 1;
            if (!sameAsPrevious)
            {
                lastNode.isFinal = true;
                lastNode.output  = NO_OUTPUT;
            }

            // Push conflicting outputs forward along the shared prefix, only as
            // far as needed.
            for (int level = 1; level < prefixLenPlus1; level++)
            {
                UnCompiledNode<T> below = frontier[level];
                UnCompiledNode<T> above = frontier[level - 1];
                int arcLabel = input.ints[input.offset + level - 1];

                T existing = above.getLastOutput(arcLabel);

                T sharedPart = NO_OUTPUT;
                if (!existing.Equals(NO_OUTPUT))
                {
                    sharedPart = fst.outputs.common(output, existing);
                    T remainder = fst.outputs.subtract(existing, sharedPart);
                    above.setLastOutput(arcLabel, sharedPart);
                    below.prependOutput(remainder);
                }

                output = fst.outputs.subtract(output, sharedPart);
            }

            if (sameAsPrevious)
            {
                // Same input more than once in a row: merge into the existing output.
                lastNode.output = fst.outputs.merge(lastNode.output, output);
            }
            else
            {
                // The first non-shared arc belongs to this input alone; give it
                // the leftover output.
                int divergeAt = prefixLenPlus1 - 1;
                frontier[divergeAt].setLastOutput(input.ints[input.offset + divergeAt], output);
            }

            // Remember this input for the next call.
            lastInput = (IntsRef)input.Clone();

            return this;
        }