/// <summary>
/// Verifies an FST that is expected to be complete (un-pruned) against <c>Pairs</c>.
/// The checks run in this order: every pair is accepted and enumerated in order,
/// optional reverse (output to input) lookups, random accepted words, random
/// seekExact/seekFloor/seekCeil calls, and finally interleaved next/seek calls.
/// </summary>
/// <param name="inputMode">Passed through to <c>InputToString</c> and <c>ToIntsRef</c>.</param>
/// <param name="fst">The FST to verify; asserted to be null when <c>Pairs</c> is empty.</param>
private void VerifyUnPruned(int inputMode, FST<T> fst)
{
    // Reverse-lookup state; left null unless DoReverseLookup is set.
    FST<long> reverseFst = null;
    ISet<long> knownOutputs = null;
    long smallest = long.MaxValue;
    long largest = long.MinValue;

    if (DoReverseLookup)
    {
        reverseFst = fst as FST<long>;
        knownOutputs = new HashSet<long>();
        foreach (InputOutput<T> entry in Pairs)
        {
            long value = (entry.Output as long?).Value;
            largest = Math.Max(largest, value);
            smallest = Math.Min(smallest, value);
            knownOutputs.Add(value);
        }
    }

    // No pairs: the FST itself must be null and there is nothing else to check.
    if (Pairs.Count == 0)
    {
        Assert.IsNull(fst);
        return;
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now verify " + Pairs.Count + " terms");
        foreach (InputOutput<T> entry in Pairs)
        {
            Assert.IsNotNull(entry);
            Assert.IsNotNull(entry.Input);
            Assert.IsNotNull(entry.Output);
            Console.WriteLine(" " + InputToString(inputMode, entry.Input) + ": " + Outputs.OutputToString(entry.Output));
        }
    }

    Assert.IsNotNull(fst);

    // Walk the pairs in order: each term must be accepted with the right output,
    // and the enum's Next() must step through exactly the same sequence.
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: check valid terms/next()");
    }
    {
        IntsRefFSTEnum<T> orderedEnum = new IntsRefFSTEnum<T>(fst);
        foreach (InputOutput<T> entry in Pairs)
        {
            IntsRef expectedTerm = entry.Input;
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: check term=" + InputToString(inputMode, expectedTerm) + " output=" + fst.Outputs.OutputToString(entry.Output));
            }

            T accepted = Run(fst, expectedTerm, null);
            Assert.IsNotNull(accepted, "term " + InputToString(inputMode, expectedTerm) + " is not accepted");
            Assert.IsTrue(OutputsEqual(entry.Output, accepted));

            // The enum must yield the very same pair next.
            IntsRefFSTEnum<T>.InputOutput<T> enumerated = orderedEnum.Next();
            Assert.IsNotNull(enumerated);
            Assert.AreEqual(expectedTerm, enumerated.Input, "expected input=" + InputToString(inputMode, expectedTerm) + " but fstEnum returned " + InputToString(inputMode, enumerated.Input));
            Assert.IsTrue(OutputsEqual(entry.Output, enumerated.Output));
        }

        // ...and must be exhausted afterwards.
        Assert.IsNull(orderedEnum.Next());
    }

    // Input -> output map used by the random checks below.
    IDictionary<IntsRef, T> termsByInput = new Dictionary<IntsRef, T>();
    foreach (InputOutput<T> entry in Pairs)
    {
        termsByInput[entry.Input] = entry.Output;
    }

    if (DoReverseLookup && largest > smallest)
    {
        // Outputs outside [smallest, largest] cannot exist, so lookups must miss.
        Assert.IsNull(Util.GetByOutput(reverseFst, smallest - 7));
        Assert.IsNull(Util.GetByOutput(reverseFst, largest + 7));

        // Random outputs inside the range: a hit is only allowed for a known output.
        int probeCount = LuceneTestCase.AtLeast(Random, 100);
        for (int probe = 0; probe < probeCount; probe++)
        {
            long candidate = TestUtil.NextLong(Random, smallest, largest);
            IntsRef resolved = Util.GetByOutput(reverseFst, candidate);
            Assert.IsTrue(knownOutputs.Contains(candidate) || resolved == null);
        }
    }

    // Pick random accepted words and make sure each is one of ours.
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify random accepted terms");
    }
    IntsRef scratch = new IntsRef(10);
    int rounds = LuceneTestCase.AtLeast(Random, 500);
    for (int round = 0; round < rounds; round++)
    {
        T walked = RandomAcceptedWord(fst, scratch);

        T expected;
        bool isKnown = termsByInput.TryGetValue(scratch, out expected);
        Assert.IsTrue(isKnown, "accepted word " + InputToString(inputMode, scratch) + " is not valid");
        Assert.IsTrue(OutputsEqual(expected, walked));

        if (DoReverseLookup)
        {
            IntsRef reversed = Util.GetByOutput(reverseFst, (walked as long?).Value);
            Assert.IsNotNull(reversed);
            Assert.AreEqual(scratch, reversed);
        }
    }

    // Exercise the IntsRefFSTEnum seek methods.
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify seek");
    }
    IntsRefFSTEnum<T> seekEnum = new IntsRefFSTEnum<T>(fst);
    rounds = LuceneTestCase.AtLeast(Random, 100);
    for (int iter = 0; iter < rounds; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" iter=" + iter);
        }

        IntsRefFSTEnum<T>.InputOutput<T> hit;

        if (!Random.NextBoolean())
        {
            // Seek to a term that does exist: every seek flavour must land on it.
            InputOutput<T> target = Pairs[Random.Next(Pairs.Count)];
            if (Random.Next(3) == 2)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekExact term=" + InputToString(inputMode, target.Input));
                }
                hit = seekEnum.SeekExact(target.Input);
            }
            else if (Random.NextBoolean())
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekFloor " + InputToString(inputMode, target.Input));
                }
                hit = seekEnum.SeekFloor(target.Input);
            }
            else
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekCeil " + InputToString(inputMode, target.Input));
                }
                hit = seekEnum.SeekCeil(target.Input);
            }

            Assert.IsNotNull(hit);
            Assert.AreEqual(target.Input, hit.Input, "got " + InputToString(inputMode, hit.Input) + " but expected " + InputToString(inputMode, target.Input));
            Assert.IsTrue(OutputsEqual(target.Output, hit.Output));
            continue;
        }

        // Seek to a term that does not exist: keep drawing random terms until the
        // binary search reports a miss (negative result).
        IntsRef missing;
        int slot;
        do
        {
            missing = ToIntsRef(GetRandomString(Random), inputMode);
            slot = Pairs.BinarySearch(new InputOutput<T>(missing, default(T)));
        }
        while (slot >= 0);

        // Decode the insertion point: ~slot == -(slot + 1).
        slot = ~slot;

        if (Random.Next(3) == 0)
        {
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(" do non-exist seekExact term=" + InputToString(inputMode, missing));
            }
            hit = seekEnum.SeekExact(missing);
            // An exact seek of a missing term must return null.
            slot = -1;
        }
        else if (Random.NextBoolean())
        {
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(" do non-exist seekFloor term=" + InputToString(inputMode, missing));
            }
            hit = seekEnum.SeekFloor(missing);
            // Floor lands on the entry just before the insertion point.
            slot--;
        }
        else
        {
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(" do non-exist seekCeil term=" + InputToString(inputMode, missing));
            }
            hit = seekEnum.SeekCeil(missing);
        }

        if (slot == -1 || slot >= Pairs.Count)
        {
            // Seeked before the start or beyond the end.
            Assert.IsNull(hit, "expected null but got " + (hit == null ? "null" : InputToString(inputMode, hit.Input)));
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(" got null");
            }
        }
        else
        {
            InputOutput<T> wanted = Pairs[slot];
            Assert.IsNotNull(hit, "got null but expected term=" + InputToString(inputMode, wanted.Input));
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(" got " + InputToString(inputMode, hit.Input));
            }
            Assert.AreEqual(wanted.Input, hit.Input, "expected " + InputToString(inputMode, wanted.Input) + " but got " + InputToString(inputMode, hit.Input));
            Assert.IsTrue(OutputsEqual(wanted.Output, hit.Output));
        }
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: mixed next/seek");
    }

    // Interleave Next() with seeks; 'upto' tracks the index in Pairs the enum
    // is expected to be positioned on (Pairs.Count means exhausted).
    rounds = LuceneTestCase.AtLeast(Random, 100);
    for (int iter = 0; iter < rounds; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine("TEST: iter " + iter);
        }

        // Start every iteration from a fresh enum.
        seekEnum = new IntsRefFSTEnum<T>(fst);
        int upto = -1;
        while (true)
        {
            bool isDone = false;
            if (upto == Pairs.Count - 1 || Random.NextBoolean())
            {
                // Plain next.
                upto++;
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do next");
                }
                isDone = seekEnum.Next() == null;
            }
            else if (upto != -1 && upto < 0.75 * Pairs.Count && Random.NextBoolean())
            {
                // Forward seek to a non-existent term; give up after 10 draws.
                bool seeked = false;
                for (int attempt = 0; attempt < 10; attempt++)
                {
                    IntsRef candidateTerm = ToIntsRef(GetRandomString(Random), inputMode);
                    if (termsByInput.ContainsKey(candidateTerm) || candidateTerm.CompareTo(Pairs[upto].Input) <= 0)
                    {
                        // Either it exists or it does not move us forward; draw again.
                        continue;
                    }

                    int insertion = Pairs.BinarySearch(new InputOutput<T>(candidateTerm, default(T)));
                    Debug.Assert(insertion < 0);
                    upto = ~insertion;

                    if (Random.NextBoolean())
                    {
                        upto--;
                        Assert.IsTrue(upto != -1);
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekFloor(" + InputToString(inputMode, candidateTerm) + ")");
                        }
                        isDone = seekEnum.SeekFloor(candidateTerm) == null;
                    }
                    else
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekCeil(" + InputToString(inputMode, candidateTerm) + ")");
                        }
                        isDone = seekEnum.SeekCeil(candidateTerm) == null;
                    }

                    seeked = true;
                    break;
                }

                if (!seeked)
                {
                    // No usable term found; pick another action without verifying.
                    continue;
                }
            }
            else
            {
                // Forward seek to a term that exists.
                upto += Random.Next(Pairs.Count - upto - 1);
                if (upto == -1)
                {
                    upto = 0;
                }

                IntsRef existing = Pairs[upto].Input;
                if (Random.NextBoolean())
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekCeil(" + InputToString(inputMode, existing) + ")");
                    }
                    isDone = seekEnum.SeekCeil(existing) == null;
                }
                else
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekFloor(" + InputToString(inputMode, existing) + ")");
                    }
                    isDone = seekEnum.SeekFloor(existing) == null;
                }
            }

            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine(isDone ? " got null" : " got " + InputToString(inputMode, seekEnum.Current().Input));
            }

            if (upto == Pairs.Count)
            {
                // Ran off the end: the enum must agree, and this iteration is over.
                Assert.IsTrue(isDone);
                break;
            }

            Assert.IsFalse(isDone);
            Assert.AreEqual(Pairs[upto].Input, seekEnum.Current().Input);
            Assert.IsTrue(OutputsEqual(Pairs[upto].Output, seekEnum.Current().Output));
            // NOTE: a disabled beforeNext(...) check from the Java original used to sit here.
        }
    }
}
/// <summary>
/// Adds the next input/output pair to the FST being built.
/// <para/>
/// It's OK to add the same input twice in a row with different outputs, as long as
/// the outputs implementation supports the merge method. Note that
/// <paramref name="input"/> is fully consumed after this method returns (so the
/// caller is free to reuse it), but <paramref name="output"/> is not. So if your
/// outputs are changeable (e.g. <see cref="ByteSequenceOutputs"/> or
/// <see cref="IntSequenceOutputs"/>) then you cannot reuse them across calls.
/// </summary>
/// <param name="input">
/// The input to add; asserted (debug builds) to compare greater than or equal to
/// the previously added input.
/// </param>
/// <param name="output">The output to associate with <paramref name="input"/>.</param>
public virtual void Add(IntsRef input, T output)
{
    // De-dup NO_OUTPUT since it must be a singleton:
    if (output.Equals(NO_OUTPUT))
    {
        output = NO_OUTPUT;
    }

    Debug.Assert(LastInput.Length == 0 || input.CompareTo(LastInput) >= 0, "inputs are added out of order lastInput=" + LastInput + " vs input=" + input);
    Debug.Assert(ValidOutput(output));

    if (input.Length == 0)
    {
        // empty input: only allowed as first input.  we have
        // to special case this because the packed FST
        // format cannot represent the empty input since
        // 'finalness' is stored on the incoming arc, not on
        // the node
        Frontier[0].InputCount++;
        Frontier[0].IsFinal = true;
        Fst.EmptyOutput = output;
        return;
    }

    // compare shared prefix length, bumping the input count of every frontier
    // node on the shared path (including the first node past the prefix)
    int pos1 = 0;
    int pos2 = input.Offset;
    int pos1Stop = Math.Min(LastInput.Length, input.Length);
    while (true)
    {
        Frontier[pos1].InputCount++;
        if (pos1 >= pos1Stop || LastInput.Ints[pos1] != input.Ints[pos2])
        {
            break;
        }
        pos1++;
        pos2++;
    }
    int prefixLenPlus1 = pos1 + 1;

    // grow the frontier so it has one node per input position plus the root
    if (Frontier.Length < input.Length + 1)
    {
        UnCompiledNode<T>[] next = new UnCompiledNode<T>[ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        Array.Copy(Frontier, 0, next, 0, Frontier.Length);
        for (int idx = Frontier.Length; idx < next.Length; idx++)
        {
            next[idx] = new UnCompiledNode<T>(this, idx);
        }
        Frontier = next;
    }

    // minimize/compile states from previous input's
    // orphan'd suffix
    DoFreezeTail(prefixLenPlus1);

    // init tail states for current input
    for (int idx = prefixLenPlus1; idx <= input.Length; idx++)
    {
        Frontier[idx - 1].AddArc(input.Ints[input.Offset + idx - 1], Frontier[idx]);
        Frontier[idx].InputCount++;
    }

    UnCompiledNode<T> lastNode = Frontier[input.Length];
    if (LastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1)
    {
        // not a repeat of the previous input: start the final node fresh
        lastNode.IsFinal = true;
        lastNode.Output = NO_OUTPUT;
    }

    // push conflicting outputs forward, only as far as
    // needed
    for (int idx = 1; idx < prefixLenPlus1; idx++)
    {
        UnCompiledNode<T> node = Frontier[idx];
        UnCompiledNode<T> parentNode = Frontier[idx - 1];

        T lastOutput = parentNode.GetLastOutput(input.Ints[input.Offset + idx - 1]);
        Debug.Assert(ValidOutput(lastOutput));

        T commonOutputPrefix;
        T wordSuffix;

        // Compare by value, not by reference: T is unconstrained, so a
        // reference comparison would box value-type outputs into distinct
        // objects and never recognise NO_OUTPUT. object.Equals is null-safe.
        if (!object.Equals(lastOutput, NO_OUTPUT))
        {
            commonOutputPrefix = Fst.Outputs.Common(output, lastOutput);
            Debug.Assert(ValidOutput(commonOutputPrefix));
            wordSuffix = Fst.Outputs.Subtract(lastOutput, commonOutputPrefix);
            Debug.Assert(ValidOutput(wordSuffix));
            parentNode.SetLastOutput(input.Ints[input.Offset + idx - 1], commonOutputPrefix);
            node.PrependOutput(wordSuffix);
        }
        else
        {
            commonOutputPrefix = wordSuffix = NO_OUTPUT;
        }

        output = Fst.Outputs.Subtract(output, commonOutputPrefix);
        Debug.Assert(ValidOutput(output));
    }

    if (LastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length)
    {
        // same input more than 1 time in a row, mapping to
        // multiple outputs
        lastNode.Output = Fst.Outputs.Merge(lastNode.Output, output);
    }
    else
    {
        // this new arc is private to this new input; set its
        // arc output to the leftover output:
        Frontier[prefixLenPlus1 - 1].SetLastOutput(input.Ints[input.Offset + prefixLenPlus1 - 1], output);
    }

    // save last input
    LastInput.CopyInts(input);
}
/// <summary>
/// Adds the next input/output pair to the FST being built and returns this
/// builder so calls can be chained. Inputs are asserted (debug builds) to arrive
/// in non-decreasing order. Adding the same input twice in a row merges the
/// outputs via <c>fst.outputs.merge</c>.
/// </summary>
public Builder<T> add(IntsRef input, T output)
{
    Debug.Assert(lastInput.length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);

    if (input.length == 0)
    {
        // The empty input is only allowed as the very first input. It needs a
        // special case because the packed FST format stores 'finalness' on the
        // incoming arc, not on the node, and so cannot represent it.
        frontier[0].inputCount++;
        frontier[0].isFinal = true;
        fst.setEmptyOutput(output);
        return this;
    }

    // Length of the prefix shared with the previous input. Every frontier node
    // along that prefix, plus the first node past it, gets its count bumped.
    int sharedLen = 0;
    int sharedLimit = Math.Min(lastInput.length, input.length);
    frontier[0].inputCount++;
    while (sharedLen < sharedLimit && lastInput.ints[sharedLen] == input.ints[input.offset + sharedLen])
    {
        sharedLen++;
        frontier[sharedLen].inputCount++;
    }

    // Make sure there is a frontier node for every position of this input.
    int neededNodes = input.length + 1;
    if (frontier.Length < neededNodes)
    {
        UnCompiledNode<T>[] grown = ArrayUtil.grow(frontier, neededNodes);
        for (int slot = frontier.Length; slot < grown.Length; slot++)
        {
            grown[slot] = new UnCompiledNode<T>(this, slot);
        }
        frontier = grown;
    }

    // Minimize/compile the states of the previous input's orphaned suffix.
    freezeTail(sharedLen + 1);

    // Set up the tail states for the current input.
    for (int depth = sharedLen; depth < input.length; depth++)
    {
        frontier[depth].addArc(input.ints[input.offset + depth], frontier[depth + 1]);
        frontier[depth + 1].inputCount++;
    }

    UnCompiledNode<T> tailNode = frontier[input.length];

    // True when this input is identical to the previous one.
    bool repeatsLastInput = lastInput.length == input.length && sharedLen == input.length;
    if (!repeatsLastInput)
    {
        tailNode.isFinal = true;
        tailNode.output = NO_OUTPUT;
    }

    // Push conflicting outputs forward along the shared prefix, only as far as needed.
    for (int depth = 0; depth < sharedLen; depth++)
    {
        UnCompiledNode<T> child = frontier[depth + 1];
        UnCompiledNode<T> parent = frontier[depth];
        int label = input.ints[input.offset + depth];

        T arcOutput = parent.getLastOutput(label);
        T sharedOutput;
        if (arcOutput.Equals(NO_OUTPUT))
        {
            sharedOutput = NO_OUTPUT;
        }
        else
        {
            sharedOutput = fst.outputs.common(output, arcOutput);
            T pushedDown = fst.outputs.subtract(arcOutput, sharedOutput);
            parent.setLastOutput(label, sharedOutput);
            child.prependOutput(pushedDown);
        }

        output = fst.outputs.subtract(output, sharedOutput);
    }

    if (repeatsLastInput)
    {
        // Same input more than once in a row, mapping to multiple outputs.
        tailNode.output = fst.outputs.merge(tailNode.output, output);
    }
    else
    {
        // The first new arc is private to this input; give it the leftover output.
        frontier[sharedLen].setLastOutput(input.ints[input.offset + sharedLen], output);
    }

    // Remember this input for the next call.
    lastInput = (IntsRef)input.Clone();
    return this;
}