// for debugging
/*
 * private String toString(BytesRef b) {
 *   try {
 *     return b.utf8ToString() + " " + b;
 *   } catch (Throwable t) {
 *     return b.toString();
 *   }
 * }
 */

/// <summary>
/// Appends one input/output pair to the FST being built.
/// <para/>
/// The same input may be added several times in a row with different
/// outputs, provided the outputs implementation supports merging.
/// The <paramref name="input"/> is fully consumed by the time this method
/// returns, so the caller may reuse it; the <paramref name="output"/> is
/// not. If your outputs are changeable (eg
/// <see cref="ByteSequenceOutputs"/> or
/// <see cref="Int32SequenceOutputs"/>) you must therefore not reuse the
/// same output instance across calls.
/// </summary>
/// <param name="input">The input sequence; when assertions are enabled it is
/// checked to compare greater than or equal to the previously added input.</param>
/// <param name="output">The output to associate with <paramref name="input"/>.</param>
public virtual void Add(Int32sRef input, T output)
{
    /*
     * if (DEBUG) {
     *   BytesRef b = new BytesRef(input.length);
     *   for(int x=0;x<input.length;x++) {
     *     b.bytes[x] = (byte) input.ints[x];
     *   }
     *   b.length = input.length;
     *   if (output == NO_OUTPUT) {
     *     System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
     *   } else {
     *     System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
     *   }
     * }
     */

    // NO_OUTPUT must be a singleton, so canonicalize anything equal to it.
    if (output.Equals(NO_OUTPUT))
    {
        output = NO_OUTPUT;
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(lastInput.Length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput={0} vs input={1}", lastInput, input);
        Debugging.Assert(ValidOutput(output));
    }

    //System.out.println("\nadd: " + input);
    if (input.Length == 0)
    {
        // Empty input: only allowed as first input. It is special-cased
        // because the packed FST format cannot represent the empty input,
        // since 'finalness' is stored on the incoming arc, not on the node.
        frontier[0].InputCount++;
        frontier[0].IsFinal = true;
        fst.EmptyOutput = output;
        return;
    }

    // Measure the prefix shared with the previous input, bumping the input
    // count of every frontier node on that shared path (root included).
    int sharedStop = Math.Min(lastInput.Length, input.Length);
    int sharedLen = 0;
    frontier[0].InputCount++;
    //System.out.println("  incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]);
    while (sharedLen < sharedStop && lastInput.Int32s[sharedLen] == input.Int32s[input.Offset + sharedLen])
    {
        sharedLen++;
        frontier[sharedLen].InputCount++;
    }
    int tailStart = sharedLen + 1;

    // Grow the frontier so it has one node per input position plus the root.
    int requiredNodes = input.Length + 1;
    if (frontier.Length < requiredNodes)
    {
        UnCompiledNode<T>[] grown = new UnCompiledNode<T>[ArrayUtil.Oversize(requiredNodes, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        Array.Copy(frontier, grown, frontier.Length);
        for (int slot = frontier.Length; slot < grown.Length; slot++)
        {
            grown[slot] = new UnCompiledNode<T>(this, slot);
        }
        frontier = grown;
    }

    // Minimize/compile states from the previous input's orphan'd suffix.
    DoFreezeTail(tailStart);

    // Init tail states for the current input.
    for (int depth = tailStart; depth <= input.Length; depth++)
    {
        frontier[depth - 1].AddArc(input.Int32s[input.Offset + depth - 1], frontier[depth]);
        frontier[depth].InputCount++;
    }

    // True when this input is identical to the one added just before it.
    bool repeatsLastInput = lastInput.Length == input.Length && tailStart == input.Length + 1;

    UnCompiledNode<T> lastNode = frontier[input.Length];
    if (!repeatsLastInput)
    {
        lastNode.IsFinal = true;
        lastNode.Output = NO_OUTPUT;
    }

    // Push conflicting outputs forward, only as far as needed.
    for (int depth = 1; depth < tailStart; depth++)
    {
        UnCompiledNode<T> child = frontier[depth];
        UnCompiledNode<T> parent = frontier[depth - 1];
        int label = input.Int32s[input.Offset + depth - 1];

        T previousOutput = parent.GetLastOutput(label);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(ValidOutput(previousOutput));
        }

        T sharedOutput;
        if (previousOutput.Equals(NO_OUTPUT))
        {
            sharedOutput = NO_OUTPUT;
        }
        else
        {
            sharedOutput = fst.Outputs.Common(output, previousOutput);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(ValidOutput(sharedOutput));
            }

            // Whatever the arc carried beyond the common part moves down to the child.
            T pushedDown = fst.Outputs.Subtract(previousOutput, sharedOutput);
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(ValidOutput(pushedDown));
            }

            parent.SetLastOutput(label, sharedOutput);
            child.PrependOutput(pushedDown);
        }

        output = fst.Outputs.Subtract(output, sharedOutput);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(ValidOutput(output));
        }
    }

    if (repeatsLastInput)
    {
        // Same input more than 1 time in a row, mapping to multiple outputs.
        lastNode.Output = fst.Outputs.Merge(lastNode.Output, output);
    }
    else
    {
        // This new arc is private to this new input; set its arc output to
        // the leftover output.
        frontier[tailStart - 1].SetLastOutput(input.Int32s[input.Offset + tailStart - 1], output);
    }

    // Save last input.
    lastInput.CopyInt32s(input);

    //System.out.println("  count[0]=" + frontier[0].inputCount);
}
/// <summary>
/// Verifies a complete (un-pruned) FST against the expected <c>pairs</c>.
/// <para/>
/// The checks, in order: every pair is accepted with its expected output and
/// is returned in order by the enum's <c>Next()</c>; optional reverse lookups
/// by output (when <c>doReverseLookup</c> is set); randomly accepted words
/// are present in <c>pairs</c> with the right output; <c>SeekExact</c>,
/// <c>SeekFloor</c> and <c>SeekCeil</c> agree with a binary search over
/// <c>pairs</c>; and finally a random mix of next/seek calls.
/// </summary>
/// <param name="inputMode">Input mode passed through to <c>InputToString</c>
/// and <c>ToInt32sRef</c>.</param>
/// <param name="fst">The FST under test; asserted to be <c>null</c> when
/// <c>pairs</c> is empty and non-null otherwise.</param>
private void VerifyUnPruned(int inputMode, FST<T> fst)
{
    FST<long?> fstLong;
    ISet<long?> validOutputs;
    long minLong = long.MaxValue;
    long maxLong = long.MinValue;

    if (doReverseLookup)
    {
        fstLong = fst as FST<long?>;
        validOutputs = new HashSet<long?>();
        foreach (InputOutput<T> pair in pairs)
        {
            long? output = pair.Output as long?;
            maxLong = Math.Max(maxLong, output.Value);
            minLong = Math.Min(minLong, output.Value);
            validOutputs.Add(output.Value);
        }
    }
    else
    {
        fstLong = null;
        validOutputs = null;
    }

    if (pairs.Count == 0)
    {
        Assert.IsNull(fst);
        return;
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now verify " + pairs.Count + " terms");
        foreach (InputOutput<T> pair in pairs)
        {
            Assert.IsNotNull(pair);
            Assert.IsNotNull(pair.Input);
            Assert.IsNotNull(pair.Output);
            Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output));
        }
    }

    Assert.IsNotNull(fst);

    // visit valid pairs in order -- make sure all words
    // are accepted, and FSTEnum's next() steps through
    // them correctly
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: check valid terms/next()");
    }
    {
        Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst);
        foreach (InputOutput<T> pair in pairs)
        {
            Int32sRef term = pair.Input;
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: check term=" + InputToString(inputMode, term) + " output=" + fst.Outputs.OutputToString(pair.Output));
            }
            T output = Run(fst, term, null);
            Assert.IsNotNull(output, "term " + InputToString(inputMode, term) + " is not accepted");
            Assert.IsTrue(OutputsEqual(pair.Output, output));

            // verify enum's next
            Int32sRefFSTEnum.InputOutput<T> t = fstEnum.Next();
            Assert.IsNotNull(t);
            Assert.AreEqual(term, t.Input, "expected input=" + InputToString(inputMode, term) + " but fstEnum returned " + InputToString(inputMode, t.Input));
            Assert.IsTrue(OutputsEqual(pair.Output, t.Output));
        }
        Assert.IsNull(fstEnum.Next());
    }

    IDictionary<Int32sRef, T> termsMap = new Dictionary<Int32sRef, T>();
    foreach (InputOutput<T> pair in pairs)
    {
        termsMap[pair.Input] = pair.Output;
    }

    if (doReverseLookup && maxLong > minLong)
    {
        // Do random lookups so we test null (output doesn't
        // exist) case:
        Assert.IsNull(Util.GetByOutput(fstLong, minLong - 7));
        Assert.IsNull(Util.GetByOutput(fstLong, maxLong + 7));

        int num = LuceneTestCase.AtLeast(random, 100);
        for (int iter = 0; iter < num; iter++)
        {
            long v = TestUtil.NextInt64(random, minLong, maxLong);
            Int32sRef input = Util.GetByOutput(fstLong, v);
            Assert.IsTrue(validOutputs.Contains(v) || input == null);
        }
    }

    // find random matching word and make sure it's valid
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify random accepted terms");
    }
    Int32sRef scratch = new Int32sRef(10);
    int num_ = LuceneTestCase.AtLeast(random, 500);
    for (int iter = 0; iter < num_; iter++)
    {
        T output = RandomAcceptedWord(fst, scratch);

        // Single lookup: fetch the expected output while checking membership.
        T expectedOutput;
        Assert.IsTrue(termsMap.TryGetValue(scratch, out expectedOutput), "accepted word " + InputToString(inputMode, scratch) + " is not valid");
        Assert.IsTrue(OutputsEqual(expectedOutput, output));

        if (doReverseLookup)
        {
            //System.out.println("lookup output=" + output + " outs=" + fst.Outputs);
            Int32sRef input = Util.GetByOutput(fstLong, (output as long?).Value);
            Assert.IsNotNull(input);
            //System.out.println("  got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
            Assert.AreEqual(scratch, input);
        }
    }

    // test IntsRefFSTEnum.Seek:
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify seek");
    }
    Int32sRefFSTEnum<T> fstEnum_ = new Int32sRefFSTEnum<T>(fst);
    num_ = LuceneTestCase.AtLeast(random, 100);
    for (int iter = 0; iter < num_; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" iter=" + iter);
        }
        if (random.NextBoolean())
        {
            // seek to term that doesn't exist:
            while (true)
            {
                Int32sRef term = ToInt32sRef(GetRandomString(random), inputMode);
                int pos = pairs.BinarySearch(new InputOutput<T>(term, default(T)));
                if (pos < 0)
                {
                    // A negative result encodes the insertion point as its bitwise complement.
                    pos = -(pos + 1);
                    // ok doesn't exist
                    //System.out.println("  seek " + inputToString(inputMode, term));
                    Int32sRefFSTEnum.InputOutput<T> seekResult;
                    if (random.Next(3) == 0)
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekExact term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekExact(term);
                        // -1 marks "expect null" for the check below.
                        pos = -1;
                    }
                    else if (random.NextBoolean())
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekFloor term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekFloor(term);
                        // Floor lands on the entry just before the insertion point.
                        pos--;
                    }
                    else
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekCeil term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekCeil(term);
                    }

                    if (pos != -1 && pos < pairs.Count)
                    {
                        //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.Outputs.outputToString(seekResult.Output));
                        Assert.IsNotNull(seekResult, "got null but expected term=" + InputToString(inputMode, pairs[pos].Input));
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" got " + InputToString(inputMode, seekResult.Input));
                        }
                        Assert.AreEqual(pairs[pos].Input, seekResult.Input, "expected " + InputToString(inputMode, pairs[pos].Input) + " but got " + InputToString(inputMode, seekResult.Input));
                        Assert.IsTrue(OutputsEqual(pairs[pos].Output, seekResult.Output));
                    }
                    else
                    {
                        // seeked before start or beyond end
                        //System.out.println("seek=" + seekTerm);
                        Assert.IsNull(seekResult, "expected null but got " + (seekResult == null ? "null" : InputToString(inputMode, seekResult.Input)));
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" got null");
                        }
                    }

                    break;
                }
            }
        }
        else
        {
            // seek to term that does exist:
            InputOutput<T> pair = pairs[random.Next(pairs.Count)];
            Int32sRefFSTEnum.InputOutput<T> seekResult;
            if (random.Next(3) == 2)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekExact term=" + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekExact(pair.Input);
            }
            else if (random.NextBoolean())
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekFloor " + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekFloor(pair.Input);
            }
            else
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekCeil " + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekCeil(pair.Input);
            }
            Assert.IsNotNull(seekResult);
            Assert.AreEqual(pair.Input, seekResult.Input, "got " + InputToString(inputMode, seekResult.Input) + " but expected " + InputToString(inputMode, pair.Input));
            Assert.IsTrue(OutputsEqual(pair.Output, seekResult.Output));
        }
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: mixed next/seek");
    }

    // test mixed next/seek
    num_ = LuceneTestCase.AtLeast(random, 100);
    for (int iter = 0; iter < num_; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine("TEST: iter " + iter);
        }
        // reset:
        fstEnum_ = new Int32sRefFSTEnum<T>(fst);
        // Index into pairs of the entry the enum is expected to be positioned on.
        int upto = -1;
        while (true)
        {
            bool isDone = false;
            if (upto == pairs.Count - 1 || random.NextBoolean())
            {
                // next
                upto++;
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do next");
                }
                isDone = fstEnum_.Next() == null;
            }
            else if (upto != -1 && upto < 0.75 * pairs.Count && random.NextBoolean())
            {
                int attempt = 0;
                for (; attempt < 10; attempt++)
                {
                    Int32sRef term = ToInt32sRef(GetRandomString(random), inputMode);
                    if (!termsMap.ContainsKey(term) && term.CompareTo(pairs[upto].Input) > 0)
                    {
                        int pos = pairs.BinarySearch(new InputOutput<T>(term, default(T)));
                        // Use the test framework's assert (not Debug.Assert) so the check
                        // also runs in Release builds and fails as a normal test failure.
                        Assert.IsTrue(pos < 0, "term absent from termsMap was unexpectedly found in pairs");
                        upto = -(pos + 1);

                        if (random.NextBoolean())
                        {
                            upto--;
                            Assert.IsTrue(upto != -1);
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine(" do non-exist seekFloor(" + InputToString(inputMode, term) + ")");
                            }
                            isDone = fstEnum_.SeekFloor(term) == null;
                        }
                        else
                        {
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine(" do non-exist seekCeil(" + InputToString(inputMode, term) + ")");
                            }
                            isDone = fstEnum_.SeekCeil(term) == null;
                        }

                        break;
                    }
                }
                if (attempt == 10)
                {
                    // No usable random term was found; pick another action.
                    continue;
                }
            }
            else
            {
                int inc = random.Next(pairs.Count - upto - 1);
                upto += inc;
                if (upto == -1)
                {
                    upto = 0;
                }

                if (random.NextBoolean())
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekCeil(" + InputToString(inputMode, pairs[upto].Input) + ")");
                    }
                    isDone = fstEnum_.SeekCeil(pairs[upto].Input) == null;
                }
                else
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekFloor(" + InputToString(inputMode, pairs[upto].Input) + ")");
                    }
                    isDone = fstEnum_.SeekFloor(pairs[upto].Input) == null;
                }
            }
            if (LuceneTestCase.VERBOSE)
            {
                if (!isDone)
                {
                    Console.WriteLine(" got " + InputToString(inputMode, fstEnum_.Current.Input));
                }
                else
                {
                    Console.WriteLine(" got null");
                }
            }

            if (upto == pairs.Count)
            {
                Assert.IsTrue(isDone);
                break;
            }
            else
            {
                Assert.IsFalse(isDone);
                Assert.AreEqual(pairs[upto].Input, fstEnum_.Current.Input);
                Assert.IsTrue(OutputsEqual(pairs[upto].Output, fstEnum_.Current.Output));

                /*
                 * if (upto < pairs.size()-1) {
                 *   int tryCount = 0;
                 *   while(tryCount < 10) {
                 *     final IntsRef t = toIntsRef(getRandomString(), inputMode);
                 *     if (pairs.get(upto).input.compareTo(t) < 0) {
                 *       final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
                 *       if (LuceneTestCase.VERBOSE) {
                 *         System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
                 *       }
                 *       Assert.AreEqual(expected, fstEnum.beforeNext(t));
                 *       break;
                 *     }
                 *     tryCount++;
                 *   }
                 * }
                 */
            }
        }
    }
}