/// <summary>
/// Adds the input "a" three times and "b" once to a builder backed by
/// ListOfOutputs, then checks that looking up each input returns every
/// output that was added for it, in the order it was added.
/// </summary>
public void TestListOfOutputs()
{
    PositiveIntOutputs _outputs = PositiveIntOutputs.Singleton;
    ListOfOutputs<long?> outputs = new ListOfOutputs<long?>(_outputs);
    Builder<object> builder = new Builder<object>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE1, outputs);

    IntsRef scratch = new IntsRef();
    // Add the same input more than once and the outputs
    // are merged:
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 1L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 3L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 0L);
    builder.Add(Util.ToIntsRef(new BytesRef("b"), scratch), 17L);
    FST<object> fst = builder.Finish();

    // "a" was added three times, so three outputs are expected.
    object output = Util.Get(fst, new BytesRef("a"));
    assertNotNull(output);
    IList<long?> outputList = outputs.AsList(output);
    // IList<T> exposes Count; size() is the Java spelling and does not exist here.
    assertEquals(3, outputList.Count);
    assertEquals(1L, outputList[0]);
    assertEquals(3L, outputList[1]);
    assertEquals(0L, outputList[2]);

    // "b" was added once, so a single output is expected.
    output = Util.Get(fst, new BytesRef("b"));
    assertNotNull(output);
    outputList = outputs.AsList(output);
    assertEquals(1, outputList.Count);
    assertEquals(17L, outputList[0]);
}
/// <summary>
/// Same scenario as adding repeated inputs to a ListOfOutputs-backed builder,
/// but the empty input is also added (four times) so that lookups of "",
/// "a" and "b" are each checked for the full, ordered list of outputs.
/// </summary>
public void TestListOfOutputsEmptyString()
{
    PositiveIntOutputs _outputs = PositiveIntOutputs.Singleton;
    ListOfOutputs<long?> outputs = new ListOfOutputs<long?>(_outputs);
    Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, outputs);

    // A freshly created IntsRef is used here as the empty input.
    IntsRef scratch = new IntsRef();
    builder.Add(scratch, 0L);
    builder.Add(scratch, 1L);
    builder.Add(scratch, 17L);
    builder.Add(scratch, 1L);

    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 1L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 3L);
    builder.Add(Util.ToIntsRef(new BytesRef("a"), scratch), 0L);
    builder.Add(Util.ToIntsRef(new BytesRef("b"), scratch), 0L);

    FST<object> fst = builder.Finish();

    // Empty input: four outputs, duplicates included.
    object output = Util.Get(fst, new BytesRef(""));
    assertNotNull(output);
    IList<long?> outputList = outputs.AsList(output);
    // IList<T> exposes Count; size() is the Java spelling and does not exist here.
    assertEquals(4, outputList.Count);
    assertEquals(0L, outputList[0]);
    assertEquals(1L, outputList[1]);
    assertEquals(17L, outputList[2]);
    assertEquals(1L, outputList[3]);

    output = Util.Get(fst, new BytesRef("a"));
    assertNotNull(output);
    outputList = outputs.AsList(output);
    assertEquals(3, outputList.Count);
    assertEquals(1L, outputList[0]);
    assertEquals(3L, outputList[1]);
    assertEquals(0L, outputList[2]);

    output = Util.Get(fst, new BytesRef("b"));
    assertNotNull(output);
    outputList = outputs.AsList(output);
    assertEquals(1, outputList.Count);
    assertEquals(0L, outputList[0]);
}
/// <summary>
/// Builds very large FSTs and verifies them three ways: by direct lookup,
/// by full enumeration, and again after saving to / reloading from a
/// memory-mapped temp directory.
/// <para>
/// For each of the two <c>doPack</c> settings the test builds an FST with
/// byte-sequence outputs and one with long outputs, each grown until the
/// builder reports more than <c>LIMIT</c> bytes. When <c>doPack</c> is false it
/// additionally builds an FST without outputs, grown until the builder reports
/// more than <c>int.MaxValue + 100 * 1024 * 1024</c> states.
/// </para>
/// <para>
/// Inputs are regenerated during verification by re-seeding <see cref="Random"/>
/// with the same seed used while building, so build and verify walk the same
/// sequence.
/// </para>
/// </summary>
public virtual void Test()
{
    int[] ints = new int[7];
    Int32sRef input = new Int32sRef(ints, 0, ints.Length);
    int seed = Random.Next();

    Directory dir = new MMapDirectory(CreateTempDir("2BFST"));
    try
    {
        for (int doPackIter = 0; doPackIter < 2; doPackIter++)
        {
            bool doPack = doPackIter == 1;

            // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
            if (!doPack)
            {
                Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                Outputs<object> outputs = NoOutputs.Singleton;
                object NO_OUTPUT = outputs.NoOutput;
                Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                int count = 0;
                Random r = new Random(seed);
                int[] ints2 = new int[200];
                Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length);
                while (true)
                {
                    //System.out.println("add: " + input + " -> " + output);
                    // Positions 0..9 are left to NextInput; the tail is random filler.
                    for (int i = 10; i < ints2.Length; i++)
                    {
                        ints2[i] = r.Next(256);
                    }
                    b.Add(input2, NO_OUTPUT);
                    count++;
                    if (count % 100000 == 0)
                    {
                        Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                    }
                    // 100L forces the threshold to be computed as a long.
                    if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                    {
                        break;
                    }
                    NextInput(r, ints2);
                }

                FST<object> fst = b.Finish();

                // Pass 0 verifies the in-memory FST, pass 1 the reloaded one.
                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    Arrays.Fill(ints2, 0);
                    r = new Random(seed);

                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }
                        for (int j = 10; j < ints2.Length; j++)
                        {
                            ints2[j] = r.Next(256);
                        }
                        Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                        NextInput(r, ints2);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<object> fstEnum = new Int32sRefFSTEnum<object>(fst);

                    Arrays.Fill(ints2, 0);
                    r = new Random(seed);
                    int upto = 0;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<object> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        for (int j = 10; j < ints2.Length; j++)
                        {
                            ints2[j] = r.Next(256);
                        }
                        Assert.AreEqual(input2, pair.Input);
                        Assert.AreEqual(NO_OUTPUT, pair.Output);
                        upto++;
                        NextInput(r, ints2);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        // using guarantees the handles are released even if Save/load throws.
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<object>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }

            // Build FST w/ ByteSequenceOutputs and stop when FST
            // size = 3GB
            {
                Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
                Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                // output wraps outputBytes, so refilling the array changes output in place.
                var outputBytes = new byte[20];
                BytesRef output = new BytesRef(outputBytes);
                Arrays.Fill(ints, 0);
                int count = 0;
                Random r = new Random(seed);
                while (true)
                {
                    r.NextBytes(outputBytes);
                    //System.out.println("add: " + input + " -> " + output);
                    b.Add(input, BytesRef.DeepCopyOf(output));
                    count++;
                    if (count % 1000000 == 0)
                    {
                        Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                    }
                    if (b.GetFstSizeInBytes() > LIMIT)
                    {
                        break;
                    }
                    NextInput(r, ints);
                }

                FST<BytesRef> fst = b.Finish();

                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    r = new Random(seed);
                    Arrays.Fill(ints, 0);

                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }
                        r.NextBytes(outputBytes);
                        Assert.AreEqual(output, Util.Get(fst, input));
                        NextInput(r, ints);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<BytesRef> fstEnum = new Int32sRefFSTEnum<BytesRef>(fst);

                    Arrays.Fill(ints, 0);
                    r = new Random(seed);
                    int upto = 0;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        Assert.AreEqual(input, pair.Input);
                        r.NextBytes(outputBytes);
                        Assert.AreEqual(output, pair.Output);
                        upto++;
                        NextInput(r, ints);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<BytesRef>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }

            // Build FST w/ PositiveIntOutputs and stop when FST
            // size = 3GB
            {
                Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
                Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                long output = 1;

                Arrays.Fill(ints, 0);
                int count = 0;
                Random r = new Random(seed);
                while (true)
                {
                    //System.out.println("add: " + input + " -> " + output);
                    b.Add(input, output);
                    // Strictly increasing outputs (step is 1..10).
                    output += 1 + r.Next(10);
                    count++;
                    if (count % 1000000 == 0)
                    {
                        Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                    }
                    if (b.GetFstSizeInBytes() > LIMIT)
                    {
                        break;
                    }
                    NextInput(r, ints);
                }

                FST<long?> fst = b.Finish();

                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    Arrays.Fill(ints, 0);

                    output = 1;
                    r = new Random(seed);
                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }

                        // forward lookup:
                        Assert.AreEqual(output, (long)Util.Get(fst, input));
                        // reverse lookup:
                        Assert.AreEqual(input, Util.GetByOutput(fst, output));
                        output += 1 + r.Next(10);
                        NextInput(r, ints);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<long?> fstEnum = new Int32sRefFSTEnum<long?>(fst);

                    Arrays.Fill(ints, 0);
                    r = new Random(seed);
                    int upto = 0;
                    output = 1;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<long?> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        Assert.AreEqual(input, pair.Input);
                        Assert.AreEqual(output, pair.Output.Value);
                        output += 1 + r.Next(10);
                        upto++;
                        NextInput(r, ints);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<long?>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }
        }
    }
    finally
    {
        // Release the memory-mapped directory even when an assertion above fails.
        dir.Dispose();
    }
}
/// <summary>
/// Retrieve suggestions.
/// <para>
/// The key is analyzed with the query analyzer and only the last token seen for
/// each gram size is kept. Models are then tried from the highest order down;
/// each time a model is passed over or consulted the running backoff factor is
/// multiplied by <c>ALPHA</c>. A final token that was already produced by a
/// higher-order model is not added again.
/// </para>
/// </summary>
/// <param name="key">Text to complete.</param>
/// <param name="contexts">Must be <c>null</c>; contexts are not supported.</param>
/// <param name="num">Maximum number of results to return.</param>
/// <returns>At most <paramref name="num"/> results, sorted with
/// <c>ComparatorAnonymousInnerClassHelper</c>.</returns>
/// <exception cref="System.ArgumentException">
/// <paramref name="contexts"/> is not <c>null</c>; or a token's gram count does not
/// match the recalculated count; or the analyzer produced no non-empty token.
/// </exception>
public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
    try
    {
        TermToBytesRefAttribute termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
        OffsetAttribute offsetAtt = ts.AddAttribute<OffsetAttribute>();
        PositionLengthAttribute posLenAtt = ts.AddAttribute<PositionLengthAttribute>();
        PositionIncrementAttribute posIncAtt = ts.AddAttribute<PositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;

            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            int recalculatedCount = CountGrams(tokenBytes);
            if (recalculatedCount != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + recalculatedCount);
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                // Append the separator byte and move the token up one gram slot.
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();

        var bytesReader = fst.BytesReader;

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        // Kept as a concrete List<T> locally so Sort and RemoveRange are available;
        // it is still returned through the IList<LookupResult> interface.
        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            // Split at the last separator: everything before it is the context,
            // everything after it is the fragment being completed.
            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;

            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                // Exception has no (Exception) constructor; pass the text as the
                // message and keep the original as InnerException.
                throw new Exception(bogus.ToString(), bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("    " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                // Rewind the token to the bare prefix before appending this completion.
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);

                // Converting an out-of-range double to long is unspecified in unchecked
                // C#, whereas the Java original saturates. (double)long.MaxValue is 2^63,
                // so the product can reach that value when backoff is 1.0 and the decoded
                // weight equals contextCount; clamp explicitly.
                double score = long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount;
                long value = score >= long.MaxValue ? long.MaxValue : (long)score;

                LookupResult result = new LookupResult(spare.ToString(), value);
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparatorAnonymousInnerClassHelper(this));

        // Drop everything past the first num entries, in place.
        if (results.Count > num)
        {
            results.RemoveRange(num, results.Count - num);
        }

        return results;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}