/// <summary>
/// Copies every arc leaving the root node of <paramref name="automaton"/> and
/// returns the copies in the reverse of the order in which they were read, so
/// that completions with the highest weights come first.
/// </summary>
/// <param name="automaton">The automaton whose root arcs are cached.</param>
/// <returns>Copies of the root node's arcs, last-read arc first.</returns>
/// <exception cref="Exception">Wraps any <see cref="IOException"/> raised while reading arcs.</exception>
private static FST.Arc<object>[] CacheRootArcs(FST<object> automaton)
{
    try
    {
        List<FST.Arc<object>> collected = new List<FST.Arc<object>>();
        FST.Arc<object> cursor = automaton.GetFirstArc(new FST.Arc<object>());
        FST.BytesReader reader = automaton.BytesReader;
        automaton.ReadFirstTargetArc(cursor, cursor, reader);

        // The cursor is reused for every arc, so each one is snapshotted before advancing.
        collected.Add(new FST.Arc<object>().CopyFrom(cursor));
        while (!cursor.IsLast)
        {
            automaton.ReadNextArc(cursor, reader);
            collected.Add(new FST.Arc<object>().CopyFrom(cursor));
        }

        // We want highest weights first: fill the result back to front.
        FST.Arc<object>[] reversed = new FST.Arc<object>[collected.Count];
        for (int i = 0; i < reversed.Length; i++)
        {
            reversed[i] = collected[collected.Count - 1 - i];
        }
        return reversed;
    }
    catch (IOException e)
    {
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Walks the arcs reachable from the first arc of <paramref name="fst"/> in
/// first-in-first-out order. A target node that has outgoing arcs is expanded
/// only the first time it is reached. The method produces no result; it only
/// reads arcs.
/// </summary>
/// <param name="fst">The FST to traverse.</param>
internal static void Walk<T>(FST<T> fst) // LUCENENET NOTE: Not referenced
{
    // FIFO work list. Queue<T>.Dequeue is O(1), whereas List<T>.RemoveAt(0)
    // shifts every remaining element on each pop.
    Queue<FST.Arc<T>> queue = new Queue<FST.Arc<T>>();
    FST.BytesReader reader = fst.GetBytesReader();
    FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());
    queue.Enqueue(startArc);
    BitArray seen = new BitArray(queue.Count);
    while (queue.Count > 0)
    {
        FST.Arc<T> arc = queue.Dequeue();
        long node = arc.Target;
        if (FST<T>.TargetHasArcs(arc) && !seen.SafeGet((int)node))
        {
            seen.SafeSet((int)node, true);
            fst.ReadFirstRealTargetArc(node, arc, reader);
            while (true)
            {
                // 'arc' is overwritten by the next read, so enqueue a copy.
                queue.Enqueue(new FST.Arc<T>().CopyFrom(arc));
                if (arc.IsLast)
                {
                    break;
                }
                fst.ReadNextRealArc(arc, reader);
            }
        }
    }
}
// runs the term, returning the output, or null if term // isn't accepted. if prefixLength is non-null it must be // length 1 int array; prefixLength[0] is set to the length // of the term prefix that matches private static T Run(FST <T> fst, Int32sRef term, int[] prefixLength) // LUCENENET: CA1822: Mark members as static { if (Debugging.AssertsEnabled) { Debugging.Assert(prefixLength == null || prefixLength.Length == 1); } FST.Arc <T> arc = fst.GetFirstArc(new FST.Arc <T>()); T NO_OUTPUT = fst.Outputs.NoOutput; T output = NO_OUTPUT; FST.BytesReader fstReader = fst.GetBytesReader(); for (int i = 0; i <= term.Length; i++) { int label; if (i == term.Length) { label = FST.END_LABEL; } else { label = term.Int32s[term.Offset + i]; } // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); if (fst.FindTargetArc(label, arc, arc, fstReader) == null) { // System.out.println(" not found"); if (prefixLength != null) { prefixLength[0] = i; return(output); } else { return(default);
/// <summary>
/// Copies every arc leaving the root node of <paramref name="automaton"/> and
/// returns the copies in reverse read order, so that completions with the
/// highest weights come first.
/// </summary>
/// <param name="automaton">The automaton whose root arcs are cached.</param>
/// <returns>Copies of the root node's arcs, last-read arc first.</returns>
private static FST.Arc<object>[] CacheRootArcs(FST<object> automaton)
{
    try
    {
        // LUCENENET specific: a Stack is used rather than a List because its
        // ToArray() already yields the elements in reverse (last-in-first-out) order.
        var reversedArcs = new Stack<FST.Arc<object>>();
        FST.Arc<object> cursor = automaton.GetFirstArc(new FST.Arc<object>());
        FST.BytesReader reader = automaton.GetBytesReader();
        automaton.ReadFirstTargetArc(cursor, cursor, reader);

        reversedArcs.Push(new FST.Arc<object>().CopyFrom(cursor));
        while (!cursor.IsLast)
        {
            automaton.ReadNextArc(cursor, reader);
            reversedArcs.Push(new FST.Arc<object>().CopyFrom(cursor));
        }

        // We want highest weights first.
        return reversedArcs.ToArray();
    }
    catch (Exception e) when (e.IsIOException())
    {
        throw RuntimeException.Create(e);
    }
}
/// <summary>
/// Stores <paramref name="map"/> and, when it is non-null, pre-caches a copy of
/// each arc leaving the map's first node, keyed by the arc's label as a char.
/// Use the builder to create instances.
/// </summary>
/// <param name="map">The mapping FST; may be <c>null</c>, in which case nothing is cached.</param>
private NormalizeCharMap(FST<CharsRef> map)
{
    this.map = map;
    if (map == null)
    {
        return;
    }

    try
    {
        // Pre-cache root arcs:
        var rootArc = new FST.Arc<CharsRef>();
        FST.BytesReader reader = map.BytesReader;
        map.GetFirstArc(rootArc);
        if (!FST<CharsRef>.TargetHasArcs(rootArc))
        {
            return;
        }

        map.ReadFirstRealTargetArc(rootArc.Target, rootArc, reader);
        for (;;)
        {
            Debug.Assert(rootArc.Label != FST.END_LABEL);
            char label = (char)rootArc.Label;
            // rootArc is reused for the next read, so store a snapshot.
            cachedRootArcs[label] = new FST.Arc<CharsRef>().CopyFrom(rootArc);
            if (rootArc.IsLast)
            {
                break;
            }
            map.ReadNextRealArc(rootArc, reader);
        }
    }
    catch (IOException ioe)
    {
        // Bogus FST IOExceptions!! (will never happen)
        throw new Exception("Should never happen", ioe);
    }
}
/// <summary>
/// Follows <paramref name="input"/> through <paramref name="fst"/> label by label,
/// accumulating arc outputs along the way.
/// </summary>
/// <param name="fst">The FST to search.</param>
/// <param name="input">The labels to look up, read from <c>Ints</c> starting at <c>Offset</c> for <c>Length</c> entries.</param>
/// <returns>
/// The accumulated output, including the final output, when the input ends on a final arc;
/// otherwise <c>default(T)</c>.
/// </returns>
public static T Get<T>(FST<T> fst, IntsRef input)
{
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST<T>.Arc<T>());
    var fstReader = fst.BytesReader;

    // Accumulate output as we go
    T accumulated = fst.Outputs.NoOutput;
    for (int i = 0; i < input.Length; i++)
    {
        int label = input.Ints[input.Offset + i];
        if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
        {
            // No arc for this label: the input is not accepted.
            return default(T);
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }

    if (!arc.Final)
    {
        return default(T);
    }
    return fst.Outputs.Add(accumulated, arc.NextFinalOutput);
}
/// <summary>
/// Follows the bytes of <paramref name="scratch"/> through the FST from its first
/// arc, summing the output of each arc taken.
/// </summary>
/// <param name="scratch">The bytes to follow, read from <c>Bytes</c> between <c>Offset</c> and <c>Offset + Length</c>.</param>
/// <param name="arc">Scratch arc; on success it is left positioned on the last arc followed.</param>
/// <returns>The summed output, or <c>null</c> if some byte has no matching arc.</returns>
private long? LookupPrefix(BytesRef scratch, FST.Arc<long?> arc) //Bogus
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(0 == (long)fst.Outputs.NoOutput);
    }

    long sum = 0;
    var reader = fst.GetBytesReader();

    fst.GetFirstArc(arc);

    byte[] data = scratch.Bytes;
    int start = scratch.Offset;
    int limit = start + scratch.Length;
    for (int idx = start; idx < limit; idx++)
    {
        // Mask to get the label as a non-negative int.
        if (fst.FindTargetArc(data[idx] & 0xff, arc, arc, reader) == null)
        {
            return null;
        }
        sum += (long)arc.Output;
    }

    return sum;
}
/// <summary>
/// Stores <paramref name="map"/> and, when it is non-null, pre-caches a copy of
/// each arc leaving the map's first node, keyed by the arc's label as a char.
/// Use the builder to create instances.
/// </summary>
/// <param name="map">The mapping FST; may be <c>null</c>, in which case nothing is cached.</param>
private NormalizeCharMap(FST<CharsRef> map)
{
    this.map = map;
    if (map == null)
    {
        return;
    }

    try
    {
        // Pre-cache root arcs:
        var cursor = new FST.Arc<CharsRef>();
        FST.BytesReader reader = map.BytesReader;
        map.GetFirstArc(cursor);
        if (FST<CharsRef>.TargetHasArcs(cursor))
        {
            map.ReadFirstRealTargetArc(cursor.Target, cursor, reader);
            bool wasLast;
            do
            {
                Debug.Assert(cursor.Label != FST<CharsRef>.END_LABEL); // LUCENENET TODO END_LABEL shouldn't be under generic?
                // The cursor is reused for the next read, so store a snapshot.
                cachedRootArcs[Convert.ToChar((char)cursor.Label)] = new FST.Arc<CharsRef>().CopyFrom(cursor);
                wasLast = cursor.Last;
                if (!wasLast)
                {
                    map.ReadNextRealArc(cursor, reader);
                }
            }
            while (!wasLast);
        }
    }
    catch (IOException ioe)
    {
        // Bogus FST IOExceptions!! (will never happen)
        throw new Exception("Should never happen", ioe);
    }
}
// TODO: maybe a CharsRef version for BYTE2

/// <summary>
/// Follows the bytes of <paramref name="input"/> through <paramref name="fst"/>,
/// accumulating arc outputs along the way.
/// </summary>
/// <param name="fst">The FST to search; asserted to have input type <c>BYTE1</c>.</param>
/// <param name="input">The bytes to look up, read from <c>Bytes</c> starting at <c>Offset</c> for <c>Length</c> entries.</param>
/// <returns>
/// The accumulated output, including the final output, when the input ends on a final arc;
/// otherwise <c>default(T)</c>.
/// </returns>
public static T Get<T>(FST<T> fst, BytesRef input)
{
    Debug.Assert(fst.InputType == FST.INPUT_TYPE.BYTE1);

    var fstReader = fst.GetBytesReader();

    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST.Arc<T>());

    // Accumulate output as we go
    T accumulated = fst.Outputs.NoOutput;
    for (int i = 0; i < input.Length; i++)
    {
        // Mask to get the label as a non-negative int.
        int label = input.Bytes[i + input.Offset] & 0xFF;
        if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
        {
            return default(T);
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }

    return arc.IsFinal
        ? fst.Outputs.Add(accumulated, arc.NextFinalOutput)
        : default(T);
}
/// <summary>
/// Walks the arcs reachable from the first arc of <paramref name="fst"/> in
/// first-in-first-out order. A target node that has outgoing arcs is expanded
/// only the first time it is reached. The method produces no result; it only
/// reads arcs.
/// </summary>
/// <param name="fst">The FST to traverse.</param>
internal static void Walk<T>(FST<T> fst)
{
    // The Java original used List.remove(0) and a no-arg BitSet. Neither maps
    // onto List<T>.Remove / BitArray in .NET, so a Queue<T> and a HashSet<long>
    // are used instead. The set also keeps the full 64-bit node address rather
    // than truncating it to an int index.
    var queue = new Queue<FST.Arc<T>>();
    var seen = new HashSet<long>();
    var reader = fst.BytesReader;
    var startArc = fst.GetFirstArc(new FST.Arc<T>());
    queue.Enqueue(startArc);
    while (queue.Count > 0)
    {
        FST.Arc<T> arc = queue.Dequeue();
        long node = arc.Target;
        // HashSet.Add returns false when the node was already recorded, so it
        // is both the "not yet seen" test and the "mark as seen" step.
        if (FST.TargetHasArcs(arc) && seen.Add(node))
        {
            fst.ReadFirstRealTargetArc(node, arc, reader);
            while (true)
            {
                // 'arc' is overwritten by the next read, so enqueue a copy.
                queue.Enqueue(new FST.Arc<T>().CopyFrom(arc));
                if (arc.Last)
                {
                    break;
                }
                fst.ReadNextRealArc(arc, reader);
            }
        }
    }
}
/// <summary>
/// Initializes the enumerator over <paramref name="fst"/>: keeps the FST, its
/// bytes reader and its "no output" value, loads the FST's first arc into arc
/// slot 0, and sets the output at index 0 to "no output".
/// </summary>
/// <remarks>
/// NOTE(review): the previous summary described a <c>doFloor</c> flag that makes
/// advance position to the biggest term before the target. This constructor does
/// not reference such a flag, so that text presumably belongs to another member.
/// </remarks>
/// <param name="fst">The FST to enumerate.</param>
protected internal FSTEnum(FST<T> fst)
{
    Fst = fst;
    FstReader = fst.BytesReader;
    NO_OUTPUT = fst.Outputs.NoOutput;

    var rootSlot = GetArc(0);
    fst.GetFirstArc(rootSlot);

    Output[0] = NO_OUTPUT;
}
/// <summary>
/// Builds a lookup table of root arcs for the labels from <c>0x3040</c> up to and
/// including <c>cacheCeiling</c>. Slot <c>i</c> holds a copy of the arc for label
/// <c>0x3040 + i</c>, or stays <c>null</c> when the first arc has no such target arc.
/// </summary>
/// <returns>The table of cached arcs, indexed by <c>label - 0x3040</c>.</returns>
private FST.Arc<long?>[] CacheRootArcs()
{
    // First label covered by the cache. NOTE(review): presumably the start of
    // the character range worth caching; confirm against the caller.
    const int firstCachedLabel = 0x3040;

    FST.Arc<long?>[] cache = new FST.Arc<long?>[1 + (cacheCeiling - firstCachedLabel)];
    FST.Arc<long?> rootArc = new FST.Arc<long?>();
    fst.GetFirstArc(rootArc);
    FST.Arc<long?> found = new FST.Arc<long?>();
    FST.BytesReader reader = fst.GetBytesReader();

    // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
    for (int slot = 0; slot < cache.Length; slot++)
    {
        if (fst.FindTargetArc(firstCachedLabel + slot, rootArc, found, reader) != null)
        {
            // 'found' is reused on the next iteration, so store a snapshot.
            cache[slot] = new FST.Arc<long?>().CopyFrom(found);
        }
    }
    return cache;
}
/// <summary>
/// Initializes the enumerator over <paramref name="fst"/>: keeps the FST, its
/// bytes reader and its "no output" value, loads the FST's first arc into arc
/// slot 0, and sets the output at index 0 to "no output".
/// </summary>
/// <remarks>
/// NOTE(review): the previous summary described a <c>doFloor</c> flag that makes
/// advance position to the biggest term before the target. This constructor does
/// not reference such a flag, so that text presumably belongs to another member.
/// </remarks>
/// <param name="fst">The FST to enumerate.</param>
protected FSTEnum(FST<T> fst)
{
    m_fst = fst;
    m_fstReader = fst.GetBytesReader();
    NO_OUTPUT = fst.Outputs.NoOutput;

    var firstSlot = GetArc(0);
    fst.GetFirstArc(firstSlot);

    m_output[0] = NO_OUTPUT;
}
/// <summary>
/// Reverse lookup (lookup by output instead of by input), for the special case
/// where the FST's outputs are strictly ascending. Locates the input/output pair
/// whose output equals <paramref name="targetOutput"/> and returns <c>null</c>
/// if no such output exists.
///
/// <para/>NOTE: this only works with <see cref="T:FST{long?}"/>, and only when the
/// outputs ascend in the same order as the inputs. Simple ordinals (0, 1, 2, ...)
/// or file offsets (when appending to a file) fit this.
/// </summary>
/// <param name="fst">The FST to search.</param>
/// <param name="targetOutput">The output value to locate.</param>
/// <returns>The result of the six-argument <c>GetByOutput</c> overload, called with freshly allocated scratch objects.</returns>
public static Int32sRef GetByOutput(FST<long?> fst, long targetOutput)
{
    // TODO: would be nice not to alloc this on every lookup
    return GetByOutput(
        fst,
        targetOutput,
        fst.GetBytesReader(),
        fst.GetFirstArc(new FST.Arc<long?>()),
        new FST.Arc<long?>(),
        new Int32sRef());
}
/// <summary>
/// Positions the enumerator on the term with ordinal <paramref name="ord"/>: the
/// ordinal is mapped back to its input via <c>GetByOutput</c>, the input is
/// converted to bytes, and the wrapped enumerator is then sought to those bytes.
/// </summary>
/// <param name="ord">The ordinal to seek to.</param>
public override void SeekExact(long ord)
{
    // TODO: would be better to make this simpler and faster.
    // but we dont want to introduce a bug that corrupts our enum state!
    bytesReader.Position = 0;
    fst.GetFirstArc(firstArc);
    Int32sRef termInts = Lucene.Net.Util.Fst.Util.GetByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);

    // Reset the scratch bytes to an empty buffer sized for the term.
    int termLength = termInts.Length;
    scratchBytes.Bytes = new byte[termLength];
    scratchBytes.Offset = 0;
    scratchBytes.Length = 0;
    Lucene.Net.Util.Fst.Util.ToBytesRef(termInts, scratchBytes);

    // TODO: we could do this lazily, better to try to push into FSTEnum though?
    @in.SeekExact(scratchBytes);
}
/// <summary> /// Looks up the output for this input, or <c>null</c> if the /// input is not accepted. /// </summary> public static T Get <T>(FST <T> fst, Int32sRef input) where T : class // LUCENENET specific - added class constraint, since we compare reference equality { // TODO: would be nice not to alloc this on every lookup var arc = fst.GetFirstArc(new FST.Arc <T>()); var fstReader = fst.GetBytesReader(); // Accumulate output as we go T output = fst.Outputs.NoOutput; for (int i = 0; i < input.Length; i++) { if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) is null) { return(default);
/// <summary>
/// Performs a random walk through <paramref name="fst"/>: at each node all
/// outgoing arcs are read and one is picked at random, until an arc labelled
/// <c>END_LABEL</c> is chosen. The labels taken (excluding the end label) are
/// appended to <paramref name="in"/>, which is reset to empty first.
/// </summary>
/// <param name="fst">The FST to walk.</param>
/// <param name="in">Receives the labels of the chosen path.</param>
/// <returns>The outputs of all chosen arcs, combined with <c>Outputs.Add</c>.</returns>
private T RandomAcceptedWord(FST<T> fst, IntsRef @in)
{
    FST.Arc<T> current = fst.GetFirstArc(new FST.Arc<T>());

    IList<FST.Arc<T>> candidates = new List<FST.Arc<T>>();
    @in.Length = 0;
    @in.Offset = 0;
    T accumulated = fst.Outputs.NoOutput;
    FST.BytesReader reader = fst.BytesReader;

    for (;;)
    {
        // read all arcs:
        fst.ReadFirstTargetArc(current, current, reader);
        for (;;)
        {
            candidates.Add(new FST.Arc<T>().CopyFrom(current));
            if (current.Last)
            {
                break;
            }
            fst.ReadNextArc(current, reader);
        }

        // pick one
        current = candidates[Random.Next(candidates.Count)];
        candidates.Clear();

        // accumulate output
        accumulated = fst.Outputs.Add(accumulated, current.Output);

        // append label
        if (current.Label == FST<T>.END_LABEL)
        {
            return accumulated;
        }

        if (@in.Ints.Length == @in.Length)
        {
            @in.Grow(1 + @in.Length);
        }
        @in.Ints[@in.Length++] = current.Label;
    }
}
/// <summary>
/// Resolves the ordinal <paramref name="ord"/> to its bytes: the ordinal is mapped
/// back to its input via <c>GetByOutput</c> and the input is written into
/// <paramref name="result"/>, whose buffer is replaced with a new array.
/// </summary>
/// <param name="ord">The ordinal to resolve.</param>
/// <param name="result">Receives the bytes for the ordinal.</param>
public override void LookupOrd(int ord, BytesRef result)
{
    try
    {
        @in.Position = 0;
        fst.GetFirstArc(firstArc);
        Int32sRef termInts = Lucene.Net.Util.Fst.Util.GetByOutput(fst, ord, @in, firstArc, scratchArc, scratchInts);

        // Hand the caller an empty buffer sized for the term before filling it.
        int termLength = termInts.Length;
        result.Bytes = new byte[termLength];
        result.Offset = 0;
        result.Length = 0;
        Util.ToBytesRef(termInts, result);
    }
    catch (Exception bogus) when (bogus.IsIOException())
    {
        throw RuntimeException.Create(bogus);
    }
}
/// <summary>
/// Resolves the ordinal <paramref name="ord"/> to its bytes: the ordinal is mapped
/// back to its input via <c>GetByOutput</c> and the input is written into
/// <paramref name="result"/>, whose buffer is replaced with a new array.
/// </summary>
/// <param name="ord">The ordinal to resolve.</param>
/// <param name="result">Receives the bytes for the ordinal.</param>
/// <exception cref="Exception">Wraps any <see cref="IOException"/> raised during the lookup.</exception>
public override void LookupOrd(long ord, BytesRef result)
{
    try
    {
        @in.Position = 0;
        fst.GetFirstArc(firstArc);
        Int32sRef termInts = Lucene.Net.Util.Fst.Util.GetByOutput(fst, ord, @in, firstArc, scratchArc, scratchInts);

        // Hand the caller an empty buffer sized for the term before filling it.
        int termLength = termInts.Length;
        result.Bytes = new byte[termLength];
        result.Offset = 0;
        result.Length = 0;
        Lucene.Net.Util.Fst.Util.ToBytesRef(termInts, result);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }
}
/// <summary>
/// Resolves the ordinal <paramref name="ord"/> to its bytes: the ordinal is mapped
/// back to its input via <c>GetByOutput</c> and the input is written into
/// <paramref name="result"/>, whose buffer is replaced with a new array.
/// </summary>
/// <param name="ord">The ordinal to resolve.</param>
/// <param name="result">Receives the bytes for the ordinal.</param>
/// <exception cref="System.IO.IOException">Propagated unchanged if the lookup raises it.</exception>
public override void LookupOrd(int ord, BytesRef result)
{
    try
    {
        @in.Position = 0;
        Fst.GetFirstArc(FirstArc);
        IntsRef output = Lucene.Net.Util.Fst.Util.GetByOutput(Fst, ord, @in, FirstArc, ScratchArc, ScratchInts);

        // Hand the caller an empty buffer sized for the term before filling it.
        result.Bytes = new byte[output.Length];
        result.Offset = 0;
        result.Length = 0;
        Util.ToBytesRef(output, result);
    }
    catch (System.IO.IOException)
    {
        // Rethrow with 'throw;' so the original stack trace is preserved;
        // 'throw bogus;' would reset it to this frame.
        throw;
    }
}
// TODO: this is pretty stupid, considering how the stemming algorithm works
// we can speed it up to be significantly faster!

/// <summary>
/// Looks up the code points of <paramref name="word"/> in the range
/// [<paramref name="offset"/>, <paramref name="offset"/> + <paramref name="length"/>)
/// in <paramref name="fst"/>, followed by the end label, accumulating every arc
/// output that is not the FST's "no output" instance.
/// </summary>
/// <param name="fst">The FST to search; may be <c>null</c>.</param>
/// <param name="word">Buffer holding the word.</param>
/// <param name="offset">Index of the first char of the word.</param>
/// <param name="length">Number of chars in the word.</param>
/// <returns>
/// The accumulated output, or <c>null</c> when <paramref name="fst"/> is <c>null</c>
/// or some label (including the end label) has no matching arc.
/// </returns>
/// <exception cref="Exception">Wraps any <see cref="IOException"/> raised during the lookup.</exception>
internal virtual IntsRef Lookup(FST<IntsRef> fst, char[] word, int offset, int length)
{
    if (fst == null)
    {
        return null;
    }

    FST.BytesReader bytesReader = fst.BytesReader;
    FST.Arc<IntsRef> arc = fst.GetFirstArc(new FST.Arc<IntsRef>());

    // Accumulate output as we go
    IntsRef NO_OUTPUT = fst.Outputs.NoOutput;
    IntsRef accumulated = NO_OUTPUT;

    int end = offset + length;
    try
    {
        int pos = offset;
        while (pos < end)
        {
            int codePoint = Character.CodePointAt(word, pos, end);
            if (fst.FindTargetArc(codePoint, arc, arc, bytesReader) == null)
            {
                return null;
            }
            if (arc.Output != NO_OUTPUT)
            {
                accumulated = fst.Outputs.Add(accumulated, arc.Output);
            }
            // Step by the number of chars this code point occupies.
            pos += Character.CharCount(codePoint);
        }

        if (fst.FindTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null)
        {
            return null;
        }
        if (arc.Output != NO_OUTPUT)
        {
            return fst.Outputs.Add(accumulated, arc.Output);
        }
        return accumulated;
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.Message, bogus);
    }
}
/// <summary>
/// Resolves the ordinal <paramref name="ord"/> to its bytes: the ordinal is mapped
/// back to its input via <c>GetByOutput</c> and the input is written into
/// <paramref name="result"/>, whose buffer is replaced with a new array.
/// </summary>
/// <param name="ord">The ordinal to resolve.</param>
/// <param name="result">Receives the bytes for the ordinal.</param>
/// <exception cref="Exception">Wraps any <see cref="IOException"/> raised during the lookup.</exception>
public override void LookupOrd(int ord, BytesRef result)
{
    try
    {
        @in.Position = 0;
        fst.GetFirstArc(firstArc);
        IntsRef termInts = Util.GetByOutput(fst, ord, @in, firstArc, scratchArc, scratchInts);

        // Hand the caller an empty buffer sized for the term before filling it.
        int termLength = termInts.Length;
        result.Bytes = new byte[termLength];
        result.Offset = 0;
        result.Length = 0;
        Util.ToBytesRef(termInts, result);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.Message, bogus);
    }
}
/// <summary>
/// Runs <paramref name="term"/> followed by the end label through
/// <paramref name="fst"/>, accumulating arc outputs.
/// </summary>
/// <param name="fst">The FST to run the term through.</param>
/// <param name="term">The labels of the term.</param>
/// <param name="prefixLength">
/// Either <c>null</c> or an array of length 1. When non-null, element 0 is set to
/// the number of labels matched before the first miss, or to the term length
/// when everything matched.
/// </param>
/// <returns>
/// The accumulated output. On a miss, the output accumulated so far when
/// <paramref name="prefixLength"/> is non-null, otherwise <c>default(T)</c>.
/// </returns>
private T Run(FST<T> fst, IntsRef term, int[] prefixLength)
{
    Debug.Assert(prefixLength == null || prefixLength.Length == 1);
    FST<T>.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>());
    T NO_OUTPUT = fst.Outputs.NoOutput;
    T accumulated = NO_OUTPUT;
    FST.BytesReader fstReader = fst.BytesReader;

    // One extra iteration past the term feeds the end label.
    for (int i = 0; i <= term.Length; i++)
    {
        int label = (i == term.Length)
            ? FST<T>.END_LABEL
            : term.Ints[term.Offset + i];

        if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
        {
            if (prefixLength == null)
            {
                return default(T);
            }
            prefixLength[0] = i;
            return accumulated;
        }

        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }

    if (prefixLength != null)
    {
        prefixLength[0] = term.Length;
    }

    return accumulated;
}
// TODO: this could be more efficient!

/// <summary>
/// Rewrites <paramref name="sb"/> in place. At each position the longest run of
/// chars that ends on a final arc of <paramref name="fst"/> is replaced by the
/// output accumulated for it, and scanning resumes after the inserted text.
/// </summary>
/// <param name="fst">The mapping FST.</param>
/// <param name="sb">The text to rewrite.</param>
internal static void ApplyMappings(FST<CharsRef> fst, StringBuilder sb)
{
    FST.BytesReader bytesReader = fst.BytesReader;
    FST.Arc<CharsRef> firstArc = fst.GetFirstArc(new FST.Arc<CharsRef>());
    CharsRef NO_OUTPUT = fst.Outputs.NoOutput;

    // Scratch arc, reset to the first arc for every start position.
    FST.Arc<CharsRef> arc = new FST.Arc<CharsRef>();

    for (int start = 0; start < sb.Length; start++)
    {
        arc.CopyFrom(firstArc);
        CharsRef pending = NO_OUTPUT;
        int matchEnd = -1;
        CharsRef matchOutput = null;

        for (int cursor = start; cursor < sb.Length; cursor++)
        {
            char ch = sb[cursor];
            if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
            {
                break;
            }
            pending = fst.Outputs.Add(pending, arc.Output);

            if (arc.IsFinal)
            {
                // Remember the longest match seen so far.
                matchOutput = fst.Outputs.Add(pending, arc.NextFinalOutput);
                matchEnd = cursor;
            }
        }

        if (matchEnd < 0)
        {
            continue;
        }

        sb.Remove(start, matchEnd + 1 - start);
        sb.Insert(start, matchOutput);
        // Skip over the inserted text; the loop's own increment adds the final +1.
        start += (matchOutput.Length - 1);
    }
}
/// <summary>
/// Walks the arcs reachable from the first arc of <paramref name="fst"/> in
/// first-in-first-out order. A target node that has outgoing arcs is expanded
/// only the first time it is reached. The method produces no result; it only
/// reads arcs.
/// </summary>
/// <param name="fst">The FST to traverse.</param>
private static void Walk<T>(FST<T> fst) // LUCENENET NOTE: Not referenced anywhere
{
    // FIFO work list. Queue<T>.Dequeue is O(1), whereas List<T>.RemoveAt(0)
    // shifts every remaining element on each pop.
    var queue = new Queue<FST.Arc<T>>();

    // Java version was BitSet().
    // NOTE(review): an earlier comment here said .NET has no zero-argument
    // BitSet constructor and that a size of zero was passed, but this call
    // takes no arguments. Confirm which constructor is intended.
    var seen = new BitSet();
    var reader = fst.GetBytesReader();
    var startArc = fst.GetFirstArc(new FST.Arc<T>());
    queue.Enqueue(startArc);
    while (queue.Count > 0)
    {
        var arc = queue.Dequeue();

        long node = arc.Target;
        if (FST<T>.TargetHasArcs(arc) && !seen.Get((int)node))
        {
            seen.Set((int)node);
            fst.ReadFirstRealTargetArc(node, arc, reader);
            while (true)
            {
                // 'arc' is overwritten by the next read, so enqueue a copy.
                queue.Enqueue(new FST.Arc<T>().CopyFrom(arc));
                if (arc.IsLast)
                {
                    break;
                }
                fst.ReadNextRealArc(arc, reader);
            }
        }
    }
}
/// <summary>
/// Returns the value mapped to the given key, or <code>null</code> if the key is not in the FST dictionary.
/// The key is the first <paramref name="bufferLen"/> chars of <paramref name="buffer"/>, read code point
/// by code point; code points are lower-cased with the invariant culture when <c>ignoreCase</c> is set.
/// </summary>
/// <param name="buffer">Buffer holding the key.</param>
/// <param name="bufferLen">Number of chars of <paramref name="buffer"/> that make up the key.</param>
/// <param name="scratchArc">Scratch arc, overwritten during the lookup.</param>
/// <param name="fstReader">Reader used to read the FST.</param>
public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
{
    BytesRef accumulated = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);

    for (int pos = 0; pos < bufferLen;)
    {
        int codePoint = Character.CodePointAt(buffer, pos, bufferLen);
        int label = ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint;
        if (fst.FindTargetArc(label, scratchArc, scratchArc, fstReader) == null)
        {
            return null;
        }
        accumulated = fst.Outputs.Add(accumulated, scratchArc.Output);
        // Step by the number of chars the (original) code point occupies.
        pos += Character.CharCount(codePoint);
    }

    // Only a key that ends on a final arc is a match.
    return scratchArc.IsFinal
        ? fst.Outputs.Add(accumulated, scratchArc.NextFinalOutput)
        : null;
}
/// <summary>
/// Stores <paramref name="map"/> and, when it is non-null, pre-caches a copy of
/// each arc leaving the map's first node, keyed by the arc's label as a char.
/// Use the builder to create instances.
/// </summary>
/// <param name="map">The mapping FST; may be <c>null</c>, in which case nothing is cached.</param>
private NormalizeCharMap(FST<CharsRef> map)
{
    this.map = map;
    if (map == null)
    {
        return;
    }

    try
    {
        // Pre-cache root arcs:
        var cursor = new FST.Arc<CharsRef>();
        FST.BytesReader reader = map.GetBytesReader();
        map.GetFirstArc(cursor);
        if (!FST<CharsRef>.TargetHasArcs(cursor))
        {
            return;
        }

        map.ReadFirstRealTargetArc(cursor.Target, cursor, reader);
        for (;;)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(cursor.Label != FST.END_LABEL);
            }
            char label = (char)cursor.Label;
            // The cursor is reused for the next read, so store a snapshot.
            cachedRootArcs[label] = new FST.Arc<CharsRef>().CopyFrom(cursor);
            if (cursor.IsLast)
            {
                break;
            }
            map.ReadNextRealArc(cursor, reader);
        }
    }
    catch (Exception ioe) when (ioe.IsIOException())
    {
        // Bogus FST IOExceptions!! (will never happen)
        throw RuntimeException.Create("Should never happen", ioe);
    }
}
// NOTE: copied from WFSTCompletionLookup & tweaked

/// <summary>
/// Follows the bytes of <paramref name="scratch"/> through <paramref name="fst"/>
/// from its first arc, combining the output of each arc taken.
/// </summary>
/// <param name="fst">The FST to search.</param>
/// <param name="bytesReader">Reader used to read the FST.</param>
/// <param name="scratch">The bytes to follow, read from <c>Bytes</c> between <c>Offset</c> and <c>Offset + Length</c>.</param>
/// <param name="arc">Scratch arc; on success it is left positioned on the last arc followed.</param>
/// <returns>The combined output, or <c>null</c> if some byte has no matching arc.</returns>
private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc)
{
    long? accumulated = fst.Outputs.NoOutput;

    fst.GetFirstArc(arc);

    var data = scratch.Bytes;
    var start = scratch.Offset;
    var limit = start + scratch.Length;
    for (var idx = start; idx < limit; idx++)
    {
        // Mask to get the label as a non-negative int.
        if (fst.FindTargetArc(data[idx] & 0xff, arc, arc, bytesReader) == null)
        {
            return null;
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }

    return accumulated;
}
/// <summary>
/// Loads <paramref name="frame"/> for the start arc (node) of the FST: its arc is
/// set to the FST's first arc and its automaton state to the automaton's initial state.
/// </summary>
/// <param name="frame">The frame to initialize.</param>
/// <returns>The same <paramref name="frame"/> that was passed in.</returns>
private Frame LoadFirstFrame(Frame frame)
{
    var startArc = fst.GetFirstArc(frame.fstArc);
    frame.fstArc = startArc;

    var startState = fsa.InitialState;
    frame.fsaState = startState;

    return frame;
}
/// <summary>
/// Scans forward from the current read position, feeding each token's code points
/// through the synonym FST to find the longest rule that matches there. Tokens come
/// from the lookahead buffer (<c>futureInputs</c>) while it has entries, and from
/// the input stream otherwise. On a match the output is recorded via
/// <c>AddOutput</c> and <c>inputSkipCount</c> is set to the number of tokens
/// matched; with no match but buffered input, <c>inputSkipCount</c> is set to 1.
/// </summary>
private void Parse()
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(inputSkipCount == 0);
    }

    int readCursor = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef bestOutput = null;
    int bestTokenCount = 0;
    int bestEndOffset = -1;

    BytesRef accumulated = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(scratchArc.Output == fst.Outputs.NoOutput);
    }

    int tokensSeen = 0;

    while (true)
    {
        // Pull next token's chars:
        char[] chars;
        int charCount;
        int tokenEndOffset = 0;

        if (readCursor == nextWrite)
        {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished)
            {
                break;
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(futureInputs[nextWrite].consumed);
            }
            // Asserting !futureInputs[nextWrite].keepOrig here would not be
            // correct: a syn match whose output is longer than its input can
            // set future inputs keepOrig to true.
            if (!m_input.IncrementToken())
            {
                // No more input tokens
                finished = true;
                break;
            }

            chars = termAtt.Buffer;
            charCount = termAtt.Length;
            PendingInput slot = futureInputs[nextWrite];
            lastStartOffset = slot.startOffset = offsetAtt.StartOffset;
            lastEndOffset = slot.endOffset = offsetAtt.EndOffset;
            tokenEndOffset = slot.endOffset;
            if (nextRead != nextWrite)
            {
                Capture();
            }
            else
            {
                slot.consumed = false;
            }
        }
        else
        {
            // Still in our lookahead
            chars = futureInputs[readCursor].term.Chars;
            charCount = futureInputs[readCursor].term.Length;
            tokenEndOffset = futureInputs[readCursor].endOffset;
        }

        tokensSeen++;

        // Run each char in this token through the FST:
        int charPos = 0;
        while (charPos < charCount)
        {
            int codePoint = Character.CodePointAt(chars, charPos, charCount);
            if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
            {
                // No arc for this code point: stop matching altogether.
                goto byTokenBreak;
            }

            // Accum the output
            accumulated = fst.Outputs.Add(accumulated, scratchArc.Output);
            charPos += Character.CharCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.IsFinal)
        {
            bestOutput = fst.Outputs.Add(accumulated, scratchArc.NextFinalOutput);
            bestTokenCount = tokensSeen;
            bestEndOffset = tokenEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
        {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        }

        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        accumulated = fst.Outputs.Add(accumulated, scratchArc.Output);
        if (nextRead == nextWrite)
        {
            Capture();
        }

        readCursor = RollIncr(readCursor);
    }
byTokenBreak:

    if (nextRead == nextWrite && !finished)
    {
        nextWrite = RollIncr(nextWrite);
    }

    if (bestOutput != null)
    {
        inputSkipCount = bestTokenCount;
        AddOutput(bestOutput, bestTokenCount, bestEndOffset);
    }
    else if (nextRead != nextWrite)
    {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(finished);
        }
    }
}
/// <summary>
/// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description
/// for visualization. Example of use:
///
/// <code>
/// using (TextWriter sw = new StreamWriter("out.dot"))
/// {
///     Util.ToDot(fst, sw, true, true);
/// }
/// </code>
///
/// and then, from command line:
///
/// <code>
/// dot -Tpng -o out.png out.dot
/// </code>
///
/// <para/>
/// Note: larger FSTs (a few thousand nodes) won't even
/// render, don't bother. If the FST is > 2.1 GB in size
/// then this method will throw strange exceptions.
/// <para/>
/// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>.
/// </summary>
/// <param name="fst">The FST to dump.</param>
/// <param name="out">Receives the <c>dot</c> text; it is flushed, but not closed, at the end.</param>
/// <param name="sameRank">
/// If <c>true</c>, the resulting <c>dot</c> file will try
/// to order states in layers of breadth-first traversal. This may
/// mess up arcs, but makes the output FST's structure a bit clearer.
/// </param>
/// <param name="labelStates">
/// If <c>true</c> states will have labels equal to their offsets in their
/// binary format. Expands the graph considerably.
/// </param>
public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates)
{
    const string expandedNodeColor = "blue";

    // Shape for states.
    const string stateShape = "circle";
    const string finalStateShape = "doublecircle";

    // This is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions).
    FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>());

    // A queue of transitions to consider for the current level.
    IList<FST.Arc<T>> thisLevelQueue = new List<FST.Arc<T>>();

    // A queue of transitions to consider when processing the next level.
    IList<FST.Arc<T>> nextLevelQueue = new List<FST.Arc<T>>();
    nextLevelQueue.Add(startArc);

    // A list of states on the same level (for ranking).
    IList<int?> sameLevelStates = new List<int?>();

    // A bitset of already seen states (target offset).
    BitArray seen = new BitArray(32);
    seen.SafeSet((int)startArc.Target, true);

    // Emit DOT prologue.
    @out.Write("digraph FST {\n");
    @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

    if (!labelStates)
    {
        @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    EmitDotState(@out, "initial", "point", "white", "");

    T NO_OUTPUT = fst.Outputs.NoOutput;
    var reader = fst.GetBytesReader();

    // Emit the state the start arc points at.
    string startColor = fst.IsExpandedTarget(startArc, reader) ? expandedNodeColor : null;
    bool startIsFinal = startArc.IsFinal;
    T startFinalOutput = (startIsFinal && !startArc.NextFinalOutput.Equals(NO_OUTPUT))
        ? startArc.NextFinalOutput
        : default(T);

    EmitDotState(@out, Convert.ToString(startArc.Target), startIsFinal ? finalStateShape : stateShape, startColor, startFinalOutput == null ? "" : fst.Outputs.OutputToString(startFinalOutput));

    @out.Write(" initial -> " + startArc.Target + "\n");

    int level = 0;

    while (nextLevelQueue.Count > 0)
    {
        // we could double buffer here, but it doesn't matter probably.
        thisLevelQueue.AddRange(nextLevelQueue);
        nextLevelQueue.Clear();

        level++;
        @out.Write("\n // Transitions and states at level: " + level + "\n");
        while (thisLevelQueue.Count > 0)
        {
            // Pop from the end of the list.
            FST.Arc<T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
            thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);
            if (!FST<T>.TargetHasArcs(arc))
            {
                continue;
            }

            // scan all target arcs
            long sourceNode = arc.Target;

            fst.ReadFirstRealTargetArc(arc.Target, arc, reader);

            while (true)
            {
                // Emit the unseen state and add it to the queue for the next level.
                if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                {
                    string stateColor = fst.IsExpandedTarget(arc, reader) ? expandedNodeColor : null;

                    string finalOutputText;
                    if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                    {
                        finalOutputText = fst.Outputs.OutputToString(arc.NextFinalOutput);
                    }
                    else
                    {
                        finalOutputText = "";
                    }

                    EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutputText);
                    // To see the node address, pass the arc's target as the label instead of finalOutputText.
                    seen.SafeSet((int)arc.Target, true);
                    nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc));
                    sameLevelStates.Add((int)arc.Target);
                }

                string outputSuffix = !arc.Output.Equals(NO_OUTPUT)
                    ? "/" + fst.Outputs.OutputToString(arc.Output)
                    : "";

                if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                {
                    // Tricky special case: sometimes, due to
                    // pruning, the builder can [sillily] produce
                    // an FST with an arc into the final end state
                    // (-1) but also with a next final output; in
                    // this case we pull that output up onto this
                    // arc
                    outputSuffix = outputSuffix + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                }

                string arcColor = arc.Flag(FST.BIT_TARGET_NEXT) ? "red" : "black";

                Debug.Assert(arc.Label != FST.END_LABEL);
                @out.Write(" " + sourceNode + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outputSuffix + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                // Break the loop if we're on the last arc of this state.
                if (arc.IsLast)
                {
                    break;
                }
                fst.ReadNextRealArc(arc, reader);
            }
        }

        // Emit state ranking information.
        if (sameRank && sameLevelStates.Count > 1)
        {
            @out.Write(" {rank=same; ");
            foreach (int state in sameLevelStates)
            {
                @out.Write(state + "; ");
            }
            @out.Write(" }\n");
        }
        sameLevelStates.Clear();
    }

    // Emit terminating state (always there anyway).
    @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
    @out.Write(" {rank=sink; -1 }\n");

    @out.Write("}\n");
    @out.Flush();
}