/// <summary>
/// Walks the bytes of <paramref name="scratch"/> from the root of the FST,
/// summing the arc outputs along the way. Returns <c>null</c> as soon as a
/// byte has no matching arc; otherwise returns the accumulated output for
/// the traversed prefix.
/// </summary>
private long? LookupPrefix(BytesRef scratch, FST.Arc<long?> arc) //Bogus
{
    // The raw += accumulation below (instead of Outputs.Add) relies on
    // NoOutput being 0, which is asserted here.
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(0 == (long)fst.Outputs.NoOutput);
    }
    var reader = fst.GetBytesReader();
    fst.GetFirstArc(arc);
    byte[] data = scratch.Bytes;
    int limit = scratch.Offset + scratch.Length;
    long sum = 0;
    for (int idx = scratch.Offset; idx < limit; idx++)
    {
        if (fst.FindTargetArc(data[idx] & 0xff, arc, arc, reader) == null)
        {
            return null;
        }
        sum += (long)arc.Output;
    }
    return sum;
}
/// <summary>
/// Looks up the output for this input, or null if the
/// input is not accepted.
/// </summary>
public static T Get<T>(FST<T> fst, IntsRef input)
{
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST<T>.Arc<T>());
    var reader = fst.BytesReader;
    // Output accumulated while walking the input symbols:
    T result = fst.Outputs.NoOutput;
    for (int pos = 0; pos < input.Length; pos++)
    {
        int label = input.Ints[input.Offset + pos];
        if (fst.FindTargetArc(label, arc, arc, reader) == null)
        {
            return default(T);
        }
        result = fst.Outputs.Add(result, arc.Output);
    }
    // Only accept the input if we ended on a final arc:
    return arc.Final ? fst.Outputs.Add(result, arc.NextFinalOutput) : default(T);
}
// TODO: maybe a CharsRef version for BYTE2

/// <summary>
/// Looks up the output for this input, or <c>null</c> if the
/// input is not accepted
/// </summary>
public static T Get<T>(FST<T> fst, BytesRef input)
{
    // This overload only supports byte-at-a-time FSTs.
    Debug.Assert(fst.InputType == FST.INPUT_TYPE.BYTE1);
    var reader = fst.GetBytesReader();
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST.Arc<T>());
    // Output accumulated while walking the input bytes:
    T result = fst.Outputs.NoOutput;
    int end = input.Offset + input.Length;
    for (int pos = input.Offset; pos < end; pos++)
    {
        if (fst.FindTargetArc(input.Bytes[pos] & 0xFF, arc, arc, reader) == null)
        {
            return default(T);
        }
        result = fst.Outputs.Add(result, arc.Output);
    }
    // Only accept the input if we ended on a final arc:
    return arc.IsFinal ? fst.Outputs.Add(result, arc.NextFinalOutput) : default(T);
}
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
// of the term prefix that matches
// NOTE(review): this method appears truncated in this chunk — the text ends
// mid-statement below; the remainder is not visible here. Do not assume the
// tail behavior; compare against the complete sibling Run(FST<T>, IntsRef, int[]).
private static T Run(FST<T> fst, Int32sRef term, int[] prefixLength) // LUCENENET: CA1822: Mark members as static
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(prefixLength == null || prefixLength.Length == 1);
    }
    FST.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>());
    T NO_OUTPUT = fst.Outputs.NoOutput;
    T output = NO_OUTPUT;
    FST.BytesReader fstReader = fst.GetBytesReader();
    // Iterate one step past the end so the trailing END_LABEL transition
    // (i == term.Length) is also checked:
    for (int i = 0; i <= term.Length; i++)
    {
        int label;
        if (i == term.Length)
        {
            label = FST.END_LABEL;
        }
        else
        {
            label = term.Int32s[term.Offset + i];
        }
        // System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
        if (fst.FindTargetArc(label, arc, arc, fstReader) == null)
        {
            // System.out.println("     not found");
            // No arc for this label: the term walks off the FST. Report the
            // matched prefix length if the caller asked for it.
            if (prefixLength != null)
            {
                prefixLength[0] = i;
                return (output);
            }
            else
            {
                return (default);
// TODO: this is pretty stupid, considering how the stemming algorithm works
// we can speed it up to be significantly faster!

/// <summary>
/// Looks up the given word region in the FST, accumulating outputs per
/// code point; returns <c>null</c> when the word is not accepted.
/// </summary>
internal virtual IntsRef Lookup(FST<IntsRef> fst, char[] word, int offset, int length)
{
    if (fst == null)
    {
        return null;
    }
    FST.BytesReader reader = fst.BytesReader;
    FST.Arc<IntsRef> arc = fst.GetFirstArc(new FST.Arc<IntsRef>());
    IntsRef noOutput = fst.Outputs.NoOutput;
    // Output accumulated while consuming the word's code points:
    IntsRef accumulated = noOutput;
    int end = offset + length;
    try
    {
        int i = offset;
        while (i < end)
        {
            // Surrogate-aware: advance by full code points, not chars.
            int cp = Character.CodePointAt(word, i, end);
            if (fst.FindTargetArc(cp, arc, arc, reader) == null)
            {
                return null;
            }
            if (arc.Output != noOutput)
            {
                accumulated = fst.Outputs.Add(accumulated, arc.Output);
            }
            i += Character.CharCount(cp);
        }
        // The word must also end on an END_LABEL transition to be accepted:
        if (fst.FindTargetArc(FST.END_LABEL, arc, arc, reader) == null)
        {
            return null;
        }
        if (arc.Output != noOutput)
        {
            return fst.Outputs.Add(accumulated, arc.Output);
        }
        return accumulated;
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.Message, bogus);
    }
}
/// <summary>
/// Pre-computes the root arcs for labels in the range
/// [0x3040, cacheCeiling] so hot lookups can skip <c>FindTargetArc</c>
/// from the root. Slots whose label has no root arc remain <c>null</c>.
/// </summary>
private FST.Arc<long?>[] CacheRootArcs()
{
    var rootCache = new FST.Arc<long?>[1 + (cacheCeiling - 0x3040)];
    var firstArc = new FST.Arc<long?>();
    fst.GetFirstArc(firstArc);
    var scratch = new FST.Arc<long?>();
    FST.BytesReader reader = fst.GetBytesReader();
    // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
    for (int slot = 0; slot < rootCache.Length; slot++)
    {
        if (fst.FindTargetArc(0x3040 + slot, firstArc, scratch, reader) != null)
        {
            // Copy into a fresh Arc since 'scratch' is reused every iteration.
            rootCache[slot] = new FST.Arc<long?>().CopyFrom(scratch);
        }
    }
    return rootCache;
}
/// <summary>
/// Looks up the output for this input, or <c>null</c> if the
/// input is not accepted.
/// </summary>
// NOTE(review): this method appears truncated in this chunk — the text ends
// mid-statement below; the remainder is not visible here.
public static T Get<T>(FST<T> fst, Int32sRef input) where T : class // LUCENENET specific - added class constraint, since we compare reference equality
{
    // TODO: would be nice not to alloc this on every lookup
    var arc = fst.GetFirstArc(new FST.Arc<T>());
    var fstReader = fst.GetBytesReader();
    // Accumulate output as we go
    T output = fst.Outputs.NoOutput;
    for (int i = 0; i < input.Length; i++)
    {
        // No matching arc for this symbol: input is not accepted.
        if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) is null)
        {
            return (default);
/// <summary>
/// Descend along the path starting at <paramref name="arc"/> and going through bytes
/// in the argument.
/// </summary>
/// <param name="arc">
/// The starting arc. This argument is modified in-place. </param>
/// <param name="utf8">
/// The term to descend along. </param>
/// <returns> If <c>true</c>, <paramref name="arc"/> will be set to the arc
/// matching the last byte of <paramref name="utf8"/>. <c>false</c> is
/// returned if no such prefix exists. </returns>
private bool DescendWithPrefix(FST.Arc<object> arc, BytesRef utf8)
{
    int limit = utf8.Offset + utf8.Length;
    // Cannot save as instance var since multiple threads
    // can use FSTCompletion at once...
    FST.BytesReader reader = automaton.BytesReader;
    int position = utf8.Offset;
    while (position < limit)
    {
        int label = utf8.Bytes[position] & 0xff;
        if (automaton.FindTargetArc(label, arc, arc, reader) == null)
        {
            // No matching prefixes, return an empty result.
            return false;
        }
        position++;
    }
    return true;
}
/// <summary>
/// Seeks to exactly target term. </summary>
/// <returns><c>true</c> when the full target term (through its END_LABEL)
/// was matched; <c>false</c> when some label had no matching arc. In the
/// failure case the enumerator is left positioned on the first arc of the
/// node where matching stopped (via <c>ReadFirstTargetArc</c>).</returns>
protected virtual bool DoSeekExact()
{
    // TODO: possibly caller could/should provide common
    // prefix length? ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton
    //System.out.println("FE: seek exact upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    RewindPrefix();

    //System.out.println("FE: after rewind upto=" + upto);
    FST.Arc<T> arc = GetArc(m_upto - 1);
    int targetLabel = TargetLabel;
    FST.BytesReader fstReader = m_fst.GetBytesReader();
    while (true)
    {
        //System.out.println("  cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel));
        FST.Arc<T> nextArc = m_fst.FindTargetArc(targetLabel, arc, GetArc(m_upto), fstReader);
        if (nextArc == null)
        {
            // short circuit
            //upto--;
            //upto = 0;
            m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), fstReader);
            //System.out.println("  no match upto=" + upto);
            return (false);
        }
        // Match -- recurse:
        // Extend the accumulated output for this depth from the parent's output.
        m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], nextArc.Output);
        if (targetLabel == FST.END_LABEL)
        {
            //System.out.println("  return found; upto=" + upto + " output=" + output[upto] + " nextArc=" + nextArc.isLast());
            return (true);
        }
        // Record the matched label at this depth, descend one level, and
        // continue with the next target label.
        CurrentLabel = targetLabel;
        Incr();
        targetLabel = TargetLabel;
        arc = nextArc;
    }
}
// TODO: this could be more efficient!

/// <summary>
/// Rewrites <paramref name="sb"/> in place: at each position, greedily finds the
/// longest FST match starting there and splices in the mapped output.
/// </summary>
internal static void ApplyMappings(FST<CharsRef> fst, StringBuilder sb)
{
    FST.BytesReader reader = fst.BytesReader;
    FST.Arc<CharsRef> rootArc = fst.GetFirstArc(new FST.Arc<CharsRef>());
    CharsRef noOutput = fst.Outputs.NoOutput;
    // temporary stuff
    FST.Arc<CharsRef> arc = new FST.Arc<CharsRef>();
    for (int start = 0; start < sb.Length; start++)
    {
        arc.CopyFrom(rootArc);
        CharsRef accumulated = noOutput;
        int bestEnd = -1;           // index of the last char of the longest match seen so far
        CharsRef bestOutput = null; // output associated with that longest match
        for (int pos = start; pos < sb.Length; pos++)
        {
            char ch = sb[pos];
            if (fst.FindTargetArc(ch, arc, arc, reader) == null)
            {
                // Dead end — no longer match is possible from 'start'.
                break;
            }
            accumulated = fst.Outputs.Add(accumulated, arc.Output);
            if (arc.IsFinal)
            {
                // Remember this match, but keep scanning for a longer one (greedy).
                bestOutput = fst.Outputs.Add(accumulated, arc.NextFinalOutput);
                bestEnd = pos;
            }
        }
        if (bestEnd >= 0)
        {
            // Splice in the replacement and continue scanning right after it:
            sb.Remove(start, bestEnd + 1 - start);
            sb.Insert(start, bestOutput);
            start += bestOutput.Length - 1;
        }
    }
}
// runs the term, returning the output, or null if term
// isn't accepted. if prefixLength is non-null it must be
// length 1 int array; prefixLength[0] is set to the length
// of the term prefix that matches
private T Run(FST<T> fst, IntsRef term, int[] prefixLength)
{
    Debug.Assert(prefixLength == null || prefixLength.Length == 1);
    FST<T>.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>());
    T noOutput = fst.Outputs.NoOutput;
    T accumulated = noOutput;
    FST.BytesReader reader = fst.BytesReader;
    // Iterate one step past the end so the trailing END_LABEL transition
    // (i == term.Length) is also checked:
    for (int i = 0; i <= term.Length; i++)
    {
        int label = (i == term.Length) ? FST<T>.END_LABEL : term.Ints[term.Offset + i];
        // System.out.println("   loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
        if (fst.FindTargetArc(label, arc, arc, reader) == null)
        {
            // System.out.println("     not found");
            // Term walks off the FST: report the matched prefix if requested.
            if (prefixLength != null)
            {
                prefixLength[0] = i;
                return accumulated;
            }
            return default(T);
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }
    // Entire term (including END_LABEL) matched.
    if (prefixLength != null)
    {
        prefixLength[0] = term.Length;
    }
    return accumulated;
}
/// <summary>
/// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
/// </summary>
public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
{
    BytesRef pending = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);
    int upto = 0;
    while (upto < bufferLen)
    {
        // Surrogate-aware: consume one full code point per FST step.
        int codePoint = Character.CodePointAt(buffer, upto, bufferLen);
        int label = ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint;
        if (fst.FindTargetArc(label, scratchArc, scratchArc, fstReader) == null)
        {
            return null;
        }
        pending = fst.Outputs.Add(pending, scratchArc.Output);
        upto += Character.CharCount(codePoint);
    }
    // Only a final arc at the end of the key constitutes a match:
    return scratchArc.IsFinal ? fst.Outputs.Add(pending, scratchArc.NextFinalOutput) : null;
}
// NOTE: copied from WFSTCompletionLookup & tweaked

/// <summary>
/// Follows the bytes of <paramref name="scratch"/> from the root of
/// <paramref name="fst"/>, combining arc outputs with <c>Outputs.Add</c>.
/// Returns <c>null</c> as soon as any byte has no matching arc.
/// </summary>
private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc)
{
    long? accumulated = fst.Outputs.NoOutput;
    fst.GetFirstArc(arc);
    var data = scratch.Bytes;
    var limit = scratch.Offset + scratch.Length;
    for (var idx = scratch.Offset; idx < limit; idx++)
    {
        if (fst.FindTargetArc(data[idx] & 0xff, arc, arc, bytesReader) == null)
        {
            return null;
        }
        accumulated = fst.Outputs.Add(accumulated, arc.Output);
    }
    return accumulated;
}
/// <summary>
/// Runs a suggest lookup for <paramref name="key"/>: intersects the lookup
/// automaton with the suggest FST, optionally surfacing an exact match first
/// (when <c>exactFirst</c> is set), then fills the remaining slots from a
/// top-N search over all prefix paths. Throws <see cref="ArgumentException"/>
/// for unsupported options (<paramref name="onlyMorePopular"/>=true or
/// non-null <paramref name="contexts"/>) and for reserved characters in the key.
/// </summary>
public override IList<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, bool onlyMorePopular, int num)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(num > 0);
    }
    if (onlyMorePopular)
    {
        throw new ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null)
    {
        return (Collections.EmptyList<LookupResult>());
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    // 0x1E (HOLE) and 0x1F (unit separator) are used internally by the
    // analyzed-form encoding, so the key must not contain them.
    for (var i = 0; i < key.Length; i++)
    {
        if (key[i] == 0x1E)
        {
            throw new ArgumentException(
                "lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key[i] == 0x1F)
        {
            throw new ArgumentException(
                "lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }
    var utf8Key = new BytesRef(key);
    try
    {
        Automaton lookupAutomaton = ToLookupAutomaton(key);

        var spare = new CharsRef();

        //System.out.println("  now intersect exactFirst=" + exactFirst);

        // Intersect automaton w/ suggest wFST and get all
        // prefix starting nodes & their outputs:
        //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

        //System.out.println("  prefixPaths: " + prefixPaths.size());

        FST.BytesReader bytesReader = fst.GetBytesReader();

        var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

        IList<LookupResult> results = new List<LookupResult>();

        IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
            FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

        if (exactFirst)
        {
            // Count how many prefix-path end nodes have an END_BYTE arc —
            // each such node represents an exact analyzed-form match.
            int count = 0;
            foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    count++;
                }
            }

            // Searcher just to find the single exact only
            // match, if present:
            Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher;
            searcher = new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(
                fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparer);

            // NOTE: we could almost get away with only using
            // the first start node.  The only catch is if
            // maxSurfaceFormsPerAnalyzedForm had kicked in and
            // pruned our exact match from one of these nodes
            // ...:
            foreach (var path in prefixPaths)
            {
                if (fst.FindTargetArc(END_BYTE, path.FstNode, scratchArc, bytesReader) != null)
                {
                    // This node has END_BYTE arc leaving, meaning it's an
                    // "exact" match:
                    searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.Output, scratchArc.Output), false, path.Input);
                }
            }

            var completions = searcher.Search();
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(completions.IsComplete);
            }

            // NOTE: this is rather inefficient: we enumerate
            // every matching "exactly the same analyzed form"
            // path, and then do linear scan to see if one of
            // these exactly matches the input.  It should be
            // possible (though hairy) to do something similar
            // to getByOutput, since the surface form is encoded
            // into the FST output, so we more efficiently hone
            // in on the exact surface-form match.  Still, I
            // suspect very little time is spent in this linear
            // seach: it's bounded by how many prefix start
            // nodes we have and the
            // maxSurfaceFormsPerAnalyzedForm:
            foreach (var completion in completions)
            {
                BytesRef output2 = completion.Output.Output2;
                if (SameSurfaceForm(utf8Key, output2))
                {
                    results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                    break;
                }
            }

            if (results.Count == num)
            {
                // That was quick:
                return (results);
            }
        }

        // Second pass: top-N search over all (possibly fuzzy) prefix paths
        // to fill the remaining result slots.
        Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher2;
        searcher2 = new TopNSearcherAnonymousInnerClassHelper(
            this, fst, num - results.Count, num * maxAnalyzedPathsForOneInput, weightComparer, utf8Key, results);

        prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

        foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
        {
            searcher2.AddStartPaths(path.FstNode, path.Output, true, path.Input);
        }

        var completions2 = searcher2.Search();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(completions2.IsComplete);
        }

        foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions2)
        {
            LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

            // TODO: for fuzzy case would be nice to return
            // how many edits were required
            //System.out.println("    result=" + result);

            results.Add(result);

            if (results.Count == num)
            {
                // In the exactFirst=true case the search may
                // produce one extra path
                break;
            }
        }
        return (results);
    }
    catch (IOException bogus)
    {
        throw new Exception(bogus.ToString(), bogus);
    }
}
/// <summary>
/// Returns the next mapped character. Drains any pending replacement first;
/// otherwise greedily matches the longest FST entry starting at the current
/// input position, records offset corrections when the replacement length
/// differs from the matched length, and falls through to the raw input
/// character when nothing matches. Returns -1 at end of input.
/// </summary>
public override int Read()
{
    //System.out.println("\nread");
    while (true)
    {
        if (replacement != null && replacementPointer < replacement.Length)
        {
            //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
            // Still emitting chars from the previous match's replacement:
            return (replacement.Chars[replacement.Offset + replacementPointer++]);
        }

        // TODO: a more efficient approach would be Aho/Corasick's
        // algorithm
        // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
        // or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
        //
        // I think this would be (almost?) equivalent to 1) adding
        // epsilon arcs from all final nodes back to the init
        // node in the FST, 2) adding a .* (skip any char)
        // loop on the initial node, and 3) determinizing
        // that.  Then we would not have to Restart matching
        // at each position.

        int lastMatchLen = -1;
        CharsRef lastMatch = null;

        int firstCH = buffer.Get(inputOff);
        if (firstCH != -1)
        {
            // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
            char key = Convert.ToChar((char)firstCH);
            if (cachedRootArcs.TryGetValue(key, out FST.Arc<CharsRef> arc) && arc != null)
            {
                if (!FST.TargetHasArcs(arc))
                {
                    // Fast pass for single character match:
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(arc.IsFinal);
                    }
                    lastMatchLen = 1;
                    lastMatch = arc.Output;
                }
                else
                {
                    // Walk deeper into the FST, tracking the longest final
                    // state seen so far (greedy longest-match):
                    int lookahead = 0;
                    CharsRef output = arc.Output;
                    while (true)
                    {
                        lookahead++;

                        if (arc.IsFinal)
                        {
                            // Match! (to node is final)
                            lastMatchLen = lookahead;
                            lastMatch = outputs.Add(output, arc.NextFinalOutput);
                            // Greedy: keep searching to see if there's a
                            // longer match...
                        }

                        if (!FST.TargetHasArcs(arc))
                        {
                            break;
                        }

                        int ch = buffer.Get(inputOff + lookahead);
                        if (ch == -1)
                        {
                            break;
                        }
                        if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
                        {
                            // Dead end
                            break;
                        }
                        output = outputs.Add(output, arc.Output);
                    }
                }
            }
        }

        if (lastMatch != null)
        {
            inputOff += lastMatchLen;
            //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);

            int diff = lastMatchLen - lastMatch.Length;

            if (diff != 0)
            {
                // Replacement length differs from matched length: record
                // offset corrections so highlighting maps back correctly.
                int prevCumulativeDiff = LastCumulativeDiff;
                if (diff > 0)
                {
                    // Replacement is shorter than matched input:
                    AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                }
                else
                {
                    // Replacement is longer than matched input: remap
                    // the "extra" chars all back to the same input
                    // offset:
                    int outputStart = inputOff - prevCumulativeDiff;
                    for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                    {
                        AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                    }
                }
            }

            replacement = lastMatch;
            replacementPointer = 0;
        }
        else
        {
            // No match: pass the raw input character through.
            int ret = buffer.Get(inputOff);
            if (ret != -1)
            {
                inputOff++;
                buffer.FreeBefore(inputOff);
            }
            return (ret);
        }
    }
}
/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(a.IsDeterministic);
    }
    IList<Path<T>> queue = new List<Path<T>>();
    List<Path<T>> endNodes = new List<Path<T>>();
    queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (queue.Count != 0)
    {
        // Take the most recently added path (LIFO → depth-first traversal):
        Path<T> path = queue[queue.Count - 1];
        queue.Remove(path);

        if (path.State.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        Int32sRef currentInput = path.Input;
        foreach (Transition t in path.State.GetTransitions())
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                // Single-label transition: direct target lookup.
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                // Label range: walk all real arcs with labels in [min, max].
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label <= max);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label >= min, () => nextArc.Label + " " + min);
                    }
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc == null || label < nextArc.Label, () => "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                    }
                }
            }
        }
    }
    return (endNodes);
}
/// <summary>
/// Scans ahead from the current input position, running tokens through the
/// synonym FST (joined by <c>SynonymMap.WORD_SEPARATOR</c>) and remembering
/// the longest final match seen. On a match, schedules its output via
/// <c>AddOutput</c>; otherwise arranges to skip one input token. Lookahead
/// tokens are captured into <c>futureInputs</c> so nothing is lost.
/// </summary>
private void Parse()
{
    //System.out.println("\nS: parse");

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(inputSkipCount == 0);
    }

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(scratchArc.Output == fst.Outputs.NoOutput);
    }

    int tokenCount = 0;

    while (true)
    {
        // Pull next token's chars:
        char[] buffer;
        int bufferLen;
        //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

        int inputEndOffset = 0;

        if (curNextRead == nextWrite)
        {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished)
            {
                break;
            }
            else
            {
                //System.out.println("  input.incrToken");
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(futureInputs[nextWrite].consumed);
                }
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (m_input.IncrementToken())
                {
                    buffer = termAtt.Buffer;
                    bufferLen = termAtt.Length;
                    PendingInput pendingInput = futureInputs[nextWrite];
                    lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset;
                    lastEndOffset = pendingInput.endOffset = offsetAtt.EndOffset;
                    inputEndOffset = pendingInput.endOffset;
                    //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                    if (nextRead != nextWrite)
                    {
                        Capture();
                    }
                    else
                    {
                        pendingInput.consumed = false;
                    }
                }
                else
                {
                    // No more input tokens
                    //System.out.println("  set end");
                    finished = true;
                    break;
                }
            }
        }
        else
        {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.Chars;
            bufferLen = futureInputs[curNextRead].term.Length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
            //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen)
        {
            int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
            if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint, CultureInfo.InvariantCulture) : codePoint, scratchArc, scratchArc, fstReader) == null)
            {
                //System.out.println("    stop");
                goto byTokenBreak;
            }

            // Accum the output
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
            bufUpto += Character.CharCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.IsFinal)
        {
            matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
            //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
        {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        }
        else
        {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            if (nextRead == nextWrite)
            {
                Capture();
            }
        }

        curNextRead = RollIncr(curNextRead);
    }

byTokenBreak:

    if (nextRead == nextWrite && !finished)
    {
        //System.out.println("  skip write slot=" + nextWrite);
        nextWrite = RollIncr(nextWrite);
    }

    if (matchOutput != null)
    {
        //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
        inputSkipCount = matchInputLength;
        AddOutput(matchOutput, matchInputLength, matchEndOffset);
    }
    else if (nextRead != nextWrite)
    {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(finished);
        }
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
/// <summary>
/// Generates a list of stems for the provided word
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> length </param>
/// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
/// <param name="recursionDepth"> current recursiondepth </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
/// if we are removing a suffix, and it has no continuation requirements, its ok.
/// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
/// this means inner most suffix must also contain circumfix flag. </param>
/// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
{
    // TODO: allow this stuff to be reused by tokenfilter
    JCG.List<CharsRef> stems = new JCG.List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        // Walk the prefixes FST over the leading chars of the word; any
        // final state at position i yields candidate prefixes of length i.
        FST<Int32sRef> fst = dictionary.prefixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        // Unless fullStrip is allowed, never strip the entire word.
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++)
        {
            if (i > 0)
            {
                int ch = word[i - 1];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }
            Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < prefixes.Length; j++)
            {
                int prefix = prefixes.Int32s[prefixes.Offset + j];
                if (prefix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal,
                // condition (low bit = crossProduct), append flag.
                affixReader.Position = 8 * prefix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the prefix removed and the
                    // affix's strip chars restored in front.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);

                    stems.AddRange(stemList);
                }
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        // Walk the suffixes FST over the trailing chars of the word
        // (right to left); a final state at i yields suffixes of length-i.
        FST<Int32sRef> fst = dictionary.suffixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        // Unless fullStrip is allowed, never strip the entire word.
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--)
        {
            if (i < length)
            {
                int ch = word[i];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }
            Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < suffixes.Length; j++)
            {
                int suffix = suffixes.Int32s[suffixes.Offset + j];
                if (suffix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal,
                // condition (low bit = crossProduct), append flag.
                affixReader.Position = 8 * suffix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the suffix removed and the
                    // affix's strip chars restored at the end.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);

                    stems.AddRange(stemList);
                }
            }
        }
    }

    return (stems);
}