/// <summary>
/// Builds a <see cref="StemmerOverrideFilter"/> that performs dictionary-based stemming
/// using the supplied <paramref name="stemmerOverrideMap"/>.
/// <para>
/// Terms that were stemmed from the dictionary are marked with <see cref="KeywordAttribute"/>
/// so that stemmers further down the chain leave them untouched.
/// </para>
/// </summary>
/// <param name="input"> upstream token stream, handed to the base constructor </param>
/// <param name="stemmerOverrideMap"> override dictionary; its bytes reader is cached by this filter </param>
public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
    : base(input)
{
    // Keep the dictionary and the reader it hands out.
    this.stemmerOverrideMap = stemmerOverrideMap;
    fstReader = stemmerOverrideMap.BytesReader;

    // Attributes this filter works with (registration order kept as in the original).
    termAtt = AddAttribute<ICharTermAttribute>();
    keywordAtt = AddAttribute<IKeywordAttribute>();
}
/// <summary>
/// Default constructor that takes a <see cref="TextReader"/>.
/// </summary>
/// <param name="normMap"> normalization map; its <c>map</c> and <c>cachedRootArcs</c> members are copied into this filter </param>
/// <param name="in"> character source; passed to the base constructor and used to reset the internal buffer </param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    buffer.Reset(@in);

    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;

    // A reader is only obtained when the normalization map actually has a map.
    fstReader = map != null ? map.BytesReader : null;
}
/// <summary>
/// Default constructor that takes a <see cref="TextReader"/>.
/// </summary>
/// <param name="normMap"> normalization map; its <c>map</c> and <c>cachedRootArcs</c> members are copied into this filter </param>
/// <param name="in"> character source; wrapped via <c>GetBufferedReader</c> before being buffered </param>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    // LUCENENET support to reset the reader: wrap the input, mark it, and
    // feed the wrapped reader (not the raw one) to the rolling buffer.
    _input = GetBufferedReader(@in);
    _input.Mark(BufferedCharFilter.defaultCharBufferSize);
    buffer.Reset(_input);

    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;

    // A reader is only obtained when the normalization map actually has a map.
    fstReader = map != null ? map.BytesReader : null;
}
/// <summary>
/// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
/// </summary>
/// <param name="buffer"> characters holding the key </param>
/// <param name="bufferLen"> number of leading chars of <paramref name="buffer"/> that make up the key </param>
/// <param name="scratchArc"> arc that is overwritten while walking the FST </param>
/// <param name="fstReader"> reader handed to every arc lookup </param>
/// <returns>
/// the accumulated output when the walk consumes the whole key and ends on a final arc;
/// <c>null</c> when an arc is missing or the last arc is not final
/// </returns>
public BytesRef get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
{
    BytesRef pendingOutput = fst.outputs.NoOutput;
    BytesRef matchOutput = null;
    int bufUpto = 0;
    fst.getFirstArc(scratchArc);
    while (bufUpto < bufferLen)
    {
        // System.Char has no codePointAt/charCount members (those are Java's Character API),
        // so decode the code point by hand: a high surrogate followed, within bufferLen,
        // by a low surrogate forms one supplementary code point; anything else is a single char.
        char lead = buffer[bufUpto];
        int codePoint = lead;
        int charCount = 1;
        if (char.IsHighSurrogate(lead) && bufUpto + 1 < bufferLen && char.IsLowSurrogate(buffer[bufUpto + 1]))
        {
            codePoint = char.ConvertToUtf32(lead, buffer[bufUpto + 1]);
            charCount = 2;
        }

        int label = codePoint;
        if (ignoreCase)
        {
            if (charCount == 1)
            {
                // Single UTF-16 unit (this also covers unpaired surrogates, which
                // char.ConvertFromUtf32 would reject).
                label = char.ToLowerInvariant(lead);
            }
            else
            {
                // Supplementary code point: lower-case through its string form.
                label = char.ConvertToUtf32(char.ConvertFromUtf32(codePoint).ToLowerInvariant(), 0);
            }
        }

        if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null)
        {
            return null;
        }
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        bufUpto += charCount;
    }

    if (scratchArc.Final)
    {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
    }
    return matchOutput;
}
/// <summary>
/// Captures the state this helper operates on. Each argument is stored,
/// unchanged, in the member of the same name.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(
    FSTEntry entry,
    BinaryDocValues docToOrds,
    FST<long?> fst,
    FST.BytesReader @in,
    FST.Arc<long?> firstArc,
    FST.Arc<long?> scratchArc,
    Int32sRef scratchInts,
    BytesRefFSTEnum<long?> fstEnum,
    BytesRef @ref,
    ByteArrayDataInput input)
{
    // Entry and per-document ordinal source.
    this.entry = entry;
    this.docToOrds = docToOrds;

    // FST plus the reader/arcs/enum used to traverse it.
    this.fst = fst;
    this.@in = @in;
    this.firstArc = firstArc;
    this.scratchArc = scratchArc;
    this.fstEnum = fstEnum;

    // Reusable scratch objects.
    this.scratchInts = scratchInts;
    this.@ref = @ref;
    this.input = input;
}
// TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
// SEEK_END)?  saves the eq check above?

/// <summary>
/// Seeks to largest term that's &lt;= target.
/// </summary>
protected virtual void DoSeekFloor()
{
    // TODO: possibly caller could/should provide common prefix length?  ie this
    // work may be redundant if caller is in fact intersecting against its own automaton

    // Save CPU by starting at the end of the shared prefix
    // b/w our current term & the target:
    RewindPrefix();

    FST.Arc<T> arc = GetArc(m_upto);
    int targetLabel = TargetLabel;

    // Now scan forward, matching the new suffix of the target
    while (true)
    {
        bool arcsAreFixedArray = arc.BytesPerArc != 0 && arc.Label != FST.END_LABEL;
        if (arcsAreFixedArray)
        {
            // Arcs are fixed array -- use binary search to find the target.
            FST.BytesReader arcReader = m_fst.GetBytesReader();
            int lo = arc.ArcIdx;
            int hi = arc.NumArcs - 1;
            int mid = 0;
            bool matched = false;
            while (lo <= hi)
            {
                mid = (int)((uint)(lo + hi) >> 1);
                arcReader.Position = arc.PosArcsStart;
                arcReader.SkipBytes(arc.BytesPerArc * mid + 1);
                int cmp = m_fst.ReadLabel(arcReader) - targetLabel;
                if (cmp == 0)
                {
                    matched = true;
                    break;
                }
                if (cmp < 0)
                {
                    lo = mid + 1;
                }
                else
                {
                    hi = mid - 1;
                }
            }

            // NOTE: the three outcomes below mirror the linear-scan handling further down.
            if (matched)
            {
                // Match -- recurse
                arc.ArcIdx = mid - 1;
                m_fst.ReadNextRealArc(arc, arcReader);
                Debug.Assert(arc.ArcIdx == mid);
                Debug.Assert(arc.Label == targetLabel, "arc.label=" + arc.Label + " vs targetLabel=" + targetLabel + " mid=" + mid);
                m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                if (targetLabel == FST.END_LABEL)
                {
                    return;
                }
                CurrentLabel = arc.Label;
                Incr();
                arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                targetLabel = TargetLabel;
                continue;
            }

            if (hi == -1)
            {
                // Very first arc is after our target
                // TODO: if each arc could somehow read the arc just before, we can
                // save this re-scan.  The ceil case doesn't need this because it
                // reads the next arc instead:
                while (true)
                {
                    // First, walk backwards until we find a first arc
                    // that's before our target label:
                    m_fst.ReadFirstTargetArc(GetArc(m_upto - 1), arc, m_fstReader);
                    if (arc.Label < targetLabel)
                    {
                        // Then, scan forwards to the arc just before the targetLabel:
                        while (!arc.IsLast && m_fst.ReadNextArcLabel(arc, arcReader) < targetLabel)
                        {
                            m_fst.ReadNextArc(arc, m_fstReader);
                        }
                        PushLast();
                        return;
                    }

                    m_upto--;
                    if (m_upto == 0)
                    {
                        return;
                    }
                    targetLabel = TargetLabel;
                    arc = GetArc(m_upto);
                }
            }

            // There is a floor arc: position on the arc at index min(lo, hi).
            arc.ArcIdx = (hi < lo ? hi : lo) - 1;
            m_fst.ReadNextRealArc(arc, arcReader);

            // LUCENENET specific: the ReadNextArcLabel call is kept outside Debug.Assert
            // so that it is not removed when Debug.Assert is stripped out by the compiler.
            bool check = arc.IsLast || m_fst.ReadNextArcLabel(arc, arcReader) > targetLabel;
            Debug.Assert(check);
            Debug.Assert(arc.Label < targetLabel, "arc.label=" + arc.Label + " vs targetLabel=" + targetLabel);
            PushLast();
            return;
        }

        // Arcs are not a fixed array -- compare against the current arc and scan linearly.
        if (arc.Label == targetLabel)
        {
            // Match -- recurse
            m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
            if (targetLabel == FST.END_LABEL)
            {
                return;
            }
            CurrentLabel = arc.Label;
            Incr();
            arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
            targetLabel = TargetLabel;
            continue;
        }

        if (arc.Label > targetLabel)
        {
            // TODO: if each arc could somehow read the arc just before, we can
            // save this re-scan.  The ceil case doesn't need this because it
            // reads the next arc instead:
            while (true)
            {
                // First, walk backwards until we find a first arc
                // that's before our target label:
                m_fst.ReadFirstTargetArc(GetArc(m_upto - 1), arc, m_fstReader);
                if (arc.Label < targetLabel)
                {
                    // Then, scan forwards to the arc just before the targetLabel:
                    while (!arc.IsLast && m_fst.ReadNextArcLabel(arc, m_fstReader) < targetLabel)
                    {
                        m_fst.ReadNextArc(arc, m_fstReader);
                    }
                    PushLast();
                    return;
                }

                m_upto--;
                if (m_upto == 0)
                {
                    return;
                }
                targetLabel = TargetLabel;
                arc = GetArc(m_upto);
            }
        }

        // Current arc is below the target: stop here when it is the last arc or the
        // next arc would overshoot the target (the label is only peeked when not last).
        if (arc.IsLast || m_fst.ReadNextArcLabel(arc, m_fstReader) > targetLabel)
        {
            PushLast();
            return;
        }

        // keep scanning
        m_fst.ReadNextArc(arc, m_fstReader);
    }
}
/// <summary>
/// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
/// </summary>
/// <param name="buffer"> characters holding the key </param>
/// <param name="bufferLen"> limit passed to the code point decoder; the walk stops once this many chars are consumed </param>
/// <param name="scratchArc"> arc that is overwritten while walking the FST </param>
/// <param name="fstReader"> reader handed to every arc lookup </param>
/// <returns> the accumulated output if the walk ends on a final arc; otherwise <c>null</c> </returns>
public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
{
    BytesRef accumulated = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);

    int offset = 0;
    while (offset < bufferLen)
    {
        int codePoint = Character.CodePointAt(buffer, offset, bufferLen);

        // When ignoring case, the FST is probed with the lower-cased code point.
        int label = ignoreCase
            ? Character.ToLower(codePoint, CultureInfo.InvariantCulture)
            : codePoint;

        if (fst.FindTargetArc(label, scratchArc, scratchArc, fstReader) == null)
        {
            // No arc for this code point: the key is not in the dictionary.
            return null;
        }

        accumulated = fst.Outputs.Add(accumulated, scratchArc.Output);
        offset += Character.CharCount(codePoint);
    }

    // Only a final arc yields a value; fold in its final output.
    return scratchArc.IsFinal
        ? fst.Outputs.Add(accumulated, scratchArc.NextFinalOutput)
        : null;
}
/// <summary>
/// Captures the state this helper operates on; every argument is stored unchanged.
/// </summary>
public SortedDocValuesAnonymousClass(
    FSTEntry fstEntry,
    NumericDocValues numericDocValues,
    FST<Int64> fst1,
    FST.BytesReader @in,
    FST.Arc<Int64> arc,
    FST.Arc<Int64> scratchArc1,
    Int32sRef intsRef,
    BytesRefFSTEnum<Int64> bytesRefFstEnum)
{
    // Entry and per-document ordinal source.
    entry = fstEntry;
    docToOrd = numericDocValues;

    // FST plus the reader/arcs/enum used to traverse it.
    fst = fst1;
    this.@in = @in;
    firstArc = arc;
    scratchArc = scratchArc1;
    fstEnum = bytesRefFstEnum;

    // Reusable scratch ints.
    scratchInts = intsRef;
}
/// <summary>
/// Creates a synonym filter over <paramref name="input"/> backed by the FST of <paramref name="synonyms"/>.
/// </summary>
/// <param name="input"> input tokenstream </param>
/// <param name="synonyms"> synonym map </param>
/// <param name="ignoreCase"> case-folds input for matching with <c>Character#toLowerCase(int)</c>.
///                   Note, if you set this to true, it's your responsibility to lowercase
///                   the input entries when you create the <see cref="SynonymMap"/> </param>
/// <exception cref="System.ArgumentException"> if the synonym map's <c>fst</c> is <c>null</c> </exception>
public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
    : base(input)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;

    fst = synonyms.fst;
    if (fst == null)
    {
        throw new System.ArgumentException("fst must be non-null");
    }
    fstReader = fst.BytesReader;

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1 + synonyms.maxHorizontalContext;

    // Pre-populate every slot of both rolling buffers.
    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for (int slot = 0; slot < rollBufferSize; slot++)
    {
        futureInputs[slot] = new PendingInput();
        futureOutputs[slot] = new PendingOutputs();
    }

    scratchArc = new FST.Arc<BytesRef>();
}
/// <summary>
/// Finds the arc labelled <paramref name="ch"/>, answering from <c>rootCache</c> when
/// <paramref name="useCache"/> is set and the label lies in <c>[0x3040, cacheCeiling]</c>,
/// and delegating to the FST otherwise.
/// </summary>
/// <param name="ch"> label to look for </param>
/// <param name="follow"> arc to follow; only used when the lookup is delegated to the FST </param>
/// <param name="arc"> arc instance that receives the result </param>
/// <param name="useCache"> whether the root cache may be consulted </param>
/// <param name="fstReader"> reader passed to the FST when the lookup is delegated </param>
/// <returns>
/// on a cache hit, <paramref name="arc"/> after copying the cached arc into it; <c>null</c> when the
/// cache slot is empty; otherwise whatever the FST lookup returns
/// </returns>
public FST.Arc<long?> FindTargetArc(int ch, FST.Arc<long?> follow, FST.Arc<long?> arc, bool useCache, FST.BytesReader fstReader)
{
    bool servedFromCache = useCache && ch >= 0x3040 && ch <= cacheCeiling;
    if (!servedFromCache)
    {
        return fst.FindTargetArc(ch, follow, arc, fstReader);
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(ch != FST.END_LABEL);
    }

    // Cache slots are indexed relative to 0x3040.
    FST.Arc<long?> cached = rootCache[ch - 0x3040];
    if (cached == null)
    {
        return null;
    }

    arc.CopyFrom(cached);
    return arc;
}
/// <summary>
/// Captures the state this helper operates on; every argument is stored unchanged
/// in the corresponding member.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(
    FSTEntry entry,
    BinaryDocValues docToOrds,
    FST<long?> fst,
    FST.BytesReader @in,
    FST.Arc<long?> firstArc,
    FST.Arc<long?> scratchArc,
    IntsRef scratchInts,
    BytesRefFSTEnum<long?> fstEnum,
    BytesRef @ref,
    ByteArrayDataInput input)
{
    // Entry and per-document ordinal source.
    this.Entry = entry;
    this.DocToOrds = docToOrds;

    // FST plus the reader/arcs/enum used to traverse it.
    this.Fst = fst;
    this.@in = @in;
    this.FirstArc = firstArc;
    this.ScratchArc = scratchArc;
    this.FstEnum = fstEnum;

    // Reusable scratch objects.
    this.ScratchInts = scratchInts;
    this.@ref = @ref;
    this.Input = input;
}
/// <summary>
/// Captures the state this helper operates on; every argument is stored unchanged
/// in the corresponding member.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(
    FSTEntry entry,
    NumericDocValues docToOrd,
    FST<long?> fst,
    FST.BytesReader @in,
    FST.Arc<long?> firstArc,
    FST.Arc<long?> scratchArc,
    IntsRef scratchInts,
    BytesRefFSTEnum<long?> fstEnum)
{
    // Entry and per-document ordinal source.
    this.Entry = entry;
    this.DocToOrd = docToOrd;

    // FST plus the reader/arcs/enum used to traverse it.
    this.Fst = fst;
    this.@in = @in;
    this.FirstArc = firstArc;
    this.ScratchArc = scratchArc;
    this.FstEnum = fstEnum;

    // Reusable scratch ints.
    this.ScratchInts = scratchInts;
}
/// <summary>
/// Creates a terms enum over <paramref name="fst"/>: keeps the FST, wraps it in a
/// <see cref="BytesRefFSTEnum{T}"/> and caches the FST's bytes reader.
/// </summary>
/// <param name="fst"> FST to enumerate </param>
internal FSTTermsEnum(FST<long?> fst)
{
    this.Fst = fst;

    // Enumerator used to walk the FST.
    @in = new BytesRefFSTEnum<long?>(fst);

    // Reader taken from the same FST.
    BytesReader = fst.BytesReader;
}
/// <summary>
/// Creates a terms enum for the given field reader, starting out on the static frame.
/// </summary>
/// <param name="outerInstance"> owning field reader; its <c>Index</c> may be <c>null</c> </param>
public SegmentTermsEnum(BlockTreeTermsReader.FieldReader outerInstance)
{
    this.OuterInstance = outerInstance;

    Stack = new Frame[0];

    // Used to hold seek by TermState, or cached seek
    StaticFrame = new Frame(this, -1);

    // Without an index there is nothing to read arcs from.
    FstReader = outerInstance.Index == null ? null : OuterInstance.Index.BytesReader;

    // Fill every arc slot with a fresh arc.
    for (int slot = 0; slot < Arcs.Length; slot++)
    {
        Arcs[slot] = new FST.Arc<BytesRef>();
    }

    // Init w/ root block; don't use index since it may
    // not (and need not) have been loaded
    CurrentFrame = StaticFrame;

    if (outerInstance.Index != null)
    {
        FST.Arc<BytesRef> rootArc = outerInstance.Index.GetFirstArc(Arcs[0]);

        // Empty string prefix must have an output in the index!
        Debug.Assert(rootArc.IsFinal);
    }

    // Second assignment retained exactly as in the original port.
    CurrentFrame = StaticFrame;

    ValidIndexPrefix = 0;
}
// TODO: in some cases we can filter by length?  eg
// regexp foo*bar must be at least length 6 bytes

/// <summary>
/// Creates an enum that intersects the field's terms with <paramref name="compiled"/>,
/// loads the root block into the first frame and, when <paramref name="startTerm"/>
/// is given, seeks to it.
/// </summary>
/// <param name="outerInstance"> owning field reader </param>
/// <param name="compiled"> compiled automaton; its run automaton drives the frame states </param>
/// <param name="startTerm"> optional term to seek to after initialization; may be <c>null</c> </param>
public IntersectEnum(BlockTreeTermsReader.FieldReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
{
    this.OuterInstance = outerInstance;

    runAutomaton = compiled.RunAutomaton;
    CompiledAutomaton = compiled;

    // Work on a private clone of the terms reader's input.
    // (The original text here was mangled into an e-mail placeholder.)
    @in = (IndexInput)outerInstance.OuterInstance.@in.Clone();

    Stack = new Frame[5];
    for (int idx = 0; idx < Stack.Length; idx++)
    {
        Stack[idx] = new Frame(this, idx);
    }
    for (int arcIdx = 0; arcIdx < Arcs.Length; arcIdx++)
    {
        Arcs[arcIdx] = new FST.Arc<BytesRef>();
    }

    if (outerInstance.Index == null)
    {
        FstReader = null;
    }
    else
    {
        FstReader = outerInstance.Index.BytesReader;
    }

    // TODO: if the automaton is "smallish" we really
    // should use the terms index to seek at least to
    // the initial term and likely to subsequent terms
    // (or, maybe just fallback to ATE for such cases).
    // Else the seek cost of loading the frames will be
    // too costly.

    // NOTE(review): Index is dereferenced unconditionally here even though the branch
    // above tolerates a null Index -- confirm callers never reach this with a null index.
    FST.Arc<BytesRef> arc = outerInstance.Index.GetFirstArc(Arcs[0]);

    // Empty string prefix must have an output in the index!
    Debug.Assert(arc.IsFinal);

    // Special pushFrame since it's the first one:
    Frame f = Stack[0];
    f.Fp = f.FpOrig = outerInstance.RootBlockFP;
    f.Prefix = 0;
    f.State = runAutomaton.InitialState;
    f.Arc = arc;
    f.OutputPrefix = arc.Output;
    f.Load(outerInstance.RootCode);

    // for assert: the call lives inside Debug.Assert, so it only runs when asserts are compiled in.
    Debug.Assert(SetSavedStartTerm(startTerm));

    CurrentFrame = f;
    if (startTerm != null)
    {
        SeekToStartTerm(startTerm);
    }
}
/// <summary>
/// Looks up at most <paramref name="num"/> suggestions for <paramref name="key"/>.
/// When <c>exactFirst</c> is set, a surface form identical to the key (if any) is placed first.
/// </summary>
/// <param name="key"> lookup key; must not contain U+001E or U+001F, which are reserved </param>
/// <param name="contexts"> must be <c>null</c>; contexts are not supported </param>
/// <param name="onlyMorePopular"> must be <c>false</c> </param>
/// <param name="num"> maximum number of results; asserted to be positive </param>
/// <returns> the results found, or an empty list when no FST has been built </returns>
/// <exception cref="System.ArgumentException">
/// if <paramref name="onlyMorePopular"/> is <c>true</c>, <paramref name="contexts"/> is non-null,
/// or <paramref name="key"/> contains a reserved character
/// </exception>
public override IList<LookupResult> DoLookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
{
    Debug.Assert(num > 0);
    if (onlyMorePopular)
    {
        throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null)
    {
        return Collections.EmptyList<LookupResult>();
    }

    for (var i = 0; i < key.Length; i++)
    {
        if (key[i] == 0x1E)
        {
            throw new ArgumentException(
                "lookup key cannot contain HOLE character U+001E; this character is reserved");
        }
        if (key[i] == 0x1F)
        {
            throw new ArgumentException(
                "lookup key cannot contain unit separator character U+001F; this character is reserved");
        }
    }

    var utf8Key = new BytesRef(key);

    // The original wrapped everything below in try { ... } catch (IOException bogus) { throw; },
    // which is a no-op in C#; exceptions propagate unchanged without it.
    Automaton lookupAutomaton = ToLookupAutomaton(key);

    var spare = new CharsRef();

    // Intersect automaton w/ suggest wFST and get all
    // prefix starting nodes & their outputs:
    FST.BytesReader bytesReader = fst.BytesReader;

    var scratchArc = new FST.Arc<PairOutputs<long?, BytesRef>.Pair>();

    IList<LookupResult> results = new List<LookupResult>();

    IList<FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair>> prefixPaths =
        FSTUtil.IntersectPrefixPaths(ConvertAutomaton(lookupAutomaton), fst);

    if (exactFirst)
    {
        int count = 0;
        foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
        {
            if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
            {
                // This node has END_BYTE arc leaving, meaning it's an
                // "exact" match:
                count++;
            }
        }

        // Searcher just to find the single exact only match, if present.
        // Named distinctly from the main searcher declared further down: C# rejects
        // reusing a local name in a nested scope of the same method (CS0136).
        Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> exactSearcher =
            new Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair>(
                fst,
                count * maxSurfaceFormsPerAnalyzedForm,
                count * maxSurfaceFormsPerAnalyzedForm,
                weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node.  The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        foreach (var path in prefixPaths)
        {
            if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
            {
                // This node has END_BYTE arc leaving, meaning it's an
                // "exact" match:
                exactSearcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.output, scratchArc.Output), false, path.input);
            }
        }

        var exactCompletions = exactSearcher.Search();
        Debug.Assert(exactCompletions.IsComplete);

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do linear scan to see if one of
        // these exactly matches the input.  It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we more efficiently hone
        // in on the exact surface-form match.  Still, I
        // suspect very little time is spent in this linear
        // search: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        foreach (var completion in exactCompletions)
        {
            BytesRef output2 = completion.Output.Output2;
            if (SameSurfaceForm(utf8Key, output2))
            {
                results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
                break;
            }
        }

        if (results.Count == num)
        {
            // That was quick:
            return results;
        }
    }

    Util.Fst.Util.TopNSearcher<PairOutputs<long?, BytesRef>.Pair> searcher =
        new TopNSearcherAnonymousInnerClassHelper(
            this, fst, num - results.Count, num * maxAnalyzedPathsForOneInput, weightComparator, utf8Key, results);

    prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

    foreach (FSTUtil.Path<PairOutputs<long?, BytesRef>.Pair> path in prefixPaths)
    {
        searcher.AddStartPaths(path.fstNode, path.output, true, path.input);
    }

    var completions = searcher.Search();
    Debug.Assert(completions.IsComplete);

    foreach (Util.Fst.Util.Result<PairOutputs<long?, BytesRef>.Pair> completion in completions)
    {
        LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);

        // TODO: for fuzzy case would be nice to return
        // how many edits were required

        results.Add(result);

        if (results.Count == num)
        {
            // In the exactFirst=true case the search may
            // produce one extra path
            break;
        }
    }

    return results;
}
/// <summary>
/// Captures the state this helper operates on; every argument is stored unchanged.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(
    FSTEntry fstEntry,
    NumericDocValues numericDocValues,
    FST<long?> fst1,
    FST.BytesReader @in,
    FST.Arc<long?> arc,
    FST.Arc<long?> scratchArc1,
    IntsRef intsRef,
    BytesRefFSTEnum<long?> bytesRefFstEnum)
{
    // Entry and per-document ordinal source.
    entry = fstEntry;
    docToOrd = numericDocValues;

    // FST plus the reader/arcs/enum used to traverse it.
    fst = fst1;
    this.@in = @in;
    firstArc = arc;
    scratchArc = scratchArc1;
    fstEnum = bytesRefFstEnum;

    // Reusable scratch ints.
    scratchInts = intsRef;
}
/// <summary>
/// Captures the state this helper operates on; every argument is stored unchanged.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(
    FSTEntry fstEntry,
    BinaryDocValues binaryDocValues,
    FST<long?> fst1,
    FST.BytesReader @in,
    FST.Arc<long?> arc,
    FST.Arc<long?> scratchArc1,
    IntsRef intsRef,
    BytesRefFSTEnum<long?> bytesRefFstEnum,
    BytesRef @ref,
    ByteArrayDataInput byteArrayDataInput)
{
    // Entry and per-document ordinal source.
    entry = fstEntry;
    docToOrds = binaryDocValues;

    // FST plus the reader/arcs/enum used to traverse it.
    fst = fst1;
    this.@in = @in;
    firstArc = arc;
    scratchArc = scratchArc1;
    fstEnum = bytesRefFstEnum;

    // Reusable scratch objects.
    scratchInts = intsRef;
    this.@ref = @ref;
    input = byteArrayDataInput;
}
/// <summary>
/// Creates a terms enum over <paramref name="fst"/>: keeps the FST, wraps it in a
/// <see cref="BytesRefFSTEnum{T}"/> and caches the FST's bytes reader.
/// </summary>
/// <param name="fst"> FST to enumerate </param>
internal FSTTermsEnum(FST<long?> fst)
{
    this.fst = fst;

    // Enumerator used to walk the FST.
    input = new BytesRefFSTEnum<long?>(fst);

    // Reader taken from the same FST.
    bytesReader = fst.BytesReader;
}
/// <summary>
/// Checks that <paramref name="bytes"/> holds exactly the first <paramref name="totalLength"/>
/// bytes of <paramref name="expected"/>: first with one bulk read (forward or reversed, chosen
/// at random), then with a random series of positioned reads, skips and int reads.
/// </summary>
/// <param name="bytes"> store under test </param>
/// <param name="expected"> reference content </param>
/// <param name="totalLength"> number of bytes the store is expected to hold </param>
private void Verify(BytesStore bytes, byte[] expected, int totalLength)
{
    Assert.AreEqual(totalLength, bytes.Position);
    if (totalLength == 0)
    {
        return;
    }
    if (VERBOSE)
    {
        Console.WriteLine(" verify...");
    }

    // First verify whole thing in one blast:
    byte[] actual = new byte[totalLength];
    if (Random.NextBoolean())
    {
        if (VERBOSE)
        {
            Console.WriteLine(" bulk: reversed");
        }

        // Read backwards from the last byte, then flip the result into forward order.
        FST.BytesReader reverseReader = bytes.GetReverseReader();
        Assert.IsTrue(reverseReader.IsReversed);
        reverseReader.Position = totalLength - 1;
        reverseReader.ReadBytes(actual, 0, actual.Length);
        Array.Reverse(actual);
    }
    else
    {
        if (VERBOSE)
        {
            Console.WriteLine(" bulk: forward");
        }

        FST.BytesReader forwardReader = bytes.GetForwardReader();
        Assert.IsFalse(forwardReader.IsReversed);
        forwardReader.ReadBytes(actual, 0, actual.Length);
    }

    for (int idx = 0; idx < totalLength; idx++)
    {
        assertEquals("byte @ index=" + idx, expected[idx], actual[idx]);
    }

    // Then verify ops:
    bool reversed = Random.NextBoolean();
    if (VERBOSE)
    {
        Console.WriteLine(reversed ? " ops: reversed" : " ops: forward");
    }
    FST.BytesReader r = reversed ? bytes.GetReverseReader() : bytes.GetForwardReader();

    if (totalLength <= 1)
    {
        return;
    }

    int numOps = TestUtil.NextInt32(Random, 100, 200);
    for (int op = 0; op < numOps; op++)
    {
        int numBytes = Random.Next(Math.Min(1000, totalLength - 1));
        int pos = reversed
            ? TestUtil.NextInt32(Random, numBytes, totalLength - 1)
            : Random.Next(totalLength - numBytes);
        if (VERBOSE)
        {
            Console.WriteLine(" op iter=" + op + " reversed=" + reversed + " numBytes=" + numBytes + " pos=" + pos);
        }

        byte[] temp = new byte[numBytes];
        r.Position = pos;
        Assert.AreEqual(pos, r.Position);
        r.ReadBytes(temp, 0, temp.Length);
        for (int i = 0; i < numBytes; i++)
        {
            // A reversed reader walks towards index 0, a forward reader away from it.
            byte expectedByte = expected[reversed ? pos - i : pos + i];
            assertEquals("byte @ index=" + i, expectedByte, temp[i]);
        }

        // Where the reader should now be, and how many bytes remain in its direction.
        int expectedPos = reversed ? pos - numBytes : pos + numBytes;
        int left = reversed ? (int)r.Position : (int)(totalLength - r.Position);
        Assert.AreEqual(expectedPos, r.Position);

        if (left > 4)
        {
            int skipBytes = Random.Next(left - 4);

            // Assemble the expected int, most significant byte first, in read direction.
            int step = reversed ? -1 : 1;
            expectedPos += step * skipBytes;
            int expectedInt = 0;
            for (int shift = 24; shift >= 0; shift -= 8)
            {
                expectedInt |= (expected[expectedPos] & 0xFF) << shift;
                expectedPos += step;
            }

            if (VERBOSE)
            {
                Console.WriteLine(" skip numBytes=" + skipBytes);
                Console.WriteLine(" readInt");
            }

            r.SkipBytes(skipBytes);
            Assert.AreEqual(expectedInt, r.ReadInt32());
        }
    }
}
/// <summary>
/// Creates a terms enum over <paramref name="fst"/>: keeps the FST, wraps it in a
/// <see cref="BytesRefFSTEnum{T}"/> and caches the FST's bytes reader.
/// </summary>
/// <param name="fst"> FST to enumerate </param>
internal FSTTermsEnum(FST<long?> fst)
{
    this.fst = fst;

    // Enumerator used to walk the FST.
    input = new BytesRefFSTEnum<long?>(fst);

    // Reader taken from the same FST.
    bytesReader = fst.BytesReader;
}
/// <summary>
/// Creates a terms enum over <paramref name="fst"/>: keeps the FST, wraps it in a
/// <see cref="BytesRefFSTEnum{T}"/> and obtains a bytes reader from the FST.
/// </summary>
/// <param name="fst"> FST to enumerate </param>
internal FSTTermsEnum(FST<Int64> fst)
{
    this.fst = fst;

    // Enumerator used to walk the FST.
    input = new BytesRefFSTEnum<Int64>(fst);

    // Reader obtained from the same FST.
    bytesReader = fst.GetBytesReader();
}
/// <summary>
/// Generates a list of stems for the provided word
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> length </param>
/// <param name="previous"> previous affix that was removed (so we don't remove the same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the innermost removed prefix, so that when removing a suffix, it is also checked against the word </param>
/// <param name="recursionDepth"> current recursion depth </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
///        if we are removing a suffix, and it has no continuation requirements, it's ok.
///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix;
///        this means the innermost suffix must also contain the circumfix flag. </param>
/// <param name="caseVariant"> true if we are searching for a case variant. If the word has the KEEPCASE flag it cannot succeed. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
{
    // TODO: allow this stuff to be reused by tokenfilter
    JCG.List<CharsRef> stems = new JCG.List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        FST<Int32sRef> fst = dictionary.prefixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++)
        {
            // Iteration i has consumed the first i chars of the word as a candidate prefix.
            if (i > 0)
            {
                int ch = word[i - 1];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) is null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }

            Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < prefixes.Length; j++)
            {
                int prefix = prefixes.Int32s[prefixes.Offset + j];
                if (prefix == previous)
                {
                    continue;
                }

                // Each affix record is read as four consecutive 16-bit values starting at 8 * id.
                affixReader.Position = 8 * prefix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff); // not used here; read to advance the reader
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1; // low bit carries the cross-product flag
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                    {
                        continue;
                    }

                    // Rebuild the candidate: strip characters followed by the word minus its prefix.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);

                    stems.AddRange(stemList);
                }
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        FST<Int32sRef> fst = dictionary.suffixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--)
        {
            // Iteration i has consumed word[i..length) (read back to front) as a candidate suffix.
            if (i < length)
            {
                int ch = word[i];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) is null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }

            Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < suffixes.Length; j++)
            {
                int suffix = suffixes.Int32s[suffixes.Offset + j];
                if (suffix == previous)
                {
                    continue;
                }

                // Each affix record is read as four consecutive 16-bit values starting at 8 * id.
                affixReader.Position = 8 * suffix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff); // not used here; read to advance the reader
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1; // low bit carries the cross-product flag
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    // The only-in-compound restriction must gate the result here too, exactly as in
                    // the prefix branch; previously 'allowed' was computed but never consulted.
                    compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                    {
                        continue;
                    }

                    // Rebuild the candidate: the word minus its suffix followed by the strip characters.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);

                    stems.AddRange(stemList);
                }
            }
        }
    }

    return stems;
}
// TODO: should we return a status here (SEEK_FOUND / SEEK_NOT_FOUND /
// SEEK_END)?  saves the eq check above?

/// <summary>
/// Seeks to smallest term that's &gt;= target.
/// </summary>
protected virtual void DoSeekCeil()
{
    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    RewindPrefix();

    FST.Arc<T> arc = GetArc(m_upto);
    int targetLabel = TargetLabel;

    // Now scan forward, matching the new suffix of the target
    while (true)
    {
        bool arcsAreFixedArray = arc.BytesPerArc != 0 && arc.Label != -1;

        if (!arcsAreFixedArray)
        {
            // Arcs are not array'd -- must do linear scan:
            if (arc.Label == targetLabel)
            {
                // Match: record the accumulated output and descend
                m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
                if (targetLabel == FST.END_LABEL)
                {
                    return;
                }
                CurrentLabel = arc.Label;
                Incr();
                arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
                targetLabel = TargetLabel;
            }
            else if (arc.Label > targetLabel)
            {
                PushFirst();
                return;
            }
            else if (arc.IsLast)
            {
                // Dead end (target is after the last arc);
                // rollback to last fork then push
                RollbackToLastForkAndPush();
                return;
            }
            else
            {
                // keep scanning
                m_fst.ReadNextArc(arc, m_fstReader);
            }
            continue;
        }

        // Arcs are fixed array -- use binary search to find
        // the target.
        FST.BytesReader @in = m_fst.GetBytesReader();
        int lo = arc.ArcIdx;
        int hi = arc.NumArcs - 1;
        int probe = 0;
        bool matched = false;
        while (lo <= hi)
        {
            probe = (int)((uint)(lo + hi) >> 1);
            @in.Position = arc.PosArcsStart;
            @in.SkipBytes(arc.BytesPerArc * probe + 1);
            int probeLabel = m_fst.ReadLabel(@in);
            int cmp = probeLabel - targetLabel;
            if (cmp == 0)
            {
                matched = true;
                break;
            }
            if (cmp < 0)
            {
                lo = probe + 1;
            }
            else
            {
                hi = probe - 1;
            }
        }

        // NOTE: this code is dup'd w/ the linear-scan code above
        if (matched)
        {
            // Match: position just before the hit so the read lands on it
            arc.ArcIdx = probe - 1;
            m_fst.ReadNextRealArc(arc, @in);
            Debug.Assert(arc.ArcIdx == probe);
            Debug.Assert(arc.Label == targetLabel, "arc.label=" + arc.Label + " vs targetLabel=" + targetLabel + " mid=" + probe);
            m_output[m_upto] = m_fst.Outputs.Add(m_output[m_upto - 1], arc.Output);
            if (targetLabel == FST.END_LABEL)
            {
                return;
            }
            CurrentLabel = arc.Label;
            Incr();
            arc = m_fst.ReadFirstTargetArc(arc, GetArc(m_upto), m_fstReader);
            targetLabel = TargetLabel;
            continue;
        }

        if (lo == arc.NumArcs)
        {
            // Dead end: load the last arc of this node ...
            arc.ArcIdx = arc.NumArcs - 2;
            m_fst.ReadNextRealArc(arc, @in);
            Debug.Assert(arc.IsLast);
            // ... (target is after the last arc);
            // rollback to last fork then push
            RollbackToLastForkAndPush();
            return;
        }

        // Target falls between two arcs: load the first arc above it
        arc.ArcIdx = (lo > hi ? lo : hi) - 1;
        m_fst.ReadNextRealArc(arc, @in);
        Debug.Assert(arc.Label > targetLabel);
        PushFirst();
        return;
    }
}

/// <summary>
/// Decrements <c>m_upto</c> until it reaches an arc that is not the last one of
/// its node, advances that arc and calls <c>PushFirst()</c>; returns without
/// pushing if <c>m_upto</c> reaches 0 first.
/// </summary>
private void RollbackToLastForkAndPush()
{
    for (m_upto--; m_upto != 0; m_upto--)
    {
        FST.Arc<T> forkArc = GetArc(m_upto);
        if (!forkArc.IsLast)
        {
            m_fst.ReadNextArc(forkArc, m_fstReader);
            PushFirst();
            return;
        }
    }
}
// Uncomment for debugging:

/*
 * public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException {
 *   Writer w = new OutputStreamWriter(new FileOutputStream(filePath));
 *   toDot(fst, w, true, true);
 *   w.Dispose();
 * }
 */

/// <summary>
/// Reads the first arc greater than or equal to the given label into the provided
/// arc in place and returns it iff found, otherwise returns <c>null</c>.
/// </summary>
/// <param name="label"> the label to ceil on </param>
/// <param name="fst"> the fst to operate on </param>
/// <param name="follow"> the arc to follow reading the label from </param>
/// <param name="arc"> the arc to read into in place </param>
/// <param name="in"> the fst's <see cref="FST.BytesReader"/> </param>
public static FST.Arc<T> ReadCeilArc<T>(int label, FST<T> fst, FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader @in)
{
    // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum?
    if (label == FST.END_LABEL)
    {
        if (!follow.IsFinal)
        {
            return null;
        }

        if (follow.Target > 0)
        {
            arc.Flags = 0;
            // NOTE: nextArc is a node (not an address!) in this case:
            arc.NextArc = follow.Target;
            arc.Node = follow.Target;
        }
        else
        {
            arc.Flags = (sbyte)FST.BIT_LAST_ARC;
        }
        arc.Output = follow.NextFinalOutput;
        arc.Label = FST.END_LABEL;
        return arc;
    }

    if (!FST<T>.TargetHasArcs(follow))
    {
        return null;
    }

    fst.ReadFirstTargetArc(follow, arc, @in);

    if (arc.BytesPerArc == 0 || arc.Label == FST.END_LABEL)
    {
        // Linear scan
        fst.ReadFirstRealTargetArc(follow.Target, arc, @in);

        // TODO: we should fix this code to not have to create
        // object for the output of every arc we scan... only
        // for the matching arc, if found
        while (arc.Label < label)
        {
            if (arc.IsLast)
            {
                return null;
            }
            fst.ReadNextRealArc(arc, @in);
        }
        return arc;
    }

    // Arcs are fixed array -- use binary search to find
    // the target.
    int lo = arc.ArcIdx;
    int hi = arc.NumArcs - 1;
    while (lo <= hi)
    {
        int probe = (int)((uint)(lo + hi) >> 1);
        @in.Position = arc.PosArcsStart;
        @in.SkipBytes(arc.BytesPerArc * probe + 1);
        int probeLabel = fst.ReadLabel(@in);
        int cmp = probeLabel - label;
        if (cmp == 0)
        {
            // Exact hit: position just before it so the read lands on it
            arc.ArcIdx = probe - 1;
            return fst.ReadNextRealArc(arc, @in);
        }
        if (cmp < 0)
        {
            lo = probe + 1;
        }
        else
        {
            hi = probe - 1;
        }
    }

    if (lo == arc.NumArcs)
    {
        // DEAD END!
        return null;
    }

    arc.ArcIdx = Math.Min(lo, hi);
    return fst.ReadNextRealArc(arc, @in);
}
/// <summary>
/// Stores each argument in the field of the same name; performs no other work.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(
    FSTEntry entry,
    NumericDocValues docToOrd,
    FST<long?> fst,
    FST.BytesReader @in,
    FST.Arc<long?> firstArc,
    FST.Arc<long?> scratchArc,
    Int32sRef scratchInts,
    BytesRefFSTEnum<long?> fstEnum)
{
    // Doc-values side
    this.entry = entry;
    this.docToOrd = docToOrd;

    // FST and its reader / enumerator
    this.fst = fst;
    this.@in = @in;
    this.fstEnum = fstEnum;

    // Reusable scratch state
    this.firstArc = firstArc;
    this.scratchArc = scratchArc;
    this.scratchInts = scratchInts;
}
/// <summary>
/// Expert: like <see cref="Util.GetByOutput(FST{long?}, long)"/> except reusing
/// <see cref="FST.BytesReader"/>, initial and scratch Arc, and result.
/// </summary>
/// <param name="fst"> the fst to search </param>
/// <param name="targetOutput"> the output value to look up </param>
/// <param name="in"> reader used for all arc reads </param>
/// <param name="arc"> starting arc; overwritten while descending </param>
/// <param name="scratchArc"> holds the previously visited arc during linear scans </param>
/// <param name="result"> receives the labels of the matching path </param>
/// <returns> <paramref name="result"/> when a path with exactly <paramref name="targetOutput"/> is found, otherwise <c>null</c> </returns>
public static Int32sRef GetByOutput(FST<long?> fst, long targetOutput, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc, Int32sRef result)
{
    long output = arc.Output.Value;
    int upto = 0;

    while (true)
    {
        if (arc.IsFinal)
        {
            long finalOutput = output + arc.NextFinalOutput.Value;
            if (finalOutput == targetOutput)
            {
                // found
                result.Length = upto;
                return result;
            }
            if (finalOutput > targetOutput)
            {
                // not found
                return null;
            }
        }

        if (!FST<long?>.TargetHasArcs(arc))
        {
            // no target arcs; not found
            return null;
        }

        if (result.Int32s.Length == upto)
        {
            result.Grow(1 + upto);
        }

        fst.ReadFirstRealTargetArc(arc.Target, arc, @in);

        if (arc.BytesPerArc != 0)
        {
            // Fixed-width arcs: binary search on the minimum output of each arc
            int lo = 0;
            int hi = arc.NumArcs - 1;
            int probe = 0;
            bool exact = false;
            while (lo <= hi)
            {
                probe = (int)((uint)(lo + hi) >> 1);
                @in.Position = arc.PosArcsStart;
                @in.SkipBytes(arc.BytesPerArc * probe);
                var flags = (sbyte)@in.ReadByte();
                fst.ReadLabel(@in); // label is skipped; only the output matters here
                bool hasOutput = (flags & FST.BIT_ARC_HAS_OUTPUT) != 0;
                long minArcOutput = hasOutput ? output + fst.Outputs.Read(@in).Value : output;

                if (minArcOutput == targetOutput)
                {
                    exact = true;
                    break;
                }
                if (minArcOutput < targetOutput)
                {
                    lo = probe + 1;
                }
                else
                {
                    hi = probe - 1;
                }
            }

            if (hi == -1)
            {
                return null;
            }

            // Position just before the arc to follow so the read lands on it
            arc.ArcIdx = exact ? probe - 1 : lo - 2;

            fst.ReadNextRealArc(arc, @in);
            result.Int32s[upto++] = arc.Label;
            output += arc.Output.Value;
        }
        else
        {
            FST.Arc<long?> prevArc = null;

            while (true)
            {
                // this is the min output we'd hit if we follow
                // this arc:
                long minArcOutput = output + arc.Output.Value;

                if (minArcOutput > targetOutput)
                {
                    if (prevArc == null)
                    {
                        // Output doesn't exist
                        return null;
                    }

                    // Recurse on previous arc:
                    arc.CopyFrom(prevArc);
                    result.Int32s[upto++] = arc.Label;
                    output += arc.Output.Value;
                    break;
                }

                if (minArcOutput == targetOutput || arc.IsLast)
                {
                    // Recurse on this arc:
                    output = minArcOutput;
                    result.Int32s[upto++] = arc.Label;
                    break;
                }

                // Read next arc in this node:
                prevArc = scratchArc;
                prevArc.CopyFrom(arc);
                fst.ReadNextRealArc(arc, @in);
            }
        }
    }
}
/// <summary>
/// Creates an enumerator over <paramref name="fst"/>, building a
/// <c>BytesRefFSTEnum</c> and obtaining a bytes reader from it.
/// </summary>
/// <param name="fst"> the fst to enumerate </param>
internal FSTTermsEnum(FST<long?> fst)
{
    this.fst = fst;
    this.@in = new BytesRefFSTEnum<long?>(fst);
    this.bytesReader = fst.GetBytesReader();
}
/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
/// <param name="a"> the automaton to intersect; asserted to be deterministic </param>
/// <param name="fst"> the fst to intersect with </param>
/// <returns> one <c>Path</c> per accepted prefix, in the order they were reached </returns>
public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
{
    Debug.Assert(a.IsDeterministic);

    // Used as a LIFO stack: paths are pushed at the end and popped from the end.
    List<Path<T>> queue = new List<Path<T>>();
    List<Path<T>> endNodes = new List<Path<T>>();
    queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (queue.Count != 0)
    {
        // Pop by index: removing by value would scan the whole list for every pop.
        int last = queue.Count - 1;
        Path<T> path = queue[last];
        queue.RemoveAt(last);

        if (path.State.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        Int32sRef currentInput = path.Input;
        foreach (Transition t in path.State.GetTransitions())
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                // Single-label transition: direct lookup
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    Debug.Assert(nextArc.Label <= max);
                    Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                }
            }
        }
    }
    return endNodes;
}
/// <summary>
/// Create a new StemmerOverrideFilter, performing dictionary-based stemming
/// with the provided <paramref name="stemmerOverrideMap"/>.
/// <para>
/// Any dictionary-stemmed terms will be marked with <see cref="KeywordAttribute"/>
/// so that they will not be stemmed with stemmers down the chain.
/// </para>
/// </summary>
/// <param name="input"> the token stream to filter </param>
/// <param name="stemmerOverrideMap"> the override dictionary; its bytes reader is cached here </param>
public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
    : base(input)
{
    this.stemmerOverrideMap = stemmerOverrideMap;
    this.fstReader = stemmerOverrideMap.BytesReader;
}
/// <summary>
/// Retrieve suggestions.
/// </summary>
/// <param name="key"> the text to complete; it is run through the query analyzer </param>
/// <param name="contexts"> must be <c>null</c>; this suggester doesn't support contexts </param>
/// <param name="num"> maximum number of results to return </param>
/// <returns> at most <paramref name="num"/> results, sorted with <c>ComparatorAnonymousInnerClassHelper</c> </returns>
/// <exception cref="System.ArgumentException"> if <paramref name="contexts"/> is not <c>null</c>,
/// if a token's recalculated gram count disagrees with its position length, or if the
/// analyzer produced no non-empty token </exception>
public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.TokenStream("", key);
    try
    {
        TermToBytesRefAttribute termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
        OffsetAttribute offsetAtt = ts.AddAttribute<OffsetAttribute>();
        PositionLengthAttribute posLenAtt = ts.AddAttribute<PositionLengthAttribute>();
        PositionIncrementAttribute posIncAtt = ts.AddAttribute<PositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;

            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;

            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            int recalculatedCount = CountGrams(tokenBytes);
            if (recalculatedCount != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + recalculatedCount);
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;

        if (lastTokenEnded)
        {
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        FST.Arc<long?> arc = new FST.Arc<long?>();
        FST.BytesReader bytesReader = fst.BytesReader;

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        // Concrete List<T> so the sort and truncation below use the BCL members.
        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];

            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                break;
            }

            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            long? prefixOutput = null;
            try
            {
                prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            }
            catch (IOException bogus)
            {
                // Keep the original failure as the inner exception.
                throw new Exception(bogus.Message, bogus);
            }

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Util.Get(fst, Util.ToIntsRef(context, new IntsRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment == null ? token : lastTokenFragment);
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                // Keep the original failure as the inner exception.
                throw new Exception(bogus.Message, bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);

            foreach (Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }

                if (seen.Contains(lastToken))
                {
                    // skip dup
                    continue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);

                // The raw score reaches 2^63 when backoff is 1 and the weight equals
                // contextCount. Converting an out-of-range double to long is unspecified
                // in C#, so saturate explicitly (this is what the Java cast did).
                double rawScore = long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount;
                long score = rawScore >= (double)long.MaxValue ? long.MaxValue : (long)rawScore;

                LookupResult result = new LookupResult(spare.ToString(), score);
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
            }

            backoff *= ALPHA;
        }

        results.Sort(new ComparatorAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            // Drop everything past the first num entries, in place.
            results.RemoveRange(num, results.Count - num);
        }

        return results;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}