/// <summary>
/// Returns this stream to a clean state so that it can be consumed again as if
/// it had just been created. A consumer calls this before it starts pulling
/// tokens with <see cref="TokenStream.IncrementToken()"/>.
/// <para/>
/// Stateful implementations must override this method to clear their own state.
/// </summary>
/// <remarks>
/// <b>NOTE:</b>
/// The default implementation only forwards the call to the wrapped input
/// <see cref="TokenStream"/>. Overrides must therefore always call <c>base.Reset()</c>;
/// otherwise some internal state is not reset correctly (e.g., <see cref="Tokenizer"/>
/// will throw <see cref="InvalidOperationException"/> on further usage).
/// </remarks>
public override void Reset()
{
    // Chain the reset down to the input stream.
    m_input.Reset();
}
/// <summary>
/// Consumes the token graph (including <see cref="IPositionLengthAttribute"/>)
/// of the provided <see cref="TokenStream"/> and builds the corresponding
/// automaton, where the arcs of each term are its bytes (or its Unicode code
/// points if unicodeArcs = true).
/// </summary>
/// <remarks>
/// The stream is reset before it is consumed and <c>End()</c> is called on it
/// once it is exhausted; this method does not dispose the stream.
/// </remarks>
/// <param name="in">The token stream to consume.</param>
/// <returns>
/// The automaton built from the stream. Its <c>IsDeterministic</c> flag is set to
/// <c>false</c> when at least one token had a position increment of zero.
/// </returns>
public virtual Automaton ToAutomaton(TokenStream @in)
{
    var automaton = new Automaton();
    bool isDeterministic = true;

    var posIncAtt = @in.AddAttribute<IPositionIncrementAttribute>();
    var posLengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
    var offsetAtt = @in.AddAttribute<IOffsetAttribute>();
    var termBytesAtt = @in.AddAttribute<ITermToBytesRefAttribute>();

    // Shared buffer; refreshed by FillBytesRef() for every token.
    BytesRef sharedTerm = termBytesAtt.BytesRef;

    @in.Reset();

    // Only temporarily holds the states ahead of the current position.
    RollingBuffer<Position> window = new Positions();

    int curPos = -1;
    Position curNode = null;
    int maxEndOffset = 0;

    while (@in.IncrementToken())
    {
        int increment = posIncAtt.PositionIncrement;
        if (!preservePositionIncrements && increment > 1)
        {
            increment = 1;
        }
        Debug.Assert(curPos > -1 || increment > 0);

        if (increment <= 0)
        {
            // Token stacked on the current position. The result is not
            // necessarily non-deterministic, we just can no longer be sure.
            // Buffering and sorting the synonyms at a position could refine
            // this, but that is probably overkill; this is cheap and dirty.
            isDeterministic = false;
        }
        else
        {
            // Advance to a new node.
            curPos += increment;
            curNode = window.Get(curPos);
            Debug.Assert(curNode.leaving == null);

            if (curNode.arriving != null)
            {
                // An earlier token ends here: bridge it with a separator arc.
                curNode.leaving = new State();
                curNode.arriving.AddTransition(new Transition(POS_SEP, curNode.leaving));
                if (increment > 1)
                {
                    // A token spanned over a hole; add holes "under" it.
                    AddHoles(automaton.GetInitialState(), window, curPos);
                }
            }
            else if (curPos == 0)
            {
                // No token ever arrived here and this is the first token.
                curNode.leaving = automaton.GetInitialState();
            }
            else
            {
                // No token ever arrived here: there is a hole
                // (e.g., StopFilter does this).
                curNode.leaving = new State();
                AddHoles(automaton.GetInitialState(), window, curPos);
            }

            window.FreeBefore(curPos);
        }

        int endPos = curPos + posLengthAtt.PositionLength;

        termBytesAtt.FillBytesRef();
        BytesRef termUtf8 = ChangeToken(sharedTerm);

        Position endNode = window.Get(endPos);
        if (endNode.arriving == null)
        {
            endNode.arriving = new State();
        }

        // Arc labels are the raw bytes by default, or the decoded code points.
        int[] codePoints = null;
        int labelCount = termUtf8.Length;
        if (unicodeArcs)
        {
            string utf16 = termUtf8.Utf8ToString();
            codePoints = new int[utf16.CodePointCount(0, utf16.Length)];
            labelCount = codePoints.Length;

            int charIndex = 0;
            int codePointIndex = 0;
            while (charIndex < utf16.Length)
            {
                int codePoint = Character.CodePointAt(utf16, charIndex);
                codePoints[codePointIndex++] = codePoint;
                charIndex += Character.CharCount(codePoint);
            }
        }

        // Chain one arc per label from the leaving state of the current node;
        // the last arc lands on the arriving state of the end position.
        State source = curNode.leaving;
        int lastIndex = labelCount - 1;
        for (int i = 0; i < labelCount; i++)
        {
            State target;
            if (i == lastIndex)
            {
                target = endNode.arriving;
            }
            else
            {
                target = new State();
            }

            int label = unicodeArcs
                ? codePoints[i]
                : (termUtf8.Bytes[termUtf8.Offset + i] & 0xff);

            source.AddTransition(new Transition(label, target));
            source = target;
        }

        int tokenEndOffset = offsetAtt.EndOffset;
        if (tokenEndOffset > maxEndOffset)
        {
            maxEndOffset = tokenEndOffset;
        }
    }

    @in.End();

    // If the final offset reported after End() lies beyond every token's end
    // offset, pending states get a separator arc into a dedicated accept state
    // instead of being accepted directly.
    State endState = null;
    if (offsetAtt.EndOffset > maxEndOffset)
    {
        endState = new State { Accept = true };
    }

    for (curPos++; curPos <= window.MaxPos; curPos++)
    {
        curNode = window.Get(curPos);
        if (curNode.arriving == null)
        {
            continue;
        }

        if (endState == null)
        {
            curNode.arriving.Accept = true;
        }
        else
        {
            curNode.arriving.AddTransition(new Transition(POS_SEP, endState));
        }
    }

    automaton.IsDeterministic = isDeterministic;
    return automaton;
}