/// <summary>
/// Checks whether the concatenation of two character ranges satisfies the
/// condition pattern identified by <paramref name="condition"/>.
/// </summary>
// note: this is pretty stupid, we really should subtract strip from the condition up front
// and just check the stem, but this is a little bit more complicated.
private bool CheckCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len)
{
    // Condition 0 is the "no condition" sentinel: always satisfied.
    if (condition == 0)
    {
        return true;
    }

    CharacterRunAutomaton pattern = dictionary.patterns[condition];
    int state = pattern.InitialState;

    // Run the automaton over the first character range.
    int end1 = c1off + c1len;
    int pos = c1off;
    while (pos < end1)
    {
        state = pattern.Step(state, c1[pos++]);
        if (state == -1)
        {
            return false; // rejected inside the first segment
        }
    }

    // Continue over the second character range from the same state.
    int end2 = c2off + c2len;
    pos = c2off;
    while (pos < end2)
    {
        state = pattern.Step(state, c2[pos++]);
        if (state == -1)
        {
            return false; // rejected inside the second segment
        }
    }

    // Match only if the automaton ends in an accepting state.
    return pattern.IsAccept(state);
}
/// <summary>
/// Advances to the next token by scanning code points from the underlying
/// reader, honoring a previously buffered code point and the max token length.
/// Returns true when a token was produced; false at end of stream.
/// </summary>
public sealed override bool IncrementToken()
{
    //Debug.Assert(!EnableChecks_Renamed || (StreamState == State.RESET || StreamState == State.INCREMENT), "IncrementToken() called while in wrong state: " + StreamState);
    ClearAttributes();
    for (; ;)
    {
        int startOffset;
        int cp;
        // Resume from a code point buffered by the previous call, if any;
        // otherwise read a fresh one and remember where it started.
        if (BufferedCodePoint >= 0)
        {
            cp = BufferedCodePoint;
            startOffset = BufferedOff;
            BufferedCodePoint = -1;
        }
        else
        {
            startOffset = Off;
            cp = ReadCodePoint();
        }
        if (cp < 0)
        {
            break; // end of input
        }
        else if (IsTokenChar(cp))
        {
            // Accumulate normalized token characters until a non-token
            // code point, end of input, or the token-length cap.
            int endOffset;
            do
            {
                char[] chars = Character.ToChars(Normalize(cp));
                for (int i = 0; i < chars.Length; i++)
                {
                    TermAtt.Append(chars[i]);
                }
                endOffset = Off;
                if (TermAtt.Length >= MaxTokenLength)
                {
                    break;
                }
                cp = ReadCodePoint();
            } while (cp >= 0 && IsTokenChar(cp));

            if (TermAtt.Length < MaxTokenLength)
            {
                // buffer up, in case the "rejected" char can start a new word of its own
                BufferedCodePoint = cp;
                BufferedOff = endOffset;
            }
            else
            {
                // otherwise, its because we hit term limit.
                BufferedCodePoint = -1;
            }
            int correctedStartOffset = CorrectOffset(startOffset);
            int correctedEndOffset = CorrectOffset(endOffset);
            // Sanity checks: corrected offsets must be non-negative,
            // non-decreasing across tokens, and end >= start.
            Assert.True(correctedStartOffset >= 0);
            Assert.True(correctedEndOffset >= 0);
            Assert.True(correctedStartOffset >= LastOffset);
            LastOffset = correctedStartOffset;
            Assert.True(correctedEndOffset >= correctedStartOffset);
            OffsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
            // NOTE(review): 'state' is not assigned in this method — it appears
            // to be a field updated elsewhere (presumably by IsTokenChar
            // stepping RunAutomaton) — confirm against the enclosing class.
            if (state == -1 || RunAutomaton.IsAccept(state))
            {
                // either we hit a reject state (longest match), or end-of-text, but in an accept state
                StreamState = State.INCREMENT;
                return (true);
            }
        }
    }
    StreamState = State.INCREMENT_FALSE;
    return (false);
}
/// <summary>
/// Consumers use this method to advance the stream to the next token.
/// Implementing classes must implement this method and update the relevant
/// attributes with the values of the next token.
/// The producer must make no assumptions about the attributes after the
/// method has returned: the caller may arbitrarily change them. If the
/// producer needs to preserve the state for subsequent calls, it can capture
/// a copy of the current attribute state.
/// This method is called for every token of a document, so an efficient
/// implementation is crucial for good performance; attribute references
/// should be retrieved during instantiation.
/// To ensure that filters and consumers know which attributes are available,
/// the attributes must be added during instantiation. Filters and consumers
/// are not required to check for availability of attributes.
/// </summary>
/// <returns> false for end of stream; true otherwise </returns>
public override bool IncrementToken()
{
    // Guard clause: no reader attached, or the reader is already exhausted.
    // (Consolidates the former duplicated else branches; short-circuit keeps
    // Peek() from being called on a null reader.)
    if (base.input == null || base.input.Peek() <= -1)
    {
        StreamState = ReadState.INCREMENT_FALSE;
        return false;
    }

    // (Removed a large block of commented-out legacy char-based tokenization
    // code that predated the code-point-based loop below.)
    ClearAttributes();
    for (; ;)
    {
        int startOffset;
        int cp;
        // Resume from a code point buffered by the previous call, if any;
        // otherwise read a fresh one and remember where it started.
        if (BufferedCodePoint >= 0)
        {
            cp = BufferedCodePoint;
            startOffset = BufferedOff;
            BufferedCodePoint = -1;
        }
        else
        {
            startOffset = Off;
            cp = ReadCodePoint();
        }
        if (cp < 0)
        {
            break; // end of input
        }
        else if (IsTokenChar(cp))
        {
            // Accumulate normalized token characters until a non-token
            // code point, end of input, or the token-length cap.
            int endOffset;
            do
            {
                char[] chars = Character.ToChars(Normalize(cp));
                for (int i = 0; i < chars.Length; i++)
                {
                    TermAtt.Append(chars[i]);
                }
                endOffset = Off;
                if (TermAtt.Length >= MaxTokenLength)
                {
                    break;
                }
                cp = ReadCodePoint();
            } while (cp >= 0 && IsTokenChar(cp));

            if (TermAtt.Length < MaxTokenLength)
            {
                // buffer up, in case the "rejected" char can start a new word of its own
                BufferedCodePoint = cp;
                BufferedOff = endOffset;
            }
            else
            {
                // otherwise, its because we hit term limit.
                BufferedCodePoint = -1;
            }
            int correctedStartOffset = CorrectOffset(startOffset);
            int correctedEndOffset = CorrectOffset(endOffset);
            LastOffset = correctedStartOffset;
            OffsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
            // NOTE(review): 'state' is not assigned in this method — it appears
            // to be a field updated elsewhere (presumably by IsTokenChar
            // stepping RunAutomaton) — confirm against the enclosing class.
            if (state == -1 || RunAutomaton.IsAccept(state))
            {
                // either we hit a reject state (longest match), or end-of-text, but in an accept state
                StreamState = ReadState.INCREMENT;
                return true;
            }
        }
    }
    StreamState = ReadState.INCREMENT_FALSE;
    return false;
}