/// <summary>
/// <para>Fills <see cref="inputWindow"/> with input stream tokens, if available,
/// shifting to the right if the window was previously full.
/// </para>
/// <para>
/// Resets <see cref="gramSize"/> to its minimum value.
/// </para>
/// </summary>
/// <remarks>
/// The token that falls off the front of the window is handed back to
/// <c>GetNextToken</c> as the target so that it can be refilled and re-queued
/// at the tail instead of allocating a new instance.
/// </remarks>
/// <exception cref="IOException"> if there's a problem getting the next token </exception>
private void ShiftInputWindow()
{
    InputWindowToken firstToken = null;
    if (inputWindow.Count > 0)
    {
        firstToken = inputWindow.First.Value;
        // Unlink the head node directly; Remove(value) would instead search
        // the list for the first element equal to firstToken.
        inputWindow.RemoveFirst();
    }

    while (inputWindow.Count < maxShingleSize)
    {
        if (null != firstToken) // recycle the firstToken, if available
        {
            if (null != GetNextToken(firstToken))
            {
                inputWindow.AddLast(firstToken); // the firstToken becomes the last
                firstToken = null;
            }
            else
            {
                break; // end of input stream
            }
        }
        else
        {
            InputWindowToken nextToken = GetNextToken(null);
            if (null != nextToken)
            {
                inputWindow.AddLast(nextToken);
            }
            else
            {
                break; // end of input stream
            }
        }
    }

    // No shingle has been output so far and the window is too short to build
    // one of the minimum size: drop the minimum gram size to 1 so the
    // remaining tokens can still be emitted as unigrams.
    if (outputUnigramsIfNoShingles && noShingleOutput
        && gramSize.MinValue > 1 && inputWindow.Count < minShingleSize)
    {
        gramSize.MinValue = 1;
    }

    gramSize.Reset();
    isOutputHere = false;
}
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <c>positionIncrement > 1</c>,
/// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
private InputWindowToken GetNextToken(InputWindowToken target)
{
    bool reuseTarget = target != null;
    InputWindowToken token = target;

    // 1) Filler tokens are still owed: emit one, based on the saved stream token.
    if (numFillerTokensToInsert > 0)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }

        // A filler token occupies no space
        token.offsetAtt.SetOffset(token.offsetAtt.StartOffset, token.offsetAtt.StartOffset);
        token.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
        token.isFiller = true;
        --numFillerTokensToInsert;
        return token;
    }

    // 2) All fillers are out: now hand over the real token that was saved earlier.
    if (isNextInputStreamToken)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }

        isNextInputStreamToken = false;
        token.isFiller = false;
        return token;
    }

    // 3) The end of the input was already reached on an earlier call.
    if (exhausted)
    {
        return null;
    }

    // 4) The input has just run dry: finish it and schedule any trailing fillers.
    if (!m_input.IncrementToken())
    {
        exhausted = true;
        m_input.End();
        endState = CaptureState();
        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
        if (numFillerTokensToInsert <= 0)
        {
            return null;
        }

        // Build a bare source carrying only term and offset attributes, positioned
        // at the current end offset, for the trailing fillers to be copied from.
        nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
        nextInputStreamToken.AddAttribute<ICharTermAttribute>();
        IOffsetAttribute trailingOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
        trailingOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);

        // Recurse/loop just once: the call lands in case 1 above.
        return GetNextToken(target);
    }

    // 5) A fresh token was read from the input: snapshot the current attributes.
    if (reuseTarget)
    {
        this.CopyTo(target.attSource);
    }
    else
    {
        token = new InputWindowToken(this, CloneAttributes());
    }

    if (posIncrAtt.PositionIncrement <= 1)
    {
        token.isFiller = false;
        return token;
    }

    // Each output shingle must contain at least one input token,
    // so no more than (maxShingleSize - 1) filler tokens will be inserted.
    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);

    // Save the current token as the next input stream token
    if (nextInputStreamToken == null)
    {
        nextInputStreamToken = CloneAttributes();
    }
    else
    {
        this.CopyTo(nextInputStreamToken);
    }
    isNextInputStreamToken = true;

    // The token returned now is the first filler. A filler token occupies no space
    token.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
    token.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
    token.isFiller = true;
    --numFillerTokensToInsert;
    return token;
}
/// <summary>
/// Builds the next output token from the tokens currently held in
/// <see cref="inputWindow"/>, joining their terms with <see cref="tokenSeparator"/>.
/// </summary>
/// <remarks>
/// When <see cref="gramSize"/> is at its minimum value, or the window holds fewer
/// tokens than the current gram size, the window is shifted and the term builder
/// is cleared. Otherwise the previously built text is kept and extended.
/// </remarks>
/// <returns> true if a token was produced; false otherwise </returns>
public override bool IncrementToken()
{
    bool tokenAvailable = false;
    int builtGramSize = 0;
    if (gramSize.AtMinValue() || inputWindow.Count < gramSize.Value)
    {
        ShiftInputWindow();
        gramBuilder.Length = 0;
    }
    else
    {
        // gramBuilder still holds the text of the previous, shorter gram.
        builtGramSize = gramSize.PreviousValue;
    }

    if (inputWindow.Count >= gramSize.Value)
    {
        bool isAllFiller = true;
        InputWindowToken nextToken = null;

        // IEnumerator<T> is IDisposable; release it deterministically.
        using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
        {
            for (int gramNum = 1; iter.MoveNext() && builtGramSize < gramSize.Value; ++gramNum)
            {
                nextToken = iter.Current;
                if (builtGramSize < gramNum)
                {
                    // This window position is not yet part of the built text.
                    if (builtGramSize > 0)
                    {
                        gramBuilder.Append(tokenSeparator);
                    }
                    gramBuilder.Append(nextToken.termAtt.Buffer, 0, nextToken.termAtt.Length);
                    ++builtGramSize;
                }

                if (isAllFiller && nextToken.isFiller)
                {
                    // Everything seen so far is filler: a gram of this size would
                    // contain no real token, so move on to the next gram size.
                    if (gramNum == gramSize.Value)
                    {
                        gramSize.Advance();
                    }
                }
                else
                {
                    isAllFiller = false;
                }
            }
        }

        if (!isAllFiller && builtGramSize == gramSize.Value)
        {
            // Start from the attributes of the first token in the window.
            inputWindow.First.Value.attSource.CopyTo(this);
            posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
            termAtt.SetEmpty().Append(gramBuilder);
            if (gramSize.Value > 1)
            {
                typeAtt.Type = tokenType;
                noShingleOutput = false;
            }
            // Span from the first token's start to the end of the last token visited.
            offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
            posLenAtt.PositionLength = builtGramSize;
            isOutputHere = true;
            gramSize.Advance();
            tokenAvailable = true;
        }
    }

    return tokenAvailable;
}
/// <summary>
/// Builds the next output token from the tokens currently held in
/// <c>inputWindow</c>, joining their terms with <c>tokenSeparator</c>.
/// </summary>
/// <remarks>
/// NOTE(review): apart from the enumerator handling, the lower-case member calls
/// (<c>atMinValue</c>, <c>shiftInputWindow</c>, <c>copyTo</c>, <c>setEmpty</c>, ...)
/// are kept exactly as the Java-to-C# converter emitted them; confirm they match
/// the members actually declared by the project types.
/// </remarks>
/// <returns> true if a token was produced; false otherwise </returns>
public override bool IncrementToken()
{
    bool tokenAvailable = false;
    int builtGramSize = 0;
    if (gramSize.atMinValue() || inputWindow.Count < gramSize.Value)
    {
        shiftInputWindow();
        gramBuilder.Length = 0;
    }
    else
    {
        // gramBuilder still holds the text of the previous, shorter gram.
        builtGramSize = gramSize.PreviousValue;
    }

    if (inputWindow.Count >= gramSize.Value)
    {
        bool isAllFiller = true;
        InputWindowToken nextToken = null;

        // The converter left a Java-style hasNext() call and read Current without
        // ever advancing. IEnumerator<T> has no hasNext(); MoveNext() both advances
        // and reports whether an element is available. The enumerator is also
        // IDisposable, hence the using block.
        using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
        {
            for (int gramNum = 1; iter.MoveNext() && builtGramSize < gramSize.Value; ++gramNum)
            {
                nextToken = iter.Current;
                if (builtGramSize < gramNum)
                {
                    // This window position is not yet part of the built text.
                    if (builtGramSize > 0)
                    {
                        gramBuilder.Append(tokenSeparator);
                    }
                    gramBuilder.Append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
                    ++builtGramSize;
                }

                if (isAllFiller && nextToken.isFiller)
                {
                    // Everything seen so far is filler: skip this gram size.
                    if (gramNum == gramSize.Value)
                    {
                        gramSize.advance();
                    }
                }
                else
                {
                    isAllFiller = false;
                }
            }
        }

        if (!isAllFiller && builtGramSize == gramSize.Value)
        {
            // Start from the attributes of the first token in the window.
            inputWindow.First.Value.attSource.copyTo(this);
            posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
            termAtt.setEmpty().append(gramBuilder);
            if (gramSize.Value > 1)
            {
                typeAtt.Type = tokenType;
                noShingleOutput = false;
            }
            // Span from the first token's start to the end of the last token visited.
            offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
            posLenAtt.PositionLength = builtGramSize;
            isOutputHere = true;
            gramSize.advance();
            tokenAvailable = true;
        }
    }

    return tokenAvailable;
}
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <c>positionIncrement > 1</c>,
/// <c>positionIncrement - 1</c> <c>fillerToken</c>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
// Converter note: the Java original declared "throws java.io.IOException";
// .NET has no equivalent of a 'throws' clause.
private InputWindowToken getNextToken(InputWindowToken target)
{
    bool reuseTarget = target != null;
    InputWindowToken token = target;

    // 1) Filler tokens are still owed: emit one, based on the saved stream token.
    if (numFillerTokensToInsert > 0)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.copyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
        }

        // A filler token occupies no space
        token.offsetAtt.setOffset(token.offsetAtt.startOffset(), token.offsetAtt.startOffset());
        token.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
        token.isFiller = true;
        --numFillerTokensToInsert;
        return token;
    }

    // 2) All fillers are out: now hand over the real token that was saved earlier.
    if (isNextInputStreamToken)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.copyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
        }

        isNextInputStreamToken = false;
        token.isFiller = false;
        return token;
    }

    // 3) The end of the input was already reached on an earlier call.
    if (exhausted)
    {
        return null;
    }

    // 4) The input has just run dry: finish it and schedule any trailing fillers.
    if (!input.incrementToken())
    {
        exhausted = true;
        input.end();
        endState = captureState();
        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
        if (numFillerTokensToInsert <= 0)
        {
            return null;
        }

        // Build a bare source carrying only term and offset attributes, positioned
        // at the current end offset, for the trailing fillers to be copied from.
        nextInputStreamToken = new AttributeSource(AttributeFactory);
        nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
        OffsetAttribute trailingOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
        trailingOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());

        // Recurse/loop just once: the call lands in case 1 above.
        return getNextToken(target);
    }

    // 5) A fresh token was read from the input: snapshot the current attributes.
    if (reuseTarget)
    {
        this.copyTo(target.attSource);
    }
    else
    {
        token = new InputWindowToken(this, cloneAttributes());
    }

    if (posIncrAtt.PositionIncrement <= 1)
    {
        token.isFiller = false;
        return token;
    }

    // Each output shingle must contain at least one input token,
    // so no more than (maxShingleSize - 1) filler tokens will be inserted.
    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);

    // Save the current token as the next input stream token
    if (nextInputStreamToken == null)
    {
        nextInputStreamToken = cloneAttributes();
    }
    else
    {
        this.copyTo(nextInputStreamToken);
    }
    isNextInputStreamToken = true;

    // The token returned now is the first filler. A filler token occupies no space
    token.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
    token.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
    token.isFiller = true;
    --numFillerTokensToInsert;
    return token;
}
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <c>positionIncrement > 1</c>,
/// <c>positionIncrement - 1</c> <c>fillerToken</c>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
private InputWindowToken GetNextToken(InputWindowToken target)
{
    bool reuseTarget = target != null;
    InputWindowToken token = target;

    // 1) Filler tokens are still owed: emit one, based on the saved stream token.
    if (numFillerTokensToInsert > 0)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }

        // A filler token occupies no space
        token.offsetAtt.SetOffset(token.offsetAtt.StartOffset(), token.offsetAtt.StartOffset());
        token.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
        token.isFiller = true;
        --numFillerTokensToInsert;
        return token;
    }

    // 2) All fillers are out: now hand over the real token that was saved earlier.
    if (isNextInputStreamToken)
    {
        if (reuseTarget)
        {
            nextInputStreamToken.CopyTo(target.attSource);
        }
        else
        {
            token = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
        }

        isNextInputStreamToken = false;
        token.isFiller = false;
        return token;
    }

    // 3) The end of the input was already reached on an earlier call.
    if (exhausted)
    {
        return null;
    }

    // 4) The input has just run dry: finish it and schedule any trailing fillers.
    if (!input.IncrementToken())
    {
        exhausted = true;
        input.End();
        endState = CaptureState();
        numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
        if (numFillerTokensToInsert <= 0)
        {
            return null;
        }

        // Build a bare source carrying only term and offset attributes, positioned
        // at the current end offset, for the trailing fillers to be copied from.
        // LUCENENET TODO: Property attributeFactory should begin with upper case character
        nextInputStreamToken = new AttributeSource(this.attributeFactory);
        nextInputStreamToken.AddAttribute<ICharTermAttribute>();
        IOffsetAttribute trailingOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
        trailingOffsetAtt.SetOffset(offsetAtt.EndOffset(), offsetAtt.EndOffset());

        // Recurse/loop just once: the call lands in case 1 above.
        return GetNextToken(target);
    }

    // 5) A fresh token was read from the input: snapshot the current attributes.
    if (reuseTarget)
    {
        this.CopyTo(target.attSource);
    }
    else
    {
        token = new InputWindowToken(this, CloneAttributes());
    }

    if (posIncrAtt.PositionIncrement <= 1)
    {
        token.isFiller = false;
        return token;
    }

    // Each output shingle must contain at least one input token,
    // so no more than (maxShingleSize - 1) filler tokens will be inserted.
    numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);

    // Save the current token as the next input stream token
    if (nextInputStreamToken == null)
    {
        nextInputStreamToken = CloneAttributes();
    }
    else
    {
        this.CopyTo(nextInputStreamToken);
    }
    isNextInputStreamToken = true;

    // The token returned now is the first filler. A filler token occupies no space
    token.offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset());
    token.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
    token.isFiller = true;
    --numFillerTokensToInsert;
    return token;
}