Exemple #1
0
        /// <summary>
        /// <para>Advances <see cref="inputWindow"/> by one position: the oldest token (if any)
        /// is dropped from the front and the window is topped up from the input stream until it
        /// holds <c>maxShingleSize</c> tokens or the input runs out.
        /// </para>
        /// <para>
        /// Afterwards <see cref="gramSize"/> is reset and the "output at this position" flag is cleared.
        /// </para>
        /// </summary>
        /// <exception cref="IOException"> if there's a problem getting the next token </exception>
        private void ShiftInputWindow()
        {
            // Detach the head of the window so its instance can be reused for the next incoming token.
            InputWindowToken recycled = null;
            if (inputWindow.Count > 0)
            {
                recycled = inputWindow.First.Value;
                inputWindow.Remove(recycled);
            }

            // Refill. The recycled instance is offered exactly once; after it has been
            // re-queued, fresh instances are requested by passing null.
            while (inputWindow.Count < maxShingleSize)
            {
                InputWindowToken produced = GetNextToken(recycled);
                if (produced == null)
                {
                    break; // end of input stream
                }

                // When an instance was recycled, that same instance becomes the last element.
                inputWindow.AddLast(recycled ?? produced);
                recycled = null;
            }

            // Fall back to unigrams when no shingle has been output so far and the window
            // holds fewer tokens than the minimum shingle size.
            if (outputUnigramsIfNoShingles
                && noShingleOutput
                && gramSize.MinValue > 1
                && inputWindow.Count < minShingleSize)
            {
                gramSize.MinValue = 1;
            }

            gramSize.Reset();
            isOutputHere = false;
        }
Exemple #2
0
        /// <summary>
        /// <para>Produces the next window token. Sources are consulted in this order: pending
        /// filler tokens, a previously saved input token, then the wrapped input stream.
        /// </para>
        /// <para>If an input token arrives with <c>positionIncrement &gt; 1</c>, it is saved and
        /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s (at most
        /// <c>maxShingleSize - 1</c>) are produced ahead of it.
        /// </para>
        /// </summary>
        /// <param name="target"> Token to overwrite; if null, a new instance is created. </param>
        /// <returns> The populated token, or null when nothing more can be produced. </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken GetNextToken(InputWindowToken target)
        {
            // Case 1: filler tokens are still owed; stamp one out from the saved token.
            if (numFillerTokensToInsert > 0)
            {
                InputWindowToken filler = target;
                if (target == null)
                {
                    filler = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }

                // A filler token occupies no space: collapse its offsets onto the start offset.
                var fillerStart = filler.offsetAtt.StartOffset;
                filler.offsetAtt.SetOffset(fillerStart, fillerStart);
                filler.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
                filler.isFiller = true;
                numFillerTokensToInsert--;
                return filler;
            }

            // Case 2: all fillers are out; release the real token that was held back.
            if (isNextInputStreamToken)
            {
                InputWindowToken held = target;
                if (target == null)
                {
                    held = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
                }
                else
                {
                    nextInputStreamToken.CopyTo(target.attSource);
                }

                isNextInputStreamToken = false;
                held.isFiller = false;
                return held;
            }

            // Case 3: the input was already drained on an earlier call.
            if (exhausted)
            {
                return null;
            }

            // Case 4: the input has just run out.
            if (!m_input.IncrementToken())
            {
                exhausted = true;
                m_input.End();
                endState = CaptureState();
                numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                if (numFillerTokensToInsert <= 0)
                {
                    return null;
                }

                // Trailing fillers are needed: build a minimal template token positioned at the end offset.
                nextInputStreamToken = new AttributeSource(this.GetAttributeFactory());
                nextInputStreamToken.AddAttribute<ICharTermAttribute>();
                IOffsetAttribute trailingOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
                trailingOffsetAtt.SetOffset(offsetAtt.EndOffset, offsetAtt.EndOffset);

                // Recurse just once; the call above lands in case 1.
                return GetNextToken(target);
            }

            // Case 5: a fresh token was read from the input.
            InputWindowToken fresh = target;
            if (target == null)
            {
                fresh = new InputWindowToken(this, CloneAttributes());
            }
            else
            {
                this.CopyTo(target.attSource);
            }

            if (posIncrAtt.PositionIncrement <= 1)
            {
                fresh.isFiller = false;
                return fresh;
            }

            // Each output shingle must contain at least one input token,
            // so no more than (maxShingleSize - 1) filler tokens will be inserted.
            numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);

            // Hold the current token back; it is released by case 2 once the fillers are out.
            if (nextInputStreamToken == null)
            {
                nextInputStreamToken = CloneAttributes();
            }
            else
            {
                this.CopyTo(nextInputStreamToken);
            }
            isNextInputStreamToken = true;

            // The token returned now is the first filler; a filler token occupies no space.
            fresh.offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
            fresh.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
            fresh.isFiller = true;
            numFillerTokensToInsert--;
            return fresh;
        }
Exemple #3
0
        /// <summary>
        /// Attempts to emit the next gram assembled from the tokens held in <c>inputWindow</c>.
        /// <para>
        /// When <c>gramSize</c> is at its minimum value, or the window holds fewer tokens than
        /// the current gram size, the window is shifted and the gram buffer cleared; otherwise
        /// the gram already in <c>gramBuilder</c> is extended.
        /// </para>
        /// </summary>
        /// <returns> true if a token was written to this stream's attributes; false otherwise. </returns>
        public override bool IncrementToken()
        {
            bool tokenAvailable = false;
            int builtGramSize = 0;

            if (gramSize.AtMinValue() || inputWindow.Count < gramSize.Value)
            {
                ShiftInputWindow();
                gramBuilder.Length = 0;
            }
            else
            {
                // gramBuilder still holds the previous gram; only the missing tail is appended below.
                builtGramSize = gramSize.PreviousValue;
            }
            if (inputWindow.Count >= gramSize.Value)
            {
                bool isAllFiller = true;
                InputWindowToken nextToken = null;
                // IEnumerator<T> is IDisposable, so release it deterministically.
                using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
                {
                    for (int gramNum = 1; iter.MoveNext() && builtGramSize < gramSize.Value; ++gramNum)
                    {
                        nextToken = iter.Current;
                        if (builtGramSize < gramNum)
                        {
                            if (builtGramSize > 0)
                            {
                                gramBuilder.Append(tokenSeparator);
                            }
                            gramBuilder.Append(nextToken.termAtt.Buffer, 0, nextToken.termAtt.Length);
                            ++builtGramSize;
                        }
                        if (isAllFiller && nextToken.isFiller)
                        {
                            // Every token so far is a filler: if that covers the whole gram,
                            // move on to the next gram size.
                            if (gramNum == gramSize.Value)
                            {
                                gramSize.Advance();
                            }
                        }
                        else
                        {
                            isAllFiller = false;
                        }
                    }
                }
                if (!isAllFiller && builtGramSize == gramSize.Value)
                {
                    // Start from the attributes of the first window token, then override
                    // position increment, term text, type, offsets and position length.
                    inputWindow.First.Value.attSource.CopyTo(this);
                    // Only the first token output since the last window shift advances the position.
                    posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
                    termAtt.SetEmpty().Append(gramBuilder);
                    if (gramSize.Value > 1)
                    {
                        typeAtt.Type = tokenType;
                        noShingleOutput = false;
                    }
                    // The gram spans from the first token's start to the last appended token's end.
                    offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
                    posLenAtt.PositionLength = builtGramSize;
                    isOutputHere = true;
                    gramSize.Advance();
                    tokenAvailable = true;
                }
            }
            return tokenAvailable;
        }
Exemple #4
0
        /// <summary>
        /// Attempts to emit the next gram assembled from the tokens held in <c>inputWindow</c>.
        /// <para>
        /// When <c>gramSize</c> is at its minimum value, or the window holds fewer tokens than
        /// the current gram size, the window is shifted and the gram buffer cleared; otherwise
        /// the gram already in <c>gramBuilder</c> is extended.
        /// </para>
        /// </summary>
        /// <returns> true if a token was written to this stream's attributes; false otherwise. </returns>
        public override bool IncrementToken()
        {
            bool tokenAvailable = false;
            int builtGramSize = 0;

            if (gramSize.atMinValue() || inputWindow.Count < gramSize.Value)
            {
                shiftInputWindow();
                gramBuilder.Length = 0;
            }
            else
            {
                // gramBuilder still holds the previous gram; only the missing tail is appended below.
                builtGramSize = gramSize.PreviousValue;
            }
            if (inputWindow.Count >= gramSize.Value)
            {
                bool isAllFiller = true;
                InputWindowToken nextToken = null;
                // A .NET enumerator has no hasNext(): MoveNext() both tests for and advances to
                // the next element, and must succeed before Current is read.
                // IEnumerator<T> is IDisposable, so release it deterministically.
                using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
                {
                    for (int gramNum = 1; iter.MoveNext() && builtGramSize < gramSize.Value; ++gramNum)
                    {
                        nextToken = iter.Current;
                        if (builtGramSize < gramNum)
                        {
                            if (builtGramSize > 0)
                            {
                                gramBuilder.Append(tokenSeparator);
                            }
                            gramBuilder.Append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
                            ++builtGramSize;
                        }
                        if (isAllFiller && nextToken.isFiller)
                        {
                            // Every token so far is a filler: if that covers the whole gram,
                            // move on to the next gram size.
                            if (gramNum == gramSize.Value)
                            {
                                gramSize.advance();
                            }
                        }
                        else
                        {
                            isAllFiller = false;
                        }
                    }
                }
                if (!isAllFiller && builtGramSize == gramSize.Value)
                {
                    // Start from the attributes of the first window token, then override
                    // position increment, term text, type, offsets and position length.
                    inputWindow.First.Value.attSource.copyTo(this);
                    // Only the first token output since the last window shift advances the position.
                    posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
                    termAtt.setEmpty().append(gramBuilder);
                    if (gramSize.Value > 1)
                    {
                        typeAtt.Type = tokenType;
                        noShingleOutput = false;
                    }
                    // The gram spans from the first token's start to the last appended token's end.
                    offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
                    posLenAtt.PositionLength = builtGramSize;
                    isOutputHere = true;
                    gramSize.advance();
                    tokenAvailable = true;
                }
            }
            return tokenAvailable;
        }
Exemple #5
0
	  /// <summary>
	  /// <para>Get the next token from the input stream.
	  /// </para>
	  /// <para>If the next token has <c>positionIncrement &gt; 1</c>,
	  /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s (capped at
	  /// <c>maxShingleSize - 1</c>) are inserted first.
	  /// </para>
	  /// <para>Sources are tried in order: pending filler tokens, the saved
	  /// <c>nextInputStreamToken</c>, then the wrapped input stream.
	  /// </para>
	  /// </summary>
	  /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
	  /// <returns> On success, the populated token; null otherwise </returns>
	  /// <exception cref="IOException"> if the input stream has a problem </exception>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private InputWindowToken getNextToken(InputWindowToken target) throws java.io.IOException
	  // NOTE(review): this is unfinished converter output; member names such as copyTo,
	  // cloneAttributes and incrementToken still use Java casing. Confirm against the ported API.
	  private InputWindowToken getNextToken(InputWindowToken target)
	  {
		InputWindowToken newTarget = target;
		if (numFillerTokensToInsert > 0)
		{
		  // Filler tokens are still owed: build one from the saved next-input-stream token.
		  if (null == target)
		  {
			newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
		  }
		  else
		  {
			nextInputStreamToken.copyTo(target.attSource);
		  }
		  // A filler token occupies no space
		  newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
		  newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
		  newTarget.isFiller = true;
		  --numFillerTokensToInsert;
		}
		else if (isNextInputStreamToken)
		{
		  // All fillers are out: release the real token that was held back.
		  if (null == target)
		  {
			newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
		  }
		  else
		  {
			nextInputStreamToken.copyTo(target.attSource);
		  }
		  isNextInputStreamToken = false;
		  newTarget.isFiller = false;
		}
		else if (!exhausted)
		{
		  if (input.incrementToken())
		  {
			// A fresh token was read; copy this stream's current attributes into the target.
			if (null == target)
			{
			  newTarget = new InputWindowToken(this, cloneAttributes());
			}
			else
			{
			  this.copyTo(target.attSource);
			}
			if (posIncrAtt.PositionIncrement > 1)
			{
			  // Each output shingle must contain at least one input token,
			  // so no more than (maxShingleSize - 1) filler tokens will be inserted.
			  numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
			  // Save the current token as the next input stream token
			  if (null == nextInputStreamToken)
			  {
				nextInputStreamToken = cloneAttributes();
			  }
			  else
			  {
				this.copyTo(nextInputStreamToken);
			  }
			  isNextInputStreamToken = true;
			  // The token returned by this call is itself the first filler.
			  // A filler token occupies no space
			  newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
			  newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
			  newTarget.isFiller = true;
			  --numFillerTokensToInsert;
			}
			else
			{
			  newTarget.isFiller = false;
			}
		  }
		  else
		  {
			// The input has just run out: record that so later calls skip straight to null.
			exhausted = true;
			input.end();
			endState = captureState();
			// Note: uses PositionIncrement itself here, not PositionIncrement - 1 as above.
			numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
			if (numFillerTokensToInsert > 0)
			{
			  // Build a fresh template token for the trailing fillers, with both offsets
			  // set to this stream's current end offset.
			  nextInputStreamToken = new AttributeSource(AttributeFactory);
			  nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
			  OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
			  newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
			  // Recurse/loop just once:
			  // (numFillerTokensToInsert > 0 now, so the recursive call takes the first branch)
			  return getNextToken(target);
			}
			else
			{
			  newTarget = null;
			}
		  }
		}
		else
		{
		  // Input already exhausted and nothing pending.
		  newTarget = null;
		}
		return newTarget;
	  }
 /// <summary>
 /// <para>Supplies the next token for the input window.
 /// </para>
 /// <para>Tokens come, in order of priority, from owed filler tokens, from the input token
 /// that was held back while its fillers were emitted, and finally from the wrapped input.
 /// An input token with <c>positionIncrement &gt; 1</c> is preceded by
 /// <c>positionIncrement - 1</c> <see cref="fillerToken"/>s, limited to
 /// <c>maxShingleSize - 1</c>.
 /// </para>
 /// </summary>
 /// <param name="target"> Instance to populate; when null a new instance is created. </param>
 /// <returns> The populated token on success; null when no token can be produced. </returns>
 /// <exception cref="IOException"> if the input stream has a problem </exception>
 private InputWindowToken GetNextToken(InputWindowToken target)
 {
     bool fillerOwed = numFillerTokensToInsert > 0;

     // Both the "filler owed" and the "held-back token" paths materialise their result
     // from nextInputStreamToken; they differ only in the post-processing.
     if (fillerOwed || isNextInputStreamToken)
     {
         InputWindowToken replayed;
         if (target != null)
         {
             nextInputStreamToken.CopyTo(target.attSource);
             replayed = target;
         }
         else
         {
             replayed = new InputWindowToken(this, nextInputStreamToken.CloneAttributes());
         }

         if (fillerOwed)
         {
             // A filler token occupies no space
             replayed.offsetAtt.SetOffset(replayed.offsetAtt.StartOffset(), replayed.offsetAtt.StartOffset());
             replayed.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
             replayed.isFiller = true;
             numFillerTokensToInsert -= 1;
         }
         else
         {
             isNextInputStreamToken = false;
             replayed.isFiller = false;
         }
         return replayed;
     }

     if (exhausted)
     {
         return null;
     }

     if (input.IncrementToken())
     {
         InputWindowToken pulled;
         if (target != null)
         {
             this.CopyTo(target.attSource);
             pulled = target;
         }
         else
         {
             pulled = new InputWindowToken(this, CloneAttributes());
         }

         bool gapBeforeToken = posIncrAtt.PositionIncrement > 1;
         if (gapBeforeToken)
         {
             // Each output shingle must contain at least one input token,
             // so no more than (maxShingleSize - 1) filler tokens will be inserted.
             numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);

             // Park the real token; it is handed out after the fillers.
             if (nextInputStreamToken != null)
             {
                 this.CopyTo(nextInputStreamToken);
             }
             else
             {
                 nextInputStreamToken = CloneAttributes();
             }
             isNextInputStreamToken = true;

             // The token returned now is the first filler; a filler token occupies no space.
             pulled.offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset());
             pulled.termAtt.CopyBuffer(fillerToken, 0, fillerToken.Length);
             numFillerTokensToInsert -= 1;
         }
         pulled.isFiller = gapBeforeToken;
         return pulled;
     }

     // The input has just run out.
     exhausted = true;
     input.End();
     endState = CaptureState();
     numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
     if (numFillerTokensToInsert > 0)
     {
         // LUCENENET TODO: Property attributeFactory should begin with an upper-case character
         nextInputStreamToken = new AttributeSource(this.attributeFactory);
         nextInputStreamToken.AddAttribute<ICharTermAttribute>();
         IOffsetAttribute endOffsetAtt = nextInputStreamToken.AddAttribute<IOffsetAttribute>();
         endOffsetAtt.SetOffset(offsetAtt.EndOffset(), offsetAtt.EndOffset());
         // Recurse just once: fillers are now owed, so the call takes the first path.
         return GetNextToken(target);
     }
     return null;
 }