Example #1
0
        /// <summary>
        /// Accepts the current term when its text can be parsed by <c>dateFormat</c>.
        /// </summary>
        /// <param name="source"> Attribute source providing the <c>CharTermAttribute</c>; the attribute
        /// is looked up on the first call only and cached in <c>termAtt</c>. </param>
        /// <returns> <c>true</c> if the term text parses; <c>false</c> if parsing throws
        /// <c>ParseException</c>. </returns>
        public override bool accept(AttributeSource source)
        {
            if (termAtt == null)
            {
                termAtt = source.addAttribute(typeof(CharTermAttribute));
            }
            try
            {
                // Only parseability matters, so the parsed value is discarded.
                // DateTime is a value type and can never be null: a parse that
                // returns without throwing is the sole acceptance condition.
                dateFormat.parse(termAtt.ToString());
                return true;
            }
            catch (ParseException)
            {
                // Deliberately ignored: an unparseable term is simply rejected below.
            }

            return false;
        }
	  /// <summary>
	  /// Accepts the current term when its text can be parsed by <c>dateFormat</c>.
	  /// </summary>
	  /// <param name="source"> Attribute source providing the <c>CharTermAttribute</c>; the attribute
	  /// is looked up on the first call only and cached in <c>termAtt</c>. </param>
	  /// <returns> <c>true</c> if the term text parses; <c>false</c> if parsing throws
	  /// <c>ParseException</c>. </returns>
	  public override bool accept(AttributeSource source)
	  {
		if (termAtt == null)
		{
		  termAtt = source.addAttribute(typeof(CharTermAttribute));
		}
		try
		{
		  // Only parseability matters, so the parsed value is discarded.
		  // DateTime is a value type and can never be null: a parse that
		  // returns without throwing is the sole acceptance condition.
		  dateFormat.parse(termAtt.ToString());
		  return true;
		}
		catch (ParseException)
		{
		  // Deliberately ignored: an unparseable term is simply rejected below.
		}

		return false;
	  }
Example #3
0
	  /// <summary>
	  /// <para>Fetches the next token to hand to the caller, reading from the input stream when needed.
	  /// </para>
	  /// <para>When the token read from the input has <c>positionIncrement &gt; 1</c>, filler tokens
	  /// (<see cref="fillerToken"/>) are returned ahead of it: <c>positionIncrement - 1</c> of them,
	  /// capped at <c>maxShingleSize - 1</c>.
	  /// </para>
	  /// </summary>
	  /// <param name="target"> Token to populate; when null, a new instance is created. </param>
	  /// <returns> The populated token, or null when no further token is available </returns>
	  /// <exception cref="IOException"> if the input stream has a problem </exception>
	  private InputWindowToken getNextToken(InputWindowToken target)
	  {
		// Case 1: filler tokens are still owed, so emit one based on the saved token.
		if (numFillerTokensToInsert > 0)
		{
		  InputWindowToken filler = target;
		  if (target != null)
		  {
			nextInputStreamToken.copyTo(target.attSource);
		  }
		  else
		  {
			filler = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
		  }
		  // Fillers are zero-width: collapse the offsets onto the start offset.
		  int fillerStart = filler.offsetAtt.startOffset();
		  filler.offsetAtt.setOffset(fillerStart, fillerStart);
		  filler.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
		  filler.isFiller = true;
		  numFillerTokensToInsert--;
		  return filler;
		}

		// Case 2: all fillers are out, so release the real token that was held back.
		if (isNextInputStreamToken)
		{
		  InputWindowToken held = target;
		  if (target != null)
		  {
			nextInputStreamToken.copyTo(target.attSource);
		  }
		  else
		  {
			held = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
		  }
		  isNextInputStreamToken = false;
		  held.isFiller = false;
		  return held;
		}

		// Case 3: the input already reported its end on an earlier call.
		if (exhausted)
		{
		  return null;
		}

		// Case 4: the input runs dry on this call.
		if (!input.incrementToken())
		{
		  exhausted = true;
		  input.end();
		  endState = captureState();
		  numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
		  if (numFillerTokensToInsert <= 0)
		  {
			return null;
		  }
		  // Build a synthetic zero-width token at the end offset for the trailing fillers to copy.
		  nextInputStreamToken = new AttributeSource(AttributeFactory);
		  nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
		  OffsetAttribute trailingOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
		  int streamEnd = offsetAtt.endOffset();
		  trailingOffsetAtt.setOffset(streamEnd, streamEnd);
		  // One level of recursion only: the call lands in the filler case above.
		  return getNextToken(target);
		}

		// Case 5: a fresh token was read from the input.
		InputWindowToken current = target;
		if (target != null)
		{
		  this.copyTo(target.attSource);
		}
		else
		{
		  current = new InputWindowToken(this, cloneAttributes());
		}

		if (posIncrAtt.PositionIncrement <= 1)
		{
		  current.isFiller = false;
		  return current;
		}

		// A position gap precedes this token. Each output shingle must contain at least
		// one input token, so at most (maxShingleSize - 1) fillers are inserted.
		numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
		// Hold the real token back until the fillers have been returned.
		if (nextInputStreamToken != null)
		{
		  this.copyTo(nextInputStreamToken);
		}
		else
		{
		  nextInputStreamToken = cloneAttributes();
		}
		isNextInputStreamToken = true;
		// This call returns the first filler, which occupies no space.
		int gapStart = offsetAtt.startOffset();
		current.offsetAtt.setOffset(gapStart, gapStart);
		current.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
		current.isFiller = true;
		numFillerTokensToInsert--;
		return current;
	  }
Example #4
0
        /// <summary>
        /// <para>Fetches the next token to hand to the caller, reading from the input stream when needed.
        /// </para>
        /// <para>When the token read from the input has <c>positionIncrement &gt; 1</c>, filler tokens
        /// (<see cref="fillerToken"/>) are returned ahead of it: <c>positionIncrement - 1</c> of them,
        /// capped at <c>maxShingleSize - 1</c>.
        /// </para>
        /// </summary>
        /// <param name="target"> Token to populate; when null, a new instance is created. </param>
        /// <returns> The populated token, or null when no further token is available </returns>
        /// <exception cref="IOException"> if the input stream has a problem </exception>
        private InputWindowToken getNextToken(InputWindowToken target)
        {
            // Pending fillers take priority: emit one based on the saved token.
            if (numFillerTokensToInsert > 0)
            {
                InputWindowToken fillerTok = target;
                if (target != null)
                {
                    nextInputStreamToken.copyTo(target.attSource);
                }
                else
                {
                    fillerTok = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
                }
                // Fillers are zero-width: collapse the offsets onto the start offset.
                int start = fillerTok.offsetAtt.startOffset();
                fillerTok.offsetAtt.setOffset(start, start);
                fillerTok.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
                fillerTok.isFiller = true;
                numFillerTokensToInsert--;
                return fillerTok;
            }

            // Fillers are done: release the real token that was held back.
            if (isNextInputStreamToken)
            {
                InputWindowToken savedTok = target;
                if (target != null)
                {
                    nextInputStreamToken.copyTo(target.attSource);
                }
                else
                {
                    savedTok = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
                }
                isNextInputStreamToken = false;
                savedTok.isFiller = false;
                return savedTok;
            }

            // The input already reported its end on an earlier call.
            if (exhausted)
            {
                return null;
            }

            // The input runs dry on this call.
            if (!input.incrementToken())
            {
                exhausted = true;
                input.end();
                endState = captureState();
                numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
                if (numFillerTokensToInsert <= 0)
                {
                    return null;
                }
                // Build a synthetic zero-width token at the end offset for the trailing fillers to copy.
                nextInputStreamToken = new AttributeSource(AttributeFactory);
                nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
                OffsetAttribute endOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
                int end = offsetAtt.endOffset();
                endOffsetAtt.setOffset(end, end);
                // One level of recursion only: the call lands in the filler case above.
                return getNextToken(target);
            }

            // A fresh token was read from the input.
            InputWindowToken freshTok = target;
            if (target != null)
            {
                this.copyTo(target.attSource);
            }
            else
            {
                freshTok = new InputWindowToken(this, cloneAttributes());
            }

            if (posIncrAtt.PositionIncrement <= 1)
            {
                freshTok.isFiller = false;
                return freshTok;
            }

            // A position gap precedes this token. Each output shingle must contain at least
            // one input token, so at most (maxShingleSize - 1) fillers are inserted.
            numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
            // Hold the real token back until the fillers have been returned.
            if (nextInputStreamToken != null)
            {
                this.copyTo(nextInputStreamToken);
            }
            else
            {
                nextInputStreamToken = cloneAttributes();
            }
            isNextInputStreamToken = true;
            // This call returns the first filler, which occupies no space.
            int gap = offsetAtt.startOffset();
            freshTok.offsetAtt.setOffset(gap, gap);
            freshTok.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
            freshTok.isFiller = true;
            numFillerTokensToInsert--;
            return freshTok;
        }
Example #5
0
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        /// <summary>
        /// Advances this stream to its next token. Tokens already generated by an earlier
        /// match are returned first; otherwise the next token is read and looked up in
        /// <c>map.submap</c>. On a match, the replacement tokens (merged with the original
        /// tokens when <c>result.IncludeOrig</c> is set) are queued in <c>replacement</c>
        /// and returned on subsequent loop iterations.
        /// </summary>
        /// <returns> <c>true</c> when a token was copied into this stream; <c>false</c> when
        /// <c>nextTok()</c> has no more tokens. </returns>
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return true;
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return false;
                }
                var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
                SlowSynonymMap result = map.submap != null
                    ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length)
                    : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return true;
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }

                // OK, we matched a token, so find the longest match.
                // C# has no diamond operator, so the element type is spelled out.
                // NOTE(review): 'matched' is presumably filled by match() below - confirm.
                matched = new LinkedList<AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return true;
                }

                // reuse, or create new one each time?
                List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool includeOrig = result.IncludeOrig;

                AttributeSource origTok = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos = 0;                                 // curr position in replacement token stream
                int pos = 0;                                    // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token repTok = result.synonyms[i];
                    AttributeSource newTok = firstTok.cloneAttributes();
                    CharTermAttribute newTermAtt = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    // The replacement spans from the first token's start to the last matched token's end.
                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            // LinkedList<T>.RemoveFirst() returns void, so read the head before unlinking it.
                            origTok = matched.First.Value;
                            matched.RemoveFirst();
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        // LinkedList<T>.RemoveFirst() returns void, so read the head before unlinking it.
                        origTok = matched.First.Value;
                        matched.RemoveFirst();
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }