Ejemplo n.º 1
0
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            Token nextToken = input.Next(reusableToken);

            if (nextToken == null)
            {
                return(null);
            }

            char[] buffer       = nextToken.TermBuffer();
            int    bufferLength = nextToken.TermLength();

            System.String type = nextToken.Type();

            if (type == APOSTROPHE_TYPE &&
                bufferLength >= 2 &&
                buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                nextToken.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                nextToken.SetTermLength(upto);
            }

            return(nextToken);
        }
        /*
         * (non-Javadoc)
         *
         * @see Lucene.Net.Analysis.TokenStream#next()
         */
        public override Token Next(/* in */ Token reusableToken)
        {
            System.Diagnostics.Debug.Assert(reusableToken != null);
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return(null);
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    reusableToken.Clear();
                    reusableToken.SetPositionIncrement(posIncr);
                    scanner.GetText(reusableToken);
                    int start = scanner.Yychar();
                    reusableToken.SetStartOffset(start);
                    reusableToken.SetEndOffset(start + reusableToken.TermLength());
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                            reusableToken.SetTermLength(reusableToken.TermLength() - 1);                             // remove extra '.'
                        }
                        else
                        {
                            reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                        }
                    }
                    else
                    {
                        reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                    }
                    return(reusableToken);
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                {
                    posIncr++;
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p>Removes <tt>'s</tt> from the end of words.
        /// <p>Removes dots from acronyms.
        /// </summary>
        public override Token Next(Token result)
        {
            Token t = input.Next(result);

            if (t == null)
            {
                return(null);
            }

            char[] buffer       = t.TermBuffer();
            int    bufferLength = t.TermLength();

            System.String type = t.Type();

            if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                t.SetTermLength(bufferLength - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                t.SetTermLength(upto);
            }

            return(t);
        }
Ejemplo n.º 4
0
		/*
		* (non-Javadoc)
		*
		* @see Lucene.Net.Analysis.TokenStream#next()
		*/
		public override Token Next(Token result)
		{
			int posIncr = 1;
			
			while (true)
			{
				int tokenType = scanner.GetNextToken();
				
				if (tokenType == StandardTokenizerImpl.YYEOF)
				{
					return null;
				}
				
				if (scanner.Yylength() <= maxTokenLength)
				{
					result.Clear();
					result.SetPositionIncrement(posIncr);
					scanner.GetText(result);
					int start = scanner.Yychar();
					result.SetStartOffset(start);
					result.SetEndOffset(start + result.TermLength());
					// This 'if' should be removed in the next release. For now, it converts
					// invalid acronyms to HOST. When removed, only the 'else' part should
					// remain.
					if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
					{
						if (replaceInvalidAcronym)
						{
							result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
							result.SetTermLength(result.TermLength() - 1); // remove extra '.'
						}
						else
						{
							result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
						}
					}
					else
					{
						result.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
					}
					return result;
				}
				// When we skip a too-long term, we still increment the
				// position increment
				else
					posIncr++;
			}
		}