Beispiel #1
0
        /// <summary>
        /// Decides whether the current term's length, measured in Unicode code
        /// points rather than UTF-16 units, lies within the inclusive range
        /// [<c>min</c>, <c>max</c>].
        /// </summary>
        /// <returns><c>true</c> when the code point count is within range.</returns>
        public override bool accept()
        {
            // Upper bound on the code point count: every UTF-16 unit is its own code point.
            int max32 = termAtt.length();
            // Lower bound: every code point is encoded as a surrogate pair.
            int min32 = max32 >> 1;

            if (min32 >= min && max32 <= max)
            {
                // definitely within range
                return true;
            }
            if (min32 > max || max32 < min)
            {
                // definitely not
                return false;
            }

            // The bounds straddle a limit, so we must count to be sure.
            int len = countCodePoints(termAtt.buffer(), 0, max32);
            return len >= min && len <= max;
        }

        /// <summary>
        /// Counts the Unicode code points in <paramref name="count"/> UTF-16 units
        /// of <paramref name="text"/> starting at <paramref name="offset"/>.
        /// A high surrogate immediately followed by a low surrogate counts as one
        /// code point; an unpaired surrogate counts as one code point on its own.
        /// </summary>
        private static int countCodePoints(char[] text, int offset, int count)
        {
            int end        = offset + count;
            int codePoints = 0;

            for (int i = offset; i < end; i++)
            {
                if (char.IsHighSurrogate(text[i]) && i + 1 < end && char.IsLowSurrogate(text[i + 1]))
                {
                    // Skip the trailing half of the pair.
                    i++;
                }
                codePoints++;
            }
            return codePoints;
        }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances to the next scanner token that is no longer than
        /// <c>maxTokenLength</c> and fills in its term text, offsets, type and
        /// position increment. Longer tokens are dropped, but each dropped token
        /// adds one to the position increment of the token eventually returned.
        /// </summary>
        /// <returns><c>false</c> once the scanner reports <c>YYEOF</c>; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            clearAttributes();
            skippedPositions = 0;

            for (int tokenType = scanner.NextToken;
                 tokenType != StandardTokenizerInterface_Fields.YYEOF;
                 tokenType = scanner.NextToken)
            {
                if (scanner.yylength() > maxTokenLength)
                {
                    // Too long: skip it, but remember the gap it leaves.
                    skippedPositions++;
                    continue;
                }

                posIncrAtt.PositionIncrement = skippedPositions + 1;
                scanner.getText(termAtt);
                int tokenStart = scanner.yychar();
                offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenStart + termAtt.length()));
                typeAtt.Type = TOKEN_TYPES[tokenType];
                return true;
            }

            return false;
        }
Beispiel #3
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream to the next token that should be kept.
        /// A token is kept when it is not in <c>stopTable</c> and its first
        /// character is either an upper/lowercase letter (and the term is longer
        /// than one character) or belongs to the "other letter" Unicode category.
        /// Empty terms are dropped because they have no first character to classify.
        /// </summary>
        /// <returns><c>true</c> when a token was kept; <c>false</c> when the input is exhausted.</returns>
        public override bool incrementToken()
        {
            while (input.incrementToken())
            {
                char[] text       = termAtt.buffer();
                int    termLength = termAtt.length();

                // Guard the text[0] read below: for an empty term the buffer
                // only holds stale content left over from earlier tokens.
                if (termLength == 0)
                {
                    continue;
                }

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable.contains(text, 0, termLength))
                {
                    continue;
                }

                // System.Char has no getType(); GetUnicodeCategory is the .NET
                // equivalent of Java's Character.getType().
                switch (char.GetUnicodeCategory(text[0]))
                {
                case System.Globalization.UnicodeCategory.LowercaseLetter:
                case System.Globalization.UnicodeCategory.UppercaseLetter:

                    // English word/token should be longer than 1 character.
                    if (termLength > 1)
                    {
                        return true;
                    }
                    break;

                case System.Globalization.UnicodeCategory.OtherLetter:

                    // One Chinese character as one Chinese word.
                    // Chinese word extraction to be added later here.
                    return true;
                }
            }
            return false;
        }
Beispiel #4
0
        /// <summary>
        /// Advances the wrapped stream by one token and, unless the token is
        /// flagged as a keyword, replaces its term text with the stemmer's output.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }
            if (keywordAttr.Keyword)
            {
                // Keywords pass through unstemmed.
                return true;
            }

            char[] original = termAtt.buffer();
            stemmer.setCurrent(original, termAtt.length());
            stemmer.stem();

            char[] stemmed       = stemmer.CurrentBuffer;
            int    stemmedLength = stemmer.CurrentBufferLength;

            if (stemmed == original)
            {
                // The stemmer worked on the term's own array: only the length changes.
                termAtt.Length = stemmedLength;
            }
            else
            {
                termAtt.copyBuffer(stemmed, 0, stemmedLength);
            }
            return true;
        }
Beispiel #5
0
        /// <summary>
        /// Keeps the current token only when its term length (in UTF-16 units)
        /// lies within the inclusive range [<c>min</c>, <c>max</c>].
        /// </summary>
        public override bool accept()
        {
            int termLength = termAtt.length();

            if (termLength < min)
            {
                return false;
            }
            return termLength <= max;
        }
Beispiel #6
0
        /// <summary>
        /// Stores the current token in the <c>futureInputs</c> slot at
        /// <c>nextWrite</c> (captured state plus a copy of the term text), marks
        /// the slot as not consumed, and advances <c>nextWrite</c> via <c>rollIncr</c>.
        /// </summary>
        private void capture()
        {
            captureCount++;

            PendingInput slot = futureInputs[nextWrite];
            slot.state    = captureState();
            slot.consumed = false;
            slot.term.copyChars(termAtt.buffer(), 0, termAtt.length());

            nextWrite = rollIncr(nextWrite);

            // The write position must never catch up with the read position.
            Debug.Assert(nextWrite != nextRead);
        }
Beispiel #7
0
        /// <summary>
        /// Joins a run of tokens ending in '-' with the token that follows them
        /// into a single token. Tokens that are not part of such a run pass
        /// through unchanged. If the input ends while a run is still open, the
        /// collected text is emitted with a trailing '-' added back.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> at end of input.</returns>
        public override bool incrementToken()
        {
            while (!exhausted && input.incrementToken())
            {
                char[] chars = termAttribute.buffer();
                int    count = termAttribute.length();
                lastEndOffset = offsetAttribute.endOffset();

                bool endsWithHyphen = count > 0 && chars[count - 1] == '-';
                if (endsWithHyphen)
                {
                    // Only the first token of a run has its state captured.
                    if (savedState == null)
                    {
                        savedState = captureState();
                    }
                    hyphenated.Append(chars, 0, count - 1);
                    continue;
                }

                if (savedState != null)
                {
                    // Last piece of a hyphenated word: finish the join.
                    hyphenated.Append(chars, 0, count);
                    unhyphenate();
                }
                return true;
            }

            exhausted = true;

            if (savedState == null)
            {
                return false;
            }

            // The final term ended with a hyphen; add the hyphen back for
            // backwards compatibility.
            hyphenated.Append('-');
            unhyphenate();
            return true;
        }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and lowercases its term in
        /// place, with special handling for LATIN_CAPITAL_LETTER_I and for
        /// COMBINING_DOT_ABOVE when it follows that letter (optionally after
        /// other non-spacing marks).
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        // NOTE(review): char.codePointAt, char.getType, char.NON_SPACING_MARK and
        // char.toChars are Java Character API names left by the converter; their
        // C# definitions are not visible here - confirm they resolve.
        public override bool incrementToken()
        {
            // True while positioned on a capital I or on non-spacing marks that follow one.
            bool iOrAfter = false;

            if (input.incrementToken())
            {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final char[] buffer = termAtt.buffer();
                char[] buffer = termAtt.buffer();
                int    length = termAtt.length();
                // No increment clause: every path through the body advances i itself
                // (or shrinks length when a character is deleted).
                for (int i = 0; i < length;)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int ch = Character.codePointAt(buffer, i, length);
                    int ch = char.codePointAt(buffer, i, length);

                    iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && char.getType(ch) == char.NON_SPACING_MARK));

                    if (iOrAfter)     // all the special I turkish handling happens here.
                    {
                        switch (ch)
                        {
                        // remove COMBINING_DOT_ABOVE to mimic composed lowercase
                        case COMBINING_DOT_ABOVE:
                            // delete() returns the new length; i is left unchanged so the
                            // next iteration looks at whatever now sits at this index.
                            length = delete(buffer, i, length);
                            continue;

                        // i itself, it depends if it is followed by COMBINING_DOT_ABOVE
                        // if it is, we will make it small i and later remove the dot
                        case LATIN_CAPITAL_LETTER_I:
                            if (isBeforeDot(buffer, i + 1, length))
                            {
                                buffer[i] = (char)LATIN_SMALL_LETTER_I;
                            }
                            else
                            {
                                buffer[i] = (char)LATIN_SMALL_LETTER_DOTLESS_I;
                                // below is an optimization. no COMBINING_DOT_ABOVE follows,
                                // so don't waste time calculating Character.getType(), etc
                                iOrAfter = false;
                            }
                            i++;
                            continue;
                        }
                    }

                    // Default path: write the lowercased code point at i and advance by
                    // the value toChars returns (presumably the number of chars written).
                    i += char.toChars(char.ToLower(ch), buffer, i);
                }

                // Deletions above may have shortened the term.
                termAtt.Length = length;
                return(true);
            }
            else
            {
                return(false);
            }
        }
        /// <summary>
        /// Copies the scanner's current text into <c>termAtt</c> and sets the
        /// token's offsets from the scanner position and the term length, each
        /// passed through <c>correctOffset</c>.
        /// </summary>
        private void setupToken()
        {
            scanner.getText(termAtt);

            int tokenStart = scanner.yychar();
            offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenStart + termAtt.length()));
        }
        /// <summary>
        /// Advances to the next scanner token that is no longer than
        /// <c>maxTokenLength</c> and fills in its term text, offsets, type and
        /// position increment. Longer tokens are dropped, but each dropped token
        /// adds one to the position increment of the token eventually returned.
        /// Tokens of type <c>ACRONYM_DEP</c> are re-typed as <c>HOST</c> and lose
        /// their last character.
        /// </summary>
        /// <returns><c>false</c> once the scanner reports <c>YYEOF</c>; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            clearAttributes();
            skippedPositions = 0;

            for (int tokenType = scanner.NextToken;
                 tokenType != StandardTokenizerInterface_Fields.YYEOF;
                 tokenType = scanner.NextToken)
            {
                if (scanner.yylength() > maxTokenLength)
                {
                    // Too long: skip it, but remember the gap it leaves.
                    skippedPositions++;
                    continue;
                }

                posIncrAtt.PositionIncrement = skippedPositions + 1;
                scanner.getText(termAtt);
                int tokenStart = scanner.yychar();
                offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenStart + termAtt.length()));

                // The ACRONYM_DEP branch should be removed in the next release. For
                // now it converts invalid acronyms to HOST; once removed, only the
                // plain type assignment should remain.
                bool deprecatedAcronym = tokenType == StandardTokenizer.ACRONYM_DEP;
                if (!deprecatedAcronym)
                {
                    typeAtt.Type = StandardTokenizer.TOKEN_TYPES[tokenType];
                    return true;
                }

                typeAtt.Type   = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];
                termAtt.Length = termAtt.length() - 1;   // remove extra '.'
                return true;
            }

            return false;
        }
Beispiel #11
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and passes its term buffer
        /// to <c>charUtils.ToUpper</c> for the range [0, term length).
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            bool hasToken = input.incrementToken();

            if (hasToken)
            {
                charUtils.ToUpper(termAtt.buffer(), 0, termAtt.length());
            }
            return hasToken;
        }
Beispiel #12
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and strips leading and
        /// trailing whitespace from its term. A term that is entirely whitespace
        /// becomes empty. When <c>updateOffsets</c> is set and the offset span
        /// equals the original term length, the offsets are narrowed to match.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] termBuffer = termAtt.buffer();
            int    len        = termAtt.length();

            //TODO: Is this the right behavior or should we return false?  Currently, "  ", returns true, so I think this should
            //also return true
            if (len == 0)
            {
                return true;
            }

            // Skip over leading whitespace.
            int start = 0;
            while (start < len && char.IsWhiteSpace(termBuffer[start]))
            {
                start++;
            }

            // Walk back over trailing whitespace, counting what is dropped.
            int end    = len;
            int endOff = 0;
            while (end >= start && char.IsWhiteSpace(termBuffer[end - 1]))
            {
                endOff++;
                end--;
            }

            bool nothingTrimmed = start == 0 && end >= len;
            if (nothingTrimmed)
            {
                return true;
            }

            bool hasContent = start < end;
            if (hasContent)
            {
                termAtt.copyBuffer(termBuffer, start, (end - start));
            }
            else
            {
                termAtt.setEmpty();
            }

            if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset())
            {
                int newStart = offsetAtt.startOffset() + start;
                int newEnd   = offsetAtt.endOffset() - (hasContent ? endOff : 0);
                offsetAtt.setOffset(newStart, newEnd);
            }

            return true;
        }
        // Lowercase o with diaeresis (U+00F6).
        // NOTE(review): the _se suffix presumably marks the Swedish spelling variant - confirm.
        private const char oe_se = '\u00F6'; //ö


//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and folds its term in place:
        /// characters equal to the <c>aa</c>/<c>ae</c>/<c>oe</c> constants (and
        /// their uppercase and <c>_se</c> variants) become plain a/A/o/O, and the
        /// second letter of the digraphs aa, ae, ao, oe, oo (in any case) is deleted.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] chars     = charTermAttribute.buffer();
            int    remaining = charTermAttribute.length();

            for (int pos = 0; pos < remaining; pos++)
            {
                char current = chars[pos];

                if (current == aa || current == ae_se || current == ae)
                {
                    chars[pos] = 'a';
                    continue;
                }
                if (current == AA || current == AE_se || current == AE)
                {
                    chars[pos] = 'A';
                    continue;
                }
                if (current == oe || current == oe_se)
                {
                    chars[pos] = 'o';
                    continue;
                }
                if (current == OE || current == OE_se)
                {
                    chars[pos] = 'O';
                    continue;
                }
                if (pos >= remaining - 1)
                {
                    // Last character: no following letter to form a digraph with.
                    continue;
                }

                char next = chars[pos + 1];

                bool digraphOnA = (current == 'a' || current == 'A')
                                  && (next == 'a' || next == 'A' || next == 'e' || next == 'E' || next == 'o' || next == 'O');
                bool digraphOnO = (current == 'o' || current == 'O')
                                  && (next == 'e' || next == 'E' || next == 'o' || next == 'O');

                if (digraphOnA || digraphOnO)
                {
                    // Drop the second letter; delete() returns the shortened length.
                    remaining = StemmerUtil.delete(chars, pos + 1, remaining);
                }
            }

            charTermAttribute.Length = remaining;
            return true;
        }
Beispiel #14
0
            /// <summary>
            /// Sugar: runs <paramref name="text"/> through the analyzer and writes
            /// the resulting terms into <paramref name="reuse"/>, separated by
            /// <c>SynonymMap.WORD_SEPARATOR</c>.
            /// </summary>
            /// <param name="text">Text to analyze.</param>
            /// <param name="reuse">Destination; it and its chars must not be null.</param>
            /// <returns>The same <paramref name="reuse"/> instance, filled in.</returns>
            /// <exception cref="System.ArgumentException">
            /// When analysis yields a zero-length token, a token whose position
            /// increment is not 1, or no tokens at all.
            /// </exception>
            public virtual CharsRef analyze(string text, CharsRef reuse)
            {
                IOException priorException = null;
                TokenStream ts             = analyzer.tokenStream("", text);

                try
                {
                    CharTermAttribute          termAtt   = ts.addAttribute(typeof(CharTermAttribute));
                    PositionIncrementAttribute posIncAtt = ts.addAttribute(typeof(PositionIncrementAttribute));

                    ts.reset();
                    reuse.length = 0;

                    while (ts.incrementToken())
                    {
                        int tokenLength = termAtt.length();
                        if (tokenLength == 0)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a zero-length token");
                        }
                        if (posIncAtt.PositionIncrement != 1)
                        {
                            throw new System.ArgumentException("term: " + text + " analyzed to a token with posinc != 1");
                        }

                        // Room for what is already there + this word + one separator.
                        reuse.grow(reuse.length + tokenLength + 1);

                        int writeAt = reuse.offset + reuse.length;
                        if (reuse.length > 0)
                        {
                            // Not the first word: put a separator in front of it.
                            reuse.chars[writeAt] = SynonymMap.WORD_SEPARATOR;
                            writeAt++;
                            reuse.length++;
                        }

                        Array.Copy(termAtt.buffer(), 0, reuse.chars, writeAt, tokenLength);
                        reuse.length += tokenLength;
                    }
                    ts.end();
                }
                catch (IOException e)
                {
                    priorException = e;
                }
                finally
                {
                    IOUtils.closeWhileHandlingException(priorException, ts);
                }

                if (reuse.length == 0)
                {
                    throw new System.ArgumentException("term: " + text + " was completely eliminated by analyzer");
                }
                return reuse;
            }
Beispiel #15
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private SlowSynonymMap match(SlowSynonymMap map) throws java.io.IOException
        /// <summary>
        /// Tries to extend a match by reading the next token and looking it up
        /// in <paramref name="map"/>'s submap, recursing as long as deeper
        /// entries exist. Tokens that belong to the match are prepended to
        /// <c>matched</c>; a token that does not extend it is pushed back.
        /// </summary>
        /// <param name="map">The map node reached so far.</param>
        /// <returns>
        /// The deepest node reached through a successful match; otherwise
        /// <paramref name="map"/> itself when it has synonyms; otherwise <c>null</c>.
        /// </returns>
        private SlowSynonymMap match(SlowSynonymMap map)
        {
            SlowSynonymMap  longer = null;
            AttributeSource tok    = (map.submap != null) ? nextTok() : null;

            if (tok != null)
            {
                // clone ourselves.
                if (tok == this)
                {
                    tok = cloneAttributes();
                }

                // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
                CharTermAttribute termAtt = tok.getAttribute(typeof(CharTermAttribute));
                SlowSynonymMap    subMap  = map.submap.get(termAtt.buffer(), 0, termAtt.length());

                // Recurse only when this token leads somewhere.
                longer = (subMap != null) ? match(subMap) : null;

                if (longer == null)
                {
                    // push back unmatched token
                    pushTok(tok);
                }
                else
                {
                    matched.AddFirst(tok);
                }
            }

            // No longer sequence matched: if this node has synonyms, it is the match.
            return (longer == null && map.synonyms != null) ? map : longer;
        }
Beispiel #16
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and reverses its term in
        /// place. When <c>marker</c> is not <c>NOMARKER</c>, the marker is
        /// appended to the term before the reversal.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            int reversedLength = termAtt.length();

            if (marker != NOMARKER)
            {
                // Grow by one and place the marker at the end, ahead of the reversal.
                reversedLength++;
                termAtt.resizeBuffer(reversedLength);
                termAtt.buffer()[reversedLength - 1] = marker;
            }

            reverse(matchVersion, termAtt.buffer(), 0, reversedLength);
            termAtt.Length = reversedLength;
            return true;
        }
Beispiel #17
0
        /// <summary>
        /// Advances the wrapped stream by one token and strips a trailing
        /// possessive: an apostrophe followed by 's' or 'S'. From LUCENE_36
        /// onwards, U+2019 and U+FF07 are also accepted as the apostrophe.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] chars = termAtt.buffer();
            int    count = termAtt.length();

            if (count < 2)
            {
                return true;
            }

            char quote = chars[count - 2];
            char last  = chars[count - 1];

            bool isApostrophe = quote == '\''
                                || (matchVersion.onOrAfter(Version.LUCENE_36) && (quote == '\u2019' || quote == '\uFF07'));

            if (isApostrophe && (last == 's' || last == 'S'))
            {
                termAtt.Length = count - 2;   // Strip last 2 characters off
            }

            return true;
        }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public final boolean incrementTokenClassic() throws java.io.IOException
	  /// <summary>
	  /// Advances the wrapped stream by one token and normalizes it by type:
	  /// tokens of <c>APOSTROPHE_TYPE</c> ending in 's or 'S lose those two
	  /// characters, and tokens of <c>ACRONYM_TYPE</c> have every '.' removed.
	  /// </summary>
	  /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
	  public bool incrementTokenClassic()
	  {
		if (!input.incrementToken())
		{
		  return false;
		}

		char[] chars = termAtt.buffer();
		int count = termAtt.length();
		string tokenType = typeAtt.type();

		bool possessive = tokenType == APOSTROPHE_TYPE
						  && count >= 2
						  && chars[count - 2] == '\''
						  && (chars[count - 1] == 's' || chars[count - 1] == 'S');

		if (possessive)
		{
		  // remove 's: strip last 2 characters off
		  termAtt.Length = count - 2;
		  return true;
		}

		if (tokenType == ACRONYM_TYPE)
		{
		  // remove dots: compact the kept characters towards the front
		  int kept = 0;
		  int read = 0;
		  while (read < count)
		  {
			char c = chars[read];
			read++;
			if (c == '.')
			{
			  continue;
			}
			chars[kept] = c;
			kept++;
		  }
		  termAtt.Length = kept;
		}

		return true;
	  }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private static java.util.List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws java.io.IOException
        /// <summary>
        /// Tokenizes <paramref name="source"/> with a tokenizer obtained from
        /// <paramref name="tokFactory"/> and returns the text of every non-empty
        /// token, in order.
        /// </summary>
        /// <param name="source">Text to split.</param>
        /// <param name="tokFactory">Factory handed to <c>loadTokenizer</c>.</param>
        /// <returns>A new list of token strings; empty terms are omitted.</returns>
        private static IList <string> splitByTokenizer(string source, TokenizerFactory tokFactory)
        {
            StringReader   reader  = new StringReader(source);
            TokenStream    ts      = loadTokenizer(tokFactory, reader);
            IList <string> tokList = new List <string>();

            try
            {
                CharTermAttribute termAtt = ts.addAttribute(typeof(CharTermAttribute));
                ts.reset();
                while (ts.incrementToken())
                {
                    if (termAtt.length() > 0)
                    {
                        tokList.Add(termAtt.ToString());
                    }
                }
                // Complete the consumer workflow: signal end-of-stream to the tokenizer.
                ts.end();
            }
            finally
            {
                // Release the token stream as well as the reader; the nested
                // finally guarantees the reader is closed even if ts.close() throws.
                try
                {
                    ts.close();
                }
                finally
                {
                    reader.close();
                }
            }
            return tokList;
        }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Emits one token per call. While captures remain for the input token
        /// read last, each call restores that token's saved state and emits the
        /// next captured span at position increment 0. Otherwise a new input
        /// token is read, every matcher is reset against a copy of its text, and
        /// either the original token (when <c>preserveOriginal</c> is set) or the
        /// first captured span is emitted.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        public override bool incrementToken()
        {
            // Pending captures from the previously read input token are served first.
            if (currentMatcher != -1 && nextCapture())
            {
                Debug.Assert(state != null);
                clearAttributes();
                restoreState(state);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
                int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);
                int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);

                // Captured spans are emitted with a position increment of 0.
                posAttr.PositionIncrement = 0;
                charTermAttr.copyBuffer(spare.chars, start, end - start);
                // Move on to the next group of this matcher for the following call.
                currentGroup[currentMatcher]++;
                return(true);
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            char[] buffer = charTermAttr.buffer();
            int    length = charTermAttr.length();

            // Keep a copy of the term text in spare and remember the token state,
            // so later calls can restore it for each capture.
            spare.copyChars(buffer, 0, length);
            state = captureState();

            // Point every matcher at the new text and restart its group counter.
            for (int i = 0; i < matchers.Length; i++)
            {
                matchers[i].reset(spare);
                currentGroup[i] = -1;
            }

            if (preserveOriginal)
            {
                // Emit the token unchanged now; captures follow on later calls.
                currentMatcher = 0;
            }
            else if (nextCapture())
            {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
                int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);
                int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);

                // if we start at 0 we can simply set the length and save the copy
                if (start == 0)
                {
                    charTermAttr.Length = end;
                }
                else
                {
                    charTermAttr.copyBuffer(spare.chars, start, end - start);
                }
                currentGroup[currentMatcher]++;
            }
            // When nothing was captured and preserveOriginal is off, the input
            // token is still returned with its term unchanged.
            return(true);
        }
Beispiel #21
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Advances the wrapped stream by one token and normalizes its term in
        /// place with a small state machine over the states <c>N</c>, <c>U</c>
        /// and <c>V</c>: 'ä', 'ö', 'ü' become 'a', 'o', 'u'; 'ß' becomes "ss";
        /// and an 'e' seen while in state <c>U</c> is deleted.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream has no more tokens; otherwise <c>true</c>.</returns>
        // NOTE(review): N, U and V are declared outside this view. From the
        // transitions below, U is entered after 'a', 'o', or a 'u' seen in
        // state N, and is the only state in which a following 'e' is removed.
        public override bool incrementToken()
        {
            if (input.incrementToken())
            {
                int    state  = N;
                char[] buffer = termAtt.buffer();
                int    length = termAtt.length();
                for (int i = 0; i < length; i++)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final char c = buffer[i];
                    char c = buffer[i];
                    switch (c)
                    {
                    case 'a':
                    case 'o':
                        state = U;
                        break;

                    case 'u':
                        // 'u' only enters U when coming from N; otherwise it goes to V.
                        state = (state == N) ? U : V;
                        break;

                    case 'e':
                        if (state == U)
                        {
                            // Delete this 'e'; i-- compensates for the loop's i++ so the
                            // character that moves into this slot is examined next.
                            length = StemmerUtil.delete(buffer, i--, length);
                        }
                        state = V;
                        break;

                    case 'i':
                    case 'q':
                    case 'y':
                        state = V;
                        break;

                    case 'ä':
                        buffer[i] = 'a';
                        state     = V;
                        break;

                    case 'ö':
                        buffer[i] = 'o';
                        state     = V;
                        break;

                    case 'ü':
                        buffer[i] = 'u';
                        state     = V;
                        break;

                    case 'ß':
                        // Replace with "ss": write the first 's' over 'ß', make room for
                        // one more char, shift the tail right by one, then write the second 's'.
                        buffer[i++] = 's';
                        // resizeBuffer's return value replaces the local buffer reference.
                        buffer      = termAtt.resizeBuffer(1 + length);
                        if (i < length)
                        {
                            Array.Copy(buffer, i, buffer, i + 1, (length - i));
                        }
                        buffer[i] = 's';
                        length++;
                        state = N;
                        break;

                    default:
                        state = N;
                        break;
                    }
                }
                // Deletions and the 'ß' expansion may have changed the term length.
                termAtt.Length = length;
                return(true);
            }
            else
            {
                return(false);
            }
        }
Beispiel #22
0
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Produces the next token of the synonym-expanded stream.
        /// <para>
        /// Tokens already queued in <c>replacement</c> are returned first and are never matched
        /// again. Otherwise the next token from <c>nextTok()</c> is looked up in <c>map.submap</c>;
        /// if <c>match</c> finds a rule, the rule's synonyms are merged with the original tokens
        /// (only when <c>includeOrig()</c> is true), position increments are recomputed, and the
        /// merged list is queued so the next loop iteration can return its first element.
        /// </para>
        /// </summary>
        /// <returns> true when a token was copied into this stream's attributes; false when
        /// <c>nextTok()</c> returns null </returns>
        public override bool incrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                // (MoveNext/Current is the enumerator equivalent of Java's hasNext/next.)
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                CharTermAttribute termAtt = firstTok.addAttribute(typeof(CharTermAttribute));
                SlowSynonymMap    result  = map.submap != null?map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = cloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList<AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.includeOrig();

                AttributeSource            origTok        = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                      repTok       = result.synonyms[i];
                    AttributeSource            newTok       = firstTok.cloneAttributes();
                    CharTermAttribute          newTermAtt   = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute            newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    // newTok is a clone of firstTok, so this keeps the first token's start offset
                    // and stretches the end offset to the end of the last matched token.
                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos    += origPosInc.PositionIncrement;
                        origTok = pollFirstMatched();
                        if (origTok != null)
                        {
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos    += origPosInc.PositionIncrement;
                    origTok = pollFirstMatched();
                    if (origTok != null)
                    {
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }

        /// <summary>
        /// Removes and returns the head of <c>matched</c>, or returns null when the list is empty.
        /// LinkedList.RemoveFirst() returns void, so the value is read before the node is removed.
        /// </summary>
        private AttributeSource pollFirstMatched()
        {
            if (matched.Count == 0)
            {
                return(null);
            }
            AttributeSource head = matched.First.Value;
            matched.RemoveFirst();
            return(head);
        }
Beispiel #23
0
        /// <summary>
        /// Emits the next n-gram of the current input term, pulling a new term from
        /// <c>input</c> whenever the current one has been fully expanded.
        /// <para>
        /// When <c>version</c> is on or after LUCENE_44, grams are measured in code points,
        /// every gram keeps the whole term's offsets, and all grams of one start position are
        /// emitted before moving on. Otherwise grams are measured in chars, all grams of one
        /// size are emitted before the size grows, and offsets are narrowed to the gram unless
        /// <c>hasIllegalOffsets</c> is set.
        /// </para>
        /// </summary>
        /// <returns> true when a gram was written to the attributes; false once <c>input</c>
        /// has no more tokens </returns>
        public override bool incrementToken()
        {
            for (;;)
            {
                if (curTermBuffer == null)
                {
                    // nothing is being expanded right now: fetch the next term
                    if (!input.incrementToken())
                    {
                        return(false);
                    }

                    curTermBuffer     = termAtt.buffer().clone();
                    curTermLength     = termAtt.length();
                    curCodePointCount = charUtils.codePointCount(termAtt);
                    curGramSize       = minGram;
                    curPos            = 0;
                    curPosInc         = posIncAtt.PositionIncrement;
                    curPosLen         = posLenAtt.PositionLength;
                    tokStart          = offsetAtt.startOffset();
                    tokEnd            = offsetAtt.endOffset();
                    // a term whose length disagrees with its offset span is assumed to be
                    // a synonym, so its offsets are never narrowed to the gram.
                    hasIllegalOffsets = tokEnd != (tokStart + curTermLength);
                }

                if (version.onOrAfter(Version.LUCENE_44))
                {
                    bool sizeExhausted = curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount;
                    if (sizeExhausted)
                    {
                        // all sizes for this start position are done: advance the start
                        ++curPos;
                        curGramSize = minGram;
                    }

                    if ((curPos + curGramSize) <= curCodePointCount)
                    {
                        clearAttributes();
                        int gramStart = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int gramEnd   = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, gramStart, curGramSize);
                        termAtt.copyBuffer(curTermBuffer, gramStart, gramEnd - gramStart);
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;   // only the first gram of a term carries the increment
                        posLenAtt.PositionLength = curPosLen;
                        offsetAtt.setOffset(tokStart, tokEnd);
                        curGramSize++;
                        return(true);
                    }
                }
                else
                {
                    while (curGramSize <= maxGram)
                    {
                        if (curPos + curGramSize <= curTermLength)   // still input left at this size
                        {
                            clearAttributes();
                            termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);

                            int gramOffsetStart = hasIllegalOffsets ? tokStart : tokStart + curPos;
                            int gramOffsetEnd   = hasIllegalOffsets ? tokEnd : tokStart + curPos + curGramSize;
                            offsetAtt.setOffset(gramOffsetStart, gramOffsetEnd);

                            curPos++;
                            return(true);
                        }

                        // this size is exhausted: try the next larger one from the beginning
                        curGramSize++;
                        curPos = 0;
                    }
                }

                // the current term is fully expanded; force a fetch on the next pass
                curTermBuffer = null;
            }
        }
Beispiel #24
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Emits the next token, splitting terms whose first character is in the THAI Unicode
        /// block into the segments reported by <c>breaker</c>.
        /// <para>
        /// While <c>hasMoreTokensInClone</c> is set, the remaining segments of the previously
        /// cloned token are returned one per call. Otherwise the next token is read from
        /// <c>input</c>; empty terms and terms not starting with a Thai character pass through
        /// unchanged, and Thai terms are cloned and their first segment is returned.
        /// </para>
        /// </summary>
        /// <returns> true when a token is available in the attributes; false when <c>input</c>
        /// is exhausted or <c>breaker</c> reports no boundary for a freshly read term </returns>
        public override bool incrementToken()
        {
            if (hasMoreTokensInClone)
            {
                // Locals are named differently from the one further down: C# rejects a
                // nested-scope local that shares a name with a local of the enclosing scope.
                int segmentStart = breaker.current();
                int segmentEnd   = breaker.next();
                if (segmentEnd != BreakIterator.DONE)
                {
                    clonedToken.copyTo(this);
                    termAtt.copyBuffer(clonedTermAtt.buffer(), segmentStart, segmentEnd - segmentStart);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                    }
                    else
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + segmentStart, clonedOffsetAtt.startOffset() + segmentEnd);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            if (termAtt.length() == 0 || char.UnicodeBlock.of(termAtt.charAt(0)) != char.UnicodeBlock.THAI)
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = cloneAttributes();
                clonedTermAtt   = clonedToken.getAttribute(typeof(CharTermAttribute));
                clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
            }
            else
            {
                this.copyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
            breaker.Text = charIterator;
            int firstSegmentEnd = breaker.next();

            if (firstSegmentEnd != BreakIterator.DONE)
            {
                // the first segment starts at index 0, so truncating the term is enough
                termAtt.Length = firstSegmentEnd;
                if (hasIllegalOffsets)
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                }
                else
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + firstSegmentEnd);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
 /// <summary>
 /// Asserts that <paramref name="term"/> has the same length as <paramref name="expected"/>
 /// and that its buffer matches <paramref name="expected"/> character by character.
 /// </summary>
 /// <param name="term"> term attribute under test </param>
 /// <param name="expected"> text the term is expected to hold </param>
 private void assertEquals(CharTermAttribute term, string expected)
 {
     assertEquals(expected.Length, term.length());

     char[] termChars = term.buffer();
     int    pos       = 0;
     while (pos < expected.Length)
     {
       assertEquals(expected[pos], termChars[pos]);
       pos++;
     }
 }
Beispiel #26
0
 /// <summary>
 /// Decides whether the current term is kept: returns true when the term held by
 /// <c>termAtt</c> is not contained in <c>stopWords</c>, false when it is.
 /// </summary>
 protected internal override bool Accept()
 {
     bool isStopWord = stopWords.contains(termAtt.buffer(), 0, termAtt.length());
     return(!isStopWord);
 }