Example #1
        /*
         * (non-Javadoc)
         *
         * @see Lucene.Net.Analysis.TokenStream#IncrementToken()
         */
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return(false);
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    posIncrAtt.SetPositionIncrement(posIncr);
                    scanner.GetText(termAtt);
                    int start = scanner.Yychar();
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                            termAtt.SetTermLength(termAtt.TermLength() - 1);                             // remove extra '.'
                        }
                        else
                        {
                            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                        }
                    }
                    else
                    {
                        typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                    }
                    return(true);
                }
                else
                {
                    // When we skip a too-long term, we still increment the
                    // position increment.
                    posIncr++;
                }
            }
        }
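Example #1 is StandardTokenizer's tokenization loop: it pulls raw tokens from the generated scanner, skips any token longer than maxTokenLength while accumulating the position increment, and maps the scanner's token type onto the type attribute (with the deprecated ACRONYM_DEP special case). To see how such a stream is driven from the outside, here is a minimal consumer loop; it is a sketch assuming the 2.9-era Lucene.Net API (Type-based AddAttribute, method-style TermAttribute accessors) that these examples use:

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Analysis.Tokenattributes;

    class TokenDump
    {
        static void Main()
        {
            // The single-argument constructor is the deprecated 2.9-era form;
            // later versions also take a Version argument.
            TokenStream ts = new StandardTokenizer(
                new StringReader("IBM's T.L.A. example, info@example.com"));

            // Attribute instances are shared and refilled in place on every
            // successful IncrementToken() call.
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
            TypeAttribute typeAtt = (TypeAttribute)ts.AddAttribute(typeof(TypeAttribute));

            while (ts.IncrementToken())
            {
                Console.WriteLine("{0}\t{1}", termAtt.Term(), typeAtt.Type());
            }
            ts.Close();
        }
    }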
Example #2
        /// <summary>Advances to the next token in the stream; returns false at end of stream.
        /// <p/>Removes <tt>'s</tt> from the end of words.
        /// <p/>Removes dots from acronyms.
        /// </summary>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return(false);
            }

            char[] buffer       = termAtt.TermBuffer();
            int    bufferLength = termAtt.TermLength();

            System.String type = typeAtt.Type();

            // Token types here are interned string constants, so the
            // System.Object casts force a cheap reference comparison.
            if ((System.Object)type == (System.Object)APOSTROPHE_TYPE &&
                bufferLength >= 2 && buffer[bufferLength - 2] == '\'' &&
                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                termAtt.SetTermLength(bufferLength - 2);
            }
            else if ((System.Object)type == (System.Object)ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                termAtt.SetTermLength(upto);
            }

            return(true);
        }
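The (System.Object) casts in Example #2 look odd but are deliberate: the token type strings in this codebase are interned constants, so a reference comparison is a cheap substitute for character-by-character equality. A self-contained illustration of why that works (the literal below is only a stand-in for the real APOSTROPHE_TYPE constant):

    using System;

    class InternDemo
    {
        static void Main()
        {
            string a = "<APOSTROPHE>";
            // Force a distinct string instance, then intern it: Intern returns
            // the canonical instance already created for the literal above.
            string b = string.Intern(new string("<APOSTROPHE>".ToCharArray()));

            Console.WriteLine((object)a == (object)b); // True: same reference
            Console.WriteLine(ReferenceEquals(a, b));  // True
        }
    }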
Example #3
 public override bool IncrementToken()
 {
     if (input.IncrementToken())
     {
         char[] buffer = termAtt.TermBuffer();
         int    length = termAtt.TermLength();
         // If no characters actually require rewriting then we
         // just return token as-is:
         for (int i = 0; i < length; i++)
         {
             char c = buffer[i];
             if (c >= '\u00c0' && c <= '\uFB06')
             {
                 RemoveAccents(buffer, length);
                 termAtt.SetTermBuffer(output, 0, outputPos);
                 break;
             }
         }
         return(true);
     }
     else
     {
         return(false);
     }
 }
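Example #3 scans for the first character in the accented range ('\u00C0' through '\uFB06') and only then pays for the rewrite; RemoveAccents, output, and outputPos live elsewhere in the filter class and are not shown. As a rough idea of what that helper does, here is a hypothetical, heavily abbreviated fold (the real ISOLatin1AccentFilter covers far more characters and can expand ligatures, which is why it keeps a separate, growable output buffer):

    using System;
    using System.Text;

    static class AccentFoldDemo
    {
        // Hypothetical stand-in for the filter's RemoveAccents/output/outputPos
        // trio; folds a handful of Latin-1 characters to their ASCII base.
        static string Fold(string s)
        {
            StringBuilder sb = new StringBuilder(s.Length);
            foreach (char c in s)
            {
                switch (c)
                {
                    case '\u00E0': case '\u00E1': case '\u00E2': sb.Append('a'); break;
                    case '\u00E8': case '\u00E9': case '\u00EA': sb.Append('e'); break;
                    case '\u00E6': sb.Append("ae"); break; // ligatures may expand
                    default: sb.Append(c); break;
                }
            }
            return sb.ToString();
        }

        static void Main()
        {
            Console.WriteLine(Fold("r\u00E9sum\u00E9")); // resume
        }
    }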
Example #4
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return(false);
            }

            if (stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength()))
            {
                termAtt.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
            }
            return(true);
        }
Example #5
 /// <summary> Advances to the next input token whose term length is within the configured bounds.</summary>
 public override bool IncrementToken()
 {
     // return the first token whose length is within bounds
     while (input.IncrementToken())
     {
         int len = termAtt.TermLength();
         if (len >= min && len <= max)
         {
             return(true);
         }
         // note: else we ignore it but should we index each part of it?
     }
     // reached end of stream -- return false
     return(false);
 }
Example #6
        public override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                char[] buffer = termAtt.TermBuffer();
                int    length = termAtt.TermLength();
                for (int i = 0; i < length; i++)
                {
                    buffer[i] = System.Char.ToLower(buffer[i]);
                }

                return(true);
            }
            else
            {
                return(false);
            }
        }
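One porting wrinkle in Example #6 is worth flagging: System.Char.ToLower(char) is culture-sensitive in .NET, so under a Turkish culture 'I' lowers to dotless 'ı' (U+0131) and indexed terms stop matching queries lowercased elsewhere. Char.ToLowerInvariant is the stable choice when terms must be culture-independent. A small demonstration:

    using System;
    using System.Globalization;
    using System.Threading;

    class LowerDemo
    {
        static void Main()
        {
            Thread.CurrentThread.CurrentCulture = new CultureInfo("tr-TR");

            // Culture-sensitive: Turkish maps 'I' to dotless 'ı' (U+0131).
            Console.WriteLine((int)char.ToLower('I'));          // 305
            // Invariant: always ASCII 'i' (U+0069).
            Console.WriteLine((int)char.ToLowerInvariant('I')); // 105
        }
    }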
Example #7
        /// <summary> Advances to the next input token whose term is not a stop word.</summary>
        public override bool IncrementToken()
        {
            // return the first non-stop word found
            int skippedPositions = 0;

            while (input.IncrementToken())
            {
                if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
                {
                    if (enablePositionIncrements)
                    {
                        posIncrAtt.SetPositionIncrement(posIncrAtt.GetPositionIncrement() + skippedPositions);
                    }
                    return(true);
                }
                skippedPositions += posIncrAtt.GetPositionIncrement();
            }
            // reached end of stream -- return false
            return(false);
        }
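The skippedPositions bookkeeping in Example #7 is what keeps phrase and proximity queries honest: every stop word dropped in front of a surviving token is added to that token's position increment, so the gap stays visible in the index. A library-free simulation of the same accounting (a hypothetical mini filter, not the Lucene.Net class):

    using System;
    using System.Collections.Generic;

    class StopSimulation
    {
        static void Main()
        {
            HashSet<string> stops = new HashSet<string> { "the", "of" };
            string[] tokens = "the rest of the story".Split(' ');

            int skipped = 0;
            foreach (string t in tokens)
            {
                if (stops.Contains(t)) { skipped++; continue; }
                // Each surviving token carries 1 + (stop words dropped right
                // before it), mirroring posIncrAtt in the filter above.
                Console.WriteLine("{0}\tposIncr={1}", t, 1 + skipped);
                skipped = 0;
            }
            // Prints: rest posIncr=2, story posIncr=3
        }
    }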
Example #8
            public override bool IncrementToken()
            {
                if (input.IncrementToken())
                {
                    System.String token = new System.String(termAtt.TermBuffer(), 0, termAtt.TermLength());

                    if (!nopayload.Contains(token))
                    {
                        if (entities.Contains(token))
                        {
                            payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(token + ":Entity:" + pos)));
                        }
                        else
                        {
                            payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(token + ":Noise:" + pos)));
                        }
                    }
                    pos += posIncrAtt.GetPositionIncrement();
                    return(true);
                }
                return(false);
            }
Example #9
        // Primary entry point (for first TermsHash)
        internal override void  Add()
        {
            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // We are first in the chain so we must "intern" the
            // term text into textStart address

            // Get the text of this term.
            char[] tokenText    = termAtt.TermBuffer();
            int    tokenTextLen = termAtt.TermLength();

            // Compute hashcode & replace any invalid UTF16 sequences
            int downto = tokenTextLen;
            int code   = 0;

            while (downto > 0)
            {
                char ch = tokenText[--downto];

                if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
                {
                    if (0 == downto)
                    {
                        // Unpaired
                        ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
                    }
                    else
                    {
                        char ch2 = tokenText[downto - 1];
                        if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                        {
                            // OK: high followed by low.  This is a valid
                            // surrogate pair.
                            code = ((code * 31) + ch) * 31 + ch2;
                            downto--;
                            continue;
                        }
                        else
                        {
                            // Unpaired
                            ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
                        }
                    }
                }
                else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff))
                {
                    // Unpaired or 0xffff
                    ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
                }

                code = (code * 31) + ch;
            }

            int hashPos = code & postingsHashMask;

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && !PostingEquals(tokenText, tokenTextLen))
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code   += inc;
                    hashPos = code & postingsHashMask;
                    p       = postingsHash[hashPos];
                } while (p != null && !PostingEquals(tokenText, tokenTextLen));
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.
                int textLen1 = 1 + tokenTextLen;
                if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
                    {
                        // Just skip this term, to remain as robust as
                        // possible during indexing.  A TokenFilter
                        // can be inserted into the analyzer chain if
                        // other behavior is wanted (pruning the term
                        // to a prefix, throwing an exception, etc).

                        if (docState.maxTermPrefix == null)
                        {
                            docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                        }

                        consumer.SkippingLongTerm();
                        return;
                    }
                    charPool.NextBuffer();
                }

                // Refill?
                if (0 == perThread.freePostingsCount)
                {
                    perThread.MorePostings();
                }

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                char[] text     = charPool.buffer;
                int    textUpto = charPool.charUpto;
                p.textStart        = textUpto + charPool.charOffset;
                charPool.charUpto += textLen1;
                Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
                text[textUpto + tokenTextLen] = (char)(0xffff);

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                {
                    RehashPostings(2 * postingsHashSize);
                }

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                {
                    intPool.NextBuffer();
                }

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                {
                    bytePool.NextBuffer();
                }

                intUptos         = intPool.buffer;
                intUptoStart     = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.NewTerm(p);
            }
            else
            {
                intUptos     = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.AddTerm(p);
            }

            if (doNextCall)
            {
                nextPerField.Add(p.textStart);
            }
        }
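The collision handling in Add() is open addressing over a power-of-two hash table: on a conflict, it reprobes with the step ((code >> 8) + code) | 1. The trailing | 1 forces the step to be odd, hence coprime with the table size, which guarantees the probe sequence visits every slot before repeating. A standalone illustration of that property (hypothetical toy table, unrelated to the DocumentsWriter pools):

    using System;

    class ProbeDemo
    {
        static void Main()
        {
            const int size = 8;                  // power-of-two table
            int code = 12345;
            int inc = ((code >> 8) + code) | 1;  // | 1 forces an odd step
            bool[] seen = new bool[size];

            int pos = code & (size - 1);
            for (int i = 0; i < size; i++)
            {
                seen[pos] = true;
                code += inc;
                pos = code & (size - 1);
            }

            // An odd step is coprime with 2^n, so every slot gets probed.
            Console.WriteLine(Array.TrueForAll(seen, s => s)); // True
        }
    }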