/// <summary>Advances to the next token, or returns false at end of stream.
/// <p/>Strips a trailing <tt>'s</tt> from words of apostrophe type.
/// <p/>Strips the dots out of acronym-typed tokens.
/// </summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();
    System.String tokenType = typeAtt.Type();

    // NOTE: matches the original's reference comparison — the type constants
    // are assumed to be shared string instances, so identity is sufficient.
    if (ReferenceEquals(tokenType, APOSTROPHE_TYPE)
        && len >= 2
        && term[len - 2] == '\''
        && (term[len - 1] == 's' || term[len - 1] == 'S'))
    {
        // Drop the trailing apostrophe + s/S ("John's" -> "John").
        termAtt.SetTermLength(len - 2);
    }
    else if (ReferenceEquals(tokenType, ACRONYM_TYPE))
    {
        // Compact the term in place, skipping every '.'.
        int dst = 0;
        for (int src = 0; src < len; src++)
        {
            char c = term[src];
            if (c != '.')
            {
                term[dst++] = c;
            }
        }
        termAtt.SetTermLength(dst);
    }

    return true;
}
/// <summary>Emits the entire remaining input as a single token; subsequent
/// calls return false.</summary>
public override bool IncrementToken()
{
    if (done)
    {
        return false;
    }

    ClearAttributes();
    done = true;

    // Slurp the whole reader into the term buffer, growing it whenever full.
    char[] term = termAtt.TermBuffer();
    int filled = 0;
    for (;;)
    {
        int read = input.Read(term, filled, term.Length - filled);
        if (read == 0)
        {
            break; // TextReader.Read returns 0 at end of input
        }
        filled += read;
        if (filled == term.Length)
        {
            term = termAtt.ResizeTermBuffer(1 + term.Length);
        }
    }

    termAtt.SetTermLength(filled);
    finalOffset = CorrectOffset(filled);
    offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
    return true;
}
/// <summary>Advances to the next token, folding accented characters where
/// present; returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] term = termAtt.TermBuffer();
    int len = termAtt.TermLength();

    // Fast path: most tokens are plain ASCII and are passed through
    // untouched. Only rewrite when a character in the candidate range
    // ['\u00c0', '\uFB06'] is found.
    for (int i = 0; i < len; i++)
    {
        char c = term[i];
        if (c >= '\u00c0' && c <= '\uFB06')
        {
            RemoveAccents(term, len);
            termAtt.SetTermBuffer(output, 0, outputPos);
            break;
        }
    }

    return true;
}
/// <summary>Advances to the next token and stems its term in place;
/// returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    // Only rewrite the term buffer when the stemmer reports success.
    bool stemmed = stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (stemmed)
    {
        termAtt.SetTermBuffer(stemmer.GetResultBuffer(), 0, stemmer.GetResultLength());
    }

    return true;
}
/// <summary>Advances to the next token and lower-cases its term in place;
/// returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    // NOTE(review): Char.ToLower is culture-sensitive (e.g. Turkish 'I');
    // kept as-is to preserve the original port's behavior — confirm whether
    // invariant casing was intended.
    char[] term = termAtt.TermBuffer();
    for (int i = termAtt.TermLength() - 1; i >= 0; i--)
    {
        term[i] = System.Char.ToLower(term[i]);
    }

    return true;
}
/// <summary>Advances past stop words, returning true at the first token
/// whose term is not in the stop set; false once the stream is exhausted.</summary>
public override bool IncrementToken()
{
    // Count the position increments of every dropped stop word so they can
    // be folded into the surviving token.
    int skipped = 0;
    while (input.IncrementToken())
    {
        bool isStopWord = stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength());
        if (!isStopWord)
        {
            if (enablePositionIncrements)
            {
                posIncrAtt.SetPositionIncrement(posIncrAtt.GetPositionIncrement() + skipped);
            }
            return true;
        }
        skipped += posIncrAtt.GetPositionIncrement();
    }

    // Reached end of stream without finding a non-stop word.
    return false;
}
/// <summary>Advances to the next token, attaching a payload that tags the
/// term as Entity or Noise (plus its position) unless the term is listed in
/// the no-payload set; returns false at end of stream.</summary>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    System.String term = new System.String(termAtt.TermBuffer(), 0, termAtt.TermLength());
    if (!nopayload.Contains(term))
    {
        // Same payload shape as before: "<term>:<kind>:<pos>" in UTF-8.
        System.String kind = entities.Contains(term) ? ":Entity:" : ":Noise:";
        payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(term + kind + pos)));
    }

    pos += posIncrAtt.GetPositionIncrement();
    return true;
}
/// <summary>Emits at most one token, built from a single character read from
/// the input; returns false on every call after the first.</summary>
public override bool IncrementToken()
{
    // BUGFIX: the read used to execute before this guard, so every call
    // after exhaustion still consumed a character from the shared reader.
    if (done)
    {
        return false;
    }

    int count = input.Read((System.Char[])buffer, 0, buffer.Length);
    ClearAttributes();
    done = true;
    if (count == 1)
    {
        // Exactly one character available: emit it as the term.
        termAtt.TermBuffer()[0] = buffer[0];
        termAtt.SetTermLength(1);
    }
    else
    {
        // Nothing read (end of input): emit an empty term.
        termAtt.SetTermLength(0);
    }
    return true;
}
/// <summary>Scans the buffered input for the next maximal run of token
/// characters (per <c>IsTokenChar</c>), normalizing each character into the
/// term buffer and recording corrected start/end offsets. Returns false when
/// the input is exhausted and no partial token is pending.</summary>
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;           // number of chars accumulated into the term so far
    int start = bufferIndex;  // provisional token start; fixed up on first token char
    char[] buffer = termAtt.TermBuffer();
    while (true)
    {
        // Refill the I/O buffer when it has been fully consumed.
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            dataLen = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
            if (dataLen <= 0)
            {
                dataLen = 0; // so next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    // Flush the token accumulated before end-of-input.
                    break;
                }
                else
                {
                    // End of input and nothing pending: no more tokens.
                    return (false);
                }
            }
            bufferIndex = 0;
        }

        char c = ioBuffer[bufferIndex++];

        if (IsTokenChar(c))
        {
            // if it's a token char
            if (length == 0)
            {
                // start of token — compute its absolute offset in the input
                start = offset + bufferIndex - 1;
            }
            else if (length == buffer.Length)
            {
                // term buffer full: grow it before appending
                buffer = termAtt.ResizeTermBuffer(1 + length);
            }

            buffer[length++] = Normalize(c); // buffer it, normalized

            if (length == MAX_WORD_LEN)
            {
                // buffer overflow! truncate the token at the maximum length
                break;
            }
        }
        else if (length > 0)
        {
            // at non-Letter w/ chars
            break; // return 'em
        }
    }

    termAtt.SetTermLength(length);
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
    return (true);
}
// Primary entry point (for first TermsHash)
/// <summary>Adds the current term (read from <c>termAtt</c>) to the postings
/// hash: repairs invalid UTF-16 while hashing, probes the open-addressed
/// table, interns new terms into the char pool and allocates their int/byte
/// stream slices, then notifies the consumer via NewTerm/AddTerm.
/// BUGFIX vs. original: removed a stray empty statement after the
/// TermBuffer() call; all other tokens are unchanged.</summary>
internal override void Add()
{
    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    char[] tokenText = termAtt.TermBuffer();
    int tokenTextLen = termAtt.TermLength();

    // Compute hashcode & replace any invalid UTF16 sequences.
    // The scan runs back-to-front so a low surrogate can look at the
    // preceding char to decide whether it is part of a valid pair.
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0)
    {
        char ch = tokenText[--downto];

        if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
        {
            if (0 == downto)
            {
                // Unpaired
                ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
            }
            else
            {
                char ch2 = tokenText[downto - 1];
                if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // OK: high followed by low. This is a valid
                    // surrogate pair.
                    code = ((code * 31) + ch) * 31 + ch2;
                    downto--;
                    continue;
                }
                else
                {
                    // Unpaired
                    ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
                }
            }
        }
        else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff))
        {
            // Unpaired or 0xffff
            ch = tokenText[downto] = (char)(UnicodeUtil.UNI_REPLACEMENT_CHAR);
        }

        code = (code * 31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    p = postingsHash[hashPos];

    if (p != null && !PostingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            p = postingsHash[hashPos];
        }
        while (p != null && !PostingEquals(tokenText, tokenTextLen));
    }

    if (p == null)
    {
        // First time we are seeing this token since we last
        // flushed the hash.
        int textLen1 = 1 + tokenTextLen; // +1 for the 0xffff terminator below

        if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
        {
            if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                // Just skip this term, to remain as robust as
                // possible during indexing.  A TokenFilter
                // can be inserted into the analyzer chain if
                // other behavior is wanted (pruning the term
                // to a prefix, throwing an exception, etc).
                if (docState.maxTermPrefix == null)
                {
                    docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                }

                consumer.SkippingLongTerm();
                return;
            }
            charPool.NextBuffer();
        }

        // Refill?
        if (0 == perThread.freePostingsCount)
        {
            perThread.MorePostings();
        }

        // Pull next free RawPostingList from free list
        p = perThread.freePostings[--perThread.freePostingsCount];
        System.Diagnostics.Debug.Assert(p != null);

        // Intern the term text into the char pool, terminated by 0xffff.
        char[] text = charPool.buffer;
        int textUpto = charPool.charUpto;
        p.textStart = textUpto + charPool.charOffset;
        charPool.charUpto += textLen1;
        Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
        text[textUpto + tokenTextLen] = (char)(0xffff);

        System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
        {
            RehashPostings(2 * postingsHashSize);
        }

        // Init stream slices
        if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        {
            intPool.NextBuffer();
        }

        if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
        {
            bytePool.NextBuffer();
        }

        intUptos = intPool.buffer;
        intUptoStart = intPool.intUpto;
        intPool.intUpto += streamCount;

        p.intStart = intUptoStart + intPool.intOffset;

        // One byte-pool slice per stream; record each slice's start address.
        for (int i = 0; i < streamCount; i++)
        {
            int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
            intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
        }
        p.byteStart = intUptos[intUptoStart];

        consumer.NewTerm(p);
    }
    else
    {
        // Seen-before term: restore its int-stream write positions.
        intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
        consumer.AddTerm(p);
    }

    if (doNextCall)
    {
        nextPerField.Add(p.textStart);
    }
}