Ejemplo n.º 1
0
 /// <summary>
 /// Creates the tokenizer over <paramref name="input"/> and asks the term
 /// attribute to resize its buffer to <paramref name="bufferSize"/>.
 /// </summary>
 /// <param name="input">Reader handed unchanged to the base constructor.</param>
 /// <param name="bufferSize">Size passed to <c>termAtt.ResizeBuffer</c>; must be greater than zero.</param>
 /// <exception cref="System.ArgumentOutOfRangeException">
 /// <paramref name="bufferSize"/> is zero or negative.
 /// </exception>
 public KeywordTokenizer(TextReader input, int bufferSize)
     : base(input)
 {
     if (bufferSize <= 0)
     {
         // ArgumentOutOfRangeException derives from ArgumentException, so callers that
         // catch ArgumentException keep working while the offending parameter is now named.
         throw new System.ArgumentOutOfRangeException("bufferSize", "bufferSize must be > 0");
     }
     termAtt.ResizeBuffer(bufferSize);
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Advances the wrapped stream by one token and hands the term buffer to
 /// <c>reverse</c>. When <c>marker</c> differs from <c>NOMARKER</c>, the marker
 /// is first appended after the last term char, so it is part of the range
 /// passed to <c>reverse</c> and of the resulting term length.
 /// </summary>
 /// <returns>
 /// <c>true</c> if the wrapped stream produced a token; <c>false</c> otherwise.
 /// </returns>
 public override bool IncrementToken()
 {
     // Nothing to transform once the upstream is exhausted.
     if (!input.IncrementToken())
     {
         return false;
     }

     int termLength = termAtt.Length;

     if (marker != NOMARKER)
     {
         // Make room for one extra char and store the marker in the new last slot.
         termLength += 1;
         termAtt.ResizeBuffer(termLength);
         termAtt.Buffer()[termLength - 1] = marker;
     }

     // Buffer() is re-read here because the resize above may have swapped the array.
     reverse(matchVersion, termAtt.Buffer(), 0, termLength);
     termAtt.Length = termLength;
     return true;
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Validates the arguments, stores the gram limits and allocates the working
 /// buffers used by this tokenizer.
 /// </summary>
 /// <param name="version">Compatibility version; must be on or after <c>LUCENE_44</c>.</param>
 /// <param name="minGram">Smallest gram size; must be at least 1 and not greater than <paramref name="maxGram"/>.</param>
 /// <param name="maxGram">Largest gram size; also drives the buffer sizes allocated here.</param>
 /// <param name="edgesOnly">Stored as-is in the <c>edgesOnly</c> field.</param>
 /// <exception cref="System.ArgumentException">
 /// <paramref name="version"/> is older than <c>LUCENE_44</c>, or
 /// <paramref name="minGram"/> is greater than <paramref name="maxGram"/>.
 /// </exception>
 /// <exception cref="System.ArgumentOutOfRangeException">
 /// <paramref name="minGram"/> is less than 1.
 /// </exception>
 private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
 {
     if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
     }

     // The guard above already guarantees 4.4+, so re-testing the version here
     // (with a fallback instance for older versions) was dead code.
     charUtils = CharacterUtils.GetInstance(version);

     if (minGram < 1)
     {
         // Subclass of ArgumentException: existing catch blocks still match.
         throw new System.ArgumentOutOfRangeException("minGram", "minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }

     this.minGram   = minGram;
     this.maxGram   = maxGram;
     this.edgesOnly = edgesOnly;

     // 2 * maxGram in case all code points require 2 chars, plus 1024 of slack
     // so the Reader is not polled on every gram.
     charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
     buffer     = new int[charBuffer.Buffer.Length];

     // Make the term attribute large enough for a maxGram-sized gram of 2-char code points.
     termAtt.ResizeBuffer(2 * maxGram);
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Emits the compound (gram) token: the current term is appended to the
        /// text already held in <c>buffer</c>, and the combined text becomes the
        /// new term. The token gets a position increment of 0, a position length
        /// of 2, type <c>GRAM_TYPE</c>, and offsets running from
        /// <c>lastStartOffset</c> to the current token's end offset.
        /// <c>buffer</c> is emptied afterwards.
        /// </summary>
        private void GramToken()
        {
            // Read everything needed from the current token before the attributes are reset.
            int gramEndOffset = offsetAttribute.EndOffset();
            buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);

            ClearAttributes();

            int gramLength = buffer.Length;

            // Grow the term buffer only when the combined text does not fit.
            char[] destination = termAttribute.Buffer();
            if (destination.Length < gramLength)
            {
                destination = termAttribute.ResizeBuffer(gramLength);
            }

            buffer.GetChars(0, gramLength, destination, 0);
            termAttribute.Length = gramLength;

            posIncAttribute.PositionIncrement = 0;
            posLenAttribute.PositionLength = 2; // a gram spans two positions (bigram)
            offsetAttribute.SetOffset(lastStartOffset, gramEndOffset);
            typeAttribute.Type = GRAM_TYPE;

            // Reset the scratch text for the next gram.
            buffer.Length = 0;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Produces the next token by scanning code points from <c>ioBuffer</c>
        /// (refilled from <c>input</c> as needed). Consecutive code points accepted
        /// by <c>isTokenChar</c> are passed through <c>Normalize</c> and written to the
        /// term buffer; the token ends at the first rejected code point, at end of
        /// input, or once <c>MAX_WORD_LEN</c> chars have been collected.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was produced (term length and offsets are set);
        /// <c>false</c> when the input is exhausted and no chars were collected, in
        /// which case <c>finalOffset</c> is set from the corrected running offset.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length      = 0;  // number of chars written to the term buffer so far
            int start       = -1; // this variable is always initialized
            int end_Renamed = -1; // uncorrected end offset of the token; set together with start

            char[] buffer = termAtt.Buffer();
            while (true)
            {
                // Current chunk fully consumed: account for it in the running offset and refill.
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
                    if (ioBuffer.Length == 0)
                    {
                        // Refill produced nothing: treat as end of input.
                        dataLen = 0; // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            // Emit the chars collected so far as the last token.
                            break;
                        }
                        else
                        {
                            finalOffset = CorrectOffset(offset);
                            return(false);
                        }
                    }
                    dataLen     = ioBuffer.Length;
                    bufferIndex = 0;
                }
                // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
                int c         = charUtils.codePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
                int charCount = Character.CharCount(c);
                // Advance past the whole code point (charCount chars) before classifying it.
                bufferIndex += charCount;

                if (isTokenChar(c))  // if it's a token char
                {
                    if (length == 0) // start of token
                    {
                        Debug.Assert(start == -1);
                        // bufferIndex was already advanced, so step back by charCount to get the code point's position.
                        start       = offset + bufferIndex - charCount;
                        end_Renamed = start;
                    } // check if a supplementary could run out of bounds
                    else if (length >= buffer.Length - 1)
                    {
                        buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
                    }
                    end_Renamed += charCount;
                    length      += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
                    if (length >= MAX_WORD_LEN)                                     // buffer overflow! make sure to check for >= surrogate pair could break == test
                    {
                        break;
                    }
                } // at non-Letter w/ chars
                else if (length > 0)
                {
                    break; // return 'em
                }
                // Otherwise: a non-token char with nothing collected yet is skipped.
            }

            termAtt.Length = length;
            Debug.Assert(start != -1);
            // Publish corrected offsets; finalOffset remembers the corrected end of this token.
            offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end_Renamed));
            return(true);
        }