Beispiel #1
0
        /// <summary>
        /// Initializes a new <see cref="NGramTokenFilter"/> that generates n-grams whose sizes
        /// range from <paramref name="minGram"/> to <paramref name="maxGram"/>.
        /// </summary>
        /// <param name="version"> Lucene version to enable correct position increments; it also
        ///                decides which <see cref="CharacterUtils"/> instance is used. </param>
        /// <param name="input"> the <see cref="TokenStream"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1 or
        ///                greater than <paramref name="maxGram"/> </exception>
        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
            // The input is wrapped in a CodepointCountFilter bounded by [minGram, int.MaxValue]
            // (presumably to drop tokens that are too short to yield any n-gram).
            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
        {
            // Evaluate the compatibility switch once; it drives both decisions below.
#pragma warning disable 612, 618
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

            this.version = version;
            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.GetJava4Instance(version);
            }

            // Argument validation.
            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (maxGram < minGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;

            if (isLucene44OrLater)
            {
                // 4.4+: use the position attributes registered on this stream.
                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
                posLenAtt = AddAttribute<IPositionLengthAttribute>();
            }
            else
            {
                // Pre-4.4: stand-alone helper instances are used instead of registered attributes.
                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
            }

            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }
Beispiel #2
0
        /// <summary>
        /// Validates the arguments and initializes the state of this instance: the
        /// <see cref="CharacterUtils"/> helper, the token attributes, the gram-size settings
        /// and the working buffers.
        /// </summary>
        /// <param name="version"> Lucene compatibility version; must be 4.4 or later </param>
        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
        /// <param name="edgesOnly"> value stored in the <c>edgesOnly</c> field </param>
        /// <exception cref="ArgumentException"> if <paramref name="version"/> is older than 4.4, if
        /// <paramref name="minGram"/> is less than 1, or if it is greater than <paramref name="maxGram"/> </exception>
        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
        {
#pragma warning disable 612, 618
            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                throw new ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
            }

            // The guard above guarantees version >= 4.4, so the Java 4 fallback can never be
            // selected here; always use the version-aware instance.
            charUtils = CharacterUtils.GetInstance(version);

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            termAtt        = AddAttribute <ICharTermAttribute>();
            posIncAtt      = AddAttribute <IPositionIncrementAttribute>();
            posLenAtt      = AddAttribute <IPositionLengthAttribute>();
            offsetAtt      = AddAttribute <IOffsetAttribute>();
            this.minGram   = minGram;
            this.maxGram   = maxGram;
            this.edgesOnly = edgesOnly;

            // 2 * maxGram in case all code points require 2 chars, and + 1024 for buffering
            // so that the Reader is not polled for every few chars.
            charBuffer     = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
            // One int slot per char slot of the char buffer.
            buffer         = new int[charBuffer.Buffer.Length];

            // Make the term att large enough
            termAtt.ResizeBuffer(2 * maxGram);
        }
        /// <summary>
        /// Pulls chunks from <c>m_input</c> through <c>tmpBuffer</c> and appends them to
        /// <c>inputBuffer</c>. Stops when the input is exhausted (and then sets
        /// <c>inputFinished</c>) or when the last code point of the chunk just read is inert
        /// according to <c>normalizer</c>.
        /// </summary>
        private void ReadInputToBuffer()
        {
            for (;;)
            {
                // CharacterUtils.Fill is supplementary char aware
#pragma warning disable 612, 618
                var utils = CharacterUtils.GetInstance(LuceneVersion.LUCENE_CURRENT);
                bool moreAvailable = utils.Fill(tmpBuffer, m_input);
#pragma warning restore 612, 618

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(tmpBuffer.Offset == 0);
                }

                // Copy everything that was just read into the accumulated input.
                inputBuffer.Append(tmpBuffer.Buffer, 0, tmpBuffer.Length);

                if (!moreAvailable)
                {
                    inputFinished = true;
                    return;
                }

                // An inert char is required at the end of the chunk so that the content before
                // and after it can be normalized independently.
                int trailingCodePoint = Character.CodePointBefore(tmpBuffer.Buffer, tmpBuffer.Length, 0);
                if (normalizer.IsInert(trailingCodePoint))
                {
                    return;
                }
            }
        }
Beispiel #4
0
 /// <summary>
 /// Create a new UpperCaseFilter, that normalizes token text to upper case.
 /// </summary>
 /// <param name="matchVersion"> Lucene compatibility version; used to obtain the matching
 ///                <see cref="CharacterUtils"/> instance </param>
 /// <param name="in"> TokenStream to filter </param>
 public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
     : base(@in)
 {
     // Register the term attribute exactly once; a second identical AddAttribute call
     // would only assign the same field again.
     termAtt   = AddAttribute <ICharTermAttribute>();
     charUtils = CharacterUtils.GetInstance(matchVersion);
 }
Beispiel #5
0
        public virtual void TestCodePointAtCharSequenceInt()
        {
            // Checks CodePointAt(string, int) for the LUCENE_30 instance (expected to return
            // single UTF-16 chars) and for the current-version instance (expected to combine
            // surrogate pairs into one code point).
            var java4            = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            var cpAt3            = "Abc\ud801\udc1c";
            var highSurrogateAt3 = "Abc\ud801";

            assertEquals((int)'A', java4.CodePointAt(cpAt3, 0));
            // The LUCENE_30 instance is expected to return the lone high surrogate, not the pair.
            assertEquals((int)'\ud801', java4.CodePointAt(cpAt3, 3));
            assertEquals((int)'\ud801', java4.CodePointAt(highSurrogateAt3, 3));
            try
            {
                java4.CodePointAt(highSurrogateAt3, 4);
                fail("string index out of bounds");
            }
            catch (IndexOutOfRangeException)
            {
                // expected: index 4 is past the end of the 4-char string
            }

            var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            assertEquals((int)'A', java5.CodePointAt(cpAt3, 0));
            // The expected value is the code point formed by the surrogate pair, so the pair
            // must be combined with ToCodePoint(high, low) rather than looked up with CodePointAt.
            assertEquals(Character.ToCodePoint('\ud801', '\udc1c'), java5.CodePointAt(cpAt3, 3));
            // An unpaired high surrogate at the end is expected to be returned as-is.
            assertEquals((int)'\ud801', java5.CodePointAt(highSurrogateAt3, 3));
            try
            {
                java5.CodePointAt(highSurrogateAt3, 4);
                fail("string index out of bounds");
            }
            catch (IndexOutOfRangeException)
            {
                // expected: index 4 is past the end of the 4-char string
            }
        }
Beispiel #6
0
 /// <summary>
 /// Initializes a new NGramTokenFilter that generates n-grams whose sizes range from
 /// <paramref name="minGram"/> to <paramref name="maxGram"/>.
 /// </summary>
 /// <param name="version"> Lucene version to enable correct position increments; it also
 ///                decides which <see cref="CharacterUtils"/> instance is used. </param>
 /// <param name="input"> the <see cref="TokenStream"/> holding the input to be tokenized </param>
 /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
 /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is less than 1 or
 ///                greater than <paramref name="maxGram"/> </exception>
 public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
     : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
 {
     // Evaluate the compatibility switch once; it drives both decisions below.
     bool isLucene44OrLater = version.OnOrAfter(Version.LUCENE_44);

     this.version = version;
     if (isLucene44OrLater)
     {
         this.charUtils = CharacterUtils.GetInstance(version);
     }
     else
     {
         this.charUtils = CharacterUtils.Java4Instance;
     }

     // Argument validation.
     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (maxGram < minGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram = minGram;
     this.maxGram = maxGram;

     if (isLucene44OrLater)
     {
         // 4.4+: use the position attributes registered on this stream.
         posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
         posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
     }
     else
     {
         // Pre-4.4: helper instances bound to this filter are used instead.
         posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
         posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
     }
 }
Beispiel #7
0
        public virtual void TestConversions()
        {
            // Run the shared conversion checks against both flavors: first the Java 4
            // instance, then the instance for the current test version.
            var instances = new CharacterUtils[]
            {
                CharacterUtils.Java4Instance,
                CharacterUtils.GetInstance(TEST_VERSION_CURRENT)
            };

            foreach (CharacterUtils instance in instances)
            {
                TestConversions(instance);
            }
        }
Beispiel #8
0
        public virtual void TestCodePointCount()
        {
            var legacyUtils  = CharacterUtils.Java4Instance;
            var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            string text = TestUtil.RandomUnicodeString(Random());

            // The Java 4 instance is expected to count one code point per UTF-16 char.
            int charCount = text.Length;
            assertEquals(charCount, legacyUtils.CodePointCount(text));

            // The current instance is expected to agree with Character.CodePointCount
            // over the whole string.
            var codePointCount = Character.CodePointCount(text, 0, text.Length);
            assertEquals(codePointCount, currentUtils.CodePointCount(text));
        }
Beispiel #9
0
        public virtual void TestCodePointAtCharArrayIntInt()
        {
            const char highSurrogate = '\ud801';
            const char lowSurrogate  = '\udc1c';

            var legacyUtils = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            char[] pairAt3  = "Abc\ud801\udc1c".ToCharArray();
            char[] loneAt3  = "Abc\ud801".ToCharArray();

            // LUCENE_30 instance: every lookup is expected to yield the single char at the index,
            // even when that char is the first half of a surrogate pair.
            assertEquals((int)'A', legacyUtils.CodePointAt(pairAt3, 0, 2));
            assertEquals((int)highSurrogate, legacyUtils.CodePointAt(pairAt3, 3, 5));
            assertEquals((int)highSurrogate, legacyUtils.CodePointAt(loneAt3, 3, 4));

            var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            // Current instance: a complete pair within the limit is expected to be combined,
            // while an unpaired high surrogate is returned unchanged.
            assertEquals((int)'A', currentUtils.CodePointAt(pairAt3, 0, 2));
            assertEquals(Character.ToCodePoint(highSurrogate, lowSurrogate), currentUtils.CodePointAt(pairAt3, 3, 5));
            assertEquals((int)highSurrogate, currentUtils.CodePointAt(loneAt3, 3, 4));
        }
Beispiel #10
0
        public virtual void TestOffsetByCodePoint()
        {
            var legacyUtils  = CharacterUtils.Java4Instance;
            var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

            for (int round = 0; round < 10; round++)
            {
                // Random text, a random start index inside it, and a shift in [-3, 3].
                var chars = TestUtil.RandomUnicodeString(Random()).toCharArray();
                var start = TestUtil.NextInt(Random(), 0, chars.Length);
                var shift = Random().Next(7) - 3;
                var plainTarget = start + shift;

                // Java 4 instance: the result is expected to be plain char arithmetic, and an
                // exception is only acceptable when that arithmetic leaves the array bounds.
                try
                {
                    var legacyResult = legacyUtils.OffsetByCodePoints(chars, 0, chars.Length, start, shift);
                    assertEquals(legacyResult, plainTarget);
                }
                catch (System.IndexOutOfRangeException)
                {
                    assertTrue(plainTarget < 0 || plainTarget > chars.Length);
                }

                // Current instance: must behave exactly like Character.OffsetByCodePoints,
                // including when that reference implementation throws.
                int currentResult = -1;
                try
                {
                    currentResult = currentUtils.OffsetByCodePoints(chars, 0, chars.Length, start, shift);
                }
                catch (System.IndexOutOfRangeException)
                {
                    try
                    {
                        Character.OffsetByCodePoints(chars, 0, chars.Length, start, shift);
                        fail();
                    }
                    catch (System.IndexOutOfRangeException)
                    {
                        // OK - the reference implementation rejects the same input
                    }
                }

                if (currentResult < 0)
                {
                    // Nothing to compare: the call threw, which was verified above.
                    continue;
                }
                assertEquals(Character.OffsetByCodePoints(chars, 0, chars.Length, start, shift), currentResult);
            }
        }
Beispiel #11
0
        public virtual void TestFillJava14()
        {
            const string input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
            var utils  = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
            var source = new StringReader(input);
            var chunk  = CharacterUtils.NewCharacterBuffer(5);

            // 1st fill: the LUCENE_30 instance is expected to fill all 5 slots, even though
            // that splits the surrogate pair.
            bool filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(5, chunk.Length);
            string firstChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
            assertEquals("1234\ud801", firstChunk);

            // 2nd fill: continues with the orphaned low surrogate.
            filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(5, chunk.Length);
            string secondChunk = new string(chunk.Buffer);
            assertEquals("\udc1c7891", secondChunk);

            // 3rd fill: a fresh 6-char buffer takes the remaining 6 chars of the input.
            chunk = CharacterUtils.NewCharacterBuffer(6);
            filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(6, chunk.Length);
            string thirdChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
            assertEquals("23\ud801\ud801\udc1c\ud801", thirdChunk);

            // Input is exhausted now.
            assertFalse(utils.Fill(chunk, source));
        }
Beispiel #12
0
        public virtual void TestFillNoHighSurrogate()
        {
            // The same expectations must hold for the LUCENE_30 instance and the current one,
            // because the input contains no surrogates at all.
            foreach (var version in new LuceneVersion[] { LuceneVersion.LUCENE_30, TEST_VERSION_CURRENT })
            {
                var utils  = CharacterUtils.GetInstance(version);
                var source = new StringReader("helloworld");
                var chunk  = CharacterUtils.NewCharacterBuffer(6);

                // 1st fill: the 6-char buffer is filled completely.
                bool filled = utils.Fill(chunk, source);
                assertTrue(filled);
                assertEquals(0, chunk.Offset);
                assertEquals(6, chunk.Length);
                string firstChunk = new string(chunk.Buffer);
                assertEquals("hellow", firstChunk);

                // 2nd fill: only 4 chars are left, so Fill reports false but still delivers them.
                filled = utils.Fill(chunk, source);
                assertFalse(filled);
                assertEquals(4, chunk.Length);
                assertEquals(0, chunk.Offset);

                string secondChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
                assertEquals("orld", secondChunk);

                // 3rd fill: nothing left to read.
                assertFalse(utils.Fill(chunk, source));
            }
        }
Beispiel #13
0
        /// <summary>
        /// Initializes a new <see cref="EdgeNGramTokenFilter"/> for the given side and gram-size range.
        /// </summary>
        /// <param name="version"> Lucene compatibility version; it selects the <see cref="CharacterUtils"/>
        /// instance and, from 4.4 on, disallows <see cref="Side.BACK"/> </param>
        /// <param name="input"> the <see cref="TokenStream"/> passed to the base constructor </param>
        /// <param name="side"> a defined <see cref="Side"/> value </param>
        /// <param name="minGram"> the smallest gram size; must be at least 1 </param>
        /// <param name="maxGram"> the largest gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentException"> if any of the above constraints is violated </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // Note: there is no null check on version here (the check is disabled in this port).
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

            if (isLucene44OrLater && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            if (Enum.IsDefined(typeof(Side), side) == false)
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (maxGram < minGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.GetJava4Instance(version);
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            // Attributes are registered in this fixed order.
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }
Beispiel #14
0
        public virtual void TestFillJava15()
        {
            const string input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
            var utils  = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
            var source = new StringReader(input);
            var chunk  = CharacterUtils.NewCharacterBuffer(5);

            // 1st fill: expected to stop after 4 chars so the surrogate pair is not split.
            bool filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(4, chunk.Length);
            string firstChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
            assertEquals("1234", firstChunk);

            // 2nd fill: the complete pair plus three digits fills all 5 slots.
            filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(5, chunk.Length);
            string secondChunk = new string(chunk.Buffer);
            assertEquals("\ud801\udc1c789", secondChunk);

            // 3rd fill: expected to report 4 chars, ending with an unpaired high surrogate.
            filled = utils.Fill(chunk, source);
            assertTrue(filled);
            assertEquals(4, chunk.Length);
            string thirdChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
            assertEquals("123\ud801", thirdChunk);

            // 4th fill: the last 3 chars are delivered while Fill already reports false.
            filled = utils.Fill(chunk, source);
            assertFalse(filled);
            assertEquals(3, chunk.Length);
            string fourthChunk = new string(chunk.Buffer, chunk.Offset, chunk.Length);
            assertEquals("\ud801\udc1c\ud801", fourthChunk);

            // 5th fill: nothing left, so the buffer length is 0.
            assertFalse(utils.Fill(chunk, source));
            assertEquals(0, chunk.Length);
        }
Beispiel #15
0
 /// <summary>
 /// Validates the arguments and initializes the state of this instance: the
 /// <see cref="CharacterUtils"/> helper, the gram-size settings and the working buffers.
 /// </summary>
 /// <param name="version"> Lucene compatibility version; must be 4.4 or later </param>
 /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
 /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
 /// <param name="edgesOnly"> value stored in the <c>edgesOnly</c> field </param>
 /// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than 4.4, if
 /// <paramref name="minGram"/> is less than 1, or if it is greater than <paramref name="maxGram"/> </exception>
 private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
 {
     if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
     {
         throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
     }

     // The guard above guarantees version >= 4.4, so the Java 4 fallback can never be
     // selected here; always use the version-aware instance.
     charUtils = CharacterUtils.GetInstance(version);

     if (minGram < 1)
     {
         throw new System.ArgumentException("minGram must be greater than zero");
     }
     if (minGram > maxGram)
     {
         throw new System.ArgumentException("minGram must not be greater than maxGram");
     }
     this.minGram   = minGram;
     this.maxGram   = maxGram;
     this.edgesOnly = edgesOnly;

     // 2 * maxGram in case all code points require 2 chars, and + 1024 for buffering
     // so that the Reader is not polled for every few chars.
     charBuffer     = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024);
     // One int slot per char slot of the char buffer.
     buffer         = new int[charBuffer.Buffer.Length];
     // Make the term att large enough
     termAtt.ResizeBuffer(2 * maxGram);
 }
Beispiel #16
0
        /// <summary>
        /// Initializes a new <see cref="EdgeNGramTokenFilter"/> for the given side and gram-size range.
        /// </summary>
        /// <param name="version"> Lucene compatibility version; it selects the <see cref="CharacterUtils"/>
        /// instance and, from 4.4 on, disallows <see cref="Side.BACK"/> </param>
        /// <param name="input"> the <see cref="TokenStream"/> passed to the base constructor </param>
        /// <param name="side"> a defined <see cref="Side"/> value </param>
        /// <param name="minGram"> the smallest gram size; must be at least 1 </param>
        /// <param name="maxGram"> the largest gram size; must not be smaller than <paramref name="minGram"/> </param>
        /// <exception cref="ArgumentOutOfRangeException"> if <paramref name="side"/> is not a defined value
        /// or <paramref name="minGram"/> is less than 1 </exception>
        /// <exception cref="ArgumentException"> if <see cref="Side.BACK"/> is requested with version 4.4 or later,
        /// or <paramref name="minGram"/> is greater than <paramref name="maxGram"/> </exception>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            // LUCENENET specific - no null check for version: it is a value type and cannot be null.
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

            if (isLucene44OrLater && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }

            // LUCENENET specific - the two range checks below throw ArgumentOutOfRangeException
            // instead of IllegalArgumentException (.NET convention).
            if (side.IsDefined() == false)
            {
                throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero");
            }

            if (maxGram < minGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            if (isLucene44OrLater)
            {
                this.charUtils = CharacterUtils.GetInstance(version);
            }
            else
            {
                this.charUtils = CharacterUtils.GetJava4Instance(version);
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            // Attributes are registered in this fixed order.
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }