/// <summary>
/// Initializes a new <see cref="NGramTokenFilter"/> producing n-grams whose sizes range
/// from <paramref name="minGram"/> to <paramref name="maxGram"/>.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// It selects both the <see cref="CharacterUtils"/> implementation and the way the
/// position attributes are obtained. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="ArgumentException"> if <paramref name="minGram"/> is less than 1 or
/// greater than <paramref name="maxGram"/> </exception>
public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
#pragma warning disable 612, 618
    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.GetJava4Instance(version);
    }

    if (minGram <= 0)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    if (isLucene44OrLater)
    {
        // 4.4+: position attributes are registered on this stream.
        posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        posLenAtt = AddAttribute<IPositionLengthAttribute>();
    }
    else
    {
        // Older versions: helper instances are used instead of attributes
        // registered through AddAttribute.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper();
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper();
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Shared initialization: validates the arguments, registers the attributes and
/// allocates the working buffers.
/// </summary>
/// <param name="version"> Lucene version; must be 4.4 or later </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <param name="edgesOnly"> value stored in the <c>edgesOnly</c> field </param>
/// <exception cref="ArgumentException"> if <paramref name="version"/> is older than 4.4,
/// <paramref name="minGram"/> is less than 1, or <paramref name="minGram"/> is greater
/// than <paramref name="maxGram"/> </exception>
private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
#pragma warning disable 612, 618
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) == false)
#pragma warning restore 612, 618
    {
        throw new ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }

    // The guard above already rejected pre-4.4 versions, so the Java4 fallback
    // below cannot be selected here.
#pragma warning disable 612, 618
    charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44)
#pragma warning restore 612, 618
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.GetJava4Instance(version);

    if (minGram <= 0)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    termAtt = AddAttribute<ICharTermAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLenAtt = AddAttribute<IPositionLengthAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;

    // 2 * maxGram in case all code points require 2 chars.
    int maxGramChars = 2 * maxGram;

    // + 1024 for buffering to not keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(maxGramChars + 1024);
    buffer = new int[charBuffer.Buffer.Length];

    // Make the term att large enough.
    termAtt.ResizeBuffer(maxGramChars);
}
/// <summary>
/// Reads chunks from <c>m_input</c> and appends them to <c>inputBuffer</c>. Reading stops
/// when the fill call reports no remaining characters (in which case <c>inputFinished</c>
/// is set), or when the last code point of the chunk just read is inert for the normalizer.
/// </summary>
private void ReadInputToBuffer()
{
    bool done = false;
    while (!done)
    {
        // CharacterUtils.fill is supplementary char aware
#pragma warning disable 612, 618
        bool moreAvailable = CharacterUtils.GetInstance(LuceneVersion.LUCENE_CURRENT).Fill(tmpBuffer, m_input);
#pragma warning restore 612, 618

        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(tmpBuffer.Offset == 0);
        }

        inputBuffer.Append(tmpBuffer.Buffer, 0, tmpBuffer.Length);

        if (moreAvailable)
        {
            // We require an inert char at the end of the chunk so that content before
            // and after this character can be normalized independently.
            int trailingCodePoint = Character.CodePointBefore(tmpBuffer.Buffer, tmpBuffer.Length, 0);
            done = normalizer.IsInert(trailingCodePoint);
        }
        else
        {
            inputFinished = true;
            done = true;
        }
    }
}
/// <summary>
/// Create a new <see cref="UpperCaseFilter"/>, that normalizes token text to upper case.
/// </summary>
/// <param name="matchVersion"> See <a href="#version">above</a>; used to obtain the
/// matching <see cref="CharacterUtils"/> instance </param>
/// <param name="in"> <see cref="TokenStream"/> to filter </param>
public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
    : base(@in)
{
    // Register the term attribute once; the previous duplicate assignment was redundant.
    termAtt = AddAttribute<ICharTermAttribute>();
    charUtils = CharacterUtils.GetInstance(matchVersion);
}
/// <summary>
/// Checks <c>CodePointAt(string, int)</c> for the <see cref="CharacterUtils"/> instance
/// obtained for LUCENE_30 and for the one obtained for the current test version:
/// a BMP char, a surrogate pair, a trailing lone high surrogate, and an index past the end.
/// </summary>
public virtual void TestCodePointAtCharSequenceInt()
{
    var pairAt3 = "Abc\ud801\udc1c";
    var loneHighAt3 = "Abc\ud801";

    // LUCENE_30 instance: the expected values are plain char values, even for the pair.
    var legacyUtils = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
    assertEquals((int)'A', legacyUtils.CodePointAt(pairAt3, 0));
    assertEquals((int)'\ud801', legacyUtils.CodePointAt(pairAt3, 3));
    assertEquals((int)'\ud801', legacyUtils.CodePointAt(loneHighAt3, 3));
    try
    {
        legacyUtils.CodePointAt(loneHighAt3, 4);
        fail("string index out of bounds");
    }
    catch (IndexOutOfRangeException)
    {
        // expected
    }

    // Current-version instance: the surrogate pair is expected to be combined.
    var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
    assertEquals((int)'A', currentUtils.CodePointAt(pairAt3, 0));
    assertEquals(Character.CodePointAt('\ud801', '\udc1c'), currentUtils.CodePointAt(pairAt3, 3));
    assertEquals((int)'\ud801', currentUtils.CodePointAt(loneHighAt3, 3));
    try
    {
        currentUtils.CodePointAt(loneHighAt3, 4);
        fail("string index out of bounds");
    }
    catch (IndexOutOfRangeException)
    {
        // expected
    }
}
/// <summary>
/// Initializes an NGramTokenFilter producing n-grams whose sizes range from
/// <paramref name="minGram"/> to <paramref name="maxGram"/>.
/// </summary>
/// <param name="version"> Lucene version to enable correct position increments.
/// See <a href="#version">above</a> for details. </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="System.ArgumentException"> if <paramref name="minGram"/> is less than 1
/// or greater than <paramref name="maxGram"/> </exception>
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram)
    : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
{
    bool isLucene44OrLater = version.OnOrAfter(Version.LUCENE_44);

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.Java4Instance;
    }

    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    if (isLucene44OrLater)
    {
        // 4.4+: position attributes are registered on this stream.
        posIncAtt = AddAttribute(typeof(PositionIncrementAttribute));
        posLenAtt = AddAttribute(typeof(PositionLengthAttribute));
    }
    else
    {
        // Older versions: helper instances are used instead.
        posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
        posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
    }
}
/// <summary>
/// Runs the parameterized <c>TestConversions</c> overload against the Java4
/// <see cref="CharacterUtils"/> instance and then against the instance for the
/// current test version.
/// </summary>
public virtual void TestConversions()
{
    TestConversions(CharacterUtils.Java4Instance);
    TestConversions(CharacterUtils.GetInstance(TEST_VERSION_CURRENT));
}
/// <summary>
/// Checks <c>CodePointCount</c> on a random Unicode string: the Java4 instance is expected
/// to report the string length, while the current-version instance is expected to agree
/// with <c>Character.CodePointCount</c>.
/// </summary>
public virtual void TestCodePointCount()
{
    var legacyUtils = CharacterUtils.Java4Instance;
    var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
    var text = TestUtil.RandomUnicodeString(Random());

    // Java4 expectation: one code point per char.
    var expectedCharCount = text.Length;
    assertEquals(expectedCharCount, legacyUtils.CodePointCount(text));

    // Current-version expectation: whatever Character.CodePointCount reports.
    var expectedCodePoints = Character.CodePointCount(text, 0, text.Length);
    assertEquals(expectedCodePoints, currentUtils.CodePointCount(text));
}
/// <summary>
/// Checks <c>CodePointAt(char[], int, int)</c> for the <see cref="CharacterUtils"/> instance
/// obtained for LUCENE_30 and for the one obtained for the current test version:
/// a BMP char, a surrogate pair, and a trailing lone high surrogate.
/// </summary>
public virtual void TestCodePointAtCharArrayIntInt()
{
    char[] pairAt3 = "Abc\ud801\udc1c".ToCharArray();
    char[] loneHighAt3 = "Abc\ud801".ToCharArray();

    // LUCENE_30 instance: the expected values are plain char values, even for the pair.
    var legacyUtils = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
    assertEquals((int)'A', legacyUtils.CodePointAt(pairAt3, 0, 2));
    assertEquals((int)'\ud801', legacyUtils.CodePointAt(pairAt3, 3, 5));
    assertEquals((int)'\ud801', legacyUtils.CodePointAt(loneHighAt3, 3, 4));

    // Current-version instance: the surrogate pair is expected to be combined.
    var currentUtils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
    assertEquals((int)'A', currentUtils.CodePointAt(pairAt3, 0, 2));
    assertEquals(Character.ToCodePoint('\ud801', '\udc1c'), currentUtils.CodePointAt(pairAt3, 3, 5));
    assertEquals((int)'\ud801', currentUtils.CodePointAt(loneHighAt3, 3, 4));
}
/// <summary>
/// Checks <c>OffsetByCodePoints</c> on ten random Unicode strings with a random start
/// index and a random offset in [-3, 3]. The Java4 instance is expected to do plain
/// index arithmetic; the current-version instance is expected to agree with
/// <c>Character.OffsetByCodePoints</c>, including when that call throws.
/// </summary>
public virtual void TestOffsetByCodePoint()
{
    var java4 = CharacterUtils.Java4Instance;
    var java5 = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);

    for (int i = 0; i < 10; ++i)
    {
        // string.ToCharArray() is the .NET spelling; the Java-style toCharArray()
        // is not a member of System.String.
        var s = TestUtil.RandomUnicodeString(Random()).ToCharArray();
        var index = TestUtil.NextInt(Random(), 0, s.Length);
        var offset = Random().Next(7) - 3;

        try
        {
            var to = java4.OffsetByCodePoints(s, 0, s.Length, index, offset);
            // Expected value first, actual second, so a failure message reads correctly.
            assertEquals(index + offset, to);
        }
        catch (System.IndexOutOfRangeException)
        {
            assertTrue((index + offset) < 0 || (index + offset) > s.Length);
        }

        int o;
        try
        {
            o = java5.OffsetByCodePoints(s, 0, s.Length, index, offset);
        }
        catch (System.IndexOutOfRangeException)
        {
            // The reference implementation must reject the same arguments.
            try
            {
                Character.OffsetByCodePoints(s, 0, s.Length, index, offset);
                fail();
            }
            catch (System.IndexOutOfRangeException)
            {
                // OK
            }
            o = -1;
        }

        if (o >= 0)
        {
            assertEquals(Character.OffsetByCodePoints(s, 0, s.Length, index, offset), o);
        }
    }
}
/// <summary>
/// Checks <c>Fill</c> for the LUCENE_30 <see cref="CharacterUtils"/> instance on input
/// containing surrogate pairs and lone surrogates: the expected chunks show the buffer
/// being filled to capacity even when that splits a surrogate pair.
/// </summary>
public virtual void TestFillJava14()
{
    var text = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
    var utils = CharacterUtils.GetInstance(LuceneVersion.LUCENE_30);
    var source = new StringReader(text);
    var buffer = CharacterUtils.NewCharacterBuffer(5);

    // First chunk: five chars, ending on the high surrogate of a pair.
    bool firstFill = utils.Fill(buffer, source);
    assertTrue(firstFill);
    assertEquals(5, buffer.Length);
    assertEquals("1234\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

    // Second chunk: starts with the orphaned low surrogate.
    bool secondFill = utils.Fill(buffer, source);
    assertTrue(secondFill);
    assertEquals(5, buffer.Length);
    assertEquals("\udc1c7891", new string(buffer.Buffer));

    // Switch to a six-char buffer for the next chunk.
    buffer = CharacterUtils.NewCharacterBuffer(6);
    bool thirdFill = utils.Fill(buffer, source);
    assertTrue(thirdFill);
    assertEquals(6, buffer.Length);
    assertEquals("23\ud801\ud801\udc1c\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

    // Nothing left to read.
    assertFalse(utils.Fill(buffer, source));
}
/// <summary>
/// Checks <c>Fill</c> on input without surrogates for both the LUCENE_30 and the
/// current-version <see cref="CharacterUtils"/> instances: a full first chunk, a partial
/// second chunk, then no more data.
/// </summary>
public virtual void TestFillNoHighSurrogate()
{
    LuceneVersion[] versions = { LuceneVersion.LUCENE_30, TEST_VERSION_CURRENT };

    for (int v = 0; v < versions.Length; v++)
    {
        var utils = CharacterUtils.GetInstance(versions[v]);
        var source = new StringReader("helloworld");
        var buffer = CharacterUtils.NewCharacterBuffer(6);

        // First chunk fills the whole six-char buffer.
        assertTrue(utils.Fill(buffer, source));
        assertEquals(0, buffer.Offset);
        assertEquals(6, buffer.Length);
        assertEquals("hellow", new string(buffer.Buffer));

        // Second chunk holds the remaining four chars; Fill reports false.
        assertFalse(utils.Fill(buffer, source));
        assertEquals(4, buffer.Length);
        assertEquals(0, buffer.Offset);
        assertEquals("orld", new string(buffer.Buffer, buffer.Offset, buffer.Length));

        // Nothing left to read.
        assertFalse(utils.Fill(buffer, source));
    }
}
/// <summary>
/// Initializes an EdgeNGramTokenFilter that generates n-grams whose sizes range from
/// <paramref name="minGram"/> to <paramref name="maxGram"/>.
/// </summary>
/// <param name="version"> Lucene version; selects the <see cref="CharacterUtils"/>
/// implementation and whether <c>Side.BACK</c> is accepted </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> the side stored in the <c>side</c> field </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="ArgumentException"> if <c>Side.BACK</c> is requested with version 4.4 or
/// later, <paramref name="side"/> is not a defined value, <paramref name="minGram"/> is
/// less than 1, or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
/// </exception>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // No null check on version is performed here.
    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

    if (isLucene44OrLater && side == Side.BACK)
    {
        throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }
    if (Enum.IsDefined(typeof(Side), side) == false)
    {
        throw new ArgumentException("sideLabel must be either front or back");
    }
    if (minGram <= 0)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.GetJava4Instance(version);
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
}
/// <summary>
/// Checks <c>Fill</c> for the current-version <see cref="CharacterUtils"/> instance on
/// input containing surrogate pairs and lone surrogates: the expected chunks show that a
/// chunk is cut short rather than ending in the middle of a surrogate pair.
/// </summary>
public virtual void TestFillJava15()
{
    const string text = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
    var utils = CharacterUtils.GetInstance(TEST_VERSION_CURRENT);
    var source = new StringReader(text);
    var buffer = CharacterUtils.NewCharacterBuffer(5);

    // First chunk stops before the surrogate pair: only four chars.
    bool firstFill = utils.Fill(buffer, source);
    assertTrue(firstFill);
    assertEquals(4, buffer.Length);
    assertEquals("1234", new string(buffer.Buffer, buffer.Offset, buffer.Length));

    // Second chunk starts with the complete pair and fills the buffer.
    bool secondFill = utils.Fill(buffer, source);
    assertTrue(secondFill);
    assertEquals(5, buffer.Length);
    assertEquals("\ud801\udc1c789", new string(buffer.Buffer));

    // Third chunk ends with a lone high surrogate.
    bool thirdFill = utils.Fill(buffer, source);
    assertTrue(thirdFill);
    assertEquals(4, buffer.Length);
    assertEquals("123\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

    // Fourth chunk holds the remaining three chars; Fill reports false.
    bool fourthFill = utils.Fill(buffer, source);
    assertFalse(fourthFill);
    assertEquals(3, buffer.Length);
    assertEquals("\ud801\udc1c\ud801", new string(buffer.Buffer, buffer.Offset, buffer.Length));

    // Nothing left to read.
    bool fifthFill = utils.Fill(buffer, source);
    assertFalse(fifthFill);
    assertEquals(0, buffer.Length);
}
/// <summary>
/// Shared initialization: validates the arguments, stores the gram settings and
/// allocates the working buffers.
/// </summary>
/// <param name="version"> Lucene version; must be 4.4 or later </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <param name="edgesOnly"> value stored in the <c>edgesOnly</c> field </param>
/// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than
/// 4.4, <paramref name="minGram"/> is less than 1, or <paramref name="minGram"/> is greater
/// than <paramref name="maxGram"/> </exception>
private void init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
{
    if (version.OnOrAfter(LuceneVersion.LUCENE_44) == false)
    {
        throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
    }

    // The guard above already rejected pre-4.4 versions, so the Java4 fallback
    // below cannot be selected here.
    charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44)
        ? CharacterUtils.GetInstance(version)
        : CharacterUtils.Java4Instance;

    if (minGram <= 0)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;

    // 2 * maxGram in case all code points require 2 chars.
    int maxGramChars = 2 * maxGram;

    // + 1024 for buffering to not keep polling the Reader.
    charBuffer = CharacterUtils.NewCharacterBuffer(maxGramChars + 1024);
    buffer = new int[charBuffer.Buffer.Length];

    // Make the term att large enough.
    // NOTE(review): termAtt is not assigned in this method; presumably it is set
    // before init runs - confirm against the constructors.
    termAtt.ResizeBuffer(maxGramChars);
}
/// <summary>
/// Initializes an EdgeNGramTokenFilter that generates n-grams whose sizes range from
/// <paramref name="minGram"/> to <paramref name="maxGram"/>.
/// </summary>
/// <param name="version"> Lucene version; selects the <see cref="CharacterUtils"/>
/// implementation and whether <c>Side.BACK</c> is accepted </param>
/// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
/// <param name="side"> the side stored in the <c>side</c> field </param>
/// <param name="minGram"> the smallest n-gram to generate </param>
/// <param name="maxGram"> the largest n-gram to generate </param>
/// <exception cref="ArgumentOutOfRangeException"> if <paramref name="side"/> is not a
/// defined value or <paramref name="minGram"/> is less than 1 </exception>
/// <exception cref="ArgumentException"> if <c>Side.BACK</c> is requested with version 4.4
/// or later, or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
/// </exception>
public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
    : base(input)
{
    // LUCENENET specific - version cannot be null because it is a value type.
    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

    if (isLucene44OrLater && side == Side.BACK)
    {
        throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
    }

    // LUCENENET specific - the next two checks throw ArgumentOutOfRangeException
    // rather than IllegalArgumentException (.NET convention).
    if (side.IsDefined() == false)
    {
        throw new ArgumentOutOfRangeException(nameof(side), "sideLabel must be either front or back");
    }
    if (minGram <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(minGram), "minGram must be greater than zero");
    }
    if (maxGram < minGram)
    {
        throw new ArgumentException("minGram must not be greater than maxGram");
    }

    this.version = version;
    if (isLucene44OrLater)
    {
        this.charUtils = CharacterUtils.GetInstance(version);
    }
    else
    {
        this.charUtils = CharacterUtils.GetJava4Instance(version);
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    this.termAtt = AddAttribute<ICharTermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
}