// ================================================= Helper Methods ================================================

/// <summary>
/// Decides whether moving from a character of <paramref name="lastType"/> to a character of
/// <paramref name="type"/> marks a subword boundary.
/// </summary>
/// <param name="lastType"> Type of the previous character </param>
/// <param name="type"> Type of the current character </param>
/// <returns> <c>true</c> when the transition is a break, <c>false</c> when both characters stay in the same subword </returns>
private bool IsBreak(int lastType, int type)
{
    // Types that share at least one bit never break.
    bool sharesTypeBit = (type & lastType) != 0;
    if (sharesTypeBit)
    {
        return false;
    }

    // Each clause below names one transition that is NOT treated as a break.
    // The clauses are evaluated left to right with short-circuiting.
    bool keepTogether =
        // ALPHA->ALPHA: ignored whenever case changes are not being split on.
        (!splitOnCaseChange
            && WordDelimiterFilter.IsAlpha(lastType)
            && WordDelimiterFilter.IsAlpha(type))
        // UPPER->letter: never split.
        || (WordDelimiterFilter.IsUpper(lastType)
            && WordDelimiterFilter.IsAlpha(type))
        // ALPHA->NUMERIC and NUMERIC->ALPHA: ignored whenever numerics are not being split on.
        || (!splitOnNumerics
            && ((WordDelimiterFilter.IsAlpha(lastType) && WordDelimiterFilter.IsDigit(type))
                || (WordDelimiterFilter.IsDigit(lastType) && WordDelimiterFilter.IsAlpha(type))));

    return !keepTogether;
}
/// <summary>
/// Runs <paramref name="input"/> as a single keyword token through a <c>WordDelimiterFilter</c>
/// (word parts, number parts, split on case change, split on numerics, possessive stemming)
/// and asserts that the emitted terms equal <paramref name="output"/>.
/// </summary>
/// <param name="input"> Text fed to the tokenizer </param>
/// <param name="output"> Expected terms, in order </param>
public virtual void doSplit(string input, params string[] output)
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    StringReader reader = new StringReader(input);
    MockTokenizer keywordTokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
    WordDelimiterFilter filter = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        keywordTokenizer,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    AssertTokenStreamContents(filter, output);
}
/// <summary>
/// Feeds the single token "(übelkeit" with offsets 7..17 through the filter and expects
/// one term, "übelkeit", with start offset 8 and end offset 17.
/// </summary>
public virtual void TestOffsetChange2()
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.CATENATE_ALL
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    SingleTokenTokenStream source = new SingleTokenTokenStream(new Token("(übelkeit", 7, 17));
    WordDelimiterFilter filter = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        source,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    string[] expectedTerms = new string[] { "übelkeit" };
    int[] expectedStarts = new int[] { 8 };
    int[] expectedEnds = new int[] { 17 };

    AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
}
/// <summary>
/// Feeds the single token "(foo,bar)" with offsets 7..16 through the filter and expects
/// the terms "foo", "foobar", "bar" with start offsets 8, 8, 12 and end offsets 11, 15, 15.
/// </summary>
public virtual void TestOffsetChange4()
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.CATENATE_ALL
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    SingleTokenTokenStream source = new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16));
    WordDelimiterFilter filter = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        source,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    string[] expectedTerms = new string[] { "foo", "foobar", "bar" };
    int[] expectedStarts = new int[] { 8, 8, 12 };
    int[] expectedEnds = new int[] { 11, 15, 15 };

    AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
}
/// <summary>
/// Checks whether the two characters immediately before <paramref name="pos"/> form an English
/// possessive suffix ('s or 'S) that should be removed.
/// </summary>
/// <param name="pos"> Position in the text just past the candidate possessive </param>
/// <returns> <c>true</c> if the text ending at the position is an English possessive, <c>false</c> otherwise </returns>
private bool EndsWithPossessive(int pos)
{
    // Possessive stemming must be enabled, and there must be room for a letter + apostrophe + 's'.
    if (!stemEnglishPossessive || pos <= 2)
    {
        return false;
    }

    if (text[pos - 2] != '\'')
    {
        return false;
    }

    var suffixLetter = text[pos - 1];
    if (suffixLetter != 's' && suffixLetter != 'S')
    {
        return false;
    }

    // The apostrophe has to follow an alphabetic character.
    if (!WordDelimiterFilter.IsAlpha(CharType(text[pos - 3])))
    {
        return false;
    }

    // The suffix must end the bounded text or be followed by a subword delimiter.
    return pos == endBounds || WordDelimiterFilter.IsSubwordDelim(CharType(text[pos]));
}
/// <summary>
/// Runs <paramref name="input"/> as a single keyword token through a <c>WordDelimiterFilter</c>
/// and asserts that the emitted terms equal <paramref name="output"/>. English possessive
/// stemming is switched on only when <paramref name="stemPossessive"/> is 1.
/// </summary>
/// <param name="stemPossessive"> 1 to enable possessive stemming; any other value leaves it off </param>
/// <param name="input"> Text fed to the tokenizer </param>
/// <param name="output"> Expected terms, in order </param>
public virtual void doSplitPossessive(int stemPossessive, string input, params string[] output)
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS;

    if (stemPossessive == 1)
    {
        flags |= WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    }

    StringReader reader = new StringReader(input);
    MockTokenizer keywordTokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
    WordDelimiterFilter filter = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordTokenizer, flags, null);

    AssertTokenStreamContents(filter, output);
}
/// <summary>
/// Feeds the single token "(übelkeit" with offsets 7..17 through the filter and expects
/// one term, "übelkeit", with start offset 8 and end offset 17.
/// </summary>
public virtual void TestOffsetChange2()
{
    WordDelimiterFlags flags = WordDelimiterFlags.GENERATE_WORD_PARTS
        | WordDelimiterFlags.GENERATE_NUMBER_PARTS
        | WordDelimiterFlags.CATENATE_ALL
        | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFlags.SPLIT_ON_NUMERICS
        | WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE;

    SingleTokenTokenStream source = new SingleTokenTokenStream(new Token("(übelkeit", 7, 17));
    WordDelimiterFilter filter = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        source,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    string[] expectedTerms = new string[] { "übelkeit" };
    int[] expectedStarts = new int[] { 8 };
    int[] expectedEnds = new int[] { 17 };

    AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
}
/// <summary>
/// Verifies that subwords and catenated subwords carry the expected offsets, using the
/// token "foo-bar" first with offsets 5..12 and then with offsets 5..6.
/// </summary>
public virtual void TestOffsets()
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.CATENATE_ALL
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS
        | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    // Case 1: token offsets 5..12.
    SingleTokenTokenStream firstSource = new SingleTokenTokenStream(new Token("foo-bar", 5, 12));
    WordDelimiterFilter wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        firstSource,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);
    AssertTokenStreamContents(
        wdf,
        new string[] { "foo", "foobar", "bar" },
        new int[] { 5, 5, 9 },
        new int[] { 8, 12, 12 });

    // Case 2: token offsets 5..6; every emitted term is expected to report 5..6.
    SingleTokenTokenStream secondSource = new SingleTokenTokenStream(new Token("foo-bar", 5, 6));
    wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        secondSource,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);
    AssertTokenStreamContents(
        wdf,
        new string[] { "foo", "bar", "foobar" },
        new int[] { 5, 5, 5 },
        new int[] { 6, 6, 6 });
}
/// <summary>
/// Checks the offsets reported for subwords and catenated subwords of the token "foo-bar",
/// once with token offsets 5..12 and once with token offsets 5..6.
/// </summary>
public virtual void TestOffsets()
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS;
    flags |= WordDelimiterFilter.GENERATE_NUMBER_PARTS;
    flags |= WordDelimiterFilter.CATENATE_ALL;
    flags |= WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
    flags |= WordDelimiterFilter.SPLIT_ON_NUMERICS;
    flags |= WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    // Token offsets 5..12.
    Token wideToken = new Token("foo-bar", 5, 12);
    WordDelimiterFilter wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        new SingleTokenTokenStream(wideToken),
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);
    string[] wideTerms = new string[] { "foo", "foobar", "bar" };
    int[] wideStarts = new int[] { 5, 5, 9 };
    int[] wideEnds = new int[] { 8, 12, 12 };
    AssertTokenStreamContents(wdf, wideTerms, wideStarts, wideEnds);

    // Token offsets 5..6; all terms are expected to report 5..6.
    Token narrowToken = new Token("foo-bar", 5, 6);
    wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        new SingleTokenTokenStream(narrowToken),
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);
    string[] narrowTerms = new string[] { "foo", "bar", "foobar" };
    int[] narrowStarts = new int[] { 5, 5, 5 };
    int[] narrowEnds = new int[] { 6, 6, 6 };
    AssertTokenStreamContents(wdf, narrowTerms, narrowStarts, narrowEnds);
}
/// <summary>
/// Narrows the internal word bounds by stepping past leading and trailing subword delimiters.
/// A trailing possessive is only recorded here (via <c>hasFinalPossessive</c>), not removed.
/// </summary>
private void SetBounds()
{
    // Move the start forward over leading delimiters.
    for (; startBounds < length; startBounds++)
    {
        if (!WordDelimiterFilter.IsSubwordDelim(CharType(text[startBounds])))
        {
            break;
        }
    }

    // Move the end backward over trailing delimiters, never crossing the start.
    for (; endBounds > startBounds; endBounds--)
    {
        if (!WordDelimiterFilter.IsSubwordDelim(CharType(text[endBounds - 1])))
        {
            break;
        }
    }

    if (EndsWithPossessive(endBounds))
    {
        hasFinalPossessive = true;
    }

    current = startBounds;
}
/// <summary>
/// Advances to the next subword in the text. On success <c>current</c> is the subword's first
/// index and the returned <c>end</c> is the index just past its last character.
/// </summary>
/// <returns> the end index of the next subword, or <see cref="DONE"/> if all subwords have been returned </returns>
internal int Next()
{
    // Resume where the previous subword stopped.
    current = end;
    if (current == DONE)
    {
        return DONE;
    }

    // A possessive suffix was flagged after the previous subword: jump over its two characters.
    if (skipPossessive)
    {
        current += 2;
        skipPossessive = false;
    }

    // Skip delimiters, remembering the type of the last character inspected.
    int previousType = 0;
    while (current < endBounds)
    {
        previousType = CharType(text[current]);
        if (!WordDelimiterFilter.IsSubwordDelim(previousType))
        {
            break;
        }
        current++;
    }

    if (current >= endBounds)
    {
        end = DONE;
        return end;
    }

    // Extend the subword until a type transition counts as a break or the bounds are reached.
    end = current + 1;
    while (end < endBounds)
    {
        int currentType = CharType(text[end]);
        if (IsBreak(previousType, currentType))
        {
            break;
        }
        previousType = currentType;
        end++;
    }

    // If the two characters after this subword form a possessive, skip them on the next call.
    if (end < endBounds - 1 && EndsWithPossessive(end + 2))
    {
        skipPossessive = true;
    }

    return end;
}
/// <summary>
/// Tokenizes <paramref name="input"/> as one keyword token, passes it through a
/// <c>WordDelimiterFilter</c> and asserts the resulting terms against <paramref name="output"/>.
/// The possessive-stemming flag is added only when <paramref name="stemPossessive"/> equals 1.
/// </summary>
/// <param name="stemPossessive"> 1 to enable possessive stemming; any other value leaves it off </param>
/// <param name="input"> Text fed to the tokenizer </param>
/// <param name="output"> Expected terms, in order </param>
public virtual void doSplitPossessive(int stemPossessive, string input, params string[] output)
{
    int possessiveFlag = 0;
    if (stemPossessive == 1)
    {
        possessiveFlag = WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
    }

    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterFilter.SPLIT_ON_NUMERICS;
    flags |= possessiveFlag;

    MockTokenizer keywordTokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false);
    WordDelimiterFilter filter = new WordDelimiterFilter(TEST_VERSION_CURRENT, keywordTokenizer, flags, null);

    AssertTokenStreamContents(filter, output);
}
/// <summary>
/// Tokenizes <paramref name="input"/> as one keyword token, passes it through a
/// <c>WordDelimiterFilter</c> configured with word parts, number parts, case-change splitting,
/// numeric splitting and possessive stemming, then asserts the terms against <paramref name="output"/>.
/// </summary>
/// <param name="input"> Text fed to the tokenizer </param>
/// <param name="output"> Expected terms, in order </param>
public virtual void doSplit(string input, params string[] output)
{
    int flags = WordDelimiterFilter.GENERATE_WORD_PARTS;
    flags |= WordDelimiterFilter.GENERATE_NUMBER_PARTS;
    flags |= WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
    flags |= WordDelimiterFilter.SPLIT_ON_NUMERICS;
    flags |= WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

    MockTokenizer keywordTokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.KEYWORD, false);
    WordDelimiterFilter filter = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        keywordTokenizer,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    AssertTokenStreamContents(filter, output);
}
/// <summary>
/// Creates a concatenation that keeps a reference to the given filter.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> stored in <c>outerInstance</c> </param>
public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
    => this.outerInstance = outerInstance;
/// <summary>
/// Creates a sorter that keeps a reference to the given filter.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> stored in <c>outerInstance</c> </param>
public OffsetSorter(WordDelimiterFilter outerInstance)
    => this.outerInstance = outerInstance;
/// <summary>
/// Initializes the sorter by storing the supplied filter in <c>outerInstance</c>.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to remember </param>
public OffsetSorter(WordDelimiterFilter outerInstance)
    => this.outerInstance = outerInstance;
/// <summary>
/// Initializes the concatenation by storing the supplied filter in <c>outerInstance</c>.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to remember </param>
public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
    => this.outerInstance = outerInstance;
/// <summary>
/// Builds the analysis chain: a <c>WikipediaTokenizer</c> over <paramref name="reader"/>, wrapped by a
/// <c>SopTokenFilter</c>, then a <c>WordDelimiterFilter</c>, then a second <c>SopTokenFilter</c>.
/// </summary>
/// <param name="fieldName"> Field name; not used by this implementation </param>
/// <param name="reader"> Source of the text to tokenize </param>
/// <returns> Components pairing the tokenizer with the outermost filter </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new WikipediaTokenizer(reader);

    TokenStream beforeDelimiter = new SopTokenFilter(source);
    // NOTE(review): the flags value -50 is passed through verbatim; presumably a deliberately
    // unusual bit pattern for this test — confirm against the test's intent.
    TokenStream delimited = new WordDelimiterFilter(TEST_VERSION_CURRENT, beforeDelimiter, table, -50, protWords);
    TokenStream afterDelimiter = new SopTokenFilter(delimited);

    return new TokenStreamComponents(source, afterDelimiter);
}