// ================================================= Helper Methods ================================================

/// <summary>
/// Decides whether moving from a character of type <paramref name="lastType"/> to a character of
/// type <paramref name="type"/> marks a subword boundary.
/// </summary>
/// <param name="lastType"> Type of the previous character (compared bitwise) </param>
/// <param name="type"> Type of the current character (compared bitwise) </param>
/// <returns> <c>true</c> if the transition indicates a break, <c>false</c> otherwise </returns>
private bool isBreak(int lastType, int type)
{
    // The two types share at least one bit: never a break.
    if ((type & lastType) != 0)
    {
        return false;
    }

    // ALPHA -> ALPHA: ignored entirely when case changes are not considered.
    if (!splitOnCaseChange && WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isAlpha(type))
    {
        return false;
    }

    // UPPER -> letter: don't split.
    if (WordDelimiterFilter.isUpper(lastType) && WordDelimiterFilter.isAlpha(type))
    {
        return false;
    }

    // ALPHA -> NUMERIC or NUMERIC -> ALPHA: don't split unless splitting on numerics is enabled.
    if (!splitOnNumerics
        && ((WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isDigit(type))
            || (WordDelimiterFilter.isDigit(lastType) && WordDelimiterFilter.isAlpha(type))))
    {
        return false;
    }

    // Any other transition is a break.
    return true;
}
/// <summary>
/// Feeds <paramref name="input"/> through a <c>MockTokenizer</c> (KEYWORD mode) wrapped in a
/// <c>WordDelimiterFilter</c> and asserts that the produced tokens equal <paramref name="output"/>.
/// The filter is configured to generate word and number parts, split on case changes and numerics,
/// and stem English possessives, using the iterator's default delimiter table.
/// </summary>
/// <param name="input"> Text handed to the tokenizer </param>
/// <param name="output"> Expected token texts, in order </param>
public virtual void doSplit(string input, params string[] output)
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    StringReader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);

    // The final argument (null) is passed through unchanged from the original call.
    WordDelimiterFilter wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        tokenizer,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    assertTokenStreamContents(wdf, output);
}
/// <summary>
/// Verifies the offsets reported when the single input token "(übelkeit" (offsets 7..17) is filtered:
/// the only expected token is "übelkeit" with start offset 8 and end offset 17.
/// </summary>
// NOTE(review): the Java original was annotated with @Test; the converter did not emit an equivalent
// attribute here — confirm how the test runner is expected to discover this method.
public virtual void testOffsetChange2()
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | CATENATE_ALL
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    Token sourceToken = new Token("(übelkeit", 7, 17);
    SingleTokenTokenStream source = new SingleTokenTokenStream(sourceToken);
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, source, DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] expectedTerms = new string[] { "übelkeit" };
    int[] expectedStartOffsets = new int[] { 8 };
    int[] expectedEndOffsets = new int[] { 17 };

    assertTokenStreamContents(wdf, expectedTerms, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// Feeds <paramref name="input"/> through a <c>MockTokenizer</c> (KEYWORD mode) wrapped in a
/// <c>WordDelimiterFilter</c> and asserts that the produced tokens equal <paramref name="output"/>.
/// The filter is configured to generate word and number parts, split on case changes and numerics,
/// and stem English possessives, using the iterator's default delimiter table.
/// </summary>
/// <param name="input"> Text handed to the tokenizer </param>
/// <param name="output"> Expected token texts, in order </param>
public virtual void doSplit(string input, params string[] output)
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    StringReader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);

    // The final argument (null) is passed through unchanged from the original call.
    WordDelimiterFilter wdf = new WordDelimiterFilter(
        TEST_VERSION_CURRENT,
        tokenizer,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
        flags,
        null);

    assertTokenStreamContents(wdf, output);
}
/// <summary>
/// Feeds <paramref name="input"/> through a <c>MockTokenizer</c> (KEYWORD mode) wrapped in a
/// <c>WordDelimiterFilter</c> and asserts that the produced tokens equal <paramref name="output"/>.
/// English possessive stemming is switched on only when <paramref name="stemPossessive"/> is 1.
/// </summary>
/// <param name="stemPossessive"> 1 to add the STEM_ENGLISH_POSSESSIVE flag; any other value leaves it off </param>
/// <param name="input"> Text handed to the tokenizer </param>
/// <param name="output"> Expected token texts, in order </param>
public virtual void doSplitPossessive(int stemPossessive, string input, params string[] output)
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS;

    if (stemPossessive == 1)
    {
        flags |= STEM_ENGLISH_POSSESSIVE;
    }

    StringReader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);

    // This constructor overload takes no explicit delimiter table.
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null);

    assertTokenStreamContents(wdf, output);
}
/// <summary>
/// Feeds <paramref name="input"/> through a <c>MockTokenizer</c> (KEYWORD mode) wrapped in a
/// <c>WordDelimiterFilter</c> and asserts that the produced tokens equal <paramref name="output"/>.
/// English possessive stemming is switched on only when <paramref name="stemPossessive"/> is 1.
/// </summary>
/// <param name="stemPossessive"> 1 to add the STEM_ENGLISH_POSSESSIVE flag; any other value leaves it off </param>
/// <param name="input"> Text handed to the tokenizer </param>
/// <param name="output"> Expected token texts, in order </param>
public virtual void doSplitPossessive(int stemPossessive, string input, params string[] output)
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS;

    if (stemPossessive == 1)
    {
        flags |= STEM_ENGLISH_POSSESSIVE;
    }

    StringReader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);

    // This constructor overload takes no explicit delimiter table.
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, tokenizer, flags, null);

    assertTokenStreamContents(wdf, output);
}
// Disabled performance benchmark carried over (commented out) from the Java original:
//
// public void testPerformance() throws IOException {
//   String s = "now is the time-for all good men to come to-the aid of their country.";
//   Token tok = new Token();
//   long start = System.currentTimeMillis();
//   int ret=0;
//   for (int i=0; i<1000000; i++) {
//     StringReader r = new StringReader(s);
//     TokenStream ts = new WhitespaceTokenizer(r);
//     ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
//
//     while (ts.next(tok) != null) ret++;
//   }
//
//   System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
// }

/// <summary>
/// Tests that subwords and catenated subwords have the correct offsets.
/// Case 1: token "foo-bar" at offsets 5..12 must yield "foo" (5..8), "foobar" (5..12), "bar" (9..12).
/// Case 2: token "foo-bar" declared at offsets 5..6 must yield "foo", "bar", "foobar", each at 5..6.
/// </summary>
// NOTE(review): the Java original was annotated with @Test; the converter did not emit an equivalent
// attribute here — confirm how the test runner is expected to discover this method.
public virtual void testOffsets()
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | CATENATE_ALL
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    // Case 1: token declared at offsets 5..12.
    Token firstToken = new Token("foo-bar", 5, 12);
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(firstToken), DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] firstTerms = new string[] { "foo", "foobar", "bar" };
    int[] firstStarts = new int[] { 5, 5, 9 };
    int[] firstEnds = new int[] { 8, 12, 12 };
    assertTokenStreamContents(wdf, firstTerms, firstStarts, firstEnds);

    // Case 2: same text, token declared at offsets 5..6.
    Token secondToken = new Token("foo-bar", 5, 6);
    wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(secondToken), DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] secondTerms = new string[] { "foo", "bar", "foobar" };
    int[] secondStarts = new int[] { 5, 5, 5 };
    int[] secondEnds = new int[] { 6, 6, 6 };
    assertTokenStreamContents(wdf, secondTerms, secondStarts, secondEnds);
}
/// <summary>
/// Sets the internal word bounds by moving <c>startBounds</c> past leading subword delimiters and
/// <c>endBounds</c> back over trailing ones. A possessive found at the end is not removed here;
/// it is only recorded in <c>hasFinalPossessive</c>. Finally positions <c>current</c> at <c>startBounds</c>.
/// </summary>
private void setBounds()
{
    // Advance the start bound past leading delimiters.
    for (; startBounds < length; startBounds++)
    {
        if (!WordDelimiterFilter.isSubwordDelim(charType(text[startBounds])))
        {
            break;
        }
    }

    // Pull the end bound back over trailing delimiters, never crossing the start bound.
    for (; endBounds > startBounds; endBounds--)
    {
        if (!WordDelimiterFilter.isSubwordDelim(charType(text[endBounds - 1])))
        {
            break;
        }
    }

    // Only ever set the flag here; it is never cleared by this method.
    if (endsWithPossessive(endBounds))
    {
        hasFinalPossessive = true;
    }

    current = startBounds;
}
/// <summary>
/// Advances to the next subword in the string. On return <c>current</c> is the start of the subword
/// and <c>end</c> (the return value) is the index just past it.
/// </summary>
/// <returns> End index of the next subword, or <see cref="DONE"/> if all subwords have been returned </returns>
internal int next()
{
    // Resume where the previous subword ended.
    current = end;
    if (current == DONE)
    {
        return DONE;
    }

    // The previous call flagged a possessive directly after its subword: step over those two characters.
    if (skipPossessive)
    {
        current += 2;
        skipPossessive = false;
    }

    // Skip delimiters; previousType ends up holding the type of the first non-delimiter character.
    int previousType = 0;
    while (current < endBounds)
    {
        previousType = charType(text[current]);
        if (!WordDelimiterFilter.isSubwordDelim(previousType))
        {
            break;
        }
        current++;
    }

    // Nothing but delimiters remained.
    if (current >= endBounds)
    {
        end = DONE;
        return DONE;
    }

    // Extend the subword until a type transition counts as a break or the bounds are reached.
    end = current + 1;
    while (end < endBounds)
    {
        int currentType = charType(text[end]);
        if (isBreak(previousType, currentType))
        {
            break;
        }
        previousType = currentType;
        end++;
    }

    // Remember to skip a possessive that immediately follows this subword on the next call.
    bool possessiveFollows = end < endBounds - 1 && endsWithPossessive(end + 2);
    if (possessiveFollows)
    {
        skipPossessive = true;
    }

    return end;
}
// Disabled performance benchmark carried over (commented out) from the Java original:
//
// public void testPerformance() throws IOException {
//   String s = "now is the time-for all good men to come to-the aid of their country.";
//   Token tok = new Token();
//   long start = System.currentTimeMillis();
//   int ret=0;
//   for (int i=0; i<1000000; i++) {
//     StringReader r = new StringReader(s);
//     TokenStream ts = new WhitespaceTokenizer(r);
//     ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
//
//     while (ts.next(tok) != null) ret++;
//   }
//
//   System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
// }

/// <summary>
/// Tests that subwords and catenated subwords have the correct offsets.
/// Case 1: token "foo-bar" at offsets 5..12 must yield "foo" (5..8), "foobar" (5..12), "bar" (9..12).
/// Case 2: token "foo-bar" declared at offsets 5..6 must yield "foo", "bar", "foobar", each at 5..6.
/// </summary>
// NOTE(review): the Java original was annotated with @Test; the converter did not emit an equivalent
// attribute here — confirm how the test runner is expected to discover this method.
public virtual void testOffsets()
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | CATENATE_ALL
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    // Case 1: token declared at offsets 5..12.
    Token firstToken = new Token("foo-bar", 5, 12);
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(firstToken), DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] firstTerms = new string[] {"foo", "foobar", "bar"};
    int[] firstStarts = new int[] {5, 5, 9};
    int[] firstEnds = new int[] {8, 12, 12};
    assertTokenStreamContents(wdf, firstTerms, firstStarts, firstEnds);

    // Case 2: same text, token declared at offsets 5..6.
    Token secondToken = new Token("foo-bar", 5, 6);
    wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, new SingleTokenTokenStream(secondToken), DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] secondTerms = new string[] {"foo", "bar", "foobar"};
    int[] secondStarts = new int[] {5, 5, 5};
    int[] secondEnds = new int[] {6, 6, 6};
    assertTokenStreamContents(wdf, secondTerms, secondStarts, secondEnds);
}
/// <summary>
/// Verifies the offsets reported when the single input token "(foo,bar)" (offsets 7..16) is filtered:
/// expected tokens are "foo" (8..11), "foobar" (8..15) and "bar" (12..15).
/// </summary>
// NOTE(review): the Java original was annotated with @Test; the converter did not emit an equivalent
// attribute here — confirm how the test runner is expected to discover this method.
public virtual void testOffsetChange4()
{
    int flags = GENERATE_WORD_PARTS
              | GENERATE_NUMBER_PARTS
              | CATENATE_ALL
              | SPLIT_ON_CASE_CHANGE
              | SPLIT_ON_NUMERICS
              | STEM_ENGLISH_POSSESSIVE;

    Token sourceToken = new Token("(foo,bar)", 7, 16);
    SingleTokenTokenStream source = new SingleTokenTokenStream(sourceToken);
    WordDelimiterFilter wdf = new WordDelimiterFilter(TEST_VERSION_CURRENT, source, DEFAULT_WORD_DELIM_TABLE, flags, null);

    string[] expectedTerms = new string[] {"foo", "foobar", "bar"};
    int[] expectedStartOffsets = new int[] {8, 8, 12};
    int[] expectedEndOffsets = new int[] {11, 15, 15};

    assertTokenStreamContents(wdf, expectedTerms, expectedStartOffsets, expectedEndOffsets);
}
/// <summary>
/// Determines whether the text ending at <paramref name="pos"/> is an English possessive
/// (a letter, an apostrophe, then 's' or 'S') that should be removed.
/// </summary>
/// <param name="pos"> Position in the text just past the candidate possessive </param>
/// <returns> <c>true</c> if the text at the position indicates an English possessive, <c>false</c> otherwise </returns>
private bool endsWithPossessive(int pos)
{
    // Feature disabled, or not enough characters before pos for letter + apostrophe + s.
    if (!stemEnglishPossessive || pos <= 2)
    {
        return false;
    }

    // Second-to-last character must be an apostrophe.
    if (text[pos - 2] != '\'')
    {
        return false;
    }

    // Last character must be 's' in either case.
    if (text[pos - 1] != 's' && text[pos - 1] != 'S')
    {
        return false;
    }

    // The apostrophe must follow an alphabetic character.
    if (!WordDelimiterFilter.isAlpha(charType(text[pos - 3])))
    {
        return false;
    }

    // The possessive must sit at the end of the bounds or be followed by a subword delimiter.
    return pos == endBounds || WordDelimiterFilter.isSubwordDelim(charType(text[pos]));
}
/// <summary>
/// Creates the instance and stores the supplied filter in the <c>outerInstance</c> field.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to keep a reference to
/// (presumably the enclosing filter of this converted Java inner class — confirm) </param>
public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}
/// <summary>
/// Creates the instance and stores the supplied filter in the <c>outerInstance</c> field.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to keep a reference to
/// (presumably the enclosing filter of this converted Java inner class — confirm) </param>
public OffsetSorter(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}
/// <summary>
/// Creates the instance and stores the supplied filter in the <c>outerInstance</c> field.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to keep a reference to
/// (presumably the enclosing filter of this converted Java inner class — confirm) </param>
public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}
/// <summary>
/// Creates the instance and stores the supplied filter in the <c>outerInstance</c> field.
/// </summary>
/// <param name="outerInstance"> The <c>WordDelimiterFilter</c> to keep a reference to
/// (presumably the enclosing filter of this converted Java inner class — confirm) </param>
public OffsetSorter(WordDelimiterFilter outerInstance)
{
    this.outerInstance = outerInstance;
}