public override bool accept(AttributeSource source)
{
    if (termAtt == null)
    {
        termAtt = source.addAttribute(typeof(CharTermAttribute));
    }
    try
    {
        // We don't care about the date itself, only whether the token text parses as one.
        // DateTime is a value type, so reaching the next statement means the parse succeeded.
        dateFormat.parse(termAtt.ToString());
        return true;
    }
    catch (ParseException)
    {
        // Not parseable as a date; reject the token.
    }
    return false;
}
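// Illustrative sketch (not part of the original source): in idiomatic .NET the try/catch parse above can be
// expressed with DateTime.TryParse, which avoids throwing on non-date tokens. The class and member names
// below (CultureAwareDateAcceptor, culture, IsDate) are hypothetical; only BCL types are used.
using System;
using System.Globalization;

internal sealed class CultureAwareDateAcceptor
{
    private readonly CultureInfo culture;

    public CultureAwareDateAcceptor(CultureInfo culture)
    {
        this.culture = culture ?? CultureInfo.InvariantCulture;
    }

    // Returns true if the token text parses as a date in the configured culture.
    public bool IsDate(string tokenText)
    {
        DateTime ignored;
        return DateTime.TryParse(tokenText, culture, DateTimeStyles.None, out ignored);
    }
}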
/// <summary>
/// Fills CharTermAttribute with the current token text.
/// </summary>
public void getText(CharTermAttribute t)
{
    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
}
public virtual void testRandom()
{
    int alphabetSize = TestUtil.Next(random(), 2, 7);

    int docLen = atLeast(3000);
    //final int docLen = 50;

    string document = getRandomString('a', alphabetSize, docLen);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: doc=" + document);
    }

    int numSyn = atLeast(5);
    //final int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new List<OneSyn>();
    bool dedup = random().nextBoolean();
    if (VERBOSE)
    {
        Console.WriteLine(" dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++)
    {
        string synIn = getRandomString('a', alphabetSize, TestUtil.Next(random(), 1, 5)).Trim();
        OneSyn s;
        // Dictionary's indexer throws on a missing key, so probe with TryGetValue instead.
        if (!synMap.TryGetValue(synIn, out s))
        {
            s = new OneSyn();
            s.@in = synIn;
            syns.Add(s);
            s.@out = new List<string>();
            synMap[synIn] = s;
            s.keepOrig = random().nextBoolean();
        }
        string synOut = getRandomString('0', 10, TestUtil.Next(random(), 1, 5)).Trim();
        s.@out.Add(synOut);
        add(synIn, synOut, s.keepOrig);
        if (VERBOSE)
        {
            Console.WriteLine(" syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
        }
    }

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (dedup)
    {
        pruneDups(syns);
    }

    string expected = slowSynMatcher(document, syns, 5);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }

    verify(document, expected);
}
private void assertEquals(CharTermAttribute term, string expected)
{
    assertEquals(expected.Length, term.length());
    char[] buffer = term.buffer();
    for (int chIDX = 0; chIDX < expected.Length; chIDX++)
    {
        assertEquals(expected[chIDX], buffer[chIDX]);
    }
}
public virtual void testBasic2()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
    add("bbb", "bbbb1 bbbb2", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (keepOrig)
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    else
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
}
public virtual void testOutputHangsOffEnd()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    // b hangs off the end (no input token under it):
    add("a", "a b", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));

    // Make sure endOffset inherits from previous input token:
    verify("a", "a b:1");
}
public virtual void testBasic()
{
    b = new SynonymMap.Builder(true);
    add("a", "foo", true);
    add("a b", "bar fee", true);
    add("b c", "dog collar", true);
    add("c d", "dog harness holder extras", true);
    add("m c e", "dog barks loudly", false);
    add("i j k", "feep", true);

    add("e f", "foo bar", false);
    add("e f", "baz bee", false);

    add("z", "boo", false);
    add("y", "bee", true);

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    // Expected-string notation: tokens that share a position are separated by '/',
    // ':N' asserts a token's end offset (see "b:1" above), and '_N' its position length
    // (see "feep:7_3" below).
    verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    verify("a m c e x", "a/foo dog barks loudly x");
    verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.CaptureCount > 0);

    // no captureStates when no syns matched
    verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.CaptureCount);

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
    verify("p q z y t", "p q boo y/bee t");
    assertEquals(0, tokensOut.CaptureCount);
}
public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
{
    this.outerInstance = outerInstance;
    this.attSource = attSource;
    this.termAtt = attSource.getAttribute(typeof(CharTermAttribute));
    this.offsetAtt = attSource.getAttribute(typeof(OffsetAttribute));
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *    b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool incrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        // (replacement is a .NET enumerator, so probe it with MoveNext/Current.)
        if (replacement != null && replacement.MoveNext())
        {
            copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
        {
            return false;
        }

        CharTermAttribute termAtt = firstTok.addAttribute(typeof(CharTermAttribute));
        SlowSynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
        if (result == null)
        {
            copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = cloneAttributes();
        }

        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();

        result = match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.includeOrig();

        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.synonyms.Length; i++)
        {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                // LinkedList<T>.RemoveFirst() returns void in .NET, so read First.Value before removing.
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.RemoveFirst();
                }
                if (origTok != null)
                {
                    origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                origTok = matched.First.Value;
                matched.RemoveFirst();
            }
            if (origTok != null)
            {
                origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
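// Worked example of the merge above (illustrative only): given a rule "a b => foo" with includeOrig=true
// and the input "x a b c" (every token posInc=1), match() leaves firstTok="a" (origPos=1) and matched=["b"].
// The single replacement "foo" gets repPos=1, so the inner loop first emits the original "a" (posInc 1,
// pos becomes 1), then "foo" with posInc = repPos - pos = 0, i.e. stacked on the same position as "a" and
// spanning the offsets of "a".."b". The leftover-original loop then emits "b" with posInc 1, and "c" follows
// unchanged from the input stream, yielding: x a/foo b c.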
public override bool accept(AttributeSource a)
{
    CharTermAttribute termAtt = a.getAttribute(typeof(CharTermAttribute));
    return termAtt.ToString().Equals("Dogs", StringComparison.CurrentCultureIgnoreCase);
}
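// Illustrative wiring for the acceptor above (a sketch, not part of the original source). It mirrors the
// TeeSinkTokenFilter usage in performance() below; the DogSinkFilter class name, tokenizer choice, and
// input text are assumptions made for the example.
//
//   TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(
//       new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader("Dogs bark, dogs howl, cats ignore dogs")));
//   TokenStream dogSink = teeStream.newSinkTokenStream(new DogSinkFilter()); // hypothetical class holding accept()
//   teeStream.consumeAllTokens();                                            // drive the source; the sink buffers accepted states
//   CharTermAttribute term = dogSink.addAttribute(typeof(CharTermAttribute));
//   while (dogSink.incrementToken())
//   {
//       Console.WriteLine(term.ToString()); // only tokens matching "Dogs" (case-insensitive) appear here
//   }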
/// <summary>
/// Not an explicit test, just useful to print out some info on performance
/// </summary>
public virtual void performance()
{
    int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.Length; k++)
    {
        StringBuilder buffer = new StringBuilder();
        Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
        }
        //make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
        TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, 100));
        teeStream.consumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
        CharTermAttribute tfTok = stream.addAttribute(typeof(CharTermAttribute));
        CharTermAttribute sinkTok = sink.addAttribute(typeof(CharTermAttribute));
        for (int i = 0; stream.incrementToken(); i++)
        {
            assertTrue(sink.incrementToken());
            assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok));
        }

        //simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = DateTimeHelperClass.CurrentUnixTimeMillis();
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                PositionIncrementAttribute posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute));
                while (stream.incrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                posIncrAtt = stream.getAttribute(typeof(PositionIncrementAttribute));
                while (stream.incrementToken())
                {
                    tfPos += posIncrAtt.PositionIncrement;
                }
            }
            long finish = DateTimeHelperClass.CurrentUnixTimeMillis();
            Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            int sinkPos = 0;
            //simulate one field with one sink
            start = DateTimeHelperClass.CurrentUnixTimeMillis();
            for (int i = 0; i < 20; i++)
            {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(typeof(PositionIncrementAttribute));
                while (teeStream.incrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
                //System.out.println("Modulo--------");
                posIncrAtt = sink.getAttribute(typeof(PositionIncrementAttribute));
                while (sink.incrementToken())
                {
                    sinkPos += posIncrAtt.PositionIncrement;
                }
            }
            finish = DateTimeHelperClass.CurrentUnixTimeMillis();
            Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
        }
        Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}