public void TestOffsetsWithTokenizer()
{
    const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";
    Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));
    OffsetAttribute att = (OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute));

    t.IncrementToken();
    Assert.AreEqual(0, att.StartOffset());
    Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

    t.IncrementToken();
    Assert.AreEqual(20, att.StartOffset());
    Assert.AreEqual(8, att.EndOffset() - att.StartOffset());

    t.IncrementToken();
    Assert.AreEqual(33, att.StartOffset());
    Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

    t.IncrementToken();
    Assert.AreEqual(39, att.StartOffset());
    Assert.AreEqual(5, att.EndOffset() - att.StartOffset());
}
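For reference, the asserted offsets map back into the raw HTML input as follows; this is a hand-computed index map, not part of the original test:

// Index map for: test1 <a href="foo">testlink</a> test2 test3
// "test1"    -> start 0,  length 5
// "testlink" -> start 20, length 8   (the <a href="foo"> tag occupies chars 6-19)
// "test2"    -> start 33, length 5   (</a> occupies chars 28-31, space at 32)
// "test3"    -> start 39, length 5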
public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
{
    this.outerInstance = outerInstance;
    this.attSource = attSource;
    this.termAtt = attSource.GetAttribute(typeof(CharTermAttribute));
    this.offsetAtt = attSource.GetAttribute(typeof(OffsetAttribute));
}
public SingleCharTokenizer(TokenStream input) : base(input)
{
    _input = input;
    _termAttribute = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _offsetAttribute = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _positionIncrementAttribute = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
}
public T ReadMetadata<T>()
{
    using (var reader = CreateReader())
    {
        var header = new MetadataHeader(reader); // MetadataHeader self-registers to reader
        using (var vreader = (DependencyReader)reader.CreateVirtualReader(header.Header.HeaderSize))
        {
            var mainBlock = header.StructureDefinitions.First(s => s.Type == StructureType.Main).TargetIndex;
            vreader.Seek(header.DataBlocks[mainBlock].Offset, SeekOrigin.Begin);

            var result = vreader.ReadObject<T>();

            var blockProps = typeof(T).GetProperties()
                .Where(p => p.PropertyType.IsGenericType && p.PropertyType.GetGenericTypeDefinition() == typeof(BlockCollection<>));

            foreach (var prop in blockProps)
            {
                var collection = prop.GetValue(result) as IBlockCollection;
                var offset = OffsetAttribute.ValueFor(prop);
                collection.LoadBlocks(mainBlock, offset, vreader);
            }

            return result;
        }
    }
}
// we only check a few core attributes here.
// TODO: test other things
public virtual void assertEquals(string s, TokenStream left, TokenStream right)
{
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(typeof(CharTermAttribute));
    CharTermAttribute rightTerm = right.addAttribute(typeof(CharTermAttribute));
    OffsetAttribute leftOffset = left.addAttribute(typeof(OffsetAttribute));
    OffsetAttribute rightOffset = right.addAttribute(typeof(OffsetAttribute));
    PositionIncrementAttribute leftPos = left.addAttribute(typeof(PositionIncrementAttribute));
    PositionIncrementAttribute rightPos = right.addAttribute(typeof(PositionIncrementAttribute));

    while (left.incrementToken())
    {
        assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
        assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
        assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
        assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
        assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());

    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    left.close();
    right.close();
}
public LuceneTokenizerAdapter(Tokenizer tokenizer)
{
    this.tokenizer = tokenizer;
    this.termAttr = (CharTermAttribute)this.tokenizer.GetAttribute<ICharTermAttribute>();
    this.offsetAttr = (OffsetAttribute)this.tokenizer.GetAttribute<IOffsetAttribute>();
}
// LUCENE-3642: normalize BMP->SMP and check that offsets are correct
public virtual void testCrossPlaneNormalization2()
{
    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.randomUnicodeString(random());
        TokenStream ts = analyzer.tokenStream("foo", s);
        try
        {
            ts.reset();
            OffsetAttribute offsetAtt = ts.addAttribute(typeof(OffsetAttribute));
            while (ts.incrementToken())
            {
                string highlightedText = StringHelperClass.SubstringSpecial(s, offsetAtt.startOffset(), offsetAtt.endOffset());
                // walk the highlighted text one code point at a time
                // (surrogate pairs advance by two UTF-16 units)
                for (int j = 0; j < highlightedText.Length; j += char.IsSurrogatePair(highlightedText, j) ? 2 : 1)
                {
                    int cp = char.ConvertToUtf32(highlightedText, j);
                    assertTrue("non-letter:" + cp.ToString("x"), char.IsLetter(highlightedText, j));
                }
            }
            ts.end();
        }
        finally
        {
            IOUtils.closeWhileHandlingException(ts);
        }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
}
public BoostingTokenFilter(TokenStream input, int[] startOffsets, float[] boosts) : base(input)
{
    _startOffsets = startOffsets;
    _boosts = boosts;
    _offsetAttr = (OffsetAttribute)base.addAttribute(typeof(OffsetAttribute));
    _payloadAttr = (PayloadAttribute)base.addAttribute(typeof(PayloadAttribute));
}
public override void reset()
{
    base.reset();
    hasMoreTokensInClone = false;
    clonedToken = null;
    clonedTermAtt = null;
    clonedOffsetAtt = null;
}
public void TestOffsetValueExpression()
{
    Assert.AreEqual(0, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property1));
    Assert.AreEqual(10, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property2, 15));
    Assert.AreEqual(15, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property2, 25));
    Assert.AreEqual(20, OffsetAttribute.ValueFor((DataClass01 obj) => obj.Property3, 15));
}
private static IntPtr ParseField(FieldInfo field, PatternFinder pf)
{
    OffsetAttribute offset = (OffsetAttribute)Attribute.GetCustomAttributes(field, typeof(OffsetAttribute)).FirstOrDefault();
    OffsetValueCN valcn = (OffsetValueCN)Attribute.GetCustomAttributes(field, typeof(OffsetValueCN)).FirstOrDefault();
    OffsetValueNA valna = (OffsetValueNA)Attribute.GetCustomAttributes(field, typeof(OffsetValueNA)).FirstOrDefault();
    IntPtr result = IntPtr.Zero;

    if (Constants.Lang == Language.Chn)
    {
        // a fixed value attribute short-circuits the pattern scan
        if (valcn != null)
        {
            return (IntPtr)valcn.Value;
        }
        if (offset == null)
        {
            return IntPtr.Zero;
        }
        bool b1 = true;
        IntPtr[] results = pf.FindMany(offset.PatternCN, ref b1);
        if (results != null)
        {
            result = results[0];
        }
    }
    else
    {
        if (valna != null)
        {
            return (IntPtr)valna.Value;
        }
        if (offset == null)
        {
            return IntPtr.Zero;
        }
        bool b1 = true;
        IntPtr[] results = pf.FindMany(offset.Pattern, ref b1);
        if (results != null)
        {
            result = results[0];
        }
    }

    Logger.Info("[OffsetManager][{0,27}] {1}", field.Name, result.ToString("X"));
    return result;
}
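For context, this lookup implies field declarations roughly like the sketch below. The attribute constructors, property names, and byte patterns are purely illustrative assumptions; the real signature table is not part of this section:

// Illustrative only: hypothetical fields that ParseField would resolve.
[Offset("48 8B 0D ?? ?? ?? ??", PatternCN = "48 8B 0D ?? ?? ?? ?? 48 85 C9")]
private static IntPtr SomePointer;       // resolved by a per-region pattern scan

[OffsetValueNA(0x01234567)]              // fixed values bypass the scan entirely
[OffsetValueCN(0x89ABCDEF)]
private static IntPtr SomeOtherPointer;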
public void SetTokenStream(TokenStream ts)
{
    this.ts = ts;
    if (this.ts.HasAttribute<IOffsetAttribute>())
    {
        this.offsetAttr = (OffsetAttribute)this.ts.GetAttribute<IOffsetAttribute>();
    }
    if (this.ts.HasAttribute<ICharTermAttribute>())
    {
        this.termAttr = (CharTermAttribute)this.ts.GetAttribute<ICharTermAttribute>();
    }
}
private void Init(int gramSize)
{
    if (gramSize < 1)
    {
        throw new ArgumentException("minGram must be greater than zero");
    }
    _mGramSize = gramSize;
    _mTermAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _mOffsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
public IntMetaDataTokenStream(string tokenText)
{
    _tokenText = tokenText;

    // NOTE: Calling the AddAttribute<T> method failed, so
    // switched to using AddAttributeImpl.
    _termAttribute = new TermAttribute();
    _offsetAttribute = new OffsetAttribute();
    _payloadAtt = new PayloadAttribute();
    base.AddAttributeImpl(_termAttribute);
    base.AddAttributeImpl(_offsetAttribute);
    base.AddAttributeImpl(_payloadAtt);
}
public void TestOffsetValue()
{
    var prop1 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property1));
    var prop2 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property2));
    var prop3 = typeof(DataClass01).GetProperty(nameof(DataClass01.Property3));

    Assert.AreEqual(0, OffsetAttribute.ValueFor(prop1));
    Assert.AreEqual(10, OffsetAttribute.ValueFor(prop2, 15));
    Assert.AreEqual(15, OffsetAttribute.ValueFor(prop2, 25));
    Assert.AreEqual(20, OffsetAttribute.ValueFor(prop3, 15));
}
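Neither this test nor the expression-based one above shows the attribute itself. The following is a minimal sketch that would satisfy both, assuming a versioned layout attribute where a property carries one or more [Offset] annotations bounded by a version range; every name and the version semantics here are assumptions, not the library's actual API:

// Hypothetical sketch only, not the library's real code.
using System;
using System.Linq;
using System.Linq.Expressions;
using System.Reflection;

[AttributeUsage(AttributeTargets.Property, AllowMultiple = true)]
public sealed class OffsetAttribute : Attribute
{
    public long Offset { get; }
    public double MinVersion { get; set; } = double.MinValue;
    public double MaxVersion { get; set; } = double.MaxValue;

    public OffsetAttribute(long offset) { Offset = offset; }

    // Pick the annotation whose [MinVersion, MaxVersion) range covers 'version'.
    public static long ValueFor(PropertyInfo prop, double version = 0)
    {
        return prop.GetCustomAttributes<OffsetAttribute>()
                   .First(a => version >= a.MinVersion && version < a.MaxVersion)
                   .Offset;
    }

    // Expression overload so callers can write strongly typed selectors.
    public static long ValueFor<TSource, TProp>(Expression<Func<TSource, TProp>> selector, double version = 0)
    {
        var member = (MemberExpression)selector.Body;
        return ValueFor((PropertyInfo)member.Member, version);
    }
}

// A DataClass01 shaped like this would produce the asserted values:
public class DataClass01
{
    [Offset(0)] public int Property1 { get; set; }
    [Offset(10, MaxVersion = 20)]
    [Offset(15, MinVersion = 20)] public int Property2 { get; set; }
    [Offset(20)] public long Property3 { get; set; }
}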
/**
 * Creates NGramTokenFilter with given min and max n-grams.
 * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
 * <param name="minGram">the smallest n-gram to generate</param>
 * <param name="maxGram">the largest n-gram to generate</param>
 */
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input)
{
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/**
 * Creates NGramTokenFilter with given min and max n-grams.
 * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
 * <param name="minGram">the smallest n-gram to generate</param>
 * <param name="maxGram">the largest n-gram to generate</param>
 */
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input)
{
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    _minGram = minGram;
    _maxGram = maxGram;
    _termAtt = (TermAttribute)AddAttribute<ITermAttribute>();
    _offsetAtt = (OffsetAttribute)AddAttribute<IOffsetAttribute>();
}
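A minimal usage sketch, assuming the Lucene.Net 3.x-style API used in the constructor above: emit all 1- and 2-grams of each whitespace-separated token.

// Usage sketch; not from the original source.
TokenStream ts = new NGramTokenFilter(
    new WhitespaceTokenizer(new StringReader("abc")), 1, 2);
var termAtt = ts.GetAttribute<ITermAttribute>();
var offsetAtt = ts.GetAttribute<IOffsetAttribute>();
while (ts.IncrementToken())
{
    Console.WriteLine("{0} [{1},{2})", termAtt.Term, offsetAtt.StartOffset, offsetAtt.EndOffset);
}
// Expected grams for "abc": a, b, c, ab, bc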
/**
 * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
 *
 * @param input {@link TokenStream} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) : base(input)
{
    if (minGram < 1)
    {
        throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
    termAtt = (TermAttribute)addAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)addAttribute(typeof(OffsetAttribute));
}
public virtual void testSupplementaryCharacters()
{
    string s = TestUtil.randomUnicodeString(random(), 10);
    // codePointCount/offsetByCodePoints below are Java idioms left by the
    // converter; a .NET port needs an equivalent code-point helper.
    int codePointCount = s.codePointCount(0, s.Length);
    int minGram = TestUtil.Next(random(), 1, 3);
    int maxGram = TestUtil.Next(random(), minGram, 10);

    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    CharTermAttribute termAtt = tk.addAttribute(typeof(CharTermAttribute));
    OffsetAttribute offsetAtt = tk.addAttribute(typeof(OffsetAttribute));
    tk.reset();

    for (int start = 0; start < codePointCount; ++start)
    {
        for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
        {
            assertTrue(tk.incrementToken());
            // NGramTokenFilter reports the offsets of the whole input token
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.Length, offsetAtt.endOffset());
            int startIndex = char.offsetByCodePoints(s, 0, start);
            int endIndex = char.offsetByCodePoints(s, 0, end);
            assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
        }
    }
    assertFalse(tk.incrementToken());
}
public virtual void testOtherLetterOffset()
{
    string s = "a天b";
    ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

    int correctStartOffset = 0;
    int correctEndOffset = 1;
    OffsetAttribute offsetAtt = tokenizer.getAttribute(typeof(OffsetAttribute));
    tokenizer.reset();
    while (tokenizer.incrementToken())
    {
        assertEquals(correctStartOffset, offsetAtt.startOffset());
        assertEquals(correctEndOffset, offsetAtt.endOffset());
        correctStartOffset++;
        correctEndOffset++;
    }
    tokenizer.end();
    tokenizer.close();
}
public virtual void testFilterTokens()
{
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English");
    CharTermAttribute termAtt = filter.getAttribute(typeof(CharTermAttribute));
    OffsetAttribute offsetAtt = filter.getAttribute(typeof(OffsetAttribute));
    TypeAttribute typeAtt = filter.getAttribute(typeof(TypeAttribute));
    PayloadAttribute payloadAtt = filter.getAttribute(typeof(PayloadAttribute));
    PositionIncrementAttribute posIncAtt = filter.getAttribute(typeof(PositionIncrementAttribute));
    FlagsAttribute flagsAtt = filter.getAttribute(typeof(FlagsAttribute));

    filter.incrementToken();

    assertEquals("accent", termAtt.ToString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.PositionIncrement);
    assertEquals(77, flagsAtt.Flags);
    assertEquals(new BytesRef(new sbyte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
internal void LoadBlocks(int currentBlock, long collectionOffset, DependencyReader reader)
{
    if (blockCount == 0)
    {
        return;
    }

    var structdef = metadata.StructureDefinitions.First(s => s.FieldBlock == currentBlock && s.FieldOffset == collectionOffset);
    if (structdef.TargetIndex < 0)
    {
        return;
    }

    var block = metadata.DataBlocks[structdef.TargetIndex];
    var blockSize = FixedSizeAttribute.ValueFor(typeof(T));

    reader.Seek(block.Offset, SeekOrigin.Begin);
    for (int i = 0; i < blockCount; i++)
    {
        Add(reader.ReadObject<T>());
    }

    var blockProps = typeof(T).GetProperties()
        .Where(p => p.PropertyType.IsGenericType && p.PropertyType.GetGenericTypeDefinition() == typeof(BlockCollection<>));

    int index = 0;
    foreach (var item in this)
    {
        var adjustedBase = blockSize * index++;
        foreach (var prop in blockProps)
        {
            var collection = prop.GetValue(item) as IBlockCollection;
            var offset = OffsetAttribute.ValueFor(prop);
            collection.LoadBlocks(structdef.TargetIndex, adjustedBase + offset, reader);
        }
    }
}
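LoadBlocks recurses through nested BlockCollection properties via an interface that this section never shows. Inferred from the two call sites (ReadMetadata above and LoadBlocks itself), it presumably has the following shape; the real declaration may differ:

// Inferred sketch, not the library's declared code.
internal interface IBlockCollection
{
    void LoadBlocks(int currentBlock, long collectionOffset, DependencyReader reader);
}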
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *    b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
        {
            return false;
        }

        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;
        if (result == null)
        {
            copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }

        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();

        result = match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0;    // current position in merged token stream

        for (int i = 0; i < result.synonyms.Length; i++)
        {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                // LinkedList<T>.RemoveFirst() returns void in .NET, so read
                // the head before removing it (Java removeFirst() semantics)
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.RemoveFirst();
                }
                if (origTok != null)
                {
                    origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                origTok = matched.First.Value;
                matched.RemoveFirst();
            }
            if (origTok != null)
            {
                origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.GetEnumerator();

        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
public override bool incrementToken()
{
    if (hasMoreTokensInClone)
    {
        int start = breaker.current();
        int end = breaker.next();
        if (end != BreakIterator.DONE)
        {
            clonedToken.copyTo(this);
            termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
            if (hasIllegalOffsets)
            {
                offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
            }
            else
            {
                offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
            }
            if (handlePosIncr)
            {
                posAtt.PositionIncrement = 1;
            }
            return true;
        }
        hasMoreTokensInClone = false;
    }

    if (!input.incrementToken())
    {
        return false;
    }

    if (termAtt.length() == 0 || char.UnicodeBlock.of(termAtt.charAt(0)) != char.UnicodeBlock.THAI)
    {
        return true;
    }

    hasMoreTokensInClone = true;

    // if length by start + end offsets doesn't match the term text then assume
    // this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

    // we lazy init the cloned token, as in ctor not all attributes may be added
    if (clonedToken == null)
    {
        clonedToken = cloneAttributes();
        clonedTermAtt = clonedToken.getAttribute(typeof(CharTermAttribute));
        clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
    }
    else
    {
        this.copyTo(clonedToken);
    }

    // reinit CharacterIterator
    charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
    breaker.Text = charIterator;
    int firstEnd = breaker.next();
    if (firstEnd != BreakIterator.DONE)
    {
        termAtt.Length = firstEnd;
        if (hasIllegalOffsets)
        {
            offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
        }
        else
        {
            offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + firstEnd);
        }
        // position increment keeps as it is for first token
        return true;
    }
    return false;
}
public override void CopyTo(IAttribute target)
{
    OffsetAttribute t = (OffsetAttribute)target;
    t.SetOffset(start, end);
}
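CopyTo is what lets AttributeSource state capture and restore move offsets between attribute instances without reflection. A direct-use sketch with hypothetical values (the instantiation pattern mirrors the btnAnalyze_Click snippet later in this section):

// Hypothetical direct use of the override above.
var src = new OffsetAttribute();
src.SetOffset(5, 11);   // token spans chars [5, 11)

var dst = new OffsetAttribute();
src.CopyTo(dst);        // dst now reports start 5, end 11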
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
{
    if (contexts != null)
    {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }

    TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
    try
    {
        TermToBytesRefAttribute termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
        OffsetAttribute offsetAtt = ts.AddAttribute<OffsetAttribute>();
        PositionLengthAttribute posLenAtt = ts.AddAttribute<PositionLengthAttribute>();
        PositionIncrementAttribute posIncAtt = ts.AddAttribute<PositionIncrementAttribute>();
        ts.Reset();

        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");

        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken())
        {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;

            Debug.Assert(gramCount <= grams);

            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount)
            {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();

        if (!sawRealToken)
        {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }

        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;

        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

        if (lastTokenEnded)
        {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens.  This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--)
            {
                BytesRef token = lastTokens[i - 1];
                if (token == null)
                {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }

        var arc = new FST.Arc<long?>();
        var bytesReader = fst.BytesReader;

        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;

        List<LookupResult> results = new List<LookupResult>(num);

        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();

        for (int gram = grams - 1; gram >= 0; gram--)
        {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0))
            {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }

            if (endPosInc > 0 && gram <= endPosInc)
            {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }

            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

            // TODO: we could add fuzziness here

            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            //System.out.println("  prefixOutput=" + prefixOutput);

            if (prefixOutput == null)
            {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }

            // TODO: we could do this division at build time, and
            // bake it into the FST?

            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;

            BytesRef lastTokenFragment = null;

            for (int i = token.Length - 1; i >= 0; i--)
            {
                if (token.Bytes[token.Offset + i] == separator)
                {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }

            BytesRef finalLastToken;
            if (lastTokenFragment == null)
            {
                finalLastToken = BytesRef.DeepCopyOf(token);
            }
            else
            {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);

            CharsRef spare = new CharsRef();

            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try
            {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model.  For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:

                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            }
            catch (IOException bogus)
            {
                throw new Exception(bogus.Message, bogus);
            }

            int prefixLength = token.Length;

            BytesRef suffix = new BytesRef(8);
            //System.out.println("  " + completions.length + " completions");

            foreach (Util.Fst.Util.Result<long?> completion in completions)
            {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);

                //System.out.println("    completion " + token.utf8ToString());

                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--)
                {
                    if (token.Bytes[token.Offset + i] == separator)
                    {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken))
                {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    continue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("      add result=" + result);
            }
            backoff *= ALPHA;
        }

        results.Sort(new ComparatorAnonymousInnerClassHelper(this));

        if (results.Count > num)
        {
            // chop off any hits beyond the requested count
            results.RemoveRange(num, results.Count - num);
        }

        return results;
    }
    finally
    {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
{
    Assert.IsNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

    Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
    TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
        offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
    }

    TypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
        typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
        posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.SetType("bogusType");
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.SetPositionIncrement(45987657);
        }

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
        }
    }
    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
    {
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
    }
    ts.Close();
}
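A usage sketch, assuming the 2.9-era Lucene.Net API this helper targets: verify a whitespace tokenization of "foo bar", including offsets and the final offset reported after End().

// Usage sketch; WhitespaceTokenizer stands in for any TokenStream under test.
TokenStream ts = new WhitespaceTokenizer(new StringReader("foo bar"));
AssertTokenStreamContents(ts,
    new[] { "foo", "bar" }, // expected terms
    new[] { 0, 4 },         // expected start offsets
    new[] { 3, 7 },         // expected end offsets
    null,                   // types: skip checking
    null,                   // position increments: skip checking
    7);                     // final end offset after ts.End()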
private void btnAnalyze_Click(object sender, System.EventArgs e)
{
    ShowError("");
    try
    {
        if (cmbAnalyzers.SelectedItem == null)
        {
            return;
        }

        Analyzer analyzer = null;
        try
        {
            // Trying to create type from executing assembly
            Type analyzerType = (Type)analyzers[cmbAnalyzers.SelectedItem];
            if (null == analyzerType)
            {
                // Trying to create type from Lucene.Net assembly
                Assembly a = Assembly.GetAssembly(typeof(Lucene.Net.Analysis.Analyzer));
                analyzerType = a.GetType((string)cmbAnalyzers.SelectedItem);
            }
            // Trying to create with default constructor
            analyzer = (Analyzer)Activator.CreateInstance(analyzerType, Util.Version.LUCENE_30);
        }
        catch (Exception) { }

        if (null == analyzer)
        {
            ShowError("Couldn't instantiate analyzer - public zero-argument constructor required");
            return;
        }

        txtOutput.Text = txtText.Text;
        lstResults.BeginUpdate();
        try
        {
            TokenStream ts = analyzer.TokenStream("Analyze", new StringReader(txtText.Text));
            var token = ts.GetAttribute<ITermAttribute>();
            var offset = ts.GetAttribute<IOffsetAttribute>();

            lstResults.Items.Clear();
            tokens.Clear();
            while (ts.IncrementToken())
            {
                lstResults.Items.Add(token.Term);
                var a = new OffsetAttribute();
                a.SetOffset(offset.StartOffset, offset.EndOffset);
                tokens.Add(a);
            }
        }
        finally
        {
            lstResults.EndUpdate();
        }
    }
    catch (Exception exc)
    {
        ShowError("Error analyzing: " + exc.Message);
    }
}
public virtual void testBasic()
{
    b = new SynonymMap.Builder(true);
    add("a", "foo", true);
    add("a b", "bar fee", true);
    add("b c", "dog collar", true);
    add("c d", "dog harness holder extras", true);
    add("m c e", "dog barks loudly", false);
    add("i j k", "feep", true);
    add("e f", "foo bar", false);
    add("e f", "baz bee", false);
    add("z", "boo", false);
    add("y", "bee", true);

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    verify("a b c", "a/bar b/fee c");

    // syn output extends beyond input tokens
    verify("x a b c d", "x a/bar b/fee c/dog d/harness holder extras");

    verify("a b a", "a/bar b/fee a/foo");

    // outputs that add to one another:
    verify("c d c d", "c/dog d/harness c/holder/dog d/extras/harness holder extras");

    // two outputs for same input
    verify("e f", "foo/baz bar/bee");

    // verify multi-word / single-output offsets:
    verify("g i j k g", "g i/feep:7_3 j k g");

    // mixed keepOrig true/false:
    verify("a m c e x", "a/foo dog barks loudly x");
    verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
    assertTrue(tokensOut.CaptureCount > 0);

    // no captureStates when no syns matched
    verify("p q r s t", "p q r s t");
    assertEquals(0, tokensOut.CaptureCount);

    // no captureStates when only single-input syns, w/ no
    // lookahead needed, matched
    verify("p q z y t", "p q boo y/bee t");
    assertEquals(0, tokensOut.CaptureCount);
}
/// <summary>
/// <para>Get the next token from the input stream.
/// </para>
/// <para>If the next token has <code>positionIncrement > 1</code>,
/// <code>positionIncrement - 1</code> <seealso cref="#fillerToken"/>s are
/// inserted first.
/// </para>
/// </summary>
/// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
/// <returns> On success, the populated token; null otherwise </returns>
/// <exception cref="IOException"> if the input stream has a problem </exception>
private InputWindowToken getNextToken(InputWindowToken target)
{
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0)
    {
        if (null == target)
        {
            newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
        }
        else
        {
            nextInputStreamToken.copyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    }
    else if (isNextInputStreamToken)
    {
        if (null == target)
        {
            newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
        }
        else
        {
            nextInputStreamToken.copyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    }
    else if (!exhausted)
    {
        if (input.incrementToken())
        {
            if (null == target)
            {
                newTarget = new InputWindowToken(this, cloneAttributes());
            }
            else
            {
                this.copyTo(target.attSource);
            }
            if (posIncrAtt.PositionIncrement > 1)
            {
                // Each output shingle must contain at least one input token,
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken)
                {
                    nextInputStreamToken = cloneAttributes();
                }
                else
                {
                    this.copyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            }
            else
            {
                newTarget.isFiller = false;
            }
        }
        else
        {
            exhausted = true;
            input.end();
            endState = captureState();
            numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1);
            if (numFillerTokensToInsert > 0)
            {
                nextInputStreamToken = new AttributeSource(AttributeFactory);
                nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
                OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
                newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                // Recurse/loop just once:
                return getNextToken(target);
            }
            else
            {
                newTarget = null;
            }
        }
    }
    else
    {
        newTarget = null;
    }
    return newTarget;
}
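To make the filler-token branches concrete, here is a sketch of what ShingleFilter emits when an upstream stop filter leaves a position hole, assuming the default "_" filler and bigram shingles (this reflects Lucene's documented shingle semantics, not code shown in this snippet):

// Input text:           "please divide me"  with "divide" removed upstream
// Token positions seen:  please(+1)  [hole]  me(+2)
// Output with unigrams:  "please", "please _", "_ me", "me"
// The "_" filler occupies the hole so shingles keep their token positions;
// fillers themselves are never emitted as standalone unigrams.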
public virtual void testRandom()
{
    int alphabetSize = TestUtil.Next(random(), 2, 7);

    int docLen = atLeast(3000);
    //final int docLen = 50;

    string document = getRandomString('a', alphabetSize, docLen);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: doc=" + document);
    }

    int numSyn = atLeast(5);
    //final int numSyn = 2;

    IDictionary<string, OneSyn> synMap = new Dictionary<string, OneSyn>();
    IList<OneSyn> syns = new List<OneSyn>();
    bool dedup = random().nextBoolean();
    if (VERBOSE)
    {
        Console.WriteLine("  dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++)
    {
        string synIn = getRandomString('a', alphabetSize, TestUtil.Next(random(), 1, 5)).Trim();
        OneSyn s;
        synMap.TryGetValue(synIn, out s); // Java map.get() semantics: null when absent
        if (s == null)
        {
            s = new OneSyn();
            s.@in = synIn;
            syns.Add(s);
            s.@out = new List<string>();
            synMap[synIn] = s;
            s.keepOrig = random().nextBoolean();
        }
        string synOut = getRandomString('0', 10, TestUtil.Next(random(), 1, 5)).Trim();
        s.@out.Add(synOut);
        add(synIn, synOut, s.keepOrig);
        if (VERBOSE)
        {
            Console.WriteLine("  syns[" + synIDX + "] = " + s.@in + " -> " + s.@out + " keepOrig=" + s.keepOrig);
        }
    }

    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (dedup)
    {
        pruneDups(syns);
    }

    string expected = slowSynMatcher(document, syns, 5);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: expected=" + expected);
    }

    verify(document, expected);
}
public virtual void testOutputHangsOffEnd()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    // b hangs off the end (no input token under it):
    add("a", "a b", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));

    // Make sure endOffset inherits from previous input token:
    verify("a", "a b:1");
}
public virtual void testBasic2()
{
    b = new SynonymMap.Builder(true);
    const bool keepOrig = false;
    add("aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
    add("bbb", "bbbb1 bbbb2", keepOrig);
    tokensIn = new MockTokenizer(new StringReader("a"), MockTokenizer.WHITESPACE, true);
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();

    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(typeof(CharTermAttribute));
    posIncrAtt = tokensOut.addAttribute(typeof(PositionIncrementAttribute));
    posLenAtt = tokensOut.addAttribute(typeof(PositionLengthAttribute));
    offsetAtt = tokensOut.addAttribute(typeof(OffsetAttribute));

    if (keepOrig)
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbb/bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaa/aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
    else
    {
        verify("xyzzy bbb pot of gold", "xyzzy bbbb1 pot/bbbb2 of gold");
        verify("xyzzy aaa pot of gold", "xyzzy aaaa1 pot/aaaa2 of/aaaa3 gold");
    }
}
public override void CopyTo(Attribute target)
{
    OffsetAttribute t = (OffsetAttribute)target;
    t.SetOffset(Start, End);
}
public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
{
    this.outerInstance = outerInstance;
    this.attSource = attSource;
    this.termAtt = attSource.getAttribute(typeof(CharTermAttribute));
    this.offsetAtt = attSource.getAttribute(typeof(OffsetAttribute));
}