/// <summary>
/// Build a filter that removes words that are too long or too short from the text.
/// </summary>
public LengthFilter(TokenStream in_Renamed, int min, int max)
    : base(in_Renamed)
{
    this.min = min;
    this.max = max;
    termAtt = AddAttribute<ITermAttribute>();
}
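For context, a minimal usage sketch of a filter like this one: the caller wraps a tokenizer and drives the chain with IncrementToken(), reading terms through ITermAttribute. The WhitespaceTokenizer, the sample text, and the 2/8 length bounds are illustrative assumptions, not taken from the snippet above.

// Hypothetical usage sketch: keep only terms between 2 and 8 characters long.
var reader = new System.IO.StringReader("a quick extraordinarily lazy dog");
TokenStream chain = new LengthFilter(new WhitespaceTokenizer(reader), 2, 8);
ITermAttribute term = chain.GetAttribute<ITermAttribute>();
while (chain.IncrementToken())
{
    System.Console.WriteLine(term.Term); // "quick", "lazy", "dog"
}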
private void InitBlock(AnonymousClassAnalyzer1 enclosingInstance) { this.enclosingInstance = enclosingInstance; termAtt = AddAttribute<ITermAttribute>(); payloadAtt = AddAttribute<IPayloadAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); }
/// <summary>
/// Builds a GermanStemFilter that uses an exclusion table.
/// </summary>
/// <param name="_in">The TokenStream to filter.</param>
/// <param name="exclusiontable">A set of terms that should not be stemmed.</param>
/// <param name="normalizeDin2">Specifies if the DIN-2007-2 style stemmer should be used in addition to DIN1. This
/// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
/// respectively, before the DIN1 stemmer is invoked.</param>
public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable, bool normalizeDin2)
    : base(_in)
{
    exclusionSet = exclusiontable;
    stemmer = normalizeDin2 ? new GermanDIN2Stemmer() : new GermanStemmer();
    termAtt = AddAttribute<ITermAttribute>();
}
/// <summary>
/// Stems the given word; stop words are supported.
/// </summary>
/// <param name="word">The word to stem.</param>
/// <param name="language">Language key used to look up the Snowball stemmer.</param>
/// <returns>The stemmed word, or the original word if no stemmer is registered for the language.</returns>
public static string SnowballWord(string word, string language)
{
    string result = null;
    string stemmer = SnowballDict.GetStemmer(language);
    if (stemmer == null)
    {
        result = word;
    }
    else
    {
        using (SnowballAnalyzer snowball = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, stemmer, StopWord.StopWordList))
        {
            // Only the token text is needed here, so no field name is required.
            using (TokenStream ts = snowball.ReusableTokenStream("", new StringReader(word)))
            {
                while (ts.IncrementToken())
                {
                    ITermAttribute attribute = ts.GetAttribute<ITermAttribute>();
                    result = attribute.Term;
                }
            }
        }
    }
    return result;
}
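A hypothetical call to the helper above; the "English" language key is an assumption, since valid keys depend on what SnowballDict.GetStemmer accepts.

// Hypothetical usage of the SnowballWord helper above.
string stem = SnowballWord("running", "English");
System.Console.WriteLine(stem); // expected to print the stemmed form, e.g. "run"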
private void Init(int bufferSize) { this.done = false; termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); termAtt.ResizeTermBuffer(bufferSize); }
/// <summary> Adds term frequencies found by tokenizing text from the reader into the Map words.</summary>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="fieldName">Used by analyzer for any special per-field analysis</param>
protected void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, System.String fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        string word = termAtt.Term;
        tokenCount++;
        if (tokenCount > maxNumTokensParsed)
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap[word];
        if (cnt == null)
        {
            termFreqMap[word] = new Int();
        }
        else
        {
            cnt.x++;
        }
    }
}
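The same counting pattern as a self-contained sketch, using a plain Dictionary<string, int> instead of the project-specific Int/HashMap types assumed above; the analyzer choice, noise-word filtering, and token cap are omitted, and the usual System.Collections.Generic and Lucene.Net.Analysis.Tokenattributes usings are assumed.

// Hedged sketch: count term frequencies from any TokenStream via ITermAttribute.
public static Dictionary<string, int> CountTerms(TokenStream ts)
{
    var freqs = new Dictionary<string, int>();
    ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
    while (ts.IncrementToken())
    {
        string word = termAtt.Term;
        int count;
        freqs[word] = freqs.TryGetValue(word, out count) ? count + 1 : 1;
    }
    return freqs;
}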
public AddSuffixFilter(TokenStream input, Dictionary<string, char[]> _suffixByTokenType) : base(input) { termAtt = AddAttribute<ITermAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); this.suffixByTokenType = _suffixByTokenType; }
public virtual void TestStopListPositions()
{
    var stopWordsSet = Support.Compatibility.SetFactory.CreateHashSet<string>();
    stopWordsSet.Add("good");
    stopWordsSet.Add("test");
    stopWordsSet.Add("analyzer");
    var newStop = new StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet);
    var reader = new System.IO.StringReader("This is a good test of the english stop analyzer with positions");
    int[] expectedIncr = { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
    TokenStream stream = newStop.TokenStream("test", reader);
    Assert.NotNull(stream);
    int i = 0;
    ITermAttribute termAtt = stream.GetAttribute<ITermAttribute>();
    IPositionIncrementAttribute posIncrAtt = stream.AddAttribute<IPositionIncrementAttribute>();
    while (stream.IncrementToken())
    {
        string text = termAtt.Term;
        Assert.IsFalse(stopWordsSet.Contains(text));
        Assert.AreEqual(expectedIncr[i++], posIncrAtt.PositionIncrement);
    }
}
public virtual void TestCloneAttributes()
{
    AttributeSource src = new AttributeSource();
    ITermAttribute termAtt = src.AddAttribute<ITermAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.Type = "TestType";

    AttributeSource clone = src.CloneAttributes();
    System.Collections.Generic.IEnumerator<Type> it = clone.GetAttributeTypesIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    Assert.AreEqual(typeof(ITermAttribute), it.Current, "TermAttribute must be the first attribute");
    Assert.IsTrue(it.MoveNext());
    Assert.AreEqual(typeof(ITypeAttribute), it.Current, "TypeAttribute must be the second attribute");
    Assert.IsFalse(it.MoveNext(), "No more attributes");

    ITermAttribute termAtt2 = clone.GetAttribute<ITermAttribute>();
    ITypeAttribute typeAtt2 = clone.GetAttribute<ITypeAttribute>();
    Assert.IsFalse(ReferenceEquals(termAtt2, termAtt), "TermAttribute of original and clone must be different instances");
    Assert.IsFalse(ReferenceEquals(typeAtt2, typeAtt), "TypeAttribute of original and clone must be different instances");
    Assert.AreEqual(termAtt2, termAtt, "TermAttribute of original and clone must be equal");
    Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
}
/// <summary>
/// Creates a shingle filter with ad hoc parameter settings.
/// </summary>
/// <param name="input">stream from which to construct the matrix</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain a permutation of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    _input = input;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    _inTermAtt = input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Initialization constructor.
/// </summary>
/// <param name="tokenizer">CodeXCavator ITokenizer instance.</param>
/// <param name="reader">Text reader providing access to the text from which tokens should be extracted.</param>
/// <param name="caseMode">Defines how the token stream should treat character case.</param>
internal ITokenizerTokenStream(ITokenizer tokenizer, System.IO.TextReader reader, Case caseMode = Case.Sensitive)
    : base(reader)
{
    mTermAttribute = AddAttribute<ITermAttribute>();
    mOffsetAttribute = AddAttribute<IOffsetAttribute>();
    mTokenizer = tokenizer;
    mCaseMode = caseMode;
}
/// <summary>
/// Constructor for the Lucene Tokenizer adapter class.
/// </summary>
/// <param name="isMaxWordLength">When true, the segmenter performs maximum-word-length segmentation; when false, it uses the finest-grained segmentation.</param>
public IKTokenizer(TextReader inreader, bool isMaxWordLength)
    : base(inreader)
{
    offsetAtt = AddAttribute<IOffsetAttribute>();
    termAtt = AddAttribute<ITermAttribute>();
    _IKImplement = new IKSegmentation(inreader, isMaxWordLength);
}
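A minimal, hypothetical consumption sketch for the tokenizer above; the sample text and the max-word-length flag are assumptions, and attributes are read the same way as for any other Tokenizer.

// Hypothetical usage of IKTokenizer.
var tokenizer = new IKTokenizer(new StringReader("Lucene.Net 全文检索"), true);
ITermAttribute term = tokenizer.GetAttribute<ITermAttribute>();
IOffsetAttribute offset = tokenizer.GetAttribute<IOffsetAttribute>();
while (tokenizer.IncrementToken())
{
    System.Console.WriteLine("{0} [{1}-{2}]", term.Term, offset.StartOffset, offset.EndOffset);
}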
public void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream _in = ra.TokenStream("all", inWords);
        RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode);
        ITermAttribute text = _in.GetAttribute<ITermAttribute>();
        ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();
        for (; ; )
        {
            if (_in.IncrementToken() == false)
            {
                break;
            }
            bool nextSampleToken = sample.IncrementToken();
            Assert.AreEqual(text.Term, nextSampleToken == false ? null : sampleText.Term, "Unicode");
        }
    }
}
public override void CopyTo(Attribute target) { InitTermBuffer(); ITermAttribute t = (ITermAttribute)target; t.SetTermBuffer(termBuffer, 0, termLength); }
/// <summary>
/// Shows the full token information produced by the analyzer.
/// </summary>
/// <param name="content"></param>
/// <param name="analyzer"></param>
/// <returns></returns>
public static List<TermInfo> TestTermAll(string content, Analyzer analyzer)
{
    List<TermInfo> list = new List<TermInfo>();
    using (TokenStream tokenStream = analyzer.ReusableTokenStream("", new StringReader(content)))
    {
        //tokenStream.AddAttribute<ITermAttribute>();
        while (tokenStream.IncrementToken())
        {
            ITermAttribute termAttribute = tokenStream.GetAttribute<ITermAttribute>();
            IPositionIncrementAttribute postionIncrementAttribute = tokenStream.GetAttribute<IPositionIncrementAttribute>();
            ITypeAttribute typeAttribute = tokenStream.GetAttribute<ITypeAttribute>();
            IOffsetAttribute offsetAttribute = tokenStream.GetAttribute<IOffsetAttribute>();

            TermInfo obj = new TermInfo();
            //obj.FlagsAttribute = tokenStream.GetAttribute<IFlagsAttribute>().Flags.ToString();
            //obj.PayloadAttribute = tokenStream.GetAttribute<IPayloadAttribute>().Payload.Length.ToString();
            obj.TermAttribute = termAttribute.Term;
            obj.OffsetAttribute = offsetAttribute.StartOffset.ToString() + "---" + offsetAttribute.EndOffset.ToString();
            obj.PositionIncrementAttribute = postionIncrementAttribute.PositionIncrement.ToString();
            obj.TypeAttribute = typeAttribute.Type;
            obj.TokenStream = tokenStream;
            list.Add(obj);
        }
    }
    return list;
}
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    ITermAttribute termAttribute = this.GetAttribute<ITermAttribute>();
    ISourceAttribute sourceAttribute = GetAttribute<ISourceAttribute>();
    ISpellAttribute spellAttribute = GetAttribute<ISpellAttribute>();
    IStemAttribute stemAttribute = GetAttribute<IStemAttribute>();

    sourceAttribute.Term = termAttribute.Term;
    spellAttribute.Term = termAttribute.Term;
    stemAttribute.Term = termAttribute.Term;

    if (!SpellChecker.Exist(spellAttribute.Term))
    {
        var res = SpellChecker.SuggestSimilar(spellAttribute.Term, 100);
        if (res.Length != 0)
        {
            spellAttribute.Term = res[0];
        }
    }

    termAttribute.SetTermBuffer(spellAttribute.Term);
    return true;
}
public SingleCharTokenizer(TokenStream input) : base(input) { _input = input; _termAttribute = AddAttribute<ITermAttribute>(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
public virtual void TestToStringAndMultiAttributeImplementations()
{
    AttributeSource src = new AttributeSource();
    ITermAttribute termAtt = src.AddAttribute<ITermAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.Type = "TestType";
    Assert.AreEqual("(" + typeAtt.ToString() + "," + termAtt.ToString() + ")", src.ToString(), "Attributes should appear in original order");

    System.Collections.Generic.IEnumerator<Attribute> it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext(), "Iterator should have 2 attributes left");
    Assert.AreSame(typeAtt, it.Current, "First AttributeImpl from iterator should be typeAtt");
    Assert.IsTrue(it.MoveNext(), "Iterator should have 1 attribute left");
    Assert.AreSame(termAtt, it.Current, "Second AttributeImpl from iterator should be termAtt");
    Assert.IsFalse(it.MoveNext(), "Iterator should have 0 attributes left");

    src = new AttributeSource();
    src.AddAttributeImpl(new Token());
    // this should not add a new attribute as Token implements TermAttribute, too
    termAtt = src.AddAttribute<ITermAttribute>();
    Assert.IsTrue(termAtt is Token, "TermAttribute should be implemented by Token");

    // get the Token attribute and check that it is the only one
    it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    Token tok = (Token)it.Current;
    Assert.IsFalse(it.MoveNext(), "There should be only one attribute implementation instance");

    termAtt.SetTermBuffer("TestTerm");
    Assert.AreEqual("(" + tok.ToString() + ")", src.ToString(), "Token should only be printed once");
}
public void Test()
{
    String test = "The quick red fox jumped over the lazy brown dogs";
    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
    bool seenDogs = false;
    ITermAttribute termAtt = nptf.GetAttribute<ITermAttribute>();
    ITypeAttribute typeAtt = nptf.GetAttribute<ITypeAttribute>();
    IPayloadAttribute payloadAtt = nptf.GetAttribute<IPayloadAttribute>();
    while (nptf.IncrementToken())
    {
        if (termAtt.Term.Equals("dogs"))
        {
            seenDogs = true;
            Assert.True(typeAtt.Type.Equals("D") == true, typeAtt.Type + " is not equal to " + "D");
            Assert.True(payloadAtt.Payload != null, "payloadAtt.GetPayload() is null and it shouldn't be");
            // safe here to just use the bytes, otherwise we should use offset, length
            byte[] bytes = payloadAtt.Payload.GetData();
            Assert.True(bytes.Length == payloadAtt.Payload.Length, bytes.Length + " does not equal: " + payloadAtt.Payload.Length);
            Assert.True(payloadAtt.Payload.Offset == 0, payloadAtt.Payload.Offset + " does not equal: " + 0);
            float pay = PayloadHelper.DecodeFloat(bytes);
            Assert.True(pay == 3, pay + " does not equal: " + 3);
        }
        else
        {
            Assert.True(typeAtt.Type.Equals("word"), typeAtt.Type + " is not equal to " + "word");
        }
    }
    Assert.True(seenDogs == true, seenDogs + " does not equal: " + true);
}
public override bool IncrementToken()
{
    bool wasGoodWord = false;
    while (!wasGoodWord)
    {
        if (!input.IncrementToken())
        {
            return false;
        }
        ITermAttribute termAttribute = GetAttribute<ITermAttribute>();
        var termArr = termAttribute.Term.ToCharArray();
        if (termArr.Length < 3)
        {
            continue;
        }
        wasGoodWord = true;
        for (int i = 0; i < termArr.Length; ++i)
        {
            if (!_goodChars.Contains(termArr[i]))
            {
                wasGoodWord = false;
            }
        }
    }
    return true;
}
//public virtual string GetView(TokenStream tokenStream, out int numberOfTokens)
//{
//    StringBuilder sb = new StringBuilder();
//    Token token = tokenStream.Next();
//    numberOfTokens = 0;
//    while (token != null)
//    {
//        numberOfTokens++;
//        sb.Append(GetTokenView(token));
//        token = tokenStream.Next();
//    }
//    return sb.ToString();
//}

public virtual string GetView(TokenStream tokenStream, out int numberOfAttributes)
{
    StringBuilder sb = new StringBuilder();

    // Instead of CharTermAttribute, ITermAttribute must be used here; GetAttribute<Attribute>() does not work.
    // http://stackoverflow.com/questions/16274779/get-termattribute-in-tokenstream-lucene-net
    ITermAttribute termAttr = tokenStream.GetAttribute<ITermAttribute>();

    numberOfAttributes = 0;
    tokenStream.Reset();
    while (tokenStream.IncrementToken())
    {
        numberOfAttributes++;
        sb.Append(GetAttributeView(termAttr));
        termAttr = tokenStream.GetAttribute<ITermAttribute>();
        //string term = termAttr.Term;
    }
    tokenStream.End();
    tokenStream.Dispose();
    // http://stackoverflow.com/questions/2638200/how-to-get-a-token-from-a-lucene-tokenstream
    return sb.ToString();
}
public QueryTermVector(string queryString, Analyzer analyzer)
{
    if (analyzer != null)
    {
        TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
        if (stream != null)
        {
            IList<string> terms = new List<string>();
            try
            {
                bool hasMoreTokens = false;
                stream.Reset();
                ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>();
                hasMoreTokens = stream.IncrementToken();
                while (hasMoreTokens)
                {
                    terms.Add(termAtt.Term);
                    hasMoreTokens = stream.IncrementToken();
                }
                ProcessTerms(terms.ToArray());
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
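A hypothetical construction of the class above, reusing the StopAnalyzer pattern shown in the stop-list test earlier in this listing; the stop-word set and query string are illustrative, and the analyzed terms are then available through the object's term-vector API.

// Hypothetical usage: extract analyzed terms from a query string.
var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
stopWords.Add("the");
var analyzer = new StopAnalyzer(Version.LUCENE_CURRENT, stopWords);
var queryTerms = new QueryTermVector("the quick brown fox", analyzer);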
/// <summary>
/// Tokenizes the keyword and returns the resulting terms.
/// </summary>
public static IEnumerable<string> SplitWords(string keyword)
{
    IList<string> list = new List<string>();
    Analyzer analyzer = new PanGuAnalyzer();
    TokenStream stream = analyzer.TokenStream(keyword, new StringReader(keyword));
    ITermAttribute ita = null;
    bool hasNext = stream.IncrementToken();
    while (hasNext)
    {
        ita = stream.GetAttribute<ITermAttribute>();
        list.Add(ita.Term);
        hasNext = stream.IncrementToken();
    }
    return list;

    // Legacy Token-based version:
    //IList<string> list = new List<string>();
    //Analyzer analyzer = new PanGuAnalyzer();
    //TokenStream tokenStream = analyzer.TokenStream("", new StringReader(keyword));
    //Token token = null;
    //while ((token = tokenStream.Next()) != null)
    //{
    //    // token.TermText() is the current term
    //    string word = token.TermText();
    //    list.Add(word);
    //}
    //return list;
}
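A hypothetical call to the helper above; the sample text is illustrative.

// Hypothetical usage of SplitWords with the PanGuAnalyzer-based helper above.
foreach (string term in SplitWords("全文检索引擎"))
{
    System.Console.WriteLine(term);
}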
public CamelCaseFilter(TokenStream stream) : base(stream) { _termAttribute = AddAttribute<ITermAttribute>(); _offsetAttribute = AddAttribute<IOffsetAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); }
protected CompoundWordTokenFilterBase(TokenStream input, ISet<string> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary is CharArraySet)
    {
        this.dictionary = (CharArraySet)dictionary;
    }
    else
    {
        this.dictionary = new CharArraySet(dictionary.Count, false);
        AddAllLowerCase(this.dictionary, dictionary);
    }

    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    payloadAtt = AddAttribute<IPayloadAttribute>();
}
public FrenchStemFilter(TokenStream _in) : base(_in) { stemmer = new FrenchStemmer(); termAtt = AddAttribute<ITermAttribute>(); }
public MyTokenStream(TestTermVectorsReader enclosingInstance) { InitBlock(enclosingInstance); termAtt = AddAttribute<ITermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); }
/// <summary>
/// Creates a shingle filter based on a user defined matrix.
///
/// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
/// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
/// </summary>
/// <param name="matrix">the input base for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
/// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
/// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
/// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
/// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contain a permutation of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
/// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
{
    Matrix = matrix;
    MinimumShingleSize = minimumShingleSize;
    MaximumShingleSize = maximumShingleSize;
    SpacerCharacter = spacerCharacter;
    IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    _settingsCodec = settingsCodec;

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    // set the input to be an empty token stream, we already have the data.
    _input = new EmptyTokenStream();
    _inTermAtt = _input.AddAttribute<ITermAttribute>();
    _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
    _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
    _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
    _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
    _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
}
public virtual void TestCloneAttributes()
{
    AttributeSource src = new AttributeSource();
    ITermAttribute termAtt = src.AddAttribute<ITermAttribute>();
    ITypeAttribute typeAtt = src.AddAttribute<ITypeAttribute>();
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.Type = "TestType";

    var types = new List<Type>();
    AttributeSource clone = src.CloneAttributes();
    IEnumerator<Type> it = clone.GetAttributeTypesIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    types.Add(it.Current);
    Assert.IsTrue(it.MoveNext());
    types.Add(it.Current);
    Assert.IsFalse(it.MoveNext(), "No more attributes");
    Assert.Contains(typeof(ITypeAttribute), types, "TypeAttribute must be present in attributes");
    Assert.Contains(typeof(ITermAttribute), types, "TermAttribute must be present in attributes");

    ITermAttribute termAtt2 = clone.GetAttribute<ITermAttribute>();
    ITypeAttribute typeAtt2 = clone.GetAttribute<ITypeAttribute>();
    Assert.IsFalse(ReferenceEquals(termAtt2, termAtt), "TermAttribute of original and clone must be different instances");
    Assert.IsFalse(ReferenceEquals(typeAtt2, typeAtt), "TypeAttribute of original and clone must be different instances");
    Assert.AreEqual(termAtt2, termAtt, "TermAttribute of original and clone must be equal");
    Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
}
/*
 * The call to base(input) is omitted here, because after that call the input's
 * position would already have been moved.
 * by zh
 */
public MMSegTokenizer(Seg seg, TextReader input)
    : base(input)
{
    mmSeg = new MMSeg(input, seg);
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
}
/// <summary>Creates a SynonymFilter over the given input using the supplied synonym engine.</summary>
/// <param name="input">the TokenStream to filter</param>
/// <param name="synonymEngine">engine used to look up synonyms for each term</param>
public SynonymFilter(TokenStream input, ISynonymEngine synonymEngine)
    : base(input)
{
    synonymStack = new Stack<String>();
    this.engine = synonymEngine;
    this.termAtt = AddAttribute<ITermAttribute>();
    this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
public CutLeterDigitFilter(TokenStream input) : base(input) { reusableToken = new Token(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) : base(input) { termAtt = AddAttribute<ITermAttribute>(); payAtt = AddAttribute<IPayloadAttribute>(); this.delimiter = delimiter; this.encoder = encoder; }
void Init() { InitPanGuSegment(); termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
public SynonymsFilter(TokenStream input, List<Dictionary<string, string[]>> synonymsDictList) : base(input) { this._SynonymsQueue = new Queue<string>(); this._TermAttribute = base.AddAttribute<ITermAttribute>(); this._PostionIncrementAttribute = base.AddAttribute<IPositionIncrementAttribute>(); this._synonymsDictList = synonymsDictList; }
public SynonymsFilter(string language, TokenStream input) : base(input) { this._Language = language; this._SynonymsQueue = new Queue<string>(); this._TermAttribute = base.AddAttribute<ITermAttribute>(); this._PostionIncrementAttribute = base.AddAttribute<IPositionIncrementAttribute>(); }
/// <summary>
/// Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using
/// affix rules in the provided HunspellDictionary.
/// </summary>
/// <param name="input">TokenStream whose tokens will be stemmed.</param>
/// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param>
/// <param name="dedup">true if only unique terms should be output.</param>
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
    : base(input)
{
    _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    _termAtt = AddAttribute<ITermAttribute>();

    _dedup = dedup;
    _stemmer = new HunspellStemmer(dictionary);
}
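A hedged sketch of how the filter above might be consumed; it assumes a HunspellDictionary has already been loaded elsewhere (its constructor arguments are not shown here), and the WhitespaceTokenizer plus List<string> (System.Collections.Generic) are assumptions.

// Hedged sketch: collect stems for tokens read from a TextReader.
public static IEnumerable<string> StemWithHunspell(System.IO.TextReader reader, HunspellDictionary dictionary)
{
    var stems = new List<string>();
    TokenStream chain = new HunspellStemFilter(new WhitespaceTokenizer(reader), dictionary, dedup: true);
    ITermAttribute term = chain.GetAttribute<ITermAttribute>();
    while (chain.IncrementToken())
    {
        stems.Add(term.Term);
    }
    return stems;
}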
private void Init(System.IO.TextReader _input, HebMorph.DataStructures.DictRadix<int> _prefixesTree)
{
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    //posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    typeAtt = AddAttribute<ITypeAttribute>();

    input = _input;
    hebMorphTokenizer = new HebMorph.Tokenizer(_input);
    prefixesTree = _prefixesTree;
}
public ExpandAcronymsFilter(TokenStream input, IAcronymExpansionProvider acronymExpansionProvider) : base(input) { _acronymExpansionProvider = acronymExpansionProvider; _termAttribute = AddAttribute<ITermAttribute>(); _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>(); _tokenSet = new Queue<string>(); _recognizedTokens = new HashSet<string>(); }
public JiebaTokenizer(JiebaSegmenter seg, string input) { segmenter = seg; termAtt = AddAttribute<ITermAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); var text = input; tokens = segmenter.Tokenize(text, TokenizerMode.Search).ToList(); }
public ExpanderFilter(TokenStream input, [NotNull] Func<String, IEnumerable<Expansion>> expander, Boolean emitSource = true)
    : base(input)
{
    if (expander == null)
        throw new ArgumentNullException("expander");

    _expander = expander;
    _emitSource = emitSource;

    _termAttr = AddAttribute<ITermAttribute>();
    _posAttr = AddAttribute<IPositionIncrementAttribute>();
    _typeAttr = AddAttribute<ITypeAttribute>();
}
/// <summary>Construct the named stemming filter.</summary>
/// <param name="input">the input tokens to stem</param>
/// <param name="name">the name of a stemmer</param>
public SnowballFilter(TokenStream input, System.String name)
    : base(input)
{
    try
    {
        System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
        stemmer = (SnowballProgram)System.Activator.CreateInstance(stemClass);
    }
    catch (System.Exception e)
    {
        throw new System.SystemException(e.ToString());
    }
    termAtt = AddAttribute<ITermAttribute>();
}
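A hypothetical usage of the named constructor above; the "English" name is an assumption and must resolve to a stemmer class under SF.Snowball.Ext (e.g. EnglishStemmer), and the WhitespaceTokenizer and sample text are illustrative.

// Hypothetical usage: stem whitespace-separated tokens with a named Snowball stemmer.
var reader = new StringReader("running runner ran");
TokenStream chain = new SnowballFilter(new WhitespaceTokenizer(reader), "English");
ITermAttribute term = chain.GetAttribute<ITermAttribute>();
while (chain.IncrementToken())
{
    System.Console.WriteLine(term.Term);
}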
private void Init(System.IO.TextReader input, HebMorph.StreamLemmatizer _lemmatizer, HebMorph.LemmaFilters.LemmaFilterBase _lemmaFilter, bool AlwaysSaveMarkedOriginal)
{
    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    //payAtt = (PayloadAttribute)AddAttribute(typeof(PayloadAttribute));

    this.input = input;
    this._streamLemmatizer = _lemmatizer;
    this._streamLemmatizer.SetStream(input);
    this.alwaysSaveMarkedOriginal = AlwaysSaveMarkedOriginal;
    this.lemmaFilter = _lemmaFilter;
}
/// <summary> Construct a token stream filtering the given input.
/// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
/// <c>makeStopSet()</c> was used to construct the set) it will be directly used
/// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
/// directly controls case sensitivity.
/// <p/>
/// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
/// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
/// used to specify the case sensitivity of that set.
/// </summary>
/// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
/// <param name="input">Input TokenStream</param>
/// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
/// <param name="ignoreCase">if true, all words are lower cased first</param>
public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
    : base(input)
{
    if (stopWords is CharArraySet)
    {
        this.stopWords = (CharArraySet)stopWords;
    }
    else
    {
        this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
        this.stopWords.AddAll(stopWords);
    }
    this.enablePositionIncrements = enablePositionIncrements;
    termAtt = AddAttribute<ITermAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
}
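A hypothetical usage sketch for the constructor above; MakeStopSet is the makeStopSet() helper referenced in the summary (the exact overload varies by version and is an assumption here), and the WhitespaceTokenizer and sample text are illustrative.

// Hypothetical usage: drop stop words while recording their positions.
var stopSet = StopFilter.MakeStopSet("the", "a", "of");
var reader = new StringReader("the quick brown fox");
TokenStream chain = new StopFilter(true, new WhitespaceTokenizer(reader), stopSet, true);
ITermAttribute term = chain.GetAttribute<ITermAttribute>();
while (chain.IncrementToken())
{
    System.Console.WriteLine(term.Term); // "quick", "brown", "fox"
}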
/*
 * Creates NGramTokenFilter with given min and max n-grams.
 * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
 * <param name="minGram">the smallest n-gram to generate</param>
 * <param name="maxGram">the largest n-gram to generate</param>
 */
public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
    : base(input)
{
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.termAtt = AddAttribute<ITermAttribute>();
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
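A hypothetical usage of the n-gram filter above; the WhitespaceTokenizer, the sample text, and the 2/3 gram sizes are illustrative assumptions.

// Hypothetical usage: emit 2- and 3-character grams for each token.
var reader = new StringReader("lucene");
TokenStream chain = new NGramTokenFilter(new WhitespaceTokenizer(reader), 2, 3);
ITermAttribute term = chain.GetAttribute<ITermAttribute>();
while (chain.IncrementToken())
{
    System.Console.WriteLine(term.Term); // "lu", "uc", "ce", ..., then "luc", "uce", ...
}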
public override bool Accept(AttributeSource source)
{
    if (termAtt == null)
    {
        termAtt = source.AddAttribute<ITermAttribute>();
    }
    try
    {
        // We don't care about the date itself, just that the term can be parsed as one.
        DateTime date = DateTime.Parse(termAtt.Term, dateFormat);
        if (date != null)
        {
            return true;
        }
    }
    catch (FormatException)
    {
    }
    return false;
}
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
    : base(suffix)
{
    Suffix = suffix;
    Prefix = prefix;
    _prefixExhausted = false;

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _termAtt = AddAttribute<ITermAttribute>();
    _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    _payloadAtt = AddAttribute<IPayloadAttribute>();
    _offsetAtt = AddAttribute<IOffsetAttribute>();
    _typeAtt = AddAttribute<ITypeAttribute>();
    _flagsAtt = AddAttribute<IFlagsAttribute>();
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    _pTermAtt = prefix.AddAttribute<ITermAttribute>();
    _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
    _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
    _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
    _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
    _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
}
public PanGuTokenizer(TextReader input)
    : base(input)
{
    termAttribute = AddAttribute<ITermAttribute>();
    offsetAttribute = AddAttribute<IOffsetAttribute>();

    inputText = base.input.ReadToEnd();
    if (string.IsNullOrEmpty(inputText))
    {
        char[] readBuf = new char[1024];
        int relCount = base.input.Read(readBuf, 0, readBuf.Length);
        StringBuilder inputStr = new StringBuilder(readBuf.Length);
        while (relCount > 0)
        {
            inputStr.Append(readBuf, 0, relCount);
            relCount = input.Read(readBuf, 0, readBuf.Length);
        }
        if (inputStr.Length > 0)
        {
            inputText = inputStr.ToString();
        }
    }

    if (string.IsNullOrEmpty(inputText))
    {
        words = new WordInfo[0];
    }
    else
    {
        global::PanGu.Segment segment = new Segment();
        ICollection<WordInfo> wordInfos = segment.DoSegment(inputText);
        words = new WordInfo[wordInfos.Count];
        wordInfos.CopyTo(words, 0);
    }
}
//private System.Reflection.MethodInfo stemMethod; public SnowballFilter(TokenStream input, SnowballProgram stemmer) : base(input) { this.stemmer = stemmer; termAtt = AddAttribute<ITermAttribute>(); }
internal SingleTokenAttributeSource() { termAttribute = AddAttribute<ITermAttribute>(); offsetAttribute = AddAttribute<IOffsetAttribute>(); }
public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords) { this.str = str; this.isLetter = isLetter; this.toLowerCase = toLowerCase; this.stopWords = stopWords; this.termAtt = AddAttribute<ITermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public RegexTokenizer(String str, Regex regex, bool toLowerCase) { this.str = str; this.matcher = regex.Match(str); this.toLowerCase = toLowerCase; this.termAtt = AddAttribute<ITermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
public RepeatingTokenStream(System.String val) { this.value_Renamed = val; this.termAtt = AddAttribute<ITermAttribute>(); }
internal WordTokenFilter(TokenStream input) : base(input) { termAtt = AddAttribute<ITermAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); }
public PorterStemFilter(TokenStream in_Renamed) : base(in_Renamed) { stemmer = new PorterStemmer(); termAtt = AddAttribute<ITermAttribute>(); }