/// <summary>
/// Creates a tokenizer over <paramref name="input"/> and registers the term,
/// offset and position-increment attributes it populates per token.
/// </summary>
/// <param name="input">The upstream token stream to read from.</param>
public SingleCharTokenizer(TokenStream input) : base(input)
{
    _input = input;

    // Register (or reuse) the attributes this tokenizer writes to.
    _termAttribute = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _offsetAttribute = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
    _positionIncrementAttribute = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
}
/// <summary>
/// Tokenizes the text from <paramref name="r"/> and accumulates per-term
/// frequencies into <paramref name="termFreqMap"/>.
/// </summary>
/// <param name="r">A source of text to be tokenized.</param>
/// <param name="termFreqMap">A map of terms to their frequencies.</param>
/// <param name="fieldName">Used by the analyzer for any special per-field analysis.</param>
private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int seen = 0;
    while (ts.IncrementToken())
    {
        string word = termAtt.Term();

        // Safety valve: stop reading overly long inputs.
        if (++seen > maxNumTokensParsed)
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }

        // First sighting stores a fresh counter; later sightings bump it.
        Int existing = (Int)termFreqMap[word];
        if (existing == null)
        {
            termFreqMap[word] = new Int();
        }
        else
        {
            existing.x++;
        }
    }
}
/// <summary>
/// Verifies that AttributeSource.ToString() renders attributes in registration
/// order, and that one AttributeImpl (Token) can back multiple attribute
/// interfaces without being registered twice.
/// </summary>
public virtual void TestToStringAndMultiAttributeImplementations()
{
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");
    // ToString must concatenate the attributes in the order they were added.
    Assert.AreEqual("(" + termAtt.ToString() + "," + typeAtt.ToString() + ")", src.ToString(), "Attributes should appear in original order");
    System.Collections.Generic.IEnumerator <AttributeImpl> it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext(), "Iterator should have 2 attributes left");
    Assert.AreSame(termAtt, it.Current, "First AttributeImpl from iterator should be termAtt");
    Assert.IsTrue(it.MoveNext(), "Iterator should have 1 attributes left");
    Assert.AreSame(typeAtt, it.Current, "Second AttributeImpl from iterator should be typeAtt");
    Assert.IsFalse(it.MoveNext(), "Iterator should have 0 attributes left");

    src = new AttributeSource();
    src.AddAttributeImpl(new Token());
    // this should not add a new attribute as Token implements TermAttribute, too
    termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    Assert.IsTrue(termAtt is Token, "TermAttribute should be implemented by Token");
    // get the Token attribute and check, that it is the only one
    it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    Token tok = (Token)it.Current;
    Assert.IsFalse(it.MoveNext(), "There should be only one attribute implementation instance");
    termAtt.SetTermBuffer("TestTerm");
    Assert.AreEqual("(" + tok.ToString() + ")", src.ToString(), "Token should only printed once");
}
/// <summary>
/// Tokenizes the text from <paramref name="r"/> with the per-field analyzer
/// and accumulates per-term frequencies into <paramref name="termFreqMap"/>.
/// </summary>
/// <param name="r">A source of text to be tokenized.</param>
/// <param name="termFreqMap">A map of terms to their frequencies.</param>
/// <param name="fieldName">Selects the analyzer and drives per-field analysis.</param>
protected new void AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    var analyzer = Analyzers[fieldName];
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int seen = 0;
    while (ts.IncrementToken())
    {
        System.String word = termAtt.Term();

        // Safety valve: stop reading overly long inputs.
        if (++seen > GetMaxNumTokensParsed())
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }

        // First sighting stores a fresh counter; later sightings bump it.
        var existing = (Int)termFreqMap[word];
        if (existing == null)
        {
            termFreqMap[word] = new Int();
        }
        else
        {
            existing.x++;
        }
    }
}
/// <summary>
/// Copies this attribute's term characters into <paramref name="target"/>.
/// </summary>
/// <param name="target">Destination attribute; must be a <see cref="TermAttribute"/>.</param>
public override void CopyTo(AttributeImpl target)
{
    // Ensure the backing buffer exists before reading it.
    InitTermBuffer();
    ((TermAttribute)target).SetTermBuffer(termBuffer, 0, termLength);
}
/// <summary>
/// Verifies that CloneAttributes() produces deep copies: same attribute
/// classes in the same order, equal values, but distinct instances.
/// </summary>
public virtual void TestCloneAttributes()
{
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");

    AttributeSource clone = src.CloneAttributes();
    // The clone must expose the same attribute classes in registration order.
    System.Collections.IEnumerator it = clone.GetAttributeClassesIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    Assert.AreEqual(typeof(TermAttribute), it.Current, "TermAttribute must be the first attribute");
    Assert.IsTrue(it.MoveNext());
    Assert.AreEqual(typeof(TypeAttribute), it.Current, "TypeAttribute must be the second attribute");
    Assert.IsFalse(it.MoveNext(), "No more attributes");

    // Instances must be copies (different references) with equal state.
    TermAttribute termAtt2 = (TermAttribute)clone.GetAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt2 = (TypeAttribute)clone.GetAttribute(typeof(TypeAttribute));
    Assert.IsFalse(ReferenceEquals(termAtt2, termAtt), "TermAttribute of original and clone must be different instances");
    Assert.IsFalse(ReferenceEquals(typeAtt2, typeAtt), "TypeAttribute of original and clone must be different instances");
    Assert.AreEqual(termAtt2, termAtt, "TermAttribute of original and clone must be equal");
    Assert.AreEqual(typeAtt2, typeAtt, "TypeAttribute of original and clone must be equal");
}
/// <summary>
/// Verifies CaptureState()/RestoreState() round-trips: state restores into the
/// same source, into an identically configured copy, into a source with a
/// different attribute order (extra attributes untouched), and throws when the
/// target is missing one of the captured attributes.
/// </summary>
public virtual void TestCaptureState()
{
    // init a first instance
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");
    int hashCode = src.GetHashCode();
    AttributeSource.State state = src.CaptureState();

    // modify the attributes
    termAtt.SetTermBuffer("AnotherTestTerm");
    typeAtt.SetType("AnotherTestType");
    Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

    src.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

    // restore into an exact configured copy
    AttributeSource copy = new AttributeSource();
    copy.AddAttribute(typeof(TermAttribute));
    copy.AddAttribute(typeof(TypeAttribute));
    copy.RestoreState(state);
    Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
    Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = (TypeAttribute)src2.AddAttribute(typeof(TypeAttribute));
    Lucene.Net.Analysis.Tokenattributes.FlagsAttribute flagsAtt = (Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)src2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute));
    termAtt = (TermAttribute)src2.AddAttribute(typeof(TermAttribute));
    flagsAtt.SetFlags(12345);

    src2.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    // The captured state holds no FlagsAttribute, so this one keeps its value.
    Assert.AreEqual(12345, flagsAtt.GetFlags(), "FlagsAttribute should not be touched");

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = (TermAttribute)src3.AddAttribute(typeof(TermAttribute));
    try
    {
        src3.RestoreState(state);
        Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    }
    catch (System.ArgumentException iae)
    {
        // pass: the missing TypeAttribute is correctly reported
    }
}
/// <summary>
/// Creates a new HunspellStemFilter that stems tokens from the given
/// TokenStream using affix rules from the provided HunspellDictionary.
/// </summary>
/// <param name="input">TokenStream whose tokens will be stemmed.</param>
/// <param name="dictionary">HunspellDictionary containing the affix rules and words used to stem the tokens.</param>
/// <param name="dedup">true if only unique terms should be output.</param>
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true) : base(input)
{
    _dedup = dedup;
    _stemmer = new HunspellStemmer(dictionary);

    // Register the attributes this filter reads and writes.
    _posIncAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
    _termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
}
/// <summary>
/// Get the term qualified name declared via <c>TermAttribute</c> on <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">The type of the term.</typeparam>
/// <returns>The qualified name, or <c>null</c> when <typeparamref name="T"/> carries no <c>TermAttribute</c>.</returns>
public static string GetTermQualifiedName <T>()
{
    object[] attributes = typeof(T).GetCustomAttributes(typeof(TermAttribute), false);

    // BUGFIX: the original guard used "&&" ("attributes == null && attributes.Length == 0"),
    // which can never be true — a null array throws NullReferenceException on .Length, and a
    // non-null empty array skips the guard and crashes at attributes[0] with
    // IndexOutOfRangeException. Either condition alone must short-circuit to null.
    if (attributes == null || attributes.Length == 0)
    {
        return(null);
    }

    TermAttribute term = (TermAttribute)attributes[0];
    return(term.QualifiedName);
}
/// <summary>
/// Validates and stores the n-gram size, then registers the term and offset
/// attributes the tokenizer will populate.
/// </summary>
/// <param name="gramSize">The n-gram length; must be at least 1.</param>
private void Init(int gramSize)
{
    // Guard clause: reject non-positive sizes up front.
    if (gramSize < 1)
    {
        throw new ArgumentException(
                  "minGram must be greater than zero");
    }

    _mGramSize = gramSize;
    _mTermAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    _mOffsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Runs <paramref name="text"/> through the stemming analyzer and returns the
/// resulting terms joined by single spaces.
/// </summary>
/// <param name="text">The raw text to stem.</param>
/// <returns>The space-separated stemmed terms (empty string for no tokens).</returns>
public string StemText(string text)
{
    TokenStream stream = Stemmer.TokenStream(String.Empty, new StringReader(text));

    // Fetch the term attribute once: the stream reuses the same attribute
    // instance for every token, so the original per-iteration GetAttribute
    // lookup was wasted work.
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    // StringBuilder avoids the O(n^2) cost of repeated string concatenation.
    var sb = new System.Text.StringBuilder();
    while (stream.IncrementToken())
    {
        sb.Append(termAttr.Term()).Append(' ');
    }
    return(sb.ToString().Trim());
}
/// <summary>
/// Creates the stream for a single token text, wiring up term, offset and
/// payload attribute implementations explicitly.
/// </summary>
/// <param name="tokenText">The text of the single token this stream emits.</param>
public IntMetaDataTokenStream(string tokenText)
{
    _tokenText = tokenText;

    // NOTE: Calling the generic AddAttribute<T> method failed here, so the
    // concrete implementations are registered via AddAttributeImpl instead.
    _termAttribute = new TermAttribute();
    _offsetAttribute = new OffsetAttribute();
    _payloadAtt = new PayloadAttribute();

    base.AddAttributeImpl(_termAttribute);
    base.AddAttributeImpl(_offsetAttribute);
    base.AddAttributeImpl(_payloadAtt);
}
/// <summary>
/// Creates a filter that injects synonyms from <paramref name="engine"/>
/// behind each token of <paramref name="input"/>.
/// </summary>
/// <param name="input">The token stream to augment with synonyms.</param>
/// <param name="engine">The synonym lookup engine; must not be null.</param>
/// <exception cref="ArgumentNullException">When <paramref name="engine"/> is null.</exception>
public SynonymFilter(TokenStream input, SynonymEngine engine) : base(input)
{
    if (engine == null)
    {
        // BUGFIX: the original passed the non-existent parameter name
        // "synonymEngine"; the actual parameter is named "engine".
        throw new ArgumentNullException("engine");
    }

    synonymStack = new Stack <string>();
    this.engine = engine;

    this.termAtt = (TermAttribute)AddAttribute <ITermAttribute>();
    this.posIncrAtt = (PositionIncrementAttribute)AddAttribute <IPositionIncrementAttribute>();
}
/// <summary>
/// Analyzes <paramref name="text"/> under the "contents" field and returns
/// every produced term in order.
/// </summary>
/// <param name="analyzer">The analyzer that builds the token stream.</param>
/// <param name="text">The text to tokenize.</param>
/// <returns>The analyzed terms, in stream order.</returns>
public static IEnumerable <string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute termAtt = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    var terms = new List <string>();
    while (stream.IncrementToken())
    {
        terms.Add(termAtt.Term());
    }

    stream.End();
    stream.Close();
    return(terms);
}
/// <summary>Creates NGramTokenFilter with given min and max n-grams.</summary>
/// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input)
{
    // Guard clauses: the sizes must form a valid, positive range.
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;

    this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Analyzes <paramref name="text"/> for <paramref name="field"/> with the
/// master analyzer and returns the resulting terms.
/// </summary>
/// <param name="field">The field name used to pick per-field analysis.</param>
/// <param name="text">The raw text to analyze.</param>
/// <returns>The analyzed terms, in stream order.</returns>
private string[] GetAnalyzedText(string field, string text)
{
    var reader = new StringReader(text);
    var tokenStream = _masterAnalyzer.TokenStream(field, reader);
    _termAtt = (TermAttribute)tokenStream.AddAttribute(typeof(TermAttribute));

    // CLEANUP: the original also collected _termAtt.ToString() into a second
    // list that was never read; only the raw terms are returned.
    var words = new List <string>();
    while (tokenStream.IncrementToken())
    {
        words.Add(_termAtt.Term());
    }
    return(words.ToArray());
}
/// <summary>Creates NGramTokenFilter with given min and max n-grams.</summary>
/// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) : base(input)
{
    // Guard clauses: the sizes must form a valid, positive range.
    if (minGram < 1)
    {
        throw new System.ArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new System.ArgumentException("minGram must not be greater than maxGram");
    }

    _minGram = minGram;
    _maxGram = maxGram;

    _termAtt = (TermAttribute)AddAttribute <ITermAttribute>();
    _offsetAtt = (OffsetAttribute)AddAttribute <IOffsetAttribute>();
}
/// <summary>
/// Exercises UnaccentedWordAnalyzer end to end: checks that mixed-script text
/// (Turkish, Greek, Cyrillic, German sharp-s) tokenizes to the expected
/// lowercase, unaccented terms, then verifies searchability of the indexed
/// text via QueryParser with the same analyzer.
/// </summary>
public void TestUnaccentedWordAnalyzer()
{
    TopDocs td = null;
    string text = "[email protected] 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß";
    string[] expectedTokens = new string[] { "name", "surname", "gmail", "com", "123", "456", "gusioc", "gusioc", "αβγδεζ", "абвгде", "ssss" };
    UnaccentedWordAnalyzer analyzer = new UnaccentedWordAnalyzer();
    TokenStream ts = analyzer.TokenStream("", new System.IO.StringReader(text));

    // Each produced token must match the expected sequence in order.
    int i = 0;
    TermAttribute termAttribute = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));
    while (ts.IncrementToken())
    {
        Assert.AreEqual(expectedTokens[i++], termAttribute.Term());
        System.Diagnostics.Debug.WriteLine(termAttribute.Term());
    }

    // Query with the same analyzer: accented/unaccented forms must match.
    QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "field", analyzer);
    IndexSearcher src = CreateIndex(text, analyzer);
    td = src.Search(p.Parse("ĞÜŞıöç"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("name"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("NAME.surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname@gmail"), 10);
    Assert.AreEqual(1, td.totalHits);
    // "name@gmail" skips the middle token, so it must not match as a phrase.
    td = src.Search(p.Parse("name@gmail"), 10);
    Assert.AreEqual(0, td.totalHits);
    td = src.Search(p.Parse("456"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("123.456"), 10);
    Assert.AreEqual(1, td.totalHits);
}
/// <summary>
/// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the
/// given range.
/// </summary>
/// <param name="input">TokenStream holding the input to be tokenized</param>
/// <param name="side">the Side from which to chop off an n-gram</param>
/// <param name="minGram">the smallest n-gram to generate</param>
/// <param name="maxGram">the largest n-gram to generate</param>
public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) : base(input)
{
    // Guard clauses: the sizes must form a valid, positive range.
    if (minGram < 1)
    {
        throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram)
    {
        throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }

    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;

    termAtt = (TermAttribute)addAttribute(typeof(TermAttribute));
    offsetAtt = (OffsetAttribute)addAttribute(typeof(OffsetAttribute));
}
/// <summary>
/// Analyzes <paramref name="text"/> under the "contents" field, logging each
/// token's buffer, length and term, and returns every produced term in order.
/// </summary>
/// <param name="analyzer">The analyzer that builds the token stream.</param>
/// <param name="text">The text to tokenize.</param>
/// <returns>The analyzed terms, in stream order.</returns>
public static IEnumerable <string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute termAtt = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    var terms = new List <string>();
    while (stream.IncrementToken())
    {
        // Diagnostic trace of the attribute's raw state alongside the term.
        Console.WriteLine("Buffer:={0}, Length:={1}, Term:={2}".FormatWith(termAtt.TermBuffer(), termAtt.TermLength(), termAtt.Term()));
        terms.Add(termAtt.Term());
    }

    stream.End();
    stream.Close();
    return(terms);
}
/// <summary>
/// Tokenizes <paramref name="content"/> with the Lucene standard chain
/// (StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter,
/// using the stop words from <paramref name="config"/>) and returns the
/// surviving tokens.
/// </summary>
static List <string> TokenizeStandard(string content, TokenizeConfig config)
{
    var stopSet = StopFilter.MakeStopSet(config.StopWords);

    StringReader reader = new StringReader(content);
    TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    stream = new StandardFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(true, stream, stopSet, true);
    stream.Reset();

    var tokens = new List <string>();
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));
    while (stream.IncrementToken())
    {
        tokens.Add(termAttr.Term());
    }
    return(tokens);
}
/// <summary> Simple similarity query generator.
/// Takes every unique word in <c>body</c> and forms a boolean query where all
/// words are optional. Use the result to query your {@link IndexSearcher} for
/// similar docs. The only caveat is that the first hit returned <b>should be</b>
/// your source document — you'll need to ignore that.
/// <p>
/// For example, <code>FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null)</code>
/// returns, in string form, <code>'(i use lucene to search fast searchers are good)'</code>.
/// <p>
/// The philosophy behind this method is "two documents are similar if they share
/// lots of words". Behind the scenes, Lucene's scoring gives two documents a
/// higher similarity score if they share more uncommon words.
/// <p>
/// This method is fail-safe: if a long body causes
/// {@link BooleanQuery#add BooleanQuery.add()} to throw
/// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses},
/// the query built so far is returned.
/// </summary>
/// <param name="body">the body of the document you want to find similar documents to</param>
/// <param name="a">the analyzer to use to parse the body</param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
/// <param name="stop">optional set of stop words to ignore</param>
/// <returns> a query with all unique words in 'body'</returns>
/// <throws> IOException this can't happen...</throws>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
{
    TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    BooleanQuery tmp = new BooleanQuery();
    System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
    while (ts.IncrementToken())
    {
        String word = termAtt.Term();
        // ignore opt stop words
        if (stop != null && stop.Contains(word))
        {
            continue;
        }
        // ignore dups (idiom fix: dropped the redundant "== true" comparison)
        if (already.Contains(word))
        {
            continue;
        }
        already.Add(word, word);
        // add to query
        TermQuery tq = new TermQuery(new Term(field, word));
        try
        {
            tmp.Add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses)
        {
            // fail-safe, just return what we have, not the end of the world
            break;
        }
    }
    return(tmp);
}
/// <summary>
/// Drains <paramref name="stream"/> through the new attribute API, asserting
/// that only the token "tokenstream" carries the proper-noun payload
/// annotation and that every term matches the expected <c>results</c> sequence.
/// </summary>
private static void ConsumeStreamNewAPI(TokenStream stream)
{
    stream.Reset();
    PayloadAttribute payloadAtt = (PayloadAttribute)stream.AddAttribute(typeof(PayloadAttribute));
    TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

    int index = 0;
    while (stream.IncrementToken())
    {
        System.String term = termAtt.Term();
        Payload p = payloadAtt.GetPayload();

        // A single-byte payload equal to the proper-noun marker identifies
        // the annotated token.
        bool isProperNoun = p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
        if (isProperNoun)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        Assert.AreEqual(results[index], term);
        index++;
    }
}
/// <summary>Test setup: creates a fresh <c>TermAttribute</c> instance under test.</summary>
public void Init() => instance = new TermAttribute();
/// <summary>Registers (or reuses) the term attribute this stream populates.</summary>
private void Init() => termAtt = AddAttribute <TermAttribute>();
/// <summary>Appends <paramref name="str"/> to the attribute's current term text.</summary>
/// <param name="termAtt">The attribute whose term buffer is extended.</param>
/// <param name="str">The text to append.</param>
public static void Append(this TermAttribute termAtt, string str)
{
    // TODO: Not optimal, but works
    string combined = termAtt.Term() + str;
    termAtt.SetTermBuffer(combined);
}
/// <summary>
/// Creates a filter that converts each term of <paramref name="input"/> using
/// the collation rules of <paramref name="cultureInfo"/>.
/// </summary>
/// <param name="input">The token stream to transform.</param>
/// <param name="cultureInfo">The culture whose collation rules are applied.</param>
public CollationKeyFilter(TokenStream input, CultureInfo cultureInfo) : base(input)
{
    _cultureInfo = cultureInfo;
    _termAtt = (TermAttribute)AddAttribute <ITermAttribute>();
}
/// <summary>
/// Creates the filter over <paramref name="input"/> and registers the term and
/// position-increment attributes it reads and writes.
/// </summary>
/// <param name="input">The token stream to augment with synonyms.</param>
public SynonymFilter(TokenStream input) : base(input)
{
    _termAtt = (TermAttribute)AddAttribute <ITermAttribute>();
    _posIncrAtt = (PositionIncrementAttribute)AddAttribute <IPositionIncrementAttribute>();
}
/// <summary>
/// Tokenizes the field's query string and, for each unique term, enumerates
/// fuzzy variants from the index, scores them by edit distance, rescales the
/// best ones by a shared IDF factor, and pushes them onto the global queue
/// <c>q</c> for overall ranking.
/// </summary>
/// <param name="reader">Index reader used for term enumeration and doc freqs.</param>
/// <param name="f">Per-field values: field name, query string, fuzzy settings.</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();
    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        // Process each distinct analyzed term only once.
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); //store the df so all variants use same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            // Collect fuzzy variants of the term, keeping only the
            // MAX_VARIANTS_PER_TERM best-scoring ones in the bounded queue.
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            }while (fe.Next());
            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) //no direct match we can use as df for all variants
                {
                    df = avgDf; //use avg df of all variants
                }
                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
/// <summary>Appends the single character <paramref name="ch"/> to the attribute's current term text.</summary>
/// <param name="termAtt">The attribute whose term buffer is extended.</param>
/// <param name="ch">The character to append.</param>
public static void Append(this TermAttribute termAtt, char ch)
{
    // TODO: Not optimal, but works
    string combined = termAtt.Term() + new string(ch, 1);
    termAtt.SetTermBuffer(combined);
}