/// <summary>
/// Verifies CaptureState/RestoreState round-trips attribute values, that a captured
/// state can be restored into a differently-ordered AttributeSource (leaving extra
/// attributes untouched), and that restoring into a source missing a captured
/// attribute throws ArgumentException.
/// </summary>
public virtual void TestCaptureState()
{
    // init a first instance
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");
    int hashCode = src.GetHashCode();
    AttributeSource.State state = src.CaptureState();

    // modify the attributes
    termAtt.SetTermBuffer("AnotherTestTerm");
    typeAtt.SetType("AnotherTestType");
    Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

    src.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

    // restore into an exact configured copy
    AttributeSource copy = new AttributeSource();
    copy.AddAttribute(typeof(TermAttribute));
    copy.AddAttribute(typeof(TypeAttribute));
    copy.RestoreState(state);
    Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
    Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = (TypeAttribute)src2.AddAttribute(typeof(TypeAttribute));
    Lucene.Net.Analysis.Tokenattributes.FlagsAttribute flagsAtt = (Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)src2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute));
    termAtt = (TermAttribute)src2.AddAttribute(typeof(TermAttribute));
    flagsAtt.SetFlags(12345);

    src2.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    Assert.AreEqual(12345, flagsAtt.GetFlags(), "FlagsAttribute should not be touched");

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = (TermAttribute)src3.AddAttribute(typeof(TermAttribute));
    try
    {
        src3.RestoreState(state);
        Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    }
    catch (System.ArgumentException)
    {
        // Fix: the caught exception was previously bound to an unused local `iae`
        // (compiler warning CS0168); the variable is dropped since only the type matters.
    }
}
/// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="fieldName">Used by analyzer for any special per-field analysis</param>
private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    try
    {
        int tokenCount = 0;
        // for every token
        TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
        while (ts.IncrementToken())
        {
            string word = termAtt.Term();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed)
            {
                break; // cap how much text is analyzed per call
            }
            if (IsNoiseWord(word))
            {
                continue;
            }

            // increment frequency
            Int cnt = (Int)termFreqMap[word];
            if (cnt == null)
            {
                // first sighting of this word; Int's initial count is presumably 1 -- TODO confirm
                termFreqMap[word] = new Int();
            }
            else
            {
                cnt.x++;
            }
        }
    }
    finally
    {
        // Fix: the stream was never closed, leaking analyzer resources on every call.
        ts.Close();
    }
}
/// <summary>
/// Tokenizes <paramref name="r"/> with the analyzer registered for
/// <paramref name="fieldName"/> and accumulates per-term frequencies into
/// <paramref name="termFreqMap"/>, skipping noise words and stopping once the
/// parse budget is exhausted.
/// </summary>
/// <param name="r">source of text to be tokenized</param>
/// <param name="termFreqMap">map of term to frequency counter (Int)</param>
/// <param name="fieldName">field name; selects the analyzer and drives per-field analysis</param>
protected new void AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    var analyzer = Analyzers[fieldName];
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int seen = 0;
    while (ts.IncrementToken())
    {
        // for every token
        System.String word = termAtt.Term();
        seen++;
        if (seen > GetMaxNumTokensParsed())
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }

        // increment frequency
        var counter = (Int)termFreqMap[word];
        if (counter != null)
        {
            counter.x++;
        }
        else
        {
            termFreqMap[word] = new Int();
        }
    }
}
/// <summary>
/// Exercises UnaccentedWordAnalyzer on a mixed-script input: first checks the emitted
/// token stream against the expected lowercased/unaccented tokens, then builds a
/// single-document index from the same text and verifies hit counts for a set of
/// parsed queries.
/// </summary>
public void TestUnaccentedWordAnalyzer()
{
    string text = "[email protected] 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß";
    string[] expectedTokens = new string[] { "name", "surname", "gmail", "com", "123", "456", "gusioc", "gusioc", "αβγδεζ", "абвгде", "ssss" };

    UnaccentedWordAnalyzer analyzer = new UnaccentedWordAnalyzer();
    TokenStream ts = analyzer.TokenStream("", new System.IO.StringReader(text));
    TermAttribute termAttribute = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    // Verify every emitted token, in order.
    int index = 0;
    while (ts.IncrementToken())
    {
        Assert.AreEqual(expectedTokens[index++], termAttribute.Term());
        System.Diagnostics.Debug.WriteLine(termAttribute.Term());
    }

    QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "field", analyzer);
    IndexSearcher src = CreateIndex(text, analyzer);

    TopDocs td = src.Search(p.Parse("ĞÜŞıöç"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("name"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("NAME.surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname@gmail"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("name@gmail"), 10);
    Assert.AreEqual(0, td.totalHits);
    td = src.Search(p.Parse("456"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("123.456"), 10);
    Assert.AreEqual(1, td.totalHits);
}
/// <summary>
/// Emits one token per call. A single input token may stem to several terms: the first
/// stem replaces the input token in place, and the remaining stems are buffered and
/// emitted on subsequent calls at the same position (position increment 0).
/// </summary>
/// <returns>true while tokens remain (buffered stems or fresh input); false at end of input</returns>
public override Boolean IncrementToken()
{
    if (_buffer.Any())
    {
        // Drain previously-computed stems: restore the attribute state captured when
        // the original token was produced, then overwrite just the term text.
        var nextStem = _buffer.Dequeue();
        RestoreState(_savedState);
        _posIncAtt.SetPositionIncrement(0); // same position as the original token
        _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
        return (true);
    }
    if (!input.IncrementToken())
    {
        return (false);
    }
    // _dedup collapses duplicate stems via UniqueStems; otherwise all stems are kept.
    var newTerms = _dedup ? _stemmer.UniqueStems(_termAtt.Term()) : _stemmer.Stem(_termAtt.Term());
    foreach (var newTerm in newTerms)
    {
        _buffer.Enqueue(newTerm);
    }
    if (_buffer.Count == 0)
    {
        // we do not know this word, return it unchanged
        return (true);
    }
    // First stem replaces the current token's term text directly.
    var stem = _buffer.Dequeue();
    _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);
    if (_buffer.Count > 0)
    {
        // More stems remain: capture full attribute state so later calls can emit
        // the buffered stems with identical offsets/type.
        _savedState = CaptureState();
    }
    return (true);
}
/// <summary>
/// Runs <paramref name="text"/> through the Stemmer analyzer and returns the stemmed
/// tokens joined by single spaces.
/// </summary>
/// <param name="text">raw input text</param>
/// <returns>space-separated stemmed tokens; empty string when no tokens are produced</returns>
public string StemText(string text)
{
    TokenStream stream = Stemmer.TokenStream(String.Empty, new StringReader(text));
    // Fix: the attribute instance was fetched inside the loop on every iteration;
    // it is stable across IncrementToken() calls, so fetch it once.
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    // Fix: repeated string concatenation in the loop was O(n^2); build with StringBuilder.
    var result = new System.Text.StringBuilder();
    while (stream.IncrementToken())
    {
        if (result.Length > 0)
        {
            result.Append(' ');
        }
        result.Append(termAttr.Term());
    }
    return result.ToString();
}
/// <summary>
/// Feeds <paramref name="text"/> through <paramref name="analyzer"/> under the
/// "contents" field and collects every emitted term.
/// </summary>
/// <param name="analyzer">analyzer used to tokenize the text</param>
/// <param name="text">raw text to analyze</param>
/// <returns>the analyzed terms, in token order</returns>
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    List<string> terms = new List<string>();
    while (stream.IncrementToken())
    {
        terms.Add(termAttr.Term());
    }

    // Signal end-of-stream and release analyzer resources.
    stream.End();
    stream.Close();
    return terms;
}
/// <summary>
/// Analyzes <paramref name="text"/> for <paramref name="field"/> with the master
/// analyzer and returns the resulting terms in token order.
/// </summary>
/// <param name="field">field name passed to the analyzer</param>
/// <param name="text">raw text to analyze</param>
/// <returns>array of analyzed terms</returns>
private string[] GetAnalyzedText(string field, string text)
{
    var reader = new StringReader(text);
    var tokenStream = _masterAnalyzer.TokenStream(field, reader);
    // NOTE(review): stored into a field rather than a local -- presumably read
    // elsewhere in the class; confirm before localizing.
    _termAtt = (TermAttribute)tokenStream.AddAttribute(typeof(TermAttribute));

    var words = new List<string>();
    while (tokenStream.IncrementToken())
    {
        // Fix: the previous version also accumulated _termAtt.ToString() into a
        // parallel `tokens` list that was never read; that dead work is removed.
        words.Add(_termAtt.Term());
    }
    return words.ToArray();
}
/// <summary>
/// Tokenizes <paramref name="text"/> with <paramref name="analyzer"/> under the
/// "contents" field, writing a buffer/length/term diagnostic line to the console for
/// each token, and returns the collected terms.
/// </summary>
/// <param name="analyzer">analyzer used to tokenize the text</param>
/// <param name="text">raw text to analyze</param>
/// <returns>the analyzed terms, in token order</returns>
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    List<string> collected = new List<string>();
    while (stream.IncrementToken())
    {
        // Diagnostic trace of the raw term buffer alongside the materialized term.
        Console.WriteLine("Buffer:={0}, Length:={1}, Term:={2}".FormatWith(tokenAttr.TermBuffer(), tokenAttr.TermLength(), tokenAttr.Term()));
        collected.Add(tokenAttr.Term());
    }

    stream.End();
    stream.Close();
    return collected;
}
/// <summary>
/// Tokenizes <paramref name="content"/> with the Lucene StandardTokenizer pipeline
/// (standard filter, lowercasing, stop-word removal using the configured stop list)
/// and returns the surviving tokens in order.
/// </summary>
/// <param name="content">raw text to tokenize</param>
/// <param name="config">supplies the stop-word list</param>
/// <returns>filtered, lowercased tokens</returns>
static List<string> TokenizeStandard(string content, TokenizeConfig config)
{
    StringReader reader = new StringReader(content);
    TokenStream pipeline = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    pipeline = new StandardFilter(pipeline);
    pipeline = new LowerCaseFilter(pipeline);
    pipeline = new StopFilter(true, pipeline, stophash, true);

    // Set up lexicon/invertlexicon, featurevectors, wordappearancecount
    pipeline.Reset();
    TermAttribute termattr = (TermAttribute)pipeline.GetAttribute(typeof(TermAttribute));
    List<string> words = new List<string>();
    while (pipeline.IncrementToken())
    {
        words.Add(termattr.Term());
    }
    return words;
}
/// <summary> Simple similarity query generator.
/// Takes every unique word and forms a boolean query where all words are optional.
/// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
/// The only caveat is the first hit returned <b>should be</b> your source document - you'll
/// need to then ignore that.
/// <p>
/// So, if you have a code fragment like this:
/// <br>
/// <code>
/// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
/// </code>
/// <p>
/// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
/// <p>
/// The philosophy behind this method is "two documents are similar if they share lots of words".
/// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
/// higher similarity score if they share more uncommon words.
/// <p>
/// This method is fail-safe in that if a long 'body' is passed in and
/// {@link BooleanQuery#add BooleanQuery.add()} (used internally) throws
/// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses},
/// the query as it is will be returned.
/// </summary>
/// <param name="body">the body of the document you want to find similar documents to</param>
/// <param name="a">the analyzer to use to parse the body</param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
/// <param name="stop">optional set of stop words to ignore</param>
/// <returns> a query with all unique words in 'body'</returns>
/// <throws> IOException this can't happen...</throws>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
{
    TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    BooleanQuery tmp = new BooleanQuery();
    System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
    while (ts.IncrementToken())
    {
        String word = termAtt.Term();
        // ignore opt stop words
        if (stop != null && stop.Contains(word))
        {
            continue;
        }
        // ignore dups (fix: dropped the redundant `== true` comparison)
        if (already.Contains(word))
        {
            continue;
        }
        already.Add(word, word);
        // add to query
        TermQuery tq = new TermQuery(new Term(field, word));
        try
        {
            tmp.Add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses)
        {
            // fail-safe, just return what we have, not the end of the world
            break;
        }
    }
    return (tmp);
}
/// <summary>
/// Drives <paramref name="stream"/> to exhaustion, asserting that only tokens carrying
/// the PROPER_NOUN_ANNOTATION payload equal "tokenstream", and that each term matches
/// the expected entry in <c>results</c> at the same index.
/// </summary>
/// <param name="stream">token stream under test; reset before consumption</param>
private static void ConsumeStreamNewAPI(TokenStream stream)
{
    stream.Reset();
    PayloadAttribute payloadAtt = (PayloadAttribute)stream.AddAttribute(typeof(PayloadAttribute));
    TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

    int position = 0;
    while (stream.IncrementToken())
    {
        System.String term = termAtt.Term();
        Payload p = payloadAtt.GetPayload();
        bool properNoun = p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
        if (properNoun)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        Assert.AreEqual(results[position], term);
        position++;
    }
}
/// <summary>
/// Expands each analyzed term of <paramref name="f"/>'s query string into fuzzy
/// variants and pushes the best-scoring variant terms onto the global queue <c>q</c>.
/// For every distinct term: enumerate fuzzy matches, keep the top
/// MAX_VARIANTS_PER_TERM by edit-distance score, then rescale each kept score by an
/// IDF factor so the overall ranking favors rarer terms.
/// </summary>
/// <param name="reader">index reader used for term enumeration and doc frequencies</param>
/// <param name="f">bundle of field name, query string and fuzzy settings</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();
    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        // Each distinct analyzed term is expanded only once.
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); //store the df so all variants use same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            // Enumerate fuzzy variants; the bounded priority queue keeps only the
            // best MAX_VARIANTS_PER_TERM by similarity score.
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            }
            while (fe.Next());
            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) //no direct match we can use as df for all variants
                {
                    df = avgDf; //use avg df of all variants
                }
                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
/// <summary>
/// Appends a single character to the attribute's current term.
/// Rebuilds the term string on each call; fine for occasional use, not for hot loops.
/// </summary>
/// <param name="termAtt">attribute whose term buffer is extended</param>
/// <param name="ch">character to append</param>
public static void Append(this TermAttribute termAtt, char ch)
{
    // Fix: string + char concatenates directly; the temporary char[]-to-string
    // allocation (`new string(new[] { ch })`) was unnecessary.
    termAtt.SetTermBuffer(termAtt.Term() + ch);
}
/// <summary>
/// Appends <paramref name="str"/> to the attribute's current term.
/// Rebuilds the term string on each call; fine for occasional use, not for hot loops.
/// </summary>
/// <param name="termAtt">attribute whose term buffer is extended</param>
/// <param name="str">text to append</param>
public static void Append(this TermAttribute termAtt, string str)
{
    var extended = termAtt.Term() + str;
    termAtt.SetTermBuffer(extended); // TODO: Not optimal, but works
}
/// <summary>
/// Consumes <paramref name="ts"/> and asserts its full contents: the term for every
/// token, plus (when the corresponding array is non-null) start/end offsets, types and
/// position increments. Also verifies the stream calls clearAttributes() for every
/// token, that it ends exactly after <paramref name="output"/>.Length tokens, and
/// (when <paramref name="finalOffset"/> is set) the end offset after End().
/// Null array arguments skip that per-token check entirely.
/// </summary>
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
{
    Assert.IsNotNull(output);
    // Records whether the stream called clearAttributes() between tokens.
    CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

    Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
    TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    // Optional attributes are only fetched when the caller supplied expectations.
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
        offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
    }
    TypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
        typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
        posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.SetType("bogusType");
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.SetPositionIncrement(45987657);
        }

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
        }
    }
    // Stream must be exhausted exactly at output.Length tokens.
    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
    {
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
    }
    ts.Close();
}