/// <summary>
/// Tokenizes the text supplied by <paramref name="r"/> and accumulates a per-term
/// occurrence counter in <paramref name="termFreqMap"/>.
/// </summary>
/// <remarks>
/// Tokenizing stops once more than <c>maxNumTokensParsed</c> tokens have been read.
/// Tokens rejected by <c>IsNoiseWord</c> still count toward that limit but are not recorded.
/// </remarks>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a map of terms to their frequency counters; updated in place</param>
/// <param name="fieldName">used by the analyzer for any special per-field analysis</param>
private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string, Int> termFreqMap, System.String fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
    int tokenCount = 0;

    // for every token
    while (ts.IncrementToken())
    {
        string word = termAtt.Term;
        tokenCount++;
        if (tokenCount > maxNumTokensParsed)
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }

        // Increment frequency. TryGetValue is used instead of the indexer because
        // IDictionary<TKey, TValue>'s indexer throws KeyNotFoundException for a
        // missing key on standard implementations, which would abort on the first
        // unseen term instead of creating its counter.
        Int cnt;
        if (termFreqMap.TryGetValue(word, out cnt) && cnt != null)
        {
            cnt.x++;
        }
        else
        {
            // First occurrence: a fresh Int is stored as-is (presumably it starts
            // at a count of one - confirm against the Int class).
            termFreqMap[word] = new Int();
        }
    }
}
/// <summary>
/// Runs the analyzer over the text from <paramref name="r"/> and tallies each accepted
/// term in <paramref name="termFreqMap"/>.
/// </summary>
/// <remarks>
/// Reading stops once more than <c>maxNumTokensParsed</c> tokens have been seen.
/// Terms for which <c>IsNoiseWord</c> returns true are counted against that limit
/// but are not added to the map.
/// </remarks>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a map of terms to their <c>Int</c> frequency counters; updated in place</param>
/// <param name="fieldName">used by the analyzer for any special per-field analysis</param>
protected void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    TokenStream stream = analyzer.TokenStream(fieldName, r);
    TermAttribute termAttribute = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

    int seen = 0;
    while (stream.IncrementToken())
    {
        System.String term = termAttribute.Term();

        // The limit applies to every token read, including noise words.
        if (++seen > maxNumTokensParsed)
        {
            break;
        }

        if (!IsNoiseWord(term))
        {
            Int counter = (Int)termFreqMap[term];
            if (counter != null)
            {
                // Term already known: bump its counter.
                counter.x++;
            }
            else
            {
                // First sighting of this term: register a new counter.
                termFreqMap[term] = new Int();
            }
        }
    }
}
/// <summary>
/// Builds the term vector by tokenizing <paramref name="queryString"/> with
/// <paramref name="analyzer"/> and passing the resulting terms to <c>ProcessTerms</c>.
/// </summary>
/// <remarks>
/// Nothing is processed when <paramref name="analyzer"/> is null or when it yields
/// no token stream. An <see cref="System.IO.IOException"/> raised while reading
/// tokens is swallowed, in which case <c>ProcessTerms</c> is not called.
/// </remarks>
/// <param name="queryString">the query text to tokenize</param>
/// <param name="analyzer">the analyzer used to produce tokens; may be null</param>
public QueryTermVector(System.String queryString, Analyzer analyzer)
{
    if (analyzer == null)
    {
        return;
    }

    TokenStream tokens = analyzer.TokenStream("", new System.IO.StringReader(queryString));
    if (tokens == null)
    {
        return;
    }

    System.Collections.ArrayList collected = new System.Collections.ArrayList();
    try
    {
        tokens.Reset();
        TermAttribute termAttribute = (TermAttribute) tokens.AddAttribute(typeof(TermAttribute));

        while (tokens.IncrementToken())
        {
            collected.Add(termAttribute.Term());
        }

        ProcessTerms((System.String[]) collected.ToArray(typeof(System.String)));
    }
    catch (System.IO.IOException)
    {
        // Deliberately ignored: a read failure leaves the vector without processed terms.
    }
}
/// <summary>
/// Simple similarity query generator: takes every unique word in <paramref name="body"/>
/// and forms a boolean query in which all words are optional.
/// </summary>
/// <remarks>
/// <para>
/// Use the returned query against your searcher to find similar documents. The one
/// caveat is that the first hit returned <b>should be</b> your source document, which
/// you will then need to ignore.
/// </para>
/// <para>
/// So, if you have a code fragment like this:
/// <code>
/// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
/// </code>
/// the query returned, in string form, will be
/// <c>'(i use lucene to search fast searchers are good')</c>.
/// </para>
/// <para>
/// The philosophy behind this method is "two documents are similar if they share lots
/// of words". Note that behind the scenes, Lucene's scoring algorithm will tend to give
/// two documents a higher similarity score if they share more uncommon words.
/// </para>
/// <para>
/// This method is fail-safe: if a long body is passed in and adding a clause throws
/// <c>BooleanQuery.TooManyClauses</c>, the query built so far is returned.
/// </para>
/// </remarks>
/// <param name="body">the body of the document you want to find similar documents to</param>
/// <param name="a">the analyzer to use to parse the body</param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
/// <param name="stop">optional set of stop words to ignore; may be null</param>
/// <returns>a query with all unique words in <paramref name="body"/></returns>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
{
    TokenStream stream = a.TokenStream(field, new System.IO.StringReader(body));
    TermAttribute termAttribute = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

    BooleanQuery query = new BooleanQuery();
    // Words already added to the query, used to ignore duplicates.
    System.Collections.Hashtable seenWords = new System.Collections.Hashtable();

    while (stream.IncrementToken())
    {
        String term = termAttribute.Term();

        // Skip optional stop words first, then words that were already added.
        bool isStopWord = stop != null && stop.Contains(term);
        if (isStopWord || seenWords.Contains(term))
        {
            continue;
        }
        seenWords.Add(term, term);

        TermQuery clause = new TermQuery(new Term(field, term));
        try
        {
            query.Add(clause, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses)
        {
            // Fail-safe: just return what we have, not the end of the world.
            break;
        }
    }

    return query;
}
/// <summary>
/// Creates a filter over <paramref name="input"/>, remembering the field name and
/// registering the position-increment, payload and term attributes on the input stream.
/// </summary>
/// <param name="input">the token stream being wrapped</param>
/// <param name="fieldName">the name of the field this filter is applied to</param>
public PayloadFilter(TokenStream input, System.String fieldName)
    : base(input)
{
    // Attributes are registered in the same order as before: position increment, payload, term.
    posIncrAttr = (PositionIncrementAttribute) input.AddAttribute(typeof(PositionIncrementAttribute));
    payloadAttr = (PayloadAttribute) input.AddAttribute(typeof(PayloadAttribute));
    termAttr = (TermAttribute) input.AddAttribute(typeof(TermAttribute));

    // Plain state: both counters start from zero.
    this.fieldName = fieldName;
    pos = 0;
    i = 0;
}