Пример #1
0
        /// <summary>
        /// Builds a term vector from <paramref name="queryString"/> by tokenizing it
        /// with <paramref name="analyzer"/> and handing the collected terms to
        /// ProcessTerms. Does nothing when the analyzer is null or produces no stream.
        /// </summary>
        /// <param name="queryString">Raw query text to tokenize.</param>
        /// <param name="analyzer">Analyzer used to produce the token stream; may be null.</param>
        public QueryTermVector(System.String queryString, Analyzer analyzer)
        {
            if (analyzer == null)
            {
                return;
            }

            TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
            if (stream == null)
            {
                return;
            }

            List<string> terms = new List<string>();
            try
            {
                stream.Reset();
                TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

                // Collect every token the analyzer produces.
                while (stream.IncrementToken())
                {
                    terms.Add(termAtt.Term());
                }
                ProcessTerms(terms.ToArray());
            }
            catch (System.IO.IOException)
            {
                // Best-effort: a tokenization failure leaves the vector empty,
                // preserving the original silent-swallow contract.
            }
            finally
            {
                stream.Close(); // was leaked in the original
            }
        }
Пример #2
0
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        protected void AddTermFrequencies(System.IO.TextReader r, IDictionary <string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts         = analyzer.TokenStream(fieldName, r);
            int         tokenCount = 0;
            // for every token
            ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>();

            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
Пример #3
0
        /// <summary> Simple similarity query generator.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        /// <p>
        /// So, if you have a code fragment like this:
        /// <br>
        /// <code>
        /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// <p>
        /// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
        /// <p>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
        /// higher similarity score if they share more uncommon words.
        /// <P>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// {@link BooleanQuery#add BooleanQuery.add()} (used internally)
        /// throws
        /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to</param>
        /// <param name="a">the analyzer to use to parse the body</param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
        /// <param name="stop">optional set of stop words to ignore</param>
        /// <returns> a query with all unique words in 'body'</returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            TokenStream   ts      = a.TokenStream(field, new System.IO.StringReader(body));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            BooleanQuery tmp = new BooleanQuery();

            // Tracks words already added so each unique word contributes one clause.
            HashSet<string> already = new HashSet<string>();
            while (ts.IncrementToken())
            {
                String word = termAtt.Term();
                // ignore optional stop words
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }
                // HashSet.Add returns false for duplicates — skip them.
                if (!already.Add(word))
                {
                    continue;
                }
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, BooleanClause.Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
Пример #4
0
        /// <summary>
        /// Builds a combined boolean query over four change-record fields by tokenizing
        /// each input text and OR-ing one TermQuery per token. The CI sub-query is
        /// boosted and the applications sub-query requires at least one match.
        /// </summary>
        /// <param name="changecommBody">Text matched against "change_description".</param>
        /// <param name="changecommGroup">Text matched against "change_group".</param>
        /// <param name="changecommApps">Text matched against "application".</param>
        /// <param name="changecommCI">Text matched against "change_CI".</param>
        /// <returns>A BooleanQuery combining the four per-field sub-queries as SHOULD clauses.</returns>
        public BooleanQuery GetCommQuery(string changecommBody, string changecommGroup, string changecommApps, string changecommCI)
        {
            Lucene.Net.Analysis.Analyzer commsAnalyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

            Lucene.Net.Search.BooleanQuery.MaxClauseCount = 25000;

            // NOTE(review): the original code passes the input *text* as the field-name
            // argument of Analyzer.TokenStream. StandardAnalyzer does not vary its
            // tokenization by field name, so behavior is unchanged, but confirm this
            // was intentional rather than a swapped argument.
            Lucene.Net.Search.BooleanQuery query1 = BuildCommSubQuery(commsAnalyzer, changecommBody, "change_description");
            Lucene.Net.Search.BooleanQuery query2 = BuildCommSubQuery(commsAnalyzer, changecommGroup, "change_group");
            Lucene.Net.Search.BooleanQuery query3 = BuildCommSubQuery(commsAnalyzer, changecommApps, "application");
            Lucene.Net.Search.BooleanQuery query4 = BuildCommSubQuery(commsAnalyzer, changecommCI, "change_CI");

            query4.Boost = 5;                    // CI matches weigh the most
            query3.MinimumNumberShouldMatch = 1; // demand at least one application token

            Lucene.Net.Search.BooleanQuery comQuery = new Lucene.Net.Search.BooleanQuery();
            comQuery.Add(query1, Lucene.Net.Search.Occur.SHOULD);
            comQuery.Add(query2, Lucene.Net.Search.Occur.SHOULD);
            comQuery.Add(query3, Lucene.Net.Search.Occur.SHOULD);
            comQuery.Add(query4, Lucene.Net.Search.Occur.SHOULD);

            return comQuery;
        }

        /// <summary>
        /// Tokenizes <paramref name="text"/> with <paramref name="analyzer"/> and returns a
        /// BooleanQuery containing one SHOULD TermQuery per token against
        /// <paramref name="indexField"/>. Tokenization failures are logged and yield a
        /// (possibly partial) query, matching the original per-block catch-and-continue behavior.
        /// </summary>
        private static Lucene.Net.Search.BooleanQuery BuildCommSubQuery(Lucene.Net.Analysis.Analyzer analyzer, string text, string indexField)
        {
            Lucene.Net.Search.BooleanQuery query = new Lucene.Net.Search.BooleanQuery();

            // Field-name argument mirrors the original call sites (the text itself).
            Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream(text, new StringReader(text));
            try
            {
                stream.Reset();
                var termAttr = stream.GetAttribute <Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();

                while (stream.IncrementToken())
                {
                    query.Add(new Lucene.Net.Search.TermQuery(new Term(indexField, termAttr.Term)),
                              Lucene.Net.Search.Occur.SHOULD);
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex);
            }
            return query;
        }