public QueryTermVector(System.String queryString, Analyzer analyzer)
		{
			if (analyzer != null)
			{
				TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
				if (stream != null)
				{
					System.Collections.ArrayList terms = new System.Collections.ArrayList();
					try
					{
						bool hasMoreTokens = false;
						
						stream.Reset();
						TermAttribute termAtt = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
						
						hasMoreTokens = stream.IncrementToken();
						while (hasMoreTokens)
						{
							terms.Add(termAtt.Term());
							hasMoreTokens = stream.IncrementToken();
						}
						ProcessTerms((System.String[]) terms.ToArray(typeof(System.String)));
					}
					catch (System.IO.IOException)
					{
						// swallow the error; on failure no terms are processed
					}
				}
			}
		}
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             IList<string> terms = new List<string>();
             try
             {
                 bool hasMoreTokens = false;
                 
                 stream.Reset();
                 ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>();
                 
                 hasMoreTokens = stream.IncrementToken();
                 while (hasMoreTokens)
                 {
                     terms.Add(termAtt.Term);
                     hasMoreTokens = stream.IncrementToken();
                 }
                 ProcessTerms(terms.ToArray());
             }
              catch (System.IO.IOException)
              {
                  // swallow the error; on failure no terms are processed
              }
         }
     }
 }
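For orientation, a minimal construction sketch for the constructor above, assuming Lucene.Net 2.9/3.x and StandardAnalyzer (any Analyzer works; the version constant is illustrative):

// Illustrative usage: analyze a raw query string into a term vector.
var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
var vector = new Lucene.Net.Search.QueryTermVector("fast searchers search fast", analyzer);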
Example #3
        public QueryTermVector(System.String queryString, Analyzer analyzer)
        {
            if (analyzer != null)
            {
                TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
                if (stream != null)
                {
                    List<string> terms = new List<string>();
                    try
                    {
                        bool hasMoreTokens = false;

                        stream.Reset();
                        TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

                        hasMoreTokens = stream.IncrementToken();
                        while (hasMoreTokens)
                        {
                            terms.Add(termAtt.Term());
                            hasMoreTokens = stream.IncrementToken();
                        }
                        ProcessTerms(terms.ToArray());
                    }
                        catch (System.IO.IOException)
                        {
                            // swallow the error; on failure no terms are processed
                        }
                }
            }
        }
Example #4
        //convenience method
        public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
        {
            Document doc = reader.Document(docId);

            System.String contents = doc.Get(field);
            if (contents == null)
            {
                throw new System.ArgumentException("Field " + field + " in document #" + docId + " is not stored and cannot be analyzed");
            }
            return(analyzer.TokenStream(field, new System.IO.StringReader(contents)));
        }
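A hedged sketch of consuming the returned stream with the attribute API used elsewhere in these examples (`reader`, `docId`, and `analyzer` are placeholders):

// Illustrative only: walk the analyzed tokens of a stored field.
TokenStream ts = GetTokenStream(reader, docId, "contents", analyzer);
ts.Reset();
ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
while (ts.IncrementToken())
{
    System.Console.WriteLine(termAtt.Term); // one token per iteration
}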
Example #7
		public QueryTermVector(System.String queryString, Analyzer analyzer)
		{
			if (analyzer != null)
			{
				TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
				if (stream != null)
				{
					Token next = null;
					System.Collections.ArrayList terms = new System.Collections.ArrayList();
					try
					{
						while ((next = stream.Next()) != null)
						{
							terms.Add(next.TermText());
						}
						ProcessTerms((System.String[]) terms.ToArray(typeof(System.String)));
					}
					catch (System.IO.IOException)
					{
						// swallow the error; on failure no terms are processed
					}
				}
			}
		}
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 Token reusableToken = new Token();
                 for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                 {
                     terms.Add(nextToken.Term());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
              catch (System.IO.IOException)
              {
                  // swallow the error; on failure no terms are processed
              }
         }
     }
 }
Example #9
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        ///
        /// <p/>
        ///
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        ///
        /// <p/>
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        ///
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        ///
        /// <P/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add(BooleanClause)"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));

            Lucene.Net.Analysis.Token t;
            BooleanQuery tmp = new BooleanQuery();

            System.Collections.Hashtable already = new System.Collections.Hashtable();             // ignore dups
            while ((t = ts.Next()) != null)
            {
                System.String word = t.TermText();
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }
                // ignore dups
                if (already.Contains(word))
                {
                    continue;
                }
                already.Add(word, word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, BooleanClause.Occur.SHOULD);                     //false, false);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return(tmp);
        }
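As the summary warns, the first hit should be the source document itself; a hedged round-trip sketch using the same Hits API that appears in the search example near the end (`searcher`, `analyzer`, and `body` are placeholders):

// Illustrative: search with the generated query, skipping the source document at hit 0.
Query q = FormSimilarQuery(body, analyzer, "contents", null);
Hits hits = searcher.Search(q);
for (int i = 1; i < hits.Length(); i++) // hit 0 should be the source document
{
    System.Console.WriteLine(hits.Doc(i).Get("title"));
}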
		/// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
		/// This is a convenience method that calls
		/// {@link #getBestFragments(TokenStream, String, int)}
		/// 
		/// </summary>
		/// <param name="analyzer">  the analyzer that will be used to split <code>text</code>
		/// into chunks  
		/// </param>
		/// <param name="fieldName">    the name of the field being highlighted (used by analyzer)
		/// </param>
		/// <param name="text">       	text to highlight terms in
		/// </param>
		/// <param name="maxNumFragments"> the maximum number of fragments.
		/// 
		/// </param>
		/// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
		/// </returns>
		public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments)
		{
			TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text));
			return GetBestFragments(tokenStream, text, maxNumFragments);
		}
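A hedged usage sketch for the convenience overload above; the formatter and scorer types are the same ones used in the search example near the end (`query`, `analyzer`, and `text` are placeholders):

// Illustrative: pull the three best fragments of one document's text.
var formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter("<b>", "</b>");
var highlighter = new Lucene.Net.Highlight.Highlighter(formatter, new Lucene.Net.Highlight.QueryScorer(query));
string[] fragments = highlighter.GetBestFragments(analyzer, "contents", text, 3);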
Example #12
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        /// 
        /// <p>
        /// 
        /// So, if you have a code fragment like this:
        /// <br>
        /// <code>
        /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// 
        /// <p>
        /// 
        /// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
        /// 
        /// <p>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        /// 
        /// <P>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// {@link BooleanQuery#add BooleanQuery.add()} (used internally)
        /// throws
        /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            BooleanQuery tmp = new BooleanQuery();
            System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
            while (ts.IncrementToken())
            {
                String word = termAtt.Term();
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                    continue;
                // ignore dups
                if (already.Contains(word))
                    continue;
                already.Add(word, word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, BooleanClause.Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
Example #13
        public TokenStream GetTokenStream(Analyzer analyzer)
        {
            if (!((FieldType)FieldType()).Indexed)
            {
                return null;
            }
            NumericType? numericType = ((FieldType)FieldType()).NumericTypeValue;
            if (numericType != null)
            {
                if (!(InternalTokenStream is NumericTokenStream))
                {
                    // lazy init the TokenStream as it is heavy to instantiate
                    // (attributes,...) if not needed (stored field loading)
                    InternalTokenStream = new NumericTokenStream(Type.NumericPrecisionStep);
                }
                NumericTokenStream nts = (NumericTokenStream)InternalTokenStream;
                // initialize value in TokenStream
                object val = FieldsData;
                switch (numericType)
                {
                    case NumericType.INT:
                        nts.SetIntValue(Convert.ToInt32(val));
                        break;

                    case NumericType.LONG:
                        nts.SetLongValue(Convert.ToInt64(val));
                        break;

                    case NumericType.FLOAT:
                        nts.SetFloatValue(Convert.ToSingle(val));
                        break;

                    case NumericType.DOUBLE:
                        nts.SetDoubleValue(Convert.ToDouble(val));
                        break;

                    default:
                        throw new Exception("Should never get here");
                }
                return InternalTokenStream;
            }

            if (!((FieldType)FieldType()).Tokenized)
            {
                if (StringValue == null)
                {
                    throw new System.ArgumentException("Non-Tokenized Fields must have a String value");
                }
                if (!(InternalTokenStream is StringTokenStream))
                {
                    // lazy init the TokenStream as it is heavy to instantiate
                    // (attributes,...) if not needed (stored field loading)
                    InternalTokenStream = new StringTokenStream();
                }
                ((StringTokenStream)InternalTokenStream).Value = StringValue;
                return InternalTokenStream;
            }

            if (TokenStream_Renamed != null)
            {
                return TokenStream_Renamed;
            }
            else if (ReaderValue != null)
            {
                return analyzer.TokenStream(Name(), ReaderValue);
            }
            else if (StringValue != null)
            {
                TextReader sr = new StringReader(StringValue);
                return analyzer.TokenStream(Name(), sr);
            }

            throw new System.ArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
        }
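The numeric branch above lazily reuses one NumericTokenStream; a minimal sketch of what the NumericType.INT case amounts to (the precision step value here is hypothetical):

// Illustrative mirror of the INT case: the stream then emits trie-encoded tokens for 42.
var nts = new NumericTokenStream(4); // 4 = hypothetical precision step
nts.SetIntValue(42);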
Example #14
 /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
 /// This is a convenience method that calls
 /// {@link #getBestFragments(TokenStream, String, int)}
 /// 
 /// </summary>
 /// <param name="analyzer">  the analyzer that will be used to split <code>text</code> into chunks </param>
 /// <param name="text">text to highlight terms in</param>
 /// <param name="maxNumFragments"> the maximum number of fragments.
 /// 
 /// </param>
 /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
 /// </returns>
 public String[] GetBestFragments(Analyzer analyzer, string text, int maxNumFragments)
 {
     TokenStream tokenStream = analyzer.TokenStream("field", new StringReader(text));
     return GetBestFragments(tokenStream, text, maxNumFragments);
 }
 /// <summary> Simple similarity query generators.
 /// Takes every unique word and forms a boolean query where all words are optional.
 /// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
 /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
 /// need to then ignore that.
 /// 
 /// <p>
 /// 
 /// So, if you have a code fragment like this:
 /// <br>
 /// <code>
 /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
 /// </code>
 /// 
 /// <p>
 /// 
 /// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
 /// 
 /// <p>
 /// The philosophy behind this method is "two documents are similar if they share lots of words".
 /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
 /// 
 /// <P>
 /// This method is fail-safe in that if a long 'body' is passed in and
 /// {@link BooleanQuery#add BooleanQuery.add()} (used internally)
 /// throws
 /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
 /// query as it is will be returned.
 /// </summary>
 /// <param name="body">the body of the document you want to find similar documents to
 /// </param>
 /// <param name="a">the analyzer to use to parse the body
 /// </param>
 /// <param name="field">the field you want to search on, probably something like "contents" or "body"
 /// </param>
 /// <param name="stop">optional set of stop words to ignore
 /// </param>
 /// <returns> a query with all unique words in 'body'
 /// </returns>
 /// <throws>  IOException this can't happen... </throws>
 public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
 {
     TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
     Lucene.Net.Analysis.Token t;
     BooleanQuery tmp = new BooleanQuery();
     System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
     while ((t = ts.Next()) != null)
     {
         System.String word = t.TermText();
         // ignore opt stop words
         if (stop != null && stop.Contains(word))
             continue;
         // ignore dups
      if (already.Contains(word))
             continue;
         already.Add(word, word);
         // add to query
         TermQuery tq = new TermQuery(new Term(field, word));
         try
         {
             tmp.Add(tq, BooleanClause.Occur.SHOULD); //false, false);
         }
          catch (BooleanQuery.TooManyClauses)
         {
             // fail-safe, just return what we have, not the end of the world
             break;
         }
     }
     return tmp;
 }
Example #16
        /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
        /// This is a convenience method that calls
        /// {@link #getBestFragments(TokenStream, String, int)}
        ///
        /// </summary>
        /// <param name="analyzer">  the analyzer that will be used to split <code>text</code>
        /// into chunks
        /// </param>
        /// <param name="fieldName">    the name of the field being highlighted (used by analyzer)
        /// </param>
        /// <param name="text">         text to highlight terms in
        /// </param>
        /// <param name="maxNumFragments"> the maximum number of fragments.
        ///
        /// </param>
        /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
        /// </returns>
        public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text));

            return(GetBestFragments(tokenStream, text, maxNumFragments));
        }
Example #17
        /// <summary>
        /// Creates a query from the analysis chain.
        /// <p>
        /// Expert: this is more useful for subclasses such as queryparsers.
        /// If using this class directly, just use <seealso cref="#createBooleanQuery(String, String)"/>
        /// and <seealso cref="#createPhraseQuery(String, String)"/> </summary>
        /// <param name="analyzer"> analyzer used for this query </param>
        /// <param name="operator"> default boolean operator used for this query </param>
        /// <param name="field"> field to create queries against </param>
        /// <param name="queryText"> text to be passed to the analysis chain </param>
        /// <param name="quoted"> true if phrases should be generated when terms occur at more than one position </param>
        /// <param name="phraseSlop"> slop factor for phrase/multiphrase queries </param>
        protected internal Query CreateFieldQuery(Analyzer analyzer, BooleanClause.Occur @operator, string field, string queryText, bool quoted, int phraseSlop)
        {
            Debug.Assert(@operator == BooleanClause.Occur.SHOULD || @operator == BooleanClause.Occur.MUST);
            // Use the analyzer to get all the tokens, and then build a TermQuery,
            // PhraseQuery, or nothing based on the term count
            CachingTokenFilter buffer = null;
            ITermToBytesRefAttribute termAtt = null;
            IPositionIncrementAttribute posIncrAtt = null;
            int numTokens = 0;
            int positionCount = 0;
            bool severalTokensAtSamePosition = false;
            bool hasMoreTokens = false;

            TokenStream source = null;
            try
            {
                source = analyzer.TokenStream(field, new StringReader(queryText));
                source.Reset();
                buffer = new CachingTokenFilter(source);
                buffer.Reset();

                if (buffer.HasAttribute<ITermToBytesRefAttribute>())
                {
                    termAtt = buffer.GetAttribute<ITermToBytesRefAttribute>();
                }
                if (buffer.HasAttribute<IPositionIncrementAttribute>())
                {
                    posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>();
                }

                if (termAtt != null)
                {
                    try
                    {
                        hasMoreTokens = buffer.IncrementToken();
                        while (hasMoreTokens)
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                            hasMoreTokens = buffer.IncrementToken();
                        }
                    }
                    catch (System.IO.IOException)
                    {
                        // ignore
                    }
                }
            }
            catch (System.IO.IOException e)
            {
                throw new Exception("Error analyzing query text", e);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(source);
            }

            // rewind the buffer stream
            buffer.Reset();

            BytesRef bytes = termAtt == null ? null : termAtt.BytesRef;

            if (numTokens == 0)
            {
                return null;
            }
            else if (numTokens == 1)
            {
                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext == true);
                    termAtt.FillBytesRef();
                }
                catch (System.IO.IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }
                return NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
            }
            else
            {
                if (severalTokensAtSamePosition || (!quoted))
                {
                    if (positionCount == 1 || (!quoted))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            BooleanQuery q = NewBooleanQuery(true);
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                q.Add(currentQuery, BooleanClause.Occur.SHOULD);
                            }
                            return q;
                        }
                        else
                        {
                            // multiple positions
                            BooleanQuery q = NewBooleanQuery(false);
                            Query currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    if (!(currentQuery is BooleanQuery))
                                    {
                                        Query t = currentQuery;
                                        currentQuery = NewBooleanQuery(true);
                                        ((BooleanQuery)currentQuery).Add(t, BooleanClause.Occur.SHOULD);
                                    }
                                    ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
                                }
                                else
                                {
                                    if (currentQuery != null)
                                    {
                                        q.Add(currentQuery, @operator);
                                    }
                                    currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                }
                            }
                            q.Add(currentQuery, @operator);
                            return q;
                        }
                    }
                    else
                    {
                        // phrase query:
                        MultiPhraseQuery mpq = NewMultiPhraseQuery();
                        mpq.Slop = phraseSlop;
                        IList<Term> multiTerms = new List<Term>();
                        int position = -1;
                        for (int i = 0; i < numTokens; i++)
                        {
                            int positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                Debug.Assert(hasNext == true);
                                termAtt.FillBytesRef();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (System.IO.IOException)
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                if (EnablePositionIncrements_Renamed)
                                {
                                    mpq.Add(multiTerms.ToArray(), position);
                                }
                                else
                                {
                                    mpq.Add(multiTerms.ToArray());
                                }
                                multiTerms.Clear();
                            }
                            position += positionIncrement;
                            multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                        if (EnablePositionIncrements_Renamed)
                        {
                            mpq.Add(multiTerms.ToArray(), position);
                        }
                        else
                        {
                            mpq.Add(multiTerms.ToArray());
                        }
                        return mpq;
                    }
                }
                else
                {
                    PhraseQuery pq = NewPhraseQuery();
                    pq.Slop = phraseSlop;
                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        int positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        if (EnablePositionIncrements_Renamed)
                        {
                            position += positionIncrement;
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position);
                        }
                        else
                        {
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                    }
                    return pq;
                }
            }
        }
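A hedged sketch of how this protected method is normally reached, via the public helpers the summary cross-references (the declaring class is assumed to be QueryBuilder; `analyzer` is a placeholder):

// Illustrative: the public entry points route into CreateFieldQuery.
var builder = new QueryBuilder(analyzer);
Query orQuery = builder.CreateBooleanQuery("body", "fast searchers"); // quoted: false
Query phrase = builder.CreatePhraseQuery("body", "fast searchers");   // quoted: true, applies phraseSlop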
Example #18
        /// <summary> Highlights chosen terms in a text, extracting the most relevant section.
        /// This is a convenience method that calls
        /// {@link #GetBestFragment(TokenStream, String)}
        ///
        /// </summary>
        /// <param name="analyzer">  the analyzer that will be used to split <code>text</code>
        /// into chunks
        /// </param>
        /// <param name="text">text to highlight terms in
        /// </param>
        /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy
        ///
        /// </param>
        /// <returns> highlighted text fragment or null if no terms found
        /// </returns>
        public System.String GetBestFragment(Analyzer analyzer, System.String fieldName, System.String text)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text));

            return(GetBestFragment(tokenStream, text));
        }
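A hedged one-liner for the single-fragment convenience above, showing the documented null case (`highlighter`, `analyzer`, and `text` are placeholders):

// Illustrative: fall back to the raw text when no query terms occur in it.
string best = highlighter.GetBestFragment(analyzer, "contents", text) ?? text;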
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        /// 
        /// <p/>
        /// 
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// 
        /// <p/>
        /// 
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        /// 
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        /// 
        /// <P/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            BooleanQuery tmp = new BooleanQuery();
            ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.GetSet<string>(); // ignore dups
            while (ts.IncrementToken())
            {
                String word = termAtt.Term;
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                    continue;
                // ignore dups
                if (already.Contains(word))
                    continue;
                already.Add(word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
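This overload takes an ISet&lt;string&gt; of stop words; a hedged construction sketch using the same set factory as the method body (`body` and `analyzer` are placeholders):

// Illustrative: ignore very common words when forming the similarity query.
ISet<string> stop = Lucene.Net.Support.Compatibility.SetFactory.GetSet<string>();
stop.Add("the");
stop.Add("a");
stop.Add("to");
Query q = FormSimilarQuery(body, analyzer, "contents", stop);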
Example #21
        // Tokenizes the fields of a document into Postings.
        private void  InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String fieldName   = field.Name();
                int           fieldNumber = fieldInfos.FieldNumber(fieldName);

                int length   = fieldLengths[fieldNumber];               // length of field
                int position = fieldPositions[fieldNumber];             // position in field
                if (length > 0)
                {
                    position += analyzer.GetPositionIncrementGap(fieldName);
                }
                int offset = fieldOffsets[fieldNumber];                 // offset field

                if (field.IsIndexed())
                {
                    if (!field.IsTokenized())
                    {
                        // un-tokenized field
                        System.String stringValue = field.StringValue();
                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                        }
                        else
                        {
                            AddPosition(fieldName, stringValue, position++, null);
                        }
                        offset += stringValue.Length;
                        length++;
                    }
                    else
                    {
                        System.IO.TextReader reader;                         // find or make Reader
                        if (field.ReaderValue() != null)
                        {
                            reader = field.ReaderValue();
                        }
                        else if (field.StringValue() != null)
                        {
                            reader = new System.IO.StringReader(field.StringValue());
                        }
                        else
                        {
                            throw new System.ArgumentException("field must have either String or Reader value");
                        }

                        // Tokenize field and add to postingTable
                        TokenStream stream = analyzer.TokenStream(fieldName, reader);
                        try
                        {
                            Token lastToken = null;
                            for (Token t = stream.Next(); t != null; t = stream.Next())
                            {
                                position += (t.GetPositionIncrement() - 1);

                                if (field.IsStoreOffsetWithTermVector())
                                {
                                    AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                }
                                else
                                {
                                    AddPosition(fieldName, t.TermText(), position++, null);
                                }

                                lastToken = t;
                                if (++length > maxFieldLength)
                                {
                                    if (infoStream != null)
                                    {
                                        infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                    }
                                    break;
                                }
                            }

                            if (lastToken != null)
                            {
                                offset += lastToken.EndOffset() + 1;
                            }
                        }
                        finally
                        {
                            stream.Close();
                        }
                    }

                    fieldLengths[fieldNumber]   = length;                   // save field length
                    fieldPositions[fieldNumber] = position;                 // save field position
                    fieldBoosts[fieldNumber]   *= field.GetBoost();
                    fieldOffsets[fieldNumber]   = offset;
                }
            }
        }
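The GetPositionIncrementGap call near the top of InvertDocument is what keeps phrases from matching across the values of a multi-valued field; a minimal sketch of overriding it, assuming the classic Analyzer API shown in these examples:

// Illustrative: put a large position gap between successive values of the same field.
public class GappedAnalyzer : Lucene.Net.Analysis.Analyzer
{
    public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        return new Lucene.Net.Analysis.WhitespaceTokenizer(reader);
    }

    public override int GetPositionIncrementGap(string fieldName)
    {
        return 100; // the next value's positions start 100 slots later, blocking cross-value phrase matches
    }
}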
Example #22
        public SearchModel Search(string searchText)
        {
            var result = new SearchModel();

            if (string.IsNullOrEmpty(searchText))
            {
                result.Message = "Įveskite paieškos užklausą.";
                return(result);
            }

            var stemmedSearchText = new LithuanianStemmer().Stem(searchText.Trim());

            if (string.IsNullOrEmpty(stemmedSearchText))
            {
                result.Message = "Įveskite paieškos užklausą.";
                return(result);
            }

            Lucene.Net.Search.Hits hits = null;
            try
            {
                if (char.IsLetter(stemmedSearchText[stemmedSearchText.Length - 1]))
                {
                    stemmedSearchText += "*";
                }

                query = parser.Parse(stemmedSearchText);

                if (searcher == null)
                {
                    searcher = new Lucene.Net.Search.IndexSearcher(CustomAppSettings.SearchIndexFolder);
                }

                hits = searcher.Search(query);
            }
            catch (Exception e)
            {
                result.Message = "Paieška nepavyko. Pataisykite užklausą. Klaidos pranešimas: " + e.Message;
                return(result);
            }

            Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter(
                "<span class=\"highlightResult\">",
                "</span>");

            var fragmenter  = new Lucene.Net.Highlight.SimpleFragmenter(100);
            var scorer      = new Lucene.Net.Highlight.QueryScorer(searcher.Rewrite(query));
            var highlighter = new Lucene.Net.Highlight.Highlighter(formatter, scorer);

            highlighter.SetTextFragmenter(fragmenter);

            Dictionary<string, int> dict_already_seen_ids = new Dictionary<string, int>();

            var list = new List<SearchIndexModel>();

            // insert the search results into a temp table which we will join with what's in the database
            for (int i = 0; i < hits.Length(); i++)
            {
                if (dict_already_seen_ids.Count < 100)
                {
                    Lucene.Net.Documents.Document doc = hits.Doc(i);
                    string id = doc.Get("id");
                    if (!dict_already_seen_ids.ContainsKey(id))
                    {
                        dict_already_seen_ids[id] = 1;
                        var model = new SearchIndexModel();
                        model.Id      = id;
                        model.Score   = hits.Score(i);
                        model.Subject = doc.Get("subject");
                        model.Type    = (EntryTypes)Enum.Parse(typeof(EntryTypes), doc.Get("type"));

                        string raw_text = HttpUtility.HtmlEncode(doc.Get("raw_text"));
                        //string raw_text = doc.Get("raw_text");

                        Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("text", new System.IO.StringReader(raw_text));
                        string highlighted_text = highlighter.GetBestFragments(stream, raw_text, 3, "...").Replace("'", "''");

                        if (highlighted_text == "") // sometimes the highlighter fails to emit text...
                        {
                            highlighted_text = raw_text.Replace("'", "''");
                        }
                        if (highlighted_text.Length > 3000)
                        {
                            highlighted_text = highlighted_text.Substring(0, 3000);
                        }

                        model.HighlightedText = highlighted_text;

                        list.Add(model);
                    }
                }
                else
                {
                    break;
                }
            }

            result.List         = list;
            result.SearchPhrase = searchText;
            if (list.Count == 0)
            {
                result.Message = string.Format("Įrašų pagal užklausą '{0}' nerasta. Patikslinkite paieškos duomenis.", searchText); // "No entries found for query '{0}'. Refine your search."
            }

            return(result);
        }