public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { bool hasMoreTokens = false; stream.Reset(); TermAttribute termAtt = (TermAttribute) stream.AddAttribute(typeof(TermAttribute)); hasMoreTokens = stream.IncrementToken(); while (hasMoreTokens) { terms.Add(termAtt.Term()); hasMoreTokens = stream.IncrementToken(); } ProcessTerms((System.String[]) terms.ToArray(typeof(System.String))); } catch (System.IO.IOException e) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { IList<string> terms = new List<string>(); try { bool hasMoreTokens = false; stream.Reset(); ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>(); hasMoreTokens = stream.IncrementToken(); while (hasMoreTokens) { terms.Add(termAtt.Term); hasMoreTokens = stream.IncrementToken(); } ProcessTerms(terms.ToArray()); } catch (System.IO.IOException) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { List <string> terms = new List <string>(); try { bool hasMoreTokens = false; stream.Reset(); TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute)); hasMoreTokens = stream.IncrementToken(); while (hasMoreTokens) { terms.Add(termAtt.Term()); hasMoreTokens = stream.IncrementToken(); } ProcessTerms(terms.ToArray()); } catch (System.IO.IOException e) { } } } }
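The three constructors above are the same logic across Lucene.Net versions, differing only in collection type and term-attribute API. For orientation, a minimal usage sketch, assuming a Lucene.Net 2.9/3.0-era API where QueryTermVector lives in Lucene.Net.Search and exposes GetTerms()/GetTermFrequencies(); verify those member names against your version.

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Search;
using Version = Lucene.Net.Util.Version;

class QueryTermVectorDemo
{
    static void Main()
    {
        var analyzer = new StandardAnalyzer(Version.LUCENE_30);
        // The constructor tokenizes the query string and counts each unique term.
        var vector = new QueryTermVector("fast searchers are fast", analyzer);
        string[] terms = vector.GetTerms();          // e.g. { "fast", "searchers" } after stop-word removal
        int[] freqs = vector.GetTermFrequencies();   // parallel counts, e.g. { 2, 1 }
        for (int i = 0; i < terms.Length; i++)
        {
            System.Console.WriteLine("{0}: {1}", terms[i], freqs[i]);
        }
    }
}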
// Convenience method
public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
{
    Document doc = reader.Document(docId);
    System.String contents = doc.Get(field);
    if (contents == null)
    {
        throw new System.ArgumentException("Field " + field + " in document #" + docId + " is not stored and cannot be analyzed");
    }
    return analyzer.TokenStream(field, new System.IO.StringReader(contents));
}
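A hedged usage sketch for the helper above, assuming the Lucene.Net 2.9/3.x attribute API; `directory` and `analyzer` are placeholders for an existing index directory and the analyzer you indexed with.

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Index;

// Document 0 must have a stored "contents" field, or the helper throws ArgumentException.
IndexReader reader = IndexReader.Open(directory, true); // directory: a Lucene.Net.Store.Directory
TokenStream stream = GetTokenStream(reader, 0, "contents", analyzer);
ITermAttribute termAtt = stream.AddAttribute<ITermAttribute>();
stream.Reset();
while (stream.IncrementToken())
{
    Console.WriteLine(termAtt.Term); // re-analyzed term text of the stored field
}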
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { Token next = null; System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { while ((next = stream.Next()) != null) { terms.Add(next.TermText()); } ProcessTerms((System.String[])terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { Token reusableToken = new Token(); for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken)) { terms.Add(nextToken.Term()); } ProcessTerms((System.String[]) terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { Token next = null; System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { while ((next = stream.Next()) != null) { terms.Add(next.TermText()); } ProcessTerms((System.String[]) terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { Token reusableToken = new Token(); for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken)) { terms.Add(nextToken.Term()); } ProcessTerms((System.String[])terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use to to query your <see cref="IndexSearcher"/> for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p/> /// /// So, if you have a code fragment like this: /// <br/> /// <code> /// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p/> /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>. /// /// <p/> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. /// /// <P/> /// This method is fail-safe in that if a long 'body' is passed in and /// <see cref="BooleanQuery.Add(BooleanClause)"/> (used internally) /// throws /// <see cref="BooleanQuery.TooManyClauses"/>, the /// query as it is will be returned. /// /// /// /// /// /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); Lucene.Net.Analysis.Token t; BooleanQuery tmp = new BooleanQuery(); System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups while ((t = ts.Next()) != null) { System.String word = t.TermText(); // ignore opt stop words if (stop != null && stop.Contains(word)) { continue; } // ignore dups if (already.Contains(word) == true) { continue; } already.Add(word, word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, BooleanClause.Occur.SHOULD); //false, false); } catch (BooleanQuery.TooManyClauses) { // fail-safe, just return what we have, not the end of the world break; } } return(tmp); }
/// <summary> Highlights chosen terms in a text, extracting the most relevant sections. /// This is a convenience method that calls /// {@link #getBestFragments(TokenStream, String, int)} /// /// </summary> /// <param name="analyzer"> the analyzer that will be used to split <code>text</code> /// into chunks /// </param> /// <param name="fieldName"> the name of the field being highlighted (used by analyzer) /// </param> /// <param name="text"> text to highlight terms in /// </param> /// <param name="maxNumFragments"> the maximum number of fragments. /// /// </param> /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments) /// </returns> public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments) { TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text)); return GetBestFragments(tokenStream, text, maxNumFragments); }
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use to to query your {@link IndexSearcher} for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p> /// /// So, if you have a code fragment like this: /// <br> /// <code> /// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p> /// /// </summary> /// <summary> The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>. /// /// <p> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. /// /// <P> /// This method is fail-safe in that if a long 'body' is passed in and /// {@link BooleanQuery#add BooleanQuery.add()} (used internally) /// throws /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the /// query as it is will be returned. /// /// /// /// /// /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute)); BooleanQuery tmp = new BooleanQuery(); System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups while (ts.IncrementToken()) { String word = termAtt.Term(); // ignore opt stop words if (stop != null && stop.Contains(word)) continue; // ignore dups if (already.Contains(word) == true) continue; already.Add(word, word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, BooleanClause.Occur.SHOULD); } catch (BooleanQuery.TooManyClauses) { // fail-safe, just return what we have, not the end of the world break; } } return tmp; }
public TokenStream GetTokenStream(Analyzer analyzer)
{
    if (!((FieldType)FieldType()).Indexed)
    {
        return null;
    }

    NumericType? numericType = ((FieldType)FieldType()).NumericTypeValue;
    if (numericType != null)
    {
        if (!(InternalTokenStream is NumericTokenStream))
        {
            // lazy init the TokenStream as it is heavy to instantiate
            // (attributes,...) if not needed (stored field loading)
            InternalTokenStream = new NumericTokenStream(Type.NumericPrecisionStep);
        }
        NumericTokenStream nts = (NumericTokenStream)InternalTokenStream;
        // initialize value in TokenStream
        object val = FieldsData;
        switch (numericType)
        {
            case NumericType.INT:
                nts.SetIntValue(Convert.ToInt32(val));
                break;
            case NumericType.LONG:
                nts.SetLongValue(Convert.ToInt64(val));
                break;
            case NumericType.FLOAT:
                nts.SetFloatValue(Convert.ToSingle(val));
                break;
            case NumericType.DOUBLE:
                nts.SetDoubleValue(Convert.ToDouble(val));
                break;
            default:
                throw new Exception("Should never get here");
        }
        return InternalTokenStream;
    }

    if (!((FieldType)FieldType()).Tokenized)
    {
        if (StringValue == null)
        {
            throw new System.ArgumentException("Non-Tokenized Fields must have a String value");
        }
        if (!(InternalTokenStream is StringTokenStream))
        {
            // lazy init the TokenStream as it is heavy to instantiate
            // (attributes,...) if not needed (stored field loading)
            InternalTokenStream = new StringTokenStream();
        }
        ((StringTokenStream)InternalTokenStream).Value = StringValue;
        return InternalTokenStream;
    }

    if (TokenStream_Renamed != null)
    {
        return TokenStream_Renamed;
    }
    else if (ReaderValue != null)
    {
        return analyzer.TokenStream(Name(), ReaderValue);
    }
    else if (StringValue != null)
    {
        TextReader sr = new StringReader(StringValue);
        return analyzer.TokenStream(Name(), sr);
    }

    throw new System.ArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
}
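A sketch of the three paths above from the caller's side, assuming a Lucene.Net 4.x-era document API (TextField, IntField, StringField); exact type and member names vary between ports, so treat these as illustrative.

using Lucene.Net.Analysis;
using Lucene.Net.Documents;

// Tokenized text field: GetTokenStream delegates to analyzer.TokenStream(name, reader).
Field body = new TextField("body", "fast searchers are good", Field.Store.YES);
TokenStream bodyTokens = body.GetTokenStream(analyzer);

// Numeric field: GetTokenStream returns a (reused) NumericTokenStream instead.
Field count = new IntField("count", 42, Field.Store.NO);
TokenStream countTokens = count.GetTokenStream(analyzer);

// Non-tokenized field: a single-token StringTokenStream over the raw value.
Field id = new StringField("id", "doc-7", Field.Store.YES);
TokenStream idTokens = id.GetTokenStream(analyzer);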
/// <summary> Highlights chosen terms in a text, extracting the most relevant sections. /// This is a convenience method that calls /// {@link #getBestFragments(TokenStream, String, int)} /// /// </summary> /// <param name="analyzer"> the analyzer that will be used to split <code>text</code> into chunks </param> /// <param name="text">text to highlight terms in</param> /// <param name="maxNumFragments"> the maximum number of fragments. /// /// </param> /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments) /// </returns> public String[] GetBestFragments(Analyzer analyzer, string text, int maxNumFragments) { TokenStream tokenStream = analyzer.TokenStream("field", new StringReader(text)); return GetBestFragments(tokenStream, text, maxNumFragments); }
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use to to query your {@link IndexSearcher} for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p> /// /// So, if you have a code fragment like this: /// <br> /// <code> /// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p> /// /// </summary> /// <summary> The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>. /// /// <p> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. /// /// <P> /// This method is fail-safe in that if a long 'body' is passed in and /// {@link BooleanQuery#add BooleanQuery.add()} (used internally) /// throws /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the /// query as it is will be returned. /// /// /// /// /// /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); Lucene.Net.Analysis.Token t; BooleanQuery tmp = new BooleanQuery(); System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups while ((t = ts.Next()) != null) { System.String word = t.TermText(); // ignore opt stop words if (stop != null && stop.Contains(word)) continue; // ignore dups if (already.Contains(word) == true) continue; already.Add(word, word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, BooleanClause.Occur.SHOULD); //false, false); } catch (BooleanQuery.TooManyClauses too) { // fail-safe, just return what we have, not the end of the world break; } } return tmp; }
/// <summary> Highlights chosen terms in a text, extracting the most relevant sections. /// This is a convenience method that calls /// {@link #getBestFragments(TokenStream, String, int)} /// /// </summary> /// <param name="analyzer"> the analyzer that will be used to split <code>text</code> /// into chunks /// </param> /// <param name="fieldName"> the name of the field being highlighted (used by analyzer) /// </param> /// <param name="text"> text to highlight terms in /// </param> /// <param name="maxNumFragments"> the maximum number of fragments. /// /// </param> /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments) /// </returns> public System.String[] GetBestFragments(Analyzer analyzer, System.String fieldName, System.String text, int maxNumFragments) { TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text)); return(GetBestFragments(tokenStream, text, maxNumFragments)); }
/// <summary> /// Creates a query from the analysis chain. /// <p> /// Expert: this is more useful for subclasses such as queryparsers. /// If using this class directly, just use <seealso cref="#createBooleanQuery(String, String)"/> /// and <seealso cref="#createPhraseQuery(String, String)"/> </summary> /// <param name="analyzer"> analyzer used for this query </param> /// <param name="operator"> default boolean operator used for this query </param> /// <param name="field"> field to create queries against </param> /// <param name="queryText"> text to be passed to the analysis chain </param> /// <param name="quoted"> true if phrases should be generated when terms occur at more than one position </param> /// <param name="phraseSlop"> slop factor for phrase/multiphrase queries </param> protected internal Query CreateFieldQuery(Analyzer analyzer, BooleanClause.Occur @operator, string field, string queryText, bool quoted, int phraseSlop) { Debug.Assert(@operator == BooleanClause.Occur.SHOULD || @operator == BooleanClause.Occur.MUST); // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count CachingTokenFilter buffer = null; ITermToBytesRefAttribute termAtt = null; IPositionIncrementAttribute posIncrAtt = null; int numTokens = 0; int positionCount = 0; bool severalTokensAtSamePosition = false; bool hasMoreTokens = false; TokenStream source = null; try { source = analyzer.TokenStream(field, new StringReader(queryText)); source.Reset(); buffer = new CachingTokenFilter(source); buffer.Reset(); if (buffer.HasAttribute<ITermToBytesRefAttribute>()) { termAtt = buffer.GetAttribute<ITermToBytesRefAttribute>(); } if (buffer.HasAttribute<IPositionIncrementAttribute>()) { posIncrAtt = buffer.GetAttribute<IPositionIncrementAttribute>(); } if (termAtt != null) { try { hasMoreTokens = buffer.IncrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.IncrementToken(); } } catch (System.IO.IOException) { // ignore } } } catch (System.IO.IOException e) { throw new Exception("Error analyzing query text", e); } finally { IOUtils.CloseWhileHandlingException(source); } // rewind the buffer stream buffer.Reset(); BytesRef bytes = termAtt == null ? 
null : termAtt.BytesRef; if (numTokens == 0) { return null; } else if (numTokens == 1) { try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); termAtt.FillBytesRef(); } catch (System.IO.IOException) { // safe to ignore, because we know the number of tokens } return NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))); } else { if (severalTokensAtSamePosition || (!quoted)) { if (positionCount == 1 || (!quoted)) { // no phrase query: if (positionCount == 1) { // simple case: only one position, with synonyms BooleanQuery q = NewBooleanQuery(true); for (int i = 0; i < numTokens; i++) { try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); termAtt.FillBytesRef(); } catch (System.IO.IOException) { // safe to ignore, because we know the number of tokens } Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))); q.Add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { // multiple positions BooleanQuery q = NewBooleanQuery(false); Query currentQuery = null; for (int i = 0; i < numTokens; i++) { try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); termAtt.FillBytesRef(); } catch (System.IO.IOException) { // safe to ignore, because we know the number of tokens } if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0) { if (!(currentQuery is BooleanQuery)) { Query t = currentQuery; currentQuery = NewBooleanQuery(true); ((BooleanQuery)currentQuery).Add(t, BooleanClause.Occur.SHOULD); } ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), BooleanClause.Occur.SHOULD); } else { if (currentQuery != null) { q.Add(currentQuery, @operator); } currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))); } } q.Add(currentQuery, @operator); return q; } } else { // phrase query: MultiPhraseQuery mpq = NewMultiPhraseQuery(); mpq.Slop = phraseSlop; IList<Term> multiTerms = new List<Term>(); int position = -1; for (int i = 0; i < numTokens; i++) { int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); termAtt.FillBytesRef(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (System.IO.IOException) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.Count > 0) { if (EnablePositionIncrements_Renamed) { mpq.Add(multiTerms.ToArray(), position); } else { mpq.Add(multiTerms.ToArray()); } multiTerms.Clear(); } position += positionIncrement; multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes))); } if (EnablePositionIncrements_Renamed) { mpq.Add(multiTerms.ToArray(), position); } else { mpq.Add(multiTerms.ToArray()); } return mpq; } } else { PhraseQuery pq = NewPhraseQuery(); pq.Slop = phraseSlop; int position = -1; for (int i = 0; i < numTokens; i++) { int positionIncrement = 1; try { bool hasNext = buffer.IncrementToken(); Debug.Assert(hasNext == true); termAtt.FillBytesRef(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.PositionIncrement; } } catch (System.IO.IOException) { // safe to ignore, because we know the number of tokens } if (EnablePositionIncrements_Renamed) { position += positionIncrement; pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position); } else { pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes))); } } return pq; } } }
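The method above is the engine behind QueryBuilder's public surface; query-parser subclasses call it with different quoted/slop settings. A minimal sketch of the public entry points, assuming a Lucene(.Net) 4.x QueryBuilder; the version-enum name is a placeholder for whatever your port exposes.

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Search;
using Lucene.Net.Util;

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48); // enum name varies by port
var builder = new QueryBuilder(analyzer);

// quoted = false: one term clause per token (BooleanQuery), or a bare TermQuery for a single token.
Query loose = builder.CreateBooleanQuery("body", "fast searchers");

// quoted = true: positions are respected, producing a PhraseQuery (or MultiPhraseQuery for synonyms).
Query phrase = builder.CreatePhraseQuery("body", "fast searchers");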
/// <summary> Highlights chosen terms in a text, extracting the most relevant section. /// This is a convenience method that calls /// {@link #GetBestFragment(TokenStream, String)} /// /// </summary> /// <param name="analyzer"> the analyzer that will be used to split <code>text</code> /// into chunks /// </param> /// <param name="text">text to highlight terms in /// </param> /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy /// /// </param> /// <returns> highlighted text fragment or null if no terms found /// </returns> public System.String GetBestFragment(Analyzer analyzer, System.String fieldName, System.String text) { TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text)); return(GetBestFragment(tokenStream, text)); }
/// <summary> Highlights chosen terms in a text, extracting the most relevant section. /// This is a convenience method that calls /// {@link #GetBestFragment(TokenStream, String)} /// /// </summary> /// <param name="analyzer"> the analyzer that will be used to split <code>text</code> /// into chunks /// </param> /// <param name="text">text to highlight terms in /// </param> /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy /// /// </param> /// <returns> highlighted text fragment or null if no terms found /// </returns> public System.String GetBestFragment(Analyzer analyzer, System.String fieldName, System.String text) { TokenStream tokenStream = analyzer.TokenStream(fieldName, new System.IO.StringReader(text)); return GetBestFragment(tokenStream, text); }
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use to to query your <see cref="IndexSearcher"/> for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p/> /// /// So, if you have a code fragment like this: /// <br/> /// <code> /// Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p/> /// /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>. /// /// <p/> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucenes scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. /// /// <P/> /// This method is fail-safe in that if a long 'body' is passed in and /// <see cref="BooleanQuery.Add"/> (used internally) /// throws /// <see cref="BooleanQuery.TooManyClauses"/>, the /// query as it is will be returned. /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>(); BooleanQuery tmp = new BooleanQuery(); ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.GetSet<string>(); // ignore dups while (ts.IncrementToken()) { String word = termAtt.Term; // ignore opt stop words if (stop != null && stop.Contains(word)) continue; // ignore dups if (already.Contains(word)) continue; already.Add(word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, Occur.SHOULD); } catch (BooleanQuery.TooManyClauses) { // fail-safe, just return what we have, not the end of the world break; } } return tmp; }
// Tokenizes the fields of a document into Postings.
private void InvertDocument(Document doc)
{
    foreach (Field field in doc.Fields())
    {
        System.String fieldName = field.Name();
        int fieldNumber = fieldInfos.FieldNumber(fieldName);

        int length = fieldLengths[fieldNumber];     // length of field
        int position = fieldPositions[fieldNumber]; // position in field
        if (length > 0)
        {
            position += analyzer.GetPositionIncrementGap(fieldName);
        }
        int offset = fieldOffsets[fieldNumber];     // offset in field

        if (field.IsIndexed())
        {
            if (!field.IsTokenized())
            {
                // un-tokenized field
                System.String stringValue = field.StringValue();
                if (field.IsStoreOffsetWithTermVector())
                {
                    AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                }
                else
                {
                    AddPosition(fieldName, stringValue, position++, null);
                }
                offset += stringValue.Length;
                length++;
            }
            else
            {
                System.IO.TextReader reader; // find or make Reader
                if (field.ReaderValue() != null)
                {
                    reader = field.ReaderValue();
                }
                else if (field.StringValue() != null)
                {
                    reader = new System.IO.StringReader(field.StringValue());
                }
                else
                {
                    throw new System.ArgumentException("field must have either String or Reader value");
                }

                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.TokenStream(fieldName, reader);
                try
                {
                    Token lastToken = null;
                    for (Token t = stream.Next(); t != null; t = stream.Next())
                    {
                        position += (t.GetPositionIncrement() - 1);

                        if (field.IsStoreOffsetWithTermVector())
                        {
                            AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                        }
                        else
                        {
                            AddPosition(fieldName, t.TermText(), position++, null);
                        }

                        lastToken = t;
                        if (++length > maxFieldLength)
                        {
                            if (infoStream != null)
                            {
                                infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                            }
                            break;
                        }
                    }

                    if (lastToken != null)
                    {
                        offset += lastToken.EndOffset() + 1;
                    }
                }
                finally
                {
                    stream.Close();
                }
            }

            fieldLengths[fieldNumber] = length;     // save field length
            fieldPositions[fieldNumber] = position; // save field position
            fieldBoosts[fieldNumber] *= field.GetBoost();
            fieldOffsets[fieldNumber] = offset;
        }
    }
}
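One detail worth isolating from the inverter above is the GetPositionIncrementGap hook: it runs only when the same field appears more than once in a document (length > 0), padding positions so phrase matches cannot span value boundaries. A hedged sketch of an analyzer that uses it, with LowerCaseTokenizer as a stand-in tokenizer:

using System.IO;
using Lucene.Net.Analysis;

class GapAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new LowerCaseTokenizer(reader);
    }

    // Called by the inverter between repeated values of the same field;
    // the next value's first token starts 100 positions later.
    public override int GetPositionIncrementGap(string fieldName)
    {
        return 100;
    }
}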
public SearchModel Search(string searchText)
{
    var result = new SearchModel();
    if (string.IsNullOrEmpty(searchText))
    {
        result.Message = "Įveskite paieškos užklausą."; // "Enter a search query."
        return result;
    }

    var stemmedSearchText = new LithuanianStemmer().Stem(searchText.Trim());
    if (string.IsNullOrEmpty(stemmedSearchText))
    {
        result.Message = "Įveskite paieškos užklausą."; // "Enter a search query."
        return result;
    }

    Lucene.Net.Search.Hits hits = null;
    try
    {
        // Turn a trailing word into a prefix query so partial words still match.
        if (char.IsLetter(stemmedSearchText[stemmedSearchText.Length - 1]))
        {
            stemmedSearchText += "*";
        }
        query = parser.Parse(stemmedSearchText);
        if (searcher == null)
        {
            searcher = new Lucene.Net.Search.IndexSearcher(CustomAppSettings.SearchIndexFolder);
        }
        hits = searcher.Search(query);
    }
    catch (Exception e)
    {
        // "Search failed. Please correct the query. Error message: ..."
        result.Message = "Paieška nepavyko. Pataisykite užklausą. Klaidos pranešimas: " + e.Message;
        return result;
    }

    Lucene.Net.Highlight.Formatter formatter = new Lucene.Net.Highlight.SimpleHTMLFormatter(
        "<span class=\"highlightResult\">", "</span>");
    var fragmenter = new Lucene.Net.Highlight.SimpleFragmenter(100);
    var scorer = new Lucene.Net.Highlight.QueryScorer(searcher.Rewrite(query));
    var highlighter = new Lucene.Net.Highlight.Highlighter(formatter, scorer);
    highlighter.SetTextFragmenter(fragmenter);

    Dictionary<string, int> dict_already_seen_ids = new Dictionary<string, int>();
    var list = new List<SearchIndexModel>();

    // insert the search results into a temp table which we will join with what's in the database
    for (int i = 0; i < hits.Length(); i++)
    {
        if (dict_already_seen_ids.Count < 100)
        {
            Lucene.Net.Documents.Document doc = hits.Doc(i);
            string id = doc.Get("id");
            if (!dict_already_seen_ids.ContainsKey(id))
            {
                dict_already_seen_ids[id] = 1;
                var model = new SearchIndexModel();
                model.Id = id;
                model.Score = hits.Score(i);
                model.Subject = doc.Get("subject");
                model.Type = (EntryTypes)Enum.Parse(typeof(EntryTypes), doc.Get("type"));

                string raw_text = HttpUtility.HtmlEncode(doc.Get("raw_text"));
                Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("text", new System.IO.StringReader(raw_text));
                string highlighted_text = highlighter.GetBestFragments(stream, raw_text, 3, "...").Replace("'", "''");
                if (highlighted_text == "") // sometimes the highlighter fails to emit text...
                {
                    highlighted_text = raw_text.Replace("'", "''");
                }
                if (highlighted_text.Length > 3000)
                {
                    highlighted_text = highlighted_text.Substring(0, 3000);
                }
                model.HighlightedText = highlighted_text;
                list.Add(model);
            }
        }
        else
        {
            break;
        }
    }

    result.List = list;
    result.SearchPhrase = searchText;
    if (list.Count == 0)
    {
        // "No entries found for query '{0}'. Please refine your search."
        result.Message = string.Format("Įrašų pagal užklausą '{0}' nerasta. Patikslinkite paieškos duomenis.", searchText);
    }
    return result;
}