public TokenStream(string fieldName, System.IO.TextReader reader)

fieldName | string | the name of the field the created TokenStream is used for |
reader | System.IO.TextReader | the reader supplying the text to be tokenized |
Returns | TokenStream | a token stream over the analyzed contents of the reader |
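Most of the examples below target the Lucene.NET 3.x API. As a quick orientation, a minimal sketch of calling TokenStream directly might look like the following; the StandardAnalyzer, the "contents" field name, and the sample sentence are illustrative choices only, not part of the documentation above:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;
using Version = Lucene.Net.Util.Version;

class TokenStreamDemo
{
    static void Main()
    {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        // The field name only matters for per-field analyzers; any TextReader over the text will do.
        TokenStream ts = analyzer.TokenStream("contents", new StringReader("The quick brown fox"));
        ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
        while (ts.IncrementToken())
        {
            Console.WriteLine(termAtt.Term); // prints quick, brown, fox ("the" is removed as a stop word)
        }
        ts.End();
        ts.Close();
    }
}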
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p/> /// /// So, if you have a code fragment like this: /// <br/> /// <code> /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p/> /// /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>. /// /// <p/> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words. /// /// <p/> /// This method is fail-safe in that if a long 'body' is passed in and /// <see cref="BooleanQuery.Add"/> (used internally) /// throws /// <see cref="BooleanQuery.TooManyClauses"/>, the /// query as it is will be returned. /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>(); BooleanQuery tmp = new BooleanQuery(); ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups while (ts.IncrementToken()) { String word = termAtt.Term; // ignore opt stop words if (stop != null && stop.Contains(word)) continue; // ignore dups if (already.Contains(word)) continue; already.Add(word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, Occur.SHOULD); } catch (BooleanQuery.TooManyClauses) { // fail-safe, just return what we have, not the end of the world break; } } return tmp; }
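As the summary notes, the first hit for the query built above is normally the source document itself, so callers typically skip it. A hedged usage sketch, assuming the method is called from its declaring class and the same Lucene.NET 3.x API; the indexSearcher variable, the sample body, and the "contents"/"title" field names are placeholders:

string sourceBody = "I use Lucene to search fast. Fast searchers are good";
Query similar = FormSimilarQuery(sourceBody, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), "contents", null);
TopDocs hits = indexSearcher.Search(similar, 10);
// Skip hit 0: it should be the source document itself.
for (int i = 1; i < hits.ScoreDocs.Length; i++)
{
    Document doc = indexSearcher.Doc(hits.ScoreDocs[i].Doc);
    Console.WriteLine(doc.Get("title") + " (score " + hits.ScoreDocs[i].Score + ")");
}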
public List<string> cutWord(string word, Lucene.Net.Analysis.Analyzer analysis) { List<string> result = new List<string>(); TokenStream tokenStream = analysis.TokenStream("field1", new StringReader(word)); ITermAttribute attributes = tokenStream.GetAttribute<ITermAttribute>(); while (tokenStream.IncrementToken()) { result.Add(attributes.Term); } tokenStream.End(); return result; }
public static void Highlight(Document d, string query, Analyzer analyzer) { string contents = d.Get("contents"); SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\"><b>", "</b></span>"); //SpanGradientFormatter formatter = new SpanGradientFormatter(10.0f, null, null, "#F1FD9F", "#EFF413"); //SimpleHTMLEncoder encoder = new SimpleHTMLEncoder(); SimpleFragmenter fragmenter = new SimpleFragmenter(250); Highlighter hiliter = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer))); hiliter.SetTextFragmenter(fragmenter); int numfragments = contents.Length / fragmenter.GetFragmentSize() + 1;// +1 ensures its never zero. More than the required number of fragments dont harm. StringBuilder result = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><head><title>Search Results - "); result.Append(d.Get("filename")); result.Append("</title></head><body><font face=Arial size=5>"); TokenStream tokenstream = analyzer.TokenStream("contents", new System.IO.StringReader(contents)); TextFragment[] frags = hiliter.GetBestTextFragments(tokenstream, contents, false, numfragments); foreach (TextFragment frag in frags) { if (frag.GetScore() > 0) { result.Append(frag.ToString() + "<br/><hr/><br/>"); } } string contentspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "contents.html"); result.Append("</font><a target=_self href=\"file:///"); result.Append(contentspath); result.Append("\">View Original Document...</a>"); result.Append("</body></html>"); result.Replace("\n", "<br/>"); string resultspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "results.html"); System.IO.File.WriteAllText(resultspath, result.ToString()); //webBrowser1.Url = new Uri("file:///" + resultspath); Highlighter hiliter2 = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer))); hiliter2.SetTextFragmenter(fragmenter); TokenStream tokstr = analyzer.TokenStream(new System.IO.StringReader(contents)); StringBuilder htmlcontents = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><body><font face=Arial size=5>"); htmlcontents.Append(hiliter2.GetBestFragments(tokstr, contents, numfragments, "...")); htmlcontents.Append("</font></body></html>"); htmlcontents.Replace("\n", "<br/>"); System.IO.File.WriteAllText(contentspath, htmlcontents.ToString()); }
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) { Analyzer analyzer = (Analyzer)analyzerMap[fieldName]; if (analyzer == null) { analyzer = defaultAnalyzer; } return(analyzer.TokenStream(fieldName, reader)); }
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output) { TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input)); for (int i = 0; i < output.Length; i++) { Token t = ts.Next(); Assert.IsNotNull(t); Assert.AreEqual(t.TermText(), output[i]); } Assert.IsNull(ts.Next()); ts.Close(); }
protected virtual void AssertAnalyzesTo(Analyzer analyzer, String input, String[] output) { var tokenStream = analyzer.TokenStream("dummyFieldName", new StringReader(input)); for( var i = 0; i < output.Length; i++ ) { var t = tokenStream.Next(); Assert.IsNotNull(t); Assert.AreEqual(output[i], t.TermText()); } Assert.IsNull(tokenStream.Next()); tokenStream.Close(); }
public List <string> TokenizeString(string untokenized) { System.IO.StringReader stringReader = new System.IO.StringReader(untokenized); TokenStream tokenStream = analyzer.TokenStream("text", stringReader); List <string> tokenized = new List <string>(); ITermAttribute termAtt = tokenStream.GetAttribute <ITermAttribute>(); while (tokenStream.IncrementToken()) { tokenized.Add(termAtt.Term); } return(tokenized); }
private static List<string> GetTokens(string keywords, Analyzer analyser) { var tokenStream = analyser.TokenStream(null, new StringReader(keywords)); var termAttribute = tokenStream.GetAttribute<ITermAttribute>(); tokenStream.Reset(); var list = new List<string>(); while (tokenStream.IncrementToken()) { var term = termAttribute.Term; list.Add(term); } return list; }
public virtual void AssertThreadSafe(Analyzer analyzer) { int numTestPoints = 100; int numThreads = TestUtil.NextInt(Random(), 3, 5); Dictionary <string, BytesRef> map = new Dictionary <string, BytesRef>(); // create a map<String,SortKey> up front. // then with multiple threads, generate sort keys for all the keys in the map // and ensure they are the same as the ones we produced in serial fashion. for (int i = 0; i < numTestPoints; i++) { string term = TestUtil.RandomSimpleString(Random()); IOException priorException = null; TokenStream ts = analyzer.TokenStream("fake", new StringReader(term)); try { ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); BytesRef bytes = termAtt.BytesRef; ts.Reset(); Assert.IsTrue(ts.IncrementToken()); termAtt.FillBytesRef(); // ensure we make a copy of the actual bytes too map[term] = BytesRef.DeepCopyOf(bytes); Assert.IsFalse(ts.IncrementToken()); ts.End(); } catch (IOException e) { priorException = e; } finally { IOUtils.CloseWhileHandlingException(priorException, ts); } } ThreadClass[] threads = new ThreadClass[numThreads]; for (int i = 0; i < numThreads; i++) { threads[i] = new ThreadAnonymousInnerClassHelper(this, analyzer, map); } for (int i = 0; i < numThreads; i++) { threads[i].Start(); } for (int i = 0; i < numThreads; i++) { threads[i].Join(); } }
// This is a simplified query builder which works for single Terms and single Phrases // Returns null, TermQuery, or PhraseQuery public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText) { TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText)); TokenFilter filter = new CachingTokenFilter(stream); filter.Reset(); // This attribute way of getting token properties isn't very good, but it's the non-obsolete one. var attr1 = filter.GetAttribute<ITermAttribute>(); Func<string> getText = () => attr1 != null ? attr1.Term : null; Func<int> getPositionIncrement; if (filter.HasAttribute<IPositionIncrementAttribute>()) { var attr = filter.GetAttribute<IPositionIncrementAttribute>(); getPositionIncrement = () => attr.PositionIncrement; } else { getPositionIncrement = () => 1; } // 0 tokens if (!filter.IncrementToken()) { return new BooleanQuery(); } // 1 token? string token1 = getText(); int position = 0; if (!filter.IncrementToken()) { return new TermQuery(new Term(field, token1)); } // many tokens - handle first token PhraseQuery ret = new PhraseQuery(); ret.Add(new Term(field, token1)); do { // handle rest of tokens string tokenNext = getText(); position += getPositionIncrement(); ret.Add(new Term(field, tokenNext), position); } while (filter.IncrementToken()); return ret; }
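To make the three outcomes of GetFieldQuery concrete, here is a small hypothetical call sequence, assuming it is invoked from its declaring class; the StandardAnalyzer and the "body" field are placeholder choices. Note that input which analyzes to zero tokens yields an empty BooleanQuery in the code above, rather than the null its comment mentions:

Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
Query single = GetFieldQuery(analyzer, "body", "Lucene");        // one token   -> TermQuery   body:lucene
Query phrase = GetFieldQuery(analyzer, "body", "Lucene search"); // many tokens -> PhraseQuery body:"lucene search"
Query empty  = GetFieldQuery(analyzer, "body", "the");           // zero tokens -> empty BooleanQuery (stop word analyzed away)
Console.WriteLine(single + " | " + phrase + " | " + empty);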
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text) { TokenStream stream = analyzer.TokenStream("contents", new StringReader(text)); List<string> result = new List<string>(); TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute)); while (stream.IncrementToken()) { result.Add(tokenAttr.Term()); } stream.End(); stream.Close(); return result; }
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text) { using (TokenStream stream = analyzer.TokenStream("contents", new StringReader(text))) { var result = new List<string>(); var tokenAttr = (TermAttribute) stream.GetAttribute<ITermAttribute>(); while (stream.IncrementToken()) { result.Add(tokenAttr.Term); } stream.End(); return result; } }
public void PositionOfWords() { Lucene.Net.Analysis.Analyzer analyzer = AnalyzerList[0].LuceneAnalyzer as Analyzer; int termCounter = 0; if (analyzer != null) { AnalyzerView view = AnalyzerViews[0] as AnalyzerView; StringReader stringReader = new StringReader(sb.ToString()); TokenStream tokenStream = analyzer.TokenStream("defaultFieldName", stringReader); String strValue = view.GetView(tokenStream, out termCounter).Trim(); Console.WriteLine("PositionOfWords Details : " + strValue); } }
public void WordCountFrequency() { Lucene.Net.Analysis.Analyzer analyzer = AnalyzerList[0].LuceneAnalyzer as Analyzer; int termCounter = 0; if (analyzer != null) { AnalyzerView view = AnalyzerViews[1] as AnalyzerView; StringReader stringReader = new StringReader(sb.ToString()); TokenStream tokenStream = analyzer.TokenStream("defaultFieldName", stringReader); String strValue = view.GetView(tokenStream, out termCounter).Trim(); Console.WriteLine("WordCountFrequency Details : " + strValue); } Console.WriteLine(string.Format("Total of {0} Term(s) Found.", termCounter)); }
public static void DisplayTokenWithPositions(Analyzer analyzer, string text) { var stream = analyzer.TokenStream("contents", new StringReader(text)); var termAttribute = stream.AddAttribute(typeof (TermAttribute)) as TermAttribute; var positionIncrement = stream.AddAttribute(typeof (PositionIncrementAttribute)) as PositionIncrementAttribute; int position = 0; while (stream.IncrementToken()) { int increment = positionIncrement.GetPositionIncrement(); if(increment>0) { position = position + increment; Console.WriteLine(); Console.WriteLine("{0}: ", position); } Console.WriteLine("[{0}]", termAttribute.Term()); } Console.WriteLine(); }
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] expectedImages, System.String[] expectedTypes, int[] expectedPosIncrs) { TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input)); for (int i = 0; i < expectedImages.Length; i++) { Token t = ts.Next(); Assert.IsNotNull(t); Assert.AreEqual(expectedImages[i], t.TermText()); if (expectedTypes != null) { Assert.AreEqual(expectedTypes[i], t.Type()); } if (expectedPosIncrs != null) { Assert.AreEqual(expectedPosIncrs[i], t.GetPositionIncrement()); } } Assert.IsNull(ts.Next()); ts.Close(); }
/// <summary> /// Perform synonym expansion on a query. /// </summary> /// <param name="query">query</param> /// <param name="syns">syns</param> /// <param name="a">a</param> /// <param name="field">field</param> /// <param name="boost">boost</param> public static Query Expand(String query, Searcher syns, Analyzer a, String field, float boost) { already = new List<String>(); // avoid dups var top = new List<String>(); // needs to be separately listed.. var ts = a.TokenStream(field, new StringReader(query)); var termAtt = ts.AddAttribute<TermAttribute>(); while (ts.IncrementToken()) { var word = termAtt.Term; if (!already.Contains(word)) { already.Add(word); top.Add(word); } } tmp = new BooleanQuery(); // [2] form query System.Collections.IEnumerator it = top.GetEnumerator(); while (it.MoveNext()) { // [2a] add to level words in var word = (String)it.Current; var tq = new TermQuery(new Term(field, word)); tmp.Add(tq, Occur.SHOULD); var c = new CollectorImpl(field, boost); syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c); } return tmp; }
public override void Run() { try { foreach (KeyValuePair <string, BytesRef> mapping in Map) { string term = mapping.Key; BytesRef expected = mapping.Value; IOException priorException = null; TokenStream ts = Analyzer.TokenStream("fake", new StringReader(term)); try { ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>(); BytesRef bytes = termAtt.BytesRef; ts.Reset(); Assert.IsTrue(ts.IncrementToken()); termAtt.FillBytesRef(); Assert.AreEqual(expected, bytes); Assert.IsFalse(ts.IncrementToken()); ts.End(); } catch (IOException e) { priorException = e; } finally { IOUtils.CloseWhileHandlingException(priorException, ts); } } } catch (IOException e) { throw (Exception)e; } }
public static List<string> SplitKeyWords(string keywords, Analyzer analyzer) { System.IO.StreamReader reader = new System.IO.StreamReader(PanGu.Framework.Stream.WriteStringToStream(keywords, Encoding.UTF8), Encoding.UTF8); TokenStream tokenStream = analyzer.TokenStream("", reader); global::Lucene.Net.Analysis.Token token = tokenStream.Next(); List<string> result = new List<string>(); while (token != null) { result.Add(keywords.Substring(token.StartOffset(), token.EndOffset() - token.StartOffset())); token = tokenStream.Next(); } return result; }
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output) { TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input)); for (int i = 0; i < output.Length; i++) { Token t = ts.Next(); System.Diagnostics.Trace.Assert(t != null); //// assertNotNull(t); System.Diagnostics.Trace.Assert(output[i] == t.TermText()); //// assertEquals(output[i], t.TermText()); } System.Diagnostics.Trace.Assert(ts.Next() == null); //// assertNull(ts.Next()); ts.Close(); }
public void DoStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, IFormatter formatter, bool expandMT) { IFragmenter frag = new SimpleFragmenter(20); for (int i = 0; i < hits.TotalHits; i++) { String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; IScorer scorer = null; TokenStream tokenStream = analyzer.TokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); if (Mode == QUERY) { scorer = new QueryScorer(query); } else if (Mode == QUERY_TERM) { scorer = new QueryTermScorer(query); } var highlighter = new Highlighter(formatter, scorer) {TextFragmenter = frag}; String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); Console.WriteLine("\t" + result); } }
public static void AssertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements) { AssertTokenStreamContents(a.TokenStream("dummy", new System.IO.StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.Length); }
private static Query ExecuteAnalyzer(Analyzer analyzer, string field, string text) { TokenStream tokenStream = analyzer.TokenStream(field, new StringReader(text)); ITermAttribute termAttribute = tokenStream.AddAttribute<ITermAttribute>(); IPositionIncrementAttribute positionIncrementAttribute = tokenStream.AddAttribute<IPositionIncrementAttribute>(); List<List<Term>> terms = new List<List<Term>>(); List<Term> current = null; while (tokenStream.IncrementToken()) { if (positionIncrementAttribute.PositionIncrement > 0) { current = new List<Term>(); terms.Add(current); } if (current != null) { current.Add(new Term(field, termAttribute.Term)); } } if (terms.Count == 1 && terms[0].Count == 1) { return new TermQuery(terms[0][0]); } else if (terms.Select(l => l.Count).Sum() == terms.Count) { PhraseQuery phraseQuery = new PhraseQuery(); foreach (var positionList in terms) { phraseQuery.Add(positionList[0]); } return phraseQuery; } else { MultiPhraseQuery multiPhraseQuery = new MultiPhraseQuery(); foreach (var positionList in terms) { multiPhraseQuery.Add(positionList.ToArray()); } return multiPhraseQuery; } }
public static string ToTokenStreamString(this string text, Analyzer analyzer) { var tokens = new StringBuilder(); var reader = new StringReader(text); var tokenStream = analyzer.TokenStream("x", reader); while(true) { var term = tokenStream.Next(); if(term == null) { break; } tokens.Append(term.Term()); tokens.Append(" "); } return tokens.ToString(); }
//convenience method public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer) { return analyzer.TokenStream(field, new StringReader(contents)); }
private static List<string> AnalyzeText(Analyzer analyzer, string field, string text) { if (String.IsNullOrEmpty(text)) { return new List<string>(); } var result = new List<string>(); using (var sr = new System.IO.StringReader(text)) { using (TokenStream stream = analyzer.TokenStream(field, sr)) { while (stream.IncrementToken()) { var termAttribute = stream.GetAttribute<ITermAttribute>(); if(termAttribute != null) { result.Add(termAttribute.Term); } } } } return result; }
/// <summary> /// Searches the index for the querytext and displays a ranked list of results to the screen /// </summary> /// <param name="querytext">The text to search the index</param> private string SearchAndDisplayResults(string querytext, long qid, List <long> relevantList) { System.Console.WriteLine("Searching for " + querytext); querytext = querytext.ToLower(); Query query = parser.Parse(querytext); System.Console.WriteLine($"Searching for { query.ToString()}"); TopDocs results = searcher.Search(query, MAX_QUERY); // create highlighter - using strong tag to highlight in this case (change as needed) //IFormatter formatter = new SimpleHTMLFormatter("<strong>", "</strong>"); IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold;background-color:yellow;\">", "</span>"); // excerpt set to 200 characters in length var fragmenter = new SimpleFragmenter(3000); var scorer = new QueryScorer(query); var highlighter = new Highlighter(formatter, scorer) { TextFragmenter = fragmenter }; long rank = 0; float topscore = 0f; long foundrelevants = 0; List <TrecItem> logItems = new List <TrecItem>(); SearchedListViewModel.DeleteAll(); foreach (ScoreDoc scoreDoc in results.ScoreDocs) { if (rank == 0) { topscore = scoreDoc.Score; } rank++; Lucene.Net.Documents.Document doc = searcher.Doc(scoreDoc.Doc); long id = Convert.ToInt64(doc.Get(PID_FN).ToString()); CollectionPassage ps = collectionProvider.Passages[id]; // Logging Trec logItems.Add(new TrecItem(0, id, rank, scoreDoc.Score)); // get highlighted fragment TokenStream stream = analyzer.TokenStream("", new StringReader(ps.passage_text)); string highlighted = highlighter.GetBestFragment(stream, ps.passage_text); //string url2 = doc.Get(TEXT_FN).ToString(); //Console.WriteLine("Rank " + rank + " text " + myFieldValue); if (highlighted == null) { highlighted = ps.passage_text; } if (relevantList.Contains(id)) { foundrelevants++; } SearchedListViewModel.Add(scoreDoc.Score / topscore, id, ps.GetTitle(), ps.url, highlighted, relevantList.Contains(id)); //Console.WriteLine("==>" + highlighted); } StatusBarViewModel.Instance.NumRelevants = "Num Relevants : " + foundrelevants.ToString() + "/" + relevantList.Count.ToString(); StatusBarViewModel.Instance.NumSearch = "Num Searched :" + results.ScoreDocs.Length.ToString(); // Logging Trec trecLogger.Logging(qid, logItems); //Console.WriteLine(string.Join(",", relevantList)); return(query.ToString()); }
private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field) { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text); } ICharTermAttribute termAtt; IOffsetAttribute offsetAtt; IPositionIncrementAttribute posIncAtt; IPositionLengthAttribute posLengthAtt; ITypeAttribute typeAtt; IList<string> tokens = new List<string>(); IList<string> types = new List<string>(); IList<int> positions = new List<int>(); IList<int> positionLengths = new List<int>(); IList<int> startOffsets = new List<int>(); IList<int> endOffsets = new List<int>(); int remainder = random.Next(10); StringReader reader = new StringReader(text); TokenStream ts; using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader)) { termAtt = ts.HasAttribute<ICharTermAttribute>() ? ts.GetAttribute<ICharTermAttribute>() : null; offsetAtt = ts.HasAttribute<IOffsetAttribute>() ? ts.GetAttribute<IOffsetAttribute>() : null; posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>() ? ts.GetAttribute<IPositionIncrementAttribute>() : null; posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>() ? ts.GetAttribute<IPositionLengthAttribute>() : null; typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null; ts.Reset(); // First pass: save away "correct" tokens while (ts.IncrementToken()) { Assert.IsNotNull(termAtt, "has no CharTermAttribute"); tokens.Add(termAtt.ToString()); if (typeAtt != null) { types.Add(typeAtt.Type); } if (posIncAtt != null) { positions.Add(posIncAtt.PositionIncrement); } if (posLengthAtt != null) { positionLengths.Add(posLengthAtt.PositionLength); } if (offsetAtt != null) { startOffsets.Add(offsetAtt.StartOffset()); endOffsets.Add(offsetAtt.EndOffset()); } } ts.End(); } // verify reusing is "reproducable" and also get the normal tokenstream sanity checks if (tokens.Count > 0) { // KWTokenizer (for example) can produce a token // even when input is length 0: if (text.Length != 0) { // (Optional) second pass: do something evil: int evilness = random.Next(50); if (evilness == 17) { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception"); } // Throw an errant exception from the Reader: MockReaderWrapper evilReader = new MockReaderWrapper(random, text); evilReader.ThrowExcAfterChar(random.Next(text.Length)); reader = evilReader; try { // NOTE: some Tokenizers go and read characters // when you call .setReader(Reader), eg // PatternTokenizer. this is a bit // iffy... (really, they should only // pull from the Reader when you call // .incremenToken(), I think?), but we // currently allow it, so, we must call // a.TokenStream inside the try since we may // hit the exc on init: ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(evilReader, remainder) : evilReader); ts.Reset(); while (ts.IncrementToken()) ; Assert.Fail("did not hit exception"); } catch (Exception re) { Assert.IsTrue(MockReaderWrapper.IsMyEvilException(re)); } try { ts.End(); } catch (InvalidOperationException ae) { // Catch & ignore MockTokenizer's // anger... 
if ("End() called before IncrementToken() returned false!".Equals(ae.Message)) { // OK } else { throw ae; } } finally { ts.Dispose(); } } else if (evilness == 7) { // Only consume a subset of the tokens: int numTokensToRead = random.Next(tokens.Count); if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.Count + " tokens"); } reader = new StringReader(text); ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader); ts.Reset(); for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++) { Assert.IsTrue(ts.IncrementToken()); } try { ts.End(); } catch (InvalidOperationException ae) { // Catch & ignore MockTokenizer's // anger... if ("End() called before IncrementToken() returned false!".Equals(ae.Message)) { // OK } else { throw ae; } } finally { ts.Dispose(); } } } } // Final pass: verify clean tokenization matches // results from first pass: if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis; " + tokens.Count + " tokens"); } reader = new StringReader(text); long seed = random.Next(); random = new Random((int)seed); if (random.Next(30) == 7) { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: using spoon-feed reader"); } reader = new MockReaderWrapper(random, text); } ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader); if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) { // offset + pos + posLength + type AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect); } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) { // offset + pos + type AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), null, text.Length, offsetsAreCorrect); } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) { // offset + pos + posLength AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect); } else if (posIncAtt != null && offsetAtt != null) { // offset + pos AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), null, text.Length, offsetsAreCorrect); } else if (offsetAtt != null) { // offset AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, null, null, text.Length, offsetsAreCorrect); } else { // terms only AssertTokenStreamContents(ts, tokens.ToArray()); } if (field != null) { reader = new StringReader(text); random = new Random((int)seed); if (random.Next(30) == 7) { if (VERBOSE) { Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: indexing using spoon-feed reader"); } reader = new MockReaderWrapper(random, text); } field.ReaderValue = useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader; } }
public TokenStream GetTokenStream(Analyzer analyzer) { if (!((FieldType)FieldType).Indexed) { return null; } FieldType.NumericType? numericType = ((FieldType)FieldType).NumericTypeValue; if (numericType != null) { if (!(InternalTokenStream is NumericTokenStream)) { // lazy init the TokenStream as it is heavy to instantiate // (attributes,...) if not needed (stored field loading) InternalTokenStream = new NumericTokenStream(Type.NumericPrecisionStep); } var nts = (NumericTokenStream)InternalTokenStream; // initialize value in TokenStream object val = FieldsData; switch (numericType) { case Documents.FieldType.NumericType.INT: nts.SetIntValue(Convert.ToInt32(val)); break; case Documents.FieldType.NumericType.LONG: nts.SetLongValue(Convert.ToInt64(val)); break; case Documents.FieldType.NumericType.FLOAT: nts.SetFloatValue(Convert.ToSingle(val)); break; case Documents.FieldType.NumericType.DOUBLE: nts.SetDoubleValue(Convert.ToDouble(val)); break; default: throw new Exception("Should never get here"); } return InternalTokenStream; } if (!((FieldType)FieldType).Tokenized) { if (StringValue == null) { throw new System.ArgumentException("Non-Tokenized Fields must have a String value"); } if (!(InternalTokenStream is StringTokenStream)) { // lazy init the TokenStream as it is heavy to instantiate // (attributes,...) if not needed (stored field loading) InternalTokenStream = new StringTokenStream(); } ((StringTokenStream)InternalTokenStream).Value = StringValue; return InternalTokenStream; } if (TokenStream_Renamed != null) { return TokenStream_Renamed; } else if (ReaderValue != null) { return analyzer.TokenStream(Name, ReaderValue); } else if (StringValue != null) { TextReader sr = new StringReader(StringValue); return analyzer.TokenStream(Name, sr); } throw new System.ArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this); }
internal static void CheckResetException(Analyzer a, string input) { TokenStream ts = a.TokenStream("bogus", new StringReader(input)); try { if (ts.IncrementToken()) { ts.ReflectAsString(false); Assert.Fail("didn't get expected exception when reset() not called"); } } catch (InvalidOperationException expected) { //ok } catch (AssertionException expected) { // ok: MockTokenizer Assert.IsTrue(expected.Message != null && expected.Message.Contains("wrong state"), expected.Message); } catch (Exception unexpected) { //unexpected.printStackTrace(System.err); Console.Error.WriteLine(unexpected.StackTrace); Assert.Fail("got wrong exception when reset() not called: " + unexpected); } finally { // consume correctly ts.Reset(); while (ts.IncrementToken()) { } ts.End(); ts.Dispose(); } // check for a missing Close() ts = a.TokenStream("bogus", new StringReader(input)); ts.Reset(); while (ts.IncrementToken()) { } ts.End(); try { ts = a.TokenStream("bogus", new StringReader(input)); Assert.Fail("didn't get expected exception when Close() not called"); } catch (Exception) { // ok } finally { ts.Dispose(); } }
public static void AssertAnalyzesTo(Analyzer a, string input, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, bool offsetsAreCorrect) { CheckResetException(a, input); AssertTokenStreamContents(a.TokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.Length, offsetsAreCorrect); }
// Test using various international locales with accented characters (which // sort differently depending on locale) // // Copied (and slightly modified) from // Lucene.Net.Search.TestSort.testInternationalSort() // // TODO: this test is really fragile. there are already 3 different cases, // depending upon unicode version. public virtual void TestCollationKeySort(Analyzer usAnalyzer, Analyzer franceAnalyzer, Analyzer swedenAnalyzer, Analyzer denmarkAnalyzer, string usResult, string frResult, string svResult, string dkResult) { Directory indexStore = NewDirectory(); IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false))); // document data: // the tracer field is used to determine which document was hit string[][] sortData = new string[][] { new string[] { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" }, new string[] { "B", "y", "HAT", "HAT", "HAT", "HAT" }, new string[] { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" }, new string[] { "D", "y", "HUT", "HUT", "HUT", "HUT" }, new string[] { "E", "x", "peach", "peach", "peach", "peach" }, new string[] { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" }, new string[] { "G", "x", "sin", "sin", "sin", "sin" }, new string[] { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" }, new string[] { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" }, new string[] { "J", "y", "HOT", "HOT", "HOT", "HOT" } }; FieldType customType = new FieldType(); customType.Stored = true; for (int i = 0; i < sortData.Length; ++i) { Document doc = new Document(); doc.Add(new Field("tracer", sortData[i][0], customType)); doc.Add(new TextField("contents", sortData[i][1], Field.Store.NO)); if (sortData[i][2] != null) { doc.Add(new TextField("US", usAnalyzer.TokenStream("US", new StringReader(sortData[i][2])))); } if (sortData[i][3] != null) { doc.Add(new TextField("France", franceAnalyzer.TokenStream("France", new StringReader(sortData[i][3])))); } if (sortData[i][4] != null) { doc.Add(new TextField("Sweden", swedenAnalyzer.TokenStream("Sweden", new StringReader(sortData[i][4])))); } if (sortData[i][5] != null) { doc.Add(new TextField("Denmark", denmarkAnalyzer.TokenStream("Denmark", new StringReader(sortData[i][5])))); } writer.AddDocument(doc); } writer.ForceMerge(1); writer.Dispose(); IndexReader reader = DirectoryReader.Open(indexStore); IndexSearcher searcher = new IndexSearcher(reader); Sort sort = new Sort(); Query queryX = new TermQuery(new Term("contents", "x")); Query queryY = new TermQuery(new Term("contents", "y")); sort.SetSort(new SortField("US", SortField.Type_e.STRING)); AssertMatches(searcher, queryY, sort, usResult); sort.SetSort(new SortField("France", SortField.Type_e.STRING)); AssertMatches(searcher, queryX, sort, frResult); sort.SetSort(new SortField("Sweden", SortField.Type_e.STRING)); AssertMatches(searcher, queryY, sort, svResult); sort.SetSort(new SortField("Denmark", SortField.Type_e.STRING)); AssertMatches(searcher, queryY, sort, dkResult); reader.Dispose(); indexStore.Dispose(); }
protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn) { if (analyzerIn == null) analyzerIn = Analyzer; TokenStream source = null; try { source = analyzerIn.TokenStream(field, part); source.Reset(); ITermToBytesRefAttribute termAtt = source.GetAttribute<ITermToBytesRefAttribute>(); BytesRef bytes = termAtt.BytesRef; if (!source.IncrementToken()) throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part); termAtt.FillBytesRef(); if (source.IncrementToken()) throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part); source.End(); return BytesRef.DeepCopyOf(bytes); } catch (IOException e) { throw new Exception("Error analyzing multiTerm term: " + part, e); } finally { IOUtils.CloseWhileHandlingException(source); } }
public static void DisplayTokens(Analyzer analyzer, string text) { DisplayTokens(analyzer.TokenStream("contents", new StringReader(text))); }
/// <summary> Perform synonym expansion on a query. /// </summary> /// <param name="query">the user query to expand /// </param> /// <param name="syns">searcher over the synonym index built by Syns2Index /// </param> /// <param name="a">analyzer used to tokenize the query /// </param> /// <param name="field">the field to generate clauses for /// </param> /// <param name="boost">boost applied to synonym clauses (values of 0 or less keep the default 1.0) /// </param> public static Query Expand(System.String query, Searcher syns, Analyzer a, System.String field, float boost) { System.Collections.Hashtable already = new System.Collections.Hashtable(); // avoid dups System.Collections.IList top = new System.Collections.ArrayList(); // needs to be separately listed.. // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.TokenStream(field, new System.IO.StringReader(query)); Lucene.Net.Analysis.Token t; while ((t = ts.Next()) != null) { System.String word = t.TermText(); if (already.Contains(word) == false) { already.Add(word, word); top.Add(word); } } BooleanQuery tmp = new BooleanQuery(); // [2] form query System.Collections.IEnumerator it = top.GetEnumerator(); while (it.MoveNext()) { // [2a] add top level words in System.String word = (System.String) it.Current; TermQuery tq = new TermQuery(new Term(field, word)); tmp.Add(tq, BooleanClause.Occur.SHOULD); // [2b] add in unique synonyms Hits hits = syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word))); for (int i = 0; i < hits.Length(); i++) { Document doc = hits.Doc(i); System.String[] values = doc.GetValues(Syns2Index.F_SYN); for (int j = 0; j < values.Length; j++) { System.String syn = values[j]; if (already.Contains(syn) == false) { already.Add(syn, syn); tq = new TermQuery(new Term(field, syn)); if (boost > 0) // else keep normal 1.0 tq.SetBoost(boost); tmp.Add(tq, BooleanClause.Occur.SHOULD); } } } } return tmp; }
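A hedged usage sketch of Expand, targeting the same pre-2.9 Hits-based API as the example above; the "wordnet-index" path, the analyzer choice, the 0.9f boost, and the contentSearcher variable are illustrative placeholders:

// Open the WordNet synonym index produced by Syns2Index and expand a user query.
Searcher synSearcher = new IndexSearcher("wordnet-index");
Query expanded = Expand("fast car", synSearcher, new StandardAnalyzer(), "contents", 0.9f);
Console.WriteLine(expanded); // e.g. contents:fast contents:car plus boosted synonym clauses such as contents:quick^0.9
// Run the expanded query against the content index.
Hits results = contentSearcher.Search(expanded);
Console.WriteLine(results.Length() + " hits");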
protected internal virtual string ToDot(Analyzer a, string inputText) { StringWriter sw = new StringWriter(); TokenStream ts = a.TokenStream("field", new StringReader(inputText)); ts.Reset(); (new TokenStreamToDot(inputText, ts, /*new StreamWriter(*/(TextWriter)sw/*)*/)).ToDot(); return sw.ToString(); }
/// <summary> /// Searches the index for the querytext /// </summary> /// <param name="querytext">The text to search the index</param> //public string SearchIndext(string querytext) public List <Dictionary <string, string> > SearchIndext(string querytext) { List <Dictionary <string, string> > resultListDict = new List <Dictionary <string, string> >(); // Initiate a result list Query query = DisplayQueries(querytext); Console.WriteLine("query is " + query); TopDocs results = searcher.Search(query, 100); System.Console.WriteLine("Number of results is " + results.TotalHits); // Setup the configuration of Highlighter IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold; background-color:yellow;\">", "</span>"); SimpleFragmenter fragmenter = new SimpleFragmenter(2000); QueryScorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); highlighter.TextFragmenter = fragmenter; int rank = 0; // ScoreDocs : a array stores pointers of a query // scoreDoc : a pointer of a query points to doc_ID and score (of the doc for the query) //string output = ""; if (results.TotalHits != 0) // Check if there are results { foreach (ScoreDoc scoreDoc in results.ScoreDocs) { rank++; Lucene.Net.Documents.Document doc = searcher.Doc(scoreDoc.Doc); string myFieldValue = doc.Get(TEXT_FN_PASS_TEXT); string myURL = doc.Get(TEXT_FN_URL); string passId = doc.Get(TEXT_FN_PASS_ID); string score = scoreDoc.Score.ToString(); string queryId = doc.Get(TEXT_FN_QUERY_ID); int jsonId = Int32.Parse(doc.Get(TEXT_FN_JSON_ARRAY_ID)); // passage_text field store as Field.Store.NO foreach (var itemP in jArr[jsonId][PASSAGES]) { if (itemP[TEXT_FN_PASS_ID].ToString() == passId) { myFieldValue = itemP[TEXT_FN_PASS_TEXT].ToString(); } } //Add the Highlighter tag into passage_text of query //TokenStream HLstream = analyzer.TokenStream("", new StringReader(doc.Get(TEXT_FN_PASS_TEXT))); //string HLmyFieldValue = highlighter.GetBestFragment(HLstream, doc.Get(TEXT_FN_PASS_TEXT)); TokenStream HLstream = analyzer.TokenStream("", new StringReader(myFieldValue)); string HLmyFieldValue = highlighter.GetBestFragment(HLstream, myFieldValue); Explanation e = searcher.Explain(query, scoreDoc.Doc); //Extract title from URL char delimiters = '/'; string[] urlSeg = myURL.Split(delimiters); string title; if (urlSeg[urlSeg.Length - 1].Length == 0) { title = urlSeg[urlSeg.Length - 2]; } else { title = urlSeg[urlSeg.Length - 1]; } resultListDict.Add(new Dictionary <string, string> { { "rank", rank.ToString() }, { "passId", passId }, { "score", score }, { "title", title }, { "url", myURL }, { "text", myFieldValue }, { "queryId", queryId }, { "highlighter", HLmyFieldValue } }); //Console.WriteLine("Rank " + rank + " text " + myFieldValue + " URL " + myURL); //Console.WriteLine(e); } } return(resultListDict); }
protected internal virtual void ToDotFile(Analyzer a, string inputText, string localFileName) { StreamWriter w = new StreamWriter(new FileStream(localFileName, FileMode.Create), IOUtils.CHARSET_UTF_8); TokenStream ts = a.TokenStream("field", new StringReader(inputText)); ts.Reset(); (new TokenStreamToDot(inputText, ts,/* new PrintWriter(*/w/*)*/)).ToDot(); w.Dispose(); }