TokenStream() public method

Returns a TokenStream suitable for fieldName, tokenizing the contents of reader.

This method uses #createComponents(String, Reader) to obtain an instance of TokenStreamComponents. It returns the sink of the components and stores the components internally. Subsequent calls to this method will reuse the previously stored components after resetting them through TokenStreamComponents#setReader(Reader).

NOTE: After calling this method, the consumer must follow the workflow described in TokenStream to properly consume its contents. See the Lucene.Net.Analysis namespace documentation for some examples demonstrating this.

Throws AlreadyClosedException if the Analyzer is closed, and IOException if an I/O error occurs (may rarely happen for strings).
public TokenStream ( string fieldName, TextReader reader ) : TokenStream
fieldName string the name of the field the created TokenStream is used for
reader System.IO.TextReader the reader providing the text to be tokenized
return TokenStream a TokenStream for iterating the analyzed content of reader
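For illustration, the sketch below shows the consume workflow described in the NOTE above. It is a minimal, assumption-laden example rather than part of the API reference: it targets the Lucene.Net 3.x attribute API (StandardAnalyzer, ITermAttribute), as used by several of the examples that follow, and the class name, method name, and field name "contents" are arbitrary placeholders.

using System.Collections.Generic;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;

public static class TokenStreamUsageSketch
{
    // Obtain the stream, register the attributes you need, Reset, loop over
    // IncrementToken, then End and Dispose (the using block handles disposal).
    public static List<string> Tokenize(string text)
    {
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        var terms = new List<string>();
        using (TokenStream ts = analyzer.TokenStream("contents", new StringReader(text)))
        {
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
            ts.Reset();                      // prepare the stream before consuming
            while (ts.IncrementToken())      // advance token by token
            {
                terms.Add(termAtt.Term);     // term text of the current token
            }
            ts.End();                        // record final state (e.g. the final offset)
        }
        return terms;
    }
}

As described above, subsequent calls to this method reuse the previously stored components, resetting them with the new reader.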
Example #1
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        /// 
        /// <p/>
        /// 
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// 
        /// <p/>
        /// 
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        /// 
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that, behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        /// 
        /// <P/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();

            BooleanQuery tmp = new BooleanQuery();
            ISet<string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<string>(); // ignore dups
            while (ts.IncrementToken())
            {
                String word = termAtt.Term;
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                    continue;
                // ignore dups
                if (already.Contains(word))
                    continue;
                already.Add(word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
Example #2
        public List<string> cutWord(string word, Lucene.Net.Analysis.Analyzer analysis)
        {
            List<string> result = new List<string>();
            TokenStream tokenStream = analysis.TokenStream("field1", new StringReader(word));
            ITermAttribute attributes = tokenStream.GetAttribute<ITermAttribute>();

            // Reset before consuming, then collect the term text of every token.
            tokenStream.Reset();
            while (tokenStream.IncrementToken())
            {
                result.Add(attributes.Term);
            }

            tokenStream.End();
            return result;
        }
Example #3
        public static void Highlight(Document d, string query, Analyzer analyzer)
        {
            string contents = d.Get("contents");
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\"><b>", "</b></span>");
            //SpanGradientFormatter formatter = new SpanGradientFormatter(10.0f, null, null, "#F1FD9F", "#EFF413");
            //SimpleHTMLEncoder encoder = new SimpleHTMLEncoder();
            SimpleFragmenter fragmenter = new SimpleFragmenter(250);
            Highlighter hiliter = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));

            hiliter.SetTextFragmenter(fragmenter);
            int numfragments = contents.Length / fragmenter.GetFragmentSize() + 1; // +1 ensures it's never zero; requesting more than the required number of fragments does no harm.
            StringBuilder result = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><head><title>Search Results - ");
            result.Append(d.Get("filename"));
            result.Append("</title></head><body><font face=Arial size=5>");
            TokenStream tokenstream = analyzer.TokenStream("contents", new System.IO.StringReader(contents));
            TextFragment[] frags = hiliter.GetBestTextFragments(tokenstream, contents, false, numfragments);
            foreach (TextFragment frag in frags)
            {
                if (frag.GetScore() > 0)
                {
                    result.Append(frag.ToString() + "<br/><hr/><br/>");

                }

            }

            string contentspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "contents.html");
            result.Append("</font><a target=_self href=\"file:///");
            result.Append(contentspath);
            result.Append("\">View Original Document...</a>");
            result.Append("</body></html>");
            result.Replace("\n", "<br/>");

            string resultspath = System.IO.Path.Combine(System.Windows.Forms.Application.StartupPath, "results.html");
            System.IO.File.WriteAllText(resultspath, result.ToString());
            //webBrowser1.Url = new Uri("file:///" + resultspath);

            Highlighter hiliter2 = new Highlighter(formatter, new QueryScorer(QueryParser.Parse(query, "contents", analyzer)));
            hiliter2.SetTextFragmenter(fragmenter);
            TokenStream tokstr = analyzer.TokenStream("contents", new System.IO.StringReader(contents));
            StringBuilder htmlcontents = new StringBuilder("<html><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"><style>.highlight{background:yellow;}</style><body><font face=Arial size=5>");
            htmlcontents.Append(hiliter2.GetBestFragments(tokstr, contents, numfragments, "..."));
            htmlcontents.Append("</font></body></html>");
            htmlcontents.Replace("\n", "<br/>");
            System.IO.File.WriteAllText(contentspath, htmlcontents.ToString());
        }
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            Analyzer analyzer = (Analyzer)analyzerMap[fieldName];

            if (analyzer == null)
            {
                analyzer = defaultAnalyzer;
            }

            return(analyzer.TokenStream(fieldName, reader));
        }
Example #5
		public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
		{
			TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
			for (int i = 0; i < output.Length; i++)
			{
				Token t = ts.Next();
				Assert.IsNotNull(t);
				Assert.AreEqual(t.TermText(), output[i]);
			}
			Assert.IsNull(ts.Next());
			ts.Close();
		}
 protected virtual void AssertAnalyzesTo(Analyzer analyzer, String input, String[] output)
 {
     var tokenStream = analyzer.TokenStream("dummyFieldName", new StringReader(input));
     for( var i = 0; i < output.Length; i++ )
     {
         var t = tokenStream.Next();
         Assert.IsNotNull(t);
         Assert.AreEqual(output[i], t.TermText());
     }
     Assert.IsNull(tokenStream.Next());
     tokenStream.Close();
 }
Example #7
    public List <string> TokenizeString(string untokenized)
    {
        System.IO.StringReader stringReader = new System.IO.StringReader(untokenized);
        TokenStream            tokenStream  = analyzer.TokenStream("text", stringReader);
        List <string>          tokenized    = new List <string>();
        ITermAttribute         termAtt      = tokenStream.GetAttribute <ITermAttribute>();

        while (tokenStream.IncrementToken())
        {
            tokenized.Add(termAtt.Term);
        }
        return(tokenized);
    }
Example #8
        public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
        {
            TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));

            for (int i = 0; i < output.Length; i++)
            {
                Token t = ts.Next();
                Assert.IsNotNull(t);
                Assert.AreEqual(t.TermText(), output[i]);
            }
            Assert.IsNull(ts.Next());
            ts.Close();
        }
Example #9
 private static List<string> GetTokens(string keywords, Analyzer analyser)
 {
     var tokenStream = analyser.TokenStream(null, new StringReader(keywords));
     var termAttribute = tokenStream.GetAttribute<ITermAttribute>();
     tokenStream.Reset();
     var list = new List<string>();
     while (tokenStream.IncrementToken())
     {
         var term = termAttribute.Term;
         list.Add(term);
     }
     return list;
 }
Example #10
        public virtual void AssertThreadSafe(Analyzer analyzer)
        {
            int numTestPoints = 100;
            int numThreads    = TestUtil.NextInt(Random(), 3, 5);
            Dictionary <string, BytesRef> map = new Dictionary <string, BytesRef>();

            // create a map<String,SortKey> up front.
            // then with multiple threads, generate sort keys for all the keys in the map
            // and ensure they are the same as the ones we produced in serial fashion.

            for (int i = 0; i < numTestPoints; i++)
            {
                string      term           = TestUtil.RandomSimpleString(Random());
                IOException priorException = null;
                TokenStream ts             = analyzer.TokenStream("fake", new StreamReader(term));
                try
                {
                    ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                    BytesRef bytes = termAtt.BytesRef;
                    ts.Reset();
                    Assert.IsTrue(ts.IncrementToken());
                    termAtt.FillBytesRef();
                    // ensure we make a copy of the actual bytes too
                    map[term] = BytesRef.DeepCopyOf(bytes);
                    Assert.IsFalse(ts.IncrementToken());
                    ts.End();
                }
                catch (IOException e)
                {
                    priorException = e;
                }
                finally
                {
                    IOUtils.CloseWhileHandlingException(priorException, ts);
                }
            }

            ThreadClass[] threads = new ThreadClass[numThreads];
            for (int i = 0; i < numThreads; i++)
            {
                threads[i] = new ThreadAnonymousInnerClassHelper(this, analyzer, map);
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Start();
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Join();
            }
        }
Example #11
        // This is a simplified query builder which works for single terms and single phrases.
        // Returns an empty BooleanQuery, a TermQuery, or a PhraseQuery.
        public static Lucene.Net.Search.Query GetFieldQuery(Analyzer analyzer, string field, string queryText)
        {
            TokenStream stream = analyzer.TokenStream(field, new StringReader(queryText));
            TokenFilter filter = new CachingTokenFilter(stream);
            filter.Reset();

            // This attribute way of getting token properties isn't very good, but it's the non-obsolete one.
            var attr1 = filter.GetAttribute<ITermAttribute>();
            Func<string> getText = () => attr1 != null ? attr1.Term : null;

            Func<int> getPositionIncrement;
            if (filter.HasAttribute<IPositionIncrementAttribute>())
            {
                var attr = filter.GetAttribute<IPositionIncrementAttribute>();
                getPositionIncrement = () => attr.PositionIncrement;
            }
            else
            {
                getPositionIncrement = () => 1;
            }

            // 0 tokens
            if (!filter.IncrementToken())
            {
                return new BooleanQuery();
            }

            // 1 token?
            string token1 = getText();
            int position = 0;
            if (!filter.IncrementToken())
            {
                return new TermQuery(new Term(field, token1));
            }

            // many tokens - handle first token
            PhraseQuery ret = new PhraseQuery();
            ret.Add(new Term(field, token1));

            do
            {
                // handle rest of tokens
                string tokenNext = getText();
                position += getPositionIncrement();
                ret.Add(new Term(field, tokenNext), position);
            }
            while (filter.IncrementToken());

            return ret;
        }
Example #12
        public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
        {
            TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
            List<string> result = new List<string>();
            TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

            while (stream.IncrementToken())
            {
                result.Add(tokenAttr.Term());
            }

            stream.End();
            stream.Close();

            return result;
        }
Example #13
		public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
		{
			using (TokenStream stream = analyzer.TokenStream("contents", new StringReader(text)))
			{
				var result = new List<string>();
				var tokenAttr = (TermAttribute) stream.GetAttribute<ITermAttribute>();

				while (stream.IncrementToken())
				{
					result.Add(tokenAttr.Term);
				}

				stream.End();

				return result;
			}
		}
Example #14
        public void PositonOfWOrds()
        {
            Lucene.Net.Analysis.Analyzer analyzer = AnalyzerList[0].LuceneAnalyzer as Analyzer;

            int termCounter = 0;

            if (analyzer != null)
            {
                AnalyzerView view         = AnalyzerViews[0] as AnalyzerView;
                StringReader stringReader = new StringReader(sb.ToString());

                TokenStream tokenStream = analyzer.TokenStream("defaultFieldName", stringReader);

                String strValue = view.GetView(tokenStream, out termCounter).Trim();
                Console.WriteLine("PositonOfWOrds Details : " + strValue);
            }
        }
Example #15
        public void WordCountFrequency()
        {
            Lucene.Net.Analysis.Analyzer analyzer = AnalyzerList[0].LuceneAnalyzer as Analyzer;

            int termCounter = 0;

            if (analyzer != null)
            {
                AnalyzerView view         = AnalyzerViews[1] as AnalyzerView;
                StringReader stringReader = new StringReader(sb.ToString());

                TokenStream tokenStream = analyzer.TokenStream("defaultFieldName", stringReader);

                String strValue = view.GetView(tokenStream, out termCounter).Trim();
                Console.WriteLine("WordCountFrequency Details : " + strValue);
            }
            Console.WriteLine(string.Format("Total of {0} Term(s) Found.", termCounter));
        }
Example #16
 public static void DisplayTokenWithPositions(Analyzer analyzer, string text)
 {
     var stream = analyzer.TokenStream("contents", new StringReader(text));
     var termAttribute = stream.AddAttribute(typeof (TermAttribute)) as TermAttribute;
     var positionIncrement =
         stream.AddAttribute(typeof (PositionIncrementAttribute)) as PositionIncrementAttribute;
     int position = 0;
     while (stream.IncrementToken())
     {
         int increment = positionIncrement.GetPositionIncrement();
         if(increment>0)
         {
             position = position + increment;
             Console.WriteLine();
             Console.WriteLine("{0}: ", position);
         }
         Console.WriteLine("[{0}]", termAttribute.Term());
     }
     Console.WriteLine();
 }
		public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] expectedImages, System.String[] expectedTypes, int[] expectedPosIncrs)
		{
			TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
			for (int i = 0; i < expectedImages.Length; i++)
			{
				Token t = ts.Next();
				Assert.IsNotNull(t);
				Assert.AreEqual(expectedImages[i], t.TermText());
				if (expectedTypes != null)
				{
					Assert.AreEqual(expectedTypes[i], t.Type());
				}
				if (expectedPosIncrs != null)
				{
					Assert.AreEqual(expectedPosIncrs[i], t.GetPositionIncrement());
				}
			}
			Assert.IsNull(ts.Next());
			ts.Close();
		}
Example #18
        /// <summary> 
        /// Perform synonym expansion on a query.
        /// </summary>
        /// <param name="query">query</param>
        /// <param name="syns">syns</param>
        /// <param name="a">a</param>
        /// <param name="field">field</param>
        /// <param name="boost">boost</param>
        public static Query Expand(String query, 
            Searcher syns, 
            Analyzer a, 
            String field, 
            float boost)
        {
            already = new List<String>(); // avoid dups
            var top = new List<String>(); // needs to be separately listed..

            var ts = a.TokenStream(field, new StringReader(query));
            var termAtt = ts.AddAttribute<TermAttribute>();

            while (ts.IncrementToken())
            {
                var word = termAtt.Term;

                if (!already.Contains(word))
                {
                    already.Add(word);
                    top.Add(word);
                }
            }

            tmp = new BooleanQuery();

            // [2] form query
            System.Collections.IEnumerator it = top.GetEnumerator();
            while (it.MoveNext())
            {
                // [2a] add to level words in
                var word = (String)it.Current;
                var tq = new TermQuery(new Term(field, word));
                tmp.Add(tq, Occur.SHOULD);

                var c = new CollectorImpl(field, boost);
                syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)), c);
            }

            return tmp;
        }
Example #19
 public override void Run()
 {
     try
     {
         foreach (KeyValuePair <string, BytesRef> mapping in Map)
         {
             string      term           = mapping.Key;
             BytesRef    expected       = mapping.Value;
             IOException priorException = null;
             TokenStream ts             = Analyzer.TokenStream("fake", new StreamReader(term));
             try
             {
                 ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                 BytesRef bytes = termAtt.BytesRef;
                 ts.Reset();
                 Assert.IsTrue(ts.IncrementToken());
                 termAtt.FillBytesRef();
                 Assert.AreEqual(expected, bytes);
                 Assert.IsFalse(ts.IncrementToken());
                 ts.End();
             }
             catch (IOException e)
             {
                 priorException = e;
             }
             finally
             {
                 IOUtils.CloseWhileHandlingException(priorException, ts);
             }
         }
     }
     catch (IOException e)
     {
         throw (Exception)e;
     }
 }
Example #20
        public static List<string> SplitKeyWords(string keywords, Analyzer analyzer)
        {
            System.IO.StreamReader reader = new System.IO.StreamReader(PanGu.Framework.Stream.WriteStringToStream(keywords,
                Encoding.UTF8), Encoding.UTF8);

            TokenStream tokenStream = analyzer.TokenStream("", reader);

            global::Lucene.Net.Analysis.Token token = tokenStream.Next();

            List<string> result = new List<string>();

            while (token != null)
            {
                result.Add(keywords.Substring(token.StartOffset(), token.EndOffset() - token.StartOffset()));
                token = tokenStream.Next();
            }

            return result;
        }
Example #21
		public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
		{
			TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
			for (int i = 0; i < output.Length; i++)
			{
				Token t = ts.Next();
				System.Diagnostics.Trace.Assert(t != null); ////   assertNotNull(t);
				System.Diagnostics.Trace.Assert(output[i] == t.TermText()); //// assertEquals(output[i], t.TermText());
			}
			System.Diagnostics.Trace.Assert(ts.Next() == null); //// assertNull(ts.Next());
			ts.Close();
		}
Example #22
        public void DoStandardHighlights(Analyzer analyzer, IndexSearcher searcher,
                                         TopDocs hits, Query query, IFormatter formatter, bool expandMT)
        {
            IFragmenter frag = new SimpleFragmenter(20);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME);
                int maxNumFragmentsRequired = 2;
                String fragmentSeparator = "...";
                IScorer scorer = null;
                TokenStream tokenStream = analyzer.TokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
                if (Mode == QUERY)
                {
                    scorer = new QueryScorer(query);
                }
                else if (Mode == QUERY_TERM)
                {
                    scorer = new QueryTermScorer(query);
                }
                var highlighter = new Highlighter(formatter, scorer) {TextFragmenter = frag};

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired,
                                                             fragmentSeparator);
                Console.WriteLine("\t" + result);
            }
        }
Example #23
 public static void AssertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements)
 {
     AssertTokenStreamContents(a.TokenStream("dummy", new System.IO.StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.Length);
 }
Example #24
        private static Query ExecuteAnalyzer(Analyzer analyzer, string field, string text)
        {
            TokenStream tokenStream = analyzer.TokenStream(field, new StringReader(text));

            ITermAttribute termAttribute = tokenStream.AddAttribute<ITermAttribute>();
            IPositionIncrementAttribute positionIncrementAttribute = tokenStream.AddAttribute<IPositionIncrementAttribute>();

            List<List<Term>> terms = new List<List<Term>>();
            List<Term> current = null;
            while (tokenStream.IncrementToken())
            {
                if (positionIncrementAttribute.PositionIncrement > 0)
                {
                    current = new List<Term>();
                    terms.Add(current);
                }
                if (current != null)
                {
                    current.Add(new Term(field, termAttribute.Term));
                }
            }

            if (terms.Count == 1 && terms[0].Count == 1)
            {
                return new TermQuery(terms[0][0]);
            }
            else if (terms.Select(l => l.Count).Sum() == terms.Count)
            {
                PhraseQuery phraseQuery = new PhraseQuery();
                foreach (var positionList in terms)
                {
                    phraseQuery.Add(positionList[0]);
                }
                return phraseQuery;
            }
            else
            {
                MultiPhraseQuery multiPhraseQuery = new MultiPhraseQuery();
                foreach (var positionList in terms)
                {
                    multiPhraseQuery.Add(positionList.ToArray());
                }
                return multiPhraseQuery;
            }
        }
Example #25
 public static string ToTokenStreamString(this string text, Analyzer analyzer) {
     var tokens = new StringBuilder();
     var reader = new StringReader(text);
     var tokenStream = analyzer.TokenStream("x", reader);
     while(true) {
         var term = tokenStream.Next();
         if(term == null) {
             break;
         }
         tokens.Append(term.Term());
         tokens.Append(" ");
     }
     return tokens.ToString();
 }
Example #26
 //convenience method
 public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
 {
     return analyzer.TokenStream(field, new StringReader(contents));
 }
Example #27
        private static List<string> AnalyzeText(Analyzer analyzer, string field, string text) {

            if (String.IsNullOrEmpty(text)) {
                return new List<string>();
            }

            var result = new List<string>();
            using (var sr = new System.IO.StringReader(text)) {
                using (TokenStream stream = analyzer.TokenStream(field, sr)) {
                    while (stream.IncrementToken()) {
                        var termAttribute = stream.GetAttribute<ITermAttribute>();
                        if(termAttribute != null) {
                            result.Add(termAttribute.Term);
                        }
                    }
                }
            }

            return result;
        }
Example #28
        /// <summary>
        /// Searches the index for the querytext and displays a ranked list of results to the screen
        /// </summary>
        /// <param name="querytext">The text to search the index</param>
        private string SearchAndDisplayResults(string querytext, long qid, List <long> relevantList)
        {
            System.Console.WriteLine("Searching for " + querytext);
            querytext = querytext.ToLower();
            Query query = parser.Parse(querytext);

            System.Console.WriteLine($"Searching for { query.ToString()}");

            TopDocs results = searcher.Search(query, MAX_QUERY);

            // create highlighter - using strong tag to highlight in this case (change as needed)
            //IFormatter formatter = new SimpleHTMLFormatter("<strong>", "</strong>");
            IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold;background-color:yellow;\">", "</span>");

            // excerpt set to 200 characters in length
            var fragmenter  = new SimpleFragmenter(3000);
            var scorer      = new QueryScorer(query);
            var highlighter = new Highlighter(formatter, scorer)
            {
                TextFragmenter = fragmenter
            };

            long            rank           = 0;
            float           topscore       = 0f;
            long            foundrelevants = 0;
            List <TrecItem> logItems       = new List <TrecItem>();

            SearchedListViewModel.DeleteAll();
            foreach (ScoreDoc scoreDoc in results.ScoreDocs)
            {
                if (rank == 0)
                {
                    topscore = scoreDoc.Score;
                }
                rank++;
                Lucene.Net.Documents.Document doc = searcher.Doc(scoreDoc.Doc);
                long id = Convert.ToInt64(doc.Get(PID_FN).ToString());
                CollectionPassage ps = collectionProvider.Passages[id];

                // Logging Trec
                logItems.Add(new TrecItem(0, id, rank, scoreDoc.Score));

                // get highlighted fragment
                TokenStream stream      = analyzer.TokenStream("", new StringReader(ps.passage_text));
                string      highlighted = highlighter.GetBestFragment(stream, ps.passage_text);

                //string url2 = doc.Get(TEXT_FN).ToString();
                //Console.WriteLine("Rank " + rank + " text " + myFieldValue);
                if (highlighted == null)
                {
                    highlighted = ps.passage_text;
                }
                if (relevantList.Contains(id))
                {
                    foundrelevants++;
                }
                SearchedListViewModel.Add(scoreDoc.Score / topscore, id, ps.GetTitle(), ps.url, highlighted, relevantList.Contains(id));

                //Console.WriteLine("==>" + highlighted);
            }

            StatusBarViewModel.Instance.NumRelevants = "Num Relevants : " + foundrelevants.ToString() + "/" + relevantList.Count.ToString();
            StatusBarViewModel.Instance.NumSearch    = "Num Searched :" + results.ScoreDocs.Length.ToString();
            // Logging Trec
            trecLogger.Logging(qid, logItems);

            //Console.WriteLine(string.Join(",", relevantList));
            return(query.ToString());
        }
        private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
        {
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
            }

            ICharTermAttribute termAtt;
            IOffsetAttribute offsetAtt;
            IPositionIncrementAttribute posIncAtt;
            IPositionLengthAttribute posLengthAtt;
            ITypeAttribute typeAtt;

            IList<string> tokens = new List<string>();
            IList<string> types = new List<string>();
            IList<int> positions = new List<int>();
            IList<int> positionLengths = new List<int>();
            IList<int> startOffsets = new List<int>();
            IList<int> endOffsets = new List<int>();

            int remainder = random.Next(10);
            StringReader reader = new StringReader(text);

            TokenStream ts;
            using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader))
            {
                 termAtt = ts.HasAttribute<ICharTermAttribute>()
                    ? ts.GetAttribute<ICharTermAttribute>()
                    : null;
                offsetAtt = ts.HasAttribute<IOffsetAttribute>()
                    ? ts.GetAttribute<IOffsetAttribute>()
                    : null;
                posIncAtt = ts.HasAttribute<IPositionIncrementAttribute>()
                    ? ts.GetAttribute<IPositionIncrementAttribute>()
                    : null;
                posLengthAtt = ts.HasAttribute<IPositionLengthAttribute>()
                    ? ts.GetAttribute<IPositionLengthAttribute>()
                    : null;
                typeAtt = ts.HasAttribute<ITypeAttribute>() ? ts.GetAttribute<ITypeAttribute>() : null;

                ts.Reset();

                // First pass: save away "correct" tokens
                while (ts.IncrementToken())
                {
                    Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                    tokens.Add(termAtt.ToString());
                    if (typeAtt != null)
                    {
                        types.Add(typeAtt.Type);
                    }
                    if (posIncAtt != null)
                    {
                        positions.Add(posIncAtt.PositionIncrement);
                    }
                    if (posLengthAtt != null)
                    {
                        positionLengths.Add(posLengthAtt.PositionLength);
                    }
                    if (offsetAtt != null)
                    {
                        startOffsets.Add(offsetAtt.StartOffset());
                        endOffsets.Add(offsetAtt.EndOffset());
                    }
                }
                ts.End();
            }

            // verify reusing is "reproducible" and also get the normal tokenstream sanity checks
            if (tokens.Count > 0)
            {
                // KWTokenizer (for example) can produce a token
                // even when input is length 0:
                if (text.Length != 0)
                {
                    // (Optional) second pass: do something evil:
                    int evilness = random.Next(50);
                    if (evilness == 17)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
                        }
                        // Throw an errant exception from the Reader:

                        MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
                        evilReader.ThrowExcAfterChar(random.Next(text.Length));
                        reader = evilReader;

                        try
                        {
                            // NOTE: some Tokenizers go and read characters
                            // when you call .setReader(Reader), eg
                            // PatternTokenizer.  this is a bit
                            // iffy... (really, they should only
                            // pull from the Reader when you call
                            // .IncrementToken(), I think?), but we
                            // currently allow it, so, we must call
                            // a.TokenStream inside the try since we may
                            // hit the exc on init:
                            ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(evilReader, remainder) : evilReader);
                            ts.Reset();
                            while (ts.IncrementToken()) ;
                            Assert.Fail("did not hit exception");
                        }
                        catch (Exception re)
                        {
                            Assert.IsTrue(MockReaderWrapper.IsMyEvilException(re));
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw ae;
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                    else if (evilness == 7)
                    {
                        // Only consume a subset of the tokens:
                        int numTokensToRead = random.Next(tokens.Count);
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.Count + " tokens");
                        }

                        reader = new StringReader(text);
                        ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
                        ts.Reset();
                        for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++)
                        {
                            Assert.IsTrue(ts.IncrementToken());
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw ae;
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                }
            }

            // Final pass: verify clean tokenization matches
            // results from first pass:

            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis; " + tokens.Count + " tokens");
            }
            reader = new StringReader(text);

            long seed = random.Next();
            random = new Random((int)seed);
            if (random.Next(30) == 7)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: using spoon-feed reader");
                }

                reader = new MockReaderWrapper(random, text);
            }

            ts = a.TokenStream("dummy", useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader);
            if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (typeAtt != null && posIncAtt != null && offsetAtt != null)
            {
                // offset + pos + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && offsetAtt != null)
            {
                // offset + pos
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (offsetAtt != null)
            {
                // offset
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, null, null, text.Length, offsetsAreCorrect);
            }
            else
            {
                // terms only
                AssertTokenStreamContents(ts, tokens.ToArray());
            }

            if (field != null)
            {
                reader = new StringReader(text);
                random = new Random((int)seed);
                if (random.Next(30) == 7)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: indexing using spoon-feed reader");
                    }

                    reader = new MockReaderWrapper(random, text);
                }

                field.ReaderValue = useCharFilter ? (TextReader)new MockCharFilter(reader, remainder) : reader;
            }
        }
Example #30
        public TokenStream GetTokenStream(Analyzer analyzer)
        {
            if (!((FieldType)FieldType).Indexed)
            {
                return null;
            }
            FieldType.NumericType? numericType = ((FieldType)FieldType).NumericTypeValue;
            if (numericType != null)
            {
                if (!(InternalTokenStream is NumericTokenStream))
                {
                    // lazy init the TokenStream as it is heavy to instantiate
                    // (attributes,...) if not needed (stored field loading)
                    InternalTokenStream = new NumericTokenStream(Type.NumericPrecisionStep);
                }
                var nts = (NumericTokenStream)InternalTokenStream;
                // initialize value in TokenStream
                object val = FieldsData;
                switch (numericType)
                {
                    case Documents.FieldType.NumericType.INT:
                        nts.SetIntValue(Convert.ToInt32(val));
                        break;

                    case Documents.FieldType.NumericType.LONG:
                        nts.SetLongValue(Convert.ToInt64(val));
                        break;

                    case Documents.FieldType.NumericType.FLOAT:
                        nts.SetFloatValue(Convert.ToSingle(val));
                        break;

                    case Documents.FieldType.NumericType.DOUBLE:
                        nts.SetDoubleValue(Convert.ToDouble(val));
                        break;

                    default:
                        throw new Exception("Should never get here");
                }
                return InternalTokenStream;
            }

            if (!((FieldType)FieldType).Tokenized)
            {
                if (StringValue == null)
                {
                    throw new System.ArgumentException("Non-Tokenized Fields must have a String value");
                }
                if (!(InternalTokenStream is StringTokenStream))
                {
                    // lazy init the TokenStream as it is heavy to instantiate
                    // (attributes,...) if not needed (stored field loading)
                    InternalTokenStream = new StringTokenStream();
                }
                ((StringTokenStream)InternalTokenStream).Value = StringValue;
                return InternalTokenStream;
            }

            if (TokenStream_Renamed != null)
            {
                return TokenStream_Renamed;
            }
            else if (ReaderValue != null)
            {
                return analyzer.TokenStream(Name, ReaderValue);
            }
            else if (StringValue != null)
            {
                TextReader sr = new StringReader(StringValue);
                return analyzer.TokenStream(Name, sr);
            }

            throw new System.ArgumentException("Field must have either TokenStream, String, Reader or Number value; got " + this);
        }
        internal static void CheckResetException(Analyzer a, string input)
        {
            TokenStream ts = a.TokenStream("bogus", new StringReader(input));
            try
            {
                if (ts.IncrementToken())
                {
                    ts.ReflectAsString(false);
                    Assert.Fail("didn't get expected exception when reset() not called");
                }
            }
            catch (InvalidOperationException expected)
            {
                //ok
            }
            catch (AssertionException expected)
            {
                // ok: MockTokenizer
                Assert.IsTrue(expected.Message != null && expected.Message.Contains("wrong state"), expected.Message);
            }
            catch (Exception unexpected)
            {
                //unexpected.printStackTrace(System.err);
                Console.Error.WriteLine(unexpected.StackTrace);
                Assert.Fail("got wrong exception when reset() not called: " + unexpected);
            }
            finally
            {
                // consume correctly
                ts.Reset();
                while (ts.IncrementToken())
                {
                }
                ts.End();
                ts.Dispose();
            }

            // check for a missing Close()
            ts = a.TokenStream("bogus", new StringReader(input));
            ts.Reset();
            while (ts.IncrementToken())
            {
            }
            ts.End();
            try
            {
                ts = a.TokenStream("bogus", new StringReader(input));
                Assert.Fail("didn't get expected exception when Close() not called");
            }
            catch (Exception)
            {
                // ok
            }
            finally
            {
                ts.Dispose();
            }
        }
 public static void AssertAnalyzesTo(Analyzer a, string input, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, bool offsetsAreCorrect)
 {
     CheckResetException(a, input);
     AssertTokenStreamContents(a.TokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.Length, offsetsAreCorrect);
 }
Example #33
        // Test using various international locales with accented characters (which
        // sort differently depending on locale)
        //
        // Copied (and slightly modified) from
        // Lucene.Net.Search.TestSort.testInternationalSort()
        //
        // TODO: this test is really fragile. there are already 3 different cases,
        // depending upon unicode version.
        public virtual void TestCollationKeySort(Analyzer usAnalyzer, Analyzer franceAnalyzer, Analyzer swedenAnalyzer, Analyzer denmarkAnalyzer, string usResult, string frResult, string svResult, string dkResult)
        {
            Directory indexStore = NewDirectory();
            IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false)));

            // document data:
            // the tracer field is used to determine which document was hit
            string[][] sortData = new string[][] { new string[] { "A", "x", "p\u00EAche", "p\u00EAche", "p\u00EAche", "p\u00EAche" }, new string[] { "B", "y", "HAT", "HAT", "HAT", "HAT" }, new string[] { "C", "x", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9", "p\u00E9ch\u00E9" }, new string[] { "D", "y", "HUT", "HUT", "HUT", "HUT" }, new string[] { "E", "x", "peach", "peach", "peach", "peach" }, new string[] { "F", "y", "H\u00C5T", "H\u00C5T", "H\u00C5T", "H\u00C5T" }, new string[] { "G", "x", "sin", "sin", "sin", "sin" }, new string[] { "H", "y", "H\u00D8T", "H\u00D8T", "H\u00D8T", "H\u00D8T" }, new string[] { "I", "x", "s\u00EDn", "s\u00EDn", "s\u00EDn", "s\u00EDn" }, new string[] { "J", "y", "HOT", "HOT", "HOT", "HOT" } };

            FieldType customType = new FieldType();
            customType.Stored = true;

            for (int i = 0; i < sortData.Length; ++i)
            {
                Document doc = new Document();
                doc.Add(new Field("tracer", sortData[i][0], customType));
                doc.Add(new TextField("contents", sortData[i][1], Field.Store.NO));
                if (sortData[i][2] != null)
                {
                    doc.Add(new TextField("US", usAnalyzer.TokenStream("US", new StringReader(sortData[i][2]))));
                }
                if (sortData[i][3] != null)
                {
                    doc.Add(new TextField("France", franceAnalyzer.TokenStream("France", new StringReader(sortData[i][3]))));
                }
                if (sortData[i][4] != null)
                {
                    doc.Add(new TextField("Sweden", swedenAnalyzer.TokenStream("Sweden", new StringReader(sortData[i][4]))));
                }
                if (sortData[i][5] != null)
                {
                    doc.Add(new TextField("Denmark", denmarkAnalyzer.TokenStream("Denmark", new StringReader(sortData[i][5]))));
                }
                writer.AddDocument(doc);
            }
            writer.ForceMerge(1);
            writer.Dispose();
            IndexReader reader = DirectoryReader.Open(indexStore);
            IndexSearcher searcher = new IndexSearcher(reader);

            Sort sort = new Sort();
            Query queryX = new TermQuery(new Term("contents", "x"));
            Query queryY = new TermQuery(new Term("contents", "y"));

            sort.SetSort(new SortField("US", SortField.Type_e.STRING));
            AssertMatches(searcher, queryY, sort, usResult);

            sort.SetSort(new SortField("France", SortField.Type_e.STRING));
            AssertMatches(searcher, queryX, sort, frResult);

            sort.SetSort(new SortField("Sweden", SortField.Type_e.STRING));
            AssertMatches(searcher, queryY, sort, svResult);

            sort.SetSort(new SortField("Denmark", SortField.Type_e.STRING));
            AssertMatches(searcher, queryY, sort, dkResult);
            reader.Dispose();
            indexStore.Dispose();
        }
Example #34
        protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn)
        {
            if (analyzerIn == null) analyzerIn = Analyzer;

            TokenStream source = null;
            try
            {
                source = analyzerIn.TokenStream(field, part);
                source.Reset();

                ITermToBytesRefAttribute termAtt = source.GetAttribute<ITermToBytesRefAttribute>();
                BytesRef bytes = termAtt.BytesRef;

                if (!source.IncrementToken())
                    throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part);
                termAtt.FillBytesRef();
                if (source.IncrementToken())
                    throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part);
                source.End();
                return BytesRef.DeepCopyOf(bytes);
            }
            catch (IOException e)
            {
                throw new Exception("Error analyzing multiTerm term: " + part, e);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(source);
            }
        }
Example #35
 public static void DisplayTokens(Analyzer analyzer, string text)
 {
     DisplayTokens(analyzer.TokenStream("contents", new StringReader(text)));
 }
Example #36
 public static void AssertAnalyzesTo(Analyzer a, string input, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, bool offsetsAreCorrect)
 {
     CheckResetException(a, input);
     AssertTokenStreamContents(a.TokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.Length, offsetsAreCorrect);
 }
 public static void AssertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements)
 {
     AssertTokenStreamContents(a.TokenStream("dummy", new System.IO.StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.Length);
 }
Example #38
        public virtual void AssertThreadSafe(Analyzer analyzer)
        {
            int numTestPoints = 100;
            int numThreads = TestUtil.NextInt(Random(), 3, 5);
            Dictionary<string, BytesRef> map = new Dictionary<string, BytesRef>();

            // create a map<String,SortKey> up front.
            // then with multiple threads, generate sort keys for all the keys in the map
            // and ensure they are the same as the ones we produced in serial fashion.

            for (int i = 0; i < numTestPoints; i++)
            {
                string term = TestUtil.RandomSimpleString(Random());
                IOException priorException = null;
                TokenStream ts = analyzer.TokenStream("fake", new StreamReader(term));
                try
                {
                    ITermToBytesRefAttribute termAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
                    BytesRef bytes = termAtt.BytesRef;
                    ts.Reset();
                    Assert.IsTrue(ts.IncrementToken());
                    termAtt.FillBytesRef();
                    // ensure we make a copy of the actual bytes too
                    map[term] = BytesRef.DeepCopyOf(bytes);
                    Assert.IsFalse(ts.IncrementToken());
                    ts.End();
                }
                catch (IOException e)
                {
                    priorException = e;
                }
                finally
                {
                    IOUtils.CloseWhileHandlingException(priorException, ts);
                }
            }

            ThreadClass[] threads = new ThreadClass[numThreads];
            for (int i = 0; i < numThreads; i++)
            {
                threads[i] = new ThreadAnonymousInnerClassHelper(this, analyzer, map);
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Start();
            }
            for (int i = 0; i < numThreads; i++)
            {
                threads[i].Join();
            }
        }
Example #39
        private static void CheckAnalysisConsistency(Random random, Analyzer a, bool useCharFilter, string text, bool offsetsAreCorrect, Field field)
        {
            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: get first token stream now text=" + text);
            }

            ICharTermAttribute          termAtt;
            IOffsetAttribute            offsetAtt;
            IPositionIncrementAttribute posIncAtt;
            IPositionLengthAttribute    posLengthAtt;
            ITypeAttribute typeAtt;

            IList <string> tokens          = new List <string>();
            IList <string> types           = new List <string>();
            IList <int>    positions       = new List <int>();
            IList <int>    positionLengths = new List <int>();
            IList <int>    startOffsets    = new List <int>();
            IList <int>    endOffsets      = new List <int>();

            int          remainder = random.Next(10);
            StringReader reader    = new StringReader(text);

            TokenStream ts;

            using (ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader))
            {
                termAtt = ts.HasAttribute <ICharTermAttribute>()
                    ? ts.GetAttribute <ICharTermAttribute>()
                    : null;
                offsetAtt = ts.HasAttribute <IOffsetAttribute>()
                    ? ts.GetAttribute <IOffsetAttribute>()
                    : null;
                posIncAtt = ts.HasAttribute <IPositionIncrementAttribute>()
                    ? ts.GetAttribute <IPositionIncrementAttribute>()
                    : null;
                posLengthAtt = ts.HasAttribute <IPositionLengthAttribute>()
                    ? ts.GetAttribute <IPositionLengthAttribute>()
                    : null;
                typeAtt = ts.HasAttribute <ITypeAttribute>() ? ts.GetAttribute <ITypeAttribute>() : null;

                ts.Reset();

                // First pass: save away "correct" tokens
                while (ts.IncrementToken())
                {
                    Assert.IsNotNull(termAtt, "has no CharTermAttribute");
                    tokens.Add(termAtt.ToString());
                    if (typeAtt != null)
                    {
                        types.Add(typeAtt.Type);
                    }
                    if (posIncAtt != null)
                    {
                        positions.Add(posIncAtt.PositionIncrement);
                    }
                    if (posLengthAtt != null)
                    {
                        positionLengths.Add(posLengthAtt.PositionLength);
                    }
                    if (offsetAtt != null)
                    {
                        startOffsets.Add(offsetAtt.StartOffset());
                        endOffsets.Add(offsetAtt.EndOffset());
                    }
                }
                ts.End();
            }

            // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
            if (tokens.Count > 0)
            {
                // KWTokenizer (for example) can produce a token
                // even when input is length 0:
                if (text.Length != 0)
                {
                    // (Optional) second pass: do something evil:
                    int evilness = random.Next(50);
                    if (evilness == 17)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis w/ exception");
                        }
                        // Throw an errant exception from the Reader:

                        MockReaderWrapper evilReader = new MockReaderWrapper(random, text);
                        evilReader.ThrowExcAfterChar(random.Next(text.Length));
                        reader = evilReader;

                        try
                        {
                            // NOTE: some Tokenizers go and read characters
                            // when you call .setReader(Reader), eg
                            // PatternTokenizer.  this is a bit
                            // iffy... (really, they should only
                            // pull from the Reader when you call
                            // .IncrementToken(), I think?), but we
                            // currently allow it, so, we must call
                            // a.TokenStream inside the try since we may
                            // hit the exc on init:
                            ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(evilReader, remainder) : evilReader);
                            ts.Reset();
                            while (ts.IncrementToken())
                            {
                                ;
                            }
                            Assert.Fail("did not hit exception");
                        }
                        catch (Exception re)
                        {
                            Assert.IsTrue(MockReaderWrapper.IsMyEvilException(re));
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw;
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                    else if (evilness == 7)
                    {
                        // Only consume a subset of the tokens:
                        int numTokensToRead = random.Next(tokens.Count);
                        if (VERBOSE)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis, only consuming " + numTokensToRead + " of " + tokens.Count + " tokens");
                        }

                        reader = new StringReader(text);
                        ts     = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader);
                        ts.Reset();
                        for (int tokenCount = 0; tokenCount < numTokensToRead; tokenCount++)
                        {
                            Assert.IsTrue(ts.IncrementToken());
                        }

                        try
                        {
                            ts.End();
                        }
                        catch (InvalidOperationException ae)
                        {
                            // Catch & ignore MockTokenizer's
                            // anger...
                            if ("End() called before IncrementToken() returned false!".Equals(ae.Message))
                            {
                                // OK
                            }
                            else
                            {
                                throw;
                            }
                        }
                        finally
                        {
                            ts.Dispose();
                        }
                    }
                }
            }

            // Final pass: verify clean tokenization matches
            // results from first pass:

            if (VERBOSE)
            {
                Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: re-run analysis; " + tokens.Count + " tokens");
            }
            reader = new StringReader(text);

            long seed = random.Next();

            random = new Random((int)seed);
            if (random.Next(30) == 7)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: using spoon-feed reader");
                }

                reader = new MockReaderWrapper(random, text);
            }

            ts = a.TokenStream("dummy", useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader);
            if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (typeAtt != null && posIncAtt != null && offsetAtt != null)
            {
                // offset + pos + type
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), types.ToArray(), ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null)
            {
                // offset + pos + posLength
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), ToIntArray(positionLengths), text.Length, offsetsAreCorrect);
            }
            else if (posIncAtt != null && offsetAtt != null)
            {
                // offset + pos
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, ToIntArray(positions), null, text.Length, offsetsAreCorrect);
            }
            else if (offsetAtt != null)
            {
                // offset
                AssertTokenStreamContents(ts, tokens.ToArray(), ToIntArray(startOffsets), ToIntArray(endOffsets), null, null, null, text.Length, offsetsAreCorrect);
            }
            else
            {
                // terms only
                AssertTokenStreamContents(ts, tokens.ToArray());
            }

            if (field != null)
            {
                reader = new StringReader(text);
                random = new Random((int)seed);
                if (random.Next(30) == 7)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(Thread.CurrentThread.Name + ": NOTE: baseTokenStreamTestCase: indexing using spoon-feed reader");
                    }

                    reader = new MockReaderWrapper(random, text);
                }

                field.ReaderValue = useCharFilter ? (TextReader) new MockCharFilter(reader, remainder) : reader;
            }
        }
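
The ToIntArray helper referenced in the assertions above is not shown in this snippet; it just copies the collected IList<int> values into the int[] form that AssertTokenStreamContents expects. A minimal sketch, assuming nothing more is needed:

        private static int[] ToIntArray(IList<int> list)
        {
            // Copy the collected attribute values into a plain array for the assert helpers.
            int[] ret = new int[list.Count];
            int upto = 0;
            foreach (int v in list)
            {
                ret[upto++] = v;
            }
            return ret;
        }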
Example #40
		/// <summary> Perform synonym expansion on a query.
		/// 
		/// </summary>
		/// <param name="">query
		/// </param>
		/// <param name="">syns
		/// </param>
		/// <param name="">a
		/// </param>
		/// <param name="">field
		/// </param>
		/// <param name="">boost
		/// </param>
		public static Query Expand(System.String query, Searcher syns, Analyzer a, System.String field, float boost)
		{
			System.Collections.Hashtable already = new System.Collections.Hashtable(); // avoid dups		
			System.Collections.IList top = new System.Collections.ArrayList(); // needs to be separately listed..
			
			// [1] Parse query into separate words so that when we expand we can avoid dups
			TokenStream ts = a.TokenStream(field, new System.IO.StringReader(query));
			Lucene.Net.Analysis.Token t;
			while ((t = ts.Next()) != null)
			{
				System.String word = t.TermText();
				if (already.Contains(word) == false)
				{
					already.Add(word, word);
					top.Add(word);
				}
			}
			BooleanQuery tmp = new BooleanQuery();
			
			// [2] form query
			System.Collections.IEnumerator it = top.GetEnumerator();
			while (it.MoveNext())
			{
				// [2a] add the original (top-level) word itself
				System.String word = (System.String) it.Current;
				TermQuery tq = new TermQuery(new Term(field, word));
				tmp.Add(tq, BooleanClause.Occur.SHOULD);
				
				// [2b] add in unique synonyms
				Hits hits = syns.Search(new TermQuery(new Term(Syns2Index.F_WORD, word)));
				for (int i = 0; i < hits.Length(); i++)
				{
					Document doc = hits.Doc(i);
					System.String[] values = doc.GetValues(Syns2Index.F_SYN);
					for (int j = 0; j < values.Length; j++)
					{
						System.String syn = values[j];
						if (already.Contains(syn) == false)
						{
							already.Add(syn, syn);
							tq = new TermQuery(new Term(field, syn));
							if (boost > 0) // else keep the normal boost of 1.0
							{
								tq.SetBoost(boost);
							}
							tmp.Add(tq, BooleanClause.Occur.SHOULD);
						}
					}
				}
			}
			
			
			return tmp;
		}
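A call site for Expand might look like the sketch below. The index path, analyzer, query text, and boost are illustrative, it assumes a synonym index produced by Syns2Index as in the original WordNet contrib, and constructor overloads vary between Lucene.Net versions, so treat this as a shape rather than an exact API.

		// Hypothetical usage of Expand (path, query, and boost are illustrative).
		Searcher syns = new IndexSearcher("c:/wordnet-syn-index"); // index built by Syns2Index
		Analyzer analyzer = new StandardAnalyzer();
		Query expanded = Expand("big fast car", syns, analyzer, "contents", 0.9f);
		Console.WriteLine(expanded.ToString("contents"));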
 protected internal virtual string ToDot(Analyzer a, string inputText)
 {
     StringWriter sw = new StringWriter();
     TokenStream ts = a.TokenStream("field", new StringReader(inputText));
     ts.Reset();
     (new TokenStreamToDot(inputText, ts, /*new StreamWriter(*/(TextWriter)sw/*)*/)).ToDot();
     return sw.ToString();
 }
Example #42
        /// <summary>
        /// Searches the index for the querytext
        /// </summary>
        /// <param name="querytext">The text to search the index</param>
        //public string SearchIndext(string querytext)
        public List <Dictionary <string, string> > SearchIndext(string querytext)
        {
            List <Dictionary <string, string> > resultListDict = new List <Dictionary <string, string> >();      // Initiate a result list

            Query query = DisplayQueries(querytext);

            Console.WriteLine("query is " + query);
            TopDocs results = searcher.Search(query, 100);

            System.Console.WriteLine("Number of results is " + results.TotalHits);

            // Setup the configuration of Highlighter
            IFormatter       formatter   = new SimpleHTMLFormatter("<span style=\"font-weight:bold; background-color:yellow;\">", "</span>");
            SimpleFragmenter fragmenter  = new SimpleFragmenter(2000);
            QueryScorer      scorer      = new QueryScorer(query);
            Highlighter      highlighter = new Highlighter(formatter, scorer);

            highlighter.TextFragmenter = fragmenter;

            int rank = 0;

            // ScoreDocs: the array of hits returned for the query
            // scoreDoc: each entry holds a doc ID and that document's score for the query
            //string output = "";
            if (results.TotalHits != 0)   // Check if there are results
            {
                foreach (ScoreDoc scoreDoc in results.ScoreDocs)
                {
                    rank++;
                    Lucene.Net.Documents.Document doc = searcher.Doc(scoreDoc.Doc);
                    string myFieldValue = doc.Get(TEXT_FN_PASS_TEXT);
                    string myURL        = doc.Get(TEXT_FN_URL);
                    string passId       = doc.Get(TEXT_FN_PASS_ID);
                    string score        = scoreDoc.Score.ToString();
                    string queryId      = doc.Get(TEXT_FN_QUERY_ID);

                    int jsonId = Int32.Parse(doc.Get(TEXT_FN_JSON_ARRAY_ID));

                    // the passage_text field was indexed with Field.Store.NO, so pull the text back out of the JSON source
                    foreach (var itemP in jArr[jsonId][PASSAGES])
                    {
                        if (itemP[TEXT_FN_PASS_ID].ToString() == passId)
                        {
                            myFieldValue = itemP[TEXT_FN_PASS_TEXT].ToString();
                        }
                    }

                    // Add the highlighter tags to the passage_text of this hit
                    //TokenStream HLstream = analyzer.TokenStream("", new StringReader(doc.Get(TEXT_FN_PASS_TEXT)));
                    //string HLmyFieldValue = highlighter.GetBestFragment(HLstream, doc.Get(TEXT_FN_PASS_TEXT));
                    TokenStream HLstream       = analyzer.TokenStream("", new StringReader(myFieldValue));
                    string      HLmyFieldValue = highlighter.GetBestFragment(HLstream, myFieldValue);

                    Explanation e = searcher.Explain(query, scoreDoc.Doc);

                    //Extract title from URL
                    char     delimiters = '/';
                    string[] urlSeg     = myURL.Split(delimiters);
                    string   title;
                    if (urlSeg[urlSeg.Length - 1].Length == 0)
                    {
                        title = urlSeg[urlSeg.Length - 2];
                    }
                    else
                    {
                        title = urlSeg[urlSeg.Length - 1];
                    }

                    resultListDict.Add(new Dictionary <string, string> {
                        { "rank", rank.ToString() }, { "passId", passId },
                        { "score", score }, { "title", title }, { "url", myURL }, { "text", myFieldValue }, { "queryId", queryId }, { "highlighter", HLmyFieldValue }
                    });

                    //Console.WriteLine("Rank " + rank + " text " + myFieldValue + " URL " + myURL);
                    //Console.WriteLine(e);
                }
            }

            return(resultListDict);
        }
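
A caller could consume the returned list like this (a small sketch; the dictionary keys match the ones added above, and the query text is illustrative):

            // Hypothetical usage of SearchIndext: print rank, title, score and URL for each hit.
            List<Dictionary<string, string>> hits = SearchIndext("lucene highlighter");
            foreach (Dictionary<string, string> hit in hits)
            {
                Console.WriteLine(hit["rank"] + ". " + hit["title"] + " (" + hit["score"] + ") " + hit["url"]);
            }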
 protected internal virtual void ToDotFile(Analyzer a, string inputText, string localFileName)
 {
     StreamWriter w = new StreamWriter(new FileStream(localFileName, FileMode.Create), IOUtils.CHARSET_UTF_8);
     TokenStream ts = a.TokenStream("field", new StringReader(inputText));
     ts.Reset();
     (new TokenStreamToDot(inputText, ts,/* new PrintWriter(*/w/*)*/)).ToDot();
     w.Dispose();
 }
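
Either helper can then be used to visualize a token graph with Graphviz. A minimal sketch, assuming WhitespaceAnalyzer is available (the version constant depends on your Lucene.Net release) and using an illustrative output path:

     // Hypothetical usage: dump the token graph for a phrase as Graphviz "dot" text.
     Analyzer a = new WhitespaceAnalyzer(LuceneVersion.LUCENE_48);
     string dot = ToDot(a, "fast wi fi network is down");
     Console.WriteLine(dot); // or: ToDotFile(a, "fast wi fi network is down", "tokens.dot");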