Example #1
 public PayloadFilter(TokenStream in_Renamed, byte[] data, int offset, int length) : base(in_Renamed)
 {
     this.data   = data;
     this.length = length;
     this.offset = offset;
     payloadAtt  = AddAttribute <IPayloadAttribute>();
 }
Example #2
        public QueryTermVector(System.String queryString, Analyzer analyzer)
        {
            if (analyzer != null)
            {
                TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
                if (stream != null)
                {
                    List <string> terms = new List <string>();
                    try
                    {
                        bool hasMoreTokens = false;

                        stream.Reset();
                        TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

                        hasMoreTokens = stream.IncrementToken();
                        while (hasMoreTokens)
                        {
                            terms.Add(termAtt.Term());
                            hasMoreTokens = stream.IncrementToken();
                        }
                        ProcessTerms(terms.ToArray());
                    }
                    catch (System.IO.IOException)
                    {
                        // ignore: an unreadable query simply contributes no terms
                    }
                }
            }
        }
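A minimal usage sketch for this constructor, assuming the Lucene.Net 2.9-era API in which QueryTermVector also exposes GetTerms() and GetTermFrequencies(); the query text is a placeholder:

    // Build a term vector straight from the raw query text.
    Lucene.Net.Index.QueryTermVector vector = new Lucene.Net.Index.QueryTermVector(
        "quick brown fox quick", new Lucene.Net.Analysis.Standard.StandardAnalyzer());
    System.String[] terms = vector.GetTerms();
    int[] freqs = vector.GetTermFrequencies();
    for (int i = 0; i < terms.Length; i++)
    {
        System.Console.WriteLine(terms[i] + " : " + freqs[i]);   // e.g. "quick : 2"
    }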
Example #3
        internal static void  Test(System.IO.TextReader reader, bool verbose, long bytes)
        {
            Analyzer    analyzer = new SimpleAnalyzer();
            TokenStream stream   = analyzer.TokenStream(null, reader);

            System.DateTime start = System.DateTime.Now;

            int count = 0;

            for (Token t = stream.Next(); t != null; t = stream.Next())
            {
                if (verbose)
                {
                    System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
                }
                count++;
            }

            System.DateTime end = System.DateTime.Now;

            long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;   // Ticks are 100 ns units; the output below assumes milliseconds

            System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
            System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
            System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
        }
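A hedged driver for the helper above; "sample.txt" is a placeholder path, and the file length is passed in so the megabytes/hour figure is meaningful:

    System.IO.FileInfo file = new System.IO.FileInfo("sample.txt");
    using (System.IO.TextReader reader = new System.IO.StreamReader(file.FullName))
    {
        Test(reader, false /* verbose */, file.Length);
    }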
Example #4
        public string Search(string strQuery)
        {
            string result = string.Empty;

            Lucene.Net.Index.IndexReader        reader           = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"]));
            Lucene.Net.QueryParsers.QueryParser parser           = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", new Lucene.Net.Analysis.Standard.StandardAnalyzer());
            Lucene.Net.Search.Query             query            = parser.Parse(strQuery);
            Lucene.Net.Search.IndexSearcher     searcher         = new Lucene.Net.Search.IndexSearcher(reader);
            Lucene.Net.Search.Hits                   hits        = searcher.Search(query);
            Lucene.Net.Highlight.QueryScorer         score       = new Lucene.Net.Highlight.QueryScorer(query);
            Lucene.Net.Highlight.SimpleHTMLFormatter formater    = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>");
            Lucene.Net.Highlight.Highlighter         highlighter = new Lucene.Net.Highlight.Highlighter(formater, score);
            result += "<div align='right' style='background-color:#F0F7F9; padding-right:15px' height='30px'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #005482; FONT-FAMILY: arial'>Kết quả tìm thấy : " + hits.Length() + "  </font></div>";
            result += "<div style='padding: 10px 10px 10px 10px;'>";
            for (int i = 0; i < hits.Length(); i++)
            {
                string id     = hits.Doc(i).Get("ArticleId");
                string title  = hits.Doc(i).Get("ArticleTitle");
                string detail = hits.Doc(i).Get("ArticleDetail");
                Lucene.Net.Analysis.TokenStream ts = (new Lucene.Net.Analysis.Standard.StandardAnalyzer()).TokenStream("ArticleDetail", new System.IO.StringReader(detail));
                result += string.Format("<div align='left'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #5b5b5b; FONT-FAMILY: arial'><a href='/?ArticleId={0}'>{1}</a></font>", id, title);
                result += string.Format("<div align='left'><font style='FONT-SIZE: 9pt' face='Arial' color='#005482'>...{0}...</font></div></div></br>", highlighter.GetBestFragment(ts, detail));
            }
            result += "</div>";
            reader.Close();
            return(result);
        }
        public virtual void  TearDown()
        {
            try
            {
                // this isn't as useful as calling directly from the scope where the
                // index readers are used, because they could be gc'ed just before
                // tearDown is called.
                // But it's better than nothing.
                AssertSaneFieldCaches(GetTestLabel());

                if (ConcurrentMergeScheduler.AnyUnhandledExceptions())
                {
                    // Clear the failure so that we don't just keep
                    // failing subsequent test cases
                    ConcurrentMergeScheduler.ClearUnhandledExceptions();
                    Assert.Fail("ConcurrentMergeScheduler hit unhandled exceptions");
                }
            }
            finally
            {
                PurgeFieldCache(Lucene.Net.Search.FieldCache_Fields.DEFAULT);
            }

            TokenStream.SetOnlyUseNewAPI(savedAPISetting);
            //base.TearDown();  // {{Aroush-2.9}}
            this.seed_init = false;

            //{{Lucene.Net-2.9.1}}
            Lucene.Net.Search.BooleanQuery.SetAllowDocsOutOfOrder(allowDocsOutOfOrder);
        }
Example #6
        /// <summary> Create a tokenized and indexed field that is not stored, optionally with
        /// storing term vectors.  This is useful for pre-analyzed fields.
        /// The TokenStream is read only when the Document is added to the index,
        /// i.e. you may not close the TokenStream until {@link IndexWriter#AddDocument(Document)}
        /// has been called.
        ///
        /// </summary>
        /// <param name="name">The name of the field
        /// </param>
        /// <param name="tokenStream">The TokenStream with the content
        /// </param>
        /// <param name="termVector">Whether term vector should be stored
        /// </param>
        /// <throws>  NullPointerException if name or tokenStream is <code>null</code> </throws>
        public Field(System.String name, TokenStream tokenStream, TermVector termVector)
        {
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }
            if (tokenStream == null)
            {
                throw new System.NullReferenceException("tokenStream cannot be null");
            }

            this.name        = StringHelper.Intern(name);      // field names are interned
            this.fieldsData  = null;
            this.tokenStream = tokenStream;

            this.isStored     = false;
            this.isCompressed = false;

            this.isIndexed   = true;
            this.isTokenized = true;

            this.isBinary = false;

            SetStoreTermVector(termVector);
        }
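A usage sketch for this constructor (Lucene.Net 2.9-era API assumed, with using directives for Lucene.Net.Analysis, Lucene.Net.Documents, Lucene.Net.Index and Lucene.Net.Store; the directory, analyzer and content are placeholders). It illustrates the point from the summary: the TokenStream is only consumed inside AddDocument, so it must stay open until that call returns:

    Directory dir = new RAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

    TokenStream pre = analyzer.TokenStream("body", new System.IO.StringReader("pre analyzed content"));
    Document doc = new Document();
    doc.Add(new Field("body", pre, Field.TermVector.YES));   // tokenized and indexed, not stored

    writer.AddDocument(doc);   // the TokenStream is read here, not before
    writer.Close();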
Example #7
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
        {
            TokenStream ts = analyzer.TokenStream(fieldName, r);

            Lucene.Net.Analysis.Token token;
            int tokenCount = 0;

            while ((token = ts.Next()) != null)
            {
                // for every token
                System.String word = token.TermText();
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = (Int)termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
Example #8
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        protected void AddTermFrequencies(System.IO.TextReader r, IDictionary <string, Int> termFreqMap, System.String fieldName)
        {
            TokenStream ts         = analyzer.TokenStream(fieldName, r);
            int         tokenCount = 0;
            // for every token
            ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>();

            while (ts.IncrementToken())
            {
                string word = termAtt.Term;
                tokenCount++;
                if (tokenCount > maxNumTokensParsed)
                {
                    break;
                }
                if (IsNoiseWord(word))
                {
                    continue;
                }

                // increment frequency
                Int cnt = termFreqMap[word];
                if (cnt == null)
                {
                    termFreqMap[word] = new Int();
                }
                else
                {
                    cnt.x++;
                }
            }
        }
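For comparison, a minimal sketch of the full consumption lifecycle behind the attribute-based loop above (Lucene.Net 3.0-era naming assumed; analyzer and text are placeholders): Reset before the loop, End after it, and dispose of the stream when finished:

    Analyzer analyzer = new Lucene.Net.Analysis.WhitespaceAnalyzer();
    using (TokenStream ts = analyzer.TokenStream("body", new System.IO.StringReader("one two three")))
    {
        ITermAttribute termAtt = ts.AddAttribute<ITermAttribute>();
        ts.Reset();                        // position the stream before the first token
        while (ts.IncrementToken())
        {
            System.Console.WriteLine(termAtt.Term);
        }
        ts.End();                          // record the final offset state
    }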
 public TestFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed) : base(in_Renamed)
 {
     InitBlock(enclosingInstance);
     termAtt    = AddAttribute <ITermAttribute>();
     posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     offsetAtt  = AddAttribute <IOffsetAttribute>();
     typeAtt    = AddAttribute <ITypeAttribute>();
 }
Example #10
 public PayloadFilter(TokenStream input, System.String fieldName) : base(input)
 {
     this.fieldName = fieldName;
     pos            = 0;
     i           = 0;
     posIncrAttr = (PositionIncrementAttribute)input.AddAttribute(typeof(PositionIncrementAttribute));
     payloadAttr = (PayloadAttribute)input.AddAttribute(typeof(PayloadAttribute));
     termAttr    = (TermAttribute)input.AddAttribute(typeof(TermAttribute));
 }
Example #11
        public string SearchAndPaging(string strQuery, string index)
        {
            string result = string.Empty;

            try
            {
                List <SearchArticle>            searchArticleList = new List <SearchArticle>();
                PSCPortal.CMS.ArticleCollection ArticleList       = ArticleCollection.GetArticleCollectionPublish();
                string         nameSub       = Libs.Ultility.GetSubDomain() == string.Empty ? "HomePage" : Libs.Ultility.GetSubDomain();
                SubDomain      subDomain     = PSCPortal.Engine.SubDomain.GetSubByName(nameSub);
                PageCollection pagesBelongTo = subDomain.GetPagesBelongTo();
                string         strId         = string.Empty;
                foreach (var page in pagesBelongTo)
                {
                    foreach (var ar in ArticleList.Where(ar => ar.PageId == page.Id))
                    {
                        strId += ar.Id + " OR ";
                    }
                    if (strId.Length > 0)
                    {
                        strId = strId.Remove(strId.Length - 3, 3);
                    }
                }
                int    pageIndex = Int32.Parse(index);
                string strSearch = " ArticleDetail:(" + strQuery + ") AND ArticleId:" + "( " + strId + " )";
                Lucene.Net.Index.IndexReader        reader           = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"]));
                Lucene.Net.QueryParsers.QueryParser parser           = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", new Lucene.Net.Analysis.Standard.StandardAnalyzer());
                Lucene.Net.Search.Query             query            = parser.Parse(strSearch);
                Lucene.Net.Search.IndexSearcher     searcher         = new Lucene.Net.Search.IndexSearcher(reader);
                Lucene.Net.Search.Hits                   hits        = searcher.Search(query);
                Lucene.Net.Highlight.QueryScorer         score       = new Lucene.Net.Highlight.QueryScorer(query);
                Lucene.Net.Highlight.SimpleHTMLFormatter formater    = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>");
                Lucene.Net.Highlight.Highlighter         highlighter = new Lucene.Net.Highlight.Highlighter(formater, score);
                result += hits.Length() + "_" + "<div class='blog_news'><div class='topic_news_title1'><div class='topic_news_title'><a href='#'>Kết quả tìm thấy: " + hits.Length() + "</a></div></div>";
                result += "<div class='ct_topic_l'><div class='ct_topic_r1'>";
                for (int i = pageIndex * 20 - 20; i < pageIndex * 20 && i < hits.Length(); i++)
                {
                    string detail = hits.Doc(i).Get("ArticleDetail");
                    Lucene.Net.Analysis.TokenStream ts = (new Lucene.Net.Analysis.Standard.StandardAnalyzer()).TokenStream("ArticleDetail", new System.IO.StringReader(detail));
                    SearchArticle searchArticle        = new SearchArticle();
                    searchArticle.Id        = hits.Doc(i).Get("ArticleId");
                    searchArticle.Title     = hits.Doc(i).Get("ArticleTitle");
                    searchArticle.Highligth = highlighter.GetBestFragment(ts, detail);
                    searchArticleList.Add(searchArticle);
                }
                reader.Close();
                JavaScriptSerializer        serializer = new JavaScriptSerializer();
                Dictionary <string, object> resultDic  = new Dictionary <string, object>();
                resultDic["Count"] = hits.Length();
                resultDic["Data"]  = searchArticleList;
                result             = serializer.Serialize(resultDic);
            }
            catch (Exception)
            {
                // ignore: on failure the caller simply receives whatever was built (possibly an empty result)
            }
            return(result);
        }
Example #12
 /// <summary> Highlights chosen terms in a text, extracting the most relevant section.
 /// The document text is analysed in chunks to record hit statistics
 /// across the document. After accumulating stats, the fragment with the highest score
 /// is returned
 ///
 /// </summary>
 /// <param name="tokenStream">  a stream of tokens identified in the text parameter, including offset information.
 /// This is typically produced by an analyzer re-parsing a document's
 /// text. Some work may be done on retrieving TokenStreams more efficently
 /// by adding support for storing original text position data in the Lucene
 /// index but this support is not currently available (as of Lucene 1.4 rc2).
 /// </param>
 /// <param name="text">text to highlight terms in
 ///
 /// </param>
 /// <returns> highlighted text fragment or null if no terms found
 /// </returns>
 public System.String GetBestFragment(TokenStream tokenStream, System.String text)
 {
     System.String[] results = GetBestFragments(tokenStream, text, 1);
     if (results.Length > 0)
     {
         return(results[0]);
     }
     return(null);
 }
        public virtual void  SetUp()
        {
            //{{Lucene.Net-2.9.1}}
            allowDocsOutOfOrder = Lucene.Net.Search.BooleanQuery.GetAllowDocsOutOfOrder();

            ConcurrentMergeScheduler.SetTestMode();

            savedAPISetting = TokenStream.GetOnlyUseNewAPI();
            TokenStream.SetOnlyUseNewAPI(false);
        }
 public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName) : base(input)
 {
     InitBlock(enclosingInstance);
     this.fieldName = fieldName;
     pos            = 0;
     CollectionsHelper.AddIfNotContains(entities, "xx");
     CollectionsHelper.AddIfNotContains(entities, "one");
     CollectionsHelper.AddIfNotContains(nopayload, "nopayload");
     CollectionsHelper.AddIfNotContains(nopayload, "np");
     termAtt    = AddAttribute <ITermAttribute>();
     posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
     payloadAtt = AddAttribute <IPayloadAttribute>();
 }
		/// <summary>Construct the named stemming filter.
		/// 
		/// </summary>
        /// <param name="input">the input tokens to stem
		/// </param>
		/// <param name="name">the name of a stemmer
		/// </param>
		public SnowballFilter(TokenStream input, System.String name) : base(input)
		{
			try
			{
				System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
				stemmer = (SnowballProgram) System.Activator.CreateInstance(stemClass);
			}
			catch (System.Exception e)
			{
				throw new System.SystemException(e.ToString());
			}
		    termAtt = AddAttribute<ITermAttribute>();
		}
Example #16
 /// <summary>Construct the named stemming filter.
 ///
 /// </summary>
 /// <param name="input">the input tokens to stem
 /// </param>
 /// <param name="name">the name of a stemmer
 /// </param>
 public SnowballFilter(TokenStream input, System.String name) : base(input)
 {
     try
     {
         System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
         stemmer = (SnowballProgram)System.Activator.CreateInstance(stemClass);
     }
     catch (System.Exception e)
     {
         throw new System.SystemException(e.ToString());
     }
     termAtt = AddAttribute <ITermAttribute>();
 }
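A sketch of wiring this filter into an analysis chain; per the constructor above, the stemmer name "English" is expected to resolve to SF.Snowball.Ext.EnglishStemmer, while the version constant and reader content are placeholders:

    System.IO.TextReader reader = new System.IO.StringReader("running runs easily");
    TokenStream chain = new Lucene.Net.Analysis.Standard.StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
    chain = new Lucene.Net.Analysis.LowerCaseFilter(chain);
    chain = new SnowballFilter(chain, "English");   // e.g. "running" is stemmed to "run"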
Example #17
 public void  SetValue(TokenStream value_Renamed)
 {
     if (isBinary)
     {
         throw new System.ArgumentException("cannot set a TokenStream value on a binary field");
     }
     if (isStored)
     {
         throw new System.ArgumentException("cannot set a TokenStream value on a stored field");
     }
     fieldsData  = null;
     tokenStream = value_Renamed;
 }
		/// <summary>Construct the named stemming filter.
		/// 
		/// </summary>
		/// <param name="in">the input tokens to stem
		/// </param>
		/// <param name="name">the name of a stemmer
		/// </param>
		public SnowballFilter(TokenStream in_Renamed, System.String name) : base(in_Renamed)
		{
			try
			{
				System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
				stemmer = (SnowballProgram) System.Activator.CreateInstance(stemClass);
				// why doesn't the SnowballProgram class have an (abstract?) stem method?
				stemMethod = stemClass.GetMethod("Stem", (new System.Type[0] == null) ? new System.Type[0] : (System.Type[]) new System.Type[0]);
			}
			catch (System.Exception e)
			{
				throw new System.SystemException(e.ToString());
			}
		}
Example #19
 /// <summary> Highlights terms in the  text , extracting the most relevant sections
 /// and concatenating the chosen fragments with a separator (typically "...").
 /// The document text is analysed in chunks to record hit statistics
 /// across the document. After accumulating stats, the fragments with the highest scores
 /// are returned in order as "separator" delimited strings.
 ///
 /// </summary>
 /// <param name="text">       text to highlight terms in
 /// </param>
 /// <param name="maxNumFragments"> the maximum number of fragments.
 /// </param>
 /// <param name="separator"> the separator used to intersperse the document fragments (typically "...")
 ///
 /// </param>
 /// <returns> highlighted text
 /// </returns>
 public System.String GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments, System.String separator)
 {
     System.String[]           sections = GetBestFragments(tokenStream, text, maxNumFragments);
     System.Text.StringBuilder result   = new System.Text.StringBuilder();
     for (int i = 0; i < sections.Length; i++)
     {
         if (i > 0)
         {
             result.Append(separator);
         }
         result.Append(sections[i]);
     }
     return(result.ToString());
 }
 /// <summary>Construct the named stemming filter.
 ///
 /// </summary>
 /// <param name="in">the input tokens to stem
 /// </param>
 /// <param name="name">the name of a stemmer
 /// </param>
 public SnowballFilter(TokenStream in_Renamed, System.String name) : base(in_Renamed)
 {
     try
     {
         System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer");
         stemmer = (SnowballProgram)System.Activator.CreateInstance(stemClass);
         // why doesn't the SnowballProgram class have an (abstract?) stem method?
         stemMethod = stemClass.GetMethod("Stem", (new System.Type[0] == null) ? new System.Type[0] : (System.Type[]) new System.Type[0]);
     }
     catch (System.Exception e)
     {
         throw new System.SystemException(e.ToString());
     }
 }
        public virtual void  TestIncrementingPositions()
        {
            Analyzer    analyzer = new WhitespaceAnalyzer();
            TokenStream ts       = analyzer.TokenStream("Field", new System.IO.StringReader("one two three four five"));

            while (true)
            {
                Token token = ts.Next();
                if (token == null)
                {
                    break;
                }
                Assert.AreEqual(1, token.GetPositionIncrement(), token.TermText());
            }
        }
Example #22
        /// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragments with the highest scores
        /// are returned as an array of strings in order of score (contiguous fragments are merged into
        /// one in their original order to improve readability)
        ///
        /// </summary>
        /// <param name="text">         text to highlight terms in
        /// </param>
        /// <param name="maxNumFragments"> the maximum number of fragments.
        ///
        /// </param>
        /// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
        /// </returns>
        public System.String[] GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments)
        {
            maxNumFragments = System.Math.Max(1, maxNumFragments);             //sanity check

            TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);

            //Get text
            System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
            for (int i = 0; i < frag.Length; i++)
            {
                if ((frag[i] != null) && (frag[i].GetScore() > 0))
                {
                    fragTexts.Add(frag[i].ToString());
                }
            }
            return((System.String[])fragTexts.ToArray(typeof(System.String)));
        }
Example #23
        public IEnumerable <SampleHit> Search(string query_str)
        {
            List <SampleHit> result_hits = new List <SampleHit>();

            using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder))
            {
                Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);
                //Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT);

                using (IndexSearcher searcher = new IndexSearcher(luceneIndexDirectory))
                {
                    QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_CURRENT, IndexModel.LineText, analyzer);
                    Query       query  = parser.Parse(query_str);

                    TopDocs hits = searcher.Search(query, max_search_hits);

                    // code highlighting
                    var formatter  = new Lucene.Net.Search.Highlight.SimpleHTMLFormatter("<span style=\"background:yellow;\">", "</span>");
                    var fragmenter = new Lucene.Net.Search.Highlight.SimpleFragmenter(200);
                    Lucene.Net.Search.Highlight.QueryScorer scorer      = new Lucene.Net.Search.Highlight.QueryScorer(query);
                    Lucene.Net.Search.Highlight.Highlighter highlighter = new Lucene.Net.Search.Highlight.Highlighter(formatter, scorer);
                    highlighter.TextFragmenter = fragmenter;

                    foreach (ScoreDoc hit in hits.ScoreDocs)
                    {
                        Document doc   = searcher.Doc(hit.Doc);
                        float    score = hit.Score;

                        Field line_number = doc.GetField(IndexModel.LineNumber);
                        Field line_text   = doc.GetField(IndexModel.LineText);

                        Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(line_text.StringValue));
                        string highlightedText = highlighter.GetBestFragments(stream, doc.Get(IndexModel.LineText), 1, "...");

                        result_hits.Add(new SampleHit {
                            line_number = line_number.StringValue, sample_text = line_text.StringValue, html_highlighting = highlightedText
                        });
                    }
                }
            }


            return(result_hits);
        }
Example #24
        /// <summary> A convenience method that tries a number of approaches to getting a token stream.
        /// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
        /// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
        /// </summary>
        /// <param name="">reader
        /// </param>
        /// <param name="">docId
        /// </param>
        /// <param name="">field
        /// </param>
        /// <param name="">analyzer
        /// </param>
        /// <returns> null if field not stored correctly
        /// </returns>
        /// <throws>  IOException </throws>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
        {
            TokenStream ts = null;

            TermFreqVector tfv = (TermFreqVector)reader.GetTermFreqVector(docId, field);

            if (tfv != null)
            {
                if (tfv is TermPositionVector)
                {
                    ts = GetTokenStream((TermPositionVector)tfv);
                }
            }
            //No token info stored so fall back to analyzing raw content
            if (ts == null)
            {
                ts = GetTokenStream(reader, docId, field, analyzer);
            }
            return(ts);
        }
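A hedged sketch of how this helper is typically used when highlighting a stored field (assuming it lives in the contrib TokenSources class, with the 2.9-era Highlighter API; reader, docId, query and the field name are placeholders):

    TokenStream ts = TokenSources.GetAnyTokenStream(reader, docId, "contents", analyzer);
    Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(new Lucene.Net.Highlight.QueryScorer(query));
    System.String storedText = reader.Document(docId).Get("contents");
    System.String fragment = highlighter.GetBestFragment(ts, storedText);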
Example #25
        public virtual void  TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

            inWords = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            sampleUnicode = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("Unicode"));

            TokenStream in_Renamed = ra.TokenStream("all", inWords);

            RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
            }

            inWords.Close();
            sampleUnicode.Close();
        }
Example #26
        public virtual void  TestKOI8()
        {
            //System.out.println(new java.util.Date());
            RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

            // KOI8
            inWordsKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sampleKOI8 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            TokenStream            in_Renamed = ra.TokenStream("all", inWordsKOI8);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
            }

            inWordsKOI8.Close();
            sampleKOI8.Close();
        }
Example #27
        public virtual void  Test1251()
        {
            // 1251
            inWords1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            sample1251 = new System.IO.StreamReader(
                new System.IO.FileStream(
                    new System.IO.FileInfo(
                        dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
                    System.IO.FileMode.Open,
                    System.IO.FileAccess.Read),
                System.Text.Encoding.GetEncoding("iso-8859-1"));

            RussianAnalyzer        ra         = new RussianAnalyzer(RussianCharsets.CP1251);
            TokenStream            in_Renamed = ra.TokenStream("", inWords1251);
            RussianLetterTokenizer sample     = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

            for (; ;)
            {
                Token token = in_Renamed.Next();

                if (token == null)
                {
                    break;
                }

                Token sampleToken = sample.Next();
                Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
            }

            inWords1251.Close();
            sample1251.Close();
        }
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 Token reusableToken = new Token();
                 for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                 {
                     terms.Add(nextToken.Term());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
             catch (System.IO.IOException)
             {
             }
         }
     }
 }
 public QueryTermVector(System.String queryString, Analyzer analyzer)
 {
     if (analyzer != null)
     {
         TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
         if (stream != null)
         {
             Token next = null;
             System.Collections.ArrayList terms = new System.Collections.ArrayList();
             try
             {
                 while ((next = stream.Next()) != null)
                 {
                     terms.Add(next.TermText());
                 }
                 ProcessTerms((System.String[])terms.ToArray(typeof(System.String)));
             }
             catch (System.IO.IOException)
             {
             }
         }
     }
 }
Example #30
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        ///
        /// <p/>
        ///
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        ///
        /// <p/>
        /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>.
        ///
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        ///
        /// <P/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add(BooleanClause)"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        ///
        ///
        ///
        ///
        ///
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));

            Lucene.Net.Analysis.Token t;
            BooleanQuery tmp = new BooleanQuery();

            System.Collections.Hashtable already = new System.Collections.Hashtable();             // ignore dups
            while ((t = ts.Next()) != null)
            {
                System.String word = t.TermText();
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }
                // ignore dups
                if (already.Contains(word) == true)
                {
                    continue;
                }
                already.Add(word, word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, BooleanClause.Occur.SHOULD);                     //false, false);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return(tmp);
        }
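A usage sketch (assuming this method sits in the contrib SimilarityQueries class; searcher, the indexed field name and the source text are placeholders). As the summary notes, the top hit is normally the source document itself, so it is skipped:

    Query similar = FormSimilarQuery(sourceText, new Lucene.Net.Analysis.Standard.StandardAnalyzer(), "contents", null);
    Hits hits = searcher.Search(similar);
    for (int i = 1; i < hits.Length(); i++)   // start at 1: hit 0 should be the source document
    {
        System.Console.WriteLine(hits.Doc(i).Get("title"));
    }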
				public AnonymousClassTokenFilter(AnonymousClassAnalyzer enclosingInstance, TokenStream ts) : base(ts)
				{
					InitBlock(enclosingInstance);
				}
		/// <summary> Low level api to get the most relevant (formatted) sections of the document.
		/// This method has been made public to allow visibility of score information held in TextFragment objects.
		/// Thanks to Jason Calabrese for help in redefining the interface.  
		/// </summary>
		/// <param name="">tokenStream
		/// </param>
		/// <param name="">text
		/// </param>
		/// <param name="">maxNumFragments
		/// </param>
		/// <param name="">mergeContiguousFragments
		/// </param>
		/// <throws>  IOException </throws>
		public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments)
		{
			System.Collections.ArrayList docFrags = new System.Collections.ArrayList();
			System.Text.StringBuilder newText = new System.Text.StringBuilder();
			
			TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
			fragmentScorer.StartFragment(currentFrag);
			docFrags.Add(currentFrag);
			
			FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
			
			try
			{
				Lucene.Net.Analysis.Token token;
				System.String tokenText;
				int startOffset;
				int endOffset;
				int lastEndOffset = 0;
				textFragmenter.Start(text);
				
				TokenGroup tokenGroup = new TokenGroup();
				token = tokenStream.Next();
				while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze))
				{
					if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token)))
					{
						//the current token is distinct from previous tokens - 
						// markup the cached token group info
						startOffset = tokenGroup.matchStartOffset;
						endOffset = tokenGroup.matchEndOffset;
						tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
						System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
						//store any whitespace etc from between this and last group
						if (startOffset > lastEndOffset)
							newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
						newText.Append(markedUpText);
						lastEndOffset = System.Math.Max(endOffset, lastEndOffset);
						tokenGroup.Clear();
						
						//check if current token marks the start of a new fragment						
						if (textFragmenter.IsNewFragment(token))
						{
							currentFrag.SetScore(fragmentScorer.GetFragmentScore());
							//record stats for a new fragment
							currentFrag.textEndPos = newText.Length;
							currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
							fragmentScorer.StartFragment(currentFrag);
							docFrags.Add(currentFrag);
						}
					}
					
					tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token));
					
					//				if(lastEndOffset>maxDocBytesToAnalyze)
					//				{
					//					break;
					//				}
					token = tokenStream.Next();
				}
				currentFrag.SetScore(fragmentScorer.GetFragmentScore());
				
				if (tokenGroup.numTokens > 0)
				{
					//flush the accumulated text (same code as in above loop)
					startOffset = tokenGroup.matchStartOffset;
					endOffset = tokenGroup.matchEndOffset;
					tokenText = text.Substring(startOffset, (endOffset) - (startOffset));
					System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup);
					//store any whitespace etc from between this and last group
					if (startOffset > lastEndOffset)
						newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset))));
					newText.Append(markedUpText);
					lastEndOffset = System.Math.Max(lastEndOffset, endOffset);
				}
				
				//Test what remains of the original text beyond the point where we stopped analyzing 
				if ((lastEndOffset < text.Length) && (text.Length < maxDocBytesToAnalyze))
				{
					//append it to the last fragment
					newText.Append(encoder.EncodeText(text.Substring(lastEndOffset)));
				}
				
				currentFrag.textEndPos = newText.Length;
				
				//sort the most relevant sections of the text
				for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); )
				{
					currentFrag = (TextFragment) i.Current;
					
					//If you are running with a version of Lucene before 11th Sept 03
					// you do not have PriorityQueue.insert() - so uncomment the code below					
					/*
					if (currentFrag.getScore() >= minScore)
					{
					fragQueue.put(currentFrag);
					if (fragQueue.size() > maxNumFragments)
					{ // if hit queue overfull
					fragQueue.pop(); // remove lowest in hit queue
					minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
					}
					
					
					}
					*/
					//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
					//fix to PriorityQueue. The correct method to use here is the new "insert" method
					// USE ABOVE CODE IF THIS DOES NOT COMPILE!
					fragQueue.Insert(currentFrag);
				}
				
				//return the most relevant fragments
				TextFragment[] frag = new TextFragment[fragQueue.Size()];
				for (int i = frag.Length - 1; i >= 0; i--)
				{
					frag[i] = (TextFragment) fragQueue.Pop();
				}
				
				//merge any contiguous fragments to improve readability
				if (mergeContiguousFragments)
				{
					MergeContiguousFragments(frag);
					System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
					for (int i = 0; i < frag.Length; i++)
					{
						if ((frag[i] != null) && (frag[i].GetScore() > 0))
						{
							fragTexts.Add(frag[i]);
						}
					}
					frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment));
				}
				
				return frag;
			}
			finally
			{
				if (tokenStream != null)
				{
					try
					{
						tokenStream.Close();
					}
					catch (System.Exception)
					{
						// ignore: failing to close the stream should not mask the fragments already built
					}
				}
			}
		}
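Because this overload exposes the TextFragment objects directly, a short sketch (2.9-era API assumed; the analyzer, highlighter and text are placeholders) can inspect each fragment's score before deciding what to render:

    TokenStream ts = analyzer.TokenStream("contents", new System.IO.StringReader(text));
    TextFragment[] fragments = highlighter.GetBestTextFragments(ts, text, false, 5);
    foreach (TextFragment frag in fragments)
    {
        if (frag != null && frag.GetScore() > 0)
        {
            System.Console.WriteLine(frag.GetScore() + " : " + frag.ToString());
        }
    }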
		/// <summary> Highlights chosen terms in a text, extracting the most relevant sections.
		/// The document text is analysed in chunks to record hit statistics
		/// across the document. After accumulating stats, the fragments with the highest scores
		/// are returned as an array of strings in order of score (contiguous fragments are merged into 
		/// one in their original order to improve readability)
		/// 
		/// </summary>
		/// <param name="text">       	text to highlight terms in
		/// </param>
		/// <param name="maxNumFragments"> the maximum number of fragments.
		/// 
		/// </param>
		/// <returns> highlighted text fragments (between 0 and maxNumFragments number of fragments)
		/// </returns>
		public System.String[] GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments)
		{
			maxNumFragments = System.Math.Max(1, maxNumFragments); //sanity check
			
			TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);
			
			//Get text
			System.Collections.ArrayList fragTexts = new System.Collections.ArrayList();
			for (int i = 0; i < frag.Length; i++)
			{
				if ((frag[i] != null) && (frag[i].GetScore() > 0))
				{
					fragTexts.Add(frag[i].ToString());
				}
			}
			return (System.String[]) fragTexts.ToArray(typeof(System.String));
		}
Example #34
 /// <summary>Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. </summary>
 public void  SetValue(TokenStream value_Renamed)
 {
     fieldsData = value_Renamed;
 }
Example #35
 //private System.Reflection.MethodInfo stemMethod;
 public SnowballFilter(TokenStream input, SnowballProgram stemmer)
     : base(input)
 {
     this.stemmer = stemmer;
     termAtt = AddAttribute<ITermAttribute>();
 }
		/// <summary> Highlights chosen terms in a text, extracting the most relevant section.
		/// The document text is analysed in chunks to record hit statistics
		/// across the document. After accumulating stats, the fragment with the highest score
		/// is returned
		/// 
		/// </summary>
		/// <param name="tokenStream">  a stream of tokens identified in the text parameter, including offset information. 
		/// This is typically produced by an analyzer re-parsing a document's 
		/// text. Some work may be done on retrieving TokenStreams more efficiently
		/// by adding support for storing original text position data in the Lucene
		/// index but this support is not currently available (as of Lucene 1.4 rc2).  
		/// </param>
		/// <param name="text">text to highlight terms in
		/// 
		/// </param>
		/// <returns> highlighted text fragment or null if no terms found
		/// </returns>
		public System.String GetBestFragment(TokenStream tokenStream, System.String text)
		{
			System.String[] results = GetBestFragments(tokenStream, text, 1);
			if (results.Length > 0)
			{
				return results[0];
			}
			return null;
		}
		/// <summary> Highlights terms in the  text , extracting the most relevant sections
		/// and concatenating the chosen fragments with a separator (typically "...").
		/// The document text is analysed in chunks to record hit statistics
		/// across the document. After accumulating stats, the fragments with the highest scores
		/// are returned in order as "separator" delimited strings.
		/// 
		/// </summary>
		/// <param name="text">       text to highlight terms in
		/// </param>
		/// <param name="maxNumFragments"> the maximum number of fragments.
		/// </param>
		/// <param name="separator"> the separator used to intersperse the document fragments (typically "...")
		/// 
		/// </param>
		/// <returns> highlighted text
		/// </returns>
		public System.String GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments, System.String separator)
		{
			System.String[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
			System.Text.StringBuilder result = new System.Text.StringBuilder();
			for (int i = 0; i < sections.Length; i++)
			{
				if (i > 0)
				{
					result.Append(separator);
				}
				result.Append(sections[i]);
			}
			return result.ToString();
		}
 public PayloadFilter(TestPayloadNearQuery enclosingInstance, TokenStream input, System.String fieldName):base(input)
 {
     InitBlock(enclosingInstance);
     this.fieldName = fieldName;
     payAtt = AddAttribute<IPayloadAttribute>();
 }
			/// <summary> Filter which discards the token 'stop' and which expands the
			/// token 'phrase' into 'phrase1 phrase2'
			/// </summary>
			public QPTestFilter(TokenStream in_Renamed):base(in_Renamed)
			{
				termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
				offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
			}
			public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName):base(input)
			{
				InitBlock(enclosingInstance);
				this.fieldName = fieldName;
				pos = 0;
				Support.CollectionsHelper.AddIfNotContains(entities, "xx");
				Support.CollectionsHelper.AddIfNotContains(entities, "one");
				Support.CollectionsHelper.AddIfNotContains(nopayload, "nopayload");
				Support.CollectionsHelper.AddIfNotContains(nopayload, "np");
				termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
				posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
				payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute));
			}
			protected internal PayloadFilter(TokenStream input):base(input)
			{
				payloadAtt =  AddAttribute<IPayloadAttribute>();
			}
 public PayloadFilter(TestPayloadSpans outerInstance, TokenStream input)
     : base(input)
 {
     this.OuterInstance = outerInstance;
     Pos = 0;
     Entities.Add("xx");
     Entities.Add("one");
     Nopayload.Add("nopayload");
     Nopayload.Add("np");
     TermAtt = AddAttribute<ICharTermAttribute>();
     PosIncrAtt = AddAttribute<IPositionIncrementAttribute>();
     PayloadAtt = AddAttribute<IPayloadAttribute>();
 }
			public CrashingFilter(TestIndexWriter enclosingInstance, System.String fieldName, TokenStream input):base(input)
			{
				InitBlock(enclosingInstance);
				this.fieldName = fieldName;
			}
 public TestFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed):base(in_Renamed)
 {
     InitBlock(enclosingInstance);
     termAtt =  AddAttribute<ITermAttribute>();
     posIncrAtt =  AddAttribute<IPositionIncrementAttribute>();
     offsetAtt =  AddAttribute<IOffsetAttribute>();
     typeAtt =  AddAttribute<ITypeAttribute>();
 }
 public TokenStream Init(TokenStream tokenStream)
 {
     return null;
 }
Example #46
		public void  SetValue(TokenStream value_Renamed)
		{
			if (isBinary)
			{
				throw new System.ArgumentException("cannot set a TokenStream value on a binary field");
			}
			if (isStored)
			{
				throw new System.ArgumentException("cannot set a TokenStream value on a stored field");
			}
			fieldsData = null;
			tokenStream = value_Renamed;
		}
		public SynonymTokenizer(TokenStream realStream, System.Collections.IDictionary synonyms)
		{
			this.realStream = realStream;
			this.synonyms = synonyms;
		}
Example #48
		/// <summary>Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true.
		/// May be combined with stored values from stringValue() or binaryValue() 
		/// </summary>
		public void  SetTokenStream(TokenStream tokenStream)
		{
			this.isIndexed = true;
			this.isTokenized = true;
			this.tokenStream = tokenStream;
		}
			protected internal PayloadFilter(TokenStream input):base(input)
			{
				payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute));
			}
Example #50
		/// <summary> Create a tokenized and indexed field that is not stored. Term vectors will
		/// not be stored. This is useful for pre-analyzed fields.
		/// The TokenStream is read only when the Document is added to the index,
		/// i.e. you may not close the TokenStream until {@link IndexWriter#AddDocument(Document)}
		/// has been called.
		/// 
		/// </summary>
		/// <param name="name">The name of the field
		/// </param>
		/// <param name="tokenStream">The TokenStream with the content
		/// </param>
		/// <throws>  NullPointerException if name or tokenStream is <code>null</code> </throws>
		public Field(System.String name, TokenStream tokenStream):this(name, tokenStream, TermVector.NO)
		{
		}
			public TestPosIncrementFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed):base(in_Renamed)
			{
				InitBlock(enclosingInstance);
				termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
				posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
			}
Example #52
		/// <summary> Create a tokenized and indexed field that is not stored, optionally with 
		/// storing term vectors.  This is useful for pre-analyzed fields.
		/// The TokenStream is read only when the Document is added to the index,
		/// i.e. you may not close the TokenStream until {@link IndexWriter#AddDocument(Document)}
		/// has been called.
		/// 
		/// </summary>
		/// <param name="name">The name of the field
		/// </param>
		/// <param name="tokenStream">The TokenStream with the content
		/// </param>
		/// <param name="termVector">Whether term vector should be stored
		/// </param>
		/// <throws>  NullPointerException if name or tokenStream is <code>null</code> </throws>
		public Field(System.String name, TokenStream tokenStream, TermVector termVector)
		{
			if (name == null)
				throw new System.NullReferenceException("name cannot be null");
			if (tokenStream == null)
				throw new System.NullReferenceException("tokenStream cannot be null");
			
			this.name = StringHelper.Intern(name); // field names are interned
			this.fieldsData = null;
			this.tokenStream = tokenStream;
			
			this.isStored = false;
			this.isCompressed = false;
			
			this.isIndexed = true;
			this.isTokenized = true;
			
			this.isBinary = false;
			
			SetStoreTermVector(termVector);
		}
Example #53
 /// <summary> Create a tokenized and indexed field that is not stored. Term vectors will
 /// not be stored. This is useful for pre-analyzed fields.
 /// The TokenStream is read only when the Document is added to the index,
 /// i.e. you may not close the TokenStream until {@link IndexWriter#AddDocument(Document)}
 /// has been called.
 ///
 /// </summary>
 /// <param name="name">The name of the field
 /// </param>
 /// <param name="tokenStream">The TokenStream with the content
 /// </param>
 /// <throws>  NullPointerException if name or tokenStream is <code>null</code> </throws>
 public Field(System.String name, TokenStream tokenStream) : this(name, tokenStream, TermVector.NO)
 {
 }
Example #54
			public PayloadFilter(TokenStream in_Renamed, byte[] data, int offset, int length):base(in_Renamed)
			{
				this.data = data;
				this.length = length;
				this.offset = offset;
				payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute));
			}
			public PayloadFilter(TestBoostingTermQuery enclosingInstance, TokenStream input, System.String fieldName):base(input)
			{
				InitBlock(enclosingInstance);
				this.fieldName = fieldName;
				payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute));
			}
		public PayloadFilter(TokenStream input, System.String fieldName):base(input)
		{
			this.fieldName = fieldName;
			pos = 0;
			i = 0;
			posIncrAttr = (PositionIncrementAttribute) input.AddAttribute(typeof(PositionIncrementAttribute));
			payloadAttr = (PayloadAttribute) input.AddAttribute(typeof(PayloadAttribute));
			termAttr = (TermAttribute) input.AddAttribute(typeof(TermAttribute));
		}
 /// <summary>Construct filtering <i>in</i>. </summary>
 public StandardFilter(TokenStream in_Renamed)
     : base(in_Renamed)
 {
     termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
     typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
 }
		public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed)
		{
			this.stemmer = stemmer;
			this.stemMethod = stemMethod;
		}
			protected internal PayloadFilter(TokenStream input):base(input)
			{
			}
		/// <summary>Construct filtering <i>in</i>. </summary>
		public StandardFilter(TokenStream in_Renamed):base(in_Renamed)
		{
		}