public PayloadFilter(TokenStream in_Renamed, byte[] data, int offset, int length) : base(in_Renamed) { this.data = data; this.length = length; this.offset = offset; payloadAtt = AddAttribute <IPayloadAttribute>(); }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { List <string> terms = new List <string>(); try { stream.Reset(); TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute)); bool hasMoreTokens = stream.IncrementToken(); while (hasMoreTokens) { terms.Add(termAtt.Term()); hasMoreTokens = stream.IncrementToken(); } ProcessTerms(terms.ToArray()); } catch (System.IO.IOException) { /* ignore: process whatever terms were collected */ } } } }
internal static void Test(System.IO.TextReader reader, bool verbose, long bytes) { Analyzer analyzer = new SimpleAnalyzer(); TokenStream stream = analyzer.TokenStream(null, reader); System.DateTime start = System.DateTime.Now; int count = 0; for (Token t = stream.Next(); t != null; t = stream.Next()) { if (verbose) { System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset()); } count++; } System.DateTime end = System.DateTime.Now; long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond; /* DateTime ticks are 100 ns units; convert to milliseconds so the figures below are labelled correctly */ System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens"); System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token"); System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour"); }
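A minimal driver for the benchmark above (a sketch only: it assumes the call happens inside the same class that declares Test, and "sample.txt" is a hypothetical input file):
// Sketch: drive the tokenization benchmark over one file.
System.IO.FileInfo file = new System.IO.FileInfo("sample.txt");
using (System.IO.TextReader reader = new System.IO.StreamReader(file.FullName))
{
    Test(reader, false, file.Length); // verbose=false; file.Length feeds the megabytes/hour figure
}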
public string Search(string strQuery) { string result = string.Empty; Lucene.Net.Index.IndexReader reader = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"])); Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", new Lucene.Net.Analysis.Standard.StandardAnalyzer()); Lucene.Net.Search.Query query = parser.Parse(strQuery); Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(reader); Lucene.Net.Search.Hits hits = searcher.Search(query); Lucene.Net.Highlight.QueryScorer score = new Lucene.Net.Highlight.QueryScorer(query); Lucene.Net.Highlight.SimpleHTMLFormatter formater = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>"); Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formater, score); result += "<div align='right' style='background-color:#F0F7F9; padding-right:15px' height='30px'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #005482; FONT-FAMILY: arial'>Results found: " + hits.Length() + " </font></div>"; result += "<div style='padding: 10px 10px 10px 10px;'>"; for (int i = 0; i < hits.Length(); i++) { string id = hits.Doc(i).Get("ArticleId"); string title = hits.Doc(i).Get("ArticleTitle"); string detail = hits.Doc(i).Get("ArticleDetail"); Lucene.Net.Analysis.TokenStream ts = (new Lucene.Net.Analysis.Standard.StandardAnalyzer()).TokenStream("ArticleDetail", new System.IO.StringReader(detail)); result += string.Format("<div align='left'><font style='FONT-WEIGHT: bold; FONT-SIZE: 10pt; COLOR: #5b5b5b; FONT-FAMILY: arial'><a href='/?ArticleId={0}'>{1}</a></font>", id, title); result += string.Format("<div align='left'><font style='FONT-SIZE: 9pt' face='Arial' color='#005482'>...{0}...</font></div></div><br/>", highlighter.GetBestFragment(ts, detail)); } result += "</div>"; reader.Close(); return(result); }
public virtual void TearDown() { try { // this isn't as useful as calling directly from the scope where the // index readers are used, because they could be gc'ed just before // tearDown is called. // But it's better than nothing. AssertSaneFieldCaches(GetTestLabel()); if (ConcurrentMergeScheduler.AnyUnhandledExceptions()) { // Clear the failure so that we don't just keep // failing subsequent test cases ConcurrentMergeScheduler.ClearUnhandledExceptions(); Assert.Fail("ConcurrentMergeScheduler hit unhandled exceptions"); } } finally { PurgeFieldCache(Lucene.Net.Search.FieldCache_Fields.DEFAULT); } TokenStream.SetOnlyUseNewAPI(savedAPISetting); //base.TearDown(); // {{Aroush-2.9}} this.seed_init = false; //{{Lucene.Net-2.9.1}} Lucene.Net.Search.BooleanQuery.SetAllowDocsOutOfOrder(allowDocsOutOfOrder); }
/// <summary> Create a tokenized and indexed field that is not stored, optionally with /// storing term vectors. This is useful for pre-analyzed fields. /// The TokenStream is read only when the Document is added to the index, /// i.e. you may not close the TokenStream until <see cref="IndexWriter.AddDocument(Document)"/> /// has been called. /// /// </summary> /// <param name="name">The name of the field /// </param> /// <param name="tokenStream">The TokenStream with the content /// </param> /// <param name="termVector">Whether term vector should be stored /// </param> /// <throws> NullReferenceException if name or tokenStream is <code>null</code> </throws> public Field(System.String name, TokenStream tokenStream, TermVector termVector) { if (name == null) { throw new System.NullReferenceException("name cannot be null"); } if (tokenStream == null) { throw new System.NullReferenceException("tokenStream cannot be null"); } this.name = StringHelper.Intern(name); // field names are interned this.fieldsData = null; this.tokenStream = tokenStream; this.isStored = false; this.isCompressed = false; this.isIndexed = true; this.isTokenized = true; this.isBinary = false; SetStoreTermVector(termVector); }
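A usage sketch for the pre-analyzed field constructor above, written against the same 2.9-era API used elsewhere in this section; the index path, field name, and analyzer choice are illustrative:
// Sketch: index a field whose tokens come from a caller-supplied TokenStream.
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriter writer = new IndexWriter("C:\\index", analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
TokenStream ts = analyzer.TokenStream("body", new System.IO.StringReader("pre analyzed text"));
Document doc = new Document();
doc.Add(new Field("body", ts, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc); // the TokenStream is consumed here, not before
writer.Close();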
/// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary> /// <param name="r">a source of text to be tokenized /// </param> /// <param name="termFreqMap">a Map of terms and their frequencies /// </param> /// <param name="fieldName">Used by analyzer for any special per-field analysis /// </param> private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName) { TokenStream ts = analyzer.TokenStream(fieldName, r); Lucene.Net.Analysis.Token token; int tokenCount = 0; while ((token = ts.Next()) != null) { // for every token System.String word = token.TermText(); tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (IsNoiseWord(word)) { continue; } // increment frequency Int cnt = (Int)termFreqMap[word]; if (cnt == null) { termFreqMap[word] = new Int(); } else { cnt.x++; } } }
/// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary> /// <param name="r">a source of text to be tokenized /// </param> /// <param name="termFreqMap">a Map of terms and their frequencies /// </param> /// <param name="fieldName">Used by analyzer for any special per-field analysis /// </param> protected void AddTermFrequencies(System.IO.TextReader r, IDictionary <string, Int> termFreqMap, System.String fieldName) { TokenStream ts = analyzer.TokenStream(fieldName, r); int tokenCount = 0; // for every token ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>(); while (ts.IncrementToken()) { string word = termAtt.Term; tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (IsNoiseWord(word)) { continue; } /* increment frequency; the generic IDictionary indexer throws on a missing key, so probe with TryGetValue */ Int cnt; if (!termFreqMap.TryGetValue(word, out cnt) || cnt == null) { termFreqMap[word] = new Int(); } else { cnt.x++; } } }
public TestFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed) : base(in_Renamed) { InitBlock(enclosingInstance); termAtt = AddAttribute <ITermAttribute>(); posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); offsetAtt = AddAttribute <IOffsetAttribute>(); typeAtt = AddAttribute <ITypeAttribute>(); }
public PayloadFilter(TokenStream input, System.String fieldName) : base(input) { this.fieldName = fieldName; pos = 0; i = 0; posIncrAttr = (PositionIncrementAttribute)input.AddAttribute(typeof(PositionIncrementAttribute)); payloadAttr = (PayloadAttribute)input.AddAttribute(typeof(PayloadAttribute)); termAttr = (TermAttribute)input.AddAttribute(typeof(TermAttribute)); }
public string SearchAndPaging(string strQuery, string index) { string result = string.Empty; try { List <SearchArticle> searchArticleList = new List <SearchArticle>(); PSCPortal.CMS.ArticleCollection ArticleList = ArticleCollection.GetArticleCollectionPublish(); string nameSub = Libs.Ultility.GetSubDomain() == string.Empty ? "HomePage" : Libs.Ultility.GetSubDomain(); SubDomain subDomain = PSCPortal.Engine.SubDomain.GetSubByName(nameSub); PageCollection pagesBelongTo = subDomain.GetPagesBelongTo(); string strId = string.Empty; foreach (var page in pagesBelongTo) { foreach (var ar in ArticleList.Where(ar => ar.PageId == page.Id)) { strId += ar.Id + " OR "; } } if (strId.Length > 0) { strId = strId.Remove(strId.Length - 4, 4); /* trim the trailing " OR " once, after all pages are processed, so the OR chain stays intact across pages */ } int pageIndex = Int32.Parse(index); string strSearch = " ArticleDetail:(" + strQuery + ") AND ArticleId:" + "( " + strId + " )"; Lucene.Net.Index.IndexReader reader = Lucene.Net.Index.IndexReader.Open(Server.MapPath(System.Configuration.ConfigurationManager.AppSettings["IndexingArticle"])); Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser("ArticleDetail", new Lucene.Net.Analysis.Standard.StandardAnalyzer()); Lucene.Net.Search.Query query = parser.Parse(strSearch); Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(reader); Lucene.Net.Search.Hits hits = searcher.Search(query); Lucene.Net.Highlight.QueryScorer score = new Lucene.Net.Highlight.QueryScorer(query); Lucene.Net.Highlight.SimpleHTMLFormatter formater = new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='Highlight'>", "</span>"); Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(formater, score); result += hits.Length() + "_" + "<div class='blog_news'><div class='topic_news_title1'><div class='topic_news_title'><a href='#'>Results found: " + hits.Length() + "</a></div></div>"; result += "<div class='ct_topic_l'><div class='ct_topic_r1'>"; for (int i = pageIndex * 20 - 20; i < pageIndex * 20 && i < hits.Length(); i++) { string detail = hits.Doc(i).Get("ArticleDetail"); Lucene.Net.Analysis.TokenStream ts = (new Lucene.Net.Analysis.Standard.StandardAnalyzer()).TokenStream("ArticleDetail", new System.IO.StringReader(detail)); SearchArticle searchArticle = new SearchArticle(); searchArticle.Id = hits.Doc(i).Get("ArticleId"); searchArticle.Title = hits.Doc(i).Get("ArticleTitle"); searchArticle.Highligth = highlighter.GetBestFragment(ts, detail); searchArticleList.Add(searchArticle); } reader.Close(); JavaScriptSerializer serializer = new JavaScriptSerializer(); Dictionary <string, object> resultDic = new Dictionary <string, object>(); resultDic["Count"] = hits.Length(); resultDic["Data"] = searchArticleList; result = serializer.Serialize(resultDic); } catch (Exception) { /* swallow and fall through to return the empty result */ } return(result); }
/// <summary> Highlights chosen terms in a text, extracting the most relevant section. /// The document text is analysed in chunks to record hit statistics /// across the document. After accumulating stats, the fragment with the highest score /// is returned /// /// </summary> /// <param name="tokenStream"> a stream of tokens identified in the text parameter, including offset information. /// This is typically produced by an analyzer re-parsing a document's /// text. Some work may be done on retrieving TokenStreams more efficiently /// by adding support for storing original text position data in the Lucene /// index but this support is not currently available (as of Lucene 1.4 rc2). /// </param> /// <param name="text">text to highlight terms in /// /// </param> /// <returns> highlighted text fragment or null if no terms found /// </returns> public System.String GetBestFragment(TokenStream tokenStream, System.String text) { System.String[] results = GetBestFragments(tokenStream, text, 1); if (results.Length > 0) { return(results[0]); } return(null); }
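A sketch of the typical call sequence for GetBestFragment, mirroring the search snippets elsewhere in this section; the parser, analyzer, and text variables are assumed to already exist:
// Sketch: highlight the best-scoring section of one document's text.
Query query = parser.Parse("lucene");
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
TokenStream ts = analyzer.TokenStream("body", new System.IO.StringReader(text));
string fragment = highlighter.GetBestFragment(ts, text); // null when no terms matched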
public virtual void SetUp() { //{{Lucene.Net-2.9.1}} allowDocsOutOfOrder = Lucene.Net.Search.BooleanQuery.GetAllowDocsOutOfOrder(); ConcurrentMergeScheduler.SetTestMode(); savedAPISetting = TokenStream.GetOnlyUseNewAPI(); TokenStream.SetOnlyUseNewAPI(false); }
public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName) : base(input) { InitBlock(enclosingInstance); this.fieldName = fieldName; pos = 0; CollectionsHelper.AddIfNotContains(entities, "xx"); CollectionsHelper.AddIfNotContains(entities, "one"); CollectionsHelper.AddIfNotContains(nopayload, "nopayload"); CollectionsHelper.AddIfNotContains(nopayload, "np"); termAtt = AddAttribute <ITermAttribute>(); posIncrAtt = AddAttribute <IPositionIncrementAttribute>(); payloadAtt = AddAttribute <IPayloadAttribute>(); }
/// <summary>Construct the named stemming filter. /// /// </summary> /// <param name="input">the input tokens to stem /// </param> /// <param name="name">the name of a stemmer /// </param> public SnowballFilter(TokenStream input, System.String name) : base(input) { try { System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer"); stemmer = (SnowballProgram) System.Activator.CreateInstance(stemClass); } catch (System.Exception e) { throw new System.SystemException(e.ToString()); } termAtt = AddAttribute<ITermAttribute>(); }
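A sketch chaining the named Snowball filter above behind a tokenizer; "English" resolves to SF.Snowball.Ext.EnglishStemmer via the reflection lookup in the constructor:
// Sketch: stem whitespace-delimited tokens with the English Snowball stemmer.
TokenStream ts = new WhitespaceTokenizer(new System.IO.StringReader("running runs ran"));
ts = new LowerCaseFilter(ts);           // Snowball stemmers expect lower-cased input
ts = new SnowballFilter(ts, "English"); // instantiated by name through reflection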
public void SetValue(TokenStream value_Renamed) { if (isBinary) { throw new System.ArgumentException("cannot set a TokenStream value on a binary field"); } if (isStored) { throw new System.ArgumentException("cannot set a TokenStream value on a stored field"); } fieldsData = null; tokenStream = value_Renamed; }
/// <summary>Construct the named stemming filter. /// /// </summary> /// <param name="in">the input tokens to stem /// </param> /// <param name="name">the name of a stemmer /// </param> public SnowballFilter(TokenStream in_Renamed, System.String name) : base(in_Renamed) { try { System.Type stemClass = System.Type.GetType("SF.Snowball.Ext." + name + "Stemmer"); stemmer = (SnowballProgram) System.Activator.CreateInstance(stemClass); // why doesn't the SnowballProgram class have an (abstract?) stem method? stemMethod = stemClass.GetMethod("Stem", System.Type.EmptyTypes); } catch (System.Exception e) { throw new System.SystemException(e.ToString()); } }
/// <summary> Highlights terms in the text, extracting the most relevant sections /// and concatenating the chosen fragments with a separator (typically "..."). /// The document text is analysed in chunks to record hit statistics /// across the document. After accumulating stats, the fragments with the highest scores /// are returned in order as "separator" delimited strings. /// /// </summary> /// <param name="text"> text to highlight terms in /// </param> /// <param name="maxNumFragments"> the maximum number of fragments. /// </param> /// <param name="separator"> the separator used to intersperse the document fragments (typically "...") /// /// </param> /// <returns> highlighted text /// </returns> public System.String GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments, System.String separator) { System.String[] sections = GetBestFragments(tokenStream, text, maxNumFragments); System.Text.StringBuilder result = new System.Text.StringBuilder(); for (int i = 0; i < sections.Length; i++) { if (i > 0) { result.Append(separator); } result.Append(sections[i]); } return(result.ToString()); }
public virtual void TestIncrementingPositions() { Analyzer analyzer = new WhitespaceAnalyzer(); TokenStream ts = analyzer.TokenStream("Field", new System.IO.StringReader("one two three four five")); while (true) { Token token = ts.Next(); if (token == null) { break; } Assert.AreEqual(1, token.GetPositionIncrement(), token.TermText()); } }
/// <summary> Highlights chosen terms in a text, extracting the most relevant sections. /// The document text is analysed in chunks to record hit statistics /// across the document. After accumulating stats, the fragments with the highest scores /// are returned as an array of strings in order of score (contiguous fragments are merged into /// one in their original order to improve readability) /// /// </summary> /// <param name="text"> text to highlight terms in /// </param> /// <param name="maxNumFragments"> the maximum number of fragments. /// /// </param> /// <returns> highlighted text fragments (between 0 and maxNumFragments fragments) /// </returns> public System.String[] GetBestFragments(TokenStream tokenStream, System.String text, int maxNumFragments) { maxNumFragments = System.Math.Max(1, maxNumFragments); //sanity check TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments); //Get text System.Collections.ArrayList fragTexts = new System.Collections.ArrayList(); for (int i = 0; i < frag.Length; i++) { if ((frag[i] != null) && (frag[i].GetScore() > 0)) { fragTexts.Add(frag[i].ToString()); } } return((System.String[])fragTexts.ToArray(typeof(System.String))); }
public IEnumerable <SampleHit> Search(string query_str) { List <SampleHit> result_hits = new List <SampleHit>(); using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(index_folder)) { Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Ru.RussianAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT); //Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_CURRENT); using (IndexSearcher searcher = new IndexSearcher(luceneIndexDirectory)) { QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_CURRENT, IndexModel.LineText, analyzer); Query query = parser.Parse(query_str); TopDocs hits = searcher.Search(query, max_search_hits); // code highlighting var formatter = new Lucene.Net.Search.Highlight.SimpleHTMLFormatter("<span style=\"background:yellow;\">", "</span>"); var fragmenter = new Lucene.Net.Search.Highlight.SimpleFragmenter(200); Lucene.Net.Search.Highlight.QueryScorer scorer = new Lucene.Net.Search.Highlight.QueryScorer(query); Lucene.Net.Search.Highlight.Highlighter highlighter = new Lucene.Net.Search.Highlight.Highlighter(formatter, scorer); highlighter.TextFragmenter = fragmenter; foreach (ScoreDoc hit in hits.ScoreDocs) { Document doc = searcher.Doc(hit.Doc); float score = hit.Score; Field line_number = doc.GetField(IndexModel.LineNumber); Field line_text = doc.GetField(IndexModel.LineText); Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(line_text.StringValue)); string highlightedText = highlighter.GetBestFragments(stream, doc.Get(IndexModel.LineText), 1, "..."); result_hits.Add(new SampleHit { line_number = line_number.StringValue, sample_text = line_text.StringValue, html_highlighting = highlightedText }); } } } return(result_hits); }
/// <summary> A convenience method that tries a number of approaches to getting a token stream. /// The cost of finding there are no termVectors in the index is minimal (1000 invocations still /// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable /// </summary> /// <param name="reader">the index reader /// </param> /// <param name="docId">the id of the document /// </param> /// <param name="field">the field to read /// </param> /// <param name="analyzer">the analyzer used to re-parse content when no term vector is stored /// </param> /// <returns> null if field not stored correctly /// </returns> /// <throws> IOException </throws> public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer) { TokenStream ts = null; TermFreqVector tfv = (TermFreqVector)reader.GetTermFreqVector(docId, field); if (tfv != null) { if (tfv is TermPositionVector) { ts = GetTokenStream((TermPositionVector)tfv); } } //No token info stored so fall back to analyzing raw content if (ts == null) { ts = GetTokenStream(reader, docId, field, analyzer); } return(ts); }
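How the convenience method above is typically combined with a highlighter; a sketch assuming it lives on the conventional TokenSources helper and that reader, docId, query, and analyzer are in scope:
// Sketch: prefer stored term vectors, fall back to re-analysis, then highlight.
TokenStream ts = TokenSources.GetAnyTokenStream(reader, docId, "body", analyzer);
string text = reader.Document(docId).Get("body");
Highlighter highlighter = new Highlighter(new QueryScorer(query));
string best = highlighter.GetBestFragment(ts, text);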
public virtual void TestUnicode() { RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); inWords = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\testUnicode.txt"), /* Path.Combine guards against a missing directory separator after dataDir */ System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("Unicode")); sampleUnicode = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\resUnicode.htm"), System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("Unicode")); TokenStream in_Renamed = ra.TokenStream("all", inWords); RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian); for (; ;) { Token token = in_Renamed.Next(); if (token == null) { break; } Token sampleToken = sample.Next(); Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode"); } inWords.Close(); sampleUnicode.Close(); }
public virtual void TestKOI8() { RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8); // KOI8 inWordsKOI8 = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\testKOI8.txt"), System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("iso-8859-1")); sampleKOI8 = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\resKOI8.htm"), System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("iso-8859-1")); TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8); RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8); for (; ;) { Token token = in_Renamed.Next(); if (token == null) { break; } Token sampleToken = sample.Next(); Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8"); } inWordsKOI8.Close(); sampleKOI8.Close(); }
public virtual void Test1251() { // 1251 inWords1251 = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\test1251.txt"), System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("iso-8859-1")); sample1251 = new System.IO.StreamReader( new System.IO.FileStream( System.IO.Path.Combine(dataDir.FullName, @"Analysis\RU\res1251.htm"), System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.GetEncoding("iso-8859-1")); RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251); TokenStream in_Renamed = ra.TokenStream("", inWords1251); RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251); for (; ;) { Token token = in_Renamed.Next(); if (token == null) { break; } Token sampleToken = sample.Next(); Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251"); } inWords1251.Close(); sample1251.Close(); }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { Token reusableToken = new Token(); for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken)) { terms.Add(nextToken.Term()); } ProcessTerms((System.String[])terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
public QueryTermVector(System.String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString)); if (stream != null) { Token next = null; System.Collections.ArrayList terms = new System.Collections.ArrayList(); try { while ((next = stream.Next()) != null) { terms.Add(next.TermText()); } ProcessTerms((System.String[])terms.ToArray(typeof(System.String))); } catch (System.IO.IOException) { } } } }
/// <summary> Simple similarity query generators. /// Takes every unique word and forms a boolean query where all words are optional. /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs. /// The only caveat is the first hit returned <b>should be</b> your source document - you'll /// need to then ignore that. /// /// <p/> /// /// So, if you have a code fragment like this: /// <br/> /// <code> /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); /// </code> /// /// <p/> /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good')</c>. /// /// <p/> /// The philosophy behind this method is "two documents are similar if they share lots of words". /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words. /// /// <p/> /// This method is fail-safe in that if a long 'body' is passed in and /// <see cref="BooleanQuery.Add(BooleanClause)"/> (used internally) /// throws /// <see cref="BooleanQuery.TooManyClauses"/>, the /// query as it is will be returned. /// </summary> /// <param name="body">the body of the document you want to find similar documents to /// </param> /// <param name="a">the analyzer to use to parse the body /// </param> /// <param name="field">the field you want to search on, probably something like "contents" or "body" /// </param> /// <param name="stop">optional set of stop words to ignore /// </param> /// <returns> a query with all unique words in 'body' /// </returns> /// <throws> IOException this can't happen... </throws> public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop) { TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body)); Lucene.Net.Analysis.Token t; BooleanQuery tmp = new BooleanQuery(); System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups while ((t = ts.Next()) != null) { System.String word = t.TermText(); // ignore opt stop words if (stop != null && stop.Contains(word)) { continue; } // ignore dups if (already.Contains(word)) { continue; } already.Add(word, word); // add to query TermQuery tq = new TermQuery(new Term(field, word)); try { tmp.Add(tq, BooleanClause.Occur.SHOULD); //false, false); } catch (BooleanQuery.TooManyClauses) { // fail-safe, just return what we have, not the end of the world break; } } return(tmp); }
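Following the doc comment's own example, a sketch that runs the similarity query and skips the first hit, which should be the source document itself; the searcher, body variable, and "title" field are illustrative:
// Sketch: find documents similar to 'body' and skip the source document at hit 0.
Query like = FormSimilarQuery(body, new StandardAnalyzer(), "contents", null);
Hits hits = searcher.Search(like);
for (int i = 1; i < hits.Length(); i++)
{
    System.Console.Out.WriteLine(hits.Doc(i).Get("title"));
}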
public AnonymousClassTokenFilter(AnonymousClassAnalyzer enclosingInstance, TokenStream ts) : base(ts) { InitBlock(enclosingInstance); }
/// <summary> Low level api to get the most relevant (formatted) sections of the document. /// This method has been made public to allow visibility of score information held in TextFragment objects. /// Thanks to Jason Calabrese for help in redefining the interface. /// </summary> /// <param name="tokenStream">the token stream for the text, including offset information /// </param> /// <param name="text">the text to highlight /// </param> /// <param name="maxNumFragments">the maximum number of fragments /// </param> /// <param name="mergeContiguousFragments">whether contiguous fragments should be merged /// </param> /// <throws> IOException </throws> public TextFragment[] GetBestTextFragments(TokenStream tokenStream, System.String text, bool mergeContiguousFragments, int maxNumFragments) { System.Collections.ArrayList docFrags = new System.Collections.ArrayList(); System.Text.StringBuilder newText = new System.Text.StringBuilder(); TextFragment currentFrag = new TextFragment(newText, newText.Length, docFrags.Count); fragmentScorer.StartFragment(currentFrag); docFrags.Add(currentFrag); FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); try { Lucene.Net.Analysis.Token token; System.String tokenText; int startOffset; int endOffset; int lastEndOffset = 0; textFragmenter.Start(text); TokenGroup tokenGroup = new TokenGroup(); token = tokenStream.Next(); while ((token != null) && (token.StartOffset() < maxDocBytesToAnalyze)) { if ((tokenGroup.numTokens > 0) && (tokenGroup.IsDistinct(token))) { //the current token is distinct from previous tokens - // markup the cached token group info startOffset = tokenGroup.matchStartOffset; endOffset = tokenGroup.matchEndOffset; tokenText = text.Substring(startOffset, (endOffset) - (startOffset)); System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup); //store any whitespace etc from between this and last group if (startOffset > lastEndOffset) newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset)))); newText.Append(markedUpText); lastEndOffset = System.Math.Max(endOffset, lastEndOffset); tokenGroup.Clear(); //check if current token marks the start of a new fragment if (textFragmenter.IsNewFragment(token)) { currentFrag.SetScore(fragmentScorer.GetFragmentScore()); //record stats for a new fragment currentFrag.textEndPos = newText.Length; currentFrag = new TextFragment(newText, newText.Length, docFrags.Count); fragmentScorer.StartFragment(currentFrag); docFrags.Add(currentFrag); } } tokenGroup.AddToken(token, fragmentScorer.GetTokenScore(token)); token = tokenStream.Next(); } currentFrag.SetScore(fragmentScorer.GetFragmentScore()); if (tokenGroup.numTokens > 0) { //flush the accumulated text (same code as in above loop) startOffset = tokenGroup.matchStartOffset; endOffset = tokenGroup.matchEndOffset; tokenText = text.Substring(startOffset, (endOffset) - (startOffset)); System.String markedUpText = formatter.HighlightTerm(encoder.EncodeText(tokenText), tokenGroup); //store any whitespace etc from between this and last group if (startOffset > lastEndOffset) newText.Append(encoder.EncodeText(text.Substring(lastEndOffset, (startOffset) - (lastEndOffset)))); newText.Append(markedUpText); lastEndOffset = System.Math.Max(lastEndOffset, endOffset); } //Test what remains of the original text beyond the point where we stopped analyzing if ((lastEndOffset < text.Length) && (text.Length < maxDocBytesToAnalyze)) { //append it to the last fragment newText.Append(encoder.EncodeText(text.Substring(lastEndOffset))); } currentFrag.textEndPos = newText.Length; //sort the most relevant sections of the text for (System.Collections.IEnumerator i = docFrags.GetEnumerator(); i.MoveNext(); ) { currentFrag = (TextFragment) i.Current; fragQueue.Insert(currentFrag); } //return the most relevant fragments TextFragment[] frag = new TextFragment[fragQueue.Size()]; for (int i = frag.Length - 1; i >= 0; i--) { frag[i] = (TextFragment) fragQueue.Pop(); } //merge any contiguous fragments to improve readability if (mergeContiguousFragments) { MergeContiguousFragments(frag); System.Collections.ArrayList fragTexts = new System.Collections.ArrayList(); for (int i = 0; i < frag.Length; i++) { if ((frag[i] != null) && (frag[i].GetScore() > 0)) { fragTexts.Add(frag[i]); } } frag = (TextFragment[]) fragTexts.ToArray(typeof(TextFragment)); } return frag; } finally { if (tokenStream != null) { try { tokenStream.Close(); } catch (System.Exception) { /* ignore close failures */ } } } }
/// <summary>Expert: change the value of this field. See <see cref="SetValue(System.String)"/>. </summary> public void SetValue(TokenStream value_Renamed) { fieldsData = value_Renamed; }
//private System.Reflection.MethodInfo stemMethod; public SnowballFilter(TokenStream input, SnowballProgram stemmer) : base(input) { this.stemmer = stemmer; termAtt = AddAttribute<ITermAttribute>(); }
public PayloadFilter(TestPayloadNearQuery enclosingInstance, TokenStream input, System.String fieldName):base(input) { InitBlock(enclosingInstance); this.fieldName = fieldName; payAtt = AddAttribute<IPayloadAttribute>(); }
/// <summary> Filter which discards the token 'stop' and which expands the /// token 'phrase' into 'phrase1 phrase2' /// </summary> public QPTestFilter(TokenStream in_Renamed):base(in_Renamed) { termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute)); offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute)); }
public PayloadFilter(TestPayloadSpans enclosingInstance, TokenStream input, System.String fieldName):base(input) { InitBlock(enclosingInstance); this.fieldName = fieldName; pos = 0; Support.CollectionsHelper.AddIfNotContains(entities, "xx"); Support.CollectionsHelper.AddIfNotContains(entities, "one"); Support.CollectionsHelper.AddIfNotContains(nopayload, "nopayload"); Support.CollectionsHelper.AddIfNotContains(nopayload, "np"); termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute)); posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute)); payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute)); }
protected internal PayloadFilter(TokenStream input):base(input) { payloadAtt = AddAttribute<IPayloadAttribute>(); }
public PayloadFilter(TestPayloadSpans outerInstance, TokenStream input) : base(input) { this.OuterInstance = outerInstance; Pos = 0; Entities.Add("xx"); Entities.Add("one"); Nopayload.Add("nopayload"); Nopayload.Add("np"); TermAtt = AddAttribute<ICharTermAttribute>(); PosIncrAtt = AddAttribute<IPositionIncrementAttribute>(); PayloadAtt = AddAttribute<IPayloadAttribute>(); }
public CrashingFilter(TestIndexWriter enclosingInstance, System.String fieldName, TokenStream input):base(input) { InitBlock(enclosingInstance); this.fieldName = fieldName; }
public TokenStream Init(TokenStream tokenStream) { return null; }
public SynonymTokenizer(TokenStream realStream, System.Collections.IDictionary synonyms) { this.realStream = realStream; this.synonyms = synonyms; }
/// <summary>Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true. /// May be combined with stored values from stringValue() or binaryValue() /// </summary> public void SetTokenStream(TokenStream tokenStream) { this.isIndexed = true; this.isTokenized = true; this.tokenStream = tokenStream; }
protected internal PayloadFilter(TokenStream input):base(input) { payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute)); }
/// <summary> Create a tokenized and indexed field that is not stored. Term vectors will /// not be stored. This is useful for pre-analyzed fields. /// The TokenStream is read only when the Document is added to the index, /// i.e. you may not close the TokenStream until <see cref="IndexWriter.AddDocument(Document)"/> /// has been called. /// /// </summary> /// <param name="name">The name of the field /// </param> /// <param name="tokenStream">The TokenStream with the content /// </param> /// <throws> NullReferenceException if name or tokenStream is <code>null</code> </throws> public Field(System.String name, TokenStream tokenStream):this(name, tokenStream, TermVector.NO) { }
public TestPosIncrementFilter(TestMultiAnalyzer enclosingInstance, TokenStream in_Renamed):base(in_Renamed) { InitBlock(enclosingInstance); termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute)); posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute)); }
public PayloadFilter(TokenStream in_Renamed, byte[] data, int offset, int length):base(in_Renamed) { this.data = data; this.length = length; this.offset = offset; payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute)); }
public PayloadFilter(TestBoostingTermQuery enclosingInstance, TokenStream input, System.String fieldName):base(input) { InitBlock(enclosingInstance); this.fieldName = fieldName; payloadAtt = (PayloadAttribute) AddAttribute(typeof(PayloadAttribute)); }
/// <summary>Construct filtering <i>in</i>. </summary> public StandardFilter(TokenStream in_Renamed) : base(in_Renamed) { termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute)); typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute)); }
public SnowballFilter(TokenStream in_Renamed, SnowballProgram stemmer, System.Reflection.MethodInfo stemMethod) : base(in_Renamed) { this.stemmer = stemmer; this.stemMethod = stemMethod; }
protected internal PayloadFilter(TokenStream input):base(input) { }
/// <summary>Construct filtering <i>in</i>. </summary> public StandardFilter(TokenStream in_Renamed):base(in_Renamed) { }