/// <summary>
/// Verifies that the delegating "Synonym" <see cref="TokenFilterFactory"/> accepts
/// tokenizer-factory arguments both in simple form ("pattern") and in prefixed form
/// ("tokenizerFactory.pattern"), and rejects a sub-factory configuration that is
/// missing a required argument or carries an unexpected one.
/// </summary>
/// <param name="ver"> the Lucene match version passed to the factory </param>
/// <param name="delegatorClass"> the expected concrete delegator type </param>
protected internal virtual void DoTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
{
    string clazz = typeof(PatternTokenizerFactory).AssemblyQualifiedName;
    TokenFilterFactory factory = null;

    // simple arg form
    factory = TokenFilterFactory("Synonym", ver,
        "synonyms", "synonyms.txt",
        "tokenizerFactory", clazz,
        "pattern", "(.*)",
        "group", "0");
    AssertDelegator(factory, delegatorClass);

    // prefix form
    factory = TokenFilterFactory("Synonym", ver,
        "synonyms", "synonyms.txt",
        "tokenizerFactory", clazz,
        "tokenizerFactory.pattern", "(.*)",
        "tokenizerFactory.group", "0");
    AssertDelegator(factory, delegatorClass);

    // sanity check that sub-PatternTokenizerFactory fails w/o pattern
    try
    {
        factory = TokenFilterFactory("Synonym", ver,
            "synonyms", "synonyms.txt",
            "tokenizerFactory", clazz);
        fail("tokenizerFactory should have complained about missing pattern arg");
    }
    catch (Exception)
    {
        // :NOOP: expected
    }

    // sanity check that sub-PatternTokenizerFactory fails on unexpected arg
    try
    {
        factory = TokenFilterFactory("Synonym", ver,
            "synonyms", "synonyms.txt",
            "tokenizerFactory", clazz,
            "tokenizerFactory.pattern", "(.*)",
            "tokenizerFactory.bogusbogusbogus", "bogus",
            "tokenizerFactory.group", "0");
        // BUG FIX: the original message duplicated the "missing pattern arg" text,
        // but this branch verifies rejection of an unexpected argument.
        fail("tokenizerFactory should have complained about the unexpected 'bogusbogusbogus' arg");
    }
    catch (Exception)
    {
        // :NOOP: expected
    }
}
/// <summary>
/// Creates a <see cref="TypeTokenFilter"/> with explicit control over position
/// increments, keeping (useWhiteList=true) or dropping (useWhiteList=false)
/// tokens whose type appears in <paramref name="stopTypes"/>.
/// </summary>
public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
    : base(version, enablePositionIncrements, input)
{
    this.useWhiteList = useWhiteList;
    this.stopTypes = stopTypes;
    typeAttribute = AddAttribute<ITypeAttribute>();
}
} // End Sub BuildIndex


// https://lucenenet.apache.org/
// https://www.codeproject.com/Articles/609980/Small-Lucene-NET-Demo-App
// https://stackoverflow.com/questions/12600196/lucene-how-to-index-file-names

/// <summary>
/// Runs <paramref name="phrase"/> (escaped) as a query against the "file_name"
/// field of the index at <paramref name="indexPath"/> and prints the score and
/// "full_name" of up to 10 hits.
/// </summary>
private static void SearchPath(string phrase, string indexPath)
{
    Lucene.Net.Util.LuceneVersion version = Lucene.Net.Util.LuceneVersion.LUCENE_48;

    // BUG FIX: the Directory and IndexReader were never disposed, leaking OS file
    // handles/locks on the index. Dispose both via using, consistent with the
    // using-based disposal BuildIndex already applies to its IndexWriter.
    using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(indexPath))
    using (Lucene.Net.Index.IndexReader r = Lucene.Net.Index.DirectoryReader.Open(luceneIndexDirectory))
    {
        Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(r);
        Lucene.Net.Analysis.Analyzer analyzer = GetWrappedAnalyzer();

        Lucene.Net.QueryParsers.Classic.QueryParser parser =
            new Lucene.Net.QueryParsers.Classic.QueryParser(version, "file_name", analyzer);
        // To search across all fields instead of only "file_name":
        // https://stackoverflow.com/questions/15170097/how-to-search-across-all-the-fields
        // Lucene.Net.QueryParsers.Classic.MultiFieldQueryParser parser = new Lucene.Net.QueryParsers.Classic.MultiFieldQueryParser(version, GetFields(r), analyzer);

        Lucene.Net.Search.Query query = parser.Parse(Lucene.Net.QueryParsers.Classic.QueryParser.Escape(phrase));

        Lucene.Net.Search.ScoreDoc[] hits = searcher.Search(query, 10).ScoreDocs;
        foreach (Lucene.Net.Search.ScoreDoc hit in hits)
        {
            Lucene.Net.Documents.Document foundDoc = searcher.Doc(hit.Doc);
            System.Console.WriteLine(hit.Score);
            string full_name = foundDoc.Get("full_name");
            System.Console.WriteLine(full_name);
            // string favoritePhrase = foundDoc.Get("favoritePhrase");
            // System.Console.WriteLine(favoritePhrase);
        } // Next hit
    }
} // End Sub SearchPath
/// <summary>
/// Creates a <see cref="TypeTokenFilter"/> with explicit position-increment
/// behavior; <paramref name="useWhiteList"/> selects whether matching types
/// are kept (true) or filtered out (false).
/// </summary>
public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
    : base(version, enablePositionIncrements, input)
{
    this.stopTypes = stopTypes;
    this.useWhiteList = useWhiteList;
    typeAttribute = AddAttribute<ITypeAttribute>();
}
/// <summary>
/// Create a new <see cref="LengthFilter"/>. This will filter out tokens whose
/// <see cref="CharTermAttribute"/> length is either too short (&lt; min)
/// or too long (&gt; max).
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <see cref="TokenStream"/> to consume </param>
/// <param name="min"> the minimum length </param>
/// <param name="max"> the maximum length </param>
public LengthFilter(Version version, TokenStream @in, int min, int max)
    : base(version, @in)
{
    if (min < 0)
    {
        throw new System.ArgumentException("minimum length must be greater than or equal to zero");
    }
    if (min > max)
    {
        // BUG FIX: this branch rejects max < min, but the original message read
        // "maximum length must not be greater than minimum length" — inverted.
        throw new System.ArgumentException("maximum length must not be less than minimum length");
    }
    this.min = min;
    this.max = max;
}
internal V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator

/// <summary>
/// Create map with enough capacity to hold <paramref name="startSize"/> terms.
/// </summary>
/// <param name="matchVersion">
///   compatibility match version see <a href="#version">Version note</a> above for details. </param>
/// <param name="startSize"> the initial capacity </param>
/// <param name="ignoreCase">
///   <code>false</code> if and only if the set should be case sensitive otherwise <code>true</code>. </param>
public CharArrayMap(Lucene.Net.Util.LuceneVersion matchVersion, int startSize, bool ignoreCase)
{
    this.ignoreCase = ignoreCase;
    // Grow to the next power of two such that startSize + startSize/4 still fits
    // (i.e. the table stays at most ~80% full for the requested size).
    int capacity = INIT_SIZE;
    while (startSize + (startSize >> 2) > capacity)
    {
        capacity <<= 1;
    }
    keys = new char[capacity][];
    // BUG FIX: the Java idiom "(V[]) new Object[n]" does not port to C# —
    // casting object[] to V[] throws InvalidCastException at runtime for any
    // V other than object (C# arrays are not erased). Allocate V[] directly.
    values = new V[capacity];
    this.charUtils = CharacterUtils.GetInstance(matchVersion);
    this.matchVersion = matchVersion;
}
} // End Function GetWrappedAnalyzer


/// <summary>
/// (Re)builds a Lucene index at <paramref name="indexPath"/> from the given file
/// paths, storing full path, directory, file name, name-without-extension and
/// extension as non-tokenized (StringField) fields. Overwrites an existing index.
/// </summary>
private static void BuildIndex(string indexPath, System.Collections.Generic.IEnumerable<string> dataToIndex)
{
    Lucene.Net.Util.LuceneVersion version = Lucene.Net.Util.LuceneVersion.LUCENE_48;

    // BUG FIX: the Directory was never disposed; it owns OS file handles and the
    // index lock, so dispose it alongside the writer.
    using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(indexPath))
    {
        // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Core.WhitespaceAnalyzer(version);
        // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(version);
        // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Core.KeywordAnalyzer();
        Lucene.Net.Analysis.Analyzer analyzer = GetWrappedAnalyzer();

        Lucene.Net.Index.IndexWriterConfig writerConfig = new Lucene.Net.Index.IndexWriterConfig(version, analyzer);
        writerConfig.OpenMode = Lucene.Net.Index.OpenMode.CREATE; // Overwrite, if exists

        using (Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(luceneIndexDirectory, writerConfig))
        {
            foreach (string thisValue in dataToIndex)
            {
                Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

                string directory_name = System.IO.Path.GetDirectoryName(thisValue);
                string file_name = System.IO.Path.GetFileName(thisValue);
                string filename_no_extension = System.IO.Path.GetFileNameWithoutExtension(thisValue);
                string extension = System.IO.Path.GetExtension(thisValue);

                // StringField indexes but doesn't tokenize
                doc.Add(new Lucene.Net.Documents.StringField("full_name", thisValue, Lucene.Net.Documents.Field.Store.YES));
                doc.Add(new Lucene.Net.Documents.StringField("directory_name", directory_name, Lucene.Net.Documents.Field.Store.YES));
                doc.Add(new Lucene.Net.Documents.StringField("file_name", file_name, Lucene.Net.Documents.Field.Store.YES));
                doc.Add(new Lucene.Net.Documents.StringField("filename_no_extension", filename_no_extension, Lucene.Net.Documents.Field.Store.YES));
                doc.Add(new Lucene.Net.Documents.StringField("extension", extension, Lucene.Net.Documents.Field.Store.YES));
                // doc.Add( new Lucene.Net.Documents.TextField("favoritePhrase", thisValue, Lucene.Net.Documents.Field.Store.YES) );

                writer.AddDocument(doc);
            } // Next thisValue

            // writer.Optimize();
            writer.Flush(true, true);
        } // Dispose needs to be called, otherwise the index cannot be read ...
    }
} // End Sub BuildIndex
/// <summary>
/// Creates a <see cref="KeepWordFilter"/> with explicit control over position
/// increments; only tokens contained in <paramref name="words"/> are kept.
/// </summary>
public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
    : base(version, enablePositionIncrements, @in)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    this.words = words;
}
/// <summary>
/// Create a new <see cref="TypeTokenFilter"/> that filters matching tokens out
/// (equivalent to calling the four-argument overload with useWhiteList=false).
/// </summary>
/// <seealso cref="TypeTokenFilter(Version, TokenStream, HashSet{string}, bool)"/>
public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes)
    : this(version, input, stopTypes, false)
{
}
/// <summary>
/// Create a new <see cref="TypeTokenFilter"/> in blacklist mode: delegates to the
/// five-argument overload with useWhiteList=false, so matching types are removed.
/// </summary>
public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes)
    : this(version, enablePositionIncrements, input, stopTypes, false)
{
}
/// <summary>
/// Convenience constructor: forwards to the five-argument overload with
/// useWhiteList=false, i.e. tokens whose type is in stopTypes are filtered out.
/// </summary>
public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes)
    : this(version, enablePositionIncrements, input, stopTypes, false)
{
}
/// <summary>
/// Create a new <see cref="TypeTokenFilter"/>.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> the <see cref="TokenStream"/> to consume </param>
/// <param name="stopTypes"> the types to filter </param>
/// <param name="useWhiteList"> if true, then tokens whose type is in stopTypes will
///                             be kept, otherwise they will be filtered out </param>
public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
    : base(version, input)
{
    this.useWhiteList = useWhiteList;
    this.stopTypes = stopTypes;
}
/// <summary>
/// Create a new <see cref="TypeTokenFilter"/> that removes tokens whose type is
/// listed in <paramref name="stopTypes"/> (blacklist mode, useWhiteList=false).
/// </summary>
/// <seealso cref="TypeTokenFilter(Version, TokenStream, HashSet{string}, bool)"/>
public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes)
    : this(version, input, stopTypes, false)
{
}
/// <summary>
/// Create a new <see cref="TypeTokenFilter"/>.
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="input"> the <see cref="TokenStream"/> to consume </param>
/// <param name="stopTypes"> the types to filter </param>
/// <param name="useWhiteList"> if true, tokens whose type is in stopTypes are kept;
///                             otherwise they are filtered out </param>
public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
    : base(version, input)
{
    this.useWhiteList = useWhiteList;
    this.stopTypes = stopTypes;
}
/// <summary>
/// Creates a <see cref="FilteringTokenFilter"/> with an explicit
/// position-increment setting. The setting is validated against
/// <paramref name="version"/> before being stored.
/// </summary>
public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input)
    : this(version, input)
{
    // Validate first: CheckPositionIncrement may reject the combination,
    // in which case the field is never assigned.
    CheckPositionIncrement(version, enablePositionIncrements);
    this.enablePositionIncrements = enablePositionIncrements;
}
/// <summary>
/// Creates a <see cref="KeepWordFilter"/> that retains only tokens found in
/// <paramref name="words"/>, with explicit position-increment handling.
/// </summary>
public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
    : base(version, enablePositionIncrements, @in)
{
    termAtt = AddAttribute<ICharTermAttribute>();
    this.words = words;
}
/// <summary>
/// Create a new <see cref="KeepWordFilter"/>.
/// <para><b>NOTE</b>: The words set passed to this constructor will be directly
/// used by this filter and should not be modified.
/// </para>
/// </summary>
/// <param name="version"> the Lucene match version </param>
/// <param name="in"> the <see cref="TokenStream"/> to consume </param>
/// <param name="words"> the words to keep </param>
public KeepWordFilter(Version version, TokenStream @in, CharArraySet words)
    : base(version, @in)
{
    this.words = words;
}