Example #1
0
        /// <summary>
        /// Verifies that the delegating "Synonym" TokenFilterFactory forwards arguments to its
        /// tokenizerFactory both in plain form and with the "tokenizerFactory." prefix, and that
        /// the sub-factory still validates its own arguments (missing and unexpected ones).
        /// </summary>
        /// <param name="ver"> the Lucene match version handed to the factory </param>
        /// <param name="delegatorClass"> the concrete delegator type expected by AssertDelegator </param>
        protected internal virtual void DoTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
        {
            string             clazz   = typeof(PatternTokenizerFactory).AssemblyQualifiedName;
            TokenFilterFactory factory = null;

            // simple arg form
            factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "pattern", "(.*)", "group", "0");
            AssertDelegator(factory, delegatorClass);

            // prefix
            factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.group", "0");
            AssertDelegator(factory, delegatorClass);

            // sanity check that sub-PatternTokenizerFactory fails w/o pattern
            try
            {
                factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz);
                fail("tokenizerFactory should have complained about missing pattern arg");
            }
            catch (Exception)
            {
                // :NOOP: expected failure
            }

            // sanity check that sub-PatternTokenizerFactory fails on unexpected
            try
            {
                factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.bogusbogusbogus", "bogus", "tokenizerFactory.group", "0");
                // BUGFIX: this branch tests the unexpected/bogus argument, not a missing pattern;
                // the old message was copy-pasted from the branch above.
                fail("tokenizerFactory should have complained about the unexpected bogus arg");
            }
            catch (Exception)
            {
                // :NOOP: expected failure
            }
        }
Example #2
0
 /// <summary>
 /// Full constructor: stores the stop-type set and the white/black-list flag,
 /// and registers the <see cref="ITypeAttribute"/> on the stream.
 /// </summary>
 public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet <string> stopTypes, bool useWhiteList)
     : base(version, enablePositionIncrements, input)
 {
     this.useWhiteList = useWhiteList;
     this.stopTypes = stopTypes;
     typeAttribute = AddAttribute<ITypeAttribute>();
 }
        } // End Sub BuildIndex

        // https://lucenenet.apache.org/
        // https://www.codeproject.com/Articles/609980/Small-Lucene-NET-Demo-App
        // https://stackoverflow.com/questions/12600196/lucene-how-to-index-file-names
        /// <summary>
        /// Searches the "file_name" field of the index at <paramref name="indexPath"/> for
        /// <paramref name="phrase"/> (escaped, so user input is treated literally) and
        /// prints the score and "full_name" of up to 10 hits to the console.
        /// </summary>
        private static void SearchPath(string phrase, string indexPath)
        {
            Lucene.Net.Util.LuceneVersion version = Lucene.Net.Util.LuceneVersion.LUCENE_48;

            // BUGFIX: FSDirectory and IndexReader are IDisposable and were never released;
            // dispose them deterministically so index file handles are not leaked.
            using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(indexPath))
            using (Lucene.Net.Index.IndexReader r = Lucene.Net.Index.DirectoryReader.Open(luceneIndexDirectory))
            {
                Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher(r);
                Lucene.Net.Analysis.Analyzer    analyzer = GetWrappedAnalyzer();

                Lucene.Net.QueryParsers.Classic.QueryParser parser = new Lucene.Net.QueryParsers.Classic.QueryParser(version, "file_name", analyzer);

                // https://stackoverflow.com/questions/15170097/how-to-search-across-all-the-fields
                // Lucene.Net.QueryParsers.Classic.MultiFieldQueryParser parser = new Lucene.Net.QueryParsers.Classic.MultiFieldQueryParser(version, GetFields(r), analyzer);

                // Escape() neutralizes query syntax in the user-supplied phrase.
                Lucene.Net.Search.Query query = parser.Parse(Lucene.Net.QueryParsers.Classic.QueryParser.Escape(phrase));

                Lucene.Net.Search.ScoreDoc[] hits = searcher.Search(query, 10).ScoreDocs;
                foreach (Lucene.Net.Search.ScoreDoc hit in hits)
                {
                    Lucene.Net.Documents.Document foundDoc = searcher.Doc(hit.Doc);
                    System.Console.WriteLine(hit.Score);
                    string full_name = foundDoc.Get("full_name");
                    System.Console.WriteLine(full_name);
                    // string favoritePhrase = foundDoc.Get("favoritePhrase");
                    // System.Console.WriteLine(favoritePhrase);
                } // Next hit
            }
        }     // End Sub SearchPath
 /// <summary>
 /// Full constructor: records the stop-type set and the white-list flag and
 /// acquires the stream's <see cref="ITypeAttribute"/>.
 /// </summary>
 public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
     : base(version, enablePositionIncrements, input)
 {
     this.stopTypes    = stopTypes;
     this.useWhiteList = useWhiteList;
     typeAttribute     = AddAttribute<ITypeAttribute>();
 }
Example #5
0
 /// <summary>
 /// Create a new <seealso cref="LengthFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="CharTermAttribute#length()"/>
 /// &lt; min) or too long (<seealso cref="CharTermAttribute#length()"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 /// <exception cref="System.ArgumentException"> if min is negative or min &gt; max </exception>
 public LengthFilter(Version version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     if (min < 0)
     {
         throw new System.ArgumentException("minimum length must be greater than or equal to zero");
     }
     if (min > max)
     {
         // BUGFIX: the condition rejects max < min, so the message must say
         // "less than", not "greater than" (matches upstream Lucene).
         throw new System.ArgumentException("maximum length must not be less than minimum length");
     }
     this.min = min;
     this.max = max;
 }
        internal V[] values;                    // package private because used in CharArraySet's non Set-conform CharArraySetIterator

        /// <summary>
        /// Create map with enough capacity to hold startSize terms
        /// </summary>
        /// <param name="matchVersion">
        ///          compatibility match version see <a href="#version">Version
        ///          note</a> above for details. </param>
        /// <param name="startSize">
        ///          the initial capacity </param>
        /// <param name="ignoreCase">
        ///          <code>false</code> if and only if the set should be case sensitive
        ///          otherwise <code>true</code>. </param>
        public CharArrayMap(Lucene.Net.Util.LuceneVersion matchVersion, int startSize, bool ignoreCase)
        {
            this.ignoreCase = ignoreCase;
            int size_Renamed = INIT_SIZE;

            // Grow to the next power of two that leaves ~25% headroom over startSize.
            while (startSize + (startSize >> 2) > size_Renamed)
            {
                size_Renamed <<= 1;
            }
            keys = new char[size_Renamed][];
            // BUGFIX: '(V[])new object[n]' is a Java type-erasure idiom; in C# this cast
            // throws InvalidCastException at runtime for any V other than object.
            // C# generics are reified, so the array can be allocated directly.
            values = new V[size_Renamed];
            this.charUtils    = CharacterUtils.GetInstance(matchVersion);
            this.matchVersion = matchVersion;
        }
Example #7
0
 /// <summary>
 /// Create a new <seealso cref="LengthFilter"/>. This will filter out tokens whose
 /// <seealso cref="CharTermAttribute"/> is either too short (<seealso cref="CharTermAttribute#length()"/>
 /// &lt; min) or too long (<seealso cref="CharTermAttribute#length()"/> &gt; max). </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="min">     the minimum length </param>
 /// <param name="max">     the maximum length </param>
 /// <exception cref="System.ArgumentException"> if min is negative or min &gt; max </exception>
 public LengthFilter(Version version, TokenStream @in, int min, int max)
     : base(version, @in)
 {
     if (min < 0)
     {
         throw new System.ArgumentException("minimum length must be greater than or equal to zero");
     }
     if (min > max)
     {
         // BUGFIX: the condition rejects max < min, so the message must say
         // "less than", not "greater than" (matches upstream Lucene).
         throw new System.ArgumentException("maximum length must not be less than minimum length");
     }
     this.min = min;
     this.max = max;
 }
        } // End Function GetWrappedAnalyzer

        /// <summary>
        /// (Re)builds the Lucene index at <paramref name="indexPath"/> from the given file
        /// paths, storing the full path, directory, file name, name without extension and
        /// extension as non-tokenized (StringField) stored fields.
        /// </summary>
        private static void BuildIndex(string indexPath, System.Collections.Generic.IEnumerable <string> dataToIndex)
        {
            Lucene.Net.Util.LuceneVersion version = Lucene.Net.Util.LuceneVersion.LUCENE_48;

            // BUGFIX: FSDirectory is IDisposable and was never released; disposing the
            // IndexWriter does not dispose the Directory it writes to.
            using (Lucene.Net.Store.Directory luceneIndexDirectory = Lucene.Net.Store.FSDirectory.Open(indexPath))
            {
                // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Core.WhitespaceAnalyzer(version);
                // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(version);
                // Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Core.KeywordAnalyzer();
                Lucene.Net.Analysis.Analyzer analyzer = GetWrappedAnalyzer();

                Lucene.Net.Index.IndexWriterConfig writerConfig = new Lucene.Net.Index.IndexWriterConfig(version, analyzer);
                writerConfig.OpenMode = Lucene.Net.Index.OpenMode.CREATE; // Overwrite, if exists

                using (Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(luceneIndexDirectory, writerConfig))
                {
                    foreach (string thisValue in dataToIndex)
                    {
                        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

                        string directory_name        = System.IO.Path.GetDirectoryName(thisValue);
                        string file_name             = System.IO.Path.GetFileName(thisValue);
                        string filename_no_extension = System.IO.Path.GetFileNameWithoutExtension(thisValue);
                        string extension             = System.IO.Path.GetExtension(thisValue);

                        // StringField indexes but doesn't tokenize
                        doc.Add(new Lucene.Net.Documents.StringField("full_name", thisValue, Lucene.Net.Documents.Field.Store.YES));
                        doc.Add(new Lucene.Net.Documents.StringField("directory_name", directory_name, Lucene.Net.Documents.Field.Store.YES));
                        doc.Add(new Lucene.Net.Documents.StringField("file_name", file_name, Lucene.Net.Documents.Field.Store.YES));
                        doc.Add(new Lucene.Net.Documents.StringField("filename_no_extension", filename_no_extension, Lucene.Net.Documents.Field.Store.YES));
                        doc.Add(new Lucene.Net.Documents.StringField("extension", extension, Lucene.Net.Documents.Field.Store.YES));
                        // doc.Add( new Lucene.Net.Documents.TextField("favoritePhrase", thisValue, Lucene.Net.Documents.Field.Store.YES) );

                        writer.AddDocument(doc);
                    } // Next thisValue

                    // writer.Optimize();
                    writer.Flush(true, true);
                } // Dispose needs to be called, otherwise the index cannot be read ...
            }
        }     // End Sub BuildIndex
Example #9
0
 /// <summary>
 /// Stores the keep-set and registers the <see cref="ICharTermAttribute"/> on the stream.
 /// </summary>
 public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
     : base(version, enablePositionIncrements, @in)
 {
     termAtt = AddAttribute<ICharTermAttribute>();
     this.words = words;
 }
Example #10
0
 /// <summary>
 /// Convenience constructor: equivalent to the four-argument overload with
 /// useWhiteList = <c>false</c>, i.e. tokens whose type appears in
 /// <paramref name="stopTypes"/> are filtered out.
 /// </summary>
 /// <seealso cref= #TypeTokenFilter(Version, TokenStream, Set, boolean) </seealso>
 public TypeTokenFilter(Version version, TokenStream input, HashSet <string> stopTypes)
     : this(version, input, stopTypes, false)
 {
 }
 /// <summary>
 /// Blacklist convenience constructor: delegates to the five-argument overload
 /// with useWhiteList = false, so matching token types are filtered out.
 /// </summary>
 public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes)
     : this(version, enablePositionIncrements, input, stopTypes, false)
 {
 }
Example #12
0
 /// <summary>
 /// Blacklist convenience constructor: delegates to the five-argument overload
 /// with useWhiteList = false, so matching token types are filtered out.
 /// </summary>
 public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet <string> stopTypes)
     : this(version, enablePositionIncrements, input, stopTypes, false)
 {
 }
Example #13
0
 /// <summary>
 /// Builds a <seealso cref="TypeTokenFilter"/> over <paramref name="input"/>.
 /// When <paramref name="useWhiteList"/> is true the listed types are kept;
 /// otherwise they are filtered out of the stream. </summary>
 /// <param name="version">      the Lucene match version </param>
 /// <param name="input">        the token stream to consume </param>
 /// <param name="stopTypes">    the token types to match against </param>
 /// <param name="useWhiteList"> keep (true) or remove (false) matching tokens </param>
 public TypeTokenFilter(Version version, TokenStream input, HashSet <string> stopTypes, bool useWhiteList)
     : base(version, input)
 {
     this.useWhiteList = useWhiteList;
     this.stopTypes = stopTypes;
 }
 /// <summary>
 /// Create a new <seealso cref="TypeTokenFilter"/> that filters tokens out
 /// (useWhiteList=false). </summary>
 /// <param name="version">   the Lucene match version </param>
 /// <param name="input">     the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="stopTypes"> the token types to filter out </param>
 /// <seealso cref= #TypeTokenFilter(Version, TokenStream, Set, boolean) </seealso>
 public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes)
     : this(version, input, stopTypes, false)
 {
 }
 /// <summary>
 /// Builds a <seealso cref="TypeTokenFilter"/> over <paramref name="input"/>.
 /// When <paramref name="useWhiteList"/> is true the listed types are kept;
 /// otherwise they are filtered out of the stream. </summary>
 /// <param name="version">      the Lucene match version </param>
 /// <param name="input">        the token stream to consume </param>
 /// <param name="stopTypes">    the token types to match against </param>
 /// <param name="useWhiteList"> keep (true) or remove (false) matching tokens </param>
 public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
     : base(version, input)
 {
     this.useWhiteList = useWhiteList;
     this.stopTypes = stopTypes;
 }
Example #16
0
 /// <summary>
 /// Creates the filter and records whether position increments are preserved.
 /// </summary>
 public FilteringTokenFilter(Lucene.Net.Util.LuceneVersion version, bool enablePositionIncrements, TokenStream input)
     : this(version, input)
 {
     // Validate the flag against the match version BEFORE recording it;
     // NOTE(review): CheckPositionIncrement presumably throws for unsupported
     // version/flag combinations — confirm against its definition.
     CheckPositionIncrement(version, enablePositionIncrements);
     this.enablePositionIncrements = enablePositionIncrements;
 }
Example #17
0
 /// <summary>
 /// Wraps the stream, stores the keep-set and registers the
 /// <see cref="ICharTermAttribute"/>.
 /// </summary>
 public KeepWordFilter(Version version, bool enablePositionIncrements, TokenStream @in, CharArraySet words)
     : base(version, enablePositionIncrements, @in)
 {
     termAtt   = AddAttribute<ICharTermAttribute>();
     this.words = words;
 }
Example #18
0
 /// <summary>
 /// Create a new <seealso cref="KeepWordFilter"/>.
 /// <para><b>NOTE</b>: The words set passed to this constructor is used directly
 /// by the filter (not copied), so it must not be modified afterwards.
 /// </para>
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="words">   the words to keep </param>
 public KeepWordFilter(Version version, TokenStream @in, CharArraySet words)
     : base(version, @in)
     => this.words = words;
        /// <summary>
        /// Verifies that the delegating "Synonym" TokenFilterFactory forwards arguments to its
        /// tokenizerFactory both in plain form and with the "tokenizerFactory." prefix, and that
        /// the sub-factory still validates its own arguments (missing and unexpected ones).
        /// </summary>
        /// <param name="ver"> the Lucene match version handed to the factory </param>
        /// <param name="delegatorClass"> the concrete delegator type expected by AssertDelegator </param>
        protected internal virtual void DoTestTokenizerFactoryArguments(Version ver, Type delegatorClass)
        {
            string clazz = typeof(PatternTokenizerFactory).AssemblyQualifiedName;
            TokenFilterFactory factory = null;

            // simple arg form
            factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "pattern", "(.*)", "group", "0");
            AssertDelegator(factory, delegatorClass);

            // prefix
            factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.group", "0");
            AssertDelegator(factory, delegatorClass);

            // sanity check that sub-PatternTokenizerFactory fails w/o pattern
            try
            {
                factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz);
                fail("tokenizerFactory should have complained about missing pattern arg");
            }
            catch (Exception)
            {
                // :NOOP: expected failure
            }

            // sanity check that sub-PatternTokenizerFactory fails on unexpected
            try
            {
                factory = TokenFilterFactory("Synonym", ver, "synonyms", "synonyms.txt", "tokenizerFactory", clazz, "tokenizerFactory.pattern", "(.*)", "tokenizerFactory.bogusbogusbogus", "bogus", "tokenizerFactory.group", "0");
                // BUGFIX: this branch tests the unexpected/bogus argument, not a missing pattern;
                // the old message was copy-pasted from the branch above.
                fail("tokenizerFactory should have complained about the unexpected bogus arg");
            }
            catch (Exception)
            {
                // :NOOP: expected failure
            }
        }
Example #20
0
 /// <summary>
 /// Create a new <seealso cref="KeepWordFilter"/>.
 /// <para><b>NOTE</b>: The words set passed to this constructor will be directly
 /// used by this filter (stored by reference, not copied) and should not be modified.
 /// </para>
 /// </summary>
 /// <param name="version"> the Lucene match version </param>
 /// <param name="in">      the <seealso cref="TokenStream"/> to consume </param>
 /// <param name="words">   the words to keep </param>
 public KeepWordFilter(Version version, TokenStream @in, CharArraySet words)
     : base(version, @in)
 {
     this.words = words;
 }