public void DeleteLuceneIndexRecord(string cin)
        {
            var path         = ConfigurationManager.AppSettings["LuceneDirectory"];
            var indexDirInfo = new DirectoryInfo(path);

            if (!System.IO.Directory.Exists(path))
            {
                System.IO.Directory.CreateDirectory(path);
            }

            Directory directory = FSDirectory.Open(indexDirInfo,
                                                   new SimpleFSLockFactory(indexDirInfo));

            IndexWriter.Unlock(directory);
            // init lucene
            var analyzer = new KeywordAnalyzer();

            using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // remove the older index entry (exact match on the untokenized "Cin" field)
                var searchQuery = new TermQuery(new Term("Cin", cin));
                writer.DeleteDocuments(searchQuery);
            }

            // the using block disposes the writer; only the analyzer still needs closing
            analyzer.Close();
        }
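A delete by TermQuery on "Cin" only works if the record was indexed with an untokenized "Cin" field. Below is a minimal sketch of the matching add/update side, assuming the same directory setup; the method name and the extra "Title" field are illustrative, not part of the original project.

        public void AddUpdateLuceneIndexRecord(string cin, string title)
        {
            var path         = ConfigurationManager.AppSettings["LuceneDirectory"];
            var indexDirInfo = new DirectoryInfo(path);

            Directory directory = FSDirectory.Open(indexDirInfo,
                                                   new SimpleFSLockFactory(indexDirInfo));

            var analyzer = new KeywordAnalyzer();

            using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                var doc = new Document();
                // NOT_ANALYZED stores the value as a single term, so TermQuery(new Term("Cin", cin)) can find it later
                doc.Add(new Field("Cin", cin, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("Title", title, Field.Store.YES, Field.Index.ANALYZED));

                // UpdateDocument removes any document carrying the same "Cin" term and adds the new one in one step
                writer.UpdateDocument(new Term("Cin", cin), doc);
            }

            analyzer.Close();
        }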
Example #2
        public static Query BuildQuery(string query, IndexQuery indexQuery, PerFieldAnalyzerWrapper analyzer)
        {
            var      originalQuery   = query;
            Analyzer keywordAnalyzer = new KeywordAnalyzer();

            try
            {
                var queryParser = new RangeQueryParser(Version.LUCENE_29, indexQuery.DefaultField ?? string.Empty, analyzer)
                {
                    DefaultOperator = indexQuery.DefaultOperator == QueryOperator.Or
                                                                ? QueryParser.Operator.OR
                                                                : QueryParser.Operator.AND,
                    AllowLeadingWildcard = true
                };
                query = PreProcessUntokenizedTerms(query, queryParser);
                query = PreProcessSearchTerms(query);
                query = PreProcessDateTerms(query, queryParser);
                return queryParser.Parse(query);
            }
            catch (ParseException pe)
            {
                if (originalQuery == query)
                {
                    throw new ParseException("Could not parse: '" + query + "'", pe);
                }
                throw new ParseException("Could not parse modified query: '" + query + "' original was: '" + originalQuery + "'", pe);
            }
            finally
            {
                keywordAnalyzer.Close();
            }
        }
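A minimal usage sketch for this overload; the searcher, the indexQuery instance, and the field names are assumed and illustrative rather than taken from the original project. KeywordAnalyzer as the wrapper's default keeps untokenized fields as exact-match terms, while per-field overrides handle full-text fields.

        var perFieldAnalyzer = new PerFieldAnalyzerWrapper(new KeywordAnalyzer());
        perFieldAnalyzer.AddAnalyzer("Title", new StandardAnalyzer(Version.LUCENE_29));

        Query parsed = BuildQuery("Title:lucene AND Year:[2001 TO 2010]", indexQuery, perFieldAnalyzer);
        TopDocs hits = searcher.Search(parsed, 10);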
Example #3
        public static Query BuildQuery(string query, PerFieldAnalyzerWrapper analyzer)
        {
            var keywordAnalyzer = new KeywordAnalyzer();

            try
            {
                query = PreProcessUntokenizedTerms(analyzer, query, keywordAnalyzer);
                var queryParser = new RangeQueryParser(Version.LUCENE_29, "", analyzer);
                queryParser.SetAllowLeadingWildcard(true);
                return queryParser.Parse(query);
            }
            finally
            {
                keywordAnalyzer.Close();
            }
        }
Example #4
        public static Query BuildQuery(string query, PerFieldAnalyzerWrapper analyzer)
        {
            Analyzer keywordAnalyzer = new KeywordAnalyzer();

            try
            {
                var queryParser = new RangeQueryParser(Version.LUCENE_29, string.Empty, analyzer);
                query = PreProcessUntokenizedTerms(query, queryParser);
                query = PreProcessSearchTerms(query);
                queryParser.SetAllowLeadingWildcard(true);                 // not the recommended approach, should rather use ReverseFilter
                return queryParser.Parse(query);
            }
            finally
            {
                keywordAnalyzer.Close();
            }
        }
        protected bool DoIndex(IList <ProductSynonym> synonyms, bool append, bool optimize)
        {
            if (append && !Directory.Exists(IdxDir))
            {
                return false;
            }
            var idxDirectory = FSDirectory.Open(new DirectoryInfo(IdxDir));
            var analyzer     = new KeywordAnalyzer();
            var writer       = new IndexWriter(idxDirectory, analyzer, !append, IndexWriter.MaxFieldLength.UNLIMITED);

            try {
                _logger.Info("Starting synonym indexing...");
                foreach (var synonym in synonyms)
                {
                    var synstr = synonym.Synonym.Replace("\"", "_QUOTE_").Replace("\\", "_LSLASH_");
                    var doc    = new Document();
                    doc.Add(
                        new Field(
                            "FirmCode",
                            synonym.Price.Supplier.Id.ToString(),
                            Field.Store.YES,
                            Field.Index.NO));
                    doc.Add(
                        new Field(
                            "FirmName",
                            synonym.Price.Supplier.Name + " (" + synonym.Price.Supplier.FullName + ")",
                            Field.Store.YES,
                            Field.Index.NO));
                    doc.Add(
                        new Field(
                            "PriceCode",
                            synonym.Price.Id.ToString(),
                            Field.Store.YES,
                            Field.Index.NO));
                    doc.Add(
                        new Field(
                            "ProductId",
                            synonym.Product.Id.ToString(),
                            Field.Store.YES,
                            Field.Index.NO));
                    doc.Add(
                        new Field(
                            "Junk",
                            synonym.Junk.ToString(),
                            Field.Store.YES,
                            Field.Index.NO));
                    doc.Add(
                        new Field(
                            "Synonym",
                            synstr.Trim().ToUpper(),
                            Field.Store.YES,
                            Field.Index.ANALYZED));
                    writer.AddDocument(doc);
                }
                if (optimize)
                {
                    _logger.Info("Optimizing the index...");
                    writer.Optimize();
                }
            }
            finally {
                writer.Close();
                analyzer.Close();
                idxDirectory.Close();
            }
            _logger.Info("Indexing finished");
            return true;
        }
        private void DoMatching()
        {
            _logger.InfoFormat("Starting matching for {0} positions", names.Count());
            var  idxDirectory = FSDirectory.Open(new DirectoryInfo(handler.IdxDir));
            var  reader       = IndexReader.Open(idxDirectory, true);
            var  searcher     = new IndexSearcher(reader);
            var  analyzer     = new KeywordAnalyzer();
            var  parser       = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "Synonym", analyzer);
            uint counter      = 0;

            try {
                foreach (var position in names)
                {
                    if (stopped)
                    {
                        _logger.Info("Matching canceled");
                        State = TaskState.Canceled;
                        return;
                    }
                    string name = position.Trim().ToUpper().Replace("\"", "_QUOTE_").Replace("\\", "_LSLASH_");                     // for some reason KeywordAnalyzer does not find phrases that contain quotation marks

                    var query = parser.Parse(String.Format("Synonym:\"{0}\"", name));
                    name = name.Replace("_QUOTE_", "\"").Replace("_LSLASH_", "\\");
                    if (matches.ContainsKey(name))
                    {
                        continue;
                    }
                    var collector = TopScoreDocCollector.create(10000, true);
                    searcher.Search(query, collector);
                    var hits = collector.TopDocs().scoreDocs;
                    foreach (var scoreDoc in hits)
                    {
                        var document = searcher.Doc(scoreDoc.doc);
                        var pcode    = Convert.ToUInt32(document.Get("PriceCode"));
                        // if a synonym with this PriceCode already exists, do not add it to the result set
                        if (priceCode == pcode)
                        {
                            if (matches.ContainsKey(name))
                            {
                                matches.Remove(name);
                            }
                            break;
                        }
                        if (!matches.ContainsKey(name))
                        {
                            matches[name] = new SynonymSummary(position);
                        }
                        matches[name].AddInfo(Convert.ToUInt32(document.Get("FirmCode")),
                                              document.Get("FirmName"),
                                              Convert.ToUInt32(document.Get("PriceCode")),
                                              Convert.ToUInt32(document.Get("ProductId")),
                                              Convert.ToBoolean(document.Get("Junk")));
                    }
                    counter++;
                    Rate = (uint)(counter * 100 / names.Count());
                }
            }
            finally {
                reader.Close();
                searcher.Close();
                analyzer.Close();
                idxDirectory.Close();
                StopDate = DateTime.UtcNow;
            }
            State = TaskState.Success;
            _logger.Info("Matching finished");
        }
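        // Why both DoIndex and DoMatching substitute quotes and backslashes: QueryParser treats " and \ as
        // syntax characters, so a raw synonym cannot be embedded in a phrase query as-is. A minimal sketch of
        // the round-trip the two methods rely on; the helper name is illustrative, and QueryParser.Escape would
        // be an alternative if the index stored the original characters.
        private Query BuildExactSynonymQuery(string rawName)
        {
            // apply the same substitution DoIndex used when the synonym was indexed
            var escaped = rawName.Trim().ToUpper().Replace("\"", "_QUOTE_").Replace("\\", "_LSLASH_");
            var parser  = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "Synonym", new KeywordAnalyzer());
            // KeywordAnalyzer emits the whole value as a single token, so this is an exact, untokenized match
            return parser.Parse(String.Format("Synonym:\"{0}\"", escaped));
        }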
        /*
         * Word segmentation function
         * @srcdata: the text to segment
         * Return value: the segmentation result as a string, in the format defined by our predecessors,
         * i.e. {<word 1>}{<word 2>}...{<word n>}
         */

        //	This function is the core.
        //	Its input is the content to segment,
        //	and its output is the segmentation result.
        //	The result format is {<word>}.
        //	This format was defined by our predecessors; we kept it so as not to break the established interface.
        //	The function works mainly by calling the Lucene.Net.Analysis and Lucene.China APIs.
        //	Setting up these two APIs is simple: 1. add the dll files as references; 2. put a data folder in the
        //	executable's directory containing two files, sDict and sNoise, which store the dictionary and the noise words.
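
        // A minimal sketch of producing the "{<word>}" format described above by reusing cutwords (defined below);
        // the method name is illustrative and assumes the legacy output format is still wanted.
        public string SegmentToLegacyFormat(string srcdata)
        {
            var sb = new System.Text.StringBuilder();
            foreach (var word in cutwords(srcdata))
            {
                // wrap each token in the {<...>} markers required by the original interface
                sb.Append("{<" + word + ">}");
            }
            return sb.ToString();
        }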

        /*private bool isChineseWord(string word)
         * {
         *  if (word == null)
         *  {
         *      return false;
         *  }
         *  for (int i = 0; i < word.Length; i++)
         *  {
         *      char chr = word[i];
         *      if (!(chr >= 0x4E00 && chr <= 0x9FFF))
         *      {
         *          return false;
         *      }
         *  }
         *
         *  return true;
         * }*/

        /*private string word_seg(string srcdata)
         * {
         *  //StringBuilder sb = new StringBuilder();
         *  //sb.Remove(0, sb.Length);
         *  string t1 = "";
         *  ChineseAnalyzer analyzer = new Lucene.China.ChineseAnalyzer();
         *  //string FilePath = @"C:\Users\梁亦清\Documents\Visual Studio 2013\Projects\中科院分词简例\1.htm";
         *
         *  StringReader sr = new StringReader(srcdata);
         *  //Console.WriteLine(sr.ToString());
         *  //Environment.Exit(0);
         *  TokenStream stream = analyzer.TokenStream("", sr);
         *
         *  //long begin = System.DateTime.Now.Ticks;
         *  Lucene.Net.Analysis.Token t = stream.Next();
         *  while (t != null)
         *  {
         *      t1 = t.ToString();   // display format: (keyword,0,2), needs further processing
         *      t1 = t1.Replace("(", "");
         *      char[] separator = { ',' };
         *      t1 = t1.Split(separator)[0];
         *      if (isChineseWord(t1))
         *      {
         *          sb.Append("{<" + t1 + ">}");
         *      }
         *      t = stream.Next();
         *  }
         *  //return sb.ToString()
         * }*/



        //	This function is the public interface of our predecessors' code; we kept the interface, but the segmentation method used is not naive Bayes.

        /*public string DoWordSegment(string strIn)
         * {
         *  return word_seg(strIn);
         *
         * }*/

        public List <string> cutwords(string words, string analyzer = "Lucene.China.ChineseAnalyzer")
        {
            var results = new List<string>();

            // pick the analyzer by its fully qualified type name; ChineseAnalyzer is the default
            Analyzer analyzerInstance;
            switch (analyzer)
            {
            case "Lucene.Net.Analysis.SimpleAnalyzer":
                analyzerInstance = new SimpleAnalyzer();
                break;

            case "Lucene.Net.Analysis.KeywordAnalyzer":
                analyzerInstance = new KeywordAnalyzer();
                break;

            case "Lucene.Net.Analysis.StopAnalyzer":
                analyzerInstance = new StopAnalyzer();
                break;

            case "Lucene.Net.Analysis.WhitespaceAnalyzer":
                analyzerInstance = new WhitespaceAnalyzer();
                break;

            case "Lucene.Net.Analysis.PanGu.PanGuAnalyzer":
                // PanGu needs its dictionary configuration loaded before first use
                PanGu.Segment.Init(@"G:\CProjects\Pipeline\pipeline\Pipeline\bin\Release\PanGu.xml");
                analyzerInstance = new PanGuAnalyzer();
                break;

            case "Lucene.Net.Analysis.Standard.StandardAnalyzer":
                analyzerInstance = new StandardAnalyzer();
                break;

            case "Lucene.China.ChineseAnalyzer":
            default:
                analyzerInstance = new ChineseAnalyzer();
                break;
            }

            // tokenize the input and collect the term text of every token
            TokenStream ts = analyzerInstance.TokenStream("", new StringReader(words));
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                results.Add(token.TermText());
            }
            ts.Close();
            analyzerInstance.Close();

            return results;
        }