Exemplo n.º 1
0
        /// <summary>
        /// Iteratively refines <c>keywords</c> against the index at <c>inputpath</c>, then writes the
        /// documents matched by the final keywords into a new index at <c>outputpath</c>.
        /// </summary>
        /// <remarks>
        /// Each iteration tokenizes the <c>searchfield</c> content of the current hits, takes the
        /// <c>keywordNum</c> most frequent tokens as the new keywords and searches again. The loop stops
        /// after at most five iterations, or as soon as the fraction of the previous hits that also
        /// appear among the new hits is no longer below <c>threshold</c>. A <c>threshold</c> of 0 skips
        /// the refinement entirely. When <c>isPrintRemovedDocuments</c> is set, every live document that
        /// was not selected is dumped to <c>removeDocuments.txt</c> inside <c>outputpath</c>.
        /// The method ends by printing "Done" and waiting for a key press.
        /// </remarks>
        public void Start()
        {
            // Ordinal comparison: this is a path-separator check, not a linguistic one.
            if (!outputpath.EndsWith("\\", StringComparison.Ordinal))
            {
                outputpath += "\\";
            }

            var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
            var searcher        = LuceneOperations.GetIndexSearcher(inputpath);

            try
            {
                var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
                var scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

                int  iter      = 0;
                bool bContinue = threshold != 0;

                while (bContinue && iter < 5)
                {
                    iter++;
                    Console.WriteLine("iteration------------------" + iter);

                    #region Calculate Keywords
                    // Count every token found in the search field of the current hits.
                    var counter = new Counter<string>();
                    foreach (var scoredDoc in scoredDocs)
                    {
                        var doc     = searcher.Doc(scoredDoc.doc);
                        var content = doc.Get(searchfield);
                        foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
                        {
                            counter.Add(word);
                        }
                    }
                    List<string> keywordsNew = counter.GetMostFreqObjs(keywordNum);
                    #endregion

                    var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);

                    #region Test whether exit
                    var docIDs = new HashSet<int>();
                    foreach (var scoredDoc in scoredDocs)
                    {
                        docIDs.Add(scoredDoc.doc);
                    }

                    int repeatNum = 0;
                    foreach (var scoredDocNew in scoredDocsNew)
                    {
                        if (docIDs.Contains(scoredDocNew.doc))
                        {
                            repeatNum++;
                        }
                    }

                    // Keep iterating only while the overlap with the previous hits is below the threshold.
                    // With zero previous hits the ratio is NaN, the comparison is false and the loop ends.
                    bContinue = (double)repeatNum / scoredDocs.Length < threshold;
                    #endregion

                    Console.WriteLine(repeatNum + "  " + scoredDocsNew.Length);

                    keywords   = keywordsNew;
                    scoredDocs = scoredDocsNew;

                    Console.WriteLine(StringOperations.GetMergedString(keywords));
                }

                // Final search with the save ratio decides which documents go into the output index.
                max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
                scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

                var writer = LuceneOperations.GetIndexWriter(outputpath);
                try
                {
                    foreach (var scoredDoc in scoredDocs)
                    {
                        Document doc = searcher.Doc(scoredDoc.doc);
                        writer.AddDocument(doc);
                    }
                    writer.Optimize();
                }
                finally
                {
                    // Close the writer even if adding or optimizing fails.
                    writer.Close();
                }

                if (isPrintRemovedDocuments)
                {
                    var selectedDocIDs = new HashSet<int>();
                    foreach (var scoredDoc in scoredDocs)
                    {
                        selectedDocIDs.Add(scoredDoc.doc);
                    }

                    var reader = searcher.GetIndexReader();
                    using (var sw = new StreamWriter(outputpath + "removeDocuments.txt"))
                    {
                        // Document IDs range over [0, MaxDoc()); NumDocs() excludes deleted documents,
                        // so using it as the bound would miss high IDs in an index with deletions.
                        int maxDoc = reader.MaxDoc();
                        for (int iDoc = 0; iDoc < maxDoc; iDoc++)
                        {
                            // A deleted slot has no stored document to print.
                            if (reader.IsDeleted(iDoc))
                            {
                                continue;
                            }

                            if (!selectedDocIDs.Contains(iDoc))
                            {
                                sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                            }
                        }
                    }

                    // NOTE(review): this reader was obtained from the searcher; whether the searcher's
                    // Close() also closes it depends on LuceneOperations.GetIndexSearcher. Kept as before.
                    reader.Close();
                }
            }
            finally
            {
                // Release the searcher on every path, including failures above.
                searcher.Close();
            }

            Console.WriteLine("Done");
            Console.ReadKey();
        }
Exemplo n.º 2
0
 /// <summary>
 /// Returns the text form of <paramref name="document"/> as produced by
 /// <c>LuceneOperations.GetDocumentString</c>.
 /// </summary>
 /// <param name="document">The document to render.</param>
 /// <returns>The string returned by <c>LuceneOperations.GetDocumentString</c>.</returns>
 private string DocumentToString(Document document)
 {
     string rendered = LuceneOperations.GetDocumentString(document);
     return rendered;
 }