/// <summary>
/// Runs the keyword-refinement selection over the index at <c>inputpath</c> and
/// writes the selected documents to a new index at <c>outputpath</c>.
/// </summary>
/// <remarks>
/// Steps, as implemented below:
/// 1. Search <c>searchfield</c> with the current <c>keywords</c>, capped at
///    <c>searchDocRatio</c> times the number of documents in the index.
/// 2. Up to five times (skipped entirely when <c>threshold</c> is 0): count the
///    tokens of the hit documents, take the <c>keywordNum</c> most frequent ones as
///    the new keywords, search again, and stop once the fraction of previous hits
///    that reappear is no longer below <c>threshold</c>.
/// 3. Search once more, capped at <c>saveDocRatio</c> times the number of documents,
///    and add every hit to the output index.
/// 4. When <c>isPrintRemovedDocuments</c> is set, write every document that was not
///    selected to <c>removeDocuments.txt</c> in the output directory.
/// Finally prints "Done" and waits for a key press.
/// Side effects on fields: <c>outputpath</c> may get a trailing backslash appended and
/// <c>keywords</c> is replaced by the refined keyword list.
/// </remarks>
public void Start()
{
    // Upper bound on keyword-refinement rounds.
    const int MaxIterations = 5;

    if (!outputpath.EndsWith("\\", StringComparison.Ordinal))
    {
        outputpath += "\\";
    }

    var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
    var searcher = LuceneOperations.GetIndexSearcher(inputpath);
    try
    {
        var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
        var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

        int iter = 0;
        bool bContinue = threshold != 0;
        while (bContinue && iter < MaxIterations)
        {
            iter++;
            Console.WriteLine("iteration------------------" + iter);

            // Calculate keywords: most frequent tokens across the current hits.
            var counter = new Counter<string>();
            foreach (var scoredDoc in scoredDocs)
            {
                var doc = searcher.Doc(scoredDoc.doc);
                var content = doc.Get(searchfield);
                foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
                {
                    counter.Add(word);
                }
            }
            List<string> keywordsNew = counter.GetMostFreqObjs(keywordNum);

            var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);

            // Test whether to exit: how many of the new hits were already in the previous hits?
            var docIDs = new HashSet<int>();
            foreach (var scoredDoc in scoredDocs)
            {
                docIDs.Add(scoredDoc.doc);
            }

            int repeatNum = 0;
            foreach (var scoredDocNew in scoredDocsNew)
            {
                if (docIDs.Contains(scoredDocNew.doc))
                {
                    repeatNum++;
                }
            }

            // With zero previous hits the ratio is NaN, the comparison is false, and the loop ends.
            bContinue = (double)repeatNum / scoredDocs.Length < threshold;

            Console.WriteLine(repeatNum + " " + scoredDocsNew.Length);

            keywords = keywordsNew;
            scoredDocs = scoredDocsNew;

            Console.WriteLine(StringOperations.GetMergedString(keywords));
        }

        // Final selection with the (possibly refined) keywords.
        max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
        scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

        var writer = LuceneOperations.GetIndexWriter(outputpath);
        try
        {
            foreach (var scoredDoc in scoredDocs)
            {
                Document doc = searcher.Doc(scoredDoc.doc);
                writer.AddDocument(doc);
            }
            writer.Optimize();
        }
        finally
        {
            // Always close the writer so it is not left open if adding or optimizing fails.
            writer.Close();
        }

        if (isPrintRemovedDocuments)
        {
            var selectedDocIDs = new HashSet<int>();
            foreach (var scoredDoc in scoredDocs)
            {
                selectedDocIDs.Add(scoredDoc.doc);
            }

            var reader = searcher.GetIndexReader();
            try
            {
                // 'using' guarantees the text file is flushed and closed even on failure.
                using (var sw = new StreamWriter(outputpath + "removeDocuments.txt"))
                {
                    // Document ids range over [0, MaxDoc()); NumDocs() excludes deleted
                    // documents, so it is not a valid upper bound when deletions exist.
                    int maxDoc = reader.MaxDoc();
                    for (int iDoc = 0; iDoc < maxDoc; iDoc++)
                    {
                        if (reader.IsDeleted(iDoc) || selectedDocIDs.Contains(iDoc))
                        {
                            continue;
                        }
                        sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                    }
                }
            }
            finally
            {
                reader.Close();
            }
        }
    }
    finally
    {
        searcher.Close();
    }

    Console.WriteLine("Done");
    Console.ReadKey();
}
/// <summary>
/// Returns the string produced by <c>LuceneOperations.GetDocumentString</c> for the given document.
/// </summary>
/// <param name="document">The document to render.</param>
/// <returns>The value returned by <c>LuceneOperations.GetDocumentString</c>.</returns>
private string DocumentToString(Document document)
{
    string rendered = LuceneOperations.GetDocumentString(document);
    return rendered;
}