/// <summary>
/// Searches a Lucene index for documents whose <paramref name="field"/> contains the given
/// keywords, runs sentiment classification on each hit, and prints: overall hit/sentiment
/// ratios, the top positive and negative documents by absolute sentiment score, and
/// (optionally) a per-sentiment histogram of <paramref name="histogramField"/> values.
/// Blocks on <see cref="Console.ReadKey()"/> before returning.
/// </summary>
/// <param name="indexPath">Path of the Lucene index to open.</param>
/// <param name="field">Field that is searched and whose text is sentiment-scored.</param>
/// <param name="keywords">Keywords merged (space-separated) into a single query string.</param>
/// <param name="printDocumentCnt">How many top positive/negative documents to print.</param>
/// <param name="histogramField">Field to histogram per sentiment class; null disables histograms.</param>
public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
{
    // Cap on histogram rows printed per sentiment class (was a magic literal).
    const int MaxHistogramRows = 100;

    var searcher = LuceneOperations.GetIndexSearcher(indexPath);
    var reader = searcher.GetIndexReader();
    var mergedKeywords = StringOperations.GetMergedString(keywords, " ");
    var docIDs = LuceneOperations.Search(searcher, mergedKeywords, field);
    Console.WriteLine("Find {0}% ({1}/{2}) documents containing: {3}",
        (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(), mergedKeywords);

    // Guard: with zero hits the ratio computations below would divide by zero
    // and print "NaN" instead of a meaningful message.
    if (docIDs.Count == 0)
    {
        Console.WriteLine("No matching documents; nothing to analyze.");
        Console.ReadKey();
        return;
    }

    var progress = new ProgramProgress(docIDs.Count);
    var sentiAnalyzer = new SentimentAnalyzer();
    SentimentType sentimentType;
    double sentimentScore;

    // Bounded max-heaps keep only the top-N documents per polarity.
    HeapSortDouble hsdPos = new HeapSortDouble(printDocumentCnt);
    HeapSortDouble hsdNeg = new HeapSortDouble(printDocumentCnt);

    Counter<string> counterPos = null;
    Counter<string> counterNeg = null;
    Counter<string> counterNeu = null;
    if (histogramField != null)
    {
        counterPos = new Counter<string>();
        counterNeg = new Counter<string>();
        counterNeu = new Counter<string>();
    }

    int posCnt = 0;
    int negCnt = 0;
    int neuCnt = 0;
    foreach (var docID in docIDs)
    {
        var document = reader.Document(docID);
        var content = document.Get(field);
        sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

        // Hoisted: the histogram value is the same regardless of sentiment class.
        var histogramValue = histogramField != null ? document.Get(histogramField) : null;

        switch (sentimentType)
        {
            case SentimentType.Positive:
                posCnt++;
                hsdPos.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterPos.Add(histogramValue);
                }
                break;
            case SentimentType.Negative:
                negCnt++;
                hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                if (histogramField != null)
                {
                    counterNeg.Add(histogramValue);
                }
                break;
            case SentimentType.Neutral:
                neuCnt++;
                if (histogramField != null)
                {
                    counterNeu.Add(histogramValue);
                }
                break;
            default:
                throw new NotImplementedException();
        }
        progress.PrintIncrementExperiment();
    }

    Console.WriteLine("Positive document ratio {0}% ({1}/{2})",
        Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
    // Fixed typo in output: "Negatvie" -> "Negative".
    Console.WriteLine("Negative document ratio {0}% ({1}/{2})",
        Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
    Console.WriteLine("Neutral document ratio {0}% ({1}/{2})",
        Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);

    Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
    foreach (var kvp in hsdPos.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
    foreach (var kvp in hsdNeg.GetSortedDictionary())
    {
        Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
    }
    progress.PrintTotalTime();

    if (histogramField != null)
    {
        string[] featureStrings = new[] { "Pos", "Neg", "Neu" };
        Counter<string>[] counters = new[] { counterPos, counterNeg, counterNeu };
        for (int i = 0; i < featureStrings.Length; i++)
        {
            Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
            int index = 0;
            // Lambda parameter renamed from "kvp": it shadowed the foreach
            // iteration variable of the same name (compile error CS0136).
            foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(pair => pair.Value))
            {
                Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                if (++index >= MaxHistogramRows)
                {
                    break;
                }
            }
        }
    }
    Console.ReadKey();
}
/// <summary>
/// Iteratively refines a keyword set against a Lucene index and writes the final
/// matching documents to a new index at <c>outputpath</c>:
/// 1. Search with the current keywords (top <c>searchDocRatio</c> fraction of docs).
/// 2. Recompute the most frequent tokens of the hits as the new keyword set.
/// 3. Repeat while the overlap ratio with the previous hit set stays below
///    <c>threshold</c> (at most <see cref="MaxIterations"/> rounds; a threshold of 0
///    disables iteration entirely).
/// Finally re-searches with <c>saveDocRatio</c>, saves the hits to the output index,
/// and optionally dumps the non-selected documents to "removeDocuments.txt".
/// Blocks on <see cref="Console.ReadKey()"/> before returning.
/// </summary>
public void Start()
{
    // Upper bound on refinement rounds (was a magic literal).
    const int MaxIterations = 5;

    if (!outputpath.EndsWith("\\"))
    {
        outputpath += "\\";
    }

    var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);
    var searcher = LuceneOperations.GetIndexSearcher(inputpath);
    var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
    var scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    int iter = 0;
    // threshold == 0 means "no iterative refinement".
    bool bContinue = threshold != 0;
    while (bContinue && iter < MaxIterations)
    {
        iter++;
        Console.WriteLine("iteration------------------" + iter);

        List<string> keywordsNew;

        #region Calculate Keywords
        // Token frequency over the current hit set determines the next keyword set.
        var counter = new Counter<string>();
        foreach (var scoredDoc in scoredDocs)
        {
            var doc = searcher.Doc(scoredDoc.doc);
            var content = doc.Get(searchfield);
            foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
            {
                counter.Add(word);
            }
        }
        keywordsNew = counter.GetMostFreqObjs(keywordNum);
        #endregion

        var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);

        #region Test whether exit
        // Count how many of the new hits were already in the previous hit set.
        int repeatNum = 0;
        var docIDs = new HashSet<int>();
        foreach (var scoredDoc in scoredDocs)
        {
            docIDs.Add(scoredDoc.doc);
        }
        foreach (var scoredDocNew in scoredDocsNew)
        {
            if (docIDs.Contains(scoredDocNew.doc))
            {
                repeatNum++;
            }
        }
        // Keep iterating while the hit sets are still dissimilar (overlap below threshold).
        bContinue = (double)repeatNum / scoredDocs.Length < threshold;
        #endregion

        Console.WriteLine(repeatNum + " " + scoredDocsNew.Length);

        keywords = keywordsNew;
        scoredDocs = scoredDocsNew;
        Console.WriteLine(StringOperations.GetMergedString(keywords));
    }

    // Final search with the save ratio (may differ from the search ratio used above).
    max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
    scoredDocs = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

    var writer = LuceneOperations.GetIndexWriter(outputpath);
    foreach (var scoredDoc in scoredDocs)
    {
        Document doc = searcher.Doc(scoredDoc.doc);
        writer.AddDocument(doc);
    }
    writer.Optimize();
    writer.Close();

    if (isPrintRemovedDocuments)
    {
        // using-block guarantees the file handle is released even if writing throws
        // (the original leaked the StreamWriter on an exception path).
        using (var sw = new StreamWriter(outputpath + "removeDocuments.txt"))
        {
            var selectedDocIDs = new HashSet<int>();
            foreach (var scoredDoc in scoredDocs)
            {
                selectedDocIDs.Add(scoredDoc.doc);
            }
            var reader = searcher.GetIndexReader();
            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                if (!selectedDocIDs.Contains(iDoc))
                {
                    sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                }
            }
            reader.Close();
            sw.Flush();
        }
    }

    searcher.Close();
    Console.WriteLine("Done");
    Console.ReadKey();
}