/// <summary>
/// Inserts every (key, value) pair of <paramref name="featurevector"/> into a
/// <c>HeapSortDouble</c> built with <paramref name="k"/> and returns the indices
/// that the heap reports via <c>GetTopIndices</c>.
/// </summary>
/// <param name="featurevector">Sparse vector whose <c>keyarray</c>/<c>valuearray</c> are read in parallel.</param>
/// <param name="k">Argument passed to the <c>HeapSortDouble</c> constructor (presumably the number of entries to keep).</param>
/// <returns>The heap's top indices materialized as an array.</returns>
private static int[] getMostFreqWordIndex(SparseVectorList featurevector, int k)
{
    var topHeap = new HeapSortDouble(k);

    var keys = featurevector.keyarray;
    var weights = featurevector.valuearray;
    int entryCount = keys.Length;

    // Insert in original array order so that any tie-breaking inside the heap is unchanged.
    for (int position = 0; position < entryCount; position++)
    {
        topHeap.Insert(keys[position], weights[position]);
    }

    var topIndices = topHeap.GetTopIndices();
    return topIndices.ToArray<int>();
}
/// <summary>
/// Searches the index at <paramref name="indexPath"/> for documents whose
/// <paramref name="field"/> matches the space-joined <paramref name="keywords"/>,
/// runs a <c>SentimentAnalyzer</c> over each hit, and prints to the console:
/// the hit ratio, the positive/negative/neutral ratios, the documents kept by the
/// positive and negative heaps, and (when <paramref name="histogramField"/> is set)
/// up to 100 of the most frequent values of that field per sentiment class.
/// Blocks on <c>Console.ReadKey()</c> before returning.
/// </summary>
/// <param name="indexPath">Path passed to <c>LuceneOperations.GetIndexSearcher</c>.</param>
/// <param name="field">Document field that is searched and whose content is scored.</param>
/// <param name="keywords">Keywords joined with a single space to build the query.</param>
/// <param name="printDocumentCnt">Argument passed to each <c>HeapSortDouble</c> (presumably the number of documents kept per class).</param>
/// <param name="histogramField">Optional field whose values are counted per sentiment class; <c>null</c> disables the histogram.</param>
/// <exception cref="NotImplementedException">The analyzer returned a sentiment type other than Positive, Negative or Neutral.</exception>
public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
{
    const int maxHistogramEntries = 100;

    var searcher = LuceneOperations.GetIndexSearcher(indexPath);
    try
    {
        var reader = searcher.GetIndexReader();
        string query = StringOperations.GetMergedString(keywords, " ");
        var docIDs = LuceneOperations.Search(searcher, query, field);
        var hitCnt = docIDs.Count;
        var totalCnt = reader.NumDocs();

        Console.WriteLine("Find {0}% ({1}/{2}) documents containing: {3}",
            PercentOf(hitCnt, totalCnt), hitCnt, totalCnt, query);

        var progress = new ProgramProgress(hitCnt);
        var sentiAnalyzer = new SentimentAnalyzer();
        SentimentType sentimentType;
        double sentimentScore;

        HeapSortDouble hsdPos = new HeapSortDouble(printDocumentCnt);
        HeapSortDouble hsdNeg = new HeapSortDouble(printDocumentCnt);

        // Histogram counters are only allocated when a histogram was requested.
        bool collectHistogram = histogramField != null;
        Counter<string> counterPos = collectHistogram ? new Counter<string>() : null;
        Counter<string> counterNeg = collectHistogram ? new Counter<string>() : null;
        Counter<string> counterNeu = collectHistogram ? new Counter<string>() : null;

        int posCnt = 0;
        int negCnt = 0;
        int neuCnt = 0;

        foreach (var docID in docIDs)
        {
            var document = reader.Document(docID);
            var content = document.Get(field);
            sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

            Counter<string> histogramCounter;
            switch (sentimentType)
            {
                case SentimentType.Positive:
                    posCnt++;
                    // Heaps are keyed on the magnitude of the score.
                    hsdPos.Insert(docID, Math.Abs(sentimentScore));
                    histogramCounter = counterPos;
                    break;
                case SentimentType.Negative:
                    negCnt++;
                    hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                    histogramCounter = counterNeg;
                    break;
                case SentimentType.Neutral:
                    neuCnt++;
                    histogramCounter = counterNeu;
                    break;
                default:
                    throw new NotImplementedException();
            }

            if (collectHistogram)
            {
                histogramCounter.Add(document.Get(histogramField));
            }

            progress.PrintIncrementExperiment();
        }

        Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(PercentOf(posCnt, hitCnt)), posCnt, hitCnt);
        Console.WriteLine("Negative document ratio {0}% ({1}/{2})", Math.Round(PercentOf(negCnt, hitCnt)), negCnt, hitCnt);
        Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(PercentOf(neuCnt, hitCnt)), neuCnt, hitCnt);

        // Positive section first, then negative, each listing "<score>\t<field content>".
        string[] sectionTitles = new[] { "Positive documents", "Negative documents" };
        HeapSortDouble[] sectionHeaps = new[] { hsdPos, hsdNeg };
        for (int iSection = 0; iSection < sectionTitles.Length; iSection++)
        {
            Console.WriteLine(StringOperations.WrapWithDash(sectionTitles[iSection]));
            foreach (var kvp in sectionHeaps[iSection].GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }
        }

        progress.PrintTotalTime();

        if (collectHistogram)
        {
            string[] featureStrings = new[] { "Pos", "Neg", "Neu" };
            Counter<string>[] counters = new[] { counterPos, counterNeg, counterNeu };
            for (int i = 0; i < featureStrings.Length; i++)
            {
                Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
                var topEntries = counters[i].GetCountDictionary()
                    .OrderByDescending(entry => entry.Value)
                    .Take(maxHistogramEntries);
                foreach (var entry in topEntries)
                {
                    Console.WriteLine(entry.Key + "\t" + entry.Value);
                }
            }
        }

        Console.ReadKey();
    }
    finally
    {
        // Release the index even when analysis fails part-way through.
        searcher.Close();
    }
}

/// <summary>
/// Returns <c>100 * numerator / denominator</c>, or 0 when the denominator is 0
/// (so an empty result set prints "0%" rather than "NaN%").
/// </summary>
private static double PercentOf(double numerator, double denominator)
{
    return denominator == 0 ? 0.0 : 100.0 * numerator / denominator;
}
/// <summary>
/// Scans a random sample of the documents in the index at <c>Configure.InputPath</c>,
/// counts tokens, and writes the <c>Configure.TopWordCount</c> most frequent ones to
/// "TopWords.txt" in that directory. When <c>Configure.IsPrintCooccurrence</c> is set,
/// a second sampled pass counts, per document, which top words appear together and
/// writes the pairs kept by a <c>HeapSortDouble(Configure.TopOccurrenceCount)</c> to
/// "CoOccurrence.txt".
/// </summary>
/// <remarks>
/// Side effect: appends a trailing backslash to <c>Configure.InputPath</c> if missing.
/// </remarks>
public void Start()
{
    if (!Configure.InputPath.EndsWith("\\", StringComparison.Ordinal))
    {
        Configure.InputPath += "\\";
    }

    var reader = LuceneOperations.GetIndexReader(Configure.InputPath);
    try
    {
        var docNum = reader.NumDocs();
        // Progress interval; clamped to 1 so indexes with fewer than 100 documents
        // do not trigger a division by zero in the modulo check.
        var docNumPart = Math.Max(1, docNum / 100);
        Console.WriteLine("Total: " + docNum);

        // -1 means "no fixed seed": derive one from the clock (truncation is intentional).
        Random random = new Random(Configure.SampleSeed == -1
            ? unchecked((int)DateTime.Now.Ticks)
            : Configure.SampleSeed);

        // Top words
        var counter = new Counter<string>();
        for (int iDoc = 0; iDoc < docNum; iDoc++)
        {
            ReportScanProgress(iDoc, docNum, docNumPart);
            if (random.NextDouble() > Configure.SampleRatio)
            {
                continue;
            }

            var doc = reader.Document(iDoc);
            var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
            var words = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
            foreach (var word in words)
            {
                counter.Add(word);
            }
        }

        var topwords = counter.GetMostFreqObjs(Configure.TopWordCount);
        using (var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt"))
        {
            foreach (var topword in topwords)
            {
                swTopWords.WriteLine(topword);
            }
        }

        // Co-occurrence
        if (Configure.IsPrintCooccurrence)
        {
            var occurCounterDict = new Dictionary<string, Counter<string>>();
            foreach (var topword in topwords)
            {
                occurCounterDict.Add(topword, new Counter<string>());
            }

            // NOTE(review): this pass keeps drawing from the same Random instance, so it
            // samples a different subset of documents than the top-word pass - confirm intended.
            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                ReportScanProgress(iDoc, docNum, docNumPart);
                if (random.NextDouble() > Configure.SampleRatio)
                {
                    continue;
                }

                var doc = reader.Document(iDoc);
                var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                // De-duplicate so each word pair is counted at most once per document.
                var words = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
                foreach (var word in words)
                {
                    Counter<string> occurCounter;
                    if (!occurCounterDict.TryGetValue(word, out occurCounter))
                    {
                        continue;
                    }

                    foreach (var word2 in words)
                    {
                        if (word2 != word && occurCounterDict.ContainsKey(word2))
                        {
                            occurCounter.Add(word2);
                        }
                    }
                }
            }

            // The heap stores the list position of each pair alongside its count.
            var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
            var pairs = new List<Tuple<string, string>>();
            foreach (var kvp in occurCounterDict)
            {
                var word = kvp.Key;
                foreach (var kvp2 in kvp.Value.GetCountDictionary())
                {
                    heapSort.Insert(pairs.Count, kvp2.Value);
                    pairs.Add(new Tuple<string, string>(word, kvp2.Key));
                }
            }

            using (var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt"))
            {
                foreach (var kvp in heapSort.GetSortedDictionary())
                {
                    var pair = pairs[kvp.Key];
                    swCoOccurrence.WriteLine("{0} - {1}\t{2}", pair.Item1, pair.Item2, kvp.Value);
                }
            }
        }
    }
    finally
    {
        // Always release the index reader, even if a pass throws.
        reader.Close();
    }
}

/// <summary>
/// Prints "<paramref name="iDoc"/>\t&lt;percent&gt;%" once every <paramref name="docNumPart"/> documents.
/// </summary>
private static void ReportScanProgress(int iDoc, int docNum, int docNumPart)
{
    if (iDoc % docNumPart == 0)
    {
        // long arithmetic avoids overflow of 100 * iDoc on very large indexes.
        Console.WriteLine(iDoc + "\t" + (100L * iDoc / docNum) + "%");
    }
}