Example #1
        // Rank every word in the sparse feature vector by its value (frequency)
        // and return the keys (word indices) of the k largest entries.
        private static int[] getMostFreqWordIndex(SparseVectorList featurevector, int k)
        {
            var sort = new HeapSortDouble(k);

            for (int iword = 0; iword < featurevector.keyarray.Length; iword++)
            {
                sort.Insert(featurevector.keyarray[iword], featurevector.valuearray[iword]);
            }

            return sort.GetTopIndices().ToArray<int>();
        }
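Example #1 is the HeapSortDouble pattern in its smallest form: insert (key, score) pairs and read back the keys of the k largest scores. The standalone sketch below is not from the original project; it assumes the API exactly as it appears above (a HeapSortDouble(k) constructor, Insert(int key, double value), and GetTopIndices() returning an IEnumerable<int>), plus System.Linq for ToArray<int>().

        // Minimal sketch (assumed API, inferred from Example #1): keep the 3 largest scores.
        double[] scores = { 0.2, 3.5, 1.1, 7.8, 0.9 };
        var heap = new HeapSortDouble(3);          // track the top 3 values
        for (int i = 0; i < scores.Length; i++)
        {
            heap.Insert(i, scores[i]);             // key = array index, value = score
        }
        int[] topIndices = heap.GetTopIndices().ToArray<int>();   // expected: indices 3, 1, 2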
Example #2
        // Read per-document topic distributions from an LDA result file and keep the
        // topDocCnt highest-weighted documents for each topic in one HeapSortDouble per topic.
        public void AnalyzeDocuments()
        {
            string fileName  = @"D:\Project\TopicPanorama\data\TopicGraphs\NewCode-Ebola-Test2\Raw\news\result\lda.top.json";
            string indexPath = @"D:\DataProcess\Index\Raw_EbolaEnBingNews_Ebola_0_1_RS_R-1";

            int topDocCnt = 20;

            var indexReader = LuceneOperations.GetIndexReader(indexPath);

            //Read from json and sort
            SimpleJsonReader reader = new SimpleJsonReader(new StreamReader(File.Open(fileName, FileMode.Open)));

            HeapSortDouble[] hsd        = null;
            int             topicNumber = -1;
            ProgramProgress progress    = new ProgramProgress(indexReader.NumDocs());

            while (reader.IsReadable)
            {
                int      docID      = int.Parse(reader.ReadPropertyName());
                double[] topicArray = reader.ReadDoubleArray();

                if (topicNumber < 0)
                {
                    topicNumber = topicArray.Length;
                    hsd         = new HeapSortDouble[topicNumber];
                    for (int i = 0; i < topicNumber; i++)
                    {
                        hsd[i] = new HeapSortDouble(topDocCnt);
                    }
                }

                for (int i = 0; i < topicNumber; i++)
                {
                    hsd[i].Insert(docID, topicArray[i]);
                }
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            //Statistics


            Console.ReadLine();
        }
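AnalyzeDocuments fills one HeapSortDouble per topic but leaves the //Statistics block empty. A hedged sketch of what that block could print, assuming GetSortedDictionary() returns (document ID, topic weight) pairs ordered by weight as it is used in the later examples:

        // Hypothetical continuation of the empty //Statistics block (not in the original code):
        for (int iTopic = 0; iTopic < topicNumber; iTopic++)
        {
            Console.WriteLine("Topic " + iTopic);
            foreach (var kvp in hsd[iTopic].GetSortedDictionary())
            {
                // kvp.Key is the document ID, kvp.Value its weight for this topic
                Console.WriteLine("\t" + kvp.Key + "\t" + kvp.Value);
            }
        }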
Example #3
        // Search a Lucene index for the given keywords, classify each matching document with a
        // sentiment analyzer, and print the highest-scoring positive and negative documents
        // (plus an optional histogram over histogramField).
        public static void AnalyzeSearchWordSentiment(string indexPath, string field, string[] keywords, int printDocumentCnt = 10, string histogramField = null)
        {
            var searcher = LuceneOperations.GetIndexSearcher(indexPath);
            var reader   = searcher.GetIndexReader();
            var docIDs   = LuceneOperations.Search(searcher, StringOperations.GetMergedString(keywords, " "), field);

            Console.WriteLine("Find {0}% ({1}/{2}) documents containing: {3}", (100.0 * docIDs.Count / reader.NumDocs()), docIDs.Count, reader.NumDocs(), StringOperations.GetMergedString(keywords, " "));

            var              progress      = new ProgramProgress(docIDs.Count);
            var              sentiAnalyzer = new SentimentAnalyzer();
            SentimentType    sentimentType;
            double           sentimentScore;
            HeapSortDouble   hsdPos     = new HeapSortDouble(printDocumentCnt);
            HeapSortDouble   hsdNeg     = new HeapSortDouble(printDocumentCnt);
            Counter <string> counterPos = null;
            Counter <string> counterNeg = null;
            Counter <string> counterNeu = null;

            if (histogramField != null)
            {
                counterPos = new Counter <string>();
                counterNeg = new Counter <string>();
                counterNeu = new Counter <string>();
            }
            int posCnt = 0;
            int negCnt = 0;
            int neuCnt = 0;

            foreach (var docID in docIDs)
            {
                var document = reader.Document(docID);
                var content  = document.Get(field);
                sentiAnalyzer.GetSentiment(content, out sentimentType, out sentimentScore);

                switch (sentimentType)
                {
                case SentimentType.Positive:
                    posCnt++;
                    hsdPos.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterPos.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Negative:
                    negCnt++;
                    hsdNeg.Insert(docID, Math.Abs(sentimentScore));
                    if (histogramField != null)
                    {
                        counterNeg.Add(document.Get(histogramField));
                    }
                    break;

                case SentimentType.Neutral:
                    neuCnt++;
                    if (histogramField != null)
                    {
                        counterNeu.Add(document.Get(histogramField));
                    }
                    break;

                default:
                    throw new NotImplementedException();
                }

                progress.PrintIncrementExperiment();
            }

            Console.WriteLine("Positive document ratio {0}% ({1}/{2})", Math.Round(100.0 * posCnt / docIDs.Count), posCnt, docIDs.Count);
            Console.WriteLine("Negatvie document ratio {0}% ({1}/{2})", Math.Round(100.0 * negCnt / docIDs.Count), negCnt, docIDs.Count);
            Console.WriteLine("Neutral document ratio {0}% ({1}/{2})", Math.Round(100.0 * neuCnt / docIDs.Count), neuCnt, docIDs.Count);

            Console.WriteLine(StringOperations.WrapWithDash("Positive documents"));
            foreach (var kvp in hsdPos.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            Console.WriteLine(StringOperations.WrapWithDash("Negative documents"));
            foreach (var kvp in hsdNeg.GetSortedDictionary())
            {
                Console.WriteLine(kvp.Value + "\t" + reader.Document(kvp.Key).Get(field));
            }

            progress.PrintTotalTime();

            if (histogramField != null)
            {
                string[]           featureStrings = new[] { "Pos", "Neg", "Neu" };
                Counter <string>[] counters       = new[] { counterPos, counterNeg, counterNeu };
                for (int i = 0; i < featureStrings.Length; i++)
                {
                    Console.WriteLine(StringOperations.WrapWithDash(histogramField + " " + featureStrings[i]));
                    int index = 0;
                    foreach (var kvp in counters[i].GetCountDictionary().OrderByDescending(kvp => kvp.Value))
                    {
                        Console.WriteLine(kvp.Key + "\t" + kvp.Value);
                        if (++index >= 100)
                        {
                            break;
                        }
                    }
                }
            }

            Console.ReadKey();
        }
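A hypothetical invocation of the method above; the index path, field names, and keywords are placeholders, not values from the original project:

        // Hypothetical call site (all argument values are placeholders):
        AnalyzeSearchWordSentiment(
            @"D:\DataProcess\Index\SomeLuceneIndex",   // indexPath (placeholder)
            "NewsArticleDescription",                  // field to search and analyze (placeholder)
            new[] { "ebola", "vaccine" },              // keywords (placeholder)
            printDocumentCnt: 5,
            histogramField: "NewsSource");             // optional histogram field (placeholder)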
Example #4
        // Sample documents from a Lucene index, count token frequencies, write the top words to
        // TopWords.txt, and optionally write the most frequent co-occurring top-word pairs to CoOccurrence.txt.
        public void Start()
        {
            if (!Configure.InputPath.EndsWith("\\"))
            {
                Configure.InputPath += "\\";
            }
            var reader     = LuceneOperations.GetIndexReader(Configure.InputPath);
            var docNum     = reader.NumDocs();
            var docNumPart = Math.Max(1, docNum / 100);   // guard against modulo-by-zero when docNum < 100

            Console.WriteLine("Total: " + docNum);

            Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

            //Topwords
            var counter = new Counter <string>();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                if (iDoc % docNumPart == 0)
                {
                    Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                }
                if (random.NextDouble() > Configure.SampleRatio)
                {
                    continue;
                }

                var doc     = reader.Document(iDoc);
                var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                foreach (var word in words)
                {
                    counter.Add(word);
                }
            }
            var topwords        = counter.GetMostFreqObjs(Configure.TopWordCount);
            var wordCounterDict = counter.GetCountDictionary();

            var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");

            foreach (var topword in topwords)
            {
                swTopWords.WriteLine(topword);
            }
            swTopWords.Flush();
            swTopWords.Close();

            //CoOccurrence
            if (Configure.IsPrintCooccurrence)
            {
                var k = topwords.Count;
                var occurCounterDict = new Dictionary <string, Counter <string> >();
                foreach (var topword in topwords)
                {
                    occurCounterDict.Add(topword, new Counter <string>());
                }
                for (int iDoc = 0; iDoc < docNum; iDoc++)
                {
                    if (iDoc % docNumPart == 0)
                    {
                        Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                    }
                    if (random.NextDouble() > Configure.SampleRatio)
                    {
                        continue;
                    }

                    var doc     = reader.Document(iDoc);
                    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                    var words   = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
                    foreach (var word in words)
                    {
                        if (occurCounterDict.ContainsKey(word))
                        {
                            var occurCounter = occurCounterDict[word];
                            foreach (var word2 in words)
                            {
                                if (word2 == word)
                                {
                                    continue;
                                }
                                if (occurCounterDict.ContainsKey(word2))
                                {
                                    occurCounter.Add(word2);
                                }
                            }
                        }
                    }
                }
                var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
                var pairDict = new Dictionary <int, Tuple <string, string> >();
                var iPair    = 0;
                foreach (var kvp in occurCounterDict)
                {
                    var word         = kvp.Key;
                    var occurCounter = kvp.Value;
                    foreach (var kvp2 in occurCounter.GetCountDictionary())
                    {
                        heapSort.Insert(iPair, kvp2.Value);
                        pairDict.Add(iPair, new Tuple <string, string>(word, kvp2.Key));
                        iPair++;
                    }
                }

                var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
                foreach (var kvp in heapSort.GetSortedDictionary())
                {
                    var pair = pairDict[kvp.Key];
                    swCoOccurrence.WriteLine("{0} - {1}\t{2}",
                                             pair.Item1, pair.Item2, kvp.Value);
                }

                swCoOccurrence.Flush();
                swCoOccurrence.Close();
            }

            reader.Close();
        }
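Example #4 combines two ideas: co-occurrence counting over the top words, then pair ranking with HeapSortDouble plus an integer-to-pair lookup table. The sketch below distills just the ranking step with illustrative counts; it assumes the same HeapSortDouble API used above and that GetSortedDictionary() yields entries ordered by value, highest first.

        // Minimal sketch of the pair-ranking pattern above (counts are illustrative only).
        var pairCounts = new Dictionary<Tuple<string, string>, int>
        {
            { Tuple.Create("ebola", "virus"),    42 },
            { Tuple.Create("ebola", "outbreak"), 17 },
            { Tuple.Create("virus", "vaccine"),   9 },
        };

        var heap     = new HeapSortDouble(2);                  // keep the 2 most frequent pairs
        var pairById = new Dictionary<int, Tuple<string, string>>();
        int iPair    = 0;
        foreach (var kvp in pairCounts)
        {
            heap.Insert(iPair, kvp.Value);                     // key = running pair ID, value = count
            pairById.Add(iPair, kvp.Key);
            iPair++;
        }
        foreach (var kvp in heap.GetSortedDictionary())        // assumed: sorted by count, descending
        {
            var pair = pairById[kvp.Key];
            Console.WriteLine("{0} - {1}\t{2}", pair.Item1, pair.Item2, kvp.Value);
        }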