示例#1
0
        /// <summary>
        /// Loads the per-word topic values from <c>wordTopicAllocationsFileName</c> into a matrix.
        /// Marked "don't use" by the original author.
        /// </summary>
        /// <remarks>
        /// The first <c>TopicWordsAllocationHeaderLines</c> lines of the file are skipped and at most
        /// <c>NumWords</c> lines are read afterwards. On each line the first space-separated token is
        /// ignored and the following <c>NumTopics</c> tokens are parsed as doubles.
        /// Numbers are parsed with the invariant culture ('.' as decimal separator), because parsing
        /// with the current culture silently misreads values on machines where '.' is a group separator.
        /// </remarks>
        /// <returns>
        /// A <c>[NumTopics, NumWords]</c> matrix whose element <c>[topic, word]</c> holds the value read for
        /// that pair; the word index is the zero-based position of the line after the header. Entries for
        /// words that are missing from the file keep their default value of zero.
        /// </returns>
        /// <exception cref="FormatException">
        /// A token is not a valid number, or a line contains fewer than <c>NumTopics</c> values.
        /// </exception>
        private double[,] LoadTopicWords()
        {
            // Captured by the status lambda below, so progress is reported live while the loop runs.
            var wordId = 0;

            StatusMessage.Write("Loading topic words...");
            var writer = StatusMessage.StatusWriter(() => string.Format("Processed {0:D0} words.", wordId));

            StatusMessage.Add(writer);

            try
            {
                var topicWords = new double[this.NumTopics, this.NumWords];
                var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

                var lines =
                    File.ReadLines(this.wordTopicAllocationsFileName)
                    .Skip(TopicWordsAllocationHeaderLines)
                    .Take(this.NumWords);

                foreach (var line in lines)
                {
                    // Skip(1) drops the leading token of the line; only the next NumTopics tokens are values.
                    var parts = line
                                .Split(' ')
                                .Skip(1)
                                .Take(this.NumTopics)
                                .Select(token => double.Parse(token, invariantCulture))
                                .ToArray();

                    // Fail with a message that names the offending line instead of an index error.
                    if (parts.Length < this.NumTopics)
                    {
                        throw new FormatException(string.Format(
                            "Topic words line {0} contains {1} topic values but {2} were expected.",
                            wordId,
                            parts.Length,
                            this.NumTopics));
                    }

                    for (var topic = 0; topic < this.NumTopics; topic++)
                    {
                        topicWords[topic, wordId] = parts[topic];
                    }

                    wordId++;
                }

                return(topicWords);
            }
            finally
            {
                // Always unregister the status writer, even when reading or parsing fails.
                StatusMessage.Remove(writer);
            }
        }
示例#2
0
        /// <summary>
        /// Process documents topic allocations file to gather:
        ///     * allocations per topic
        ///     * most prominent allocations per topic
        ///     * number of most prominent documents per topic
        ///     * top 20 documents for each topic
        ///     * identified bad topics
        /// The result is stored in <c>topicsInfo</c> and written as indented JSON to
        /// <paramref name="topicsInfoFile"/>.
        /// </summary>
        /// <remarks>
        /// Two input layouts are recognized from the first non-blank line of the file: "DvGen" output
        /// (exactly <c>NumOfColumnsForDvGenOutput</c> columns, second and third columns not numeric),
        /// which is decoded with <c>LoadDocumentVector</c>, and "Training" output (at least
        /// <c>NumTopics</c> numeric columns), of which only the lines after the first
        /// <c>numDocs * (numPasses - 1)</c> are used.
        /// Each document's values are normalized to sum to one before they are aggregated. Blank lines and
        /// documents whose values sum to zero contribute nothing but still advance the document index.
        /// Numbers are parsed with the invariant culture.
        /// </remarks>
        /// <param name="topicsInfoFile">file to store topics info</param>
        /// <returns>true if succeeded; failures are reported through exceptions</returns>
        /// <exception cref="Exception">
        /// The file has fewer lines than <c>numDocs</c>, its layout is not recognized, or a document line
        /// has fewer than <c>NumTopics</c> values.
        /// </exception>
        private bool BuildTopicsInfo(string topicsInfoFile)
        {
            // Number of highest-allocation documents kept for every topic.
            const int TopDocumentsPerTopic = 20;

            // Parsing style equivalent to the default of Single.Parse, but applied with the invariant culture.
            const System.Globalization.NumberStyles FloatStyles =
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

            // Captured by the status lambda below, so progress is reported live while the loop runs.
            var docIndex = 0;

            StatusMessage.Write("Aggregating topic document allocations...");
            var writer = StatusMessage.StatusWriter(() => string.Format("Processed {0:D0} documents.", docIndex));

            StatusMessage.Add(writer);

            try
            {
                var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

                this.topicsInfo = new TopicInfo[this.NumTopics].Init();

                // File.ReadLines is lazy: every enumeration below re-reads the file from the start.
                var lines =
                    File.ReadLines(this.documentTopicAllocationsFile);

                if (lines.Count() < this.numDocs)
                {
                    throw new Exception("The number of lines in document-topic allocations file is less than the number of documents!");
                }

                // Differentiate if the document-topic allocations file was generated by training OR DvGen (and then copy).
                // The one generated by training has the number of columns equal to NumTopics, whereas the one generated
                // by DvGen has a fixed number of columns. A blank line is a legal document line (see the loop below),
                // so the layout is detected on the first non-blank line rather than on the very first line.
                var firstDataLine = lines.FirstOrDefault(candidate => !string.IsNullOrWhiteSpace(candidate));

                if (firstDataLine == null)
                {
                    throw new Exception("Document-topic allocations file is not in right format!");
                }

                string[] firstLine = firstDataLine.Split(' ');
                float    number;
                bool     isTrainingOutput;

                if (firstLine.Length == NumOfColumnsForDvGenOutput &&
                    !Single.TryParse(firstLine[1], FloatStyles, invariantCulture, out number) &&
                    !Single.TryParse(firstLine[2], FloatStyles, invariantCulture, out number))
                {
                    isTrainingOutput = false;
                }
                else if (firstLine.Length >= NumTopics &&
                         firstLine.Length > 1 &&
                         Single.TryParse(firstLine[0], FloatStyles, invariantCulture, out number) &&
                         Single.TryParse(firstLine[1], FloatStyles, invariantCulture, out number))
                {
                    isTrainingOutput = true;

                    // Only the lines after the first (numPasses - 1) blocks of numDocs lines are aggregated.
                    // NOTE(review): nothing verifies that the file really holds numDocs * numPasses lines;
                    // a shorter file is processed partially without an error - confirm this is intended.
                    lines = lines.Skip(this.numDocs * (this.numPasses - 1));
                }
                else
                {
                    throw new Exception("Document-topic allocations file is not in right format!");
                }

                foreach (var line in lines)
                {
                    // check if line is empty (this happens when a document does not contain any word in the corpus vocabulary).
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        docIndex++;
                        continue;
                    }

                    IEnumerable <float> rawValues;
                    if (isTrainingOutput)
                    {
                        rawValues = line.Split(' ')
                                    .Take(this.NumTopics)
                                    .Select(token => Single.Parse(token, invariantCulture));
                    }
                    else
                    {
                        var denseVector = LoadDocumentVector(line);
                        rawValues = denseVector.ToArray();
                    }

                    // Materialize once so the line is not parsed again for every later enumeration.
                    var values = rawValues.ToArray();

                    if (values.Length < this.NumTopics)
                    {
                        throw new Exception(string.Format(
                            "Document {0} has {1} topic allocations but {2} were expected!",
                            docIndex,
                            values.Length,
                            this.NumTopics));
                    }

                    var sum = values.Sum();

                    // Dividing by a zero sum would produce NaN and poison the aggregated allocation of
                    // every topic, so such a document is treated like an empty one.
                    if (sum == 0)
                    {
                        docIndex++;
                        continue;
                    }

                    var        allocations = values.Select(v => v / sum).ToArray();
                    double     max         = -1;
                    List <int> maxTopics   = new List <int>();

                    // Accumulate the normalized allocation per topic and collect every topic that
                    // shares the maximum allocation of this document (ties are all kept).
                    for (var topic = 0; topic < this.NumTopics; topic++)
                    {
                        var alloc = allocations[topic];

                        this.topicsInfo[topic].AggregatedAllocation += alloc;

                        if (alloc >= max)
                        {
                            if (alloc > max)
                            {
                                max = alloc;
                                maxTopics.Clear();
                            }

                            maxTopics.Add(topic);
                        }
                    }

                    foreach (var maxTopic in maxTopics)
                    {
                        this.topicsInfo[maxTopic].ProminentAllocation += max;
                        this.topicsInfo[maxTopic].ProminentFrequency++;
                        this.topicsInfo[maxTopic].TopProminentDocuments.Add(new Tuple <int, double>(docIndex, max));
                    }

                    docIndex++;
                }

                // Keep only the documents with the highest prominent allocation for every topic.
                foreach (var topicInfo in this.topicsInfo)
                {
                    topicInfo.TopProminentDocuments = topicInfo.TopProminentDocuments
                                                      .OrderByDescending(x => x.Item2)
                                                      .Take(TopDocumentsPerTopic)
                                                      .ToList();
                }

                var sortedTopics = Enumerable.Range(0, this.NumTopics)
                                   .OrderByDescending(topic => this.topicsInfo[topic].AggregatedAllocation).ToArray();

                // With no topics there is nothing to classify (and no "last" topic to compare against).
                if (sortedTopics.Length > 0)
                {
                    int badTopicCount  = 0;
                    var lastAllocation = this.topicsInfo[sortedTopics.Last()].AggregatedAllocation;

                    // Walk from the least allocated topic upwards and flag every topic whose aggregated
                    // allocation is within MaxDiffOfAllocationBetweenBadTopics of the smallest one.
                    for (int i = sortedTopics.Length - 1; i >= 0; i--)
                    {
                        if (Math.Abs(this.topicsInfo[sortedTopics[i]].AggregatedAllocation - lastAllocation) > MaxDiffOfAllocationBetweenBadTopics)
                        {
                            break;
                        }

                        this.topicsInfo[sortedTopics[i]].IsBadTopic = true;
                        badTopicCount++;
                    }

                    // Only the least allocated topic itself was flagged: assume no bad topic.
                    if (badTopicCount == 1)
                    {
                        this.topicsInfo[sortedTopics.Last()].IsBadTopic = false;
                    }
                }

                using (var sw = new StreamWriter(topicsInfoFile))
                {
                    sw.WriteLine(JsonConvert.SerializeObject(this.topicsInfo, Formatting.Indented));
                }

                return(true);
            }
            finally
            {
                // Always unregister the status writer, even when validation, parsing or writing fails.
                StatusMessage.Remove(writer);
            }
        }