/// <summary>
/// Loads the word-topic allocations file into a dense matrix indexed as [topic, word].
/// The original author marked this method "don't use"; that note is preserved here.
/// </summary>
/// <remarks>
/// The first <c>TopicWordsAllocationHeaderLines</c> lines of the file are skipped and at most
/// <c>NumWords</c> rows are read. In every row the first space-separated token is ignored
/// (presumably a word identifier - TODO confirm) and the next <c>NumTopics</c> tokens are
/// parsed as the weights of that word for each topic.
/// </remarks>
/// <returns>
/// A <c>NumTopics</c> x <c>NumWords</c> matrix; columns of words missing from the file stay zero.
/// </returns>
/// <exception cref="InvalidDataException">A row holds fewer than <c>NumTopics</c> values.</exception>
private double[,] LoadTopicWords()
{
    var wordId = 0;
    StatusMessage.Write("Loading topic words...");
    var writer = StatusMessage.StatusWriter(() => string.Format("Processed {0:D0} words.", wordId));
    StatusMessage.Add(writer);

    try
    {
        // The file is machine-generated, so numbers are parsed culture-independently;
        // with the current culture "0.5" can be rejected or misread on comma-decimal locales.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        var topicWords = new double[this.NumTopics, this.NumWords];
        var rows = File.ReadLines(this.wordTopicAllocationsFileName)
            .Skip(TopicWordsAllocationHeaderLines)
            .Take(this.NumWords);

        foreach (var row in rows)
        {
            var weights = row
                .Split(' ')
                .Skip(1)
                .Take(this.NumTopics)
                .Select(token => double.Parse(token, invariant))
                .ToArray();

            // Fail with a message that identifies the bad row instead of an index error.
            if (weights.Length < this.NumTopics)
            {
                throw new InvalidDataException(string.Format(
                    "Word-topic allocations row for word {0} has {1} topic values; expected {2}.",
                    wordId,
                    weights.Length,
                    this.NumTopics));
            }

            for (var topic = 0; topic < this.NumTopics; topic++)
            {
                topicWords[topic, wordId] = weights[topic];
            }

            wordId++;
        }

        return topicWords;
    }
    finally
    {
        // Always unregister the progress writer, even when reading or parsing fails.
        StatusMessage.Remove(writer);
    }
}
/// <summary>
/// Process documents topic allocations file to gather:
/// * allocations per topic
/// * most prominent allocations per topic
/// * number of most prominent documents per topic
/// * top 20 most prominent documents for each topic
/// * identified bad topics
/// The gathered information is stored in <c>topicsInfo</c> and written to
/// <paramref name="topicsInfoFile"/> as indented JSON.
/// </summary>
/// <param name="topicsInfoFile">file to store topics info</param>
/// <returns>true if succeeded; failures are reported by exceptions</returns>
/// <exception cref="InvalidDataException">
/// The allocations file has fewer lines than documents, is in neither recognized format,
/// or contains a row with fewer than <c>NumTopics</c> values.
/// </exception>
private bool BuildTopicsInfo(string topicsInfoFile)
{
    // Number of most prominent documents kept for every topic.
    const int TopDocumentsPerTopic = 20;

    // Same number styles the parameterless Single.Parse/TryParse overloads use.
    const System.Globalization.NumberStyles FloatStyles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    var docIndex = 0;
    StatusMessage.Write("Aggregating topic document allocations...");
    var writer = StatusMessage.StatusWriter(() => string.Format("Processed {0:D0} documents.", docIndex));
    StatusMessage.Add(writer);

    try
    {
        // The file is machine-generated, so numbers are parsed culture-independently;
        // with the current culture "0.5" can be rejected or misread on comma-decimal locales.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        this.topicsInfo = new TopicInfo[this.NumTopics].Init();

        // Differentiate whether the document-topic allocations file was generated by training
        // or by DvGen (and then copied). The one generated by training has one column per topic,
        // whereas the one generated by DvGen has a fixed number of columns.
        var lines = File.ReadLines(this.documentTopicAllocationsFile);
        if (lines.Count() < this.numDocs)
        {
            throw new InvalidDataException("The number of lines in document-topic allocations file is less than the number of documents!");
        }

        string[] firstLine = lines.First().Split(' ');
        float number;
        bool isTrainingFormat;
        if (firstLine.Length == NumOfColumnsForDvGenOutput
            && !Single.TryParse(firstLine[1], FloatStyles, invariant, out number)
            && !Single.TryParse(firstLine[2], FloatStyles, invariant, out number))
        {
            isTrainingFormat = false;
        }
        else if (firstLine.Length >= NumTopics
            && firstLine.Length > 1 // the second column is inspected below
            && Single.TryParse(firstLine[0], FloatStyles, invariant, out number)
            && Single.TryParse(firstLine[1], FloatStyles, invariant, out number))
        {
            isTrainingFormat = true;

            // The training output holds one row per document per pass; only the last pass is used.
            lines = lines.Skip(this.numDocs * (this.numPasses - 1));
        }
        else
        {
            throw new InvalidDataException("Document-topic allocations file is not in right format!");
        }

        foreach (var line in lines)
        {
            // An empty line happens when a document does not contain any word in the corpus vocabulary.
            if (string.IsNullOrWhiteSpace(line))
            {
                docIndex++;
                continue;
            }

            // Materialize the row once; a lazy query would be split and parsed again on every pass over it.
            float[] rawAllocations;
            if (isTrainingFormat)
            {
                rawAllocations = line.Split(' ')
                    .Take(this.NumTopics)
                    .Select(token => Single.Parse(token, invariant))
                    .ToArray();
            }
            else
            {
                IEnumerable<float> denseVector = LoadDocumentVector(line).ToArray();
                rawAllocations = denseVector as float[] ?? denseVector.ToArray();
            }

            // Reject short rows up front, before any topic has been partially updated.
            if (rawAllocations.Length < this.NumTopics)
            {
                throw new InvalidDataException(string.Format(
                    "Document {0} has {1} topic allocations; expected {2}.",
                    docIndex,
                    rawAllocations.Length,
                    this.NumTopics));
            }

            var sum = rawAllocations.Sum();

            // A row that sums to zero cannot be normalized: dividing would turn every
            // aggregated allocation into NaN. Treat it like a document without allocations.
            if (sum == 0f)
            {
                docIndex++;
                continue;
            }

            double max = -1;
            var maxTopics = new List<int>();
            for (var topic = 0; topic < this.NumTopics; topic++)
            {
                var alloc = rawAllocations[topic] / sum;
                this.topicsInfo[topic].AggregatedAllocation += alloc;

                // Track every topic tied for the highest allocation in this document.
                if (alloc >= max)
                {
                    if (alloc > max)
                    {
                        max = alloc;
                        maxTopics.Clear();
                    }

                    maxTopics.Add(topic);
                }
            }

            foreach (var maxTopic in maxTopics)
            {
                var prominent = this.topicsInfo[maxTopic];
                prominent.ProminentAllocation += max;
                prominent.ProminentFrequency++;
                prominent.TopProminentDocuments.Add(new Tuple<int, double>(docIndex, max));
            }

            docIndex++;
        }

        // Keep only the documents with the highest prominent allocation for every topic.
        foreach (var topicInfo in this.topicsInfo)
        {
            topicInfo.TopProminentDocuments = topicInfo.TopProminentDocuments
                .OrderByDescending(x => x.Item2)
                .Take(TopDocumentsPerTopic)
                .ToList();
        }

        var sortedTopics = Enumerable.Range(0, this.NumTopics)
            .OrderByDescending(topic => this.topicsInfo[topic].AggregatedAllocation)
            .ToArray();

        if (sortedTopics.Length > 0)
        {
            // Walk up from the least allocated topic and flag every topic whose aggregated
            // allocation stays within the allowed distance of that lowest value.
            var lowestTopic = sortedTopics[sortedTopics.Length - 1];
            var lastAllocation = this.topicsInfo[lowestTopic].AggregatedAllocation;
            int badTopicCount = 0;
            for (int i = sortedTopics.Length - 1; i >= 0; i--)
            {
                var candidate = this.topicsInfo[sortedTopics[i]];
                if (Math.Abs(candidate.AggregatedAllocation - lastAllocation) > MaxDiffOfAllocationBetweenBadTopics)
                {
                    break;
                }

                candidate.IsBadTopic = true;
                badTopicCount++;
            }

            // A single flagged topic is just the lowest one itself: assume no bad topic.
            if (badTopicCount == 1)
            {
                this.topicsInfo[lowestTopic].IsBadTopic = false;
            }
        }

        using (var sw = new StreamWriter(topicsInfoFile))
        {
            sw.WriteLine(JsonConvert.SerializeObject(this.topicsInfo, Formatting.Indented));
        }

        return true;
    }
    finally
    {
        // Always unregister the progress writer, even when processing fails.
        StatusMessage.Remove(writer);
    }
}