/// <summary>
/// Reads a raw extrinsic-metrics TSV file, z-score-normalizes the per-topic TC/TS/TD
/// metrics using the supplied means and standard deviations, accumulates model-level
/// summary statistics, and hands the result to <see cref="RegenerateModelSynopsis"/>
/// to write the processed synopsis.
/// </summary>
/// <param name="metricsFile">Path to the raw ExtrinsicMetrics.tsv file. The file has a
/// key/value summary header, then a "Topic Id"-prefixed column header, then one
/// tab-separated row per topic.</param>
/// <param name="metricMeans">Means for [TC, TS, TD], in that order.</param>
/// <param name="metricStdevs">Standard deviations for [TC, TS, TD], in that order.</param>
private static void NormalizeMetrics(string metricsFile, float[] metricMeans, float[] metricStdevs)
{
    // All numeric fields in the TSV use '.' as the decimal separator, so parse with
    // InvariantCulture; the previous culture-sensitive Parse calls would misparse or
    // throw on locales that use ',' as the decimal separator (CA1305).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    StringBuilder modelSummary = new StringBuilder();
    bool foundRawMetrics = false;
    List<Topic> topics = new List<Topic>();
    float totalAllocations = 0.0f;
    long totalPromimentDF = 0L;
    int numTopics = 0;
    int goodTopicCount = 0;

    foreach (var line in File.ReadAllLines(metricsFile))
    {
        if (!foundRawMetrics)
        {
            // Read first part: the key/value summary header that precedes the
            // "Topic Id" column-header line.
            if (!line.StartsWith("Topic Id", StringComparison.Ordinal))
            {
                modelSummary.AppendLine(line.Replace("NaN", "0"));
                var pairs = line.Split('\t');
                if (pairs[0].ToLowerInvariant() == "topics")
                {
                    numTopics = int.Parse(pairs[1], invariant);
                }
                else if (pairs[0].ToLowerInvariant() == "good topics")
                {
                    goodTopicCount = int.Parse(pairs[1], invariant);
                }
            }
            else
            {
                foundRawMetrics = true;
            }

            continue;
        }

        var parts = line.Split('\t');
        if (parts[1] == "1")
        {
            // Skip bad topics.
            // NOTE(review): this is 'break', not 'continue' — it assumes all bad
            // topics are sorted to the end of the file. Verify against the writer
            // of ExtrinsicMetrics.tsv before changing.
            break;
        }

        // Cope with weird case that "TopDocs" column contains two double quotes.
        // e.g. \\br1iceml001\ModelRepository\Models\LDA\en-us\msn\20150212_250.tsv\500_45_0.5_0.1_0.2_256_2_1_0.5\build\ExtrinsicMetrics.tsv
        if (parts[7].IndexOf('\"') >= 0)
        {
            parts[7] = parts[7].Replace("\"", "");
        }

        // Read second part and reconstruct each topic. TopProminentDocuments is a
        // comma-separated list of either "docId|score" pairs or bare doc ids
        // (in which case the score is unavailable and recorded as -1.0).
        List<Tuple<int, double>> promDocs;
        if (parts[6] == "0")
        {
            promDocs = new List<Tuple<int, double>>();
        }
        else if (parts[7].IndexOf('|') >= 0)
        {
            promDocs = parts[7].Split(',')
                .Select(p => new Tuple<int, double>(
                    int.Parse(p.Split('|')[0], invariant),
                    double.Parse(p.Split('|')[1], invariant)))
                .ToList();
        }
        else
        {
            promDocs = parts[7].Split(',')
                .Select(p => new Tuple<int, double>(int.Parse(p, invariant), -1.0))
                .ToList();
        }

        Topic t = new Topic()
        {
            TopicId = int.Parse(parts[0], invariant),
            IsBad = parts[1] == "1",
            Allocations = float.Parse(parts[2], invariant),
            TC = float.Parse(parts[3], invariant),
            // TS can legitimately be "NaN" in the raw file; map it to 0.
            TS = (parts[4] == "NaN" ? 0.0f : float.Parse(parts[4], invariant)),
            TD = float.Parse(parts[5], invariant),
            PromimentDF = long.Parse(parts[6], invariant),
            TopProminentDocuments = promDocs,
            HighProbWords = parts.Skip(8).ToList()
        };

        // Compute the normalized TC/TS/TD for each topic (z-score against the
        // supplied corpus-level means/stdevs; order is [TC, TS, TD]).
        t.NormalizedTC = (t.TC - metricMeans[0]) / metricStdevs[0];
        t.NormalizedTS = (t.TS - metricMeans[1]) / metricStdevs[1];
        t.NormalizedTD = (t.TD - metricMeans[2]) / metricStdevs[2];

        totalAllocations += t.Allocations;
        totalPromimentDF += t.PromimentDF;
        topics.Add(t);
    }

    // Add "Bad Topic Count" to modelSummary.
    // NOTE(review): if the header never declared "Topics", numTopics stays 0 and
    // the Good Topics(%) division yields Infinity/NaN — confirm the header is
    // always present in well-formed input.
    int badTopicCount = topics.Count(t => t.TC < ThresholdForBadTopics);
    modelSummary.AppendFormat("Bad Topic Count\t{0}\r\n", badTopicCount);
    modelSummary.AppendFormat("Good Topics(%)\t{0}\r\n", (float)goodTopicCount / numTopics);
    modelSummary.AppendFormat("Actual Good Topics\t{0}\r\n", goodTopicCount - badTopicCount);
    // todo: add other model summary statistics here (need to modify metrics list in Model Loader as well).

    DenseVector normalizedTCVector = new DenseVector(topics.Select(t => t.NormalizedTC).ToArray());
    DenseVector normalizedTSVector = new DenseVector(topics.Select(t => t.NormalizedTS).ToArray());
    DenseVector normalizedTDVector = new DenseVector(topics.Select(t => t.NormalizedTD).ToArray());

    // Two weighting schemes, each scaled so the weights average to 1 across topics.
    DenseVector weightsByAlloc = new DenseVector(topics.Select(t => t.Allocations / totalAllocations * topics.Count).ToArray());
    DenseVector weightsByPromDF = new DenseVector(topics.Select(t => (float)t.PromimentDF / totalPromimentDF * topics.Count).ToArray());

    // 3x3 matrix: rows are [TC, TS, TD]; columns are
    // [allocation-weighted sum, prominent-DF-weighted sum, unweighted sum].
    double[,] cumulativeNormalizedMetricsMatrix = new double[3, 3];

    // TC
    cumulativeNormalizedMetricsMatrix[0, 0] = VectorBase.DotProduct(normalizedTCVector, weightsByAlloc);
    cumulativeNormalizedMetricsMatrix[0, 1] = VectorBase.DotProduct(normalizedTCVector, weightsByPromDF);
    cumulativeNormalizedMetricsMatrix[0, 2] = topics.Sum(t => (double)t.NormalizedTC);

    // TS
    cumulativeNormalizedMetricsMatrix[1, 0] = VectorBase.DotProduct(normalizedTSVector, weightsByAlloc);
    cumulativeNormalizedMetricsMatrix[1, 1] = VectorBase.DotProduct(normalizedTSVector, weightsByPromDF);
    cumulativeNormalizedMetricsMatrix[1, 2] = topics.Sum(t => (double)t.NormalizedTS);

    // TD
    cumulativeNormalizedMetricsMatrix[2, 0] = VectorBase.DotProduct(normalizedTDVector, weightsByAlloc);
    cumulativeNormalizedMetricsMatrix[2, 1] = VectorBase.DotProduct(normalizedTDVector, weightsByPromDF);
    cumulativeNormalizedMetricsMatrix[2, 2] = topics.Sum(t => (double)t.NormalizedTD);

    // Append the 3x3 matrix elements to modelSummary, save as "ExtrinsicMetrics.processed.tsv".
    RegenerateModelSynopsis(metricsFile, modelSummary, topics, cumulativeNormalizedMetricsMatrix);
}
/// <summary>
/// Computes the Euclidean distance between 2 unit vectors, using the identity
/// ||v1 - v2||^2 = 2 - 2 * (v1 . v2), which holds only when both inputs have unit norm.
/// </summary>
/// <param name="v1">Unit LDA topic vector.</param>
/// <param name="v2">Unit LDA topic vector.</param>
/// <returns>The Euclidean distance between the two unit vectors, in [0, 2].</returns>
public static double Distance(DenseVector v1, DenseVector v2)
{
    // Clamp at zero before taking the square root: floating-point rounding can push
    // (2 - 2*dot) slightly negative for (near-)identical vectors, which would make
    // Math.Sqrt return NaN instead of a distance of ~0.
    return Math.Sqrt(Math.Max(0.0, 2.0 - 2.0 * VectorBase.DotProduct(v1, v2)));
}