Exemplo n.º 1
0
        /// <summary>
        /// Reads a tab-separated metrics file, normalizes the TC/TS/TD metric of every topic read
        /// as (value - mean) / stdev, accumulates a 3x3 matrix of cumulative normalized metrics and
        /// hands everything to <c>RegenerateModelSynopsis</c>.
        /// </summary>
        /// <remarks>
        /// The file has two parts: summary lines ("key\tvalue") up to a header line starting with
        /// "Topic Id", followed by one row per topic. Matrix rows are TC, TS, TD; matrix columns are
        /// allocation-weighted, prominent-DF-weighted and unweighted sums of the normalized metric.
        /// All numbers are parsed and formatted with the invariant culture, because the file is
        /// machine-readable data and must not depend on the locale of the machine running this code.
        /// </remarks>
        /// <param name="metricsFile">Path of the metrics file to read.</param>
        /// <param name="metricMeans">Means used for normalization; index 0 = TC, 1 = TS, 2 = TD.</param>
        /// <param name="metricStdevs">Standard deviations used for normalization; same index layout as <paramref name="metricMeans"/>.</param>
        private static void NormalizeMetrics(string metricsFile, float[] metricMeans, float[] metricStdevs)
        {
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            StringBuilder modelSummary     = new StringBuilder();
            bool          foundRawMetrics  = false;
            List<Topic>   topics           = new List<Topic>();
            float         totalAllocations = 0.0f;
            long          totalPromimentDF = 0L;
            int           numTopics        = 0;
            int           goodTopicCount   = 0;

            foreach (var line in File.ReadAllLines(metricsFile))
            {
                if (!foundRawMetrics)
                {
                    // First part: summary lines, copied through until the topic table header shows up.
                    if (line.StartsWith("Topic Id", StringComparison.Ordinal))
                    {
                        foundRawMetrics = true;
                        continue;
                    }

                    modelSummary.AppendLine(line.Replace("NaN", "0"));
                    var pairs = line.Split('\t');
                    if (string.Equals(pairs[0], "topics", StringComparison.OrdinalIgnoreCase))
                    {
                        numTopics = int.Parse(pairs[1], invariant);
                    }
                    else if (string.Equals(pairs[0], "good topics", StringComparison.OrdinalIgnoreCase))
                    {
                        goodTopicCount = int.Parse(pairs[1], invariant);
                    }
                    continue;
                }

                // A blank row carries no topic; indexing its columns would throw.
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }

                var parts = line.Split('\t');
                if (parts[1] == "1")
                {
                    // NOTE(review): this stops reading at the first bad topic, so no later row is
                    // read either - presumably bad topics are listed last in the file; confirm.
                    break;
                }

                // Cope with the weird case where the "TopDocs" column contains double quotes.
                // e.g. \\br1iceml001\ModelRepository\Models\LDA\en-us\msn\20150212_250.tsv\500_45_0.5_0.1_0.2_256_2_1_0.5\build\ExtrinsicMetrics.tsv
                string topDocs = parts[7].Replace("\"", "");

                // Second part: reconstruct each topic. A prominent DF of "0" means there are no
                // prominent documents, so the "TopDocs" column is not parsed at all.
                List<Tuple<int, double>> promDocs = parts[6] == "0"
                    ? new List<Tuple<int, double>>()
                    : ParseProminentDocuments(topDocs);

                Topic t = new Topic()
                {
                    TopicId               = int.Parse(parts[0], invariant),
                    IsBad                 = parts[1] == "1",
                    Allocations           = float.Parse(parts[2], invariant),
                    TC                    = float.Parse(parts[3], invariant),
                    TS                    = (parts[4] == "NaN" ? 0.0f : float.Parse(parts[4], invariant)),
                    TD                    = float.Parse(parts[5], invariant),
                    PromimentDF           = long.Parse(parts[6], invariant),
                    TopProminentDocuments = promDocs,
                    HighProbWords         = parts.Skip(8).ToList()
                };

                // Compute the normalized TC/TS/TD for each topic.
                t.NormalizedTC = (t.TC - metricMeans[0]) / metricStdevs[0];
                t.NormalizedTS = (t.TS - metricMeans[1]) / metricStdevs[1];
                t.NormalizedTD = (t.TD - metricMeans[2]) / metricStdevs[2];

                totalAllocations += t.Allocations;
                totalPromimentDF += t.PromimentDF;

                topics.Add(t);
            }

            // Topics whose TC is below the threshold are counted as bad.
            int badTopicCount = topics.Count(t => t.TC < ThresholdForBadTopics);

            modelSummary.AppendFormat(invariant, "Bad Topic Count\t{0}\r\n", badTopicCount);
            modelSummary.AppendFormat(invariant, "Good Topics(%)\t{0}\r\n", (float)goodTopicCount / numTopics);
            modelSummary.AppendFormat(invariant, "Actual Good Topics\t{0}\r\n", goodTopicCount - badTopicCount);
            // todo: add other model summary statistics here (need to modify metrics list in Model Loader as well).

            DenseVector normalizedTCVector = new DenseVector(topics.Select(t => t.NormalizedTC).ToArray());
            DenseVector normalizedTSVector = new DenseVector(topics.Select(t => t.NormalizedTS).ToArray());
            DenseVector normalizedTDVector = new DenseVector(topics.Select(t => t.NormalizedTD).ToArray());

            // Each weight is the topic's share of the total, scaled by the number of topics read.
            DenseVector weightsByAlloc  = new DenseVector(topics.Select(t => t.Allocations / totalAllocations * topics.Count).ToArray());
            DenseVector weightsByPromDF = new DenseVector(topics.Select(t => (float)t.PromimentDF / totalPromimentDF * topics.Count).ToArray());

            double[,] cumulativeNormalizedMetricsMatrix = new double[3, 3];
            // TC
            cumulativeNormalizedMetricsMatrix[0, 0] = VectorBase.DotProduct(normalizedTCVector, weightsByAlloc);
            cumulativeNormalizedMetricsMatrix[0, 1] = VectorBase.DotProduct(normalizedTCVector, weightsByPromDF);
            cumulativeNormalizedMetricsMatrix[0, 2] = topics.Sum(t => (double)t.NormalizedTC);

            // TS
            cumulativeNormalizedMetricsMatrix[1, 0] = VectorBase.DotProduct(normalizedTSVector, weightsByAlloc);
            cumulativeNormalizedMetricsMatrix[1, 1] = VectorBase.DotProduct(normalizedTSVector, weightsByPromDF);
            cumulativeNormalizedMetricsMatrix[1, 2] = topics.Sum(t => (double)t.NormalizedTS);

            // TD
            cumulativeNormalizedMetricsMatrix[2, 0] = VectorBase.DotProduct(normalizedTDVector, weightsByAlloc);
            cumulativeNormalizedMetricsMatrix[2, 1] = VectorBase.DotProduct(normalizedTDVector, weightsByPromDF);
            cumulativeNormalizedMetricsMatrix[2, 2] = topics.Sum(t => (double)t.NormalizedTD);

            // Pass the summary, the topics and the 3x3 matrix on for regeneration of the model synopsis.
            RegenerateModelSynopsis(metricsFile, modelSummary, topics, cumulativeNormalizedMetricsMatrix);
        }

        /// <summary>
        /// Parses a comma-separated "TopDocs" column into (document id, score) pairs.
        /// Entries are either "id|score" or a bare "id"; bare ids get a score of -1.0.
        /// </summary>
        private static List<Tuple<int, double>> ParseProminentDocuments(string topDocs)
        {
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            // The presence of '|' anywhere in the column decides the format for every entry.
            bool hasScores = topDocs.IndexOf('|') >= 0;

            var documents = new List<Tuple<int, double>>();
            foreach (var entry in topDocs.Split(','))
            {
                if (hasScores)
                {
                    var fields = entry.Split('|');
                    documents.Add(new Tuple<int, double>(int.Parse(fields[0], invariant), double.Parse(fields[1], invariant)));
                }
                else
                {
                    documents.Add(new Tuple<int, double>(int.Parse(entry, invariant), -1.0));
                }
            }

            return documents;
        }
Exemplo n.º 2
0
 /// <summary>
 /// Computes the Euclidean distance between two unit vectors.
 /// </summary>
 /// <remarks>
 /// For unit vectors, |v1 - v2|^2 = 2 - 2 * (v1 . v2), so only the dot product is needed.
 /// The result is only a true Euclidean distance when both inputs have unit length.
 /// </remarks>
 /// <param name="v1">Unit LDA topic vector.</param>
 /// <param name="v2">Unit LDA topic vector.</param>
 /// <returns>The square root of 2 - 2 * (v1 . v2), with the squared distance clamped at zero.</returns>
 public static double Distance(DenseVector v1, DenseVector v2)
 {
     double squaredDistance = 2 - 2 * VectorBase.DotProduct(v1, v2);

     // Rounding error can push the dot product of (near-)identical unit vectors slightly above 1,
     // which would make the squared distance negative and Math.Sqrt return NaN. Clamp at zero.
     return Math.Sqrt(Math.Max(0.0, squaredDistance));
 }