/// <summary>
/// Calculates the topic distinctiveness (TD) of each topic and returns the average TD.
/// The TD of a topic is 1 minus the cosine similarity between the topic's word
/// distribution and the global word distribution (word term frequency / total term frequency).
/// Bad topics are skipped, so they contribute 0 to the average over all topics.
/// </summary>
/// <remarks>
/// Side effects: stores each good topic's TD in <c>topicMetrics</c>, and updates both
/// <c>avgGoodTopicDistinctiveness</c> (average over good topics only) and
/// <c>avgTopicDistinctiveness</c> (average over all topics). Either average is 0 when
/// its divisor would be zero.
/// </remarks>
/// <returns>average TD</returns>
public double ComputeAvgTopicDistinctiveness()
{
    var globalDist = this.wordTF.Select(x => (float)x / (float)this.totalTF).ToArray();
    DenseVector globalDistVector = new DenseVector(globalDist);

    // The field doubles as the running sum below; reset it so that repeated calls
    // do not pile new TD values on top of the previous call's average.
    this.avgTopicDistinctiveness = 0;

    for (int i = 0; i < this.NumTopics; i++)
    {
        if (this.topicsInfo[i].IsBadTopic)
        {
            continue;
        }

        var topicDist = this.topicWords[i];
        DenseVector topicDistVector = new DenseVector(topicDist);
        var td = 1 - VectorBase.CosineSimilarity(topicDistVector, globalDistVector);

        this.topicMetrics[i, (int)MetricType.Distinctiveness] = td;
        this.avgTopicDistinctiveness += td;
    }

    // At this point avgTopicDistinctiveness still holds the SUM of TDs over good topics.
    var goodTopicCount = this.topicsInfo.Count(x => !x.IsBadTopic);

    // Guard both divisions: 0 / 0 would otherwise produce NaN.
    this.avgGoodTopicDistinctiveness = (goodTopicCount > 0)
        ? this.avgTopicDistinctiveness / goodTopicCount
        : 0;
    this.avgTopicDistinctiveness = (this.NumTopics > 0)
        ? this.avgTopicDistinctiveness / this.NumTopics
        : 0;

    return this.avgTopicDistinctiveness;
}
/// <summary>
/// Given a group of N vectors, compute pairwise comparisons and return the results in a matrix.
/// The comparison operator can be Distance or Similarity.
/// </summary>
/// <param name="func">The comparison function to apply between pairs of vectors:
/// <c>VectorFunction.Distance</c> selects <c>Distance</c>; any other value selects cosine similarity.</param>
/// <param name="n">Limit comparisons to the top n vectors in the vector group. If less than 1, compute
/// comparisons for all vectors being compared. Values larger than the number of vectors being compared
/// are clamped to that number.</param>
/// <param name="maxCountItemsToCompare">For each of the top n vectors, compare to the top maxCountItemsToCompare.
/// If less than 1 (or not smaller than the row count), compare the top n vs. all vectors in the group.
/// If both n and maxCountItemsToCompare are less than 1, the full set of pairwise comparisons is computed.
/// </param>
/// <returns>
/// A jagged upper-triangular matrix of doubles, or null when the group has no rows.
/// Row i holds (vectorCount - (i + 1)) scores; cell [i][j] is the score between vector i and
/// vector (i + j + 1). Pairs on or below the diagonal are not stored.
/// </returns>
public double[][] GenerateSimDistMatrix(VectorFunction func, int n, int maxCountItemsToCompare)
{
    if (this.RowCount <= 0)
    {
        return null;
    }

    // Are we being asked to compare top n vectors vs. only maxCountItemsToCompare vectors?
    int vectorCount = ((maxCountItemsToCompare > 0) && (maxCountItemsToCompare < this.RowCount))
        ? maxCountItemsToCompare
        : this.RowCount;

    // For vector i we only need to compute and store (vectorCount - (i + 1)) scores.
    // If asked to compare only the top n vectors against the rest, a matrix of height n suffices;
    // otherwise (n <= 0) compare all vs. all, which needs vectorCount - 1 rows.
    int matrixHeight = vectorCount - 1;
    if (n > 0)
    {
        // Clamp: a height above vectorCount would request a negative row length below and throw.
        matrixHeight = (n < vectorCount) ? n : vectorCount;
    }

    var funcMatrix = new double[matrixHeight][];
    for (int i = 0; i < matrixHeight; i++)
    {
        funcMatrix[i] = new double[vectorCount - (i + 1)];
    }

    if (this.Verbose)
    {
        StatusMessage.Write(string.Format("Comparing top {0} vs. {1} other items...", matrixHeight, vectorCount - 1));
    }

    // The choice of comparison does not change per cell, so decide it once.
    bool useDistance = func == VectorFunction.Distance;

    for (int y = 0; y < matrixHeight; y++)
    {
        var row = funcMatrix[y];
        for (int x = 0; x < row.Length; x++)
        {
            // Column x of row y corresponds to the vector at index x + (y + 1).
            int other = x + (y + 1);
            row[x] = useDistance
                ? Distance(this[y], this[other])
                : VectorBase.CosineSimilarity(this[y], this[other]);
        }
    }

    return funcMatrix;
}