/// <summary>
/// Computes the Jensen-Shannon similarity between the query term-frequency
/// dictionary and every source file's term-frequency dictionary, then writes
/// the scores in descending order to a "Jsm" + suffix results file.
/// </summary>
/// <param name="queryTfDictionary">Term frequencies of the query text.</param>
/// <param name="appendTextToFileName">Optional suffix appended to the output file name.</param>
private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
{
    // Vocabulary = every word seen in the source corpus or in the query.
    var allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryTfDictionary.Keys).Distinct().ToList();
    int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;

    // Build one probability vector per source file: P(w|d) = tf(w, d) / |d|.
    var sourceVectors = new Dictionary<string, double[]>();
    foreach (var fileWithTfCount in TfDictionary)
    {
        MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
        int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
        double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
        int counter = 0;
        foreach (var uniqueWord in allUniqueWordsInSourceAndQuery)
        {
            // TryGetValue replaces the original ContainsKey + indexer double lookup.
            vector[counter] = tfDictionary.TryGetValue(uniqueWord, out var tf)
                ? tf / totalWordsInFile
                : 0;
            counter++;
        }
        sourceVectors.Add(fileWithTfCount.Key, vector);
    }

    // Build the query probability vector the same way. The original kept a
    // separate HashSet of the keys and then indexed the dictionary anyway;
    // TryGetValue does both in a single lookup.
    double[] queryVector = new double[allUniqueWordsInSourceAndQueryCount];
    int queryCounter = 0;
    var totalQueryWordCount = queryTfDictionary.Sum(x => x.Value);
    foreach (var uniqueWord in allUniqueWordsInSourceAndQuery)
    {
        queryVector[queryCounter] = queryTfDictionary.TryGetValue(uniqueWord, out var queryTf)
            ? queryTf / totalQueryWordCount
            : 0;
        queryCounter++;
    }

    // JSD(p, q) = H((p + q) / 2) - H(p)/2 - H(q)/2; similarity = 1 - JSD.
    // H(q)/2 does not depend on the source file, so compute it once instead
    // of once per file as the original did.
    var qEntropy = 1.0 / 2 * queryVector.JensenEntropy();
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    foreach (var sourceFileWithVector in sourceVectors)
    {
        var p = sourceFileWithVector.Value;
        var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
        var pEntropy = 1.0 / 2 * p.JensenEntropy();
        var jensenDivergence = sumEntropy - pEntropy - qEntropy;
        var jensenSimilarity = 1 - jensenDivergence;
        similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
    }

    // WRITE TO FILE
    WriteDocumentVectorToFileOrderedDescending("Jsm" + appendTextToFileName, similarityDictionary);
}
/// <summary>
/// Cosine similarity between two sparse word-count vectors:
/// dot(v1, v2) / (|v1| * |v2|). Returns 0 when either vector has zero
/// magnitude so that no input can produce NaN or Infinity.
/// </summary>
/// <param name="vector1">First word-count vector.</param>
/// <param name="vector2">Second word-count vector.</param>
/// <returns>Cosine similarity in [0, 1] for non-negative counts; 0 for degenerate input.</returns>
public static double GetSimilarity(MyDoubleDictionary vector1, MyDoubleDictionary vector2)
{
    double length1 = GetLength(vector1);
    double length2 = GetLength(vector2);

    // Guard on the magnitudes themselves. The original only checked
    // vector2.Count == 0, so an empty (or all-zero) vector1 produced
    // 0 / 0 = NaN instead of 0.
    if (length1 == 0 || length2 == 0)
    {
        return 0;
    }

    // Only keys present in both vectors contribute to the dot product.
    double dotProduct = vector1
        .Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
        .Sum(wordWithCount => wordWithCount.Value * vector2[wordWithCount.Key]);

    return dotProduct / (length1 * length2);
}
/// <summary>
/// Builds a human-readable dump of the dot-product terms shared between
/// _vector1 and <paramref name="vector2"/>: one "word value1, value2" line
/// per key present in both vectors.
/// </summary>
/// <param name="vector2">The vector to intersect with _vector1.</param>
/// <returns>Newline-separated lines, one per common word; empty string if none.</returns>
public string GetSimilarityText(MyDoubleDictionary vector2)
{
    // NOTE(review): the original also computed GetLength(_vector1) and
    // GetLength(vector2) here but never used either result; those dead
    // locals are removed.
    var dotProductObj = _vector1
        .Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
        .Select(wordWithCount => new
        {
            Word = wordWithCount.Key,
            Value1 = wordWithCount.Value,
            Value2 = vector2[wordWithCount.Key]
        })
        // "##.000" formats with three decimal places; uses the current
        // culture, matching the original behavior.
        .Select(x => $"{x.Word} {x.Value1.ToString("##.000")}, {x.Value2.ToString("##.000")}");

    var dotProductString = string.Join(Environment.NewLine, dotProductObj);
    return dotProductString;
}
/// <summary>
/// Computes VSM scores (TF-IDF weighting + cosine similarity) between the
/// query text and every code file, and writes them in descending order to
/// the Vsm results file (e.g. Project\001\Results\Vsm.txt).
/// </summary>
/// <param name="outputFolderPath">Folder the result file is written into.</param>
/// <param name="bugName">Bug identifier, used only for status reporting.</param>
/// <param name="queryText">Tokenized query words (the bug-report text).</param>
public static void ComputeVsm(string outputFolderPath, string bugName, List<string> queryText)
{
    Utility.Status("Creating VSM: " + bugName);

    // Accumulate raw term frequencies of the query; Add counts each word.
    MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();
    queryText.ForEach(queryTfIdfDictionary.Add);

    // Maximum frequency normalizes TF. Guard the empty-query case: Max()
    // throws InvalidOperationException on an empty sequence.
    double maxFrequency = queryTfIdfDictionary.Count == 0
        ? 1
        : queryTfIdfDictionary.Max(x => x.Value);

    // Convert raw counts to TF-IDF in place; words with no IDF entry score 0.
    // TryGetValue replaces the original ContainsKey + indexer double lookup.
    foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
    {
        queryTfIdfDictionary[queryWordWithTf.Key] =
            IdfDictionary.TryGetValue(queryWordWithTf.Key, out var idf)
                ? (queryWordWithTf.Value / maxFrequency) * idf
                : 0;
    }

    // Cosine similarity of the query against each code file's TF-IDF vector.
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);
    foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
    {
        double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
        similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
    }

    // Write the document vector descending to the Vsm results file.
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);
    Utility.Status("Completed VSM: " + bugName);
}
/// <summary>
/// Computes Jensen-Shannon similarity between the query text and every code
/// file, and writes the scores in descending order to the Jen results file
/// (e.g. Project\001\Results\Jen.txt).
/// </summary>
/// <param name="outputFolderPath">Output folder (the per-bug folder).</param>
/// <param name="bugName">Bug identifier, used only for status reporting.</param>
/// <param name="queryText">Tokenized query words (the bug-report text).</param>
public static void ComputeJen(string outputFolderPath, string bugName, List<string> queryText)
{
    Utility.Status("Computing JEN: " + bugName);

    // Vocabulary = every word seen in the source corpus or in the query.
    List<string> allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryText).Distinct().ToList();
    int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;

    // Build one probability vector per source file: P(w|d) = tf(w, d) / |d|.
    Dictionary<string, double[]> sourceVectors = new Dictionary<string, double[]>();
    foreach (var fileWithTfCount in TfDictionary)
    {
        MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
        // Total number of words in this source file.
        int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
        double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
        int counter = 0;
        foreach (var uniqueWord in allUniqueWordsInSourceAndQuery)
        {
            // TryGetValue replaces the ContainsKey + indexer double lookup.
            vector[counter] = tfDictionary.TryGetValue(uniqueWord, out var tf)
                ? tf / totalWordsInFile
                : 0;
            counter++;
        }
        sourceVectors.Add(fileWithTfCount.Key, vector);
    }

    // Pre-count the query words once. The original called
    // queryText.Contains(...) and queryText.Count(x => x == word) for every
    // vocabulary word — O(vocabulary * queryLength); a count dictionary
    // makes the query-vector pass O(vocabulary + queryLength).
    var queryWordCounts = new Dictionary<string, int>();
    foreach (var word in queryText)
    {
        queryWordCounts.TryGetValue(word, out var seen);
        queryWordCounts[word] = seen + 1;
    }

    // Query probability vector: P(w|q) = count(w) / |q|.
    double[] queryVector = new double[allUniqueWordsInSourceAndQueryCount];
    int queryCounter = 0;
    foreach (var uniqueWord in allUniqueWordsInSourceAndQuery)
    {
        queryVector[queryCounter] = queryWordCounts.TryGetValue(uniqueWord, out var count)
            ? (double)count / queryText.Count
            : 0;
        queryCounter++;
    }

    // JSD(p, q) = H((p + q) / 2) - H(p)/2 - H(q)/2; similarity = 1 - JSD.
    // H(q)/2 does not depend on the source file, so compute it once instead
    // of once per file as the original did.
    var qEntropy = 1.0 / 2 * queryVector.JensenEntropy();
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    foreach (var sourceFileWithVector in sourceVectors)
    {
        var p = sourceFileWithVector.Value;
        var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
        var pEntropy = 1.0 / 2 * p.JensenEntropy();
        var jensenDivergence = sumEntropy - pEntropy - qEntropy;
        var jensenSimilarity = 1 - jensenDivergence;
        // Map: source file key -> Jensen similarity.
        similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
    }

    // Write the document vector descending to the Jen results file.
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + JenFileName, similarityDictionary);
    Utility.Status("DONE Computing JEN: " + bugName);
}