/// <summary>
/// Converts the term frequencies in <paramref name="queryTfIdfDictionary"/> into TF-IDF weights
/// (in place), scores every entry of <c>TfIdfDictionary</c> against the query with
/// <c>Helper.GetSimilarity</c>, and hands the scores to
/// <c>WriteDocumentVectorToFileOrderedDescending</c>.
/// </summary>
/// <param name="queryTfIdfDictionary">
/// Query words mapped to their term frequency; each value is overwritten with its TF-IDF weight.
/// </param>
/// <param name="appendTextToFileName">Text appended to "Vsm" to form the output name.</param>
private void ExecuteSub(MyDoubleDictionary queryTfIdfDictionary, string appendTextToFileName = "")
{
    // Enumerable.Max throws InvalidOperationException on an empty sequence. An empty query
    // is handled like a query whose words are all unknown: it simply carries no weights.
    // The fallback divisor is never read, because the loop below has nothing to visit.
    double maxFrequency = queryTfIdfDictionary.Any()
        ? queryTfIdfDictionary.Max(x => x.Value)
        : 0;

    // Normalize each term frequency by the maximum and multiply by the IDF.
    // ToList() takes a snapshot so the dictionary can be written to while iterating.
    foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
    {
        // Words absent from IdfDictionary get a weight of 0.
        queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key)
            ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key]
            : 0;
    }

    // Score the query against the TF-IDF vector of every code file.
    var similarityDictionary = new MyDoubleDictionary();
    foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
    {
        double cosineSimilarityWithUseCase = Helper.GetSimilarity(queryTfIdfDictionary, codeFileWithTfIdfDictionary.Value);
        similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
    }

    // Persist the scores, highest first.
    WriteDocumentVectorToFileOrderedDescending("Vsm" + appendTextToFileName, similarityDictionary);
}
/// <summary>
/// Initializes the data shared by the VSM, PMI and LSI methods:
/// fills the TF, IDF and TF-IDF dictionaries from <c>CodeFilesWithContent</c>.
/// </summary>
public static void InitializeForVsmSimLsi()
{
    // Pass 1: term frequencies per source file, and document frequencies overall.
    foreach (var codeFile in CodeFilesWithContent)
    {
        // Term-frequency dictionary for this file; Add(word) is expected to count
        // one occurrence of the word (per the original comments).
        var termFrequencies = new MyDoubleDictionary();
        foreach (string term in codeFile.Value)
        {
            termFrequencies.Add(term);
        }

        // Keep the term frequencies of this file.
        TfDictionary.Add(codeFile.Key, termFrequencies);

        // Each distinct word of the file contributes once, so at this point
        // IdfDictionary holds the document frequency (DF) of every word.
        foreach (var termEntry in termFrequencies)
        {
            IdfDictionary.Add(termEntry.Key);
        }
    }

    // Pass 2: turn DF into IDF, IDF = log10(T / DF), where T is the number of documents.
    int documentTotal = CodeFilesWithContent.Count;
    var documentFrequencies = IdfDictionary.ToList(); // snapshot, so the dictionary can be updated in the loop
    foreach (var dfEntry in documentFrequencies)
    {
        var documentFrequency = dfEntry.Value;
        IdfDictionary[dfEntry.Key] = Math.Log10(documentTotal / documentFrequency);
    }

    // Pass 3: TF-IDF for every source file.
    foreach (var fileEntry in TfDictionary)
    {
        // TF-IDF weights of a single source file.
        var weights = new MyDoubleDictionary();
        foreach (var termEntry in fileEntry.Value)
        {
            var inverseDocumentFrequency = IdfDictionary[termEntry.Key];
            weights.Add(termEntry.Key, termEntry.Value * inverseDocumentFrequency);
        }

        TfIdfDictionary.Add(fileEntry.Key, weights);
    }
}
/// <summary>
/// Runs the VSM method for one query: builds the query's TF-IDF vector, scores every entry
/// of <c>TfIdfDictionary</c> against it with a <c>CosineSimilarityCalculator</c>, and writes
/// the scores in descending order.
/// </summary>
/// <param name="outputFolderPath">
/// Prefix to which <c>VsmFileName</c> is appended (plain string concatenation) to form the output path.
/// </param>
/// <param name="bugName">Name used in the status messages.</param>
/// <param name="queryText">Words of the query text.</param>
public static void ComputeVsm(string outputFolderPath, string bugName, List<string> queryText)
{
    Utility.Status("Creating VSM: " + bugName);

    // Build the query dictionary; it starts out holding term frequencies
    // and is converted to TF-IDF weights below.
    MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();
    queryText.ForEach(queryTfIdfDictionary.Add);

    // Maximum term frequency, used to normalize the query's term frequencies.
    // Enumerable.Max throws InvalidOperationException on an empty sequence. An empty query
    // is handled like a query whose words are all unknown: it simply carries no weights.
    // The fallback divisor is never read, because the loop below has nothing to visit.
    double maxFrequency = queryTfIdfDictionary.Any()
        ? queryTfIdfDictionary.Max(x => x.Value)
        : 0;

    // Compute TF-IDF. ToList() takes a snapshot so the dictionary can be written to while iterating.
    foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
    {
        // Words absent from IdfDictionary get a weight of 0.
        queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key)
            ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key]
            : 0;
    }

    // Similarity of the query with each code file.
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);
    foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
    {
        double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
        similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
    }

    // Write the document vector in descending order (e.g. Project\001\Results\Vsm.txt per the original note).
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);

    Utility.Status("Completed VSM: " + bugName);
}
/// <summary>
/// Writes a vector to a file, one "key value" line per entry,
/// sorted by value from highest to lowest.
/// </summary>
/// <param name="filePath">Path of the file to write.</param>
/// <param name="vector">Entries to write.</param>
/// <param name="asInt">
/// When true, values are formatted with "##"; otherwise with "##.00000".
/// </param>
protected static void WriteDocumentVectorToFileOrderedDescending(string filePath, MyDoubleDictionary vector, bool asInt = false)
{
    string pattern;
    if (asInt)
    {
        pattern = "##";
    }
    else
    {
        pattern = "##.00000";
    }

    // Deferred query: evaluated when File.WriteAllLines enumerates it.
    var lines =
        from entry in vector.ToList()
        orderby entry.Value descending
        select entry.Key + " " + entry.Value.ToString(pattern);

    File.WriteAllLines(filePath, lines);
}
/// <summary>
/// Reads the corpus from <c>_sourceFilePath</c> and rebuilds <c>CodeFilesWithContent</c>,
/// <c>TfDictionary</c>, <c>IdfDictionary</c>, <c>TfIdfDictionary</c> and
/// <c>WordAndContainingFiles</c> from it.
/// </summary>
private void ProcessSourceCode()
{
    // Read all files: each line is split on "##"; part 0 is used as the file key
    // and part 1 as a ","-separated word list.
    CodeFilesWithContent = new Dictionary<string, List<string>>();
    foreach (var line in File.ReadAllLines(_sourceFilePath))
    {
        var parts = line.SplitWith("##");

        // Only words longer than two characters are kept.
        string[] words = parts[1].SplitWith(",").Where(w => w.Length > 2).ToArray();

        // NOTE(review): only the first 1/50th (integer division) of the filtered words is
        // stored. This looks like a sampling shortcut; confirm it is intended.
        int wordsToKeep = words.Length / 50;
        CodeFilesWithContent.Add(parts[0], words.Take(wordsToKeep).ToList());
    }

    // Compute TF and DF.
    TfDictionary = new Dictionary<string, MyDoubleDictionary>();
    IdfDictionary = new MyDoubleDictionary();
    TfIdfDictionary = new Dictionary<string, MyDoubleDictionary>();

    foreach (var codeFile in CodeFilesWithContent)
    {
        // Count every occurrence of every word in the file.
        var termFrequencies = new MyDoubleDictionary();
        foreach (string term in codeFile.Value)
        {
            termFrequencies.IncreaseCount(term);
        }

        // Save the TF result for the file.
        TfDictionary.Add(codeFile.Key, termFrequencies);

        // Each DISTINCT word of the file bumps the counter once, so at this point
        // IdfDictionary holds document frequencies.
        foreach (var termEntry in termFrequencies)
        {
            IdfDictionary.IncreaseCount(termEntry.Key);
        }
    }

    // Convert DF to IDF = log10(total documents / DF).
    int documentTotal = CodeFilesWithContent.Count;
    var documentFrequencies = IdfDictionary.ToList(); // snapshot, so the dictionary can be changed in the loop
    foreach (var dfEntry in documentFrequencies)
    {
        var documentFrequency = dfEntry.Value;
        IdfDictionary[dfEntry.Key] = Math.Log10(documentTotal / documentFrequency);
    }

    // TF-IDF for each file.
    foreach (var fileEntry in TfDictionary)
    {
        var weights = new MyDoubleDictionary();
        foreach (var termEntry in fileEntry.Value)
        {
            var inverseDocumentFrequency = IdfDictionary[termEntry.Key];
            weights.Add(termEntry.Key, termEntry.Value * inverseDocumentFrequency);
        }

        TfIdfDictionary.Add(fileEntry.Key, weights);
    }

    // Inverted index: word -> keys of the files that contain it (each file listed once per word).
    WordAndContainingFiles = new Dictionary<string, List<string>>();
    foreach (var codeFile in CodeFilesWithContent)
    {
        foreach (string word in codeFile.Value.Distinct())
        {
            List<string> containingFiles;
            if (!WordAndContainingFiles.TryGetValue(word, out containingFiles))
            {
                containingFiles = new List<string>();
                WordAndContainingFiles.Add(word, containingFiles);
            }

            containingFiles.Add(codeFile.Key);
        }
    }
}