/// <summary>
/// Builds a term-frequency dictionary for the query: each occurrence of a
/// word in <paramref name="queryText"/> increments that word's count.
/// </summary>
/// <param name="queryText">Tokenized query words (duplicates allowed).</param>
/// <returns>Dictionary mapping each query word to its occurrence count.</returns>
public MyDoubleDictionary GetQueryIntDictionary(List<string> queryText)
{
    var counts = new MyDoubleDictionary();
    foreach (var word in queryText)
    {
        counts.IncreaseCount(word);
    }

    return counts;
}
/// <summary>
/// VSM ranking: converts the query term counts to TF-IDF (normalized by the
/// maximum query term frequency), scores every source file by cosine
/// similarity and writes the ranking to "Vsm{appendTextToFileName}".
/// </summary>
/// <param name="queryTfIdfDictionary">Query term frequencies; mutated in place into TF-IDF weights.</param>
/// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
private void ExecuteSub(MyDoubleDictionary queryTfIdfDictionary, string appendTextToFileName = "")
{
    // Normalize each query term by the most frequent one, then weight by IDF.
    // Words unseen in the corpus get weight 0. ToList() lets us mutate while iterating.
    double maxFrequency = queryTfIdfDictionary.Max(x => x.Value);
    foreach (var entry in queryTfIdfDictionary.ToList())
    {
        double weight = 0;
        if (IdfDictionary.ContainsKey(entry.Key))
        {
            weight = entry.Value / maxFrequency * IdfDictionary[entry.Key];
        }

        queryTfIdfDictionary[entry.Key] = weight;
    }

    // Cosine similarity between the query vector and every source file's TF-IDF vector.
    var similarityDictionary = new MyDoubleDictionary();
    foreach (var fileWithVector in TfIdfDictionary)
    {
        double similarity = Helper.GetSimilarity(queryTfIdfDictionary, fileWithVector.Value);
        similarityDictionary.Add(fileWithVector.Key, similarity);
    }

    // Persist the ranking, highest score first.
    WriteDocumentVectorToFileOrderedDescending("Vsm" + appendTextToFileName, similarityDictionary);
}
/// <summary>
/// Jensen-Shannon ranking: builds a word-probability vector for every source
/// file and for the query over the combined vocabulary, computes
/// 1 - JS-divergence per file and writes the ranking to "Jsm{appendTextToFileName}".
/// </summary>
/// <param name="queryTfDictionary">Query term frequencies.</param>
/// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
{
    // create the vector for each source code
    // Vocabulary = corpus words union query words; fixes the vector dimension.
    var allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryTfDictionary.Keys).Distinct().ToList();
    int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;
    var sourceVectors = new Dictionary <string, double[]>();
    TfDictionary.ToList().ForEach(fileWithTfCount => {
        MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
        int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
        // Per-file probability vector: P(w) = tf(w) / totalWordsInFile.
        double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
        int counter = 0;
        allUniqueWordsInSourceAndQuery.ForEach(uniqueWord => {
            vector[counter] = tfDictionary.ContainsKey(uniqueWord) ? tfDictionary[uniqueWord] / totalWordsInFile : 0;
            counter++;
        });
        sourceVectors.Add(fileWithTfCount.Key, vector);
    });
    // create the vector for query (same vocabulary order as the file vectors)
    double[] queryVector = new double[allUniqueWordsInSourceAndQueryCount];
    int queryCounter = 0;
    var queryHashSet = new HashSet <string>(queryTfDictionary.Keys); // O(1) membership checks
    var totalQueryWordCount = queryTfDictionary.Sum(x => x.Value);
    allUniqueWordsInSourceAndQuery.ForEach(uniqueWord => {
        queryVector[queryCounter] = queryHashSet.Contains(uniqueWord) ? (double)queryTfDictionary[uniqueWord] / totalQueryWordCount : 0;
        queryCounter++;
    });
    // calculate H(p), H(q) and H(p + q)
    // JS divergence = H(sum) - H(p)/2 - H(q)/2; similarity = 1 - divergence.
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    sourceVectors.ToList().ForEach(sourceFileWithVector => {
        var p = sourceFileWithVector.Value;
        var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
        var pEntropy = 1.0 / 2 * p.JensenEntropy();
        var qEntropy = 1.0 / 2 * queryVector.JensenEntropy();
        var jensenDivergence = sumEntropy - pEntropy - qEntropy;
        var jensenSimilarity = 1 - jensenDivergence;
        similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
    });
    // WRITE TO FILE
    WriteDocumentVectorToFileOrderedDescending("Jsm" + appendTextToFileName, similarityDictionary);
}
/// <summary>
/// Cosine similarity between two sparse word-weight vectors.
/// </summary>
/// <param name="vector1">First word -&gt; weight vector.</param>
/// <param name="vector2">Second word -&gt; weight vector.</param>
/// <returns>dot(v1, v2) / (|v1| * |v2|), or 0 when either vector has zero length.</returns>
public static double GetSimilarity(MyDoubleDictionary vector1, MyDoubleDictionary vector2)
{
    double length1 = GetLength(vector1);
    double length2 = GetLength(vector2);

    // Guarding both lengths (the original only checked vector2.Count == 0)
    // avoids a divide-by-zero NaN when vector1 is empty, or when either
    // vector is non-empty but all-zero.
    if (length1 == 0 || length2 == 0)
    {
        return 0;
    }

    // Dot product over the words the two vectors share.
    double dotProduct = vector1.Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
                               .Sum(wordWithCount => wordWithCount.Value * vector2[wordWithCount.Key]);
    return dotProduct / (length1 * length2);
}
/// <summary>
/// Computes the APm (altered normalized PMI) ranking of every source file
/// against the requirement text and writes the descending result to
/// pmiOutputFolderPath + APmFileName.
/// </summary>
/// <param name="pmiOutputFolderPath">Folder the result file is written to.</param>
/// <param name="reqName">Requirement name, used only for status logging.</param>
/// <param name="reqText">Tokenized requirement words (duplicates allowed).</param>
public static void ComputeAPm(string pmiOutputFolderPath, string reqName, List <string> reqText)
{
    Utility.Status("Creating Apm: " + reqName);
    // Create list of word contained in query
    List <string> distinctReqWordList = reqText.Distinct().ToList();
    // nPMI matrix: requirement word -> (source word -> altered normalized PMI)
    DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
    int n = CodeFilesWithContent.Count; // total number of source documents
    // Compute pmi for each word in WordAndContainingFiles and unique words in query
    foreach (var reqWordW2 in distinctReqWordList)
    {
        MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();
        foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
        {
            // NOTE(review): this checks reqWordW2 (whether the corpus knows the
            // requirement word at all); it is invariant across the inner loop.
            bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(reqWordW2);
            // Document frequencies of W1, W2 and their co-occurrence.
            int countW1 = WordAndContainingFiles[sourceWordW1].Count;
            //double averageCountW1Files = _wordAndContainingFiles[sourceWordW1].Select(x => _codeFilesWithContent[x].Count).Average();
            int countW2 = sourceContainsUseCaseWord ? WordAndContainingFiles[reqWordW2].Count : 0;
            //double averageCountW2Files = sourceContainsUseCaseWord ? _wordAndContainingFiles[reqWordW2].Select(x => _codeFilesWithContent[x].Count).Average() : 0;
            // if query contains source then add 1 (query contains usecase word + source word
            // if source contains query word find the intersection of files containing both words
            int countW1W2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[reqWordW2]).Count() : 0;
            //double averageCountW1W2Files = sourceContainsUseCaseWord ? _wordAndContainingFiles[sourceWordW1].Intersect(_wordAndContainingFiles[reqWordW2]).Select(x => _codeFilesWithContent[x].Count).Average() : 0;
            // d1 and d2 will never be 0, d1d2 however can be
            double nPmi;
            if (countW1W2 == 0)
            {
                // no cooccurence
                nPmi = -1;
            }
            else
            {
                if (countW1 == countW1W2 && countW2 == countW1W2)
                {
                    // perfect co-occurrence
                    nPmi = 1;
                }
                else
                {
                    // Normalized PMI, additionally scaled by the co-occurrence
                    // document ratio — the "altered" part of APm.
                    nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1) * ((double)countW1W2 / CodeFilesWithContent.Count);
                }
            }
            nPmiDictionary.Add(sourceWordW1, nPmi);
        }
        nPmiMatrix.Add(reqWordW2, nPmiDictionary);
    }
    // Fold the word-level similarity matrix into one TSS score per source file.
    MyDoubleDictionary tssDocumentDictionary = GetTssAltered(reqText, nPmiMatrix, -1);
    WriteDocumentVectorToFileOrderedDescending(pmiOutputFolderPath + APmFileName, tssDocumentDictionary);
    Utility.Status("Completed APm: " + reqName);
}
/// <summary>
/// Shared initialization for the VSM / PMI / LSI methods:
/// populates TfDictionary, IdfDictionary and TfIdfDictionary
/// from CodeFilesWithContent.
/// </summary>
public static void InitializeForVsmSimLsi()
{
    // Compute TF and IDF.
    foreach (var fileAndItsWords in CodeFilesWithContent)
    {
        // Term-frequency dictionary for this one source file.
        MyDoubleDictionary fileTfDictionary = new MyDoubleDictionary();
        // Count every word occurrence in the file.
        foreach (string word in fileAndItsWords.Value)
        {
            fileTfDictionary.Add(word);
        }
        // Save the per-file term frequencies.
        TfDictionary.Add(fileAndItsWords.Key, fileTfDictionary);
        // For each DISTINCT word in the file, add 1 per containing document;
        // at this point IdfDictionary actually holds document frequencies (DF).
        foreach (var wordAndItsCount in fileTfDictionary)
        {
            IdfDictionary.Add(wordAndItsCount.Key);
        }
    }
    // Convert DF to IDF: IDF = log10(totalDocuments / DF).
    int totalNumberOfDocuments = CodeFilesWithContent.Count;
    foreach (var wordAndItsDocumentCount in IdfDictionary.ToList()) // ToList so the dictionary can be modified while iterating
    {
        IdfDictionary[wordAndItsDocumentCount.Key] = Math.Log10(totalNumberOfDocuments / wordAndItsDocumentCount.Value);
    }
    // Build the TF-IDF vector for every file.
    foreach (var sourceFileWithTfDictionary in TfDictionary)
    {
        // TF-IDF vector of a single source file.
        MyDoubleDictionary fileTfIdfDictionary = new MyDoubleDictionary();
        foreach (var wordWithTfCount in sourceFileWithTfDictionary.Value)
        {
            fileTfIdfDictionary.Add(wordWithTfCount.Key, wordWithTfCount.Value * IdfDictionary[wordWithTfCount.Key]);
        }
        TfIdfDictionary.Add(sourceFileWithTfDictionary.Key, fileTfIdfDictionary);
    }
}
/// <summary>
/// Produces a diagnostic listing of the words shared by the stored query
/// vector and <paramref name="vector2"/>: one "word value1, value2" entry
/// per line, values formatted with the "##.000" pattern.
/// </summary>
/// <param name="vector2">Document vector to compare against the stored query vector.</param>
/// <returns>Newline-separated lines, one per shared word; empty string if none are shared.</returns>
public string GetSimilarityText(MyDoubleDictionary vector2)
{
    // The original also computed GetLength(_vector1) and GetLength(vector2)
    // here, but never used the results — dead computations removed.
    var sharedWordLines = _vector1
        .Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key))
        .Select(wordWithCount => new
        {
            Word = wordWithCount.Key,
            Value1 = wordWithCount.Value,
            Value2 = vector2[wordWithCount.Key]
        })
        .Select(x => $"{x.Word} {x.Value1.ToString("##.000")}, {x.Value2.ToString("##.000")}");
    return string.Join(Environment.NewLine, sharedWordLines);
}
/// <summary>
/// Computes the VSM (vector space model) ranking: builds a TF-IDF vector for
/// the query and scores every source file by cosine similarity.
/// </summary>
/// <param name="outputFolderPath">Folder the result file is written to.</param>
/// <param name="bugName">Bug name, used only for status logging.</param>
/// <param name="queryText">Tokenized query words.</param>
public static void ComputeVsm(string outputFolderPath, string bugName, List <string> queryText)
{
    Utility.Status("Creating VSM: " + bugName);
    // Build the query term-frequency dictionary.
    MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();
    queryText.ForEach(queryTfIdfDictionary.Add);
    // Highest term frequency in the query, used for normalization.
    double maxFrequency = queryTfIdfDictionary.Max(x => x.Value);
    // Convert TF to TF-IDF in place; words unknown to the corpus get weight 0.
    foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
    {
        queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key) ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key] : 0;
    }
    // Similarity of the query with each source file's TF-IDF vector.
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);
    foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
    {
        double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
        similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
    }
    // Write the ranking descending, e.g. to Project\001\Results\Vsm.txt.
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);
    Utility.Status("Completed VSM: " + bugName);
}
/// <summary>
/// Computes the Jensen-Shannon ranking: builds a word-probability vector for
/// every source file and for the query over the combined vocabulary, scores
/// each file as 1 - JS-divergence and writes the descending result to
/// outputFolderPath + JenFileName.
/// </summary>
/// <param name="outputFolderPath">Output folder (per-bug folder).</param>
/// <param name="bugName">Bug name, used only for status logging.</param>
/// <param name="queryText">Tokenized query words.</param>
public static void ComputeJen(string outputFolderPath, string bugName, List <string> queryText)
{
    Utility.Status("Computing JEN: " + bugName);
    // Vocabulary = corpus words union query words; fixes the vector dimension.
    List <string> allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryText).Distinct().ToList();
    int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;
    // One probability vector per source file: Pd(w) = f(w, d) / Td.
    Dictionary <string, double[]> sourceVectors = new Dictionary <string, double[]>();
    TfDictionary.ToList().ForEach(fileWithTfCount => {
        MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
        // Total word count of this source file.
        int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
        double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
        int counter = 0;
        allUniqueWordsInSourceAndQuery.ForEach(uniqueWord => {
            vector[counter] = tfDictionary.ContainsKey(uniqueWord) ? tfDictionary[uniqueWord] / totalWordsInFile : 0;
            counter++;
        });
        sourceVectors.Add(fileWithTfCount.Key, vector);
    });
    // Query probability vector. Counting occurrences once up front replaces the
    // original per-vocabulary-word Contains + Count(x => x == w) scans over
    // queryText (O(V*Q) -> O(V + Q)); the resulting values are identical.
    Dictionary <string, int> queryWordCounts = new Dictionary <string, int>();
    queryText.ForEach(word => {
        if (!queryWordCounts.ContainsKey(word))
        {
            queryWordCounts.Add(word, 0);
        }
        queryWordCounts[word]++;
    });
    double[] queryVector = new double[allUniqueWordsInSourceAndQueryCount];
    int queryCounter = 0;
    allUniqueWordsInSourceAndQuery.ForEach(uniqueWord => {
        queryVector[queryCounter] = queryWordCounts.ContainsKey(uniqueWord) ? (double)queryWordCounts[uniqueWord] / queryText.Count : 0;
        queryCounter++;
    });
    // Compute H(p), H(q) and H(p + q):
    // JS divergence = H(sum) - H(p)/2 - H(q)/2; similarity = 1 - divergence.
    MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();
    sourceVectors.ToList().ForEach(sourceFileWithVector => {
        var p = sourceFileWithVector.Value;
        var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
        var pEntropy = 1.0 / 2 * p.JensenEntropy();
        var qEntropy = 1.0 / 2 * queryVector.JensenEntropy();
        var jensenDivergence = sumEntropy - pEntropy - qEntropy;
        var jensenSimilarity = 1 - jensenDivergence;
        // Source file key -> Jensen similarity.
        similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
    });
    // Write the ranking descending, e.g. to Project\001\Results\Jen.txt.
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + JenFileName, similarityDictionary);
    Utility.Status("DONE Computing JEN: " + bugName);
}
/// <summary>
/// LSI ranking via truncated SVD: builds the term-by-document matrix,
/// decomposes it, folds the query into each k-dimensional latent space
/// (for every k in LsiKs) and writes one cosine-similarity ranking file per k.
/// </summary>
/// <param name="queryTfDictionary">Query term frequencies.</param>
/// <param name="appendTextToFileName">Suffix appended to each output file name.</param>
private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
{
    Log("SVD: " + LogAdd);
    // create the matrix
    int totalNumberOfSourceFiles = TfDictionary.Count;
    int totalDistinctTermsInAllSourceFiles = IdfDictionary.Count;
    // Stable index assignment: file name -> column, word -> row.
    Dictionary <string, int> allSourceFilesWithIndex = TfDictionary.Keys.Select((x, index) => new { Name = x, Index = index }).ToDictionary(x => x.Name, x => x.Index);
    Dictionary <string, int> allSourceWordsWithIndex = IdfDictionary.Keys.Select((x, index) => new { Name = x, Index = index }).ToDictionary(x => x.Name, x => x.Index);
    double[,] sourceMatrix = new double[totalDistinctTermsInAllSourceFiles, totalNumberOfSourceFiles]; // row, col row is word col docs
    foreach (var fileNameWithTfDictionary in TfDictionary)
    {
        int fileIndex = allSourceFilesWithIndex[fileNameWithTfDictionary.Key];
        foreach (var fileWordWithTf in fileNameWithTfDictionary.Value)
        {
            sourceMatrix[allSourceWordsWithIndex[fileWordWithTf.Key], fileIndex] = fileWordWithTf.Value;
        }
    }
    // create matrix
    Matrix generalMatrix = new Matrix(sourceMatrix);
    // singular value decomposition
    SVD svd = new SVD(generalMatrix);
    // Truncate U, S and V^T to each requested rank k, skipping ks larger than
    // the number of singular values available.
    _uk = new Dictionary <int, Matrix>();
    _sk = new Dictionary <int, Matrix>();
    _vkTranspose = new Dictionary <int, Matrix>();
    LsiKs.Where(x => x <= svd.S.Cols).ToList().ForEach(k => {
        Log("SVD " + k + ": " + LogAdd);
        _uk.Add(k, new Matrix(svd.U.ToArray(), svd.U.Rows, k));
        _sk.Add(k, new Matrix(svd.S.ToArray(), k, k));
        _vkTranspose.Add(k, new Matrix(svd.VH.ToArray(), k, svd.VH.Cols));
    });
    // create one for query as well: a 1 x terms row vector.
    // NOTE(review): this adds 1 per DISTINCT query key, ignoring each word's
    // frequency in queryTfDictionary — confirm that is intended.
    double[,] queryMatrixTranspose = new double[1, totalDistinctTermsInAllSourceFiles];
    queryTfDictionary.Keys.ToList().ForEach(queryWord => {
        if (allSourceWordsWithIndex.ContainsKey(queryWord))
        {
            queryMatrixTranspose[0, allSourceWordsWithIndex[queryWord]] = queryMatrixTranspose[0, allSourceWordsWithIndex[queryWord]] + 1;
        }
    });
    var outputResultFolderPath = _outputLsiFolderPath + @"\Lsi\";
    if (!Directory.Exists(outputResultFolderPath))
    {
        Directory.CreateDirectory(outputResultFolderPath);
    }
    // Resume support: only compute ks whose result file does not exist yet.
    var ks = _uk.Keys.Where(x => !File.Exists(outputResultFolderPath + x + ".txt")).ToList();
    foreach (var k in ks)
    {
        var uk = _uk[k];
        var sk = _sk[k];
        var vkTranspose = _vkTranspose[k];
        // Fold the query into the latent space: q' = q * Uk * Sk^-1.
        var q = new Matrix(queryMatrixTranspose);
        var qv = q * uk * sk.Inverse();
        var qDoubles = qv.RowVector(0).ToArray().ToList();
        // Cosine similarity of the folded query with each document column of Vk^T.
        var similarityList = allSourceFilesWithIndex.Select(doc => new KeyValuePair <string, double>(doc.Key, Helper.GetSimilarity(qDoubles, vkTranspose.ColVector(doc.Value).ToArray().ToList())));
        var similarityDictionary = similarityList.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
        WriteDocumentVectorToFileOrderedDescending(@"Lsi\Lsi_" + k + appendTextToFileName, similarityDictionary);
    }
}
/// <summary>
/// Writes a document vector to <paramref name="filePath"/>, one
/// "key value" line per entry, ordered by descending value.
/// </summary>
/// <param name="filePath">Destination file path.</param>
/// <param name="vector">Document scores to write.</param>
/// <param name="asInt">When true, values are formatted without a decimal part.</param>
protected static void WriteDocumentVectorToFileOrderedDescending(string filePath, MyDoubleDictionary vector, bool asInt = false)
{
    string pattern = asInt ? "##" : "##.00000";
    var orderedLines = vector.ToList()
        .OrderByDescending(x => x.Value)
        .Select(x => x.Key + " " + x.Value.ToString(pattern));
    File.WriteAllLines(filePath, orderedLines);
}
/// <summary>
/// Computes the NGD (normalized Google distance) based ranking of every
/// source file against the query and writes the descending result to
/// ngdOutputFolderPath + NgdFileName.
/// </summary>
/// <param name="ngdOutputFolderPath">Folder the result file is written to.</param>
/// <param name="bugName">Bug name, used only for status logging.</param>
/// <param name="fileText">Tokenized query words.</param>
public static void ComputeNgd(string ngdOutputFolderPath, string bugName, List <string> fileText)
{
    Utility.Status("Creating NGD: " + bugName);
    MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();
    double logD = Math.Log10(CodeFilesWithContent.Count + 1); // just make the N bigger than any
    // Create list of word contained in query
    List <string> distinctQueryWordList = fileText.Distinct().ToList(); // DISTINCT HERE but since its calculating NGD done remove it
    // NGD matrix: query word -> (source word -> similarity).
    DocumentDictionaryAny <MyDoubleDictionary> ngdMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
    foreach (var queryWordW2 in distinctQueryWordList)
    {
        MyDoubleDictionary ngdDictionary = new MyDoubleDictionary();
        foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
        {
            // NOTE(review): checks queryWordW2 (whether the corpus knows the query
            // word at all); it is constant across the inner loop.
            bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(queryWordW2);
            int countD1 = WordAndContainingFiles[sourceWordW1].Count; // number of file containing W1 + if query also contains the word
            int countD2 = sourceContainsUseCaseWord ? WordAndContainingFiles[queryWordW2].Count : 0;
            // if query contains source then add 1 (query contains usecase word + source word
            // if source contains query word find the intersection of files containing both words
            int countD1D2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;
            // d1 and d2 will never be 0, d1d2 however can be
            double ngd = (countD1D2 == 0) ? 0 : ComputenNgd(countD1, countD2, countD1D2, logD);
            ngdDictionary.Add(sourceWordW1, ngd);
        }
        ngdMatrix.Add(queryWordW2, ngdDictionary);
    }
    // TSS evaluation: fold the word-level similarities into a per-file score.
    //List<string> distinctUseCaseWordListForTss = fileText.Distinct().ToList(); //DISTINCT HERE
    List <string> distinctQueryWordListForTss = fileText.ToList(); //DISTINCT HERE
    int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        //List<string> distinctSourceWords = sourceFileWithWords.Value.Distinct().ToList(); //DISTINCT HERE
        List <string> distinctSourceWords = sourceFileWithWords.Value.ToList(); //DISTINCT HERE
        double sumQueryTimeIdf = 0.0;
        double sumQueryIdf = 0.0;
        // Query -> source direction: for each query word take its best similarity
        // to any word of this file, weighted by the query word's IDF.
        foreach (var queryWord in distinctQueryWordListForTss)
        {
            double maxSim = -1;
            foreach (var sourceWord in distinctSourceWords)
            {
                double currentNgd = ngdMatrix[queryWord][sourceWord];
                if (maxSim < currentNgd)
                {
                    maxSim = currentNgd;
                }
            }
            // if term does not occur in any corpus then its only in use case hence 1
            double idf = 0;
            if (WordAndContainingFiles.ContainsKey(queryWord))
            {
                idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
            }
            sumQueryIdf += idf;
            sumQueryTimeIdf += (maxSim * idf);
        }
        // Source -> query direction, symmetric to the loop above.
        double sumCorpusTimeIdf = 0.0;
        double sumCorpusIdf = 0.0;
        foreach (string sourceWord in distinctSourceWords)
        {
            double maxSim = -1;
            foreach (string queryWord in distinctQueryWordListForTss)
            {
                double currentNgd = ngdMatrix[queryWord][sourceWord];
                if (maxSim < currentNgd)
                {
                    maxSim = currentNgd;
                }
            }
            // sourceWord has to be in IdfDictionary
            double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);
            sumCorpusIdf += idf;
            sumCorpusTimeIdf += (maxSim * idf);
        }
        // TSS = average of the two directional scores.
        double tss = (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
        tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
    }
    WriteDocumentVectorToFileOrderedDescending(ngdOutputFolderPath + NgdFileName, tssDocumentDictionary);
    Utility.Status("Completed NGD: " + bugName);
}
/// <summary>
/// PMI ranking using a precomputed on-disk nPMI cache: loads one cached
/// dictionary per distinct query word, then scores every source file with the
/// IDF-weighted TSS formula and writes "Pmi{appendTextToFileName}".
/// </summary>
/// <param name="queryTexts">Tokenized query words (duplicates allowed).</param>
/// <param name="pmiCacheFolderPath">Folder holding the "_{word}.txt" cache files.</param>
/// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
public void Execute(List <string> queryTexts, string pmiCacheFolderPath, string appendTextToFileName)
{
    var queryTextsDistinct = queryTexts.Distinct().ToList();
    var tssDocumentDictionary = new MyDoubleDictionary();
    // Load the cached nPMI row for every distinct query word.
    // NOTE(review): assumes every "_{word}.txt" cache file exists — no fallback here.
    var nPmiMatrix = new Dictionary <string, Dictionary <string, double> >();
    foreach (var queryText in queryTextsDistinct)
    {
        var pmiDictionaryString = Newtonsoft.Json.JsonConvert.DeserializeObject <Dictionary <string, string> >(File.ReadAllText(Path.Combine(pmiCacheFolderPath, $"_{queryText}.txt")));
        var pmiDictionary = pmiDictionaryString.ToDictionary(x => x.Key, x => double.Parse(x.Value));
        nPmiMatrix.Add(queryText, pmiDictionary);
    }
    // Compute pmi for each word in WordAndContainingFiles and unique words in query
    int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
    int counter = 0;
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        counter++;
        Log($"{_appendLog} Running PMI: {counter} of {CodeFilesWithContent.Count}");
        var sourceWords = sourceFileWithWords.Value.ToList();
        double sumQueryTimeIdf = 0.0;
        double sumQueryIdf = 0.0;
        // Query -> source direction; per-word max similarities are memoized so
        // duplicate query words don't rescan the file.
        var queryTextsDistinctMaxSimDictionary = new Dictionary <string, double>();
        foreach (var queryWord in queryTexts)
        {
            if (!queryTextsDistinctMaxSimDictionary.ContainsKey(queryWord))
            {
                // Best similarity of this query word to any word of the file (-1 when absent).
                double maxSimCurrent = sourceWords.AsParallel().Select(x => nPmiMatrix[queryWord].ContainsKey(x) ? nPmiMatrix[queryWord][x] : -1).MyMax(-1);
                queryTextsDistinctMaxSimDictionary.Add(queryWord, maxSimCurrent);
            }
            double maxSim = queryTextsDistinctMaxSimDictionary[queryWord];
            // if term does not occur in any corpus then its only in use case hence -1
            double idf = 0;
            if (WordAndContainingFiles.ContainsKey(queryWord))
            {
                idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
            }
            sumQueryIdf += idf;
            sumQueryTimeIdf += (maxSim * idf);
        }
        // Source -> query direction, memoized the same way.
        double sumCorpusTimeIdf = 0.0;
        double sumCorpusIdf = 0.0;
        var sourceWordsAsIntsMaxSimDictionary = new Dictionary <string, double>();
        foreach (var sourceWord in sourceWords)
        {
            if (!sourceWordsAsIntsMaxSimDictionary.ContainsKey(sourceWord))
            {
                double maxSimCurrent = queryTextsDistinct.AsParallel().Select(x => nPmiMatrix[x].ContainsKey(sourceWord) ? nPmiMatrix[x][sourceWord] : -1).MyMax(-1);
                sourceWordsAsIntsMaxSimDictionary[sourceWord] = maxSimCurrent;
            }
            var maxSim = sourceWordsAsIntsMaxSimDictionary[sourceWord];
            // sourceWord has to be in IdfDictionary
            double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);
            sumCorpusIdf += idf;
            sumCorpusTimeIdf += (maxSim * idf);
        }
        // TSS = average of the two directional scores; -1 when either side has no IDF mass.
        double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
        tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
    }
    // WRITE TO FILE
    WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
}
/// <summary>
/// Loads the corpus from _sourceFilePath (one "name##w1,w2,..." line per
/// source file) and builds CodeFilesWithContent, TfDictionary, IdfDictionary,
/// TfIdfDictionary and the WordAndContainingFiles inverted index.
/// </summary>
private void ProcessSourceCode()
{
    // Read all files
    CodeFilesWithContent = new Dictionary <string, List <string> >();
    foreach (var line in File.ReadAllLines(_sourceFilePath))
    {
        var lineSplit = line.SplitWith("##");
        // Keep only tokens longer than 2 characters.
        string[] text = lineSplit[1].SplitWith(",").Where(x => x.Length > 2).ToArray();
        // NOTE(review): Take(text.Length / 50) keeps only the first ~2% of each
        // file's words (and drops files with < 50 tokens entirely) — looks like
        // a leftover sampling/debug cap; confirm this is intentional.
        CodeFilesWithContent.Add(lineSplit[0], text.Take(text.Length / 50).ToList());
    }
    // compute tf and idf
    TfDictionary = new Dictionary <string, MyDoubleDictionary>();
    IdfDictionary = new MyDoubleDictionary();
    TfIdfDictionary = new Dictionary <string, MyDoubleDictionary>();
    foreach (var fileAndItsWords in CodeFilesWithContent)
    {
        var fileTfDictionary = new MyDoubleDictionary();
        // for each word in the file add 1 to the count
        foreach (string word in fileAndItsWords.Value)
        {
            fileTfDictionary.IncreaseCount(word);
        }
        // save tf result for the file
        TfDictionary.Add(fileAndItsWords.Key, fileTfDictionary);
        // for each DISTINCT word found in the file increase the idf by 1. At this point idf holds document frequency
        foreach (var wordAndItsCount in fileTfDictionary)
        {
            IdfDictionary.IncreaseCount(wordAndItsCount.Key);
        }
    }
    // change df to idf: IDF = log10(totalDocuments / DF)
    int totalNumberOfDocuments = CodeFilesWithContent.Count;
    foreach (var wordAndItsDocumentCount in IdfDictionary.ToList()) // to list so that we can change the dictionary
    {
        IdfDictionary[wordAndItsDocumentCount.Key] = Math.Log10(totalNumberOfDocuments / wordAndItsDocumentCount.Value);
    }
    // update tfidf for each file
    foreach (var sourceFileWithTfDictionary in TfDictionary)
    {
        var fileTfIdfDictionary = new MyDoubleDictionary();
        foreach (var wordWithTfCount in sourceFileWithTfDictionary.Value)
        {
            fileTfIdfDictionary.Add(wordWithTfCount.Key, wordWithTfCount.Value * IdfDictionary[wordWithTfCount.Key]);
        }
        TfIdfDictionary.Add(sourceFileWithTfDictionary.Key, fileTfIdfDictionary);
    }
    // Inverted index: word -> list of files containing it.
    WordAndContainingFiles = new Dictionary <string, List <string> >();
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        sourceFileWithWords.Value.Distinct().ToList().ForEach(word => {
            if (!WordAndContainingFiles.ContainsKey(word))
            {
                WordAndContainingFiles.Add(word, new List <string>());
            }
            WordAndContainingFiles[word].Add(sourceFileWithWords.Key);
        });
    }
}
/// <summary>
/// PMI ranking computed in-process: builds the normalized-PMI matrix between
/// every distinct query word and every corpus word, then scores each source
/// file with the IDF-weighted TSS formula and writes "Pmi{appendTextToFileName}".
/// </summary>
/// <param name="queryText">Tokenized query words (duplicates allowed).</param>
/// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
public void Execute(List <string> queryText, string appendTextToFileName)
{
    ExecuteBase();
    var tssDocumentDictionary = new MyDoubleDictionary();
    // Create list of word contained in query
    var distinctQueryWordList = queryText.Distinct().ToList(); // DISTINCT HERE but since its calculating PMI done remove it
    var nPmiMatrix = new MyAnyDictionary <MyDoubleDictionary>();
    int n = CodeFilesWithContent.Count; // total number of source documents
    // Compute pmi for each word in WordAndContainingFiles and unique words in query
    foreach (var queryWordW2 in distinctQueryWordList)
    {
        var nPmiDictionary = new MyDoubleDictionary();
        foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
        {
            // Whether the corpus contains the query word at all (constant per inner loop).
            bool sourceContainsQueryWord = WordAndContainingFiles.ContainsKey(queryWordW2);
            int countW1 = WordAndContainingFiles[sourceWordW1].Count;
            int countW2 = sourceContainsQueryWord ? WordAndContainingFiles[queryWordW2].Count : 0;
            int countW1W2 = sourceContainsQueryWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;
            double nPmi;
            if (countW1W2 == 0)
            {
                // Never co-occur.
                nPmi = -1;
            }
            else if (countW1 == countW1W2 && countW2 == countW1W2)
            {
                // Always co-occur.
                nPmi = 1;
            }
            else
            {
                // Normalized PMI.
                nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n)) - 1;
            }
            nPmiDictionary.Add(sourceWordW1, nPmi);
        }
        nPmiMatrix.Add(queryWordW2, nPmiDictionary);
    }
    var queryWordListForTss = queryText.ToList();
    int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        var sourceWords = sourceFileWithWords.Value.ToList();
        double sumQueryTimeIdf = 0.0;
        double sumQueryIdf = 0.0;
        // Query -> source direction: best similarity per query word, IDF weighted.
        foreach (var queryWord in queryWordListForTss)
        {
            double maxSim = -1;
            foreach (var sourceWord in sourceWords)
            {
                double currentnPmi = nPmiMatrix[queryWord][sourceWord];
                if (maxSim < currentnPmi)
                {
                    maxSim = currentnPmi;
                }
            }
            // if term does not occur in any corpus then its only in use case hence -1
            double idf = 0;
            if (WordAndContainingFiles.ContainsKey(queryWord))
            {
                idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
            }
            sumQueryIdf += idf;
            sumQueryTimeIdf += (maxSim * idf);
        }
        // Source -> query direction, symmetric to the loop above.
        double sumCorpusTimeIdf = 0.0;
        double sumCorpusIdf = 0.0;
        foreach (string sourceWord in sourceWords)
        {
            double maxSim = -1;
            foreach (string useCaseWord in queryWordListForTss)
            {
                double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                if (maxSim < currentNPmi)
                {
                    maxSim = currentNPmi;
                }
            }
            // sourceWord has to be in IdfDictionary
            double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);
            sumCorpusIdf += idf;
            sumCorpusTimeIdf += (maxSim * idf);
        }
        // TSS = average of the two directional scores; -1 when either side has no IDF mass.
        double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
        tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
    }
    // WRITE TO FILE
    WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
}
/// <summary>
/// Precomputes and caches the nPMI row for every distinct query word: each
/// word's dictionary (entries with nPMI > -1 only) is serialized as JSON to
/// "_{word}.txt" under cacheOutputFolderPath. Words already cached are skipped.
/// </summary>
/// <param name="queryTextList">Tokenized query words (duplicates allowed).</param>
/// <param name="cacheOutputFolderPath">Folder the per-word cache files are written to.</param>
public void Execute(List <string> queryTextList, string cacheOutputFolderPath)
{
    ExecuteBase();
    if (!Directory.Exists(cacheOutputFolderPath))
    {
        Directory.CreateDirectory(cacheOutputFolderPath);
        Thread.Sleep(100); // give the filesystem a moment before the first write
    }
    // NOTE(review): tssDocumentDictionary is never written to or read in this
    // method — appears to be dead code.
    var tssDocumentDictionary = new MyDoubleDictionary();
    // Create list of word contained in query
    int n = CodeFilesWithContent.Count; // total number of source documents
    // Compute pmi for each word in WordAndContainingFiles and unique words in query
    int counter = 0;
    foreach (var queryText in queryTextList.Distinct())
    {
        var queryWordW2 = queryText;
        Log($"{_appendLog} Creating {++counter} of {queryTextList.Count}: {queryWordW2}");
        var outputFilePath = Path.Combine(cacheOutputFolderPath, $"_{queryWordW2}.txt");
        // Resume support: skip words already cached.
        if (File.Exists(outputFilePath))
        {
            continue;
        }
        var nPmiDictionary = new MyDoubleDictionary();
        foreach (var sourceText in WordAndContainingFiles.Keys)
        {
            var sourceWordW1 = sourceText;
            // Whether the corpus contains the query word at all (constant per inner loop).
            bool sourceContainsQueryWord = WordAndContainingFiles.ContainsKey(queryWordW2);
            int countW1 = WordAndContainingFiles[sourceWordW1].Count;
            int countW2 = sourceContainsQueryWord ? WordAndContainingFiles[queryWordW2].Count : 0;
            int countW1W2 = sourceContainsQueryWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;
            double nPmi;
            if (countW1W2 == 0)
            {
                nPmi = -1; // never co-occur
            }
            else if (countW1 == countW1W2 && countW2 == countW1W2)
            {
                nPmi = 1; // always co-occur
            }
            else
            {
                // Normalized PMI.
                nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n)) - 1;
            }
            nPmiDictionary.Add(sourceWordW1, nPmi);
        }
        // Keep only informative entries; values serialized as "#.000" strings.
        var print = nPmiDictionary.Where(x => x.Value > -1).ToDictionary(x => x.Key, x => x.Value.ToString("#.000"));
        File.WriteAllText(outputFilePath, Newtonsoft.Json.JsonConvert.SerializeObject(print));
    }
}
/// <summary>
/// Folds a word-to-word similarity matrix into one TSS score per source file:
/// the average of the IDF-weighted best-match scores in both directions
/// (requirement -> source and source -> requirement).
/// </summary>
/// <param name="reqFileText">Tokenized requirement words (duplicates allowed).</param>
/// <param name="simMatrix">Requirement word -> (source word -> similarity).</param>
/// <param name="noMatch">Similarity GetSim should report for pairs absent from the matrix.</param>
/// <returns>Source file name -> TSS score.</returns>
private static MyDoubleDictionary GetTssAltered(List <string> reqFileText, DocumentDictionaryAny <MyDoubleDictionary> simMatrix, double noMatch)
{
    MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();
    // NOTE(review): reqTfDictionary is populated but never read below — dead code?
    Dictionary <string, double> reqTfDictionary = new Dictionary <string, double>();
    reqFileText.ForEach(reqWord => {
        if (!reqTfDictionary.ContainsKey(reqWord))
        {
            reqTfDictionary.Add(reqWord, 0);
        }
        reqTfDictionary[reqWord] = reqTfDictionary[reqWord] + 1;
    });
    List <string> reqWordListForTss = reqFileText.ToList();
    int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        List <string> sourceWords = sourceFileWithWords.Value.ToList();
        double sumReqTimeIdf = 0.0;
        double sumReqIdf = 0.0;
        // Requirement -> source direction: best similarity per requirement word, IDF weighted.
        foreach (var reqWord in reqWordListForTss)
        {
            double maxSim = -1;
            foreach (var sourceWord in sourceWords)
            {
                double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                if (maxSim < currentSim)
                {
                    maxSim = currentSim;
                }
            }
            // if term does not occur in any source then its only in use case hence 1
            double idf = 0;
            if (WordAndContainingFiles.ContainsKey(reqWord))
            {
                idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[reqWord].Count);
            }
            sumReqIdf += idf;
            sumReqTimeIdf += (maxSim * idf);
        }
        // Source -> requirement direction, symmetric to the loop above.
        double sumSourceTimeIdf = 0.0;
        double sumSourceIdf = 0.0;
        foreach (string sourceWord in sourceWords)
        {
            double maxSim = -1;
            foreach (string reqWord in reqWordListForTss)
            {
                double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                if (maxSim < currentSim)
                {
                    maxSim = currentSim;
                }
            }
            // sourceWord has to be in IdfDictionary
            double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);
            sumSourceTimeIdf += (maxSim * idf);
            sumSourceIdf += idf;
        }
        // TSS = average of the two directional scores.
        double tss = (1.0 / 2) * ((sumReqTimeIdf / sumReqIdf) + (sumSourceTimeIdf / sumSourceIdf));
        tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
    }
    return(tssDocumentDictionary);
}
/// <summary>
/// Euclidean (L2) norm of a sparse word-weight vector.
/// </summary>
/// <param name="vector">Word -> weight vector.</param>
/// <returns>sqrt(sum of squared weights); 0 for an empty vector.</returns>
public static double GetLength(MyDoubleDictionary vector)
{
    double sumOfSquares = vector.Sum(x => Math.Pow(x.Value, 2));
    return Math.Sqrt(sumOfSquares);
}
/// <summary>
/// Computes the PMI-based ranking: builds a normalized-PMI matrix between
/// every distinct query word and every corpus word, folds it into one
/// IDF-weighted TSS score per source file and writes the descending result
/// to simOutputFolderPath + PmiFileName.
/// </summary>
/// <param name="simOutputFolderPath">Folder the result file is written to.</param>
/// <param name="bugName">Bug name, used only for status logging.</param>
/// <param name="fileText">Tokenized query words.</param>
public static void ComputePmiSim(string simOutputFolderPath, string bugName, List <string> fileText)
{
    Utility.Status("Creating Pmi: " + bugName);
    MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();
    // Create list of word contained in query (each word unique).
    List <string> distinctQueryWordList = fileText.Distinct().ToList(); // DISTINCT HERE but since its calculating PMI done remove it
    // Word co-occurrence matrix: query word -> (source word -> nPMI).
    DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
    // Number of source files.
    int n = CodeFilesWithContent.Count;
    // Compute the PMI of every query word W2 against every corpus word W1.
    foreach (var queryWordW2 in distinctQueryWordList)
    {
        MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();
        // For each word W1 of the source corpus.
        foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
        {
            // Whether the corpus contains the query word W2 at all.
            bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(queryWordW2);
            // Document counts C(W1), C(W2), C(W1,W2).
            int countW1 = WordAndContainingFiles[sourceWordW1].Count;
            int countW2 = sourceContainsUseCaseWord ? WordAndContainingFiles[queryWordW2].Count : 0;
            // if query contains source then add 1 (query contains usecase word + source word
            // if source contains query word find the intersection of files containing both words
            int countW1W2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;
            // Normalized PMI; d1 and d2 are never 0, d1d2 can be.
            double nPmi;
            // Never co-occur: nPMI = -1.
            if (countW1W2 == 0)
            {
                nPmi = -1;
            }
            else
            {
                // Always co-occur: nPMI = 1.
                if (countW1 == countW1W2 && countW2 == countW1W2)
                {
                    nPmi = 1;
                }
                else
                {
                    nPmi = Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1;
                }
            }
            nPmiDictionary.Add(sourceWordW1, nPmi);
        }
        nPmiMatrix.Add(queryWordW2, nPmiDictionary);
    }
    //List<string> distinctUseCaseWordListForTss = fileText.Distinct().ToList(); //DISTINCT HERE
    List <string> distinctQueryWordListForTss = fileText.ToList(); //DISTINCT HERE
    // Total number of documents in the corpus.
    int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
    // Once the PMI is created, compute the per-file similarity.
    foreach (var sourceFileWithWords in CodeFilesWithContent)
    {
        //List<string> distinctSourceWords = sourceFileWithWords.Value.Distinct().ToList(); //DISTINCT HERE
        // NOTE(review): despite the name this list is NOT distinct — original
        // author's comment suggested it probably should be; confirm intent.
        List <string> distinctSourceWords = sourceFileWithWords.Value.ToList(); //DISTINCT HERE
        double sumQueryTimeIdf = 0.0;
        double sumQueryIdf = 0.0;
        foreach (var queryWord in distinctQueryWordListForTss)
        {
            // Best similarity of this query word to any word of the file.
            double maxSim = -1;
            foreach (var sourceWord in distinctSourceWords)
            {
                double currentnPmi = nPmiMatrix[queryWord][sourceWord];
                if (maxSim < currentnPmi)
                {
                    maxSim = currentnPmi;
                }
            }
            // if term does not occur in any corpus then its only in use case hence 1
            double idf = 0;
            if (WordAndContainingFiles.ContainsKey(queryWord))
            {
                idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
            }
            sumQueryTimeIdf += (maxSim * idf);
            sumQueryIdf += idf;
        }
        // Source -> query direction, symmetric to the loop above.
        double sumCorpusTimeIdf = 0.0;
        double sumCorpusIdf = 0.0;
        foreach (string sourceWord in distinctSourceWords)
        {
            double maxSim = -1;
            foreach (string useCaseWord in distinctQueryWordListForTss)
            {
                double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                if (maxSim < currentNPmi)
                {
                    maxSim = currentNPmi;
                }
            }
            // sourceWord has to be in IdfDictionary
            double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);
            sumCorpusTimeIdf += (maxSim * idf);
            sumCorpusIdf += idf;
        }
        // TSS = average of the two directional scores.
        double tss = (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
        tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
    }
    WriteDocumentVectorToFileOrderedDescending(simOutputFolderPath + PmiFileName, tssDocumentDictionary);
    Utility.Status("Completed Pmi: " + bugName);
}
/// <summary>
/// Writes a document vector to file in its current iteration order,
/// one "key value" line per entry.
/// </summary>
/// <param name="filePath">Destination file path.</param>
/// <param name="vector">Document scores to write.</param>
/// <param name="asInt">When true, values are formatted without a decimal part.</param>
public static void WriteDocumentVectorToFile(string filePath, MyDoubleDictionary vector, bool asInt = false)
{
    string pattern = asInt ? "##" : "##.00000";
    var lines = vector.Select(x => x.Key + " " + x.Value.ToString(pattern));
    File.WriteAllLines(filePath, lines);
}
/// <summary>
/// Stores the query vector so repeated GetSimilarity calls against many
/// document vectors can reuse it.
/// </summary>
/// <param name="vector1">Query word-weight vector.</param>
public CosineSimilarityCalculator(MyDoubleDictionary vector1) { _vector1 = vector1; }