Exemplo n.º 1
0
        public static void ComputeAPm(string pmiOutputFolderPath, string reqName, List <string> reqText)
        {
            Utility.Status("Creating Apm: " + reqName);

            // Create list of word contained in query
            List <string> distinctReqWordList = reqText.Distinct().ToList();
            DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
            int n = CodeFilesWithContent.Count;

            // Compute pmi for each word in WordAndContainingFiles and unique words in query
            foreach (var reqWordW2 in distinctReqWordList)
            {
                MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(reqWordW2);

                    int countW1 = WordAndContainingFiles[sourceWordW1].Count;
                    //double averageCountW1Files = _wordAndContainingFiles[sourceWordW1].Select(x => _codeFilesWithContent[x].Count).Average();
                    int countW2 = sourceContainsUseCaseWord ? WordAndContainingFiles[reqWordW2].Count : 0;
                    //double averageCountW2Files = sourceContainsUseCaseWord ? _wordAndContainingFiles[reqWordW2].Select(x => _codeFilesWithContent[x].Count).Average() : 0;

                    // if query contains source then add 1 (query contains usecase word + source word
                    // if source contains query word find the intersection of files containing both words
                    int countW1W2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[reqWordW2]).Count() : 0;
                    //double averageCountW1W2Files = sourceContainsUseCaseWord ? _wordAndContainingFiles[sourceWordW1].Intersect(_wordAndContainingFiles[reqWordW2]).Select(x => _codeFilesWithContent[x].Count).Average() : 0;

                    // d1 and d2 will never be 0, d1d2 however can be
                    double nPmi;
                    if (countW1W2 == 0)
                    {
                        // no cooccurence
                        nPmi = -1;
                    }
                    else
                    {
                        if (countW1 == countW1W2 && countW2 == countW1W2)
                        {
                            nPmi = 1;
                        }
                        else
                        {
                            nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1) * ((double)countW1W2 / CodeFilesWithContent.Count);
                        }
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }
                nPmiMatrix.Add(reqWordW2, nPmiDictionary);
            }

            MyDoubleDictionary tssDocumentDictionary = GetTssAltered(reqText, nPmiMatrix, -1);

            WriteDocumentVectorToFileOrderedDescending(pmiOutputFolderPath + APmFileName, tssDocumentDictionary);

            Utility.Status("Completed APm: " + reqName);
        }
Exemplo n.º 2
0
        /// <summary>
        /// 获取相似度
        /// </summary>
        /// <param name="w1"></param>
        /// <param name="w2"></param>
        /// <param name="matrix"></param>
        /// <param name="noMatch"></param>
        /// <returns></returns>
        private static double GetSim(string w1, string w2, DocumentDictionaryAny <MyDoubleDictionary> matrix, double noMatch)
        {
            if (matrix.ContainsKey(w1) && matrix[w1].ContainsKey(w2))
            {
                return(matrix[w1][w2]);
            }

            if (matrix.ContainsKey(w2) && matrix[w2].ContainsKey(w1))
            {
                return(matrix[w2][w1]);
            }

            return(noMatch);
        }
Exemplo n.º 3
0
        /// <summary>
        /// 计算 PMI 方法
        /// </summary>
        /// <param name="simOutputFolderPath"></param>
        /// <param name="bugName"></param>
        /// <param name="fileText"></param>
        public static void ComputePmiSim(string simOutputFolderPath, string bugName, List <string> fileText)
        {
            Utility.Status("Creating Pmi: " + bugName);

            MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();

            // 创建查询列表(每个单词唯一) Create list of word contained in query
            List <string> distinctQueryWordList = fileText.Distinct().ToList(); // DISTINCT HERE but since its calculating PMI done remove it
            // 单词共现矩阵
            DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
            // 源文件数目
            int n = CodeFilesWithContent.Count;

            // 为查询中的每个单词W2计算 带文件单词的 PMI值
            foreach (var queryWordW2 in distinctQueryWordList)
            {
                MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();
                // 对源码中的每个单词W1
                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    // 源码中是否包含查询W2
                    bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(queryWordW2);
                    // 包含 W1的数目C(W1), C(W2), C(W1,W2)
                    int countW1 = WordAndContainingFiles[sourceWordW1].Count;
                    int countW2 = sourceContainsUseCaseWord ? WordAndContainingFiles[queryWordW2].Count : 0;
                    // if query contains source then add 1 (query contains usecase word + source word
                    // if source contains query word find the intersection of files containing both words
                    int countW1W2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;
                    // 归一化的 PMI, d1 and d2 != 0, d1d2 可能
                    double nPmi;
                    // 从未共现, nPMI = -1
                    if (countW1W2 == 0)
                    {
                        nPmi = -1;
                    }
                    else
                    {
                        // 完全共现, nPMI = 1
                        if (countW1 == countW1W2 && countW2 == countW1W2)
                        {
                            nPmi = 1;
                        }
                        else
                        {
                            nPmi = Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1;
                        }
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }
                nPmiMatrix.Add(queryWordW2, nPmiDictionary);
            }

            //List<string> distinctUseCaseWordListForTss = fileText.Distinct().ToList(); //DISTINCT HERE
            List <string> distinctQueryWordListForTss = fileText.ToList(); //DISTINCT HERE
            // 源码中总文件数
            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            // Once the PMI is create compute Sim
            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                //List<string> distinctSourceWords = sourceFileWithWords.Value.Distinct().ToList(); //DISTINCT HERE
                //该处应该是Distinct
                List <string> distinctSourceWords = sourceFileWithWords.Value.ToList(); //DISTINCT HERE
                double        sumQueryTimeIdf     = 0.0;
                double        sumQueryIdf         = 0.0;

                foreach (var queryWord in distinctQueryWordListForTss)
                {
                    // 计算maxSim
                    double maxSim = -1;
                    foreach (var sourceWord in distinctSourceWords)
                    {
                        double currentnPmi = nPmiMatrix[queryWord][sourceWord];
                        if (maxSim < currentnPmi)
                        {
                            maxSim = currentnPmi;
                        }
                    }

                    // if term does not occur in any corpus then its only in use case hence 1
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }
                    sumQueryTimeIdf += (maxSim * idf);
                    sumQueryIdf     += idf;
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                foreach (string sourceWord in distinctSourceWords)
                {
                    double maxSim = -1;
                    foreach (string useCaseWord in distinctQueryWordListForTss)
                    {
                        double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                        if (maxSim < currentNPmi)
                        {
                            maxSim = currentNPmi;
                        }
                    }

                    // sourceWord has to be in IdfDictionary
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusTimeIdf += (maxSim * idf);
                    sumCorpusIdf     += idf;
                }

                double tss = (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending(simOutputFolderPath + PmiFileName, tssDocumentDictionary);

            Utility.Status("Completed Pmi: " + bugName);
        }
Exemplo n.º 4
0
        private static MyDoubleDictionary GetTssAltered(List <string> reqFileText, DocumentDictionaryAny <MyDoubleDictionary> simMatrix, double noMatch)
        {
            MyDoubleDictionary          tssDocumentDictionary = new MyDoubleDictionary();
            Dictionary <string, double> reqTfDictionary       = new Dictionary <string, double>();

            reqFileText.ForEach(reqWord =>
            {
                if (!reqTfDictionary.ContainsKey(reqWord))
                {
                    reqTfDictionary.Add(reqWord, 0);
                }
                reqTfDictionary[reqWord] = reqTfDictionary[reqWord] + 1;
            });

            List <string> reqWordListForTss             = reqFileText.ToList();
            int           totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                List <string> sourceWords   = sourceFileWithWords.Value.ToList();
                double        sumReqTimeIdf = 0.0;
                double        sumReqIdf     = 0.0;

                foreach (var reqWord in reqWordListForTss)
                {
                    double maxSim = -1;
                    foreach (var sourceWord in sourceWords)
                    {
                        double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                        if (maxSim < currentSim)
                        {
                            maxSim = currentSim;
                        }
                    }

                    // if term does not occur in any source then its only in use case hence 1
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(reqWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[reqWord].Count);
                    }

                    sumReqIdf     += idf;
                    sumReqTimeIdf += (maxSim * idf);
                }

                double sumSourceTimeIdf = 0.0;
                double sumSourceIdf     = 0.0;

                foreach (string sourceWord in sourceWords)
                {
                    double maxSim = -1;
                    foreach (string reqWord in reqWordListForTss)
                    {
                        double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                        if (maxSim < currentSim)
                        {
                            maxSim = currentSim;
                        }
                    }

                    // sourceWord has to be in IdfDictionary
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumSourceTimeIdf += (maxSim * idf);
                    sumSourceIdf     += idf;
                }

                double tss = (1.0 / 2) * ((sumReqTimeIdf / sumReqIdf) + (sumSourceTimeIdf / sumSourceIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            return(tssDocumentDictionary);
        }
Exemplo n.º 5
0
        /// <summary>
        /// 计算 NDG 方法
        /// </summary>
        /// <param name="ngdOutputFolderPath"></param>
        /// <param name="bugName"></param>
        /// <param name="fileText"></param>
        public static void ComputeNgd(string ngdOutputFolderPath, string bugName, List <string> fileText)
        {
            Utility.Status("Creating NGD: " + bugName);

            MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();
            double             logD = Math.Log10(CodeFilesWithContent.Count + 1); // just make the N bigger than any

            // Create list of word contained in query
            List <string> distinctQueryWordList = fileText.Distinct().ToList(); // DISTINCT HERE but since its calculating NGD done remove it
            DocumentDictionaryAny <MyDoubleDictionary> ngdMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();

            foreach (var queryWordW2 in distinctQueryWordList)
            {
                MyDoubleDictionary ngdDictionary = new MyDoubleDictionary();

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(queryWordW2);

                    int countD1 = WordAndContainingFiles[sourceWordW1].Count;
                    // number of file containing W1 + if query also contains the word
                    int countD2 = sourceContainsUseCaseWord ? WordAndContainingFiles[queryWordW2].Count : 0;
                    // if query contains source then add 1 (query contains usecase word + source word
                    // if source contains query word find the intersection of files containing both words
                    int countD1D2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;

                    // d1 and d2 will never be 0, d1d2 however can be
                    double ngd = (countD1D2 == 0) ? 0 : ComputenNgd(countD1, countD2, countD1D2, logD);

                    ngdDictionary.Add(sourceWordW1, ngd);
                }
                ngdMatrix.Add(queryWordW2, ngdDictionary);
            }

            // TSS评判
            //List<string> distinctUseCaseWordListForTss = fileText.Distinct().ToList(); //DISTINCT HERE
            List <string> distinctQueryWordListForTss   = fileText.ToList(); //DISTINCT HERE
            int           totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                //List<string> distinctSourceWords = sourceFileWithWords.Value.Distinct().ToList(); //DISTINCT HERE
                List <string> distinctSourceWords = sourceFileWithWords.Value.ToList(); //DISTINCT HERE
                double        sumQueryTimeIdf     = 0.0;
                double        sumQueryIdf         = 0.0;

                foreach (var queryWord in distinctQueryWordListForTss)
                {
                    double maxSim = -1;
                    foreach (var sourceWord in distinctSourceWords)
                    {
                        double currentNgd = ngdMatrix[queryWord][sourceWord];
                        if (maxSim < currentNgd)
                        {
                            maxSim = currentNgd;
                        }
                    }

                    // if term does not occur in any corpus then its only in use case hence 1
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }
                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                foreach (string sourceWord in distinctSourceWords)
                {
                    double maxSim = -1;
                    foreach (string queryWord in distinctQueryWordListForTss)
                    {
                        double currentNgd = ngdMatrix[queryWord][sourceWord];
                        if (maxSim < currentNgd)
                        {
                            maxSim = currentNgd;
                        }
                    }

                    // sourceWord has to be in IdfDictionary
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                double tss = (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending(ngdOutputFolderPath + NgdFileName, tssDocumentDictionary);

            Utility.Status("Completed NGD: " + bugName);
        }