コード例 #1
0
        /// <summary>
        /// Builds a term-frequency dictionary for the query: each occurrence of a
        /// word in <paramref name="queryText"/> bumps that word's count by one.
        /// </summary>
        /// <param name="queryText">Query words (may contain duplicates).</param>
        /// <returns>Word-to-count dictionary for the query.</returns>
        public MyDoubleDictionary GetQueryIntDictionary(List <string> queryText)
        {
            var termCounts = new MyDoubleDictionary();

            foreach (var word in queryText)
            {
                termCounts.IncreaseCount(word);
            }

            return termCounts;
        }
コード例 #2
0
        /// <summary>
        /// Converts the query's raw term counts into TF-IDF weights, ranks every
        /// source file by cosine similarity against the query vector, and writes
        /// the ranking to "Vsm" + appendTextToFileName.
        /// </summary>
        /// <param name="queryTfIdfDictionary">Raw term counts for the query; mutated in place into TF-IDF weights.</param>
        /// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
        private void ExecuteSub(MyDoubleDictionary queryTfIdfDictionary, string appendTextToFileName = "")
        {
            // Highest raw term frequency in the query, used to normalize TF.
            double maxFrequency = queryTfIdfDictionary.Max(x => x.Value);

            // Convert each raw count to (tf / maxTf) * idf; words unknown to the corpus get 0.
            // ToList() snapshots the entries so we can mutate the dictionary while iterating.
            // TryGetValue replaces the original ContainsKey + indexer double lookup.
            foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
            {
                double idf;
                queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.TryGetValue(queryWordWithTf.Key, out idf)
                    ? (queryWordWithTf.Value / maxFrequency) * idf
                    : 0;
            }

            // Cosine similarity of the query against every source file's TF-IDF vector.
            var similarityDictionary = new MyDoubleDictionary();

            foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
            {
                double cosineSimilarityWithUseCase = Helper.GetSimilarity(queryTfIdfDictionary, codeFileWithTfIdfDictionary.Value);
                similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
            }

            // Persist the ranking, highest similarity first.
            WriteDocumentVectorToFileOrderedDescending("Vsm" + appendTextToFileName, similarityDictionary);
        }
コード例 #3
0
ファイル: Jsm.cs プロジェクト: skhatiwada01/ASE192
        /// <summary>
        /// Ranks every source file against the query using Jensen-Shannon similarity
        /// and writes the ranking to "Jsm" + appendTextToFileName.
        /// </summary>
        /// <param name="queryTfDictionary">Raw term counts for the query.</param>
        /// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
        private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
        {
            // Vocabulary: every distinct word appearing in the source corpus or the query.
            var allUniqueWordsInSourceAndQuery      = IdfDictionary.Keys.Union(queryTfDictionary.Keys).Distinct().ToList();
            int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;

            // One probability vector per source file, indexed by vocabulary position.
            var sourceVectors = new Dictionary <string, double[]>();

            TfDictionary.ToList().ForEach(fileWithTfCount =>
            {
                MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
                int totalWordsInFile            = CodeFilesWithContent[fileWithTfCount.Key].Count;

                // vector[i] = f(word_i, file) / totalWordsInFile; 0 for words absent from the file.
                double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
                int counter     = 0;
                allUniqueWordsInSourceAndQuery.ForEach(uniqueWord =>
                {
                    vector[counter] = tfDictionary.ContainsKey(uniqueWord)
                        ? tfDictionary[uniqueWord] / totalWordsInFile
                        : 0;
                    counter++;
                });

                sourceVectors.Add(fileWithTfCount.Key, vector);
            });

            // Query probability vector over the same vocabulary ordering.
            double[] queryVector         = new double[allUniqueWordsInSourceAndQueryCount];
            int      queryCounter        = 0;
            var      queryHashSet        = new HashSet <string>(queryTfDictionary.Keys); // O(1) membership tests
            var      totalQueryWordCount = queryTfDictionary.Sum(x => x.Value);

            allUniqueWordsInSourceAndQuery.ForEach(uniqueWord =>
            {
                queryVector[queryCounter] = queryHashSet.Contains(uniqueWord)
                    ? (double)queryTfDictionary[uniqueWord] / totalQueryWordCount
                    : 0;
                queryCounter++;
            });

            // Jensen-Shannon: divergence = H(p JensenSum q) - H(p)/2 - H(q)/2; similarity = 1 - divergence.
            MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();

            sourceVectors.ToList().ForEach(sourceFileWithVector =>
            {
                var p          = sourceFileWithVector.Value;
                var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
                var pEntropy   = 1.0 / 2 * p.JensenEntropy();
                var qEntropy   = 1.0 / 2 * queryVector.JensenEntropy();

                var jensenDivergence = sumEntropy - pEntropy - qEntropy;
                var jensenSimilarity = 1 - jensenDivergence;

                similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
            });

            // Persist the ranking, highest similarity first.
            WriteDocumentVectorToFileOrderedDescending("Jsm" + appendTextToFileName, similarityDictionary);
        }
コード例 #4
0
        /// <summary>
        /// Computes the cosine similarity between two sparse word-weight vectors.
        /// </summary>
        /// <param name="vector1">First vector (word to weight).</param>
        /// <param name="vector2">Second vector (word to weight).</param>
        /// <returns>Cosine similarity in [0, 1] for non-negative weights; 0 when either vector has zero magnitude.</returns>
        public static double GetSimilarity(MyDoubleDictionary vector1, MyDoubleDictionary vector2)
        {
            double length1 = GetLength(vector1);
            double length2 = GetLength(vector2);

            // Guard BOTH magnitudes: the original only checked vector2.Count == 0,
            // which still divided by zero (yielding NaN) when vector1 was empty or
            // either vector had zero length.
            if (length1 == 0 || length2 == 0)
            {
                return 0;
            }

            // Dot product over the keys the two vectors share.
            double dotProduct = vector1.Where(wordWithCount => vector2.ContainsKey(wordWithCount.Key)).Sum(wordWithCount => (wordWithCount.Value * vector2[wordWithCount.Key]));

            return dotProduct / (length1 * length2);
        }
コード例 #5
0
        /// <summary>
        /// Computes normalized PMI between every source word and every distinct
        /// query word, folds the matrix into one TSS score per document via
        /// GetTssAltered, and writes the ranking to pmiOutputFolderPath + APmFileName.
        /// </summary>
        /// <param name="pmiOutputFolderPath">Output folder for the result file.</param>
        /// <param name="reqName">Requirement name, used for status logging only.</param>
        /// <param name="reqText">Requirement/query text as a word list (may contain duplicates).</param>
        public static void ComputeAPm(string pmiOutputFolderPath, string reqName, List <string> reqText)
        {
            Utility.Status("Creating Apm: " + reqName);

            // Distinct words contained in the query.
            List <string> distinctReqWordList = reqText.Distinct().ToList();
            DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
            int n = CodeFilesWithContent.Count; // total number of documents

            // Compute nPMI for each (source word, distinct query word) pair.
            foreach (var reqWordW2 in distinctReqWordList)
            {
                MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();

                // These depend only on the query word, so compute them once per
                // query word instead of once per (source word, query word) pair
                // as the original inner loop did.
                bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(reqWordW2);
                int  countW2 = sourceContainsUseCaseWord ? WordAndContainingFiles[reqWordW2].Count : 0;

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    int countW1 = WordAndContainingFiles[sourceWordW1].Count;

                    // Number of files containing both words (0 when the query word
                    // never occurs in the source corpus at all).
                    int countW1W2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[reqWordW2]).Count() : 0;

                    // countW1 and countW2 are never 0 here when countW1W2 > 0; countW1W2 can be 0.
                    double nPmi;
                    if (countW1W2 == 0)
                    {
                        // No co-occurrence: normalized PMI bottoms out at -1.
                        nPmi = -1;
                    }
                    else if (countW1 == countW1W2 && countW2 == countW1W2)
                    {
                        // Perfect co-occurrence: the words always appear together.
                        nPmi = 1;
                    }
                    else
                    {
                        // nPMI = (log(P(w1)P(w2)) / log(P(w1,w2)) - 1), scaled by P(w1,w2).
                        // (n == CodeFilesWithContent.Count, so the scaling factor is unchanged.)
                        nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1) * ((double)countW1W2 / n);
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }
                nPmiMatrix.Add(reqWordW2, nPmiDictionary);
            }

            // Fold the word-to-word similarities into one score per document.
            MyDoubleDictionary tssDocumentDictionary = GetTssAltered(reqText, nPmiMatrix, -1);

            WriteDocumentVectorToFileOrderedDescending(pmiOutputFolderPath + APmFileName, tssDocumentDictionary);

            Utility.Status("Completed APm: " + reqName);
        }
コード例 #6
0
        /// <summary>
        /// Initializes the VSM / PMI / LSI methods:
        /// builds the per-file TF, corpus IDF, and per-file TF-IDF dictionaries.
        /// </summary>
        public static void InitializeForVsmSimLsi()
        {
            // Compute TF and DF (document frequency) in one pass over the corpus.
            foreach (var fileAndItsWords in CodeFilesWithContent)
            {
                // Term-frequency dictionary for this file.
                MyDoubleDictionary fileTfDictionary = new MyDoubleDictionary();

                // Count each word occurrence in this source file.
                foreach (string word in fileAndItsWords.Value)
                {
                    fileTfDictionary.Add(word);
                }

                // Save the term frequencies for this file.
                TfDictionary.Add(fileAndItsWords.Key, fileTfDictionary);

                // For every DISTINCT word seen in this file, bump its document count.
                // At this point IdfDictionary actually holds document frequency (DF).
                foreach (var wordAndItsCount in fileTfDictionary)
                {
                    IdfDictionary.Add(wordAndItsCount.Key);
                }
            }

            // Convert DF to IDF: IDF = log10(T / DF), where T is the total
            // number of documents.
            int totalNumberOfDocuments = CodeFilesWithContent.Count;

            foreach (var wordAndItsDocumentCount in IdfDictionary.ToList()) // ToList() snapshot so the dictionary can be mutated
            {
                IdfDictionary[wordAndItsDocumentCount.Key] = Math.Log10(totalNumberOfDocuments / wordAndItsDocumentCount.Value);
            }

            // Compute TF-IDF for each file.
            foreach (var sourceFileWithTfDictionary in TfDictionary)
            {
                // TF-IDF vector for a single source file.
                MyDoubleDictionary fileTfIdfDictionary = new MyDoubleDictionary();
                foreach (var wordWithTfCount in sourceFileWithTfDictionary.Value)
                {
                    fileTfIdfDictionary.Add(wordWithTfCount.Key, wordWithTfCount.Value * IdfDictionary[wordWithTfCount.Key]);
                }
                TfIdfDictionary.Add(sourceFileWithTfDictionary.Key, fileTfIdfDictionary);
            }
        }
コード例 #7
0
        /// <summary>
        /// Builds a human-readable report of the words shared by _vector1 and
        /// <paramref name="vector2"/>: one line per shared word with its weight
        /// in each vector.
        /// </summary>
        /// <param name="vector2">Vector to compare against _vector1.</param>
        /// <returns>Newline-separated "word weight1, weight2" lines.</returns>
        public string GetSimilarityText(MyDoubleDictionary vector2)
        {
            double length1 = GetLength(_vector1);
            double length2 = GetLength(vector2);

            var sharedWordLines = new List<string>();

            foreach (var wordWithCount in _vector1)
            {
                if (!vector2.ContainsKey(wordWithCount.Key))
                {
                    continue;
                }

                double value1 = wordWithCount.Value;
                double value2 = vector2[wordWithCount.Key];

                sharedWordLines.Add($"{wordWithCount.Key} {value1.ToString("##.000")}, {value2.ToString("##.000")}");
            }

            return string.Join(Environment.NewLine, sharedWordLines);
        }
コード例 #8
0
        /// <summary>
        /// Computes VSM (vector space model) similarity of the query against every
        /// source file and writes the ranking to outputFolderPath + VsmFileName.
        /// </summary>
        /// <param name="outputFolderPath">Output folder, e.g. Project\001\Results\.</param>
        /// <param name="bugName">Bug name, used for status logging only.</param>
        /// <param name="queryText">Query text as a word list (may contain duplicates).</param>
        public static void ComputeVsm(string outputFolderPath, string bugName, List <string> queryText)
        {
            Utility.Status("Creating VSM: " + bugName);

            // Build the raw term-frequency dictionary for the query.
            MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();

            queryText.ForEach(queryTfIdfDictionary.Add);

            // Highest raw term frequency in the query, used to normalize TF.
            double maxFrequency = queryTfIdfDictionary.Max(x => x.Value);

            // Convert raw counts to (tf / maxTf) * idf; words missing from the corpus get 0.
            // ToList() snapshots the entries so the dictionary can be mutated in the loop.
            foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
            {
                queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key)
                    ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key]
                    : 0;
            }

            // Cosine similarity of the query against each source file's TF-IDF vector.
            MyDoubleDictionary         similarityDictionary       = new MyDoubleDictionary();
            CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);

            foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
            {
                double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
                similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
            }

            // Write the ranking, highest first, to e.g. Project\001\Results\Vsm.txt.
            WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);

            Utility.Status("Completed VSM: " + bugName);
        }
コード例 #9
0
        /// <summary>
        /// Computes Jensen-Shannon similarity of the query against every source
        /// file and writes the ranking to outputFolderPath + JenFileName.
        /// </summary>
        /// <param name="outputFolderPath">Output folder (one per bug).</param>
        /// <param name="bugName">Bug name, used for status logging only.</param>
        /// <param name="queryText">Query text as a word list (may contain duplicates).</param>
        public static void ComputeJen(string outputFolderPath, string bugName, List <string> queryText)
        {
            Utility.Status("Computing JEN: " + bugName);

            // Vocabulary: every distinct word appearing in the source corpus or the query.
            List <string> allUniqueWordsInSourceAndQuery = IdfDictionary.Keys.Union(queryText).Distinct().ToList();
            int allUniqueWordsInSourceAndQueryCount = allUniqueWordsInSourceAndQuery.Count;

            // One probability vector per source file: Pd(w) = f(w, d) / |d|.
            Dictionary <string, double[]> sourceVectors = new Dictionary <string, double[]>();

            TfDictionary.ToList().ForEach(fileWithTfCount =>
            {
                MyDoubleDictionary tfDictionary = fileWithTfCount.Value;
                // Total word count of this source file.
                int totalWordsInFile = CodeFilesWithContent[fileWithTfCount.Key].Count;
                // Probability vector over the vocabulary (0 for absent words).
                double[] vector = new double[allUniqueWordsInSourceAndQueryCount];
                int counter     = 0;
                allUniqueWordsInSourceAndQuery.ForEach(uniqueWord =>
                {
                    vector[counter] = tfDictionary.ContainsKey(uniqueWord)
                        ? tfDictionary[uniqueWord] / totalWordsInFile
                        : 0;
                    counter++;
                });

                sourceVectors.Add(fileWithTfCount.Key, vector);
            });

            // Query probability vector. Pre-count the query terms once so the loop
            // below is O(vocabulary) instead of O(vocabulary * |queryText|): the
            // original called queryText.Contains(...) and queryText.Count(x => x == w)
            // for every vocabulary word. Same values, same ordinal string equality.
            var queryTermCounts = queryText.GroupBy(word => word)
                                  .ToDictionary(group => group.Key, group => group.Count());
            double[] queryVector  = new double[allUniqueWordsInSourceAndQueryCount];
            int      queryCounter = 0;

            allUniqueWordsInSourceAndQuery.ForEach(uniqueWord =>
            {
                int termCount;
                queryVector[queryCounter] = queryTermCounts.TryGetValue(uniqueWord, out termCount)
                    ? (double)termCount / queryText.Count
                    : 0;
                queryCounter++;
            });

            // Jensen-Shannon: divergence = H(p JensenSum q) - H(p)/2 - H(q)/2; similarity = 1 - divergence.
            MyDoubleDictionary similarityDictionary = new MyDoubleDictionary();

            sourceVectors.ToList().ForEach(sourceFileWithVector =>
            {
                var p          = sourceFileWithVector.Value;
                var sumEntropy = (p.JensenSum(queryVector)).JensenEntropy();
                var pEntropy   = 1.0 / 2 * p.JensenEntropy();
                var qEntropy   = 1.0 / 2 * queryVector.JensenEntropy();

                var jensenDivergence = sumEntropy - pEntropy - qEntropy;
                var jensenSimilarity = 1 - jensenDivergence;
                // File key -> Jensen similarity.
                similarityDictionary.Add(sourceFileWithVector.Key, jensenSimilarity);
            });

            // Write the ranking, highest first, to e.g. Project\001\Results\Jen.txt.
            WriteDocumentVectorToFileOrderedDescending(outputFolderPath + JenFileName, similarityDictionary);

            Utility.Status("DONE Computing JEN: " + bugName);
        }
コード例 #10
0
ファイル: Lsi.cs プロジェクト: skhatiwada01/ASE192
        /// <summary>
        /// Runs LSI: builds the term-document matrix, decomposes it with SVD,
        /// projects the query into each rank-k latent space, and writes one
        /// similarity ranking file per k.
        /// </summary>
        /// <param name="queryTfDictionary">Raw term counts for the query.</param>
        /// <param name="appendTextToFileName">Suffix appended to each output file name.</param>
        private void ExecuteSub(MyDoubleDictionary queryTfDictionary, string appendTextToFileName = "")
        {
            Log("SVD: " + LogAdd);

            // Matrix dimensions plus a stable index for every file and every term.
            int totalNumberOfSourceFiles                     = TfDictionary.Count;
            int totalDistinctTermsInAllSourceFiles           = IdfDictionary.Count;
            Dictionary <string, int> allSourceFilesWithIndex = TfDictionary.Keys.Select((x, index) => new { Name = x, Index = index }).ToDictionary(x => x.Name, x => x.Index);
            Dictionary <string, int> allSourceWordsWithIndex = IdfDictionary.Keys.Select((x, index) => new { Name = x, Index = index }).ToDictionary(x => x.Name, x => x.Index);

            double[,] sourceMatrix = new double[totalDistinctTermsInAllSourceFiles, totalNumberOfSourceFiles]; // row, col row is word col docs

            // Fill the term-document matrix with raw term frequencies.
            foreach (var fileNameWithTfDictionary in TfDictionary)
            {
                int fileIndex = allSourceFilesWithIndex[fileNameWithTfDictionary.Key];
                foreach (var fileWordWithTf in fileNameWithTfDictionary.Value)
                {
                    sourceMatrix[allSourceWordsWithIndex[fileWordWithTf.Key], fileIndex] = fileWordWithTf.Value;
                }
            }

            // create matrix
            Matrix generalMatrix = new Matrix(sourceMatrix);

            // singular value decomposition
            SVD svd = new SVD(generalMatrix);

            _uk          = new Dictionary <int, Matrix>();
            _sk          = new Dictionary <int, Matrix>();
            _vkTranspose = new Dictionary <int, Matrix>();

            // Keep rank-k truncations of U, S and V^T for every configured k that
            // does not exceed the number of singular values.
            LsiKs.Where(x => x <= svd.S.Cols).ToList().ForEach(k =>
            {
                Log("SVD " + k + ": " + LogAdd);
                _uk.Add(k, new Matrix(svd.U.ToArray(), svd.U.Rows, k));
                _sk.Add(k, new Matrix(svd.S.ToArray(), k, k));
                _vkTranspose.Add(k, new Matrix(svd.VH.ToArray(), k, svd.VH.Cols));
            });

            // Build the query's row vector over the source vocabulary; query words
            // unknown to the corpus are ignored.
            // NOTE(review): this adds 1 per distinct query word and ignores the
            // word's count in queryTfDictionary — confirm that is intended.
            double[,] queryMatrixTranspose = new double[1, totalDistinctTermsInAllSourceFiles];
            queryTfDictionary.Keys.ToList().ForEach(queryWord =>
            {
                if (allSourceWordsWithIndex.ContainsKey(queryWord))
                {
                    queryMatrixTranspose[0, allSourceWordsWithIndex[queryWord]] = queryMatrixTranspose[0, allSourceWordsWithIndex[queryWord]] + 1;
                }
            });

            var outputResultFolderPath = _outputLsiFolderPath + @"\Lsi\";

            if (!Directory.Exists(outputResultFolderPath))
            {
                Directory.CreateDirectory(outputResultFolderPath);
            }

            // Only compute the ks whose result file does not already exist.
            var ks = _uk.Keys.Where(x => !File.Exists(outputResultFolderPath + x + ".txt")).ToList();

            foreach (var k in ks)
            {
                var uk          = _uk[k];
                var sk          = _sk[k];
                var vkTranspose = _vkTranspose[k];

                // Project the query into the rank-k concept space: q' = q * Uk * Sk^-1.
                var q        = new Matrix(queryMatrixTranspose);
                var qv       = q * uk * sk.Inverse();
                var qDoubles = qv.RowVector(0).ToArray().ToList();

                // Similarity of the projected query to each document column of Vk^T.
                var similarityList       = allSourceFilesWithIndex.Select(doc => new KeyValuePair <string, double>(doc.Key, Helper.GetSimilarity(qDoubles, vkTranspose.ColVector(doc.Value).ToArray().ToList())));
                var similarityDictionary = similarityList.OrderByDescending(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
                WriteDocumentVectorToFileOrderedDescending(@"Lsi\Lsi_" + k + appendTextToFileName, similarityDictionary);
            }
        }
コード例 #11
0
ファイル: Method.cs プロジェクト: tigerqiu712/buglocalization
        /// <summary>
        /// Writes the vector to a file, one "key value" pair per line,
        /// ordered by value in descending order. Overwrites any existing file.
        /// </summary>
        /// <param name="filePath">Destination file path.</param>
        /// <param name="vector">Key/score pairs to write.</param>
        /// <param name="asInt">When true, scores are formatted without decimal places.</param>
        protected static void WriteDocumentVectorToFileOrderedDescending(string filePath, MyDoubleDictionary vector, bool asInt = false)
        {
            string pattern = asInt ? "##" : "##.00000";

            var orderedLines = vector.ToList()
                               .OrderByDescending(x => x.Value)
                               .Select(x => x.Key + " " + x.Value.ToString(pattern));

            File.WriteAllLines(filePath, orderedLines);
        }
コード例 #12
0
        /// <summary>
        /// Computes an NGD (Normalized Google Distance) based TSS score of the
        /// query against every source file and writes the ranking to
        /// ngdOutputFolderPath + NgdFileName.
        /// </summary>
        /// <param name="ngdOutputFolderPath">Output folder for the result file.</param>
        /// <param name="bugName">Bug name, used for status logging only.</param>
        /// <param name="fileText">Query text as a word list (may contain duplicates).</param>
        public static void ComputeNgd(string ngdOutputFolderPath, string bugName, List <string> fileText)
        {
            Utility.Status("Creating NGD: " + bugName);

            MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();
            double             logD = Math.Log10(CodeFilesWithContent.Count + 1); // just make the N bigger than any

            // NGD between every distinct query word and every source word.
            List <string> distinctQueryWordList = fileText.Distinct().ToList();
            DocumentDictionaryAny <MyDoubleDictionary> ngdMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();

            foreach (var queryWordW2 in distinctQueryWordList)
            {
                MyDoubleDictionary ngdDictionary = new MyDoubleDictionary();

                // These depend only on the query word, so compute them once per
                // query word instead of once per (source word, query word) pair
                // as the original inner loop did.
                bool sourceContainsUseCaseWord = WordAndContainingFiles.ContainsKey(queryWordW2);
                int  countD2 = sourceContainsUseCaseWord ? WordAndContainingFiles[queryWordW2].Count : 0;

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    int countD1 = WordAndContainingFiles[sourceWordW1].Count;

                    // Number of files containing both words (0 when the query word
                    // never occurs in the source corpus at all).
                    int countD1D2 = sourceContainsUseCaseWord ? WordAndContainingFiles[sourceWordW1].Intersect(WordAndContainingFiles[queryWordW2]).Count() : 0;

                    // d1 and d2 are never 0 here; d1d2 however can be.
                    double ngd = (countD1D2 == 0) ? 0 : ComputenNgd(countD1, countD2, countD1D2, logD);

                    ngdDictionary.Add(sourceWordW1, ngd);
                }
                ngdMatrix.Add(queryWordW2, ngdDictionary);
            }

            // TSS scoring: average of the idf-weighted best-match similarity in
            // both directions (query -> source and source -> query).
            // NOTE(review): despite the "distinct" names, these lists are NOT
            // deduplicated (the Distinct() calls were deliberately removed) —
            // duplicates weight repeated words more heavily.
            List <string> distinctQueryWordListForTss   = fileText.ToList();
            int           totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                List <string> distinctSourceWords = sourceFileWithWords.Value.ToList();
                double        sumQueryTimeIdf     = 0.0;
                double        sumQueryIdf         = 0.0;

                foreach (var queryWord in distinctQueryWordListForTss)
                {
                    // Best NGD similarity of this query word to any word in the file.
                    double maxSim = -1;
                    foreach (var sourceWord in distinctSourceWords)
                    {
                        double currentNgd = ngdMatrix[queryWord][sourceWord];
                        if (maxSim < currentNgd)
                        {
                            maxSim = currentNgd;
                        }
                    }

                    // If the term occurs in no corpus document it exists only in the
                    // query, so its idf contribution stays 0.
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }
                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                foreach (string sourceWord in distinctSourceWords)
                {
                    // Best NGD similarity of this source word to any query word.
                    double maxSim = -1;
                    foreach (string queryWord in distinctQueryWordListForTss)
                    {
                        double currentNgd = ngdMatrix[queryWord][sourceWord];
                        if (maxSim < currentNgd)
                        {
                            maxSim = currentNgd;
                        }
                    }

                    // sourceWord always comes from the corpus, so it is in WordAndContainingFiles.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                // Guard the 0/0 case (e.g. every word occurs in every document so
                // all idfs are 0): score -1 as the cached-PMI Execute method does,
                // instead of emitting NaN into the output file.
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0
                    ? -1
                    : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending(ngdOutputFolderPath + NgdFileName, tssDocumentDictionary);

            Utility.Status("Completed NGD: " + bugName);
        }
コード例 #13
0
ファイル: PmiUsingCache.cs プロジェクト: skhatiwada01/ASE192
        /// <summary>
        /// Scores every source file against the query using cached per-word nPMI
        /// dictionaries (one JSON file per distinct query word) and writes the
        /// TSS ranking to "Pmi" + appendTextToFileName.
        /// </summary>
        /// <param name="queryTexts">Query text as a word list (may contain duplicates).</param>
        /// <param name="pmiCacheFolderPath">Folder holding the cached "_{word}.txt" JSON files.</param>
        /// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
        public void Execute(List <string> queryTexts, string pmiCacheFolderPath, string appendTextToFileName)
        {
            var queryTextsDistinct    = queryTexts.Distinct().ToList();
            var tssDocumentDictionary = new MyDoubleDictionary();

            // Load the cached nPMI row for each distinct query word.
            // NOTE(review): File.ReadAllText throws if a cache file is missing —
            // confirm the cache is always pre-built for every query word.
            var nPmiMatrix = new Dictionary <string, Dictionary <string, double> >();

            foreach (var queryText in queryTextsDistinct)
            {
                var pmiDictionaryString = Newtonsoft.Json.JsonConvert.DeserializeObject <Dictionary <string, string> >(File.ReadAllText(Path.Combine(pmiCacheFolderPath, $"_{queryText}.txt")));
                var pmiDictionary       = pmiDictionaryString.ToDictionary(x => x.Key, x => double.Parse(x.Value));
                nPmiMatrix.Add(queryText, pmiDictionary);
            }

            // TSS scoring per source file: average of the idf-weighted best-match
            // similarity in both directions (query -> source and source -> query).
            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;
            int counter = 0;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                counter++;
                Log($"{_appendLog} Running PMI: {counter} of {CodeFilesWithContent.Count}");

                var    sourceWords     = sourceFileWithWords.Value.ToList();
                double sumQueryTimeIdf = 0.0;
                double sumQueryIdf     = 0.0;

                // Memoizes each query word's best similarity against this file's words,
                // since queryTexts may contain duplicates.
                var queryTextsDistinctMaxSimDictionary = new Dictionary <string, double>();

                foreach (var queryWord in queryTexts)
                {
                    if (!queryTextsDistinctMaxSimDictionary.ContainsKey(queryWord))
                    {
                        // Best nPMI of this query word against any source word; -1 when unknown.
                        double maxSimCurrent = sourceWords.AsParallel().Select(x => nPmiMatrix[queryWord].ContainsKey(x) ? nPmiMatrix[queryWord][x] : -1).MyMax(-1);
                        queryTextsDistinctMaxSimDictionary.Add(queryWord, maxSimCurrent);
                    }

                    double maxSim = queryTextsDistinctMaxSimDictionary[queryWord];

                    // If the term occurs in no corpus document it exists only in the
                    // query, so its idf contribution stays 0.
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }

                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                // Memoizes each source word's best similarity against the query words.
                var sourceWordsAsIntsMaxSimDictionary = new Dictionary <string, double>();
                foreach (var sourceWord in sourceWords)
                {
                    if (!sourceWordsAsIntsMaxSimDictionary.ContainsKey(sourceWord))
                    {
                        double maxSimCurrent = queryTextsDistinct.AsParallel().Select(x => nPmiMatrix[x].ContainsKey(sourceWord) ? nPmiMatrix[x][sourceWord] : -1).MyMax(-1);
                        sourceWordsAsIntsMaxSimDictionary[sourceWord] = maxSimCurrent;
                    }

                    var maxSim = sourceWordsAsIntsMaxSimDictionary[sourceWord];

                    // sourceWord always comes from the corpus, so it is in WordAndContainingFiles.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                // -1 when either idf sum is zero, avoiding NaN from 0/0.
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            // WRITE TO FILE
            WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
        }
コード例 #14
0
        /// <summary>
        /// Reads the preprocessed source dump and builds every corpus-level
        /// structure: per-file TF, corpus IDF, per-file TF-IDF, and the
        /// word-to-containing-files index.
        /// </summary>
        private void ProcessSourceCode()
        {
            // Read all files. Each line is "fileName##word1,word2,..."; words of
            // length <= 2 are discarded.
            CodeFilesWithContent = new Dictionary <string, List <string> >();
            foreach (var line in File.ReadAllLines(_sourceFilePath))
            {
                var      lineSplit = line.SplitWith("##");
                string[] text      = lineSplit[1].SplitWith(",").Where(x => x.Length > 2).ToArray();
                // NOTE(review): Take(text.Length / 50) keeps only the first ~2% of each
                // file's words (and nothing at all for files with fewer than 50 words) —
                // looks like a leftover sampling experiment; confirm intended.
                CodeFilesWithContent.Add(lineSplit[0], text.Take(text.Length / 50).ToList());
            }

            // compute tf and idf
            TfDictionary    = new Dictionary <string, MyDoubleDictionary>();
            IdfDictionary   = new MyDoubleDictionary();
            TfIdfDictionary = new Dictionary <string, MyDoubleDictionary>();
            foreach (var fileAndItsWords in CodeFilesWithContent)
            {
                var fileTfDictionary = new MyDoubleDictionary();

                // for each word in the file add 1 to the count
                foreach (string word in fileAndItsWords.Value)
                {
                    fileTfDictionary.IncreaseCount(word);
                }

                // save tf result for the file
                TfDictionary.Add(fileAndItsWords.Key, fileTfDictionary);

                // for each DISTINCT word found in the file increase the idf by 1. At this point idf holds document frequency
                foreach (var wordAndItsCount in fileTfDictionary)
                {
                    IdfDictionary.IncreaseCount(wordAndItsCount.Key);
                }
            }

            // change df to idf: IDF = log10(totalDocuments / DF)
            int totalNumberOfDocuments = CodeFilesWithContent.Count;

            foreach (var wordAndItsDocumentCount in IdfDictionary.ToList()) // to list so that we can change the dictionary
            {
                IdfDictionary[wordAndItsDocumentCount.Key] = Math.Log10(totalNumberOfDocuments / wordAndItsDocumentCount.Value);
            }

            // update tfidf for each file: tfidf(w) = tf(w) * idf(w)
            foreach (var sourceFileWithTfDictionary in TfDictionary)
            {
                var fileTfIdfDictionary = new MyDoubleDictionary();
                foreach (var wordWithTfCount in sourceFileWithTfDictionary.Value)
                {
                    fileTfIdfDictionary.Add(wordWithTfCount.Key, wordWithTfCount.Value * IdfDictionary[wordWithTfCount.Key]);
                }
                TfIdfDictionary.Add(sourceFileWithTfDictionary.Key, fileTfIdfDictionary);
            }

            // Inverted index: word -> list of files containing it (one entry per
            // file, since the file's words are Distinct()-ed first).
            WordAndContainingFiles = new Dictionary <string, List <string> >();
            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                sourceFileWithWords.Value.Distinct().ToList().ForEach(word =>
                {
                    if (!WordAndContainingFiles.ContainsKey(word))
                    {
                        WordAndContainingFiles.Add(word, new List <string>());
                    }
                    WordAndContainingFiles[word].Add(sourceFileWithWords.Key);
                });
            }
        }
コード例 #15
0
        /// <summary>
        /// Ranks every source file against the query using normalized-PMI-based
        /// text semantic similarity (TSS) and writes the scores, ordered
        /// descending, to a "Pmi" result file.
        /// </summary>
        /// <param name="queryText">Query words (may contain duplicates).</param>
        /// <param name="appendTextToFileName">Suffix appended to the output file name.</param>
        public void Execute(List <string> queryText, string appendTextToFileName)
        {
            ExecuteBase();

            var tssDocumentDictionary = new MyDoubleDictionary();

            // Distinct words are sufficient for building the PMI matrix; the
            // repeated query list is still used for the TSS aggregation below.
            var distinctQueryWordList = queryText.Distinct().ToList();
            var nPmiMatrix            = new MyAnyDictionary <MyDoubleDictionary>();
            int n = CodeFilesWithContent.Count; // total number of documents

            // Build the normalized-PMI matrix: one row per distinct query word,
            // one entry per word occurring anywhere in the source corpus.
            foreach (var queryWordW2 in distinctQueryWordList)
            {
                var nPmiDictionary = new MyDoubleDictionary();

                // Hoisted out of the inner loop: these depend only on the query
                // word and were previously recomputed for every source word.
                bool sourceContainsQueryWord = WordAndContainingFiles.TryGetValue(queryWordW2, out List <string> filesWithQueryWord);
                int  countW2 = sourceContainsQueryWord ? filesWithQueryWord.Count : 0;

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    List <string> filesWithSourceWord = WordAndContainingFiles[sourceWordW1];

                    // Document frequency of W1 and co-occurrence count of (W1, W2).
                    int countW1   = filesWithSourceWord.Count;
                    int countW1W2 = sourceContainsQueryWord ? filesWithSourceWord.Intersect(filesWithQueryWord).Count() : 0;

                    double nPmi;
                    if (countW1W2 == 0)
                    {
                        nPmi = -1; // never co-occur: minimum similarity
                    }
                    else if (countW1 == countW1W2 && countW2 == countW1W2)
                    {
                        nPmi = 1; // always co-occur: maximum similarity
                    }
                    else
                    {
                        // Normalized PMI: log(P(w1) * P(w2)) / log(P(w1, w2)) - 1, in [-1, 1).
                        nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n)) - 1;
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }
                nPmiMatrix.Add(queryWordW2, nPmiDictionary);
            }

            var queryWordListForTss           = queryText.ToList();
            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                var    sourceWords     = sourceFileWithWords.Value.ToList();
                double sumQueryTimeIdf = 0.0;
                double sumQueryIdf     = 0.0;

                // Query -> source direction: for each query word take its best
                // nPMI match in this file, weighted by the word's idf.
                foreach (var queryWord in queryWordListForTss)
                {
                    double maxSim = -1;
                    foreach (var sourceWord in sourceWords)
                    {
                        double currentnPmi = nPmiMatrix[queryWord][sourceWord];
                        if (maxSim < currentnPmi)
                        {
                            maxSim = currentnPmi;
                        }
                    }

                    // A query word absent from the whole corpus contributes 0 weight.
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }

                    sumQueryIdf     += idf;
                    sumQueryTimeIdf += (maxSim * idf);
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                // Source -> query direction: symmetric counterpart of the loop above.
                foreach (string sourceWord in sourceWords)
                {
                    double maxSim = -1;
                    foreach (string useCaseWord in queryWordListForTss)
                    {
                        double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                        if (maxSim < currentNPmi)
                        {
                            maxSim = currentNPmi;
                        }
                    }

                    // sourceWord always occurs in the corpus, so the lookup is safe.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusIdf     += idf;
                    sumCorpusTimeIdf += (maxSim * idf);
                }

                // Guard against division by zero when either idf sum is empty.
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            // WRITE TO FILE
            WriteDocumentVectorToFileOrderedDescending("Pmi" + appendTextToFileName, tssDocumentDictionary);
        }
コード例 #16
0
        /// <summary>
        /// Precomputes, per distinct query word, its normalized PMI against every
        /// corpus word and caches the result as one JSON file per word. Words
        /// whose cache file already exists are skipped.
        /// </summary>
        /// <param name="queryTextList">Query words (may contain duplicates).</param>
        /// <param name="cacheOutputFolderPath">Folder receiving one "_word.txt" cache file per query word.</param>
        public void Execute(List <string> queryTextList, string cacheOutputFolderPath)
        {
            ExecuteBase();

            if (!Directory.Exists(cacheOutputFolderPath))
            {
                Directory.CreateDirectory(cacheOutputFolderPath);
                Thread.Sleep(100); // brief pause so the new folder is ready before files are written
            }

            int n = CodeFilesWithContent.Count; // total number of documents

            // Materialize the distinct words once so the progress log reports the
            // true total (previously it logged the non-distinct list count).
            var distinctQueryWords = queryTextList.Distinct().ToList();
            int counter = 0;

            foreach (var queryWordW2 in distinctQueryWords)
            {
                Log($"{_appendLog} Creating {++counter} of {distinctQueryWords.Count}: {queryWordW2}");

                // NOTE(review): the word is used verbatim in the file name; a word
                // containing path-invalid characters would throw here — confirm the
                // tokenizer strips such characters upstream.
                var outputFilePath = Path.Combine(cacheOutputFolderPath, $"_{queryWordW2}.txt");
                if (File.Exists(outputFilePath))
                {
                    continue; // already cached
                }

                var nPmiDictionary = new MyDoubleDictionary();

                // Hoisted out of the inner loop: these depend only on the query
                // word and were previously recomputed for every source word.
                bool sourceContainsQueryWord = WordAndContainingFiles.TryGetValue(queryWordW2, out List <string> filesWithQueryWord);
                int  countW2 = sourceContainsQueryWord ? filesWithQueryWord.Count : 0;

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    List <string> filesWithSourceWord = WordAndContainingFiles[sourceWordW1];

                    // Document frequency of W1 and co-occurrence count of (W1, W2).
                    int countW1   = filesWithSourceWord.Count;
                    int countW1W2 = sourceContainsQueryWord ? filesWithSourceWord.Intersect(filesWithQueryWord).Count() : 0;

                    double nPmi;
                    if (countW1W2 == 0)
                    {
                        nPmi = -1; // never co-occur: minimum similarity
                    }
                    else if (countW1 == countW1W2 && countW2 == countW1W2)
                    {
                        nPmi = 1; // always co-occur: maximum similarity
                    }
                    else
                    {
                        // Normalized PMI: log(P(w1) * P(w2)) / log(P(w1, w2)) - 1, in [-1, 1).
                        nPmi = (Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n)) - 1;
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }

                // Persist only meaningful entries (nPmi > -1), formatted to three decimals.
                var print = nPmiDictionary.Where(x => x.Value > -1).ToDictionary(x => x.Key, x => x.Value.ToString("#.000"));
                File.WriteAllText(outputFilePath, Newtonsoft.Json.JsonConvert.SerializeObject(print));
            }
        }
コード例 #17
0
        /// <summary>
        /// Computes the text semantic similarity (TSS) of the requirement text
        /// against every source file, using a precomputed pairwise word-similarity matrix.
        /// </summary>
        /// <param name="reqFileText">Requirement words (may contain duplicates).</param>
        /// <param name="simMatrix">Pairwise word-similarity lookup (requirement word to source word).</param>
        /// <param name="noMatch">Similarity value used when a pair is missing from <paramref name="simMatrix"/>.</param>
        /// <returns>A dictionary mapping each source file to its TSS score.</returns>
        private static MyDoubleDictionary GetTssAltered(List <string> reqFileText, DocumentDictionaryAny <MyDoubleDictionary> simMatrix, double noMatch)
        {
            // NOTE: a requirement term-frequency dictionary used to be built here
            // but was never read; the dead computation has been removed.
            MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();

            List <string> reqWordListForTss             = reqFileText.ToList();
            int           totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                List <string> sourceWords   = sourceFileWithWords.Value.ToList();
                double        sumReqTimeIdf = 0.0;
                double        sumReqIdf     = 0.0;

                // Requirement -> source direction: best similarity per requirement
                // word within this file, weighted by the word's idf.
                foreach (var reqWord in reqWordListForTss)
                {
                    double maxSim = -1;
                    foreach (var sourceWord in sourceWords)
                    {
                        double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                        if (maxSim < currentSim)
                        {
                            maxSim = currentSim;
                        }
                    }

                    // A requirement word absent from the corpus contributes 0 weight.
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(reqWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[reqWord].Count);
                    }

                    sumReqIdf     += idf;
                    sumReqTimeIdf += (maxSim * idf);
                }

                double sumSourceTimeIdf = 0.0;
                double sumSourceIdf     = 0.0;

                // Source -> requirement direction: symmetric counterpart of the loop above.
                foreach (string sourceWord in sourceWords)
                {
                    double maxSim = -1;
                    foreach (string reqWord in reqWordListForTss)
                    {
                        double currentSim = GetSim(reqWord, sourceWord, simMatrix, noMatch);
                        if (maxSim < currentSim)
                        {
                            maxSim = currentSim;
                        }
                    }

                    // sourceWord always occurs in the corpus, so the lookup is safe.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumSourceTimeIdf += (maxSim * idf);
                    sumSourceIdf     += idf;
                }

                // Guard against division by zero (consistent with the other TSS
                // implementations, which report -1 when either idf sum is empty).
                double tss = sumReqIdf == 0 || sumSourceIdf == 0 ? -1 : (1.0 / 2) * ((sumReqTimeIdf / sumReqIdf) + (sumSourceTimeIdf / sumSourceIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            return(tssDocumentDictionary);
        }
コード例 #18
0
        /// <summary>
        /// Computes the Euclidean (L2) norm of a document vector, i.e. the
        /// square root of the sum of squared component values.
        /// </summary>
        /// <param name="vector">Document vector whose length is measured.</param>
        /// <returns>The L2 length of the vector (0 for an empty vector).</returns>
        public static double GetLength(MyDoubleDictionary vector)
        {
            double sumOfSquares = 0.0;
            foreach (var component in vector)
            {
                sumOfSquares += Math.Pow(component.Value, 2);
            }

            return Math.Sqrt(sumOfSquares);
        }
コード例 #19
0
        /// <summary>
        /// Computes normalized-PMI-based similarity (TSS) between the bug/query
        /// text and every source file, then writes the scores, ordered
        /// descending, to the PMI result file.
        /// </summary>
        /// <param name="simOutputFolderPath">Output folder/prefix for the result file.</param>
        /// <param name="bugName">Bug identifier, used only for status logging.</param>
        /// <param name="fileText">Query words (may contain duplicates).</param>
        public static void ComputePmiSim(string simOutputFolderPath, string bugName, List <string> fileText)
        {
            Utility.Status("Creating Pmi: " + bugName);

            MyDoubleDictionary tssDocumentDictionary = new MyDoubleDictionary();

            // Distinct words are sufficient for building the PMI matrix.
            List <string> distinctQueryWordList = fileText.Distinct().ToList();
            // Word co-occurrence matrix: query word -> source word -> normalized PMI.
            DocumentDictionaryAny <MyDoubleDictionary> nPmiMatrix = new DocumentDictionaryAny <MyDoubleDictionary>();
            // Total number of source files.
            int n = CodeFilesWithContent.Count;

            // For each query word W2, compute its nPMI against every corpus word W1.
            foreach (var queryWordW2 in distinctQueryWordList)
            {
                MyDoubleDictionary nPmiDictionary = new MyDoubleDictionary();

                // Hoisted out of the inner loop: these depend only on the query
                // word and were previously recomputed for every source word.
                bool sourceContainsUseCaseWord = WordAndContainingFiles.TryGetValue(queryWordW2, out List <string> filesWithQueryWord);
                int  countW2 = sourceContainsUseCaseWord ? filesWithQueryWord.Count : 0;

                foreach (var sourceWordW1 in WordAndContainingFiles.Keys)
                {
                    List <string> filesWithSourceWord = WordAndContainingFiles[sourceWordW1];

                    // Document frequencies C(W1), C(W2) and co-occurrence C(W1, W2)
                    // (number of files containing both words).
                    int countW1   = filesWithSourceWord.Count;
                    int countW1W2 = sourceContainsUseCaseWord ? filesWithSourceWord.Intersect(filesWithQueryWord).Count() : 0;

                    double nPmi;
                    if (countW1W2 == 0)
                    {
                        nPmi = -1; // never co-occur: minimum similarity
                    }
                    else if (countW1 == countW1W2 && countW2 == countW1W2)
                    {
                        nPmi = 1; // always co-occur: maximum similarity
                    }
                    else
                    {
                        // Normalized PMI: log(P(w1) * P(w2)) / log(P(w1, w2)) - 1, in [-1, 1).
                        nPmi = Math.Log10((double)countW1 / n * countW2 / n) / Math.Log10((double)countW1W2 / n) - 1;
                    }
                    nPmiDictionary.Add(sourceWordW1, nPmi);
                }
                nPmiMatrix.Add(queryWordW2, nPmiDictionary);
            }

            // NOTE(review): despite the name, this list is intentionally NOT distinct;
            // an earlier distinct variant was commented out by the original author.
            List <string> distinctQueryWordListForTss = fileText.ToList();
            // Total number of source files.
            int totalNumberOfDocumentInSource = CodeFilesWithContent.Count;

            // With the PMI matrix built, compute the TSS score for each source file.
            foreach (var sourceFileWithWords in CodeFilesWithContent)
            {
                // NOTE(review): also not distinct, matching the original behavior;
                // the original author flagged that Distinct() may have been intended here.
                List <string> distinctSourceWords = sourceFileWithWords.Value.ToList();
                double        sumQueryTimeIdf     = 0.0;
                double        sumQueryIdf         = 0.0;

                // Query -> source direction: best nPMI match per query word, idf-weighted.
                foreach (var queryWord in distinctQueryWordListForTss)
                {
                    double maxSim = -1;
                    foreach (var sourceWord in distinctSourceWords)
                    {
                        double currentnPmi = nPmiMatrix[queryWord][sourceWord];
                        if (maxSim < currentnPmi)
                        {
                            maxSim = currentnPmi;
                        }
                    }

                    // A query word absent from the corpus contributes 0 weight.
                    double idf = 0;
                    if (WordAndContainingFiles.ContainsKey(queryWord))
                    {
                        idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[queryWord].Count);
                    }
                    sumQueryTimeIdf += (maxSim * idf);
                    sumQueryIdf     += idf;
                }

                double sumCorpusTimeIdf = 0.0;
                double sumCorpusIdf     = 0.0;

                // Source -> query direction: symmetric counterpart of the loop above.
                foreach (string sourceWord in distinctSourceWords)
                {
                    double maxSim = -1;
                    foreach (string useCaseWord in distinctQueryWordListForTss)
                    {
                        double currentNPmi = nPmiMatrix[useCaseWord][sourceWord];
                        if (maxSim < currentNPmi)
                        {
                            maxSim = currentNPmi;
                        }
                    }

                    // sourceWord always occurs in the corpus, so the lookup is safe.
                    double idf = Math.Log10((double)totalNumberOfDocumentInSource / WordAndContainingFiles[sourceWord].Count);

                    sumCorpusTimeIdf += (maxSim * idf);
                    sumCorpusIdf     += idf;
                }

                // Guard against division by zero (consistent with the other TSS
                // implementations, which report -1 when either idf sum is empty).
                double tss = sumQueryIdf == 0 || sumCorpusIdf == 0 ? -1 : (1.0 / 2) * ((sumQueryTimeIdf / sumQueryIdf) + (sumCorpusTimeIdf / sumCorpusIdf));
                tssDocumentDictionary.Add(sourceFileWithWords.Key, tss);
            }

            WriteDocumentVectorToFileOrderedDescending(simOutputFolderPath + PmiFileName, tssDocumentDictionary);

            Utility.Status("Completed Pmi: " + bugName);
        }
コード例 #20
0
ファイル: Method.cs プロジェクト: tigerqiu712/buglocalization
        /// <summary>
        /// Writes a document vector to a file, one "key value" pair per line.
        /// </summary>
        /// <param name="filePath">Destination file path.</param>
        /// <param name="vector">Document vector to serialize.</param>
        /// <param name="asInt">When true, values are written without decimals; otherwise with five decimal places.</param>
        public static void WriteDocumentVectorToFile(string filePath, MyDoubleDictionary vector, bool asInt = false)
        {
            // NOTE(review): number formatting uses the current culture — confirm
            // invariant culture is not required for downstream consumers.
            string pattern = asInt ? "##" : "##.00000";

            var lines = new List <string>();
            foreach (var entry in vector)
            {
                lines.Add(entry.Key + " " + entry.Value.ToString(pattern));
            }

            File.WriteAllLines(filePath, lines);
        }
コード例 #21
0
 /// <summary>
 /// Creates a calculator bound to a fixed first vector; subsequent similarity
 /// computations are performed against this vector.
 /// </summary>
 /// <param name="vector1">The vector every similarity will be computed against.</param>
 public CosineSimilarityCalculator(MyDoubleDictionary vector1) => _vector1 = vector1;