private void ProcessSourceCode() { // Read all files CodeFilesWithContent = new Dictionary <string, List <string> >(); foreach (var line in File.ReadAllLines(_sourceFilePath)) { var lineSplit = line.SplitWith("##"); string[] text = lineSplit[1].SplitWith(",").Where(x => x.Length > 2).ToArray(); CodeFilesWithContent.Add(lineSplit[0], text.Take(text.Length / 50).ToList()); } // compute tf and idf TfDictionary = new Dictionary <string, MyDoubleDictionary>(); IdfDictionary = new MyDoubleDictionary(); TfIdfDictionary = new Dictionary <string, MyDoubleDictionary>(); foreach (var fileAndItsWords in CodeFilesWithContent) { var fileTfDictionary = new MyDoubleDictionary(); // for each word in the file add 1 to the count foreach (string word in fileAndItsWords.Value) { fileTfDictionary.IncreaseCount(word); } // save tf result for the file TfDictionary.Add(fileAndItsWords.Key, fileTfDictionary); // for each DISTINCT word found in the file increase the idf by 1. At this point idf holds document frequency foreach (var wordAndItsCount in fileTfDictionary) { IdfDictionary.IncreaseCount(wordAndItsCount.Key); } } // change df to idf int totalNumberOfDocuments = CodeFilesWithContent.Count; foreach (var wordAndItsDocumentCount in IdfDictionary.ToList()) // to list so that we can change the dictionary { IdfDictionary[wordAndItsDocumentCount.Key] = Math.Log10(totalNumberOfDocuments / wordAndItsDocumentCount.Value); } // update tfidf for each file foreach (var sourceFileWithTfDictionary in TfDictionary) { var fileTfIdfDictionary = new MyDoubleDictionary(); foreach (var wordWithTfCount in sourceFileWithTfDictionary.Value) { fileTfIdfDictionary.Add(wordWithTfCount.Key, wordWithTfCount.Value * IdfDictionary[wordWithTfCount.Key]); } TfIdfDictionary.Add(sourceFileWithTfDictionary.Key, fileTfIdfDictionary); } WordAndContainingFiles = new Dictionary <string, List <string> >(); foreach (var sourceFileWithWords in CodeFilesWithContent) { sourceFileWithWords.Value.Distinct().ToList().ForEach(word => { if (!WordAndContainingFiles.ContainsKey(word)) { WordAndContainingFiles.Add(word, new List <string>()); } WordAndContainingFiles[word].Add(sourceFileWithWords.Key); }); } }