/// <summary>
/// Builds the inverted index (token -> ids of the documents containing it) from the
/// stemmed documents and writes it, serialized as JSON, to <c>invertlist.txt</c> in
/// the stemming folder.
/// </summary>
/// <remarks>
/// The number of documents is the number of lines in the host's <c>index.txt</c>; each
/// line must have at least three space-separated fields. Document <c>i</c> is read from
/// <c>{StemingForder}{i}.txt</c> and tokenized by splitting on single spaces.
/// </remarks>
public void Invert()
{
    var postings = new Dictionary<string, IList<int>>();

    var index = fileProvider.GetTextFromFile($"{ParentForderPath}/{uri.Host}/index.txt");
    // Only the count is used below; the third field is still read so that a malformed
    // index line fails here exactly as it did before.
    var files = index.Split("\n").Select(x => x.Split(" ")).Select(x => x[2]).ToList();

    for (int i = 0; i < files.Count; i++)
    {
        var words = fileProvider.GetTextFromFile($"{StemingForder}{i}.txt").Split(' ');
        foreach (var word in words)
        {
            if (!postings.TryGetValue(word, out var docIds))
            {
                postings.Add(word, new List<int> { i });
            }
            else if (docIds[docIds.Count - 1] != i)
            {
                // Documents are visited in ascending order, so a repeated token in the
                // current document can only duplicate the last id in the list. Skipping
                // it here replaces the former post-pass that reassigned dictionary
                // values while enumerating the dictionary (which throws on runtimes
                // where an indexer set invalidates the enumerator).
                docIds.Add(i);
            }
        }
    }

    var invertDict = JsonConvert.SerializeObject(postings);
    fileProvider.WriteTextToFile($"{StemingForder}invertlist.txt", invertDict);
}
/// <summary>
/// Runs <c>StemmingFile</c> for every entry listed in the host's <c>index.txt</c>.
/// </summary>
/// <remarks>
/// Each index line is split on single spaces and its third field is the value handed
/// to <c>StemmingFile</c>. The whole index is parsed before the first file is processed.
/// </remarks>
public void StartStemming()
{
    var indexText = fileProvider.GetTextFromFile($"{ParentForderPath}/{uri.Host}/index.txt");

    // Collect the third field of every line up front.
    var targets = new List<string>();
    foreach (var line in indexText.Split("\n"))
    {
        var fields = line.Split(" ");
        targets.Add(fields[2]);
    }

    foreach (var target in targets)
    {
        StemmingFile(target);
    }
}
/// <summary>
/// Sets up the TF / TF-IDF working folders for the host of <paramref name="domain"/>
/// and records how many documents the host's <c>index.txt</c> lists.
/// </summary>
/// <param name="domain">Absolute URI whose host names the per-site data folder.</param>
public TfIdf(string domain)
{
    uri = new Uri(domain);
    fileProvider = new FileProvider();

    var hostRoot = $"{ParentForderPath}/{uri.Host}";
    StemingForder = $"{hostRoot}/stemming/";
    tfForder = $"{hostRoot}/tf/";
    tfIdfForder = $"{hostRoot}/tf-idf/";

    // Create the output folders when they are not there yet.
    EnsureFolder(tfForder);
    EnsureFolder(tfIdfForder);

    // One index line per document; the third space-separated field of each line is read.
    var indexText = fileProvider.GetTextFromFile($"{hostRoot}/index.txt");
    var entries = new List<string>();
    foreach (var line in indexText.Split("\n"))
    {
        var fields = line.Split(" ");
        entries.Add(fields[2]);
    }

    docNum = entries.Count;

    void EnsureFolder(string path)
    {
        if (Directory.Exists(path))
        {
            return;
        }

        Directory.CreateDirectory(path);
    }
}
/// <summary>
/// Loads the per-document TF-IDF weights and the inverted index for the host of
/// <paramref name="domain"/>.
/// </summary>
/// <param name="domain">Absolute URI whose host names the per-site data folder.</param>
/// <remarks>
/// Each line of <c>{tfIdfForder}{i}.txt</c> is split on ':'; the trimmed first part is
/// the token and the second part is parsed as its weight. The inverted index is read
/// from <c>invertlist.txt</c> only when <c>invertDict</c> is still null, and its keys
/// are trimmed on every construction.
/// </remarks>
public Search(string domain)
{
    uri = new Uri(domain);
    fileProvider = new FileProvider();

    var hostRoot = $"{ParentForderPath}/{uri.Host}";
    StemingForder = $"{hostRoot}/stemming/";
    tfIdfForder = $"{hostRoot}/tf-idf/";

    wordMatrix = new Dictionary<int, Dictionary<string, double>>();
    for (var docId = 0; docId < CountDocument; docId++)
    {
        var weights = new Dictionary<string, double>();
        var content = fileProvider.GetTextFromFile($"{tfIdfForder}{docId}.txt");
        foreach (var line in content.Split('\n'))
        {
            var parts = line.Split(":");
            // Add (not the indexer) so that a repeated token still raises an error.
            weights.Add(parts[0].Trim(), double.Parse(parts[1]));
        }

        wordMatrix.Add(docId, weights);
    }

    if (invertDict == null)
    {
        var json = fileProvider.GetTextFromFile($"{StemingForder}invertlist.txt");
        invertDict = JsonConvert.DeserializeObject<Dictionary<string, List<int>>>(json);
    }

    // Normalize keys so lookups are not affected by surrounding whitespace.
    invertDict = invertDict.ToDictionary(entry => entry.Key.Trim(), entry => entry.Value);
}
/// <summary>
/// Computes the term frequency of every token in each stemmed document and writes the
/// result to <c>{tfForder}{i}.txt</c>, one <c>token : frequency</c> line per distinct token.
/// </summary>
/// <remarks>
/// Document <c>i</c> (for <c>0 &lt;= i &lt; docNum</c>) is read from
/// <c>{StemingForder}{i}.txt</c> and tokenized by splitting on single spaces. The
/// frequency is occurrences divided by the total token count, rounded to five decimals
/// and formatted with <c>"0.00000"</c> using the current culture.
/// </remarks>
public void TF()
{
    for (int i = 0; i < docNum; i++)
    {
        // Read the stemmed document once (it was previously fetched twice, with the
        // first result never used).
        var words = fileProvider.GetTextFromFile($"{StemingForder}{i}.txt").Split(' ');

        var counts = new Dictionary<string, int>();
        foreach (var word in words)
        {
            // A missing key leaves 'seen' at 0, so this covers first and repeat sightings.
            counts.TryGetValue(word, out var seen);
            counts[word] = seen + 1;
        }

        // Every token is counted exactly once, so the sum of all counts is the token total.
        var allWords = words.Length;

        var result = string.Join('\n', counts.Select(x => $"{x.Key} : {Math.Round((double)x.Value / allWords, 5).ToString("0.00000")}"));
        fileProvider.WriteTextToFile($"{tfForder}{i}.txt", result);
    }
}