/// <summary>
/// Runs the indexing pipeline end to end: reads and parses every corpus file,
/// optionally stems the terms, writes temporary posting files in batches,
/// merges them back and forth between two temp folders until two remain,
/// performs the final merge into <paramref name="finalPath"/>, fills the term
/// cache and finally calls <c>Save</c>.
/// </summary>
/// <param name="path">Corpus directory; <c>stop_words.txt</c> is read from directly under it.</param>
/// <param name="finalPath">Output directory (created if missing) handed to the final merge and to <c>Save</c>; posting files for the cache are opened relative to it.</param>
/// <param name="stem">When true, every parsed term name is replaced by its stem before indexing; the flag is also forwarded to the final merge and to <c>Save</c>.</param>
public void Engine(string path, string finalPath, bool stem)
{
    // Upper bound on the number of dictionary entries loaded into the cache.
    const int MaxCachedTerms = 10000;

    Stemmer stemmer = new Stemmer();
    ReadFile rf = new ReadFile(path);
    Parser parser = new Parser(rf.ReadStopWords(path + "\\stop_words.txt"));
    int filesAmount = rf.FilesAmount();
    Document currentDoc = null;

    string tempPath1 = @"./temp Posting Files1";
    string tempPath2 = @"./temp Posting Files2";
    Directory.CreateDirectory(tempPath1);
    Directory.CreateDirectory(tempPath2);
    Directory.CreateDirectory(finalPath);

    // Start from empty temp folders so leftovers of a previous run are not merged in.
    DeleteAllFilesUnder(tempPath1);
    DeleteAllFilesUnder(tempPath2);

    // Batch size: the number of average-sized files that add up to 9% of the corpus bytes.
    // NOTE(review): both divisions throw DivideByZeroException when the corpus has no files
    // or only empty files - confirm whether callers rely on that.
    DirectoryInfo di = new DirectoryInfo(path);
    long size = di.EnumerateFiles("*", SearchOption.AllDirectories).Sum(fi => fi.Length);
    long avgFilesSize = size / filesAmount;
    long batchBytes = (size * 9) / 100;
    long numFiles = batchBytes / avgFilesSize;
    int count = 0;

    // Go through the corpus files and send each separated document to the parser.
    for (int i = 0; i < filesAmount; i++)
    {
        Match matchTEXT = rf.Seperate(i);
        while (matchTEXT.Success)
        {
            Term[] terms = parser.Parse(matchTEXT.Groups[1].Value).Values.ToArray();
            if (stem)
            {
                for (int j = 0; j < terms.Length; j++)
                {
                    terms[j].SetName(stemmer.stemTerm(terms[j].GetName()));
                }
            }

            currentDoc = parser.GetDoc();
            indexer.PrepareToPosting(terms, currentDoc);

            // Highest term frequency inside this document (-1 when it has no terms).
            int max = -1;
            for (int j = 0; j < terms.Length; j++)
            {
                int currentTF = terms[j].GetTF(currentDoc);
                if (currentTF > max)
                {
                    max = currentTF;
                }
            }

            currentDoc.SetMaxTF(max);
            currentDoc.SetLength(terms.Length);

            string[] details = new string[4];
            details[0] = currentDoc.GetMaxTfString();
            details[1] = currentDoc.GetLengthString();
            details[2] = currentDoc.GetDateString();
            details[3] = "";
            DocDictionary.Add(currentDoc.GetName(), details);

            matchTEXT = matchTEXT.NextMatch();
        }

        count++;
        if (count == numFiles)
        {
            Console.WriteLine("create posting");
            indexer.CreateTempPostingFile(tempPath1);
            count = 0;
        }
    }

    // Flush whatever was indexed after the last full batch.
    if (count > 0)
    {
        indexer.CreateTempPostingFile(tempPath1);
    }

    // Ping-pong merge between the two temp folders. The previous loop condition
    // (!(a==2 && b==0) || !(a==0 && b==2)) could never be false, so the loop has
    // always been left only through the breaks below; that is now explicit.
    // NOTE(review): nothing here guarantees termination if the merges never leave
    // exactly two files in one folder and none in the other - confirm against Merge.
    while (true)
    {
        indexer.SetPostingNumber(0);
        Merge(tempPath1, tempPath2);
        if (FinalMergeIfTwoRemain(tempPath2, tempPath1, finalPath, stem))
        {
            break;
        }

        indexer.SetPostingNumber(0);
        Merge(tempPath2, tempPath1);
        if (FinalMergeIfTwoRemain(tempPath1, tempPath2, finalPath, stem))
        {
            break;
        }
    }

    theDictionary = indexer.GetFinalDic();

    // Cache: order the entries descending by the integer stored in value slot 0.
    List<KeyValuePair<string, string[]>> tempDic =
        theDictionary.OrderByDescending(a => Int32.Parse(a.Value[0])).ToList();

    // Never read past the end of the list when the dictionary is smaller than the cache.
    int cacheSize = Math.Min(MaxCachedTerms, tempDic.Count);
    for (int i = 0; i < cacheSize; i++)
    {
        KeyValuePair<string, string[]> entry = tempDic[i];

        // Value slot 2 names the posting file, slot 3 is the byte offset inside it.
        string pathToPosting = Path.Combine(finalPath, entry.Value[2]);

        // Dispose the stream and reader each iteration so file handles are not leaked.
        using (FileStream fs = new FileStream(pathToPosting, FileMode.Open, FileAccess.Read))
        using (BinaryReader br = new BinaryReader(fs))
        {
            br.BaseStream.Seek(Int64.Parse(entry.Value[3]), SeekOrigin.Begin);
            cache.Add(entry.Key, ReadLine(br));
        }
    }

    tempDic.Clear();
    Save(finalPath, stem);
}//engine

/// <summary>
/// Deletes every file found under <paramref name="directory"/>, including files in subdirectories.
/// </summary>
private static void DeleteAllFilesUnder(string directory)
{
    foreach (string file in Directory.GetFiles(directory, "*.*", SearchOption.AllDirectories))
    {
        File.Delete(file);
    }
}

/// <summary>
/// Runs the final merge when <paramref name="fullDir"/> holds exactly two files and
/// <paramref name="emptyDir"/> holds none, then deletes the two merged inputs.
/// </summary>
/// <returns>True when the final merge was performed; false when the file counts did not match.</returns>
private bool FinalMergeIfTwoRemain(string fullDir, string emptyDir, string finalPath, bool stem)
{
    string[] remaining = Directory.GetFiles(fullDir, "*.*", SearchOption.AllDirectories);
    int leftovers = Directory.GetFiles(emptyDir, "*.*", SearchOption.AllDirectories).Length;
    if (remaining.Length != 2 || leftovers != 0)
    {
        return false;
    }

    indexer.FinalMerge(remaining[0], remaining[1], finalPath, stem);
    File.Delete(remaining[0]);
    File.Delete(remaining[1]);
    return true;
}
/// <summary>
/// Tallies the terms of one document and records a summary line for it in <c>DocDic</c>.
/// </summary>
/// <param name="pair">Parallel lists: <c>Key</c> holds the terms, <c>Value</c> holds one flag per term.
/// When <c>stemB</c> is set, the entries of <c>Key</c> are overwritten in place with their stems.</param>
/// <param name="docname">Key under which the document summary is added to <c>DocDic</c>.</param>
/// <param name="filename">First field of the summary string stored in <c>DocDic</c>.</param>
/// <returns>
/// A map from term to a two-element counter: index 0 counts occurrences whose flag is true,
/// index 1 counts occurrences whose flag is false.
/// </returns>
public static Dictionary<string, int[]> remove(KeyValuePair<List<string>, List<bool>> pair, string docname, string filename)
{
    Dictionary<string, int[]> tally = new Dictionary<string, int[]>();
    int highest = 0;

    // A stemmer is only needed (and only created) when stemming is switched on.
    Stemmer stemmer = stemB ? new Stemmer() : null;
    List<string> words = pair.Key;

    // Walk the terms from last to first.
    for (int pos = words.Count - 1; pos >= 0; pos--)
    {
        if (stemmer != null)
        {
            words[pos] = stemmer.stemTerm(words[pos]);
        }

        string word = words[pos];
        int[] counts;
        if (!tally.TryGetValue(word, out counts))
        {
            counts = new int[2];
            tally[word] = counts;
        }

        // Flagged occurrences go to slot 0, the rest to slot 1.
        int slot = pair.Value[pos] ? 0 : 1;
        counts[slot]++;

        // Track the largest combined count seen for any single term.
        int combined = counts[0] + counts[1];
        if (combined > highest)
        {
            highest = combined;
        }
    }

    // Summary format: "<filename>,<max combined count>,<number of term entries>".
    DocDic.Add(docname, filename + "," + highest + "," + words.Count);
    return tally;
}