/// <summary>
/// Parses every document held by the given master file into terms,
/// then reports the file name to the console.
/// </summary>
/// <param name="file">the master file whose documents are parsed</param>
public void ParseMasterFile(MasterFile file)
{
    foreach (Document doc in file.m_documents.Values)
    {
        ParseDocuments(doc);
    }
    Console.WriteLine(file.m_fileName);
}
/// <summary>
/// Reads every file belonging to chunk <paramref name="i"/> into a new master file.
/// </summary>
/// <param name="i">the id of the chunk</param>
/// <returns>a MasterFile containing all documents read from the chunk's files</returns>
public MasterFile ReadChunk(int i)
{
    string[] currentChunk = path_Chank[i];
    // Path.GetFileName replaces the manual Split('\\') + last-element dance:
    // same result for backslash-separated Windows paths, and also correct
    // for forward-slash separators.
    string currFileName = Path.GetFileName(currentChunk[0]);
    MasterFile masterFile = new MasterFile(currFileName, currentChunk[0]);
    foreach (string filePath in currentChunk)
    {
        ReadDocuments(filePath, masterFile);
    }
    return masterFile;
}
/// <summary>
/// Scans the documents of the given master file and records every distinct,
/// non-empty, non-numeric city and language tag; also bumps the document counter.
/// </summary>
/// <param name="masterFile">the collection of documents to scan</param>
public void UpdateCitiesAndLanguagesInDocument(MasterFile masterFile)
{
    docCounter += masterFile.m_documents.Count;
    foreach (Document document in masterFile.m_documents.Values)
    {
        string city = document.m_CITY.ToString().Trim(toDelete);
        if (city.Length > 0 && !city.Any(char.IsDigit) && !m_Cities.Contains(city))
        {
            m_Cities.Add(city);
        }
        string language = document.m_language.ToString().Trim(toDelete);
        if (language.Length > 0 && !language.Any(char.IsDigit) && !m_Languages.Contains(language))
        {
            m_Languages.Add(language);
        }
    }
}
/// <summary>
/// Splits the given file's contents on the &lt;DOC&gt; delimiter and registers one
/// Document per entry in the master file, keyed by its DOCNO tag.
/// </summary>
/// <param name="file">path of the file to read</param>
/// <param name="masterFile">the master file that receives the parsed documents</param>
private void ReadDocuments(string file, MasterFile masterFile)
{
    string content = File.ReadAllText(file);
    string[] docs = content.Split(new[] { "<DOC>" }, StringSplitOptions.None);
    // Index 0 is the text preceding the first <DOC> tag, so start at 1.
    for (int i = 1; i < docs.Length; i++)
    {
        string raw = docs[i];
        string docNo = GetStringInBetween("<DOCNO>", "</DOCNO>", raw).Trim(' ');
        StringBuilder date = new StringBuilder(GetDateInBetween(raw).Trim(' '));
        StringBuilder title = new StringBuilder(GetStringInBetween("<TI>", "</TI>", raw).Trim(' '));
        // Document text is the title followed by the <TEXT> section.
        string text = title.ToString() + " " + GetStringInBetween("<TEXT>", "</TEXT>", raw);
        StringBuilder city = new StringBuilder(GetCityInBetween(raw));
        StringBuilder language = new StringBuilder(GetLanguageInBetween(raw));
        masterFile.m_documents.Add(docNo, new Document(docNo, date, title, text, city, language));
        masterFile.m_docAmount++;
    }
}
/// <summary>
/// Appends every document's index line to Documents.txt in the output folder,
/// creating the file first if it does not exist, and accumulates the total
/// document length. The writer is always closed, even if a write fails.
/// </summary>
/// <param name="masterFiles">the master file whose documents are written</param>
public void WriteTheNewDocumentsFile(MasterFile masterFiles)
{
    string documentsPath = Path.Combine(m_outPutPath, "Documents.txt");
    if (!File.Exists(documentsPath))
    {
        CreateEmptyTxtFile("Documents.txt");
        Writer = new StreamWriter(documentsPath);
    }
    else
    {
        Writer = File.AppendText(documentsPath);
    }
    try
    {
        foreach (Document document in masterFiles.m_documents.Values)
        {
            Writer.WriteLine(document.WriteDocumentToIndexFile());
            totalLenDocs += document.m_length;
        }
        Writer.Flush();
    }
    finally
    {
        // Fix: the original leaked the stream if WriteLine threw;
        // close unconditionally so the file handle is released.
        Writer.Close();
        Writer = null;
    }
}
/// <summary>
/// Builds the full inverted index: deletes any stale Documents.txt, parses all
/// chunks in parallel batches of up to 4 tasks, merges the 27 posting-rule
/// groups (also 4 at a time), then writes the dictionary, city, document-data
/// and language index files sequentially.
/// </summary>
public override void RunIndexing()
{
    int _external = 0;
    readFile = new ReadFile(inputPath);
    indexer = new Indexer(m_doStemming, outPutPath);
    Semaphore semaphore = new Semaphore(2, 2);   // at most 2 chunks parsed concurrently
    Mutex m1 = new Mutex();                      // guards the shared chunk counter
    Mutex m2 = new Mutex();                      // serializes posting/document writes
    bool enter = false;                          // written only under m2
    this.lastRunStem = m_doStemming;

    // Remove the documents index left over from a previous run.
    string documentsIndexPath = Path.Combine(indexer.m_outPutPath, "Documents.txt");
    if (File.Exists(documentsIndexPath))
    {
        File.Delete(documentsIndexPath);
    }

    Action<object> indexCity = (object obj) => { indexer.WriteCitiesIndexFile(); };
    Action<object> DataOfDocsAction = (object obj) => { indexer.WriteAdditionalDataOfDocs(); };
    Action<object> indexDictionary = (object obj) =>
    {
        indexer.WriteTheNewIndexFile();
        indexer.SerializeDictionary();
    };
    Action<object> updateRuleTask = (object obj) => { indexer.MergeSameWords((int)obj); };
    Action<object> taskAction = (object obj) =>
    {
        Parse parse = new Parse(m_doStemming, readFile.m_mainPath);
        m1.WaitOne();
        MasterFile currMasterFile = readFile.ReadChunk(_external++);
        m1.ReleaseMutex();
        semaphore.WaitOne();
        parse.ParseMasterFile(currMasterFile);
        semaphore.Release();
        m2.WaitOne();
        indexer.WriteToPostingFile(new Dictionary<string, DocumentsTerm>(parse.m_allTerms), enter);
        enter = true;
        indexer.UpdateCitiesAndLanguagesInDocument(currMasterFile);
        indexer.WriteTheNewDocumentsFile(currMasterFile);
        m2.ReleaseMutex();
        parse.m_allTerms.Clear();
    };
    Action<object> LanguagesAction = (object obj) => { indexer.WriteLanguagesToFile(); };

    // Parse all chunks in batches of up to 4 parallel tasks (replaces the
    // original's duplicated full-batch / remainder branches).
    int sizeTasks = readFile.path_Chank.Count;
    for (int i = 0; i < sizeTasks; i += 4)
    {
        int batchSize = Math.Min(4, sizeTasks - i);
        Task[] batch = new Task[batchSize];
        for (int j = 0; j < batchSize; j++)
        {
            batch[j] = Task.Factory.StartNew(taskAction, "taskParse");
        }
        Task.WaitAll(batch);
    }

    // Merge posting-rule groups 0..26, 4 at a time, waiting between batches —
    // replaces 27 hand-written StartNew calls with identical scheduling.
    // 'ruleIndex' is a fresh local per iteration so each closure captures its
    // own value (avoids the captured-loop-variable bug).
    const int ruleCount = 27;
    for (int start = 0; start < ruleCount; start += 4)
    {
        int batchSize = Math.Min(4, ruleCount - start);
        Task[] batch = new Task[batchSize];
        for (int j = 0; j < batchSize; j++)
        {
            int ruleIndex = start + j;
            batch[j] = Task.Factory.StartNew(() => updateRuleTask(ruleIndex));
        }
        Task.WaitAll(batch);
    }

    // Final index files, each written to completion before the next starts.
    Task.Factory.StartNew(indexDictionary, "taskDictionary").Wait();
    Task.Factory.StartNew(indexCity, "taskCity").Wait();
    Task.Factory.StartNew(DataOfDocsAction, "taskData").Wait();
    Task.Factory.StartNew(LanguagesAction, "taskLanguages").Wait();
}