예제 #1
0
 /// <summary>
 /// Passes every document stored in the given master file to <c>ParseDocuments</c>,
 /// then writes the master file's name to the console.
 /// </summary>
 /// <param name="file">Master file whose <c>m_documents</c> values are parsed.</param>
 public void ParseMasterFile(MasterFile file)
 {
     var pendingDocuments = file.m_documents.Values;

     foreach (Document current in pendingDocuments)
     {
         ParseDocuments(current);
     }

     // Printed once per master file, after all of its documents went through ParseDocuments.
     Console.WriteLine(file.m_fileName);
 }
예제 #2
0
        /// <summary>
        /// Builds a <c>MasterFile</c> for one chunk and loads into it the documents of every file path in that chunk.
        /// </summary>
        /// <param name="i">Index of the chunk inside <c>path_Chank</c>.</param>
        /// <returns>The master file after <c>ReadDocuments</c> has been run on each path of the chunk.</returns>
        public MasterFile ReadChunk(int i)
        {
            string[] chunkPaths = path_Chank[i];
            string   firstPath  = chunkPaths[0];

            // The master file is named after the last backslash-separated segment of the chunk's first path.
            string[]   segments = firstPath.Split('\\');
            MasterFile result   = new MasterFile(segments[segments.Length - 1], firstPath);

            foreach (string documentPath in chunkPaths)
            {
                ReadDocuments(documentPath, result);
            }

            return result;
        }
예제 #3
0
        /// <summary>
        /// Adds the number of documents in the master file to <c>docCounter</c> and records every city and
        /// language value that is not yet stored in <c>m_Cities</c> / <c>m_Languages</c>.
        /// </summary>
        /// <remarks>
        /// A value is recorded only when, after trimming the <c>toDelete</c> characters from both ends,
        /// it is non-empty, not already stored, and contains no digit.
        /// </remarks>
        /// <param name="masterFile">Master file whose documents supply the city and language values.</param>
        public void UpdateCitiesAndLanguagesInDocument(MasterFile masterFile)
        {
            docCounter += masterFile.m_documents.Count;

            foreach (Document current in masterFile.m_documents.Values)
            {
                string cityName  = current.m_CITY.ToString().Trim(toDelete);
                bool   cityIsNew = cityName.Length != 0
                                   && !m_Cities.Contains(cityName)
                                   && !cityName.Any(char.IsDigit);
                if (cityIsNew)
                {
                    m_Cities.Add(cityName);
                }

                string languageName  = current.m_language.ToString().Trim(toDelete);
                bool   languageIsNew = languageName.Length != 0
                                       && !m_Languages.Contains(languageName)
                                       && !languageName.Any(char.IsDigit);
                if (languageIsNew)
                {
                    m_Languages.Add(languageName);
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Reads one file, splits it on the <c>&lt;DOC&gt;</c> marker and registers a <c>Document</c>
        /// in the master file for every section that follows a marker.
        /// </summary>
        /// <param name="file">Path of the file to read.</param>
        /// <param name="masterFile">Master file that receives the documents; its <c>m_docAmount</c> is incremented per document.</param>
        private void ReadDocuments(string file, MasterFile masterFile)
        {
            string[] sections = File.ReadAllText(file).Split(new[] { "<DOC>" }, StringSplitOptions.None);

            // Element 0 is the text preceding the first <DOC> marker, so iteration starts at 1.
            for (int index = 1; index < sections.Length; index++)
            {
                string section = sections[index];

                string        docNo = GetStringInBetween("<DOCNO>", "</DOCNO>", section).Trim(' ');
                StringBuilder date  = new StringBuilder(GetDateInBetween(section).Trim(' '));
                StringBuilder title = new StringBuilder(GetStringInBetween("<TI>", "</TI>", section).Trim(' '));

                // The document text is the title, a single space, then the <TEXT> body.
                string text = title.ToString() + " " + GetStringInBetween("<TEXT>", "</TEXT>", section);

                StringBuilder city     = new StringBuilder(GetCityInBetween(section));
                StringBuilder language = new StringBuilder(GetLanguageInBetween(section));

                masterFile.m_documents.Add(docNo, new Document(docNo, date, title, text, city, language));
                masterFile.m_docAmount++;
            }
        }
예제 #5
0
        /// <summary>
        /// Writes one line per document of the master file to <c>Documents.txt</c> in the output folder
        /// and adds each document's <c>m_length</c> to <c>totalLenDocs</c>.
        /// </summary>
        /// <remarks>
        /// When <c>Documents.txt</c> does not exist yet, it is first created through <c>CreateEmptyTxtFile</c>
        /// and then opened with a new <c>StreamWriter</c>; otherwise it is opened for appending.
        /// The writer is closed and the <c>Writer</c> field reset to <c>null</c> even when writing a
        /// document throws, so a failure does not leave the file handle open.
        /// </remarks>
        /// <param name="masterFiles">Master file whose documents are written.</param>
        public void WriteTheNewDocumentsFile(MasterFile masterFiles)
        {
            string documentsPath = Path.Combine(m_outPutPath, "Documents.txt");

            if (!File.Exists(documentsPath))
            {
                CreateEmptyTxtFile("Documents.txt");
                Writer = new StreamWriter(documentsPath);
            }
            else
            {
                Writer = File.AppendText(documentsPath);
            }

            try
            {
                foreach (Document document in masterFiles.m_documents.Values)
                {
                    Writer.WriteLine(document.WriteDocumentToIndexFile());
                    totalLenDocs += document.m_length;
                }

                Writer.Flush();
            }
            finally
            {
                // Always release the file handle and clear the field, even on a failed write.
                Writer.Close();
                Writer = null;
            }
        }
예제 #6
0
        /// <summary>
        /// Runs the full indexing pipeline over the input folder.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Creates the <c>ReadFile</c> and <c>Indexer</c> instances and deletes an existing <c>Documents.txt</c>.
        /// 2. Processes every chunk of <c>readFile.path_Chank</c> in batches of at most four tasks. Each task reads
        ///    a chunk, parses it, and writes its terms, cities/languages and documents through the indexer.
        ///    Reading and writing are each serialized by a mutex; parsing is limited to two concurrent tasks.
        /// 3. Calls <c>indexer.MergeSameWords</c> for the indices 0 through 26, again in batches of at most four tasks.
        /// 4. Writes the index file and serializes the dictionary, then writes the cities index, the additional
        ///    document data and the languages file, one after another.
        /// Every synchronization primitive is released in a <c>finally</c> block, so a task that throws cannot
        /// leave the other tasks of its batch blocked, and the primitives are disposed once the chunk phase ends.
        /// Exceptions thrown inside tasks surface from the waits as <c>AggregateException</c>.
        /// </remarks>
        public override void RunIndexing()
        {
            // Upper bound on the number of tasks started per batch, in both the chunk and the merge phase.
            const int maxTasksPerBatch = 4;
            // MergeSameWords is invoked once for every index in [0, mergeIndexCount).
            const int mergeIndexCount = 27;

            readFile = new ReadFile(inputPath);
            indexer  = new Indexer(m_doStemming, outPutPath);

            this.lastRunStem = m_doStemming;

            string documentsIndexPath = Path.Combine(indexer.m_outPutPath, "Documents.txt");

            if (File.Exists(documentsIndexPath))   // delete the documents file if exists
            {
                File.Delete(documentsIndexPath);
            }

            int sizeTasks = readFile.path_Chank.Count;

            using (Semaphore parseSlots = new Semaphore(2, 2))
            using (Mutex readLock = new Mutex())
            using (Mutex writeLock = new Mutex())
            {
                int  nextChunk = 0;      // index of the next chunk to read; only touched while holding readLock
                bool enter     = false;  // false for the first WriteToPostingFile call, true for all later ones

                Action <object> taskAction = (object obj) =>
                {
                    Parse parse = new Parse(m_doStemming, readFile.m_mainPath);

                    MasterFile currMasterFile;
                    readLock.WaitOne();
                    try
                    {
                        currMasterFile = readFile.ReadChunk(nextChunk++);
                    }
                    finally
                    {
                        readLock.ReleaseMutex();
                    }

                    parseSlots.WaitOne();
                    try
                    {
                        parse.ParseMasterFile(currMasterFile);
                    }
                    finally
                    {
                        parseSlots.Release();
                    }

                    writeLock.WaitOne();
                    try
                    {
                        indexer.WriteToPostingFile(new Dictionary <string, DocumentsTerm>(parse.m_allTerms), enter);
                        enter = true;
                        indexer.UpdateCitiesAndLanguagesInDocument(currMasterFile);
                        indexer.WriteTheNewDocumentsFile(currMasterFile);
                    }
                    finally
                    {
                        writeLock.ReleaseMutex();
                    }

                    parse.m_allTerms.Clear();
                };

                for (int started = 0; started < sizeTasks;)
                {
                    int remaining = sizeTasks - started;
                    int batchSize = remaining < maxTasksPerBatch ? remaining : maxTasksPerBatch;

                    Task[] batch = new Task[batchSize];
                    for (int slot = 0; slot < batchSize; slot++, started++)
                    {
                        batch[slot] = Task.Factory.StartNew(taskAction, "taskParse");
                    }

                    // WaitAll returns (or throws) only after every task of the batch has completed,
                    // so no task is still using the primitives when the using block disposes them.
                    Task.WaitAll(batch);
                }
            }

            Action <object> updateRuleTask = (object obj) =>
            {
                indexer.MergeSameWords((int)obj);
            };

            for (int first = 0; first < mergeIndexCount; first += maxTasksPerBatch)
            {
                int remaining = mergeIndexCount - first;
                int batchSize = remaining < maxTasksPerBatch ? remaining : maxTasksPerBatch;

                Task[] batch = new Task[batchSize];
                for (int slot = 0; slot < batchSize; slot++)
                {
                    // The index travels as the task state object, so each task gets its own value.
                    batch[slot] = Task.Factory.StartNew(updateRuleTask, (object)(first + slot));
                }

                Task.WaitAll(batch);
            }

            // The remaining steps run strictly one after another, each on its own task.
            Task.Factory.StartNew(() =>
            {
                indexer.WriteTheNewIndexFile();
                indexer.SerializeDictionary();
            }).Wait();

            Task.Factory.StartNew(() =>
            {
                indexer.WriteCitiesIndexFile();
            }).Wait();

            Task.Factory.StartNew(() =>
            {
                indexer.WriteAdditionalDataOfDocs();
            }).Wait();

            Task.Factory.StartNew(() =>
            {
                indexer.WriteLanguagesToFile();
            }).Wait();
        }