/// <summary>
/// Thread entry point for parsing a single file: the file is split into
/// documents and every document longer than three characters is passed to
/// <c>parseDoc</c>.
/// </summary>
/// <param name="o_filePath">path of the file to parse, passed as <see cref="object"/> and cast to <see cref="string"/></param>
public static void ThreadParsing(object o_filePath)
{
    string[] documents = ReadFile.fileToDocString((string)o_filePath);

    for (int index = 0; index < documents.Length; index++)
    {
        string current = documents[index];
        if (current.Length <= 3)
        {
            // Documents of three characters or fewer are skipped.
            continue;
        }

        parseDoc(current);
    }
}
/// <summary>
/// Serializes the five main index lists and <c>Parse.d_docs</c> to binary
/// files under <c>postingPath</c> (list1.bin .. list5.bin and Doc.bin), then
/// reloads <c>Parse.StopWords</c> from stop_words.txt in the same folder.
/// </summary>
public void saveLists()
{
    string[] fileNames =
    {
        @"\list1.bin", @"\list2.bin", @"\list3.bin",
        @"\list4.bin", @"\list5.bin", @"\Doc.bin"
    };

    // Remove every previous output file up front, before anything is written.
    foreach (string fileName in fileNames)
    {
        File.Delete(postingPath + fileName);
    }

    // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted input.
    // It is kept here because the Indexer constructor reads these files back
    // with the same formatter.
    IFormatter formatter = new BinaryFormatter();

    serializeToFile(formatter, postingPath + fileNames[0], mainIndexList1);
    serializeToFile(formatter, postingPath + fileNames[1], mainIndexList2);
    serializeToFile(formatter, postingPath + fileNames[2], mainIndexList3);
    serializeToFile(formatter, postingPath + fileNames[3], mainIndexList4);
    serializeToFile(formatter, postingPath + fileNames[4], mainIndexList5);
    serializeToFile(formatter, postingPath + fileNames[5], Parse.d_docs);

    Parse.StopWords = ReadFile.readStopWords(postingPath + @"\stop_words.txt");
}

/// <summary>
/// Writes <paramref name="graph"/> to a newly created file at
/// <paramref name="path"/> and disposes the stream, so the file handle is
/// released as soon as the write completes.
/// </summary>
/// <param name="formatter">formatter used for serialization</param>
/// <param name="path">full path of the file to create</param>
/// <param name="graph">object to serialize</param>
private static void serializeToFile(IFormatter formatter, string path, object graph)
{
    using (FileStream stream = new FileStream(path, FileMode.Create, FileAccess.Write, FileShare.None))
    {
        formatter.Serialize(stream, graph);
    }
}
/// <summary>
/// Constructor: stores the configuration, creates the five empty term
/// dictionaries, loads the stop words from the data folder, copies
/// stop_words.txt to the posting folder when it is not there yet, and
/// fills the months table if it is still empty.
/// </summary>
/// <param name="PathData">path to the data files (must contain stop_words.txt)</param>
/// <param name="PathPosting">path of the posting folder; receives a copy of stop_words.txt if it has none</param>
/// <param name="indexer">instance of indexer</param>
/// <param name="stemming">yes/no stemming</param>
public Parse(string PathData, string PathPosting, Indexer indexer, bool stemming)
{
    // Plain configuration values.
    filesPath = PathData;
    postingPath = PathPosting;
    _indexer = indexer;
    use_stem = stemming;

    // One dictionary per term bucket.
    d_abNumTerms = new SortedDictionary<string, Term>();
    d_cfTerms = new SortedDictionary<string, Term>();
    d_gmTerms = new SortedDictionary<string, Term>();
    d_nrTerms = new SortedDictionary<string, Term>();
    d_szTerms = new SortedDictionary<string, Term>();

    string sourceStopWords = PathData + @"\stop_words.txt";
    string targetStopWords = PathPosting + @"\stop_words.txt";

    StopWords = ReadFile.readStopWords(sourceStopWords);

    bool alreadyInPostingFolder = File.Exists(targetStopWords);
    if (!alreadyInPostingFolder)
    {
        File.Copy(sourceStopWords, targetStopWords);
    }

    if (months.Count == 0)
    {
        addMonths();
    }
}
/// <summary>
/// Constructor: restores a previously saved index from
/// <paramref name="postingPath"/>. Loads the five main index lists
/// (list1.bin .. list5.bin), <c>Parse.d_docs</c> (Doc.bin) and
/// <c>Parse.StopWords</c> (stop_words.txt). Any exception raised while
/// opening or deserializing a file propagates to the caller.
/// </summary>
/// <param name="postingPath">path the posting files were saved to</param>
/// <param name="v">not read by this constructor. NOTE(review): presumably only selects this "load from files" overload — confirm against callers</param>
public Indexer(string postingPath, bool v)
{
    this.postingPath = postingPath;
    termsData = new Dictionary<string, double>();

    // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted input;
    // these files are expected to be the ones written by saveLists.
    IFormatter formatter = new BinaryFormatter();

    mainIndexList1 = (SortedList<string, int>)deserializeFromFile(formatter, postingPath + @"\list1.bin");
    mainIndexList2 = (SortedList<string, int>)deserializeFromFile(formatter, postingPath + @"\list2.bin");
    mainIndexList3 = (SortedList<string, int>)deserializeFromFile(formatter, postingPath + @"\list3.bin");
    mainIndexList4 = (SortedList<string, int>)deserializeFromFile(formatter, postingPath + @"\list4.bin");
    mainIndexList5 = (SortedList<string, int>)deserializeFromFile(formatter, postingPath + @"\list5.bin");
    Parse.d_docs = (Dictionary<string, Doc>)deserializeFromFile(formatter, postingPath + @"\Doc.bin");

    Parse.StopWords = ReadFile.readStopWords(postingPath + @"\stop_words.txt");
}

/// <summary>
/// Opens <paramref name="path"/> for reading, deserializes its content and
/// disposes the stream so the file handle is released immediately.
/// </summary>
/// <param name="formatter">formatter used for deserialization</param>
/// <param name="path">full path of the file to read</param>
/// <returns>the deserialized object graph</returns>
private static object deserializeFromFile(IFormatter formatter, string path)
{
    using (FileStream stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        return formatter.Deserialize(stream);
    }
}
/// <summary>
/// Main function for parsing. Every file returned by
/// <c>ReadFile.getFilesPaths</c> is parsed on its own thread, eight threads
/// at a time. After every 40 parsed files the accumulated term dictionaries
/// are handed to an indexing thread and replaced with empty ones. When all
/// files are done the remaining terms are indexed, the document data is
/// calculated, the index lists are saved and a summary is reported through
/// <c>ModelChanged</c>.
/// </summary>
/// <remarks>
/// If <c>stop</c> is set the method returns immediately; threads that were
/// already started are not joined and no final indexing or saving happens.
/// </remarks>
public void startParsing()
{
    const int threadsPerBatch = 8;
    const int filesPerIndexRun = 40;

    ModelChanged(1, "Started parsing");
    DateTime start = DateTime.UtcNow;
    string[] paths = ReadFile.getFilesPaths(filesPath);

    int running = 0;
    int parsedSinceIndexing = 0;
    foreach (string filePath in paths)
    {
        if (stop)
        {
            return;
        }

        // Each file is parsed on its own thread.
        a_Threads[running] = new Thread(new ParameterizedThreadStart(ThreadParsing));
        a_Threads[running].Start(filePath);
        if (++running < threadsPerBatch)
        {
            continue;
        }

        // A full batch is running: wait for it before starting more files.
        parsedSinceIndexing += running;
        running = 0;
        joinParsingThreads();

        // Every 40 files, index what has been collected so far.
        if (parsedSinceIndexing == filesPerIndexRun)
        {
            parsedSinceIndexing = 0;
            dispatchTermsToIndexer();
            resetTermDictionaries();
        }
    }

    // Wait for the last (possibly partial) batch, then index the remainder.
    joinParsingThreads();
    dispatchTermsToIndexer();
    t_indexer.Join();

    // Drop every reference to the term dictionaries.
    for (int slot = 0; slot < 5; slot++)
    {
        d_terms[slot] = null;
    }
    d_abNumTerms = null;
    d_cfTerms = null;
    d_gmTerms = null;
    d_nrTerms = null;
    d_szTerms = null;

    // calculate data for all documents
    _indexer.calculatDocumentsData();
    // save all list data for import
    _indexer.saveLists();

    // Elapsed wall-clock time as a real duration: no integer truncation and
    // correct across hour boundaries.
    double minutes = Math.Round((DateTime.UtcNow - start).TotalMinutes, 2);
    int numOfTerms = _indexer.getNumOfTerms();
    ModelChanged(1, "Finshed parsing and indexing docs after " + minutes + " min\n" + "Number of Docs: " + d_docs.Count + "\nNumber of Terms: " + numOfTerms);
}

/// <summary>Waits for every parsing thread currently stored in <c>a_Threads</c>; empty slots are skipped.</summary>
private void joinParsingThreads()
{
    foreach (Thread t in a_Threads)
    {
        if (t != null)
        {
            t.Join();
        }
    }
}

/// <summary>
/// Waits for a previous indexing thread (if any), publishes the current term
/// dictionaries through <c>d_terms</c> and starts a new indexing thread.
/// </summary>
private void dispatchTermsToIndexer()
{
    if (t_indexer != null)
    {
        t_indexer.Join();
    }

    d_terms[0] = d_abNumTerms;
    d_terms[1] = d_cfTerms;
    d_terms[2] = d_gmTerms;
    d_terms[3] = d_nrTerms;
    d_terms[4] = d_szTerms;

    t_indexer = new Thread(new ThreadStart(startIndexing));
    t_indexer.Start();
}

/// <summary>Replaces the five term dictionaries with new empty ones, so parsing can continue while the indexer works on the old ones.</summary>
private void resetTermDictionaries()
{
    d_abNumTerms = new SortedDictionary<string, Term>();
    d_cfTerms = new SortedDictionary<string, Term>();
    d_gmTerms = new SortedDictionary<string, Term>();
    d_nrTerms = new SortedDictionary<string, Term>();
    d_szTerms = new SortedDictionary<string, Term>();
}