Beispiel #1
0
        /// <summary>
        /// Parses a single file: reads it through <c>ReadFile.fileToDocString</c>
        /// and passes every returned document string longer than three characters
        /// to <c>parseDoc</c>.
        /// </summary>
        /// <param name="o_filePath">path of the file to parse; must be a <see cref="string"/> boxed as <see cref="object"/></param>
        public static void ThreadParsing(object o_filePath)
        {
            string[] documents = ReadFile.fileToDocString((string)o_filePath);

            for (int index = 0; index < documents.Length; index++)
            {
                string current = documents[index];

                // Strings of three characters or fewer are skipped.
                if (current.Length <= 3)
                {
                    continue;
                }

                parseDoc(current);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Serializes the five main index lists and the document dictionary to
        /// binary files under <c>postingPath</c>, then reloads the stop words from
        /// the <c>stop_words.txt</c> file in the same directory.
        /// </summary>
        public void saveLists()
        {
            // File names and the objects written to them, kept in matching order.
            string[] fileNames =
            {
                @"\list1.bin", @"\list2.bin", @"\list3.bin",
                @"\list4.bin", @"\list5.bin", @"\Doc.bin"
            };
            object[] payloads =
            {
                mainIndexList1, mainIndexList2, mainIndexList3,
                mainIndexList4, mainIndexList5, Parse.d_docs
            };

            // Remove all previous output files before anything is written.
            foreach (string fileName in fileNames)
            {
                File.Delete(postingPath + fileName);
            }

            IFormatter formatter = new BinaryFormatter();

            for (int index = 0; index < fileNames.Length; index++)
            {
                // Dispose each stream right after use so the data is flushed and the
                // exclusive (FileShare.None) handle is released deterministically.
                using (FileStream stream = new FileStream(postingPath + fileNames[index], FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    formatter.Serialize(stream, payloads[index]);
                }
            }

            Parse.StopWords = ReadFile.readStopWords(postingPath + @"\stop_words.txt");
        }
Beispiel #3
0
 /// <summary>
 /// Constructor. Creates empty term dictionaries, stores the given settings,
 /// loads the stop words from the data directory, copies the stop-words file
 /// into the posting directory when it is not there yet, and fills the months
 /// table if it is still empty.
 /// </summary>
 /// <param name="PathData">path to data files; <c>stop_words.txt</c> is read from here</param>
 /// <param name="PathPosting">path to the posting directory; receives a copy of <c>stop_words.txt</c> if it has none</param>
 /// <param name="indexer">instance of indexer</param>
 /// <param name="stemming">yes/no stemming</param>
 public Parse(string PathData, string PathPosting, Indexer indexer, bool stemming)
 {
     string stopWordsSource = PathData + @"\stop_words.txt";
     string stopWordsCopy = PathPosting + @"\stop_words.txt";

     // One dictionary per term bucket, all starting out empty.
     d_abNumTerms = new SortedDictionary<string, Term>();
     d_cfTerms = new SortedDictionary<string, Term>();
     d_gmTerms = new SortedDictionary<string, Term>();
     d_nrTerms = new SortedDictionary<string, Term>();
     d_szTerms = new SortedDictionary<string, Term>();

     use_stem = stemming;
     filesPath = PathData;
     _indexer = indexer;
     postingPath = PathPosting;

     StopWords = ReadFile.readStopWords(stopWordsSource);

     bool copyPresent = File.Exists(stopWordsCopy);
     if (!copyPresent)
     {
         File.Copy(stopWordsSource, stopWordsCopy);
     }

     // The months table only needs to be filled while it is empty.
     if (months.Count != 0)
     {
         return;
     }

     addMonths();
 }
Beispiel #4
0
        /// <summary>
        /// Constructor. Loads the five main index lists and the document dictionary
        /// from the binary files under <paramref name="postingPath"/>, and reloads the
        /// stop words from the <c>stop_words.txt</c> file in the same directory.
        /// Any exception raised while loading propagates to the caller.
        /// </summary>
        /// <param name="postingPath">path to save the posting to (and to load it from)</param>
        /// <param name="v">load posting from files; not read by this constructor body (presumably only selects this overload; confirm with callers)</param>
        public Indexer(string postingPath, bool v)
        {
            this.postingPath = postingPath;
            termsData = new Dictionary<string, double>();

            // NOTE(review): BinaryFormatter is unsafe on untrusted input and is obsolete in
            // current .NET; these files must only ever come from this application.
            IFormatter formatter = new BinaryFormatter();

            mainIndexList1 = (SortedList<string, int>)readBinaryFile(formatter, postingPath + @"\list1.bin");
            mainIndexList2 = (SortedList<string, int>)readBinaryFile(formatter, postingPath + @"\list2.bin");
            mainIndexList3 = (SortedList<string, int>)readBinaryFile(formatter, postingPath + @"\list3.bin");
            mainIndexList4 = (SortedList<string, int>)readBinaryFile(formatter, postingPath + @"\list4.bin");
            mainIndexList5 = (SortedList<string, int>)readBinaryFile(formatter, postingPath + @"\list5.bin");
            Parse.d_docs = (Dictionary<string, Doc>)readBinaryFile(formatter, postingPath + @"\Doc.bin");
            Parse.StopWords = ReadFile.readStopWords(postingPath + @"\stop_words.txt");
        }

        /// <summary>
        /// Deserializes one object from the file at <paramref name="path"/> and closes
        /// the file before returning, so the handle does not stay open afterwards.
        /// </summary>
        /// <param name="formatter">formatter used to deserialize the stream</param>
        /// <param name="path">full path of the file to read</param>
        /// <returns>the deserialized object graph</returns>
        private static object readBinaryFile(IFormatter formatter, string path)
        {
            using (FileStream stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                return formatter.Deserialize(stream);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Main function for parsing. Starts one parsing thread per file in batches
        /// of eight, waits for each full batch, and after every 40 files hands the
        /// collected term dictionaries to an indexing thread. When all files are
        /// processed it runs a final indexing pass, releases the term dictionaries,
        /// lets the indexer calculate the document data and save its lists, and
        /// reports the elapsed time and the document/term counts through
        /// <c>ModelChanged</c>.
        /// </summary>
        public void startParsing()
        {
            const int batchSize = 8;          // parsing threads started before waiting
            const int filesPerIndexRun = 40;  // files parsed between two indexing runs

            ModelChanged(1, "Started parsing");
            // UTC is not affected by daylight-saving or clock-zone changes during the run.
            DateTime start = DateTime.UtcNow;

            string[] paths = ReadFile.getFilesPaths(filesPath);
            int running = 0, sinceLastIndex = 0;

            foreach (string filePath in paths)
            {
                // NOTE(review): returning here does not join threads already started
                // in the current batch; kept as in the original, confirm it is intended.
                if (stop)
                {
                    return;
                }

                // runs every file in a new thread
                a_Threads[running] = new Thread(new ParameterizedThreadStart(ThreadParsing));
                a_Threads[running].Start(filePath);

                if (++running < batchSize)
                {
                    continue;
                }

                // A full batch is running: wait for all of it before going on.
                sinceLastIndex += running;
                running = 0;
                foreach (Thread t in a_Threads)
                {
                    t.Join();
                }

                // every 40 files start indexing
                if (sinceLastIndex == filesPerIndexRun)
                {
                    sinceLastIndex = 0;
                    startIndexerOnCurrentTerms();

                    // The indexer now works on the published dictionaries; parsing
                    // continues into fresh ones.
                    d_abNumTerms = new SortedDictionary<string, Term>();
                    d_cfTerms = new SortedDictionary<string, Term>();
                    d_gmTerms = new SortedDictionary<string, Term>();
                    d_nrTerms = new SortedDictionary<string, Term>();
                    d_szTerms = new SortedDictionary<string, Term>();
                }
            }

            // Wait for the last, possibly partial, batch (unused slots may be null).
            foreach (Thread t in a_Threads)
            {
                if (t != null)
                {
                    t.Join();
                }
            }

            // Final indexing pass over whatever was parsed since the last run.
            startIndexerOnCurrentTerms();
            t_indexer.Join();

            // Drop the references to the term dictionaries.
            for (int slot = 0; slot < 5; slot++)
            {
                d_terms[slot] = null;
            }
            d_abNumTerms = null;
            d_cfTerms = null;
            d_gmTerms = null;
            d_nrTerms = null;
            d_szTerms = null;

            //calculate data for all documents
            _indexer.calculatDocumentsData();
            //save all list data for import
            _indexer.saveLists();

            // Real elapsed time in minutes, rounded to two decimals. The previous
            // minute/second arithmetic used integer division and broke whenever the
            // run crossed an hour boundary.
            float time = (float)Math.Round((DateTime.UtcNow - start).TotalMinutes, 2);
            int numOfTerms = _indexer.getNumOfTerms();

            ModelChanged(1, "Finshed parsing and indexing docs after " + time + " min\n" + "Number of Docs: " + d_docs.Count + "\nNumber of Terms: " + numOfTerms);
        }

        /// <summary>
        /// Waits for a previously started indexing thread (if any), publishes the
        /// current term dictionaries into <c>d_terms</c> and starts a new indexing
        /// thread running <c>startIndexing</c>.
        /// </summary>
        private void startIndexerOnCurrentTerms()
        {
            if (t_indexer != null)
            {
                t_indexer.Join();
            }

            d_terms[0] = d_abNumTerms;
            d_terms[1] = d_cfTerms;
            d_terms[2] = d_gmTerms;
            d_terms[3] = d_nrTerms;
            d_terms[4] = d_szTerms;

            t_indexer = new Thread(new ThreadStart(startIndexing));
            t_indexer.Start();
        }