Example #1
0
        /// <summary>
        /// Click handler for the "start parsing" button. Validates the data-files and
        /// posting-files directories entered in the text boxes, resets the posting output
        /// (five empty posting files and an empty "docs" sub-directory) and then starts
        /// <c>parser.startParsing</c> on a new thread.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btn_startParsing_Click(object sender, RoutedEventArgs e)
        {
            // --- Data files path -------------------------------------------------
            if (txtbx_filesPath.Text.Length == 0)
            {
                MessageBox.Show("Please enter Data Files path");
                return;
            }
            if (!Directory.Exists(txtbx_filesPath.Text))
            {
                MessageBox.Show("Please enter a valid Data Files path");
                return;
            }
            filesPath = txtbx_filesPath.Text;

            // --- Posting files path ----------------------------------------------
            if (txtbx_postingPath.Text.Length == 0)
            {
                MessageBox.Show("Please enter Posting Files path");
                return;
            }
            if (!Directory.Exists(txtbx_postingPath.Text))
            {
                MessageBox.Show("Please enter a valid Posting Files path");
                return;
            }
            postingPath = txtbx_postingPath.Text;

            // --- Reset the posting output ----------------------------------------
            string[] postingFileNames =
            {
                "abNumsPosting.txt",
                "cfPosting.txt",
                "gmPosting.txt",
                "nrPosting.txt",
                "szPosting.txt"
            };

            try
            {
                foreach (string fileName in postingFileNames)
                {
                    // File.Create truncates an existing file; dispose the stream at once
                    // so the file is not left locked.
                    File.Create(Path.Combine(postingPath, fileName)).Dispose();
                }

                string docsDir = Path.Combine(postingPath, "docs");
                if (Directory.Exists(docsDir))
                {
                    // Recursive delete: the non-recursive overload throws when the
                    // directory still contains files.
                    Directory.Delete(docsDir, true);
                }
                Directory.CreateDirectory(docsDir);
            }
            catch (Exception exp)
            {
                // Still abort the run, but tell the user why instead of returning silently.
                MessageBox.Show("Could not prepare the Posting Files folder: " + exp.Message);
                return;
            }

            // --- Start parsing ---------------------------------------------------
            btn_startParsing.IsEnabled = false;
            btn_loadPosting.IsEnabled  = false;

            // An indeterminate (null) check state is treated as "no stemming" rather
            // than throwing from Nullable<bool>.Value.
            bool useStemming = cb_Stemmeing.IsChecked == true;

            indexer              = new Indexer(postingPath);
            ranker.postingPath   = postingPath;
            parser               = new Parse(filesPath, postingPath, indexer, useStemming);
            parser.ModelChanged += vModelChanged;

            Thread thread = new Thread(new ThreadStart(parser.startParsing));
            thread.Start();

            // The query buttons are enabled right after the thread is started, without
            // waiting for it to finish (unchanged from the original behaviour).
            btn_runQuery.IsEnabled     = true;
            btn_runQueryFile.IsEnabled = true;
        }
Example #2
0
        // Load all files of part 2: corpus, stop words, cache, dictionary, posting and rank.
        // Asks the user for a folder, rebuilds the indexer/parser/reader for that folder,
        // deserializes the dictionary and cache files from its "CacheDic" sub-folder and,
        // on success, creates the ranker and searcher and enables the query button.
        private void Load2_click(object sender, RoutedEventArgs e)
        {
            // Folder chooser. The dialog is disposable, so release it as soon as the
            // selected path has been read.
            string selectedPath;
            using (var dlg = new FolderBrowserDialog())
            {
                System.Windows.Forms.DialogResult result = dlg.ShowDialog(this.GetIWin32Window());
                // Do nothing unless the user confirmed a non-empty selection.
                if (result != System.Windows.Forms.DialogResult.OK || string.IsNullOrEmpty(dlg.SelectedPath))
                {
                    return;
                }
                selectedPath = dlg.SelectedPath;
            }

            // change the source path
            pathopen  = selectedPath;
            pathclose = selectedPath;

            // init all the first part objects
            ind = new Indexer(pathclose, isStem);
            p   = new Parser(pathopen + @"\stop_words.txt", isStem);
            r   = new ReadFile(pathopen + @"\corpus\");

            // The stemmed and non-stemmed runs use different dictionary/cache files.
            string dic;
            string cache;
            if (isStem)
            {
                dic   = pathclose + @"\CacheDic\dicStem.dicx";
                cache = pathclose + @"\CacheDic\cacheStem.chex";
            }
            else
            {
                dic   = pathclose + @"\CacheDic\dic.dicx";
                cache = pathclose + @"\CacheDic\cache.chex";
            }

            // NOTE(review): BinaryFormatter is unsafe on untrusted input and the files
            // here come from a user-chosen folder. Left in place because the on-disk
            // format is defined elsewhere; consider migrating to a safe serializer.
            try
            {
                // load dic
                using (FileStream fs = new FileStream(dic, FileMode.Open))
                {
                    IFormatter bf = new BinaryFormatter();
                    ind.dic = (Dictionary <string, DicRecord>)bf.Deserialize(fs);
                }

                // load cache
                using (FileStream fs = new FileStream(cache, FileMode.Open))
                {
                    IFormatter bf = new BinaryFormatter();
                    ind.cache = (Dictionary <string, List <PostingInfo> >)bf.Deserialize(fs);
                }
            }
            catch (IOException)
            {
                // can't find the dictionary/cache files in the selected folder
                System.Windows.Forms.MessageBox.Show("Files Missing, can't Load", "ERROR!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }
            catch (SerializationException)
            {
                // the files exist but are empty, truncated or not in the expected format
                System.Windows.Forms.MessageBox.Show("Files Corrupted, can't Load", "ERROR!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }

            // for viewing
            ind.writeTextChache();
            showcatch.IsEnabled = true;
            ind.writeTextDic();
            showDic.IsEnabled = true;

            // new ranker and searcher built on the freshly loaded index
            rank     = new Ranker(pathclose, p, r, ind, isStem);
            searcher = new Searcher(p, ind, rank, pathopen);

            // open the run btn
            runQuery.IsEnabled = true;

            // notify when finished
            System.Windows.Forms.MessageBox.Show("Ready To search!!", "Done!", MessageBoxButtons.OK, MessageBoxIcon.Information);
        }
Example #3
0
 /// <summary>
 /// Creates a ranker that keeps the supplied crawler and indexer in its
 /// <c>c</c> and <c>i</c> members.
 /// </summary>
 /// <param name="_c">Crawler stored in <c>c</c>.</param>
 /// <param name="_i">Indexer stored in <c>i</c>.</param>
 public Ranker(Crawler _c, Indexer _i)
 {
     // The two assignments are independent of each other.
     i = _i;
     c = _c;
 }
Example #4
0
        /// <summary>
        /// Loads the page at <paramref name="url"/> and starts two background tasks:
        /// one collects the page's absolute http(s) links and passes them to
        /// <c>SortHyperLinks</c>, the other extracts and normalises the body text and
        /// passes it to <c>ContentHandler.AddContent</c>.
        /// </summary>
        /// <remarks>
        /// The tasks are fire-and-forget: this method returns without waiting for them,
        /// and exceptions thrown inside them are not observed here.
        /// </remarks>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        private void FetchData(string url)
        {
            HtmlWeb      web = new HtmlWeb();
            HtmlDocument doc;

            try
            {
                doc = web.Load(url);
            }
            catch (Exception)
            {
                // Best effort: a page that cannot be loaded is skipped.
                return;
            }

            // Collecting the hyperlinks
            Task.Run(() =>
            {
                Uri           baseUri    = new Uri(url);
                List <string> hyperlinks = new List <string>();

                HtmlNodeCollection hyperNodes = doc.DocumentNode.SelectNodes("//a[@href]");

                if (hyperNodes != null)
                {
                    foreach (HtmlNode link in hyperNodes)
                    {
                        // Read the href attribute itself; taking the first quoted token
                        // of the tag's HTML picks up the wrong attribute whenever
                        // something such as class="..." precedes href.
                        string href = link.GetAttributeValue("href", string.Empty).Trim();

                        if (href.StartsWith("/", StringComparison.Ordinal))
                        {
                            // Resolve against the page URL so that root-relative ("/x")
                            // and protocol-relative ("//host/x") links come out right
                            // even when url has a path or no trailing slash.
                            Uri resolved;
                            if (!Uri.TryCreate(baseUri, href, out resolved))
                            {
                                continue;
                            }
                            href = resolved.AbsoluteUri;
                        }

                        // Only absolute http/https links are kept, as before.
                        if (href.StartsWith("http", StringComparison.Ordinal))
                        {
                            hyperlinks.Add(href);
                        }
                    }
                }

                SortHyperLinks(baseUri.Host, hyperlinks);
            });

            // Preprocessing the content
            Task.Run(() =>
            {
                string content = string.Empty;

                HtmlNodeCollection contentNodes = doc.DocumentNode.SelectNodes("//body");

                if (contentNodes != null)
                {
                    // Reuse the node set selected above instead of running the XPath again.
                    foreach (HtmlNode text in contentNodes)
                    {
                        if (!string.IsNullOrWhiteSpace(text.InnerText))
                        {
                            content += text.InnerText.Trim().Replace("&nbsp", "");
                        }
                    }
                }

                // Collapse whitespace runs, then drop every character that is not a
                // letter, digit, space or hyphen.
                content = Regex.Replace(content, @"\s+", " ");
                content = Regex.Replace(content, "[^a-zA-Z0-9 ÆØÅ æøå -]", "");
                content = Indexer.RemoveStopWords(content.ToLower());
                ContentHandler.AddContent(content, url);
            });
        }