/// <summary>
/// Click handler for the "start parsing" button. Validates the data-files and
/// posting-files folders entered by the user, (re)creates the empty posting files
/// and the <c>docs</c> sub-folder, then starts the parser on a new thread.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Routed event data (unused).</param>
private void btn_startParsing_Click(object sender, RoutedEventArgs e)
{
    // --- validate the data-files folder ---
    string dataDir = txtbx_filesPath.Text;
    if (dataDir.Length == 0)
    {
        MessageBox.Show("Please enter Data Files path");
        return;
    }
    if (!Directory.Exists(dataDir))
    {
        MessageBox.Show("Please enter a valid Data Files path");
        return;
    }
    filesPath = dataDir;

    // --- validate the posting-files folder ---
    string postingDir = txtbx_postingPath.Text;
    if (postingDir.Length == 0)
    {
        MessageBox.Show("Please enter Posting Files path");
        return;
    }
    if (!Directory.Exists(postingDir))
    {
        MessageBox.Show("Please enter a valid Posting Files path");
        return;
    }
    postingPath = postingDir;

    try
    {
        // File.Create truncates an existing file; Dispose releases the handle immediately.
        string[] postingFiles =
        {
            "abNumsPosting.txt",
            "cfPosting.txt",
            "gmPosting.txt",
            "nrPosting.txt",
            "szPosting.txt"
        };
        foreach (string fileName in postingFiles)
        {
            File.Create(Path.Combine(postingPath, fileName)).Dispose();
        }

        string docsDir = Path.Combine(postingPath, "docs");
        if (Directory.Exists(docsDir))
        {
            // Recursive delete: the non-recursive overload throws when the folder
            // still contains files, which made every re-run abort.
            Directory.Delete(docsDir, true);
        }
        Directory.CreateDirectory(docsDir);
    }
    catch (Exception ex)
    {
        // Still abort without crashing, but tell the user why nothing happened.
        MessageBox.Show("Could not prepare the Posting Files folder: " + ex.Message);
        return;
    }

    btn_startParsing.IsEnabled = false;
    btn_loadPosting.IsEnabled = false;

    indexer = new Indexer(postingPath);
    ranker.postingPath = postingPath;

    // IsChecked is a nullable bool; comparing with true avoids an
    // InvalidOperationException when the checkbox is in the indeterminate state.
    bool useStemming = cb_Stemmeing.IsChecked == true;
    parser = new Parse(filesPath, postingPath, indexer, useStemming);
    parser.ModelChanged += vModelChanged;

    Thread thread = new Thread(new ThreadStart(parser.startParsing));
    thread.Start();

    btn_runQuery.IsEnabled = true;
    btn_runQueryFile.IsEnabled = true;
}
//Load all files of part 2 - corpus, stop words, cache, dictionary, posting and rank
/// <summary>
/// Click handler that lets the user pick a folder, builds the indexer, parser and
/// file reader from it, loads the serialized dictionary and cache from the
/// <c>CacheDic</c> sub-folder, and then creates the ranker and searcher and
/// enables the query button.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Routed event data (unused).</param>
private void Load2_click(object sender, RoutedEventArgs e)
{
    //Folder chooser. The dialog is a disposable component, so release it when done.
    string selectedPath;
    using (var dlg = new FolderBrowserDialog())
    {
        System.Windows.Forms.DialogResult result = dlg.ShowDialog(this.GetIWin32Window());
        if (result != System.Windows.Forms.DialogResult.OK || string.IsNullOrEmpty(dlg.SelectedPath))
        {
            // Cancelled or nothing chosen: leave the current state untouched.
            return;
        }
        selectedPath = dlg.SelectedPath;
    }

    //change the source path
    pathopen = selectedPath;
    pathclose = selectedPath;

    //init all the first part objects
    ind = new Indexer(pathclose, isStem);
    p = new Parser(pathopen + @"\stop_words.txt", isStem);
    r = new ReadFile(pathopen + @"\corpus\");

    //the stemmed and non-stemmed runs use different file names
    string dic;
    string cache;
    if (isStem)
    {
        dic = pathclose + @"\CacheDic\dicStem.dicx";
        cache = pathclose + @"\CacheDic\cacheStem.chex";
    }
    else
    {
        dic = pathclose + @"\CacheDic\dic.dicx";
        cache = pathclose + @"\CacheDic\cache.chex";
    }

    if (!TryLoadIndexFiles(dic, cache))
    {
        //can't find or read the dictionary and cache files in the chosen folder
        System.Windows.Forms.MessageBox.Show("Files Missing, can't Load", "ERROR!", MessageBoxButtons.OK, MessageBoxIcon.Error);
        return;
    }

    //for viewing
    ind.writeTextChache();
    showcatch.IsEnabled = true;
    ind.writeTextDic();
    showDic.IsEnabled = true;

    //new ranker and searcher built on the freshly loaded index
    rank = new Ranker(pathclose, p, r, ind, isStem);
    searcher = new Searcher(p, ind, rank, pathopen);

    //open the run btn
    runQuery.IsEnabled = true;

    //notify when finished
    System.Windows.Forms.MessageBox.Show("Ready To search!!", "Done!", MessageBoxButtons.OK, MessageBoxIcon.Information);
}

/// <summary>
/// Deserializes the dictionary and cache files into <c>ind.dic</c> and <c>ind.cache</c>.
/// </summary>
/// <param name="dic">Full path of the serialized dictionary file.</param>
/// <param name="cache">Full path of the serialized cache file.</param>
/// <returns><c>true</c> when both files were loaded; <c>false</c> when either is
/// missing, unreadable, or does not contain the expected object.</returns>
private bool TryLoadIndexFiles(string dic, string cache)
{
    // SECURITY NOTE(review): BinaryFormatter can execute arbitrary code when fed a
    // crafted file and is removed in recent .NET versions. The folder is chosen by
    // the user, so only load files this application produced; consider migrating
    // the on-disk format to a safe serializer.
    try
    {
        //load dic
        using (FileStream fs = new FileStream(dic, FileMode.Open))
        {
            IFormatter bf = new BinaryFormatter();
            ind.dic = (Dictionary<string, DicRecord>)bf.Deserialize(fs);
        }

        //load cache
        using (FileStream fs = new FileStream(cache, FileMode.Open))
        {
            IFormatter bf = new BinaryFormatter();
            ind.cache = (Dictionary<string, List<PostingInfo>>)bf.Deserialize(fs);
        }

        return true;
    }
    catch (IOException)
    {
        // Missing file or folder, or the file is locked.
        return false;
    }
    catch (UnauthorizedAccessException)
    {
        // No permission to read the file.
        return false;
    }
    catch (SerializationException)
    {
        // File exists but is corrupt or was written by an incompatible version.
        return false;
    }
    catch (InvalidCastException)
    {
        // File deserialized to something other than the expected dictionary type.
        return false;
    }
}
/// <summary>
/// Creates a ranker and stores the supplied crawler and indexer in the
/// members <c>c</c> and <c>i</c>.
/// </summary>
/// <param name="_c">Crawler instance kept in <c>c</c>.</param>
/// <param name="_i">Indexer instance kept in <c>i</c>.</param>
public Ranker(Crawler _c, Indexer _i)
{
    i = _i;
    c = _c;
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and starts two background tasks:
/// one collects the page's hyperlinks and passes them to <c>SortHyperLinks</c>,
/// the other extracts and normalizes the body text and passes it to
/// <c>ContentHandler.AddContent</c>.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <remarks>
/// If the page cannot be loaded the method returns without doing anything.
/// The two tasks are started and not awaited, so the method returns before they
/// finish and exceptions thrown inside them are not observed by the caller.
/// </remarks>
private void FetchData(string url)
{
    HtmlWeb web = new HtmlWeb();
    HtmlDocument doc;
    try
    {
        doc = web.Load(url);
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be downloaded or parsed is skipped.
        return;
    }

    // Collecting the hyperlinks
    Task.Run(() =>
    {
        Uri baseUri = new Uri(url);
        List<string> hyperlinks = new List<string>();

        HtmlNodeCollection hyperNodes = doc.DocumentNode.SelectNodes("//a[@href]");
        if (hyperNodes != null)
        {
            foreach (HtmlNode link in hyperNodes)
            {
                // Read the attribute itself; splitting OuterHtml on quotes returned
                // whichever attribute came first and skipped single-quoted values.
                string href = link.GetAttributeValue("href", string.Empty);

                if (href.StartsWith("/", StringComparison.Ordinal))
                {
                    // Resolve root-relative links against the page URL. Plain
                    // concatenation produced wrong addresses whenever the page
                    // URL contained a path.
                    Uri resolved;
                    if (!Uri.TryCreate(baseUri, href, out resolved))
                    {
                        continue;
                    }
                    href = resolved.AbsoluteUri;
                }

                if (href.StartsWith("http", StringComparison.Ordinal))
                {
                    hyperlinks.Add(href);
                }
            }
        }

        SortHyperLinks(baseUri.Host, hyperlinks);
    });

    // Preprocessing the content
    Task.Run(() =>
    {
        System.Text.StringBuilder raw = new System.Text.StringBuilder();

        HtmlNodeCollection contentNodes = doc.DocumentNode.SelectNodes("//body");
        if (contentNodes != null)
        {
            foreach (HtmlNode text in contentNodes)
            {
                if (!string.IsNullOrWhiteSpace(text.InnerText))
                {
                    // NOTE(review): as written this removes every space from the
                    // text before whitespace is collapsed below; presumably the
                    // intended target was a non-breaking space or "&nbsp;" - confirm.
                    raw.Append(text.InnerText.Trim().Replace(" ", ""));
                }
            }
        }

        // Collapse whitespace runs, then drop every character outside the allowed set.
        string content = Regex.Replace(raw.ToString(), @"\s+", " ");
        content = Regex.Replace(content, "[^a-zA-Z0-9 ÆØÅ æøå -]", "");
        content = Indexer.RemoveStopWords(content.ToLower());

        ContentHandler.AddContent(content, url);
    });
}