static void testCrawler()
{
    Crawler crawler = new Crawler("https://heste-nettet.dk/");
    crawler.Crawl(100);
    QueryTool queryMaster = new QueryTool(crawler);

    // Tokenize every crawled page and build the inverted index.
    int websiteCounter = 0;
    foreach (var item in crawler.websites)
    {
        try
        {
            string noHtmlString = HtmlRemoval.MasterStripper(item.HTMLContent);
            queryMaster.GenerateTokensAndInvertedIndex(noHtmlString, websiteCounter);
        }
        catch (Exception e)
        {
            //Console.WriteLine(e.Message);
        }
        finally
        {
            // Increment unconditionally so document ids stay aligned with
            // the indices of crawler.websites even when a page fails to parse.
            websiteCounter++;
        }
    }

    List<KeyValuePair<int, double>> pagerank = queryMaster.passQueryPageRank();

    Console.WriteLine("\n Boolean: \n");
    List<Document> pageResultsBoolean = queryMaster.PassQueryBoolean("dansk *and* ride *not* forbund");
    foreach (Document doc in pageResultsBoolean.Take(10))
    {
        Console.WriteLine(crawler.websites[doc.Id].currentPath);
    }

    Console.WriteLine("\n Content: \n");
    List<KeyValuePair<int, double>> pageResultsContent = queryMaster.passQueryContent("dansk ride forbund", websiteCounter);
    foreach (KeyValuePair<int, double> doc in pageResultsContent.Take(10))
    {
        Console.WriteLine(crawler.websites[doc.Key].currentPath);
    }

    Console.WriteLine("\n Content with pagerank: \n");
    List<KeyValuePair<int, double>> pageResultsContentAndPageRank = queryMaster.passQueryContentAndPageRank(pagerank, pageResultsContent);
    foreach (KeyValuePair<int, double> doc in pageResultsContentAndPageRank.Take(10))
    {
        Console.WriteLine(crawler.websites[doc.Key].currentPath);
    }

    foreach (Website page in crawler.websites)
    {
        Console.WriteLine(page.currentPath);
    }
    Console.WriteLine(crawler.websites.Count);
    Console.ReadLine();
}
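// The body of passQueryContentAndPageRank is not shown above. Below is a
// minimal sketch of one common way to combine the two signals, assuming both
// the content scores and the PageRank values are keyed by document id. The
// weighting scheme and the helper name CombineContentAndPageRank are
// hypothetical illustrations, not the project's actual implementation.
// Requires: using System.Collections.Generic; using System.Linq;
static List<KeyValuePair<int, double>> CombineContentAndPageRank(
    List<KeyValuePair<int, double>> pagerank,
    List<KeyValuePair<int, double>> contentScores,
    double contentWeight = 0.7)
{
    // Look up PageRank by document id; pages with no rank score 0.
    Dictionary<int, double> rankById = pagerank.ToDictionary(p => p.Key, p => p.Value);

    return contentScores
        .Select(c =>
        {
            double rank = rankById.TryGetValue(c.Key, out double r) ? r : 0.0;
            // Hypothetical linear blend of the two scores.
            double combined = contentWeight * c.Value + (1 - contentWeight) * rank;
            return new KeyValuePair<int, double>(c.Key, combined);
        })
        .OrderByDescending(kv => kv.Value)
        .ToList();
}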
public List<Shingle> FindShingles(string textinput)
{
    string text = HtmlRemoval.StripTagsRegex(textinput);
    // Keep only letters, digits, '%', spaces, and hyphens.
    string fixedText = Regex.Replace(text, "[^a-zA-Z0-9% -]", string.Empty);
    // Drop empty entries so runs of whitespace do not produce empty "words".
    string[] textsplit = fixedText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    List<Shingle> returnlist = new List<Shingle>();

    // Slide a window of four consecutive words over the text;
    // each window becomes one shingle.
    for (int i = 3; i < textsplit.Length; i++)
    {
        Shingle temp = new Shingle();
        temp.words.Add(textsplit[i - 3]);
        temp.words.Add(textsplit[i - 2]);
        temp.words.Add(textsplit[i - 1]);
        temp.words.Add(textsplit[i]);
        returnlist.Add(temp);
    }
    return returnlist;
}
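// The Shingle type itself is not shown above. Below is a minimal sketch of
// the shape FindShingles assumes (a wrapper around a word list), together
// with the Jaccard overlap that shingle sets are typically used for when
// detecting near-duplicate pages. The class body and the JaccardSimilarity
// helper are assumptions for illustration, not the project's code.
// Requires: using System.Collections.Generic; using System.Linq;
public class Shingle
{
    public List<string> words = new List<string>();

    // Join the words so shingles can be compared as single strings.
    public override string ToString() => string.Join(" ", words);
}

// Jaccard similarity of two shingle sets: |intersection| / |union|.
public static double JaccardSimilarity(List<Shingle> a, List<Shingle> b)
{
    HashSet<string> setA = new HashSet<string>(a.Select(s => s.ToString()));
    HashSet<string> setB = new HashSet<string>(b.Select(s => s.ToString()));
    if (setA.Count == 0 && setB.Count == 0) return 1.0;

    int intersection = setA.Count(s => setB.Contains(s));
    int union = setA.Count + setB.Count - intersection;
    return (double)intersection / union;
}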
static void testCrawler()
{
    Crawler crawler = new Crawler("https://heste-nettet.dk/");
    crawler.Crawl(100);
    QueryTool queryMaster = new QueryTool(crawler);

    // Tokenize every crawled page.
    int websiteCounter = 0;
    foreach (var item in crawler.websites)
    {
        try
        {
            string noHtmlString = HtmlRemoval.MasterStripper(item.HTMLContent);
            queryMaster.GenerateTokens(noHtmlString, websiteCounter);
        }
        catch (Exception e)
        {
            //Console.WriteLine(e.Message);
        }
        finally
        {
            // Increment unconditionally so document ids stay aligned with
            // the indices of crawler.websites even when a page fails to parse.
            websiteCounter++;
        }
    }

    List<KeyValuePair<Document, double>> pageResults = queryMaster.PassQuery("Herning");
    foreach (KeyValuePair<Document, double> doc in pageResults)
    {
        Console.WriteLine(crawler.websites[doc.Key.Id].currentPath);
        Console.WriteLine(doc.Value);
    }
    Console.WriteLine(crawler.websites.Count);
    //Console.WriteLine(crawler.websites[1].HTMLContent);
    Console.ReadLine();
}
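// Neither PassQuery nor the inverted index built by GenerateTokens is shown
// above. Below is a minimal sketch of a single-term lookup, assuming the
// index maps each token to per-document term frequencies; the index shape
// and the helper name SingleTermQuery are hypothetical illustrations, not
// the project's implementation.
// Requires: using System.Collections.Generic; using System.Linq;
static List<KeyValuePair<Document, double>> SingleTermQuery(
    Dictionary<string, Dictionary<Document, int>> invertedIndex, string term)
{
    // Normalize the query term the same way the tokenizer is assumed to.
    string token = term.ToLowerInvariant();

    if (!invertedIndex.TryGetValue(token, out Dictionary<Document, int> postings))
        return new List<KeyValuePair<Document, double>>();

    // Rank matching documents by raw term frequency, best first.
    return postings
        .Select(p => new KeyValuePair<Document, double>(p.Key, p.Value))
        .OrderByDescending(kv => kv.Value)
        .ToList();
}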