예제 #1
0
        /// <summary>
        /// Smoke test: crawls up to 100 pages of heste-nettet.dk, builds tokens and an
        /// inverted index over the fetched pages, then exercises the boolean, content
        /// and PageRank query paths, printing the top 10 results of each to the console.
        /// Blocks on Console.ReadLine() at the end so the output stays visible.
        /// </summary>
        static void testCrawler()
        {
            Crawler crawler = new Crawler("https://heste-nettet.dk/");

            crawler.Crawl(100);
            QueryTool queryMaster    = new QueryTool(crawler);
            int       websiteCounter = 0;

            foreach (var item in crawler.websites)
            {
                try
                {
                    string noHtmlString = HtmlRemoval.MasterStripper(item.HTMLContent);
                    queryMaster.GenerateTokensAndInvertedIndex(noHtmlString, websiteCounter);

                    websiteCounter++;
                }
                catch (Exception e)
                {
                    // Best-effort: a page that fails to strip/tokenize is skipped rather
                    // than aborting the whole run — but log it instead of swallowing silently.
                    // NOTE(review): websiteCounter is NOT incremented on failure, so the ids
                    // fed into the index can drift out of sync with crawler.websites list
                    // indices used for lookup below — confirm Document.Id semantics.
                    Console.Error.WriteLine("Skipping page: " + e.Message);
                }
            }

            // PageRank scores are computed once and reused for the combined query below.
            List <KeyValuePair <int, double> > pagerank = queryMaster.passQueryPageRank();

            Console.WriteLine("\n Boolean: \n");
            List <Document> pageResultsBoolean = queryMaster.PassQueryBoolean("dansk *and* ride *not* forbund");

            foreach (Document doc in pageResultsBoolean.Take(10))
            {
                Console.WriteLine(crawler.websites[doc.Id].currentPath);
            }

            Console.WriteLine("\n Content: \n");
            List <KeyValuePair <int, double> > pageResultsContent = queryMaster.passQueryContent("dansk ride forbund", websiteCounter);

            foreach (KeyValuePair <int, double> doc in pageResultsContent.Take(10))
            {
                Console.WriteLine(crawler.websites[doc.Key].currentPath);
            }

            Console.WriteLine("\n Content with pagerank: \n");
            List <KeyValuePair <int, double> > pageResultsContentAndPageRank = queryMaster.passQueryContentAndPageRank(pagerank, pageResultsContent);

            foreach (KeyValuePair <int, double> doc in pageResultsContentAndPageRank.Take(10))
            {
                Console.WriteLine(crawler.websites[doc.Key].currentPath);
            }

            // Dump every crawled page path followed by the total page count.
            foreach (Website page in crawler.websites)
            {
                Console.WriteLine(page.currentPath);
            }

            Console.WriteLine(crawler.websites.Count);
            Console.ReadLine();
        }
예제 #2
0
        /// <summary>
        /// Splits the input text into overlapping word n-grams ("shingles"), typically
        /// used for near-duplicate detection. HTML tags are stripped first and any
        /// character other than letters, digits, '%', '-' and space is removed.
        /// </summary>
        /// <param name="textinput">Raw (possibly HTML) text to shingle.</param>
        /// <param name="shingleSize">Number of consecutive words per shingle; defaults to 4, matching the original fixed behaviour.</param>
        /// <returns>All consecutive shingles in order; empty when the text has fewer than <paramref name="shingleSize"/> words.</returns>
        public List <Shingle> FindShingles(string textinput, int shingleSize = 4)
        {
            string text      = HtmlRemoval.StripTagsRegex(textinput);
            string fixedText = Regex.Replace(text, "[^a-zA-Z0-9% -]", string.Empty);

            // RemoveEmptyEntries: stripping punctuation can leave consecutive spaces,
            // which previously produced shingles containing empty-string "words".
            string[] words = fixedText.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            List <Shingle> shingles = new List <Shingle>();

            // Slide a window of shingleSize words across the text; i is the index of
            // the last word in the current window.
            for (int i = shingleSize - 1; i < words.Length; i++)
            {
                Shingle shingle = new Shingle();

                for (int offset = shingleSize - 1; offset >= 0; offset--)
                {
                    shingle.words.Add(words[i - offset]);
                }

                shingles.Add(shingle);
            }

            return shingles;
        }
예제 #3
0
파일: Program.cs 프로젝트: Saftevand/WIe18
        /// <summary>
        /// Smoke test: crawls up to 100 pages of heste-nettet.dk, tokenizes each fetched
        /// page, then runs a single-term query ("Herning") and prints each matching
        /// page's path and score, followed by the total page count. Blocks on
        /// Console.ReadLine() at the end so the output stays visible.
        /// </summary>
        static void testCrawler()
        {
            Crawler crawler = new Crawler("https://heste-nettet.dk/");

            crawler.Crawl(100);
            QueryTool queryMaster    = new QueryTool(crawler);
            int       websiteCounter = 0;

            foreach (var item in crawler.websites)
            {
                try
                {
                    string noHtmlString = HtmlRemoval.MasterStripper(item.HTMLContent);
                    queryMaster.GenerateTokens(noHtmlString, websiteCounter);
                    websiteCounter++;
                }
                catch (Exception e)
                {
                    // Best-effort: a page that fails to strip/tokenize is skipped rather
                    // than aborting the whole run — but log it instead of swallowing silently.
                    Console.Error.WriteLine("Skipping page: " + e.Message);
                }
            }

            // Run the query and dump each hit's path and relevance score.
            List <KeyValuePair <Document, double> > pageResults = queryMaster.PassQuery("Herning");

            foreach (KeyValuePair <Document, double> doc in pageResults)
            {
                Console.WriteLine(crawler.websites[doc.Key.Id].currentPath);
                Console.WriteLine(doc.Value);
            }

            Console.WriteLine(crawler.websites.Count);
            //Console.WriteLine(crawler.websites[1].HTMLContent);

            Console.ReadLine();
        }