public async Task OnStartCrawling(object param)
        {
            ClearCrawlerTree();

            CrawlerInputParser inputParser   = new CrawlerInputParser();
            List<Uri>          rootResources = inputParser.Parse(ViewModelsMediator.Instance.SourceFilePath);

            if (rootResources != null)
            {
                StartBtnEnabled = false;
                StopBtnEnabled  = true;

                // Init crawler
                WebCrawler.WebCrawler crawler = new WebCrawler.WebCrawler();
                crawler.MaxDepth         = crawlingDepth;
                crawler.Logger           = LoggerViewModel.Instance;
                crawler.LoadingFinished += OnPageLoadingFinished;

                foreach (var rootUri in rootResources)
                {
                    WebCrawlerOutput crawlerOutput = await crawler.PerformCrawlingAsync(rootUri, 0, -1);
                }

                StartBtnEnabled = true;
                StopBtnEnabled  = false;
            }
        }
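The OnPageLoadingFinished handler subscribed above is not part of the listing; a minimal sketch of what it might look like, assuming a WPF view model and an event-args type carrying the loaded page's URI and depth (PageLoadedEventArgs and AddCrawlerTreeNode are assumptions, not names from the example):

        // Hypothetical handler for crawler.LoadingFinished; the PageLoadedEventArgs shape
        // and the AddCrawlerTreeNode helper are assumed, not taken from the example above.
        private void OnPageLoadingFinished(object sender, PageLoadedEventArgs e)
        {
            // The crawler runs off the UI thread, so marshal back before touching the tree.
            System.Windows.Application.Current.Dispatcher.Invoke(() =>
            {
                AddCrawlerTreeNode(e.Uri, e.Depth);
            });
        }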
Example 2
        // Dictionary<string, string>

        public static void Main(string[] args)
        {
            WebCrawler crawler = new WebCrawler();

            crawler.urlList.Add("http://tw.msn.com/");
            crawler.craw();
        }
Example 3
        static void Main(string[] args)
        {
            WebCrawler wc = new WebCrawler();

            WebClient wbc = new WebClient();
            string src = wbc.DownloadString("http://www.youtube.com/robots.txt");
            string[] lines = src.Split('\n');

            List<String> disallowedSites = wc.GetDisallowedSites(lines, "*");
            List<String> frontier = wc.FetchUrlsFromSource("http://www.youtube.com");
            int x = 5;
        }
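GetDisallowedSites is only called here, never shown; a rough sketch of such a helper, assuming it collects the Disallow paths listed under the requested User-agent section (the body is an assumption; real robots.txt groups can be more involved, e.g. several User-agent lines per group):

        // Hypothetical implementation matching the call above: gather the Disallow
        // entries that apply to the given user agent ("*" for the wildcard group).
        public List<string> GetDisallowedSites(string[] lines, string userAgent)
        {
            List<string> disallowed = new List<string>();
            bool inMatchingGroup = false;

            foreach (string raw in lines)
            {
                string line = raw.Trim();
                if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
                {
                    inMatchingGroup = line.Substring("User-agent:".Length).Trim() == userAgent;
                }
                else if (inMatchingGroup && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
                {
                    disallowed.Add(line.Substring("Disallow:".Length).Trim());
                }
            }
            return disallowed;
        }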
Example 4
        private double[] getTF_IDF(WebCrawler.Index.DocumentReference[] documents, int docCount)
        {
            double N = docCount;

            double[] values = new double[documents.Length];

            for (int i = 0; i < documents.Length; i++)
            {
                Document d = documents[i].Document;
                int c = documents[i].Count;                 // raw term frequency in document d

                // Accumulate squared term counts per document (presumably for later length normalisation).
                if (!lengths.ContainsKey(d))
                    lengths.Add(d, c * c);
                else
                    lengths[d] += c * c;

                // Logarithmic tf-idf: (1 + log10(tf)) * log10(N / df), where df = documents.Length.
                values[i] = (1 + Math.Log10(c)) * Math.Log10(N / documents.Length);
            }
            return values;
        }
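Each entry of values is the standard logarithmic tf-idf weight,

    w(t, d) = (1 + log10(tf)) * log10(N / df)

where tf is the in-document count c and df is the posting-list length documents.Length. With assumed numbers (not from the example), N = 1000 documents, tf = 3 and df = 10 give (1 + log10 3) * log10(100) ≈ 1.48 * 2 ≈ 2.95.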
Example 6
        private void FindUrl()
        {
            Console.Write("Enter a URL: ");
            bool hasSetMaximumLinkAmount = default;
            int  maximumLinkAmount       = default;

            do
            {
                Console.Write("Enter the amount of links to visit: ");
                bool isNumber = int.TryParse(Console.ReadLine().Trim(), out int outMaximumLinkAmount);
                if (isNumber)
                {
                    hasSetMaximumLinkAmount = true;
                    maximumLinkAmount       = outMaximumLinkAmount;
                }
                else
                {
                    Console.WriteLine("The value you inserted was not a number.");
                }
            } while (!hasSetMaximumLinkAmount);

            Console.WriteLine("Crawling...");
            Console.WriteLine();
            WebCrawler webCrawler = new WebCrawler();

            webCrawler.Start(_userUrl, maximumLinkAmount);

            Console.Write($"---Found web page: {_userUrl}---");
            Queue <Uri> results = webCrawler.GetResultUrls();

            foreach (Uri url in results)
            {
                Console.WriteLine(url);
            }
            Console.WriteLine($"*Total found links: {results.Count}");
        }
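The Start/GetResultUrls pair used by FindUrl is not shown; a minimal sketch of the crawler it seems to assume, a breadth-first walk that stops after maxLinks successfully fetched pages (only the two member signatures come from the example; the class body, and the use of HttpClient and Regex, are assumptions):

        // Hypothetical crawler behind the example above; needs System, System.Collections.Generic,
        // System.Net.Http and System.Text.RegularExpressions.
        public class WebCrawler
        {
            private readonly Queue<Uri> _results = new Queue<Uri>();

            public void Start(string rootUrl, int maxLinks)
            {
                var pending = new Queue<Uri>();
                var visited = new HashSet<Uri>();
                pending.Enqueue(new Uri(rootUrl));

                using (var http = new HttpClient())
                {
                    while (pending.Count > 0 && _results.Count < maxLinks)
                    {
                        Uri current = pending.Dequeue();
                        if (!visited.Add(current))
                            continue;

                        string html;
                        try { html = http.GetStringAsync(current).GetAwaiter().GetResult(); }
                        catch { continue; }   // skip pages that fail to load

                        _results.Enqueue(current);

                        // Very naive link extraction; a real crawler would use an HTML parser.
                        foreach (Match m in Regex.Matches(html, "href=\"(https?://[^\"]+)\""))
                        {
                            if (Uri.TryCreate(m.Groups[1].Value, UriKind.Absolute, out Uri next))
                                pending.Enqueue(next);
                        }
                    }
                }
            }

            public Queue<Uri> GetResultUrls()
            {
                return _results;
            }
        }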
Example 7
        static void Main(string[] args)
        {
            List<Uri> uris = new List<Uri>()
            {
                new Uri("https://www.rbc.ru/"),
                new Uri("https://habrahabr.ru/"),
                new Uri("https://zr.ru/"),
                new Uri("https://youtube.com/"),
                new Uri("https://rp5.ru/"),
            };

            List<WebCrawlerItem> crawlerItems = new List<WebCrawlerItem>();

            foreach (var uri in uris)
            {
                crawlerItems.Add(new WebCrawlerItem(uri, new DomainCrawlerConfiguration()));
            }

            var crawler = new WebCrawler.WebCrawler(new CrawlerConfiguration());

            crawler.StartCrawlingAsync(crawlerItems);

            Console.ReadLine();
        }
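StartCrawlingAsync is started here without being awaited, and Console.ReadLine() is what keeps the process alive. Assuming it returns a Task (the example does not show its signature), the same call can be awaited from an async Main so the program exits when the crawl completes:

        // Variant with an async entry point; assumes StartCrawlingAsync returns a Task,
        // which the listing above does not show (requires System.Threading.Tasks).
        static async Task Main(string[] args)
        {
            var crawlerItems = new List<WebCrawlerItem>
            {
                new WebCrawlerItem(new Uri("https://habrahabr.ru/"), new DomainCrawlerConfiguration()),
            };

            var crawler = new WebCrawler.WebCrawler(new CrawlerConfiguration());
            await crawler.StartCrawlingAsync(crawlerItems);   // returns once the whole crawl has finished
        }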
Example 8
        static void Main(string[] args)
        {
            String indexPath = @"C:\Users\Brandon\Desktop\Multimedia Retrieval\W3 Files\Index";
            //Analyzers build token streams which analyze text
            Analyzer    analyzer = new StandardAnalyzer();
            IndexWriter writer   = new IndexWriter(indexPath, analyzer, true);

            //Set the seedUrl and initialize the crawler
            String         seedUrl   = "http://sydney.edu.au/engineering/it/";
            WebCrawler     crawler   = new WebCrawler();
            Queue<String>  linkQueue = new Queue<String>();

            linkQueue.Enqueue(seedUrl);
            HashSet<String> linkSet = new HashSet<String>();

            Console.Write("Sites Explored: 0");

            //Iteratively extract links from the first URL in the frontier
            //and adds its content to index
            while (linkQueue.Count != 0 && linkSet.Count < 50)
            {
                String currentLink = linkQueue.Dequeue();
                try
                {
                    if (linkSet.Contains(currentLink))
                    {
                        continue;
                    }
                    String content = crawler.getUrlContent(currentLink);
                    crawler.getLinks(linkQueue, content, currentLink);
                    linkSet.Add(currentLink);
                    Document doc = new Document();
                    doc.Add(new Field("link", currentLink, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
                    writer.AddDocument(doc);

                    Console.Write("\rSites Explored: {0}", linkSet.Count);
                }
                catch (Exception) { continue; }
            }
            writer.Optimize();
            writer.Close();
            Console.WriteLine();

            //Execute the search
            String      search   = "suits";
            QueryParser parser   = new QueryParser("content", analyzer);
            Query       query    = parser.Parse(search);
            var         searcher = new IndexSearcher(indexPath);
            Hits        hits     = searcher.Search(query);
            int         results  = hits.Length();

            Console.WriteLine("Found {0} results for \"{1}\"", results, search);
            for (int i = 0; i < results; i++)
            {
                Document doc   = hits.Doc(i);
                float    score = hits.Score(i);
                Console.WriteLine("Result num {0}, score {1}", i + 1, score);
                Console.WriteLine("URL: {0}", doc.Get("link"));
            }
        }
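This example targets the legacy Lucene.NET 2.x API (string-path constructors, Hits, writer.Close()). A rough sketch of the same index-and-search flow on Lucene.NET 4.8 follows; the package version, the helper name IndexAndSearch and its parameters are assumptions, not part of the original:

        // Sketch on Lucene.NET 4.8 (assumed version). Namespaces needed:
        // Lucene.Net.Analysis.Standard, Lucene.Net.Documents, Lucene.Net.Index,
        // Lucene.Net.QueryParsers.Classic, Lucene.Net.Search, Lucene.Net.Store, Lucene.Net.Util.
        static void IndexAndSearch(string indexPath, string url, string content)
        {
            var dir      = FSDirectory.Open(indexPath);
            var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);

            using (var writer = new IndexWriter(dir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
            {
                var doc = new Document
                {
                    new StringField("link", url, Field.Store.YES),        // stored, not tokenised
                    new TextField("content", content, Field.Store.YES),   // stored and tokenised
                };
                writer.AddDocument(doc);
                writer.Commit();
            }

            using (var reader = DirectoryReader.Open(dir))
            {
                var searcher = new IndexSearcher(reader);
                var query    = new QueryParser(LuceneVersion.LUCENE_48, "content", analyzer).Parse("suits");
                TopDocs top  = searcher.Search(query, 10);

                foreach (ScoreDoc hit in top.ScoreDocs)
                {
                    Console.WriteLine("URL: {0} (score {1})", searcher.Doc(hit.Doc).Get("link"), hit.Score);
                }
            }
        }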
Example 9
        static void Main(string[] args)
        {
            var webCrawler = new WebCrawler("https://www.google.com");
            var robotstxt  = webCrawler.GetRobotsTxt();
            var rules      = RobotsTxtReader.RobotsTxtParser.GetRulesApplyingForAll(robotstxt);
        }