public async Task OnStartCrawling(object param)
{
    ClearCrawlerTree();

    CrawlerInputParser inputParser = new CrawlerInputParser();
    List<Uri> rootResources = inputParser.Parse(ViewModelsMediator.Instance.SourceFilePath);

    if (rootResources != null)
    {
        StartBtnEnabled = false;
        StopBtnEnabled = true;

        // Init crawler
        WebCrawler.WebCrawler crawler = new WebCrawler.WebCrawler();
        crawler.MaxDepth = crawlingDepth;
        crawler.Logger = LoggerViewModel.Instance;
        crawler.LoadingFinished += OnPageLoadingFinished;

        foreach (var rootUri in rootResources)
        {
            WebCrawlerOutput crawlerOutput = await crawler.PerformCrawlingAsync(rootUri, 0, -1);
        }

        StartBtnEnabled = true;
        StopBtnEnabled = false;
    }
}
public static void Main(string[] args)
{
    WebCrawler crawler = new WebCrawler();
    crawler.urlList.Add("http://tw.msn.com/");
    crawler.craw();
}
static void Main(string[] args)
{
    WebCrawler wc = new WebCrawler();
    WebClient wbc = new WebClient();

    // Fetch robots.txt and collect the rules that apply to all user agents ("*")
    string src = wbc.DownloadString("http://www.youtube.com/robots.txt");
    string[] lines = src.Split('\n');
    List<String> disallowedSites = wc.GetDisallowedSites(lines, "*");

    // Seed the frontier with the links found on the start page
    List<String> frontier = wc.FetchUrlsFromSource("http://www.youtube.com");

    int x = 5; // breakpoint anchor for inspecting the results in a debugger
}
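// Hedged sketch of what a GetDisallowedSites-style helper might do (the
// original implementation is not shown, and GetDisallowedPaths is a
// hypothetical name): walk the robots.txt lines, track the current
// User-agent section, and collect the Disallow paths for the given agent.
using System;
using System.Collections.Generic;

static class RobotsTxt
{
    public static List<string> GetDisallowedPaths(string[] lines, string userAgent)
    {
        var disallowed = new List<string>();
        bool inMatchingSection = false;
        foreach (string raw in lines)
        {
            string line = raw.Trim();
            if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
            {
                inMatchingSection = line.Substring("User-agent:".Length).Trim() == userAgent;
            }
            else if (inMatchingSection && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
            {
                string path = line.Substring("Disallow:".Length).Trim();
                if (path.Length > 0)
                    disallowed.Add(path);
            }
        }
        return disallowed;
    }
}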
private double[] getTF_IDF(WebCrawler.Index.DocumentReference[] documents, int docCount)
{
    double N = docCount;
    double[] values = new double[documents.Length];

    for (int i = 0; i < documents.Length; i++)
    {
        Document d = documents[i].Document;
        int c = documents[i].Count;

        // Accumulate squared term counts per document (used later for length normalization)
        if (!lengths.ContainsKey(d))
            lengths.Add(d, c * c);
        else
            lengths[d] += c * c;

        // tf-idf weight: (1 + log10 tf) * log10(N / df), with df = documents.Length
        values[i] = (1 + Math.Log10(c)) * Math.Log10(N / documents.Length);
    }

    return values;
}
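// Worked example of the weight above, assuming documents.Length is the number
// of documents containing the term (its document frequency df): with N = 10,
// tf = c = 3, and df = 2, the weight is
// (1 + log10 3) * log10(10 / 2) ≈ 1.477 * 0.699 ≈ 1.03.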
private void FindUrl()
{
    Console.Write("Enter a URL: ");
    _userUrl = Console.ReadLine().Trim();

    bool hasSetMaximumLinkAmount = false;
    int maximumLinkAmount = 0;
    do
    {
        Console.Write("Enter the amount of links to visit: ");
        bool isNumber = int.TryParse(Console.ReadLine().Trim(), out int outMaximumLinkAmount);
        if (isNumber)
        {
            hasSetMaximumLinkAmount = true;
            maximumLinkAmount = outMaximumLinkAmount;
        }
        else
        {
            Console.WriteLine("The value you inserted was not a number.");
        }
    } while (!hasSetMaximumLinkAmount);

    Console.WriteLine("Crawling...");
    Console.WriteLine();

    WebCrawler webCrawler = new WebCrawler();
    webCrawler.Start(_userUrl, maximumLinkAmount);
    Console.WriteLine($"---Found web page: {_userUrl}---");

    Queue<Uri> results = webCrawler.GetResultUrls();
    foreach (Uri url in results)
    {
        Console.WriteLine(url);
    }
    Console.WriteLine($"*Total found links: {results.Count}");
}
static void Main(string[] args)
{
    List<Uri> uris = new List<Uri>()
    {
        new Uri("https://www.rbc.ru/"),
        new Uri("https://habrahabr.ru/"),
        new Uri("https://zr.ru/"),
        new Uri("https://youtube.com/"),
        new Uri("https://rp5.ru/"),
    };

    List<WebCrawlerItem> crawlerItems = new List<WebCrawlerItem>();
    foreach (var uri in uris)
    {
        crawlerItems.Add(new WebCrawlerItem(uri, new DomainCrawlerConfiguration()));
    }

    var crawler = new WebCrawler.WebCrawler(new CrawlerConfiguration());
    crawler.StartCrawlingAsync(crawlerItems);

    // Keep the process alive while the crawl runs in the background
    Console.ReadLine();
}
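// A minimal sketch of an awaited variant, assuming StartCrawlingAsync returns
// a Task (the snippet above fires it without awaiting and keeps the process
// alive with Console.ReadLine instead). Requires C# 7.1+ for async Main.
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

class Program
{
    static async Task Main(string[] args)
    {
        var crawler = new WebCrawler.WebCrawler(new CrawlerConfiguration());
        var items = new List<WebCrawlerItem>
        {
            new WebCrawlerItem(new Uri("https://habrahabr.ru/"), new DomainCrawlerConfiguration()),
        };

        // Returns only once the crawl itself has completed
        await crawler.StartCrawlingAsync(items);
    }
}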
static void Main(string[] args)
{
    String indexPath = @"C:\Users\Brandon\Desktop\Multimedia Retrieval\W3 Files\Index";

    // Analyzers build token streams which analyze text
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(indexPath, analyzer, true);

    // Set the seedUrl and initialize the crawler
    String seedUrl = "http://sydney.edu.au/engineering/it/";
    WebCrawler crawler = new WebCrawler();
    Queue<String> linkQueue = new Queue<String>();
    linkQueue.Enqueue(seedUrl);
    HashSet<String> linkSet = new HashSet<String>();
    Console.Write("Sites Explored: 0");

    // Iteratively extract links from the first URL in the frontier
    // and add its content to the index
    while (linkQueue.Count != 0 && linkSet.Count < 50)
    {
        String currentLink = linkQueue.Dequeue();
        try
        {
            if (linkSet.Contains(currentLink))
            {
                continue;
            }
            String content = crawler.getUrlContent(currentLink);
            crawler.getLinks(linkQueue, content, currentLink);
            linkSet.Add(currentLink);

            Document doc = new Document();
            doc.Add(new Field("link", currentLink, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);

            Console.Write("\rSites Explored: {0}", linkSet.Count);
        }
        catch (Exception)
        {
            continue;
        }
    }
    writer.Optimize();
    writer.Close();
    Console.WriteLine();

    // Execute the search
    String search = "suits";
    QueryParser parser = new QueryParser("content", analyzer);
    Query query = parser.Parse(search);
    var searcher = new IndexSearcher(indexPath);
    Hits hits = searcher.Search(query);
    int results = hits.Length();
    Console.WriteLine("Found {0} results for \"{1}\"", results, search);
    for (int i = 0; i < results; i++)
    {
        Document doc = hits.Doc(i);
        float score = hits.Score(i);
        Console.WriteLine("Result num {0}, score {1}", i + 1, score);
        Console.WriteLine("URL: {0}", doc.Get("link"));
    }
}
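// The snippet above targets the legacy Lucene.Net 2.x API (path-based
// IndexWriter, Hits, writer.Optimize()). A minimal sketch of the same
// index-and-search flow against Lucene.Net 4.8, where TopDocs/ScoreDocs
// replace the old Hits API; the field names and example values are
// placeholders, not taken from the original:
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;

static class SearchSketch
{
    public static void Run(string indexPath)
    {
        var dir = FSDirectory.Open(indexPath);
        var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);

        // Index a single example document
        using (var writer = new IndexWriter(dir, new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)))
        {
            var doc = new Document();
            doc.Add(new StringField("link", "http://example.com/", Field.Store.YES));
            doc.Add(new TextField("content", "example page content", Field.Store.YES));
            writer.AddDocument(doc);
        }

        // Search the index and print scored hits
        using (var reader = DirectoryReader.Open(dir))
        {
            var searcher = new IndexSearcher(reader);
            var parser = new QueryParser(LuceneVersion.LUCENE_48, "content", analyzer);
            TopDocs top = searcher.Search(parser.Parse("example"), 10);
            foreach (ScoreDoc sd in top.ScoreDocs)
            {
                Document hit = searcher.Doc(sd.Doc);
                System.Console.WriteLine("{0}: {1}", sd.Score, hit.Get("link"));
            }
        }
    }
}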
static void Main(string[] args)
{
    var webCrawler = new WebCrawler("https://www.google.com");
    var robotstxt = webCrawler.GetRobotsTxt();
    var rules = RobotsTxtReader.RobotsTxtParser.GetRulesApplyingForAll(robotstxt);
}
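// Hypothetical follow-up (the shape of the parsed rules above is not shown):
// once disallowed path prefixes have been extracted from robots.txt, a crawler
// would typically filter candidate URLs against them before fetching.
using System;
using System.Collections.Generic;
using System.Linq;

static class RobotsFilter
{
    // True if no disallowed path prefix matches the URL's path
    public static bool IsAllowed(Uri url, IEnumerable<string> disallowedPaths) =>
        !disallowedPaths.Any(p => url.AbsolutePath.StartsWith(p, StringComparison.Ordinal));
}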