/// <summary>
/// Waits until the back queue reports that enough time has passed for the
/// URL's authority, then builds a <c>Page</c> for the URL by running the
/// parser's HTML, body and path steps in that order.
/// </summary>
/// <param name="backQueue">Queue that is asked whether enough time has passed for the host.</param>
/// <param name="currentUrl">URL the page is created for.</param>
/// <returns>
/// The populated page, or <c>null</c> when the page's <c>Html</c> is null/empty
/// after the HTML step or its <c>SiteText</c> is null/empty after the body step.
/// </returns>
private Page fetchNextPage(BackQueue backQueue, Uri currentUrl)
{
    String authority = currentUrl.GetLeftPart(UriPartial.Authority);

    // Spin (yielding the time slice each round) until the queue allows the host to be hit.
    while (!backQueue.EnoughTimeHasPassed(authority, DateTime.Now))
    {
        Thread.Yield();
    }

    Page page = new Page(currentUrl);

    parser.AddHtmlToPage(page);
    bool htmlMissing = String.IsNullOrEmpty(page.Html);
    if (htmlMissing)
    {
        return null;
    }

    parser.AddBodyToPage(page);
    bool textMissing = String.IsNullOrEmpty(page.SiteText);
    if (textMissing)
    {
        return null;
    }

    // Only pages with both HTML and body text get their paths added.
    parser.AddPathsToPage(page);
    return page;
}
/// <summary>
/// Populates <c>BackQueues</c> from <c>initialSeeds</c>: every seed URL is
/// enqueued on the back queue of its authority (scheme + host + port).
/// </summary>
/// <remarks>
/// Seeds that share an authority are placed on the same queue instead of each
/// getting a queue of their own, matching how
/// <c>addPathToFrontierIfTestsPassed</c> keeps a single queue per domain.
/// </remarks>
private void initialiseSeed()
{
    foreach (Uri seed in initialSeeds)
    {
        String domain = seed.GetLeftPart(UriPartial.Authority);

        // Reuse the queue already registered for this domain, if any.
        BackQueue existing = BackQueues
            .Select(entry => entry.Value)
            .FirstOrDefault(queue => queue.Domain.Equals(domain));

        if (existing is null)
        {
            // Enqueue before publishing so the queue is never visible while empty.
            BackQueue backQueue = new BackQueue(domain);
            backQueue.Enqueue(seed);
            BackQueues.TryAdd(Interlocked.Increment(ref i), backQueue);
        }
        else
        {
            existing.Enqueue(seed);
        }
    }
}
/// <summary>
/// Main crawl loop. Until <c>_webGraph</c> holds <c>_numberOfPages</c> entries it
/// picks a random back queue, peeks its next URL, fetches the page, adds it to
/// the web graph and feeds the page's out-links back into the frontier.
/// </summary>
/// <remarks>
/// The loop spins (via <c>continue</c>) while there is no queue, the chosen queue
/// is empty, the URL is already in the graph, or the fetch returned null.
/// </remarks>
private void NewMethod()
{
    var random = new Random();

    while (_webGraph.Count() < _numberOfPages)
    {
        // Pick a queue by position rather than by key: the keys are handed out by
        // Interlocked.Increment(ref i), which returns the incremented value, so
        // they are not guaranteed to be 0..Count-1 and indexing BackQueues with a
        // random number in that range can miss (KeyNotFoundException).
        var queues = BackQueues.Values;
        if (queues.Count == 0)
        {
            continue;
        }

        BackQueue b = queues.ElementAt(random.Next(queues.Count));

        if (b.Count == 0 || !b.TryPeek(out Uri nextUrl))
        {
            continue;
        }

        // Skip URLs that are already part of the web graph. The graph is copied
        // first, as before — presumably to guard against concurrent mutation;
        // verify against how _webGraph is declared.
        if (_webGraph.ToList().Any(page => page.Url == nextUrl))
        {
            continue;
        }

        //2. Fetch next page from URL in queue
        Page newPage = fetchNextPage(b, nextUrl);
        if (newPage is null)
        {
            continue;
        }

        addToWebGraph(newPage);

        //For each extracted URL
        //• Obey robots.txt (freshness caveat)
        //c. Check that not already in frontier
        // Intentionally left lazy (no ToList): the filter is evaluated while
        // addPathToFrontierIfTestsPassed is enqueueing, so b.Contains is checked
        // against the queue as it grows.
        var paths = newPage.OutLinks.Where(link => b.RobotsAreObeyed(link) && !b.Contains(link));

        //d. Add to frontier if passing tests
        addPathToFrontierIfTestsPassed(b, paths);
    }

    //5. Delete or re-prioritize current URL from queue
    // NOTE(review): the URL is only peeked above and nothing in this method
    // removes it, so unless another member dequeues it the same head URL is
    // returned on every later pick of that queue — confirm and implement step 5.
}
/// <summary>
/// Routes each URL in <paramref name="paths"/> to the back queue of its
/// authority (scheme + host + port): the current queue when the domains match,
/// otherwise an already registered queue for that domain, otherwise a newly
/// created queue that is added to <c>BackQueues</c>.
/// </summary>
/// <param name="b">Queue of the page the links were extracted from.</param>
/// <param name="paths">Links to add; enumerated exactly once.</param>
private void addPathToFrontierIfTestsPassed(BackQueue b, IEnumerable<Uri> paths)
{
    foreach (Uri path in paths)
    {
        String pathDomain = path.GetLeftPart(UriPartial.Authority);

        if (pathDomain.Equals(b.Domain))
        {
            b.Enqueue(path);
            continue;
        }

        // Single pass over the registered queues: avoids scanning twice (Any +
        // First) and the window in which First could throw if the match vanished
        // between the two calls.
        BackQueue existing = BackQueues
            .Select(entry => entry.Value)
            .FirstOrDefault(queue => queue.Domain.Equals(pathDomain));

        if (existing is null)
        {
            // First link seen for this domain: create its queue and register it
            // under the next key.
            BackQueue newQueue = new BackQueue(pathDomain);
            newQueue.Enqueue(path);
            BackQueues.TryAdd(Interlocked.Increment(ref i), newQueue);
        }
        else
        {
            existing.Enqueue(path);
        }
    }
}