/// <summary>
/// Asks the shared host policy (stored under key 0) to mark the supplied value for timeout.
/// Does nothing when no policy is registered under that key.
/// </summary>
/// <param name="webpagePolicy">
/// Value handed unchanged to <c>markHostForTimeout</c> on the shared policy.
/// </param>
public void tryMarkForHostTimeout(string webpagePolicy)
{
    WebHostPolicy sharedPolicy;

    // bail out early when the single shard has no policy registered
    if (!singleShardHostPolicy.TryGetValue(0, out sharedPolicy))
    {
        return;
    }

    sharedPolicy.markHostForTimeout(webpagePolicy);
}
/// <summary>
/// Builds the shared state: the URL collection, the context info, the visited-URL
/// structures, and a host-policy map pre-populated with one policy under key 0.
/// </summary>
/// <param name="usingQueue">Forwarded to the <c>WebPageUrlCollection</c> constructor.</param>
public SharedSearchContextState(bool usingQueue)
{
    webpageUrlQueue = new WebPageUrlCollection(usingQueue);
    contextInfo = new SharedSearchContextInfo();

    // bookkeeping for URLs that were already visited
    visitedWebpageUrlHashes = new ConcurrentDictionary<int, byte>();
    visitedWebpageUrls = new Filter<string>(ApplicationConstants.GREATER_STORAGE_LIMIT);

    // the map holds exactly one policy, registered under key 0
    var hostPolicies = new ConcurrentDictionary<byte, WebHostPolicy>();
    singleShardHostPolicy = hostPolicies;
    hostPolicies[0] = new WebHostPolicy();
}
/// <summary>
/// Resolves the host for <paramref name="webpageUrl"/> through the shared host policy
/// (stored under key 0) and, when a host is returned, passes it to the policy's
/// <c>handle</c> method. Does nothing when no policy is registered or no host is returned.
/// </summary>
/// <param name="webpageUrl">URL whose host is looked up via <c>getHost</c>.</param>
public void tryHandlePotentialTimeout(string webpageUrl)
{
    WebHostPolicy sharedPolicy;
    if (!singleShardHostPolicy.TryGetValue(0, out sharedPolicy))
    {
        return;
    }

    string host = sharedPolicy.getHost(webpageUrl);

    // getHost can yield null (the worker-side handlePotentialTimeout guards against it),
    // so only hand a real host to the policy
    if (host != null)
    {
        sharedPolicy.handle(host);
    }
}
/// <summary>
/// Checks the given policy for a host matching <paramref name="webpageUrl"/>; when one is
/// returned, raises the shared thread-sleep counter, lets the policy handle the host, marks
/// the shared context for timeout, and lowers the counter again. The secondary display is
/// refreshed after the counter is raised and again after it is lowered.
/// </summary>
/// <param name="policy">Policy queried via <c>getHost</c> and asked to <c>handle</c> the host.</param>
/// <param name="utils">Used to build the secondary display view.</param>
/// <param name="webpageUrl">URL about to be crawled.</param>
private void handlePotentialTimeout(WebHostPolicy policy, WebUtils utils, string webpageUrl)
{
    string host = policy.getHost(webpageUrl);
    if (host == null)
    {
        // no host returned for this URL, nothing to handle
        return;
    }

    sharedSearchContext.getContextInfo().incrementThreadSleepCounter();
    try
    {
        secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));

        // presumably waits out the host's timeout - confirm against WebHostPolicy.handle
        policy.handle(host);

        // NOTE(review): this passes the full URL, while addOrUpdatePolicy passes a host to the
        // same method - confirm which form markHostForTimeout expects
        sharedSearchContext.tryMarkForHostTimeout(webpageUrl);
    }
    finally
    {
        // always undo the increment so a failure above cannot leave the counter raised
        sharedSearchContext.getContextInfo().decrementThreadSleepCounter();
        secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));
    }
}
/// <summary>
/// Runs the crawl loop: seeds the queue when it is empty, then repeatedly dequeues a URL,
/// crawls it, builds a <c>WebPage</c> from the results, queues newly found URLs, sends the
/// page to the hub, and updates the shared counters and both display queues. The loop ends
/// when the traversal limit is reached, <c>callable</c> is false, or the queue is empty.
/// </summary>
private void explore()
{
    var webUtils = new WebUtils();
    var webCache = new WebCache();
    var webCrawler = new WebCrawler();
    var localPolicy = new WebHostPolicy();
    var webProcessor = new WebProcessor(configuredSettings);
    var threadSleeper = new ThreadSleeper(5000); // presumably milliseconds - confirm against ThreadSleeper

    // seed the queue from the starting URL when nothing is queued yet
    if (sizeOfQueue() < 1)
    {
        initQueue(webCache, currentUrl);
    }

    while (amountOfWebpageUrlsTraversed() < maxPageSearchLimit && callable && !isQueueEmpty())
    {
        string nextUrl = dequeueWebpageUrl(traversalStyle, webCache);

        // skip anything that is not a valid webpage URL
        if (!webUtils.isValidWebpageURL(nextUrl))
        {
            continue;
        }

        // skip URLs that were already visited
        if (hasWebpageUrlBeenVisied(nextUrl))
        {
            continue;
        }

        // give the local policy a chance to handle a timeout for this URL's host
        handlePotentialTimeout(localPolicy, webUtils, nextUrl);

        // a false result from the crawler means the URL is unsupported
        bool crawled = webCrawler.tryCrawl(nextUrl);
        if (!crawled)
        {
            continue;
        }

        setWebpageUrlAsVisited(nextUrl);

        // collect everything the crawler gathered for this page
        Queue<string> pageTexts = webCrawler.releaseTexts();
        Queue<string> linkedPageUrls = webCrawler.releaseWebpages();
        Queue<string> linkedImageUrls = webCrawler.releaseImages();
        string pageHost = webCrawler.releaseHost();

        // build the page model, then run the basic text filter
        WebPage webPage = webProcessor.constructWebsite(
            pageTexts, linkedPageUrls, linkedImageUrls, nextUrl, pageHost);
        webProcessor.tryBasicFilter(pageTexts);

        // feed the newly discovered URLs back into the queue
        addWebpageUrlsToQueue(webCache, webPage, linkedPageUrls, linkedImageUrls);

        // publish the page to the hub
        sendToHub(webPage);

        // update the shared counters
        sharedSearchContext.getContextInfo().addToThreadScore(
            contextualId, webPage.getSearchPhraseCount());
        sharedSearchContext.getContextInfo().incrementUrlsTraversed();

        // refresh the primary and secondary displays for the end user
        mainDisplayQueue.Enqueue(webUtils.createPrimaryDisplayView(webPage, contextualId));
        secondaryDisplayQueue.Enqueue(webUtils.createSecondaryDisplayView(sharedSearchContext));

        // mark the host for timeout locally and on the shared context
        addOrUpdatePolicy(localPolicy, pageHost);

        threadSleeper.trySleeping();
    }

    // one last secondary-display refresh after the loop ends
    secondaryDisplayQueue.Enqueue(webUtils.createSecondaryDisplayView(sharedSearchContext));
}
/// <summary>
/// Marks the given host for timeout on the supplied policy and then on the shared
/// search context.
/// </summary>
/// <param name="policy">Policy on which <c>markHostForTimeout</c> is called first.</param>
/// <param name="webpageHost">Host value passed to both calls.</param>
private void addOrUpdatePolicy(WebHostPolicy policy, string webpageHost)
{
    // supplied policy first
    policy.markHostForTimeout(webpageHost);

    // then the shared context
    sharedSearchContext.tryMarkForHostTimeout(webpageHost);
}