        /// <summary>
        /// Attempts to mark the given host for timeout in the shared (global)
        /// host policy.
        /// </summary>
        /// <param name="webpageHost">Host (or full URL) whose host should be marked for timeout.</param>
        public void tryMarkForHostTimeout(string webpageHost)
        {
            WebHostPolicy policy = null;

            if (singleShardHostPolicy.TryGetValue(0, out policy))
            {
                policy.markHostForTimeout(webpageHost);
            }
        }
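A minimal usage sketch, assuming a caller holds the SharedSearchContextState instance constructed below; the variable name and host string are illustrative, not from the original source:

    // Hypothetical caller: mark a misbehaving host so that every worker
    // thread backs off from it on its next policy check.
    SharedSearchContextState shared = new SharedSearchContextState(true);
    shared.tryMarkForHostTimeout("example.com");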
        public SharedSearchContextState(bool usingQueue)
        {
            webpageUrlQueue          = new WebPageUrlCollection(usingQueue);
            contextInfo              = new SharedSearchContextInfo();
            visitedWebpageUrlHashes  = new ConcurrentDictionary<int, byte>();
            visitedWebpageUrls       = new Filter<string>(ApplicationConstants.GREATER_STORAGE_LIMIT);
            singleShardHostPolicy    = new ConcurrentDictionary<byte, WebHostPolicy>();
            singleShardHostPolicy[0] = new WebHostPolicy();
        }
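A minimal sketch of how this shared state might be wired up once and handed to several workers; the Searcher type and its constructor signature are assumptions made for illustration:

    // Hypothetical wiring: one shared context, many workers.
    SharedSearchContextState shared = new SharedSearchContextState(true);
    Searcher workerA = new Searcher(shared);
    Searcher workerB = new Searcher(shared);
    // Both workers now observe the same visited-URL filter, counters,
    // and single-shard host policy created in the constructor above.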
        /// <summary>
        /// Checks the shared host policy for the host of the given URL and lets
        /// the policy handle a potential timeout for the current thread.
        /// </summary>
        /// <param name="webpageUrl">URL whose host is checked against the shared policy.</param>
        public void tryHandlePotentialTimeout(string webpageUrl)
        {
            WebHostPolicy policy = null;

            if (singleShardHostPolicy.TryGetValue(0, out policy))
            {
                string host = policy.getHost(webpageUrl);
                policy.handle(host);
            }
        }
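A hypothetical pre-fetch call, assuming policy.handle pauses the current thread when the URL's host has been marked for timeout (the handle semantics are an inference from the surrounding code, which is not shown in this excerpt):

    // Check the shared policy before crawling; if the host was marked,
    // this call is expected to sleep the current thread.
    shared.tryHandlePotentialTimeout("https://example.com/page");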
Example #4
        /// <summary>
        /// Handles a potential timeout for the given URL: while the policy
        /// sleeps this thread, the shared sleep counter and secondary display
        /// are updated so other threads and the UI can observe the pause.
        /// </summary>
        private void handlePotentialTimeout(WebHostPolicy policy, WebUtils utils, string webpageUrl)
        {
            string host = policy.getHost(webpageUrl);

            if (host != null)
            {
                sharedSearchContext.getContextInfo().incrementThreadSleepCounter();
                secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));
                policy.handle(host);
                sharedSearchContext.tryMarkForHostTimeout(webpageUrl);
                sharedSearchContext.getContextInfo().decrementThreadSleepCounter();
                secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));
            }
        }
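WebHostPolicy itself is not part of this excerpt. A plausible sketch of its markHostForTimeout/handle pair, consistent with how the class is called above; the field names and the timeout length are assumptions, and getHost (URL-to-host extraction) is omitted:

    using System.Collections.Concurrent;
    using System.Threading;

    public class WebHostPolicy
    {
        // Assumed fields: hosts currently marked for timeout and the pause length.
        private readonly ConcurrentDictionary<string, byte> markedHosts =
            new ConcurrentDictionary<string, byte>();
        private const int TIMEOUT_MILLISECONDS = 5000;

        public void markHostForTimeout(string host)
        {
            if (host != null)
            {
                markedHosts[host] = 1;
            }
        }

        // Sleep the calling thread once if its target host was marked,
        // then clear the mark so the host is not throttled forever.
        public void handle(string host)
        {
            byte unused;
            if (host != null && markedHosts.TryRemove(host, out unused))
            {
                Thread.Sleep(TIMEOUT_MILLISECONDS);
            }
        }
    }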
Example #5
        /// <summary>
        /// Explores webpages starting from the current URL: crawls, processes,
        /// and displays each page until the page-search limit is reached, the
        /// search is cancelled, or the URL queue is empty.
        /// </summary>
        private void explore()
        {
            WebUtils      utils     = new WebUtils();
            WebCache      cache     = new WebCache();
            WebCrawler    crawler   = new WebCrawler();
            WebHostPolicy policy    = new WebHostPolicy();
            WebProcessor  processor = new WebProcessor(configuredSettings);
            ThreadSleeper sleeper   = new ThreadSleeper(5000);

            // initialize the queue if it has not been created yet
            if (sizeOfQueue() < 1)
            {
                initQueue(cache, currentUrl);
            }

            // traverse while the visited-URL count is below the limit, the search is still callable, and the URL queue is not empty
            while (amountOfWebpageUrlsTraversed() < maxPageSearchLimit && callable && !isQueueEmpty())
            {
                string currentWebpageUrl = dequeueWebpageUrl(traversalStyle, cache);

                // ensure the url is valid and has not been visited already
                if (!utils.isValidWebpageURL(currentWebpageUrl) || hasWebpageUrlBeenVisited(currentWebpageUrl))
                {
                    continue;
                }

                // try to timeout checking shared state and current thread
                handlePotentialTimeout(policy, utils, currentWebpageUrl);

                // if the crawl returns false, then it is an unsupported url
                if (!crawler.tryCrawl(currentWebpageUrl))
                {
                    continue;
                }

                setWebpageUrlAsVisited(currentWebpageUrl);

                // Retrieve all the texts found by the crawler
                Queue <string> texts              = crawler.releaseTexts();
                Queue <string> webpageUrls        = crawler.releaseWebpages();
                Queue <string> imageUrls          = crawler.releaseImages();
                string         currentWebpageHost = crawler.releaseHost();

                // construct the webpage model from the crawled texts, links, and
                // images, then apply the basic text filter where configured
                WebPage page = processor.constructWebsite(texts, webpageUrls, imageUrls, currentWebpageUrl, currentWebpageHost);
                processor.tryBasicFilter(texts);

                // handles the cache-to-context communication for the newly discovered site URLs
                addWebpageUrlsToQueue(cache, page, webpageUrls, imageUrls);
                // enqueue the website to the hub
                sendToHub(page);

                // Update the state object
                sharedSearchContext.getContextInfo().addToThreadScore(contextualId, page.getSearchPhraseCount());
                sharedSearchContext.getContextInfo().incrementUrlsTraversed();

                // construct the display for the end user
                mainDisplayQueue.Enqueue(utils.createPrimaryDisplayView(page, contextualId));

                // construct the secondary display for the end user
                secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));

                // try to mark the current host for timeout across all threads
                addOrUpdatePolicy(policy, currentWebpageHost);
                sleeper.trySleeping();
            }
            secondaryDisplayQueue.Enqueue(utils.createSecondaryDisplayView(sharedSearchContext));
        }
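ThreadSleeper is also not shown in this excerpt; a minimal sketch of what ThreadSleeper(5000).trySleeping() plausibly does, assuming it only paces the crawl loop between iterations:

    using System.Threading;

    public class ThreadSleeper
    {
        private readonly int milliseconds;

        public ThreadSleeper(int milliseconds)
        {
            this.milliseconds = milliseconds;
        }

        // Pause the calling thread for the configured interval so the
        // crawler does not hammer hosts on consecutive iterations.
        public void trySleeping()
        {
            try
            {
                Thread.Sleep(milliseconds);
            }
            catch (ThreadInterruptedException)
            {
                // Interrupted: return and let the crawl loop continue.
            }
        }
    }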
Example #6
        /// <summary>
        /// Marks the given host for timeout in both this thread's local policy
        /// and the shared search context, so every thread backs off the host.
        /// </summary>
        private void addOrUpdatePolicy(WebHostPolicy policy, string webpageHost)
        {
            policy.markHostForTimeout(webpageHost);
            sharedSearchContext.tryMarkForHostTimeout(webpageHost);
        }