Beispiel #1
0
        /// <summary>
        /// Downloads a set of URIs in parallel using the ThreadPool and then waits on the
        /// per-child events via <c>Utils.WaitAll</c>.
        /// </summary>
        /// <remarks>
        /// One <see cref="ManualResetEvent"/> per child URI is stored in
        /// <c>parentRequest.ResetEvents</c>, and the same array is handed to every child request.
        /// A child whose request cannot be created or queued is skipped and its event is
        /// signalled immediately, so the remaining children are still queued and waited for.
        /// </remarks>
        /// <param name="parentRequest">Root request.</param>
        /// <param name="childObjects">Children URIs to be downloaded.</param>
        /// <returns>List of the child requests that were queued for download.</returns>
        private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<Uri> childObjects)
        {
            LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();
            parentRequest.ResetEvents = new ManualResetEvent[childObjects.Count];

            // Enumerate the linked list directly: ElementAt(i) restarts from the head on
            // every call, which made the original index loop quadratic.
            int childNumber = 0;
            foreach (Uri childUri in childObjects)
            {
                // Fill the slot before anything can fail so the array never contains null.
                ManualResetEvent childDone = new ManualResetEvent(false);
                parentRequest.ResetEvents[childNumber] = childDone;

                try
                {
                    // create the RCRequest for the object
                    RCRequest currChildObject = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(childUri));
                    currChildObject.ChildNumber = childNumber;
                    // Set the root request.
                    currChildObject.RootRequest = parentRequest;
                    // Share the event array with the child.
                    currChildObject.ResetEvents = parentRequest.ResetEvents;

                    // download the page on a worker thread
                    ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), (object)currChildObject);
                    addedObjects.AddLast(currChildObject);
                }
                catch (Exception)
                {
                    // Best effort, as before: a child that cannot be set up (e.g. a non-HTTP
                    // URI failing the HttpWebRequest cast) is skipped. Its event is signalled
                    // so the wait below does not block on a download that was never started.
                    childDone.Set();
                }

                childNumber++;
            }

            try
            {
                // wait for the queued downloads (or the timeout applied by Utils.WaitAll)
                Utils.WaitAll(parentRequest.ResetEvents);
            }
            catch (Exception)
            {
                // Deliberately ignored, matching the original best-effort behaviour.
            }

            return addedObjects;
        }
Beispiel #2
0
        /// <summary>
        /// Downloads a page to the local cache and, if the cached file is HTML, also downloads
        /// its embedded objects. According to the original note this works very similar to
        /// RemoteRequestHandler.DownloadPageRecursively.
        /// </summary>
        /// <remarks>
        /// URIs that are already cached are not downloaded again. Download errors are swallowed.
        /// <c>_threadPoolEvent</c> is signalled on every exit path (already cached, download
        /// failure, non-HTML content, exception), so a waiter is always notified when this
        /// method finishes.
        /// </remarks>
        /// <param name="uri">The URI to download.</param>
        private void DownloadPage(string uri)
        {
            try
            {
                // create the main RCRequest
                RCRequest rcRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
                rcRequest.GenericWebRequest.Timeout = CRAWLER_PAGE_TIMEOUT;

                // Only download for not already existing items
                if (_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
                {
                    return;
                }

                // Download!
                try
                {
                    _proxy.WaitForAdmissionControlAndAddActiveRequest(rcRequest.RequestId);
                    // Index main pages (the 'true' argument, per the original comment)
                    rcRequest.DownloadToCache(true);
                }
                catch (Exception)
                {
                    // Ignore
                    return;
                }
                finally
                {
                    // NOTE(review): this also runs when admission control itself threw;
                    // confirm RemoveActiveRequest is safe in that case.
                    _proxy.RemoveActiveRequest();
                }

                // Getting embedded objects only makes sense for html pages.
                if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
                {
                    return;
                }

                Uri baseUri = new Uri(rcRequest.Uri);
                // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless
                // 'ı' under Turkish-style cultures, which would corrupt tag/attribute names.
                string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLowerInvariant();

                // get the embedded content of the page
                DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent);
            }
            finally
            {
                // Notify that new threads are available in the pool. This lives in a finally
                // block because the early returns above previously skipped the notification.
                lock (_threadPoolEvent)
                {
                    _threadPoolEvent.Set();
                }
            }
        }
Beispiel #3
0
 /// <summary>
 /// Extracts the embedded objects referenced by a page's HTML and downloads them in parallel.
 /// </summary>
 /// <param name="rcRequest">Request of the page the embedded objects belong to; passed on as the parent request.</param>
 /// <param name="baseUri">The Uri of the website, handed to the extractor together with the HTML.</param>
 /// <param name="htmlContent">The HTML content of the website.</param>
 /// <returns>The list of RCRequests returned by DownloadObjectsInParallel for the extracted objects.</returns>
 private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, Uri baseUri, string htmlContent)
 {
     // Extraction and download are chained; the extracted URI list is not needed afterwards.
     return DownloadObjectsInParallel(
         rcRequest,
         HtmlUtils.ExtractEmbeddedObjects(baseUri, htmlContent));
 }