/// <summary>
/// Downloads a set of URIs in parallel using the ThreadPool and then blocks in
/// Utils.WaitAll on one reset event per child.
/// </summary>
/// <param name="parentRequest">
/// Root request. Its ResetEvents array is replaced with a new array holding one
/// event per entry of <paramref name="childObjects"/>.
/// </param>
/// <param name="childObjects">Children URIs to be downloaded.</param>
/// <returns>List of the child requests that were queued for download.</returns>
private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<Uri> childObjects)
{
    LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();
    ManualResetEvent[] resetEvents = new ManualResetEvent[childObjects.Count];
    parentRequest.ResetEvents = resetEvents;

    if (resetEvents.Length == 0)
    {
        // Nothing to download, hence nothing to wait for.
        return addedObjects;
    }

    // Walk the linked list exactly once; indexing it with ElementAt(i) would
    // restart from the head on every iteration.
    int nextChildNumber = 0;
    foreach (Uri childUri in childObjects)
    {
        int childNumber = nextChildNumber++;

        // Create the event before anything that can fail, so the array never
        // contains a null slot when it is handed to Utils.WaitAll.
        resetEvents[childNumber] = new ManualResetEvent(false);

        try
        {
            // create the RCRequest for the object
            RCRequest currChildObject = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(childUri));
            currChildObject.ChildNumber = childNumber;
            // Set the root request.
            currChildObject.RootRequest = parentRequest;
            // The child shares the parent's event array; ChildNumber is its slot.
            currChildObject.ResetEvents = resetEvents;

            // download the page
            ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), (object)currChildObject);
            addedObjects.AddLast(currChildObject);
        }
        catch (Exception)
        {
            // Best effort: a child that cannot be created or queued (e.g. a
            // non-HTTP URI failing the HttpWebRequest cast) is skipped. Its
            // event is signalled so the wait below is not held up by a
            // download that will never run, and the remaining children are
            // still processed.
            resetEvents[childNumber].Set();
        }
    }

    try
    {
        // wait for timeout
        Utils.WaitAll(resetEvents);
    }
    catch (Exception)
    {
        // Best effort, as before: a failing wait must not discard the
        // children that were already queued.
    }

    // NOTE(review): the ManualResetEvents are never disposed here. Disposing
    // them right after the wait looks unsafe if Utils.WaitAll can return on a
    // timeout while workers still hold the array — confirm before changing.
    return addedObjects;
}
/// <summary>
/// This method works very similar to RemoteRequestHandler.DownloadPageRecursively.
/// It downloads a page to the local cache (asking DownloadToCache to index it) and,
/// when the cached file is an HTML file, also downloads its embedded objects.
/// Pages that are already cached are skipped.
/// The thread pool event is signalled on every exit path, including failures.
/// </summary>
/// <param name="uri">The URI to download.</param>
private void DownloadPage(string uri)
{
    try
    {
        // create the main RCRequest
        RCRequest rcRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
        rcRequest.GenericWebRequest.Timeout = CRAWLER_PAGE_TIMEOUT;

        // Only download for not already existing items
        if (_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
        {
            return;
        }

        // Download!
        try
        {
            _proxy.WaitForAdmissionControlAndAddActiveRequest(rcRequest.RequestId);
            // Index main pages
            rcRequest.DownloadToCache(true);
        }
        catch (Exception)
        {
            // Ignore: crawling is best effort, a failed page is simply skipped.
            return;
        }
        finally
        {
            _proxy.RemoveActiveRequest();
        }

        // Getting embedded objects only makes sense for html pages.
        if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
        {
            return;
        }

        Uri baseUri = new Uri(rcRequest.Uri);
        string rawContent = Utils.ReadFileAsString(rcRequest.CacheFileName);
        if (rawContent == null)
        {
            // Nothing readable in the cache file, so there is nothing to parse.
            return;
        }

        // Lower-case with the invariant culture so the result does not depend
        // on the machine's locale (e.g. Turkish dotless i).
        // NOTE(review): lower-casing also lower-cases URLs inside the markup;
        // confirm HtmlUtils.ExtractEmbeddedObjects really needs this.
        string htmlContent = rawContent.ToLowerInvariant();

        // get the embedded content of the search result page
        DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent);
    }
    finally
    {
        // Notify that new threads are available in the pool. This sits in a
        // finally block so that the early returns above and any exception
        // still signal the event.
        lock (_threadPoolEvent)
        {
            _threadPoolEvent.Set();
        }
    }
}
/// <summary>
/// Extracts the embedded objects of a page via HtmlUtils.ExtractEmbeddedObjects
/// and downloads them in parallel.
/// </summary>
/// <param name="rcRequest">Request of the page the embedded objects belong to; used as the parent request.</param>
/// <param name="baseUri">The URI of the website, passed to the extractor together with the content.</param>
/// <param name="htmlContent">The HTML content of the website.</param>
/// <returns>The list of RCRequests returned by DownloadObjectsInParallel for the embedded objects.</returns>
private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, Uri baseUri, string htmlContent)
{
    // Extraction result is handed straight to the parallel downloader,
    // with the page's own request acting as the parent.
    return DownloadObjectsInParallel(
        rcRequest,
        HtmlUtils.ExtractEmbeddedObjects(baseUri, htmlContent));
}