/// <summary>
/// Recursively downloads a page, its embedded objects, and the pages it links to,
/// packing everything that was fetched into the package.
/// </summary>
/// <param name="rcRequest">Requested page to start from.</param>
/// <param name="richness">Richness setting, forwarded to the embedded object download.</param>
/// <param name="depth">Current recursion depth; error pages are only sent when this is 0.</param>
/// <returns>
/// Whether the page was handled successfully: false if the request was aborted, the quota is
/// below the low watermark, or the download threw; true otherwise. The results of packing and
/// of the recursive calls do not influence the return value.
/// </returns>
public bool RecursivelyDownloadPage(RCRequest rcRequest, Richness richness, int depth)
{
    // Error pages are only sent to the client from the top-level call.
    bool isTopLevel = depth == 0;

    if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
    {
        if (isTopLevel)
        {
            SendErrorPage(HttpStatusCode.InternalServerError, "Request aborted or it does not fit in quota.");
        }
        return false;
    }

    // Reduce the timeout to whatever is left of the overall request budget (0 once it is used up).
    // NOTE(review): the timeout is set on RCRequest.GenericWebRequest, not on the rcRequest
    // parameter that is downloaded below - confirm that this is the intended target.
    DateTime currTime = DateTime.Now;
    DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
    TimeSpan remaining = endTime.Subtract(currTime);
    RCRequest.GenericWebRequest.Timeout = remaining > TimeSpan.Zero
        ? (int)remaining.TotalMilliseconds
        : 0;

    // Only download for POST/... or items that are not in the cache yet
    if (!IsGetOrHeadHeader() || !_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
    {
        try
        {
            // There is no index on the remote side anyway
            rcRequest.DownloadToCache(false);
        }
        catch (Exception e)
        {
            Logger.Warn("[depth = " + depth + "] error downloading: " + rcRequest.Uri + " " + e.Message);
            if (isTopLevel)
            {
                // Forward the upstream HTTP status when the failure carries one,
                // otherwise fall back to a generic 500.
                WebException webException = e as WebException;
                HttpWebResponse response = webException != null
                    ? webException.Response as HttpWebResponse
                    : null;
                SendErrorPage(
                    response != null ? response.StatusCode : HttpStatusCode.InternalServerError,
                    e.Message);
            }
            return false;
        }
    }
    else
    {
        Logger.Debug("Already existed: " + rcRequest.Uri);
    }

    // Add to the package
    if (_package.Pack(this, rcRequest, ref _quota))
    {
        Logger.Debug("[depth = " + depth + "] packed: " + rcRequest.Uri + " "
            + rcRequest.FileSize + " bytes, " + _quota + " left");
    }

    // Add a new request for the old location if it was redirected. This will then
    // get the 301 file from the cache, so the local proxy does not need to send
    // another request to the remote proxy to find that out.
    if (rcRequest.UriBeforeRedirect != null)
    {
        Logger.Debug("Redirected: Also packing old URI with a 301 file.");
        RCRequest rc301 = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(rcRequest.UriBeforeRedirect));
        _package.Pack(this, rc301, ref _quota);
    }

    // Getting embedded objects and recursing only makes sense for html pages.
    if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
    {
        return true;
    }

    Uri baseUri = new Uri(rcRequest.Uri);
    // Lower-cased with the invariant culture so the result does not depend on the machine's locale.
    // NOTE(review): this lower-cases the whole document, including the URLs in it - confirm
    // that case-sensitive paths are not a concern for the helpers below.
    string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLowerInvariant();

    // Get the embedded content of the page
    DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent, richness);

    // Don't recurse if we're on (or beyond) the deepest layer allowed. ">=" rather than "=="
    // so that a call starting past the limit still terminates on depth.
    if (depth >= Properties.Settings.Default.DEFAULT_DEPTH - 1)
    {
        return true;
    }

    // Recurse into every outlink; the results of the recursive calls are intentionally ignored.
    LinkedList<Uri> resultLinkUris = HtmlUtils.ExtractLinks(baseUri, htmlContent);
    foreach (Uri uri in resultLinkUris)
    {
        RCRequest currRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
        RecursivelyDownloadPage(currRequest, richness, depth + 1);
    }
    return true;
}
/// <summary>
/// Serves one page of search results to the client as XML. Builds a search request via
/// ConstructGoogleSearch, downloads the result page to the cache, extracts the result links
/// and sends the slice that belongs to the requested page. Does nothing when the proxy is offline.
/// </summary>
/// <remarks>
/// Reads three query parameters from the request URI: "n" (items per page), "p" (page number,
/// where page 1 holds items 1..n) and "s" (search string). A missing or non-numeric "n" or "p"
/// makes Int32.Parse throw to the caller. Failures while building or sending the result are
/// swallowed, in which case nothing is sent.
/// </remarks>
void ServeRCRemoteResultPage()
{
    if (_proxy.NetworkStatus == (int)RCProxy.NetworkStatusCode.Offline)
    {
        return;
    }

    // Parse parameters
    NameValueCollection qscoll = Util.ParseHtmlQuery(RequestUri);
    int numItemsPerPage = Int32.Parse(qscoll.Get("n"));
    int pageNumber = Int32.Parse(qscoll.Get("p"));
    string queryString = qscoll.Get("s");

    // Google search
    string googleSearchString = ConstructGoogleSearch(queryString);
    _rcRequest = new RCRequest(this, googleSearchString);

    long bytesDownloaded = _rcRequest.DownloadToCache(true);
    try
    {
        FileInfo f = new FileInfo(_rcRequest.CacheFileName);
        if (bytesDownloaded <= -1 || !f.Exists)
        {
            // Nothing was downloaded, so there is nothing to serve
            return;
        }

        LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);

        // Paging window, hoisted out of the loop: an item is on the requested page when
        // itemsBeforePage < itemNumber < pageEndExclusive (item numbers start at 1).
        int itemsBeforePage = (pageNumber - 1) * numItemsPerPage;
        int pageEndExclusive = (pageNumber * numItemsPerPage) + 1;

        // StringBuilder avoids re-copying the whole document for every appended item
        System.Text.StringBuilder results = new System.Text.StringBuilder();
        results.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        results.Append("<search total=\"").Append(resultLinkUris.Count.ToString()).Append("\">");

        int currentItemNumber = 0;
        foreach (RCRequest linkObject in resultLinkUris)
        {
            currentItemNumber++;
            if (currentItemNumber >= pageEndExclusive)
            {
                // Past the requested page; no later item can match any more
                break;
            }
            if (currentItemNumber <= itemsBeforePage)
            {
                // Belongs to an earlier page
                continue;
            }

            // Escape for embedding into the XML document
            string uri = System.Security.SecurityElement.Escape(linkObject.Uri);
            string title = System.Security.SecurityElement.Escape(linkObject.AnchorText);
            string contentSnippet = ""; // XXX: find content snippet here

            // Omit the leading http:// from the reported URL
            if (uri.StartsWith("http://", StringComparison.Ordinal))
            {
                uri = uri.Substring(7);
            }

            results.Append("<item>")
                .Append("<title>").Append(title).Append("</title>")
                .Append("<url>").Append(uri).Append("</url>")
                .Append("<snippet>").Append(contentSnippet).Append("</snippet>")
                .Append("</item>");
        }

        results.Append("</search>");

        SendOkHeaders("text/xml", "Cache-Control: no-cache\r\nPragma: no-cache\r\nExpires: -1\r\n");
        SendMessage(results.ToString());
    }
    catch
    {
        // Deliberately best-effort: any failure while building or sending the
        // result page is swallowed and nothing (further) is sent to the client.
    }
}
/* // benchmarking stuff public void PrefetchAnalysis(string richness, int depth) { LogDebug("Running Benchmarker"); // XXX: should add a parameter to always download or just read from cache // convert to Uri format //string pageUri = _webRequestUri; LogRequest(); long bytesDownloaded = _rcRequest.DownloadToCache(); FileInfo f; try { f = new FileInfo(_rcRequest.CacheFileName); if (bytesDownloaded < 0 || !f.Exists) { return; } } catch (Exception e) { LogDebug("problem getting file info " + e.StackTrace + " " + e.Message); return; } // get the embedded content of the search result page LinkedList<RCRequest> objectsFound = DownloadEmbeddedObjects(_rcRequest, richness); // benchmarking: store the number of images found //imagesOnResultsPage.Add(objectsFound.Count); // recursively download pages LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest); // benchmarking: store the number of links found //linksOnResultsPage.Add(resultLinkUris.Count); foreach (RCRequest linkObject in resultLinkUris) { bytesDownloaded = linkObject.DownloadToCache(); if (bytesDownloaded > -1 && f.Exists) { linkObject.RequestStatus = (int)Status.Completed; } try { f = new FileInfo(linkObject.CacheFileName); } catch (Exception) { linkObject.RequestStatus = (int)Status.Failed; continue; } if (linkObject.RequestStatus == (int)Status.Failed || !f.Exists) { linkObject.RequestStatus = (int)Status.Failed; continue; } // XXX: hackery // make a copy of this file try { // create directory if it doesn't exist if (!Util.CreateDirectoryForFile(linkObject.CacheFileName)) { return; } // create directory if it doesn't exist if (!Util.CreateDirectoryForFile("ZZZZZZ\\" + linkObject.CacheFileName)) { return; } File.Delete("ZZZZZZ\\" + linkObject.CacheFileName); File.Copy(linkObject.CacheFileName, "ZZZZZZ\\" + linkObject.CacheFileName); // skip parseable check if (!Util.IsParseable(linkObject)) { continue; } // get the embedded content of the search result page objectsFound = 
DownloadEmbeddedObjects(linkObject, richness); // benchmarking: store the number of images on the page //imagesOnTargetPage.Add(objectsFound.Count); File.Delete(linkObject.CacheFileName); } catch (Exception e) { LogDebug("problem downloading a file or something " + e.StackTrace + " " + e.Message); } } }*/ /* // XXX: obsolete (currently not in use) /// <summary> /// Prefetch a search page in breadth first search order. /// </summary> /// <param name="richness">Richness of the prefetch.</param> /// <param name="depth">Depth to prefetch.</param> /// <returns>Status.</returns> private bool PrefetchBFS(string richness, int depth) { // benchmarking //downloadPagesStart = DateTime.Now; LogDebug("Running BFS"); // reconstruct _rcRequest string pageUri = _rcRequest.TranslateRCSearchToGoogle(); if (!Util.IsValidUri(pageUri)) { return false; } _rcRequest = new RCRequest(this, pageUri); //_rcRequest.SetProxy(_proxy.GatewayProxy, WEB_REQUEST_DEFAULT_TIMEOUT); // download the file long bytesDownloaded = _rcRequest.DownloadToCache(); if (bytesDownloaded < 0) { LogDebug("Error downloading: " + _rcRequest.Uri); return false; } // add to the package //if ( _package.Pack(this, _rcRequest, ref _quota);//) //{ // LogDebug("packed: " + RequestUri + " " + _rcRequest.FileSize + " bytes, " + _quota + " left"); //} // check quota if (_quota < DEFAULT_LOW_WATERMARK) { // benchmarking //downloadPagesEnd = DateTime.Now; return true; } // setup the initial frontier LinkedList<RCRequest> currentBFSFrontier = ExtractGoogleResults(_rcRequest); LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>(); // run BFS while (depth < DEFAULT_MAX_DEPTH) { // download objects in parallel currentBFSFrontier = DownloadObjectsInParallel(_rcRequest, currentBFSFrontier); // download embedded objects for each downloaded object foreach (RCRequest currObject in currentBFSFrontier) { // download embedded objects DownloadEmbeddedObjects(currObject, richness); } if (_quota < DEFAULT_LOW_WATERMARK) { // 
quota met break; } // get the next frontier from the current ones nextBFSFrontier = GetNewBFSFrontier(currentBFSFrontier); currentBFSFrontier = nextBFSFrontier; depth++; } return true; //downloadPagesEnd = DateTime.Now; } /// <summary> /// Gets the next BFS frontier from the current frontier. /// </summary> /// <param name="currentBFSFrontier">Current BFS frontier.</param> /// <returns>Next BFS frontier as a LinkedList.</returns> private LinkedList<RCRequest> GetNewBFSFrontier(LinkedList<RCRequest> currentBFSFrontier) { LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>(); LinkedList<RCRequest> extractedLinks; // go through the current frontier and collect the links foreach (RCRequest rcRequest in currentBFSFrontier) { // get all the links extractedLinks = ExtractLinks(rcRequest); // add to the frontier if we haven't seen it recently foreach (RCRequest extractedLink in extractedLinks) { // ignore blacklisted domains if (IsBlacklisted(extractedLink.Uri)) { continue; } if (!currentBFSFrontier.Contains(extractedLink) && !nextBFSFrontier.Contains(extractedLink)) { nextBFSFrontier.AddLast(extractedLink); } } } return nextBFSFrontier; }*/ /// <summary> /// Recursively downloads a page and its embedded objects, and its outlinks. 
/// </summary>
/// <remarks>
/// The results of packing and of the recursive calls do not influence the return value.
/// </remarks>
/// <param name="rcRequest">Requested page to start from.</param>
/// <param name="richness">Richness setting, forwarded to the embedded object download.</param>
/// <param name="depth">Current recursion depth; recursion stops once it reaches DEFAULT_MAX_DEPTH.</param>
/// <returns>
/// False if the quota is below the low watermark, the maximum depth has been reached, the
/// request is not parseable, or the download failed; true otherwise.
/// </returns>
private bool RecursivelyDownloadPage(RCRequest rcRequest, string richness, int depth)
{
    // Stop when the quota is used up or the depth limit has been reached. ">=" rather than
    // "==" so that a call starting beyond the limit still terminates on depth.
    if (_quota < DEFAULT_LOW_WATERMARK || depth >= DEFAULT_MAX_DEPTH)
    {
        return false;
    }

    // Check for parseable since it's just some URL
    if (!Util.IsParseable(rcRequest))
    {
        return false;
    }

    // Reduce the timeout to whatever is left of the overall request budget (0 once it is used up).
    // NOTE(review): the timeout is set on RCRequest.GenericWebRequest, not on the rcRequest
    // parameter that is downloaded below - confirm that this is the intended target.
    DateTime currTime = DateTime.Now;
    DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
    TimeSpan remaining = endTime.Subtract(currTime);
    RCRequest.GenericWebRequest.Timeout = remaining > TimeSpan.Zero
        ? (int)remaining.TotalMilliseconds
        : 0;

    // Download the page; a negative byte count signals failure
    long bytesDownloaded = rcRequest.DownloadToCache(false);
    if (bytesDownloaded < 0)
    {
        LogDebug("[depth = " + depth + "] error downloading: " + rcRequest.Uri);
        return false;
    }

    // Add to the package
    if (_package.Pack(this, rcRequest, ref _quota))
    {
        LogDebug("[depth = " + depth + "] packed: " + rcRequest.Uri + " "
            + rcRequest.FileSize + " bytes, " + _quota + " left");
    }

    // Get the embedded content of the page
    DownloadEmbeddedObjects(rcRequest, richness);

    // Recurse into every extracted link; the results of the recursive calls are ignored
    LinkedList<RCRequest> resultLinkUris = ExtractLinks(rcRequest);
    foreach (RCRequest currObject in resultLinkUris)
    {
        RecursivelyDownloadPage(currObject, richness, depth + 1);
    }
    return true;
}