Exemplo n.º 1
0
        /// <summary>
        /// Recursively downloads a page and its embedded objects, and its outlinks.
        /// Every page that was downloaded (or was already cached) is added to the package.
        /// </summary>
        /// <param name="rcRequest">Requested page to start from.</param>
        /// <param name="richness">Richness setting, passed on to the embedded object download.</param>
        /// <param name="depth">Current depth; 0 is the top level, outlinks are processed with depth + 1.</param>
        /// <returns>
        /// False if the request was aborted, the quota is below the low watermark or the
        /// download of <paramref name="rcRequest"/> failed; true otherwise. The results of
        /// the recursive calls for the outlinks do not influence the return value.
        /// </returns>
        public bool RecursivelyDownloadPage(RCRequest rcRequest, Richness richness, int depth)
        {
            if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
            {
                // Send error page if we're on top level
                if (depth == 0)
                {
                    SendErrorPage(HttpStatusCode.InternalServerError, "Request aborted or it does not fit in quota.");
                }
                return false;
            }

            // reduce the timer: only the time left until StartTime + default timeout may be used
            // NOTE(review): the timeout is set on RCRequest, not on the rcRequest parameter
            // that is downloaded below - confirm this is intended.
            DateTime currTime = DateTime.Now;
            DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
            if (endTime.CompareTo(currTime) > 0)
            {
                RCRequest.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
            }
            else
            {
                RCRequest.GenericWebRequest.Timeout = 0;
            }

            // Only download for POST/... or not already existing items
            if (!IsGetOrHeadHeader() || !_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
            {
                // Download!
                try
                {
                    // There is no index on the remote side anyway
                    rcRequest.DownloadToCache(false);
                }
                catch (Exception e)
                {
                    Logger.Warn("[depth = " + depth + "] error downloading: " + rcRequest.Uri + " " + e.Message);
                    // Send error page if we're on top level
                    if (depth == 0)
                    {
                        // Forward the HTTP status of the failed response when there is one,
                        // otherwise fall back to a generic 500.
                        WebException webException = e as WebException;
                        HttpWebResponse response = webException != null
                            ? webException.Response as HttpWebResponse
                            : null;
                        SendErrorPage(response != null ? response.StatusCode : HttpStatusCode.InternalServerError, e.Message);
                    }
                    return false;
                }
            }
            else
            {
                Logger.Debug("Already existed: " + rcRequest.Uri);
            }

            // add to the package
            if (_package.Pack(this, rcRequest, ref _quota))
            {
                Logger.Debug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
            }

            // add a new request for the old location if it was redirected. This will then
            // get the 301 file from the cache, so the local proxy does not need to send
            // another request to the remote proxy to find that out.
            if (rcRequest.UriBeforeRedirect != null)
            {
                Logger.Debug("Redirected: Also packing old URI with a 301 file.");
                RCRequest rc301 = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(rcRequest.UriBeforeRedirect));
                _package.Pack(this, rc301, ref _quota);
            }

            // Getting embedded objects and recursing only makes sense for html pages.
            if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
            {
                return true;
            }
            Uri baseUri = new Uri(rcRequest.Uri);
            // NOTE(review): the whole document is lower-cased before links are extracted,
            // so extracted URLs lose their original casing - confirm the extractors need this.
            string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLower();

            // get the embedded content of the search result page
            DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent, richness);

            // Don't recurse if we're on (or beyond) the deepest layer allowed. Using >= keeps
            // the recursion bounded even if DEFAULT_DEPTH is configured smaller than expected.
            if (depth >= Properties.Settings.Default.DEFAULT_DEPTH - 1)
            {
                return true;
            }

            // recurse
            LinkedList<Uri> resultLinkUris = HtmlUtils.ExtractLinks(baseUri, htmlContent);
            foreach (Uri uri in resultLinkUris)
            {
                // WebRequest.Create only yields an HttpWebRequest for http(s) URIs; any other
                // link would make the cast below throw and abort the whole recursion.
                if (uri == null || !uri.IsAbsoluteUri ||
                    (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
                {
                    Logger.Debug("[depth = " + depth + "] skipping non-HTTP link: " + uri);
                    continue;
                }

                RCRequest currRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
                RecursivelyDownloadPage(currRequest, richness, depth + 1);
            }
            return true;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs a Google search for the "s" query parameter of the request, downloads the
        /// result page to the cache and answers with one page of the extracted result links
        /// as an XML document.
        /// </summary>
        /// <remarks>
        /// Query parameters: "n" = number of items per page, "p" = 1-based page number,
        /// "s" = search string. Nothing is sent when the proxy is offline, when the download
        /// failed, or when building/sending the answer throws. Parsing "n"/"p" and the
        /// download itself happen outside the try block, so their exceptions reach the caller.
        /// </remarks>
        void ServeRCRemoteResultPage()
        {
            if (_proxy.NetworkStatus == (int)RCProxy.NetworkStatusCode.Offline)
            {
                return;
            }

            // Parse parameters
            NameValueCollection qscoll = Util.ParseHtmlQuery(RequestUri);
            int numItemsPerPage = Int32.Parse(qscoll.Get("n"));
            int pageNumber = Int32.Parse(qscoll.Get("p"));
            string queryString = qscoll.Get("s");

            // Google search
            string googleSearchString = ConstructGoogleSearch(queryString);
            _rcRequest = new RCRequest(this, googleSearchString);

            long bytesDownloaded = _rcRequest.DownloadToCache(true);
            try
            {
                FileInfo f = new FileInfo(_rcRequest.CacheFileName);
                if (bytesDownloaded > -1 && f.Exists)
                {
                    LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);

                    // Build the answer in a StringBuilder instead of re-allocating the
                    // whole string for every appended fragment.
                    System.Text.StringBuilder results = new System.Text.StringBuilder();
                    results.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                    results.Append("<search total=\"").Append(resultLinkUris.Count.ToString()).Append("\">");

                    // Items are numbered from 1; the requested page covers
                    // (firstItemExclusive, lastItemInclusive].
                    int firstItemExclusive = (pageNumber - 1) * numItemsPerPage;
                    int lastItemInclusive = pageNumber * numItemsPerPage;

                    int currentItemNumber = 0;
                    foreach (RCRequest linkObject in resultLinkUris)
                    {
                        currentItemNumber++;
                        if (currentItemNumber <= firstItemExclusive)
                        {
                            // still before the requested page
                            continue;
                        }
                        if (currentItemNumber > lastItemInclusive)
                        {
                            // past the requested page, nothing else can match
                            break;
                        }

                        // escape xml strings
                        string uri = System.Security.SecurityElement.Escape(linkObject.Uri);
                        string title = System.Security.SecurityElement.Escape(linkObject.AnchorText);
                        string contentSnippet = "";

                        // XXX: find content snippet here

                        // omit a leading http://; Escape returns null for a null input, so guard it
                        if (uri != null && uri.StartsWith("http://", StringComparison.Ordinal))
                        {
                            uri = uri.Substring(7);
                        }

                        results.Append("<item>")
                               .Append("<title>").Append(title).Append("</title>")
                               .Append("<url>").Append(uri).Append("</url>")
                               .Append("<snippet>").Append(contentSnippet).Append("</snippet>")
                               .Append("</item>");
                    }

                    results.Append("</search>");

                    SendOkHeaders("text/xml", "Cache-Control: no-cache" + "\r\n" +
                                              "Pragma: no-cache" + "\r\n" +
                                              "Expires: -1" + "\r\n");
                    SendMessage(results.ToString());
                }
                // otherwise: download failed or cache file missing, send nothing
            }
            catch
            {
                // Deliberately best-effort: any failure while building or sending the
                // result page is swallowed and the client receives no answer.
                // NOTE(review): consider logging the exception here.
            }
        }
Exemplo n.º 3
0
        /*
        // benchmarking stuff
        public void PrefetchAnalysis(string richness, int depth)
        {
            LogDebug("Running Benchmarker");

            // XXX: should add a parameter to always download or just read from cache
            // convert to Uri format
            //string pageUri = _webRequestUri;
            LogRequest();

            long bytesDownloaded = _rcRequest.DownloadToCache();

            FileInfo f;
            try
            {
                f = new FileInfo(_rcRequest.CacheFileName);
                if (bytesDownloaded < 0 || !f.Exists)
                {
                    return;
                }
            }
            catch (Exception e)
            {
                LogDebug("problem getting file info " + e.StackTrace + " " + e.Message);
                return;
            }

            // get the embedded content of the search result page
            LinkedList<RCRequest> objectsFound = DownloadEmbeddedObjects(_rcRequest, richness);
            // benchmarking: store the number of images found
            //imagesOnResultsPage.Add(objectsFound.Count);

            // recursively download pages
            LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);
            // benchmarking: store the number of links found
            //linksOnResultsPage.Add(resultLinkUris.Count);
            foreach (RCRequest linkObject in resultLinkUris)
            {
                bytesDownloaded = linkObject.DownloadToCache();
                if (bytesDownloaded > -1 && f.Exists)
                {
                    linkObject.RequestStatus = (int)Status.Completed;
                }
                try
                {
                    f = new FileInfo(linkObject.CacheFileName);
                }
                catch (Exception)
                {
                    linkObject.RequestStatus = (int)Status.Failed;
                    continue;
                }
                if (linkObject.RequestStatus == (int)Status.Failed || !f.Exists)
                {
                    linkObject.RequestStatus = (int)Status.Failed;
                    continue;
                }

                // XXX: hackery
                // make a copy of this file
                try
                {
                    // create directory if it doesn't exist
                    if (!Util.CreateDirectoryForFile(linkObject.CacheFileName))
                    {
                        return;
                    }
                    // create the directory for the copy if it doesn't exist
                    if (!Util.CreateDirectoryForFile("ZZZZZZ\\" + linkObject.CacheFileName))
                    {
                        return;
                    }

                    File.Delete("ZZZZZZ\\" + linkObject.CacheFileName);
                    File.Copy(linkObject.CacheFileName, "ZZZZZZ\\" + linkObject.CacheFileName);

                    // skip parseable check
                    if (!Util.IsParseable(linkObject))
                    {
                        continue;
                    }

                    // get the embedded content of the search result page
                    objectsFound = DownloadEmbeddedObjects(linkObject, richness);
                    // benchmarking: store the number of images on the page
                    //imagesOnTargetPage.Add(objectsFound.Count);

                    File.Delete(linkObject.CacheFileName);
                }
                catch (Exception e)
                {
                    LogDebug("problem downloading a file or something " + e.StackTrace + " " + e.Message);
                }
            }
        }*/
        /*
        // XXX: obsolete (currently not in use)
        /// <summary>
        /// Prefetch a search page in breadth first search order.
        /// </summary>
        /// <param name="richness">Richness of the prefetch.</param>
        /// <param name="depth">Depth to prefetch.</param>
        /// <returns>Status.</returns>
        private bool PrefetchBFS(string richness, int depth)
        {
            // benchmarking
            //downloadPagesStart = DateTime.Now;

            LogDebug("Running BFS");

            // reconstruct _rcRequest
            string pageUri = _rcRequest.TranslateRCSearchToGoogle();
            if (!Util.IsValidUri(pageUri))
            {
                return false;
            }
            _rcRequest = new RCRequest(this, pageUri);
            //_rcRequest.SetProxy(_proxy.GatewayProxy, WEB_REQUEST_DEFAULT_TIMEOUT);

            // download the file
            long bytesDownloaded = _rcRequest.DownloadToCache();
            if (bytesDownloaded < 0)
            {
                LogDebug("Error downloading: " + _rcRequest.Uri);
                return false;
            }

            // add to the package
            //if (
            _package.Pack(this, _rcRequest, ref _quota);//)
            //{
            //    LogDebug("packed: " + RequestUri + " " + _rcRequest.FileSize + " bytes, " + _quota + " left");
            //}

            // check quota
            if (_quota < DEFAULT_LOW_WATERMARK)
            {
                // benchmarking
                //downloadPagesEnd = DateTime.Now;

                return true;
            }

            // setup the initial frontier
            LinkedList<RCRequest> currentBFSFrontier = ExtractGoogleResults(_rcRequest);
            LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();

            // run BFS
            while (depth < DEFAULT_MAX_DEPTH)
            {
                // download objects in parallel
                currentBFSFrontier = DownloadObjectsInParallel(_rcRequest, currentBFSFrontier);

                // download embedded objects for each downloaded object
                foreach (RCRequest currObject in currentBFSFrontier)
                {
                    // download embedded objects
                    DownloadEmbeddedObjects(currObject, richness);
                }

                if (_quota < DEFAULT_LOW_WATERMARK)
                {
                    // quota met
                    break;
                }

                // get the next frontier from the current ones
                nextBFSFrontier = GetNewBFSFrontier(currentBFSFrontier);
                currentBFSFrontier = nextBFSFrontier;
                depth++;
            }

            return true;
            //downloadPagesEnd = DateTime.Now;
        }

        /// <summary>
        /// Gets the next BFS frontier from the current frontier.
        /// </summary>
        /// <param name="currentBFSFrontier">Current BFS frontier.</param>
        /// <returns>Next BFS frontier as a LinkedList.</returns>
        private LinkedList<RCRequest> GetNewBFSFrontier(LinkedList<RCRequest> currentBFSFrontier)
        {
            LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();
            LinkedList<RCRequest> extractedLinks;

            // go through the current frontier and collect the links
            foreach (RCRequest rcRequest in currentBFSFrontier)
            {
                // get all the links
                extractedLinks = ExtractLinks(rcRequest);

                // add to the frontier if we haven't seen it recently
                foreach (RCRequest extractedLink in extractedLinks)
                {
                    // ignore blacklisted domains
                    if (IsBlacklisted(extractedLink.Uri))
                    {
                        continue;
                    }

                    if (!currentBFSFrontier.Contains(extractedLink) &&
                        !nextBFSFrontier.Contains(extractedLink))
                    {
                        nextBFSFrontier.AddLast(extractedLink);
                    }
                }

            }
            return nextBFSFrontier;
        }*/
        /// <summary>
        /// Downloads <paramref name="rcRequest"/> into the cache, adds it to the package,
        /// fetches its embedded objects and then repeats the same for every link
        /// extracted from the page.
        /// </summary>
        /// <param name="rcRequest">Request for the page to start from.</param>
        /// <param name="richness">Richness setting, handed on to the embedded object download.</param>
        /// <param name="depth">Depth of this page; extracted links are processed with depth + 1.</param>
        /// <returns>
        /// False if the quota is below the low watermark, the maximum depth is reached,
        /// the request is not parseable or the download failed; true otherwise.
        /// The results of the recursive calls are not taken into account.
        /// </returns>
        private bool RecursivelyDownloadPage(RCRequest rcRequest, string richness, int depth)
        {
            // Give up early: quota used up or deepest layer reached.
            if (_quota < DEFAULT_LOW_WATERMARK || depth == DEFAULT_MAX_DEPTH)
            {
                return false;
            }

            // The request is just some URL, so make sure it is parseable at all.
            if (!Util.IsParseable(rcRequest))
            {
                return false;
            }

            // Shrink the web request timeout to what is left of the overall time budget;
            // once the budget is used up the timeout becomes 0.
            DateTime now = DateTime.Now;
            DateTime deadline = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
            int remainingMillis = 0;
            if (deadline > now)
            {
                remainingMillis = (int)(deadline - now).TotalMilliseconds;
            }
            RCRequest.GenericWebRequest.Timeout = remainingMillis;

            // Fetch the page itself; a negative byte count is treated as a failed download.
            if (rcRequest.DownloadToCache(false) < 0)
            {
                LogDebug("[depth = " + depth + "] error downloading: " + rcRequest.Uri);
                return false;
            }

            // Put the page into the package and report it when packing succeeded.
            bool packed = _package.Pack(this, rcRequest, ref _quota);
            if (packed)
            {
                LogDebug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
            }

            // Embedded objects of this page come first ...
            DownloadEmbeddedObjects(rcRequest, richness);

            // ... then every extracted link is followed one level deeper.
            foreach (RCRequest outlink in ExtractLinks(rcRequest))
            {
                RecursivelyDownloadPage(outlink, richness, depth + 1);
            }
            return true;
        }