RCRequest, RuralCafe C# (CSharp)代码示例

示例#1

0

显示文件

文件： LocalRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Placeholder ("dummy") constructor used only for request matching.
        /// It builds nothing but the RCRequest for the given item id; creating a
        /// whole handler just to compare requests is admittedly not the cleanest design.
        /// </summary>
        /// <param name="itemId">Item id handed to the RCRequest constructor.</param>
        private LocalRequestHandler(string itemId)
        {
            // Deliberately no URI validation and no RuralCafe query parsing here:
            // the dummy only needs its request object.
            this._rcRequest = new RCRequest(itemId);
        }

示例#2

0

显示文件

文件： RemoteRequestHandler.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Extracts the embedded objects of a page, filters them by blacklist and
        /// richness, and hands the remaining ones to the parallel downloader.
        /// </summary>
        /// <param name="rcRequest">Request of the page the embedded objects belong to.</param>
        /// <param name="baseUri">The Uri of the website where to download embedded objects.</param>
        /// <param name="htmlContent">The HTML content of the website.</param>
        /// <param name="richness">Richness setting.</param>
        /// <returns>
        /// The result of DownloadObjectsInParallel for the filtered objects, or an
        /// empty list when the handler was killed or the quota is below the low watermark.
        /// </returns>
        private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, Uri baseUri, string htmlContent, Richness richness)
        {
            // Bail out before doing any extraction work.
            if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
            {
                return new LinkedList<RCRequest>();
            }

            // XXX: refactor into filter class/method.
            LinkedList<Uri> wanted = new LinkedList<Uri>();
            foreach (Uri candidate in HtmlUtils.ExtractEmbeddedObjects(baseUri, htmlContent))
            {
                string candidateText = candidate.ToString();

                // Blacklisted domains are never downloaded, whatever the richness.
                if (IsBlacklisted(candidateText))
                {
                    continue;
                }

                // Normal richness keeps everything; low richness keeps text pages only.
                bool keep = richness == Richness.Normal
                    || (richness == Richness.Low && IsATextPage(candidateText));
                if (keep)
                {
                    wanted.AddLast(candidate);
                }
            }

            return DownloadObjectsInParallel(rcRequest, wanted);
        }

示例#3

0

显示文件

文件： RemoteRequestHandler.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Recursively downloads a page and its embedded objects, and its outlinks.
        /// </summary>
        /// <param name="rcRequest">Requested page to start from.</param>
        /// <param name="richness">Richness setting.</param>
        /// <param name="depth">Current recursion depth; 0 for the top-level page.</param>
        /// <returns>
        /// False when the handler was killed, the quota is below the low watermark, or
        /// the download of <paramref name="rcRequest"/> failed; true otherwise. Failures
        /// of embedded objects or outlinks do not affect the result.
        /// </returns>
        public bool RecursivelyDownloadPage(RCRequest rcRequest, Richness richness, int depth)
        {
            if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
            {
                // Send error page if we're on top level
                if (depth == 0)
                {
                    SendErrorPage(HttpStatusCode.InternalServerError, "Request aborted or it does not fit in quota.");
                }
                return false;
            }

            // Reduce the timer to whatever is left of the overall request budget.
            // NOTE(review): this sets the timeout on the handler's own RCRequest, not on
            // the rcRequest parameter being downloaded below - confirm this is intended
            // for recursive calls.
            DateTime currTime = DateTime.Now;
            DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
            if (endTime.CompareTo(currTime) > 0)
            {
                RCRequest.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
            }
            else
            {
                RCRequest.GenericWebRequest.Timeout = 0;
            }

            // Only download for POST/... or not already existing items
            if (!IsGetOrHeadHeader() || !_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
            {
                // Download!
                try
                {
                    // There is no index on the remote side anyway
                    rcRequest.DownloadToCache(false);
                }
                catch (Exception e)
                {
                    Logger.Warn("[depth = " + depth + "] error downloading: " + rcRequest.Uri + " " + e.Message);
                    // Send error page if we're on top level
                    if (depth == 0)
                    {
                        // Forward the upstream HTTP status when there is one,
                        // otherwise report a generic server error.
                        HttpStatusCode statusCode = HttpStatusCode.InternalServerError;
                        WebException webException = e as WebException;
                        if (webException != null)
                        {
                            HttpWebResponse response = webException.Response as HttpWebResponse;
                            if (response != null)
                            {
                                statusCode = response.StatusCode;
                            }
                        }
                        SendErrorPage(statusCode, e.Message);
                    }
                    return false;
                }
            }
            else
            {
                Logger.Debug("Already existed: " + rcRequest.Uri);
            }

            // add to the package
            if (_package.Pack(this, rcRequest, ref _quota))
            {
                Logger.Debug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
            }

            // add a new request for the old location if it was redirected. This will then
            // get the 301 file from the cache, so the local proxy does not need to send
            // another request to the remote proxy to find that out.
            if (rcRequest.UriBeforeRedirect != null)
            {
                Logger.Debug("Redirected: Also packing old URI with a 301 file.");
                RCRequest rc301 = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(rcRequest.UriBeforeRedirect));
                _package.Pack(this, rc301, ref _quota);
            }

            // Getting embedded objects and recursing only makes sense for html pages.
            if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
            {
                return true;
            }
            Uri baseUri = new Uri(rcRequest.Uri);
            string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLower();

            // get the embedded content of the search result page
            DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent, richness);

            // Don't recurse if we're on (or beyond) the deepest layer allowed.
            // ">=" rather than "==" so that a configured depth of 0 or less still
            // stops the recursion instead of never matching.
            if (depth >= Properties.Settings.Default.DEFAULT_DEPTH - 1)
            {
                return true;
            }

            // recurse into every outlink; child failures are ignored on purpose
            LinkedList<Uri> resultLinkUris = HtmlUtils.ExtractLinks(baseUri, htmlContent);
            foreach (Uri uri in resultLinkUris)
            {
                RCRequest currRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
                RecursivelyDownloadPage(currRequest, richness, depth + 1);
            }
            return true;
        }

示例#4

0

显示文件

文件： RequestHandler.cs 项目： kipropesque/RuralCafe

 /// <summary>
 /// Builds the RCRequest object of this handler for the requested URI.
 /// </summary>
 /// <param name="requestedUri">URI that was requested; must pass Util.IsValidUri.</param>
 /// <param name="refererUri">Referer URI, stored on the request and on its web request.</param>
 /// <param name="recvString">Received string stored on the created request.</param>
 /// <returns>True if the request was created; false if the URI is not valid.</returns>
 protected bool CreateRequest(string requestedUri, string refererUri, string recvString)
 {
     // Reject invalid URIs up front; the current request is left untouched.
     if (!Util.IsValidUri(requestedUri))
     {
         return false;
     }

     // The anchor text is unknown at this point, hence the empty string.
     _rcRequest = new RCRequest(this, requestedUri, "", refererUri);
     _rcRequest.GenericWebRequest.Referer = refererUri;
     _rcRequest._recvString = recvString;

     return true;
 }

示例#5

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Downloads a set of URIs in series.
        /// </summary>
        /// <param name="parentRequest">Root request.</param>
        /// <param name="children">Children requests to be downloaded.</param>
        /// <returns>
        /// The requests that were added to the package; empty when there are no
        /// children or when an exception occurred while downloading or packing.
        /// </returns>
        private LinkedList<RCRequest> DownloadObjects(RCRequest parentRequest, LinkedList<RCRequest> children)
        {
            LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

            if (children.Count == 0)
            {
                return addedObjects;
            }

            parentRequest.ResetEvents = new ManualResetEvent[children.Count];

            try
            {
                // Walk the list directly: indexing a LinkedList with ElementAt(i)
                // restarts from the head every time and makes the loop quadratic.
                foreach (RCRequest currChild in children)
                {
                    // make sure we haven't downloaded this before
                    if (_package.RCRequests.Contains(currChild))
                    {
                        // skip it
                        parentRequest.SetDone();
                        continue;
                    }

                    // reduce the timer to what is left of the parent's time budget
                    DateTime currTime = DateTime.Now;
                    DateTime endTime = parentRequest.StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
                    if (endTime.CompareTo(currTime) > 0)
                    {
                        currChild.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
                    }
                    else
                    {
                        currChild.GenericWebRequest.Timeout = 0;
                    }

                    // download the page
                    currChild.DownloadToCache(false);

                    // stop downloading further children once the handler timed out
                    if (IsTimedOut())
                    {
                        break;
                    }
                }

                addedObjects = _package.Pack(this, children, ref _quota);
            }
            catch (Exception e)
            {
                // best effort: log and return whatever has been added so far
                LogDebug("unable to download embeddedObjects: " + e.StackTrace + " " + e.Message);
            }

            return addedObjects;
        }

示例#6

0

显示文件

文件： LocalRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Queues this request.
        /// Reads the user id ("u"), target name ("t"), target URI ("a") and referer
        /// URI ("r") from the query string of the request URI, replaces the handler's
        /// request with one for the target, and queues the handler on the local proxy.
        /// A missing or non-numeric user id, or a missing target URI, is answered
        /// with a "malformed add request" error page and nothing is queued.
        /// </summary>
        private void AddRequest()
        {
            // Parse parameters
            NameValueCollection qscoll = Util.ParseHtmlQuery(RequestUri);

            // TryParse instead of Parse: a missing or malformed "u" parameter is a
            // malformed request, not a reason to throw out of the handler.
            int userId;
            if (!Int32.TryParse(qscoll.Get("u"), out userId))
            {
                SendErrorPage(HTTP_NOT_FOUND, "malformed add request", "");
                return;
            }

            string targetName = qscoll.Get("t");
            string targetUri = qscoll.Get("a");
            string refererUri = qscoll.Get("r");
            if (targetName == null)
            {
                targetName = "fake title";
            }
            if (targetUri == null)
            {
                // error
                SendErrorPage(HTTP_NOT_FOUND, "malformed add request", "");
                return;
            }
            if (refererUri == null)
            {
                // without a referer, fall back to the target itself
                refererUri = targetUri;
            }

            _originalRequestUri = RequestUri;
            // preserve the original request status (for HandleLogRequest)
            int originalRequestStatus = _rcRequest.RequestStatus;
            _rcRequest = new RCRequest(this, targetUri, targetName, refererUri);
            _rcRequest.RequestStatus = originalRequestStatus;

            ((RCLocalProxy)_proxy).QueueRequest(userId, this);
            SendOkHeaders("text/html");
            SendMessage(RefererUri);
        }

示例#7

0

显示文件

文件： Package.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Unpacks the package contents and indexes them.
        /// The package is decompressed next to the gzip file, its index (one
        /// "uri size" entry per line) is read, and the content that follows the index
        /// is split into one cache file per entry. Parseable files that did not exist
        /// in the cache before are added to the index.
        /// </summary>
        /// <param name="requestHandler">Calling handler for this method.</param>
        /// <param name="indexPath">Path to the index.</param>
        /// <returns>
        /// Sum of the sizes declared in the index for the entries processed so far
        /// (the entry being processed when an error stops unpacking is included).
        /// The sum is negated when the package holds fewer bytes than an entry declares.
        /// </returns>
        public static long Unpack(LocalRequestHandler requestHandler, string indexPath)
        {
            string packageIndexSizeStr = requestHandler.RCRequest.GenericWebResponse.GetResponseHeader("Package-IndexSize");
            string packageContentSizeStr = requestHandler.RCRequest.GenericWebResponse.GetResponseHeader("Package-ContentSize");
            long packageIndexSize = Int64.Parse(packageIndexSizeStr);
            long packageContentSize = Int64.Parse(packageContentSizeStr);
            string packageFileName = requestHandler.PackageFileName;
            string unpackedPackageFileName = packageFileName.Replace(".gzip", "");

            GZipWrapper.GZipDecompress(packageFileName, unpackedPackageFileName, packageIndexSize + packageContentSize);

            long unpackedBytes = 0;

            // "using" guarantees the package stream is closed on every return path.
            using (FileStream packageFs = new FileStream(unpackedPackageFileName, FileMode.Open))
            {
                // Read the package index. Stream.Read may return fewer bytes than
                // requested, so keep reading until the index is complete.
                Byte[] packageIndexBuffer = new Byte[packageIndexSize];
                int indexBytesRead = 0;
                while (indexBytesRead < packageIndexBuffer.Length)
                {
                    int chunk = packageFs.Read(packageIndexBuffer, indexBytesRead, packageIndexBuffer.Length - indexBytesRead);
                    if (chunk == 0)
                    {
                        break;
                    }
                    indexBytesRead += chunk;
                }
                if (indexBytesRead != packageIndexBuffer.Length)
                {
                    requestHandler.LogDebug("error, unexpected package size: " + unpackedPackageFileName +
                        "(" + indexBytesRead + " / " + packageIndexSize + ")");
                    return unpackedBytes;
                }

                // split the index into its entries
                System.Text.UTF8Encoding enc = new System.Text.UTF8Encoding();
                string package = enc.GetString(packageIndexBuffer);
                string[] packageContentArr = package.Split(new string[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);

                string[] entrySeparator = new string[] { " " };
                Byte[] buffer = new Byte[1024];
                foreach (string entry in packageContentArr)
                {
                    string currUri = "";
                    long currFileSize = 0;
                    try
                    {
                        // both fields are read inside the try so that an entry with
                        // missing fields is reported instead of throwing
                        string[] packageEntryArr = entry.Split(entrySeparator, StringSplitOptions.RemoveEmptyEntries);
                        currUri = packageEntryArr[0];
                        currFileSize = Int64.Parse(packageEntryArr[1]);
                    }
                    catch (Exception e)
                    {
                        requestHandler.LogDebug("problem unpacking: " + entry + " " + e.StackTrace + " " + e.Message);
                        return unpackedBytes;
                    }

                    if (!Util.IsValidUri(currUri))
                    {
                        requestHandler.LogDebug("problem unpacking: " + currUri);
                        return unpackedBytes;
                    }
                    RCRequest rcRequest = new RCRequest(requestHandler, currUri);

                    unpackedBytes += currFileSize;

                    // remember whether the file already existed (for indexing purposes only)
                    bool existed = new FileInfo(rcRequest.CacheFileName).Exists;

                    // try to delete the old version
                    if (!Util.DeleteFile(rcRequest.CacheFileName))
                    {
                        return unpackedBytes;
                    }

                    // create directory if it doesn't exist
                    if (!Util.CreateDirectoryForFile(rcRequest.CacheFileName))
                    {
                        return unpackedBytes;
                    }

                    // create the file if it doesn't exist
                    FileStream currFileFS = Util.CreateFile(rcRequest.CacheFileName);
                    if (currFileFS == null)
                    {
                        return unpackedBytes;
                    }

                    // Copy exactly currFileSize bytes. Never asking for more than the
                    // current file still needs leaves the package stream positioned at
                    // the start of the next file, so no overflow buffer is required.
                    long bytesReadOfCurrFile = 0;
                    using (currFileFS)
                    {
                        while (bytesReadOfCurrFile < currFileSize)
                        {
                            int bytesWanted = (int)Math.Min(buffer.Length, currFileSize - bytesReadOfCurrFile);
                            int bytesRead = packageFs.Read(buffer, 0, bytesWanted);
                            if (bytesRead == 0)
                            {
                                // package ended early
                                break;
                            }
                            currFileFS.Write(buffer, 0, bytesRead);
                            bytesReadOfCurrFile += bytesRead;
                        }
                    }

                    if (bytesReadOfCurrFile != currFileSize)
                    {
                        // ran out of bytes for this file
                        requestHandler.LogDebug("error, unexpected package size: " + rcRequest.CacheFileName +
                            "(" + bytesReadOfCurrFile + " / " + currFileSize + ")");
                        return unpackedBytes * -1;
                    }

                    // add the file to Lucene, but only if it was not in the cache before
                    if (!existed && Util.IsParseable(rcRequest))
                    {
                        string document = Util.ReadFileAsString(rcRequest.CacheFileName);
                        string title = Util.GetPageTitle(document);
                        string content = Util.GetPageContent(document);

                        IndexWrapper.IndexDocument(indexPath, "Content-Type: text/html", rcRequest.Uri, title, content);
                    }
                }
            }
            return unpackedBytes;
        }

示例#8

0

显示文件

文件： RCRequest.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Creates a RuralCafe request around an existing web request and derives
        /// its cache and package file names from the request method and URI.
        /// </summary>
        /// <param name="proxy">The proxy for the request; supplies the cache and packages paths.</param>
        /// <param name="request">The underlying web request.</param>
        /// <param name="anchorText">Text of the anchor tag.</param>
        /// <param name="referrerUri">URI of the referer; stored trimmed.</param>
        /// <param name="body">The body for POSTs, ...</param>
        public RCRequest(RCProxy proxy, HttpWebRequest request, string anchorText,
            string referrerUri, byte[] body)
        {
            // plain request data
            _anchorText = anchorText;
            _refererUri = referrerUri.Trim();
            _status = RequestHandler.Status.Pending;

            // underlying web request, with default timeout and the referer applied
            _webRequest = request;
            _webRequest.Timeout = RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT;
            _webRequest.Referer = _refererUri;
            _body = body;

            // Relative cache file name looks like ./GET/2876/627/...
            string uriPath = CacheManager.UriToFilePath(_webRequest.RequestUri.ToString());
            string hashDir = CacheManager.GetHashPath(uriPath);
            string hashedPath = hashDir + uriPath;
            _relCacheFileName = request.Method + Path.DirectorySeparatorChar + hashedPath;
            // The request id is the relative cache file name without directory separators.
            _requestId = _relCacheFileName.Replace(Path.DirectorySeparatorChar.ToString(), "");
            _cacheFileName = proxy.CachePath + _relCacheFileName;
            _packageFileName = proxy.PackagesPath + hashedPath + ".gzip";
            _fileSize = 0;

            _proxy = proxy;
            // Root request is this, unless overridden.
            _rootRequest = this;

            // start and finish time coincide until the request has run
            _startTime = DateTime.Now;
            _finishTime = _startTime;
        }

示例#9

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Main logic of RuralCafe RPRequestHandler.
        /// Called by Go() in the base RequestHandler class.
        /// Downloads the requested page recursively and sends it back as a package.
        /// </summary>
        /// <returns>
        /// (int)Status.Completed when a non-empty response package was sent;
        /// (int)Status.Failed when the URI is empty, the request is not cacheable,
        /// the download failed or the package was empty.
        /// </returns>
        public override int HandleRequest()
        {
            // XXX: richness and quota are static for now (not yet taken from the request).
            string richness = DEFAULT_RICHNESS;

            string requestUri = _rcRequest.Uri;
            if (requestUri.Trim().Length == 0)
            {
                return (int)Status.Failed;
            }

            // NOTE(review): any URI not starting with "http://" gets the prefix,
            // which includes "https://" URIs - confirm that this is intended.
            if (!requestUri.StartsWith("http://"))
            {
                requestUri = "http://" + requestUri;
            }

            if (!IsCacheable())
            {
                // XXX: not handled at the moment, technically nothing should be "not cacheable" though.
                LogDebug("not cacheable, failed.");
                return (int)Status.Failed;
            }

            // remove RuralCafe stuff from the request
            _rcRequest = new RCRequest(this, requestUri);

            if (RecursivelyDownloadPage(_rcRequest, richness, 0))
            {
                _rcRequest.FileSize = SendResponsePackage();
                if (_rcRequest.FileSize > 0)
                {
                    return (int)Status.Completed;
                }
            }

            return (int)Status.Failed;
        }

示例#10

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /*
        // benchmarking stuff
        public void PrefetchAnalysis(string richness, int depth)
        {
            LogDebug("Running Benchmarker");

            // XXX: should add a parameter to always download or just read from cache
            // convert to Uri format
            //string pageUri = _webRequestUri;
            LogRequest();

            long bytesDownloaded = _rcRequest.DownloadToCache();

            FileInfo f;
            try
            {
                f = new FileInfo(_rcRequest.CacheFileName);
                if (bytesDownloaded < 0 || !f.Exists)
                {
                    return;
                }
            }
            catch (Exception e)
            {
                LogDebug("problem getting file info " + e.StackTrace + " " + e.Message);
                return;
            }

            // get the embedded content of the search result page
            LinkedList<RCRequest> objectsFound = DownloadEmbeddedObjects(_rcRequest, richness);
            // benchmarking: store the number of images found
            //imagesOnResultsPage.Add(objectsFound.Count);

            // recursively download pages
            LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);
            // benchmarking: store the number of links found
            //linksOnResultsPage.Add(resultLinkUris.Count);
            foreach (RCRequest linkObject in resultLinkUris)
            {
                bytesDownloaded = linkObject.DownloadToCache();
                if (bytesDownloaded > -1 && f.Exists)
                {
                    linkObject.RequestStatus = (int)Status.Completed;
                }
                try
                {
                    f = new FileInfo(linkObject.CacheFileName);
                }
                catch (Exception)
                {
                    linkObject.RequestStatus = (int)Status.Failed;
                    continue;
                }
                if (linkObject.RequestStatus == (int)Status.Failed || !f.Exists)
                {
                    linkObject.RequestStatus = (int)Status.Failed;
                    continue;
                }

                // XXX: hackery
                // make a copy of this file
                try
                {
                    // create directory if it doesn't exist
                    if (!Util.CreateDirectoryForFile(linkObject.CacheFileName))
                    {
                        return;
                    }
                    // create directory if it doesn't exist
                    if (!Util.CreateDirectoryForFile("ZZZZZZ\\" + linkObject.CacheFileName))
                    {
                        return;
                    }

                    File.Delete("ZZZZZZ\\" + linkObject.CacheFileName);
                    File.Copy(linkObject.CacheFileName, "ZZZZZZ\\" + linkObject.CacheFileName);

                    // skip parseable check
                    if (!Util.IsParseable(linkObject))
                    {
                        continue;
                    }

                    // get the embedded content of the search result page
                    objectsFound = DownloadEmbeddedObjects(linkObject, richness);
                    // benchmarking: store the number of images on the page
                    //imagesOnTargetPage.Add(objectsFound.Count);

                    File.Delete(linkObject.CacheFileName);
                }
                catch (Exception e)
                {
                    LogDebug("problem downloading a file or something " + e.StackTrace + " " + e.Message);
                }
            }
        }*/
        /*
        // XXX: obsolete (currently not in use)
        /// <summary>
        /// Prefetch a search page in breadth first search order.
        /// </summary>
        /// <param name="richness">Richness of the prefetch.</param>
        /// <param name="depth">Depth to prefetch.</param>
        /// <returns>Status.</returns>
        private bool PrefetchBFS(string richness, int depth)
        {
            // benchmarking
            //downloadPagesStart = DateTime.Now;

            LogDebug("Running BFS");

            // reconstruct _rcRequest
            string pageUri = _rcRequest.TranslateRCSearchToGoogle();
            if (!Util.IsValidUri(pageUri))
            {
                return false;
            }
            _rcRequest = new RCRequest(this, pageUri);
            //_rcRequest.SetProxy(_proxy.GatewayProxy, WEB_REQUEST_DEFAULT_TIMEOUT);

            // download the file
            long bytesDownloaded = _rcRequest.DownloadToCache();
            if (bytesDownloaded < 0)
            {
                LogDebug("Error downloading: " + _rcRequest.Uri);
                return false;
            }

            // add to the package
            //if (
            _package.Pack(this, _rcRequest, ref _quota);//)
            //{
            //    LogDebug("packed: " + RequestUri + " " + _rcRequest.FileSize + " bytes, " + _quota + " left");
            //}

            // check quota
            if (_quota < DEFAULT_LOW_WATERMARK)
            {
                // benchmarking
                //downloadPagesEnd = DateTime.Now;

                return true;
            }

            // setup the initial frontier
            LinkedList<RCRequest> currentBFSFrontier = ExtractGoogleResults(_rcRequest);
            LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();

            // run BFS
            while (depth < DEFAULT_MAX_DEPTH)
            {
                // download objects in parallel
                currentBFSFrontier = DownloadObjectsInParallel(_rcRequest, currentBFSFrontier);

                // download embedded objects for each downloaded object
                foreach (RCRequest currObject in currentBFSFrontier)
                {
                    // download embedded objects
                    DownloadEmbeddedObjects(currObject, richness);
                }

                if (_quota < DEFAULT_LOW_WATERMARK)
                {
                    // quota met
                    break;
                }

                // get the next frontier from the current ones
                nextBFSFrontier = GetNewBFSFrontier(currentBFSFrontier);
                currentBFSFrontier = nextBFSFrontier;
                depth++;
            }

            return true;
            //downloadPagesEnd = DateTime.Now;
        }

        /// <summary>
        /// Gets the next BFS frontier from the current frontier.
        /// </summary>
        /// <param name="currentBFSFrontier">Current BFS frontier.</param>
        /// <returns>Next BFS frontier as a LinkedList.</returns>
        private LinkedList<RCRequest> GetNewBFSFrontier(LinkedList<RCRequest> currentBFSFrontier)
        {
            LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();
            LinkedList<RCRequest> extractedLinks;

            // go through the current frontier and collect the links
            foreach (RCRequest rcRequest in currentBFSFrontier)
            {
                // get all the links
                extractedLinks = ExtractLinks(rcRequest);

                // add to the frontier if we haven't seen it recently
                foreach (RCRequest extractedLink in extractedLinks)
                {
                    // ignore blacklisted domains
                    if (IsBlacklisted(extractedLink.Uri))
                    {
                        continue;
                    }

                    if (!currentBFSFrontier.Contains(extractedLink) &&
                        !nextBFSFrontier.Contains(extractedLink))
                    {
                        nextBFSFrontier.AddLast(extractedLink);
                    }
                }

            }
            return nextBFSFrontier;
        }*/
        /// <summary>
        /// Recursively downloads a page, its embedded objects, and the pages it links to.
        /// </summary>
        /// <param name="rcRequest">Requested page to start from.</param>
        /// <param name="richness">Richness setting.</param>
        /// <param name="depth">Current recursion depth; recursion stops once it reaches DEFAULT_MAX_DEPTH.</param>
        /// <returns>
        /// False if the quota is below the low watermark, the depth limit has been reached,
        /// the request is not parseable, or the download failed; true otherwise.
        /// The results of the recursive calls on the outlinks are not propagated.
        /// </returns>
        private bool RecursivelyDownloadPage(RCRequest rcRequest, string richness, int depth)
        {
            // stop as soon as the remaining quota drops below the low watermark
            if (_quota < DEFAULT_LOW_WATERMARK)
            {
                return false;
            }

            // ">=" rather than "==": a caller that starts beyond the limit must not
            // be able to recurse without any depth bound
            if (depth >= DEFAULT_MAX_DEPTH)
            {
                return false;
            }

            // check for parseable since its just some URL
            if (!Util.IsParseable(rcRequest))
            {
                return false;
            }

            // reduce the timer: only the time left until StartTime + default timeout may be used
            // NOTE(review): the timeout is set through RCRequest.GenericWebRequest, not through
            // the rcRequest parameter - confirm this is the request that should be adjusted
            DateTime currTime = DateTime.Now;
            DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
            if (endTime.CompareTo(currTime) > 0)
            {
                RCRequest.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
            }
            else
            {
                // the time budget is already used up
                RCRequest.GenericWebRequest.Timeout = 0;
            }

            // download the page; a negative byte count signals a failed download
            long bytesDownloaded = rcRequest.DownloadToCache(false);
            if (bytesDownloaded < 0)
            {
                LogDebug("[depth = " + depth + "] error downloading: " + rcRequest.Uri);
                return false;
            }

            // add to the package (Pack updates _quota through the ref parameter)
            if (_package.Pack(this, rcRequest, ref _quota))
            {
                LogDebug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
            }

            // get the embedded content of the page
            DownloadEmbeddedObjects(rcRequest, richness);

            // recurse into every link found on the page, one level deeper
            LinkedList<RCRequest> resultLinkUris = ExtractLinks(rcRequest);
            foreach (RCRequest currObject in resultLinkUris)
            {
                RecursivelyDownloadPage(currObject, richness, depth + 1);
            }
            return true;
        }

示例#11

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Parses the cached copy of a page and collects the URIs referenced by the
        /// given tag/attribute pairs.
        /// The page text is lower-cased before parsing, so the collected URIs are lower-case too.
        /// XXX: should replace and obsolete this with a better HTML parser.
        /// </summary>
        /// <param name="rcRequest">Page to parse; its cache file is read from disk.</param>
        /// <param name="tagAttributes">
        /// Table of pairs to look for: column 0 holds the tag name, column 1 the attribute name.
        /// </param>
        /// <returns>List of distinct references, in the order they were found.</returns>
        LinkedList<RCRequest> ExtractReferences(RCRequest rcRequest, string[,] tagAttributes)
        {
            LinkedList<RCRequest> references = new LinkedList<RCRequest>();

            string pageText = Util.ReadFileAsString(rcRequest.CacheFileName).ToLower();
            int pairCount = tagAttributes.GetLength(0);

            for (int row = 0; row < pairCount; row++)
            {
                string tagName = tagAttributes[row, 0];
                string attributeName = tagAttributes[row, 1];

                // a fresh parser per pair, so every pair scans the page from the start
                HtmlParser parser = new HtmlParser(pageText);
                HtmlTag currentTag;
                while (parser.ParseNext(tagName, out currentTag))
                {
                    // skip tags that do not carry the attribute
                    string reference;
                    if (!currentTag.Attributes.TryGetValue(attributeName, out reference))
                    {
                        continue;
                    }

                    // the value is the referenced URL: make it absolute before using it as a uri
                    reference = TranslateToAbsoluteAddress(rcRequest.Uri, reference);

                    // XXX: need to make sure the uri isn't going to cause an exception to be thrown
                    if (!Util.IsValidUri(reference))
                    {
                        continue;
                    }

                    RCRequest candidate = new RCRequest(this, reference);
                    if (references.Contains(candidate))
                    {
                        // already collected
                        continue;
                    }

                    references.AddLast(candidate);
                }
            }

            return references;
        }

示例#12

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

 /// <summary>
 /// Collects the links found on a page.
 /// Thin wrapper around ExtractReferences() using HtmlParser.LinkTagAttributes.
 /// XXX: not completely implemented, need non HTML/"a href=" references.
 /// </summary>
 /// <param name="rcRequest">Page to parse.</param>
 /// <returns>List of link references returned by ExtractReferences().</returns>
 LinkedList<RCRequest> ExtractLinks(RCRequest rcRequest)
 {
     LinkedList<RCRequest> links = ExtractReferences(rcRequest, HtmlParser.LinkTagAttributes);
     return links;
 }

示例#13

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

 /// <summary>
 /// Collects the embedded objects referenced by a page.
 /// Thin wrapper around ExtractReferences() using HtmlParser.EmbeddedObjectTagAttributes.
 /// XXX: not completely implemented, need non HTML/"src=" references.
 /// </summary>
 /// <param name="rcRequest">Page to parse.</param>
 /// <returns>List of embedded object references returned by ExtractReferences().</returns>
 LinkedList<RCRequest> ExtractEmbeddedObjects(RCRequest rcRequest)
 {
     LinkedList<RCRequest> embedded = ExtractReferences(rcRequest, HtmlParser.EmbeddedObjectTagAttributes);
     return embedded;
 }

示例#14

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Downloads a set of requests in parallel using the ThreadPool, waits for them,
        /// and packs the children into the package.
        /// </summary>
        /// <param name="parentRequest">Root request; receives the array of reset events, one per child.</param>
        /// <param name="children">Children requests to be downloaded.</param>
        /// <returns>
        /// The list returned by packing the children; an empty list if there are no
        /// children or an exception occurred.
        /// </returns>
        private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<RCRequest> children)
        {
            ThreadPool.SetMaxThreads(4, 4);
            LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

            if (children.Count == 0)
            {
                return addedObjects;
            }

            // one event per child, shared between the parent and all children
            parentRequest.ResetEvents = new ManualResetEvent[children.Count];

            try
            {
                // queue up worker threads to download URIs.
                // Enumerate the linked list directly: ElementAt(i) walks the list from
                // the head on every call, which made the original index loop quadratic.
                int childIndex = 0;
                foreach (RCRequest currChild in children)
                {
                    // keep the child's number aligned with its slot in ResetEvents
                    // (presumably SetDone() signals ResetEvents[ChildNumber] - confirm in RCRequest)
                    currChild.ChildNumber = childIndex;

                    // set the resetEvent
                    currChild.ResetEvents = parentRequest.ResetEvents;
                    parentRequest.ResetEvents[childIndex] = new ManualResetEvent(false);
                    childIndex++;

                    // make sure we haven't downloaded this before
                    if (_package.RCRequests.Contains(currChild))
                    {
                        // skip it, but mark it done so the wait below is not held up by it
                        currChild.SetDone();
                        continue;
                    }

                    // download the page on a pool thread
                    ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), (object)currChild);
                }

                // wait for timeout
                WaitAll(parentRequest.ResetEvents);

                addedObjects = _package.Pack(this, children, ref _quota);
            }
            catch (Exception e)
            {
                // best effort: log and return whatever addedObjects holds at this point
                LogDebug("unable to download embeddedObjects: " + e.StackTrace + " " + e.Message);
            }

            return addedObjects;
        }

示例#15

0

显示文件

文件： RemoteRequestHandler.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Downloads a set of URIs in parallel using the ThreadPool, waits for them,
        /// and packs the resulting requests into the package.
        /// </summary>
        /// <param name="parentRequest">Root request; receives the array of reset events, one per child.</param>
        /// <param name="childObjects">Children URIs to be downloaded.</param>
        /// <returns>
        /// The list returned by packing the created requests. An empty list if the handler
        /// was told to stop or there are no children. If an exception occurs, the requests
        /// created so far are returned.
        /// </returns>
        private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<Uri> childObjects)
        {
            LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

            if (_killYourself || childObjects.Count == 0)
            {
                return addedObjects;
            }

            // one event per child, shared between the parent and all children
            parentRequest.ResetEvents = new ManualResetEvent[childObjects.Count];

            try
            {
                // queue up worker threads to download URIs.
                // Enumerate the linked list directly: ElementAt(i) walks the list from
                // the head on every call, which made the original index loop quadratic.
                int childIndex = 0;
                foreach (Uri childUri in childObjects)
                {
                    // create the RCRequest for the object
                    RCRequest currChildObject = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(childUri));
                    currChildObject.ChildNumber = childIndex;
                    // Set the root request.
                    currChildObject.RootRequest = parentRequest;
                    addedObjects.AddLast(currChildObject);

                    // set the resetEvent
                    currChildObject.ResetEvents = parentRequest.ResetEvents;
                    parentRequest.ResetEvents[childIndex] = new ManualResetEvent(false);
                    childIndex++;

                    // make sure we haven't downloaded this before
                    if (_package.RCRequests.Contains(currChildObject))
                    {
                        // skip it, but mark it done so the wait below is not held up by it
                        currChildObject.SetDone();
                        continue;
                    }

                    // download the page on a pool thread
                    ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), (object)currChildObject);
                }

                // wait for timeout
                Utils.WaitAll(parentRequest.ResetEvents);

                addedObjects = _package.Pack(this, addedObjects, ref _quota);
            }
            catch (Exception e)
            {
                // best effort: log and return whatever addedObjects holds at this point
                Logger.Warn("unable to download embeddedObjects.", e);
            }

            return addedObjects;
        }

示例#16

0

显示文件

文件： Util.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Tells whether RuralCafe can parse the cached file of a request.
        /// A file counts as parseable when the content type determined for its
        /// cache file name contains "htm".
        /// </summary>
        /// <param name="rcRequest">A RCRequest object.</param>
        /// <returns>True if parseable, false otherwise.</returns>
        public static bool IsParseable(RCRequest rcRequest)
        {
            return GetContentTypeOfFile(rcRequest.CacheFileName).Contains("htm");
        }

示例#17

0

显示文件

文件： Package.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Tries to add a RCRequest to the package without exceeding the quota.
        /// Only called by the remote proxy.
        /// XXX: quota is currently simple algorithm.
        /// XXX: should be changed to check for compressed size rather than actual size.
        /// </summary>
        /// <param name="requestHandler">Calling handler for this method; used for logging.</param>
        /// <param name="requestObject">RCRequest to add; its FileSize is refreshed from the cache file.</param>
        /// <param name="quota">Remaining quota; reduced by the file size when the request is packed.</param>
        /// <returns>
        /// True iff the request has been packed. False if it is already in the package,
        /// its file size is not positive, or it does not fit into the quota.
        /// </returns>
        public bool Pack(RemoteRequestHandler requestHandler, RCRequest requestObject, ref long quota)
        {
            bool alreadyPacked = _rcRequests.Contains(requestObject);
            if (alreadyPacked)
            {
                requestHandler.Logger.Debug("object exists in package: " + requestObject.Uri);
                return false;
            }

            // refresh the size from the cached file; nothing to pack without content
            requestObject.FileSize = Utils.GetFileSize(requestObject.CacheFileName);
            if (requestObject.FileSize <= 0)
            {
                return false;
            }

            // quota check
            long quotaAfterPacking = quota - requestObject.FileSize;
            if (quotaAfterPacking < 0)
            {
                requestHandler.Logger.Debug("object doesn't fit in quota: " + requestObject.Uri);
                return false;
            }

            _rcRequests.Add(requestObject);
            quota = quotaAfterPacking;

            return true;
        }

示例#18

0

显示文件

文件： Package.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Tries to add a RCRequest to the package without exceeding the quota.
        /// Only called by the remote proxy.
        /// XXX: quota is currently simple algorithm.
        /// XXX: should be changed to check for compressed size rather than actual size.
        /// </summary>
        /// <param name="requestHandler">Calling handler for this method; used for logging.</param>
        /// <param name="rcRequest">RCRequest to add; its FileSize is refreshed from the cache file.</param>
        /// <param name="quota">Remaining quota; reduced by the file size when the request is packed.</param>
        /// <returns>
        /// True iff the request has been packed. False if its URI contains a space, it is
        /// already in the package, its file size is not positive, or it does not fit into the quota.
        /// </returns>
        public bool Pack(RemoteRequestHandler requestHandler, RCRequest rcRequest, ref long quota)
        {
            bool uriHasSpaces = rcRequest.Uri.Contains(' ');
            if (uriHasSpaces)
            {
                requestHandler.LogDebug("object contains spaces: " + rcRequest.Uri);
                return false;
            }

            bool alreadyPacked = _rcRequests.Contains(rcRequest);
            if (alreadyPacked)
            {
                requestHandler.LogDebug("object exists in package: " + rcRequest.Uri);
                return false;
            }

            // refresh the size from the cached file; objects without content are silently rejected
            rcRequest.FileSize = Util.GetFileSize(rcRequest.CacheFileName);
            if (rcRequest.FileSize <= 0)
            {
                return false;
            }

            // quota check
            long quotaAfterPacking = quota - rcRequest.FileSize;
            if (quotaAfterPacking < 0)
            {
                requestHandler.LogDebug("object doesn't fit in quota: " + rcRequest.Uri);
                return false;
            }

            _rcRequests.AddLast(rcRequest);
            quota = quotaAfterPacking;

            return true;
        }

示例#19

0

显示文件

文件： LocalInternalRequestHandler.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Builds the live search result XML for the client.
        /// GET request will be sent to <![CDATA[request/search.xml?p=1&s=searchstring]]> where
        /// p is the current page number (pages are numbered 1, 2, 3, ...) and
        /// s is the search query string.
        ///
        /// The results are taken from a Google result page, so the number of results
        /// is whatever Google returns, usually 10.
        /// </summary>
        /// <param name="pageNumber">Page number of the results to fetch.</param>
        /// <param name="queryString">The search query string.</param>
        /// <returns>
        /// A response with the search XML. For a blank query or when the network is offline
        /// the XML reports a total of 0. An empty response is returned when the result page
        /// could not be downloaded or processing it threw an exception.
        /// </returns>
        public Response ServeRCLiveResultPage(int pageNumber, string queryString)
        {
            // skeleton document: declaration plus the <search> root element
            XmlDocument xmlDoc = new XmlDocument();
            XmlDeclaration declaration = xmlDoc.CreateXmlDeclaration("1.0", "UTF-8", String.Empty);
            xmlDoc.AppendChild(declaration);
            XmlElement searchXml = xmlDoc.CreateElement("search");
            xmlDoc.AppendChild(searchXml);

            bool blankQuery = queryString.Trim().Length == 0;
            if (blankQuery || Proxy.NetworkStatus == RCLocalProxy.NetworkStatusCode.Offline)
            {
                // nothing to search for, or no way to search
                searchXml.SetAttribute("total", "0");
                return new Response(xmlDoc.InnerXml);
            }

            // Google search: build the request and download the result page
            string googleSearchString = ConstructGoogleSearch(queryString, pageNumber);
            HttpWebRequest searchWebRequest = (HttpWebRequest)WebRequest.Create(googleSearchString);
            _rcRequest = new RCRequest(_proxy, searchWebRequest);
            string resultPage = _rcRequest.DownloadAsString();

            try
            {
                if (resultPage == null)
                {
                    // download failed
                    return new Response();
                }

                LinkedList<RCRequest> googleResults = ExtractGoogleResults(resultPage);
                long totalResults = GetGoogleResultsNumber(resultPage);
                searchXml.SetAttribute("total", totalResults.ToString());

                // one XML element per extracted result
                foreach (RCRequest result in googleResults)
                {
                    XmlElement resultElement = BuildSearchResultXmlElement(xmlDoc,
                        result.AnchorText, result.Uri, result.ContentSnippet);
                    searchXml.AppendChild(resultElement);
                }

                PrepareXMLRequestAnswer();
                return new Response(xmlDoc.InnerXml);
            }
            catch
            {
                // best effort: any failure while processing yields an empty response
                return new Response();
            }
        }

示例#20

0

显示文件

文件： LocalRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Runs a Google search for the query in the request URI, downloads the result
        /// page to the cache and sends the requested page of results to the client as XML.
        /// Query parameters read from the request URI: "n" (items per page),
        /// "p" (page number) and "s" (search string).
        /// Nothing is sent when the network is offline, the download failed,
        /// or an exception occurs while building the answer.
        /// </summary>
        void ServeRCRemoteResultPage()
        {
            if (_proxy.NetworkStatus == (int)RCProxy.NetworkStatusCode.Offline)
            {
                return;
            }

            // Parse parameters
            NameValueCollection queryParameters = Util.ParseHtmlQuery(RequestUri);
            int numItemsPerPage = Int32.Parse(queryParameters.Get("n"));
            int pageNumber = Int32.Parse(queryParameters.Get("p"));
            string queryString = queryParameters.Get("s");

            // Google search
            _rcRequest = new RCRequest(this, ConstructGoogleSearch(queryString));

            long bytesDownloaded = _rcRequest.DownloadToCache(true);
            try
            {
                FileInfo cacheFile = new FileInfo(_rcRequest.CacheFileName);
                if (bytesDownloaded <= -1 || !cacheFile.Exists)
                {
                    // download failed: send nothing
                    return;
                }

                LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);

                System.Text.StringBuilder xml = new System.Text.StringBuilder();
                xml.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
                xml.Append("<search total=\"" + resultLinkUris.Count.ToString() + "\">");

                int itemNumber = 0;
                foreach (RCRequest linkObject in resultLinkUris)
                {
                    itemNumber++;

                    // only items that fall on the requested page are emitted (items are counted from 1)
                    if (itemNumber <= ((pageNumber - 1) * numItemsPerPage) ||
                        itemNumber >= (pageNumber * numItemsPerPage) + 1)
                    {
                        continue;
                    }

                    // escape the values for use inside XML
                    string uri = System.Security.SecurityElement.Escape(linkObject.Uri);
                    string title = System.Security.SecurityElement.Escape(linkObject.AnchorText);

                    // XXX: find content snippet here
                    string contentSnippet = "";

                    // omit the leading http://
                    if (uri.StartsWith("http://"))
                    {
                        uri = uri.Substring(7);
                    }

                    xml.Append("<item>");
                    xml.Append("<title>").Append(title).Append("</title>");
                    xml.Append("<url>").Append(uri).Append("</url>");
                    xml.Append("<snippet>").Append(contentSnippet).Append("</snippet>");
                    xml.Append("</item>");
                }

                xml.Append("</search>");

                SendOkHeaders("text/xml", "Cache-Control: no-cache" + "\r\n" +
                                          "Pragma: no-cache" + "\r\n" +
                                          "Expires: -1" + "\r\n");
                SendMessage(xml.ToString());
            }
            catch
            {
                // do nothing
            }
        }

示例#21

0

显示文件

文件： LocalInternalRequestHandler.cs 项目： o0111/ruralcafe

        /// <summary>
        /// Extracts the result links from a google results page.
        /// The page is split at every "&lt;/cite&gt;"; each chunk is searched for the
        /// title and target of its last link, and the chunk after it for the snippet.
        /// </summary>
        /// <param name="googleResultPage">The Google results page.</param>
        /// <returns>
        /// List of requests for the result links, each carrying its anchor text and
        /// content snippet (the snippet is empty when none was found).
        /// </returns>
        private LinkedList<RCRequest> ExtractGoogleResults(string googleResultPage)
        {
            string[] stringSeparator = new string[] { "</cite>" };
            LinkedList<RCRequest> resultLinks = new LinkedList<RCRequest>();
            string[] lines = googleResultPage.Split(stringSeparator, StringSplitOptions.RemoveEmptyEntries);

            // get links
            int pos;
            string currLine;
            // Omitting last split, since there is no link any more.
            for (int i = 0; i < lines.Length - 1; i++)
            {
                currLine = lines[i];

                // Title and snippet start out empty for every chunk. Previously they were
                // declared outside the loop, so a result without its own snippet silently
                // inherited the snippet of the preceding result.
                string currTitle = "";
                string currSnippet = "";

                // get the title
                if ((pos = currLine.LastIndexOf("<a href=")) >= 0)
                {
                    // title
                    currTitle = currLine.Substring(pos);
                    // find start
                    if ((pos = currTitle.IndexOf(">")) >= 0)
                    {
                        // cut start
                        currTitle = currTitle.Substring(pos + 1);
                        if ((pos = currTitle.IndexOf("</a>")) >= 0)
                        {
                            // cut end
                            currTitle = currTitle.Substring(0, pos);
                            currTitle = HtmlUtils.StripTagsCharArray(currTitle);
                            currTitle = currTitle.Trim();
                        }
                    }
                }

                // get the uri
                if ((pos = currLine.LastIndexOf("<a href=\"/url?q=")) >= 0)
                {
                    // start right after
                    string currUri = currLine.Substring(pos + "<a href=\"/url?q=".Length);
                    if ((pos = currUri.IndexOf("&amp")) >= 0)
                    {
                        // cut end
                        currUri = currUri.Substring(0, pos);
                        currUri = HtmlUtils.StripTagsCharArray(currUri);
                        currUri = currUri.Trim();
                    }

                    if (!HttpUtils.IsValidUri(currUri))
                    {
                        continue;
                    }

                    // check blacklist
                    if (IsBlacklisted(currUri))
                    {
                        continue;
                    }

                    if (!currUri.Contains(".") || currTitle.Equals(""))
                    {
                        continue;
                    }

                    // If we're in slow mode, we don't want cached results within the live results
                    if (_proxy.NetworkStatus == RCProxy.NetworkStatusCode.Slow &&
                        _proxy.ProxyCacheManager.IsCached(CacheManager.GetRelativeCacheFileName(currUri, "GET")))
                    {
                        continue;
                    }

                    // get the content snippet (in next split); i + 1 is always valid
                    // because the loop stops one chunk before the end
                    currLine = lines[i + 1];
                    // find start
                    string snippetSplit = "<span class=\"st\">";
                    if ((pos = currLine.LastIndexOf(snippetSplit)) >= 0)
                    {
                        // cut start
                        currSnippet = currLine.Substring(pos + snippetSplit.Length);
                        if ((pos = currSnippet.IndexOf("</span>")) >= 0)
                        {
                            // cut end
                            currSnippet = currSnippet.Substring(0, pos);
                            currSnippet = HtmlUtils.StripTagsCharArray(currSnippet, false);
                            currSnippet = RegExs.MULTIPLE_SPACES_REGEX.Replace(currSnippet.Trim(), " ");
                        }
                    }

                    // Create request and save anchorText and snippet
                    RCRequest currRCRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(currUri));
                    currRCRequest.AnchorText = currTitle;
                    currRCRequest.ContentSnippet = currSnippet;

                    resultLinks.AddLast(currRCRequest);
                }
            }

            return resultLinks;
        }

示例#22

0

显示文件

文件： RequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Extracts the result links from the cached copy of a google results page.
        /// The page is split at every "&lt;cite&gt;": the chunk before a cite element
        /// provides the title (text of its last link), the chunk after it the URI.
        /// XXX: Probably broken all the time due to Google's constantly changing HTML format.
        /// </summary>
        /// <param name="rcRequest">Request whose cache file holds the results page.</param>
        /// <returns>List of requests for the result links, each carrying its anchor text.</returns>
        public LinkedList<RCRequest> ExtractGoogleResults(RCRequest rcRequest)
        {
            LinkedList<RCRequest> results = new LinkedList<RCRequest>();

            string pageText = Util.ReadFileAsString(rcRequest.CacheFileName);
            string[] chunks = pageText.Split(new string[] { "<cite>" }, StringSplitOptions.RemoveEmptyEntries);

            // the last chunk has no successor to take a URI from, so stop one short
            for (int index = 0; index < chunks.Length - 1; index++)
            {
                // title: text of the last link in the chunk in front of the cite element
                string title = "";
                string titleChunk = chunks[index];
                int linkStart = titleChunk.LastIndexOf("<a href=");
                if (linkStart >= 0)
                {
                    title = titleChunk.Substring(linkStart);
                    int tagEnd = title.IndexOf(">");
                    if (tagEnd >= 0)
                    {
                        title = title.Substring(tagEnd + 1);
                        int linkEnd = title.IndexOf("</a>");
                        if (linkEnd >= 0)
                        {
                            title = Util.StripTagsCharArray(title.Substring(0, linkEnd));
                            title = title.Trim();
                        }
                    }
                }

                // uri: everything up to the closing cite tag of the following chunk
                string uriChunk = chunks[index + 1];
                int citeEnd = uriChunk.IndexOf("</cite>");
                if (citeEnd <= 0)
                {
                    continue;
                }

                string uri = uriChunk.Substring(0, citeEnd);

                // drop anything after a " - " separator
                int dashPos = uri.IndexOf(" - ");
                if (dashPos > 0)
                {
                    uri = uri.Substring(0, dashPos);
                }

                uri = Util.StripTagsCharArray(uri).Trim();

                // instead of translating to absolute, prepend http:// to make webrequest constructor happy
                uri = "http://" + uri;

                // skip invalid or blacklisted uris, uris without a dot, and results without a title
                if (!Util.IsValidUri(uri) ||
                    IsBlacklisted(uri) ||
                    !uri.Contains(".") ||
                    title.Length == 0)
                {
                    continue;
                }

                RCRequest result = new RCRequest(this, uri);
                result.AnchorText = title;

                results.AddLast(result);
            }

            return results;
        }

示例#23

0

显示文件

文件： RemoteRequestHandler.cs 项目： kipropesque/RuralCafe

        /// <summary>
        /// Downloads the embedded objects of a page, filtered by the richness setting.
        /// </summary>
        /// <param name="rcRequest">Request page to start from.</param>
        /// <param name="richness">
        /// Richness setting: "normal" keeps every non-blacklisted object, "low" keeps only
        /// objects that are not image pages and are possibly text pages. Any other value
        /// keeps nothing.
        /// </param>
        /// <returns>
        /// List of RCRequests of embedded objects downloaded; an empty list when the
        /// quota is below the low watermark.
        /// </returns>
        private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, string richness)
        {
            LinkedList<RCRequest> filteredEmbeddedObjects = new LinkedList<RCRequest>();

            if (_quota < DEFAULT_LOW_WATERMARK)
            {
                return filteredEmbeddedObjects;
            }

            LinkedList<RCRequest> embeddedObjects = ExtractEmbeddedObjects(rcRequest);

            // XXX: refactor into filter class/method.
            // filter out based on richness
            foreach (RCRequest embeddedObject in embeddedObjects)
            {
                // ignore blacklisted domains
                if (IsBlacklisted(embeddedObject.Uri))
                {
                    continue;
                }

                bool keepObject;
                if (richness.Equals("normal"))
                {
                    keepObject = true;
                }
                else if (richness.Equals("low"))
                {
                    // XXX: logic here is ugly, and not perfect
                    // XXX: since the implementation of PossiblyATextPage is incomplete
                    // keep it only if it is not an image and could possibly be a text page
                    keepObject = !IsImagePage(embeddedObject.Uri) && PossiblyATextPage(embeddedObject.Uri);
                }
                else
                {
                    keepObject = false;
                }

                if (!keepObject)
                {
                    continue;
                }

                // Number only the objects that are actually kept, so that the numbers are
                // contiguous (0 .. Count - 1). Previously filtered-out objects consumed
                // numbers too, so a kept object could get a number beyond the size of the
                // per-child event array that DownloadObjectsInParallel allocates.
                // (presumably ChildNumber indexes that array - confirm in RCRequest)
                embeddedObject.ChildNumber = filteredEmbeddedObjects.Count;
                filteredEmbeddedObjects.AddLast(embeddedObject);
            }

            return DownloadObjectsInParallel(rcRequest, filteredEmbeddedObjects);
        }

C# (CSharp) RuralCafe RCRequest示例