/// <summary>
/// Placeholder constructor: builds a handler whose only purpose is to be
/// compared against queued requests (request matching).
/// It is admittedly heavyweight to create a whole handler just for a lookup.
/// </summary>
/// <param name="itemId">Identifier handed straight to the RCRequest constructor.</param>
private LocalRequestHandler(string itemId)
{
    // Deliberately no URI validation and no RuralCafe query parsing here:
    // the dummy only has to carry the wrapped request.
    RCRequest placeholderRequest = new RCRequest(itemId);
    _rcRequest = placeholderRequest;
}
/// <summary>
/// Extracts the embedded objects referenced by a page, filters them by
/// blacklist and richness, and downloads the remaining ones in parallel.
/// </summary>
/// <param name="rcRequest">Request of the page the objects belong to.</param>
/// <param name="baseUri">Uri of the page, passed to the extractor together with the content.</param>
/// <param name="htmlContent">The HTML content of the page.</param>
/// <param name="richness">Richness setting that decides which objects are kept.</param>
/// <returns>
/// The list returned by DownloadObjectsInParallel, or an empty list when the
/// handler was told to stop or the quota is below the low watermark.
/// </returns>
private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, Uri baseUri, string htmlContent, Richness richness)
{
    // Nothing to do when aborted or (almost) out of quota.
    if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
    {
        return new LinkedList<RCRequest>();
    }

    LinkedList<Uri> accepted = new LinkedList<Uri>();

    // XXX: refactor into filter class/method.
    foreach (Uri candidate in HtmlUtils.ExtractEmbeddedObjects(baseUri, htmlContent))
    {
        string candidateText = candidate.ToString();

        // Blacklisted domains are never downloaded.
        if (IsBlacklisted(candidateText))
        {
            continue;
        }

        // Normal richness keeps everything; low richness keeps text pages only;
        // any other setting keeps nothing.
        bool keep;
        if (richness == Richness.Normal)
        {
            keep = true;
        }
        else if (richness == Richness.Low)
        {
            keep = IsATextPage(candidateText);
        }
        else
        {
            keep = false;
        }

        if (keep)
        {
            accepted.AddLast(candidate);
        }
    }

    return DownloadObjectsInParallel(rcRequest, accepted);
}
/// <summary>
/// Recursively downloads a page, its embedded objects, and its outlinks,
/// adding everything that fits into the quota to the package.
/// </summary>
/// <param name="rcRequest">Requested page to start from.</param>
/// <param name="richness">Richness setting.</param>
/// <param name="depth">Current recursion depth; 0 is the top-level request.</param>
/// <returns>
/// False when the request was aborted, the quota is below the low watermark,
/// or downloading this page failed; true otherwise. Results of the recursive
/// calls for outlinks are not reflected in the return value.
/// </returns>
public bool RecursivelyDownloadPage(RCRequest rcRequest, Richness richness, int depth)
{
    if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
    {
        // Send error page if we're on top level
        if (depth == 0)
        {
            SendErrorPage(HttpStatusCode.InternalServerError, "Request aborted or it does not fit in quota.");
        }
        return false;
    }

    // Reduce the timer to whatever is left of the overall time budget.
    // NOTE(review): this sets the timeout on the handler's own RCRequest, not on
    // the rcRequest parameter that is downloaded below - confirm this is intended
    // for depth > 0.
    DateTime currTime = DateTime.Now;
    DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
    if (endTime.CompareTo(currTime) > 0)
    {
        RCRequest.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
    }
    else
    {
        RCRequest.GenericWebRequest.Timeout = 0;
    }

    // Only download for POST/... or not already existing items
    if (!IsGetOrHeadHeader() || !_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
    {
        // Download!
        try
        {
            // There is no index on the remote side anyway
            rcRequest.DownloadToCache(false);
        }
        catch (Exception e)
        {
            Logger.Warn("[depth = " + depth + "] error downloading: " + rcRequest.Uri + " " + e.Message);

            // Send error page if we're on top level
            if (depth == 0)
            {
                // Forward the upstream HTTP status when the failure carries one,
                // otherwise fall back to 500.
                HttpStatusCode statusCode = HttpStatusCode.InternalServerError;
                WebException webException = e as WebException;
                if (webException != null)
                {
                    HttpWebResponse response = webException.Response as HttpWebResponse;
                    if (response != null)
                    {
                        statusCode = response.StatusCode;
                    }
                }
                SendErrorPage(statusCode, e.Message);
            }
            return false;
        }
    }
    else
    {
        Logger.Debug("Already existed: " + rcRequest.Uri);
    }

    // add to the package
    if (_package.Pack(this, rcRequest, ref _quota))
    {
        Logger.Debug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
    }

    // add a new request for the old location if it was redirected. This will then
    // get the 301 file from the cache, so the local proxy does not need to send
    // another request to the remote proxy to find that out.
    if (rcRequest.UriBeforeRedirect != null)
    {
        Logger.Debug("Redirected: Also packing old URI with a 301 file.");
        RCRequest rc301 = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(rcRequest.UriBeforeRedirect));
        _package.Pack(this, rc301, ref _quota);
    }

    // Getting embedded objects and recursing only makes sense for html pages.
    if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
    {
        return true;
    }

    Uri baseUri = new Uri(rcRequest.Uri);
    string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLower();

    // get the embedded content of the page
    DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent, richness);

    // Don't recurse if we're on the deepest layer allowed
    if (depth == Properties.Settings.Default.DEFAULT_DEPTH - 1)
    {
        return true;
    }

    // recurse into every outlink
    LinkedList<Uri> resultLinkUris = HtmlUtils.ExtractLinks(baseUri, htmlContent);
    foreach (Uri uri in resultLinkUris)
    {
        RCRequest currRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
        RecursivelyDownloadPage(currRequest, richness, depth + 1);
    }
    return true;
}
/// <summary>
/// Builds the RCRequest for this handler from the requested URI.
/// </summary>
/// <param name="requestedUri">URI that was requested; must pass Util.IsValidUri.</param>
/// <param name="refererUri">URI of the referer, stored on the request and on its web request.</param>
/// <param name="recvString">Raw received string, stored on the request.</param>
/// <returns>True when the request object was created, false when the URI is not valid.</returns>
protected bool CreateRequest(string requestedUri, string refererUri, string recvString)
{
    // Reject anything that is not a valid URI before touching any state.
    if (!Util.IsValidUri(requestedUri))
    {
        return false;
    }

    // The request is created with an empty anchor text.
    _rcRequest = new RCRequest(this, requestedUri, "", refererUri);
    _rcRequest.GenericWebRequest.Referer = refererUri;
    _rcRequest._recvString = recvString;
    return true;
}
/// <summary>
/// Downloads a set of requests one after another (in series) and packs them.
/// </summary>
/// <param name="parentRequest">Root request; its start time bounds the time budget of every child.</param>
/// <param name="children">Children requests to be downloaded.</param>
/// <returns>
/// The list returned by the package for the children, or an empty list when
/// there are no children or an exception occurred.
/// </returns>
private LinkedList<RCRequest> DownloadObjects(RCRequest parentRequest, LinkedList<RCRequest> children)
{
    LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

    if (children.Count == 0)
    {
        return addedObjects;
    }

    parentRequest.ResetEvents = new ManualResetEvent[children.Count];

    try
    {
        // Walk the linked list directly: indexing it with ElementAt(i) would
        // rescan from the head on every iteration.
        foreach (RCRequest currChild in children)
        {
            // make sure we haven't downloaded this before
            if (_package.RCRequests.Contains(currChild))
            {
                // skip it
                // NOTE(review): the parallel variants call SetDone() on the child,
                // this one calls it on the parent - confirm which is intended.
                parentRequest.SetDone();
                continue;
            }

            // reduce the timer to what is left of the parent's time budget
            DateTime currTime = DateTime.Now;
            DateTime endTime = parentRequest.StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
            if (endTime.CompareTo(currTime) > 0)
            {
                currChild.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
            }
            else
            {
                currChild.GenericWebRequest.Timeout = 0;
            }

            // download the page
            currChild.DownloadToCache(false);

            // stop downloading further children once the handler timed out
            if (IsTimedOut())
            {
                break;
            }
        }

        addedObjects = _package.Pack(this, children, ref _quota);
    }
    catch (Exception e)
    {
        LogDebug("unable to download embeddedObjects: " + e.StackTrace + " " + e.Message);
    }

    return addedObjects;
}
/// <summary>
/// Queues this request.
/// Reads the query parameters "u" (user id), "t" (target name), "a" (target
/// URI) and "r" (referer URI) from the request URI, replaces the handler's
/// request with one for the target and queues the handler on the local proxy.
/// A missing or non-numeric "u", or a missing "a", is answered with the
/// "malformed add request" error page.
/// </summary>
private void AddRequest()
{
    // Parse parameters
    NameValueCollection qscoll = Util.ParseHtmlQuery(RequestUri);

    // A missing or non-numeric user id is a malformed request, not a crash.
    int userId;
    if (!Int32.TryParse(qscoll.Get("u"), out userId))
    {
        SendErrorPage(HTTP_NOT_FOUND, "malformed add request", "");
        return;
    }

    string targetName = qscoll.Get("t");
    string targetUri = qscoll.Get("a");
    string refererUri = qscoll.Get("r");

    if (targetName == null)
    {
        targetName = "fake title";
    }
    if (targetUri == null)
    {
        // error
        SendErrorPage(HTTP_NOT_FOUND, "malformed add request", "");
        return;
    }
    if (refererUri == null)
    {
        // without a referer the target refers to itself
        refererUri = targetUri;
    }

    _originalRequestUri = RequestUri;

    // preserve the original request status (for HandleLogRequest)
    int originalRequestStatus = _rcRequest.RequestStatus;
    _rcRequest = new RCRequest(this, targetUri, targetName, refererUri);
    _rcRequest.RequestStatus = originalRequestStatus;

    ((RCLocalProxy)_proxy).QueueRequest(userId, this);

    SendOkHeaders("text/html");
    SendMessage(RefererUri);
}
/// <summary>
/// Unpacks the package contents into the cache and indexes them.
/// The decompressed package consists of an index ("uri size" entries
/// separated by CRLF) followed by the concatenated file contents in index order.
/// </summary>
/// <param name="requestHandler">Calling handler for this method.</param>
/// <param name="indexPath">Path to the index.</param>
/// <returns>
/// Total content size of the entries processed so far (taken from the index).
/// The value is negated when the package ends before a file is complete.
/// Processing stops early, returning the running total, when an entry cannot
/// be parsed or its cache file cannot be prepared.
/// </returns>
public static long Unpack(LocalRequestHandler requestHandler, string indexPath)
{
    string packageIndexSizeStr = requestHandler.RCRequest.GenericWebResponse.GetResponseHeader("Package-IndexSize");
    string packageContentSizeStr = requestHandler.RCRequest.GenericWebResponse.GetResponseHeader("Package-ContentSize");
    long packageIndexSize = Int64.Parse(packageIndexSizeStr);
    long packageContentSize = Int64.Parse(packageContentSizeStr);
    string packageFileName = requestHandler.PackageFileName;
    string unpackedPackageFileName = packageFileName.Replace(".gzip", "");

    GZipWrapper.GZipDecompress(packageFileName, unpackedPackageFileName, packageIndexSize + packageContentSize);

    long unpackedBytes = 0;

    // 'using' guarantees the package stream is closed on every return path.
    using (FileStream packageFs = new FileStream(unpackedPackageFileName, FileMode.Open))
    {
        // Read the package index. Stream.Read may return fewer bytes than
        // requested, so keep reading until the index is complete or the stream ends.
        Byte[] packageIndexBuffer = new Byte[packageIndexSize];
        int indexBytesRead = 0;
        while (indexBytesRead < packageIndexBuffer.Length)
        {
            int chunk = packageFs.Read(packageIndexBuffer, indexBytesRead, packageIndexBuffer.Length - indexBytesRead);
            if (chunk == 0)
            {
                break;
            }
            indexBytesRead += chunk;
        }

        System.Text.UTF8Encoding enc = new System.Text.UTF8Encoding();
        string package = enc.GetString(packageIndexBuffer, 0, indexBytesRead);

        // split the index into its entries
        string[] entrySeparator = new string[] { "\r\n" };
        string[] fieldSeparator = new string[] { " " };
        string[] packageContentArr = package.Split(entrySeparator, StringSplitOptions.RemoveEmptyEntries);

        Byte[] buffer = new Byte[1024];

        foreach (string entry in packageContentArr)
        {
            string[] packageEntryArr = entry.Split(fieldSeparator, StringSplitOptions.RemoveEmptyEntries);
            string currUri = packageEntryArr[0];
            long currFileSize;
            try
            {
                currFileSize = Int64.Parse(packageEntryArr[1]);
            }
            catch (Exception e)
            {
                requestHandler.LogDebug("problem unpacking: " + entry + " " + e.StackTrace + " " + e.Message);
                return unpackedBytes;
            }

            if (!Util.IsValidUri(currUri))
            {
                requestHandler.LogDebug("problem unpacking: " + currUri);
                return unpackedBytes;
            }

            RCRequest rcRequest = new RCRequest(requestHandler, currUri);

            unpackedBytes += currFileSize;

            // remember whether the file already existed (for indexing purposes only)
            bool existed = new FileInfo(rcRequest.CacheFileName).Exists;

            // try to delete the old version
            if (!Util.DeleteFile(rcRequest.CacheFileName))
            {
                return unpackedBytes;
            }

            // create directory if it doesn't exist
            if (!Util.CreateDirectoryForFile(rcRequest.CacheFileName))
            {
                return unpackedBytes;
            }

            // create the file
            FileStream currFileFS = Util.CreateFile(rcRequest.CacheFileName);
            if (currFileFS == null)
            {
                return unpackedBytes;
            }

            // Copy exactly currFileSize bytes from the package into the file.
            // Never requesting more than what is left of the current file keeps
            // the package stream positioned at the start of the next file, so no
            // carry-over buffer between files is needed.
            long bytesReadOfCurrFile = 0;
            using (currFileFS)
            {
                while (bytesReadOfCurrFile < currFileSize)
                {
                    int bytesWanted = (int)Math.Min((long)buffer.Length, currFileSize - bytesReadOfCurrFile);
                    int bytesRead = packageFs.Read(buffer, 0, bytesWanted);
                    if (bytesRead == 0)
                    {
                        // package ended prematurely
                        break;
                    }
                    currFileFS.Write(buffer, 0, bytesRead);
                    bytesReadOfCurrFile += bytesRead;
                }
            }

            if (bytesReadOfCurrFile != currFileSize)
            {
                // ran out of bytes for this file
                requestHandler.LogDebug("error, unexpected package size: " + rcRequest.CacheFileName +
                    "(" + bytesReadOfCurrFile + " / " + currFileSize + ")");
                return unpackedBytes * -1;
            }

            // add the file to Lucene, unless an older copy was already there
            if (Util.IsParseable(rcRequest))
            {
                string document = Util.ReadFileAsString(rcRequest.CacheFileName);
                string title = Util.GetPageTitle(document);
                string content = Util.GetPageContent(document);

                if (!existed)
                {
                    IndexWrapper.IndexDocument(indexPath, "Content-Type: text/html", rcRequest.Uri, title, content);
                }
            }
        }
    }

    return unpackedBytes;
}
/// <summary>
/// Creates a RuralCafe request around an existing web request and derives
/// the cache and package file names from its URI and HTTP method.
/// </summary>
/// <param name="proxy">The proxy for the request; supplies the cache and packages paths.</param>
/// <param name="request">The underlying web request.</param>
/// <param name="anchorText">Text of the anchor tag.</param>
/// <param name="referrerUri">URI of the referer; it is trimmed before being stored.</param>
/// <param name="body">The body for POSTs, ...</param>
public RCRequest(RCProxy proxy, HttpWebRequest request, string anchorText, string referrerUri, byte[] body)
{
    _anchorText = anchorText;
    _refererUri = referrerUri.Trim();
    _status = RequestHandler.Status.Pending;

    // Configure the wrapped web request.
    _webRequest = request;
    _webRequest.Timeout = RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT;
    _webRequest.Referer = _refererUri;
    _body = body;

    // File names: the relative cache name looks like ./GET/2876/627/...
    string uriFilePath = CacheManager.UriToFilePath(_webRequest.RequestUri.ToString());
    string hashDirectory = CacheManager.GetHashPath(uriFilePath);
    string separator = Path.DirectorySeparatorChar.ToString();

    _relCacheFileName = request.Method + separator + hashDirectory + uriFilePath;
    // The request id is the relative cache name without directory separators.
    _requestId = _relCacheFileName.Replace(separator, "");
    _cacheFileName = proxy.CachePath + _relCacheFileName;
    _packageFileName = proxy.PackagesPath + hashDirectory + uriFilePath + ".gzip";

    _fileSize = 0;
    _proxy = proxy;

    // Root request is this, unless overridden.
    _rootRequest = this;

    // Start and finish time begin as the same instant.
    _startTime = DateTime.Now;
    _finishTime = _startTime;
}
/// <summary>
/// Main logic of RuralCafe RPRequestHandler.
/// Called by Go() in the base RequestHandler class.
/// Downloads the requested page recursively into a package and sends the
/// package back as the response.
/// </summary>
/// <returns>
/// Status.Completed (as int) when a non-empty response package was sent,
/// Status.Failed (as int) in every other case.
/// </returns>
public override int HandleRequest()
{
    // The richness and the quota are static for now; parameterizing them
    // from the UI (search fields "richness" / "quota") is not implemented.
    string richness = DEFAULT_RICHNESS;

    string requestUri = _rcRequest.Uri;
    if (requestUri.Trim().Length == 0)
    {
        return (int)Status.Failed;
    }

    // The extension is computed but currently not used; the call is kept as is.
    string fileExtension = Util.GetFileExtension(requestUri);

    if (!requestUri.StartsWith("http://"))
    {
        requestUri = "http://" + requestUri;
    }

    if (!IsCacheable())
    {
        // XXX: not handled at the moment, technically nothing should be "not cacheable" though.
        LogDebug("not cacheable, failed.");
        return (int)Status.Failed;
    }

    // remove RuralCafe stuff from the request
    _rcRequest = new RCRequest(this, requestUri);

    if (!RecursivelyDownloadPage(_rcRequest, richness, 0))
    {
        return (int)Status.Failed;
    }

    _rcRequest.FileSize = SendResponsePackage();
    if (_rcRequest.FileSize > 0)
    {
        return (int)Status.Completed;
    }

    return (int)Status.Failed;
}
/*
// benchmarking stuff (disabled)
public void PrefetchAnalysis(string richness, int depth)
{
    LogDebug("Running Benchmarker");

    // XXX: should add a parameter to always download or just read from cache
    // convert to Uri format
    //string pageUri = _webRequestUri;

    LogRequest();

    long bytesDownloaded = _rcRequest.DownloadToCache();

    FileInfo f;
    try
    {
        f = new FileInfo(_rcRequest.CacheFileName);
        if (bytesDownloaded < 0 || !f.Exists)
        {
            return;
        }
    }
    catch (Exception e)
    {
        LogDebug("problem getting file info " + e.StackTrace + " " + e.Message);
        return;
    }

    // get the embedded content of the search result page
    LinkedList<RCRequest> objectsFound = DownloadEmbeddedObjects(_rcRequest, richness);
    // benchmarking: store the number of images found
    //imagesOnResultsPage.Add(objectsFound.Count);

    // recursively download pages
    LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);
    // benchmarking: store the number of links found
    //linksOnResultsPage.Add(resultLinkUris.Count);

    foreach (RCRequest linkObject in resultLinkUris)
    {
        bytesDownloaded = linkObject.DownloadToCache();
        if (bytesDownloaded > -1 && f.Exists)
        {
            linkObject.RequestStatus = (int)Status.Completed;
        }
        try
        {
            f = new FileInfo(linkObject.CacheFileName);
        }
        catch (Exception)
        {
            linkObject.RequestStatus = (int)Status.Failed;
            continue;
        }
        if (linkObject.RequestStatus == (int)Status.Failed || !f.Exists)
        {
            linkObject.RequestStatus = (int)Status.Failed;
            continue;
        }

        // XXX: hackery
        // make a copy of this file
        try
        {
            // create directory if it doesn't exist
            if (!Util.CreateDirectoryForFile(linkObject.CacheFileName))
            {
                return;
            }
            // create directory if it doesn't exist
            if (!Util.CreateDirectoryForFile("ZZZZZZ\\" + linkObject.CacheFileName))
            {
                return;
            }

            File.Delete("ZZZZZZ\\" + linkObject.CacheFileName);
            File.Copy(linkObject.CacheFileName, "ZZZZZZ\\" + linkObject.CacheFileName);

            // skip parseable check
            if (!Util.IsParseable(linkObject))
            {
                continue;
            }

            // get the embedded content of the search result page
            objectsFound = DownloadEmbeddedObjects(linkObject, richness);
            // benchmarking: store the number of images on the page
            //imagesOnTargetPage.Add(objectsFound.Count);

            File.Delete(linkObject.CacheFileName);
        }
        catch (Exception e)
        {
            LogDebug("problem downloading a file or something " + e.StackTrace + " " + e.Message);
        }
    }
}*/

/*
// XXX: obsolete (currently not in use)
/// <summary>
/// Prefetch a search page in breadth first search order.
/// </summary>
/// <param name="richness">Richness of the prefetch.</param>
/// <param name="depth">Depth to prefetch.</param>
/// <returns>Status.</returns>
private bool PrefetchBFS(string richness, int depth)
{
    // benchmarking
    //downloadPagesStart = DateTime.Now;

    LogDebug("Running BFS");

    // reconstruct _rcRequest
    string pageUri = _rcRequest.TranslateRCSearchToGoogle();
    if (!Util.IsValidUri(pageUri))
    {
        return false;
    }
    _rcRequest = new RCRequest(this, pageUri);
    //_rcRequest.SetProxy(_proxy.GatewayProxy, WEB_REQUEST_DEFAULT_TIMEOUT);

    // download the file
    long bytesDownloaded = _rcRequest.DownloadToCache();
    if (bytesDownloaded < 0)
    {
        LogDebug("Error downloading: " + _rcRequest.Uri);
        return false;
    }

    // add to the package
    //if (
    _package.Pack(this, _rcRequest, ref _quota);//)
    //{
    //    LogDebug("packed: " + RequestUri + " " + _rcRequest.FileSize + " bytes, " + _quota + " left");
    //}

    // check quota
    if (_quota < DEFAULT_LOW_WATERMARK)
    {
        // benchmarking
        //downloadPagesEnd = DateTime.Now;

        return true;
    }

    // setup the initial frontier
    LinkedList<RCRequest> currentBFSFrontier = ExtractGoogleResults(_rcRequest);
    LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();

    // run BFS
    while (depth < DEFAULT_MAX_DEPTH)
    {
        // download objects in parallel
        currentBFSFrontier = DownloadObjectsInParallel(_rcRequest, currentBFSFrontier);

        // download embedded objects for each downloaded object
        foreach (RCRequest currObject in currentBFSFrontier)
        {
            // download embedded objects
            DownloadEmbeddedObjects(currObject, richness);
        }

        if (_quota < DEFAULT_LOW_WATERMARK)
        {
            // quota met
            break;
        }

        // get the next frontier from the current ones
        nextBFSFrontier = GetNewBFSFrontier(currentBFSFrontier);
        currentBFSFrontier = nextBFSFrontier;
        depth++;
    }

    return true;

    //downloadPagesEnd = DateTime.Now;
}

/// <summary>
/// Gets the next BFS frontier from the current frontier.
/// </summary>
/// <param name="currentBFSFrontier">Current BFS frontier.</param>
/// <returns>Next BFS frontier as a LinkedList.</returns>
private LinkedList<RCRequest> GetNewBFSFrontier(LinkedList<RCRequest> currentBFSFrontier)
{
    LinkedList<RCRequest> nextBFSFrontier = new LinkedList<RCRequest>();
    LinkedList<RCRequest> extractedLinks;

    // go through the current frontier and collect the links
    foreach (RCRequest rcRequest in currentBFSFrontier)
    {
        // get all the links
        extractedLinks = ExtractLinks(rcRequest);

        // add to the frontier if we haven't seen it recently
        foreach (RCRequest extractedLink in extractedLinks)
        {
            // ignore blacklisted domains
            if (IsBlacklisted(extractedLink.Uri))
            {
                continue;
            }

            if (!currentBFSFrontier.Contains(extractedLink) &&
                !nextBFSFrontier.Contains(extractedLink))
            {
                nextBFSFrontier.AddLast(extractedLink);
            }
        }
    }

    return nextBFSFrontier;
}*/

/// <summary>
/// Downloads a page, packs it, downloads its embedded objects and then
/// descends into every link found on the page.
/// </summary>
/// <param name="rcRequest">Requested page to start from.</param>
/// <param name="richness">Richness setting.</param>
/// <param name="depth">Current depth; recursion stops at DEFAULT_MAX_DEPTH.</param>
/// <returns>
/// False when the quota is below the low watermark, the maximum depth is
/// reached, the request is not parseable, or the download fails; true
/// otherwise. Results of the recursive calls are not reflected in the return value.
/// </returns>
private bool RecursivelyDownloadPage(RCRequest rcRequest, string richness, int depth)
{
    // Stop conditions: quota used up, depth limit hit, or not a parseable
    // page (it is just some URL at this point).
    bool quotaExhausted = _quota < DEFAULT_LOW_WATERMARK;
    if (quotaExhausted || depth == DEFAULT_MAX_DEPTH || !Util.IsParseable(rcRequest))
    {
        return false;
    }

    // Shrink the timeout to what is left of the handler's time budget.
    DateTime now = DateTime.Now;
    DateTime deadline = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
    int remainingMilliseconds = deadline.CompareTo(now) > 0
        ? (int)deadline.Subtract(now).TotalMilliseconds
        : 0;
    RCRequest.GenericWebRequest.Timeout = remainingMilliseconds;

    // Fetch the page itself; a negative byte count signals failure.
    if (rcRequest.DownloadToCache(false) < 0)
    {
        LogDebug("[depth = " + depth + "] error downloading: " + rcRequest.Uri);
        return false;
    }

    // Add the page to the package.
    bool packed = _package.Pack(this, rcRequest, ref _quota);
    if (packed)
    {
        LogDebug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
    }

    // Fetch everything embedded in the page.
    DownloadEmbeddedObjects(rcRequest, richness);

    // Descend into the outgoing links.
    foreach (RCRequest outLink in ExtractLinks(rcRequest))
    {
        RecursivelyDownloadPage(outLink, richness, depth + 1);
    }

    return true;
}
/// <summary>
/// Collects the references found in a cached page for a set of
/// (tag, attribute) pairs and returns them as requests without duplicates.
/// The page is lower-cased before it is parsed.
/// XXX: should replace and obsolete this with a better HTML parser.
/// </summary>
/// <param name="rcRequest">Page to parse; its cache file is read from disk.</param>
/// <param name="tagAttributes">
/// Two-column table: column 0 is the tag name, column 1 the attribute whose
/// value is taken as the reference.
/// </param>
/// <returns>List of references.</returns>
LinkedList<RCRequest> ExtractReferences(RCRequest rcRequest, string[,] tagAttributes)
{
    LinkedList<RCRequest> references = new LinkedList<RCRequest>();
    string pageText = Util.ReadFileAsString(rcRequest.CacheFileName).ToLower();

    int pairCount = tagAttributes.GetLength(0);
    for (int row = 0; row < pairCount; row++)
    {
        string tagName = tagAttributes[row, 0];
        string attributeName = tagAttributes[row, 1];

        // A fresh parser per pair, so every pass starts at the top of the page.
        HtmlParser parser = new HtmlParser(pageText);
        HtmlTag currentTag;
        while (parser.ParseNext(tagName, out currentTag))
        {
            // Tags without the wanted attribute carry no reference.
            string reference;
            if (!currentTag.Attributes.TryGetValue(attributeName, out reference))
            {
                continue;
            }

            // Convert to an absolute address before treating it as a URI.
            reference = TranslateToAbsoluteAddress(rcRequest.Uri, reference);

            // XXX: need to make sure the URI isn't going to cause an exception to be thrown
            if (!Util.IsValidUri(reference))
            {
                continue;
            }

            RCRequest candidate = new RCRequest(this, reference);
            if (!references.Contains(candidate))
            {
                references.AddLast(candidate);
            }
        }
    }

    return references;
}
/// <summary>
/// Returns the links on a page by running ExtractReferences() with the
/// link tag/attribute table of HtmlParser.
/// XXX: not completely implemented, need non HTML/"a href=" references.
/// </summary>
/// <param name="rcRequest">Page to parse.</param>
/// <returns>List of requests, one per extracted link.</returns>
LinkedList<RCRequest> ExtractLinks(RCRequest rcRequest)
{
    string[,] linkTagAttributes = HtmlParser.LinkTagAttributes;
    return ExtractReferences(rcRequest, linkTagAttributes);
}
/// <summary>
/// Returns the embedded objects on a page by running ExtractReferences()
/// with the embedded-object tag/attribute table of HtmlParser.
/// XXX: not completely implemented, need non HTML/"src=" references.
/// </summary>
/// <param name="rcRequest">Page to parse.</param>
/// <returns>List of requests, one per extracted embedded object.</returns>
LinkedList<RCRequest> ExtractEmbeddedObjects(RCRequest rcRequest)
{
    string[,] embeddedObjectTagAttributes = HtmlParser.EmbeddedObjectTagAttributes;
    return ExtractReferences(rcRequest, embeddedObjectTagAttributes);
}
/// <summary>
/// Downloads a set of requests in parallel using the ThreadPool, waits for
/// them and packs the children.
/// </summary>
/// <param name="parentRequest">Root request; owns the array of reset events shared with all children.</param>
/// <param name="children">Children requests to be downloaded.</param>
/// <returns>
/// The list returned by the package for the children, or an empty list when
/// there are no children or an exception occurred.
/// </returns>
private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<RCRequest> children)
{
    // NOTE(review): this is a process-wide ThreadPool setting that is re-applied
    // on every call, and its return value is ignored.
    ThreadPool.SetMaxThreads(4, 4);

    LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

    if (children.Count == 0)
    {
        return addedObjects;
    }

    parentRequest.ResetEvents = new ManualResetEvent[children.Count];

    try
    {
        // Queue up worker threads to download the children. The list is walked
        // with foreach plus a counter: indexing a LinkedList with ElementAt(i)
        // would rescan from the head on every iteration.
        int childIndex = 0;
        foreach (RCRequest currChild in children)
        {
            // set the resetEvent
            currChild.ResetEvents = parentRequest.ResetEvents;
            parentRequest.ResetEvents[childIndex] = new ManualResetEvent(false);
            childIndex++;

            // make sure we haven't downloaded this before
            if (_package.RCRequests.Contains(currChild))
            {
                // skip it
                currChild.SetDone();
                continue;
            }

            // download the page
            ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), currChild);
        }

        // wait for timeout
        WaitAll(parentRequest.ResetEvents);

        addedObjects = _package.Pack(this, children, ref _quota);
    }
    catch (Exception e)
    {
        LogDebug("unable to download embeddedObjects: " + e.StackTrace + " " + e.Message);
    }

    return addedObjects;
}
/// <summary>
/// Downloads a set of URIs in parallel using the ThreadPool, waits for them
/// and packs the resulting requests.
/// </summary>
/// <param name="parentRequest">Root request; owns the array of reset events shared with all children.</param>
/// <param name="childObjects">Children URIs to be downloaded.</param>
/// <returns>
/// The list returned by the package for the created requests. An empty list
/// when the handler was told to stop or there are no children. When an
/// exception occurs, the requests created up to that point are returned.
/// </returns>
private LinkedList<RCRequest> DownloadObjectsInParallel(RCRequest parentRequest, LinkedList<Uri> childObjects)
{
    LinkedList<RCRequest> addedObjects = new LinkedList<RCRequest>();

    if (_killYourself || childObjects.Count == 0)
    {
        return addedObjects;
    }

    parentRequest.ResetEvents = new ManualResetEvent[childObjects.Count];

    try
    {
        // Queue up worker threads to download the URIs. The list is walked with
        // foreach plus a counter: indexing a LinkedList with ElementAt(i) would
        // rescan from the head on every iteration.
        int childIndex = 0;
        foreach (Uri childUri in childObjects)
        {
            // create the RCRequest for the object
            RCRequest currChildObject = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(childUri));
            currChildObject.ChildNumber = childIndex;
            // Set the root request.
            currChildObject.RootRequest = parentRequest;
            addedObjects.AddLast(currChildObject);

            // set the resetEvent
            currChildObject.ResetEvents = parentRequest.ResetEvents;
            parentRequest.ResetEvents[childIndex] = new ManualResetEvent(false);
            childIndex++;

            // make sure we haven't downloaded this before
            if (_package.RCRequests.Contains(currChildObject))
            {
                // skip it
                currChildObject.SetDone();
                continue;
            }

            // download the page
            ThreadPool.QueueUserWorkItem(new WaitCallback(DownloadPageWorkerThread), currChildObject);
        }

        // wait for timeout
        Utils.WaitAll(parentRequest.ResetEvents);

        addedObjects = _package.Pack(this, addedObjects, ref _quota);
    }
    catch (Exception e)
    {
        Logger.Warn("unable to download embeddedObjects.", e);
    }

    return addedObjects;
}
/// <summary>
/// Tells whether RuralCafe can parse the cached file of a request, i.e.
/// whether the content type of the cache file contains "htm".
/// </summary>
/// <param name="rcRequest">A RCRequest object; only its cache file name is used.</param>
/// <returns>True when the content type contains "htm", false otherwise.</returns>
public static bool IsParseable(RCRequest rcRequest)
{
    string contentType = GetContentTypeOfFile(rcRequest.CacheFileName);
    return contentType.Contains("htm");
}
/// <summary>
/// Tries to add a RCRequest to the package without exceeding the quota.
/// Only called by the remote proxy.
/// XXX: quota is currently simple algorithm.
/// XXX: should be changed to check for compressed size rather than actual size.
/// </summary>
/// <param name="requestHandler">Calling handler for this method; used for logging.</param>
/// <param name="requestObject">RCRequest to add; its FileSize is refreshed from the cache file.</param>
/// <param name="quota">Remaining quota; reduced by the file size when the request is packed.</param>
/// <returns>
/// True iff the request has been packed. False when it is already in the
/// package, its file size is not positive, or it does not fit into the quota.
/// </returns>
public bool Pack(RemoteRequestHandler requestHandler, RCRequest requestObject, ref long quota)
{
    bool alreadyPacked = _rcRequests.Contains(requestObject);
    if (alreadyPacked)
    {
        requestHandler.Logger.Debug("object exists in package: " + requestObject.Uri);
        return false;
    }

    // Refresh the size from the cache file; nothing to pack without content.
    long size = Utils.GetFileSize(requestObject.CacheFileName);
    requestObject.FileSize = size;
    if (requestObject.FileSize <= 0)
    {
        return false;
    }

    // quota check
    long quotaAfterPacking = quota - requestObject.FileSize;
    if (quotaAfterPacking < 0)
    {
        requestHandler.Logger.Debug("object doesn't fit in quota: " + requestObject.Uri);
        return false;
    }

    _rcRequests.Add(requestObject);
    quota = quotaAfterPacking;
    return true;
}
/// <summary>
/// Tries to add a RCRequest to the package without exceeding the quota.
/// Only called by the remote proxy.
/// XXX: quota is currently simple algorithm.
/// XXX: should be changed to check for compressed size rather than actual size.
/// </summary>
/// <param name="requestHandler">Calling handler for this method; used for logging.</param>
/// <param name="rcRequest">RCRequest to add; its FileSize is refreshed from the cache file.</param>
/// <param name="quota">Remaining quota; reduced by the file size when the request is packed.</param>
/// <returns>
/// True when the request has been packed. False when its URI contains a
/// space, it is already in the package, its file size is not positive, or it
/// does not fit into the quota.
/// </returns>
public bool Pack(RemoteRequestHandler requestHandler, RCRequest rcRequest, ref long quota)
{
    // URIs with spaces are rejected outright.
    bool uriHasSpace = rcRequest.Uri.Contains(' ');
    if (uriHasSpace)
    {
        requestHandler.LogDebug("object contains spaces: " + rcRequest.Uri);
        return false;
    }

    bool alreadyPacked = _rcRequests.Contains(rcRequest);
    if (alreadyPacked)
    {
        requestHandler.LogDebug("object exists in package: " + rcRequest.Uri);
        return false;
    }

    // Refresh the size from the cache file; nothing to pack without content.
    long size = Util.GetFileSize(rcRequest.CacheFileName);
    rcRequest.FileSize = size;
    if (rcRequest.FileSize <= 0)
    {
        return false;
    }

    // quota check
    long quotaAfterPacking = quota - rcRequest.FileSize;
    if (quotaAfterPacking < 0)
    {
        requestHandler.LogDebug("object doesn't fit in quota: " + rcRequest.Uri);
        return false;
    }

    _rcRequests.AddLast(rcRequest);
    quota = quotaAfterPacking;
    return true;
}
/// <summary>
/// Builds the live search result XML for a query from a Google result page.
/// The GET request is sent to <![CDATA[request/search.xml?p=1&s=searchstring]]> where
/// p is the current page number (pages are numbered 1, 2, 3, ...) and
/// s is the search query string.
///
/// The results come from Google, so there are as many as Google returns, usually 10.
/// </summary>
/// <param name="pageNumber">Page number of the result page to fetch.</param>
/// <param name="queryString">
/// Search query. A null, empty or whitespace-only query yields an empty result set.
/// </param>
/// <returns>
/// A response carrying the search XML. When the query is blank or the proxy is offline the
/// XML has total="0"; when the result page could not be downloaded or parsed, an empty
/// response is returned.
/// </returns>
public Response ServeRCLiveResultPage(int pageNumber, string queryString)
{
    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.AppendChild(xmlDoc.CreateXmlDeclaration("1.0", "UTF-8", String.Empty));
    XmlElement searchXml = xmlDoc.CreateElement("search");
    xmlDoc.AppendChild(searchXml);

    // A missing query is treated like an empty one instead of failing with a
    // NullReferenceException on Trim().
    bool hasQuery = queryString != null && queryString.Trim().Length != 0;
    if (!hasQuery || Proxy.NetworkStatus == RCLocalProxy.NetworkStatusCode.Offline)
    {
        searchXml.SetAttribute("total", "0");
        return new Response(xmlDoc.InnerXml);
    }

    // Google search
    string googleSearchString = ConstructGoogleSearch(queryString, pageNumber);
    _rcRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(googleSearchString));

    // Download result page. This happens outside the try block on purpose: exceptions
    // raised while downloading keep propagating to the caller, as before.
    string resultPage = _rcRequest.DownloadAsString();
    if (resultPage == null)
    {
        return new Response();
    }

    try
    {
        LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(resultPage);
        long numResults = GetGoogleResultsNumber(resultPage);
        searchXml.SetAttribute("total", "" + numResults);

        foreach (RCRequest linkObject in resultLinkUris)
        {
            searchXml.AppendChild(BuildSearchResultXmlElement(
                xmlDoc, linkObject.AnchorText, linkObject.Uri, linkObject.ContentSnippet));
        }

        PrepareXMLRequestAnswer();
        return new Response(xmlDoc.InnerXml);
    }
    catch
    {
        // Best effort: any failure while parsing the result page or building the XML
        // results in an empty response.
        return new Response();
    }
}
/// <summary>
/// Serves one page of Google search results to the client as XML.
/// The request URI query supplies "n" (items per page), "p" (page number, starting at 1)
/// and "s" (search string). Nothing is sent when the proxy is offline, when the result
/// page could not be downloaded to the cache, or when building the answer fails.
/// </summary>
/// <remarks>
/// "n" and "p" are parsed with Int32.Parse before the try block, so a missing or
/// non-numeric value throws to the caller.
/// </remarks>
void ServeRCRemoteResultPage()
{
    if (_proxy.NetworkStatus == (int)RCProxy.NetworkStatusCode.Offline)
    {
        return;
    }

    // Parse parameters
    NameValueCollection qscoll = Util.ParseHtmlQuery(RequestUri);
    int numItemsPerPage = Int32.Parse(qscoll.Get("n"));
    int pageNumber = Int32.Parse(qscoll.Get("p"));
    string queryString = qscoll.Get("s");

    // Google search
    string googleSearchString = ConstructGoogleSearch(queryString);
    _rcRequest = new RCRequest(this, googleSearchString);

    long bytesDownloaded = _rcRequest.DownloadToCache(true);
    try
    {
        FileInfo f = new FileInfo(_rcRequest.CacheFileName);
        if (bytesDownloaded > -1 && f.Exists)
        {
            LinkedList<RCRequest> resultLinkUris = ExtractGoogleResults(_rcRequest);

            // Items are numbered from 1; the requested page covers the item numbers
            // in the range (firstItemExclusive, lastItemInclusive].
            int firstItemExclusive = (pageNumber - 1) * numItemsPerPage;
            int lastItemInclusive = pageNumber * numItemsPerPage;

            // StringBuilder avoids re-copying the whole answer for every appended item.
            System.Text.StringBuilder results = new System.Text.StringBuilder();
            results.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
            results.Append("<search total=\"");
            results.Append(resultLinkUris.Count.ToString());
            results.Append("\">");

            int currentItemNumber = 0;
            foreach (RCRequest linkObject in resultLinkUris)
            {
                currentItemNumber++;
                if (currentItemNumber > lastItemInclusive)
                {
                    // Past the requested page; no later item can match.
                    break;
                }
                if (currentItemNumber <= firstItemExclusive)
                {
                    // Belongs to an earlier page.
                    continue;
                }

                // Escape for embedding into XML.
                string uri = System.Security.SecurityElement.Escape(linkObject.Uri);
                string title = System.Security.SecurityElement.Escape(linkObject.AnchorText);
                // XXX: find content snippet here
                string contentSnippet = "";

                // Omit the scheme in the served URL. The prefix is a protocol token, so
                // it is compared ordinally rather than with culture rules.
                if (uri.StartsWith("http://", StringComparison.Ordinal))
                {
                    uri = uri.Substring(7);
                }

                results.Append("<item>");
                results.Append("<title>").Append(title).Append("</title>");
                results.Append("<url>").Append(uri).Append("</url>");
                results.Append("<snippet>").Append(contentSnippet).Append("</snippet>");
                results.Append("</item>");
            }

            results.Append("</search>");

            SendOkHeaders("text/xml",
                "Cache-Control: no-cache" + "\r\n" +
                "Pragma: no-cache" + "\r\n" +
                "Expires: -1" + "\r\n");
            SendMessage(results.ToString());
        }
        // else: nothing was downloaded, so there is nothing to serve
    }
    catch
    {
        // Best effort: on any failure nothing is sent to the client.
    }
}
/// <summary>
/// Extracts the result links from a google results page.
/// The page is split at every closing cite tag; each split except the last is searched
/// for a result anchor (title and target URI), and the following split for the snippet.
/// </summary>
/// <param name="googleResultPage">The Google results page.</param>
/// <returns>
/// List of requests, one per accepted result, carrying the anchor text and the content
/// snippet. The snippet is empty when the result has none.
/// </returns>
private LinkedList<RCRequest> ExtractGoogleResults(string googleResultPage)
{
    const string uriMarker = "<a href=\"/url?q=";
    const string snippetMarker = "<span class=\"st\">";

    string[] stringSeparator = new string[] { "</cite>" };
    LinkedList<RCRequest> resultLinks = new LinkedList<RCRequest>();
    string[] lines = googleResultPage.Split(stringSeparator, StringSplitOptions.RemoveEmptyEntries);

    // Omitting last split, since there is no link any more.
    for (int i = 0; i < lines.Length - 1; i++)
    {
        // All per-result state lives inside the loop so that nothing leaks from one
        // result into the next (previously a result without a snippet inherited the
        // snippet of the result before it).
        int pos;
        string currLine = lines[i];
        string currTitle = "";
        string currSnippet = "";

        // get the title: text of the last anchor in this split
        if ((pos = currLine.LastIndexOf("<a href=")) >= 0)
        {
            currTitle = currLine.Substring(pos);
            // find the end of the opening tag
            if ((pos = currTitle.IndexOf(">")) >= 0)
            {
                // cut start
                currTitle = currTitle.Substring(pos + 1);
                if ((pos = currTitle.IndexOf("</a>")) >= 0)
                {
                    // cut end
                    currTitle = currTitle.Substring(0, pos);
                    currTitle = HtmlUtils.StripTagsCharArray(currTitle);
                    currTitle = currTitle.Trim();
                }
            }
        }

        // get the uri
        if ((pos = currLine.LastIndexOf(uriMarker)) < 0)
        {
            // no result link in this split
            continue;
        }

        // start right after the marker
        string currUri = currLine.Substring(pos + uriMarker.Length);
        if ((pos = currUri.IndexOf("&")) >= 0)
        {
            // cut end: the target ends at the first further query parameter
            currUri = currUri.Substring(0, pos);
            currUri = HtmlUtils.StripTagsCharArray(currUri);
            currUri = currUri.Trim();
        }

        if (!HttpUtils.IsValidUri(currUri))
        {
            continue;
        }
        // check blacklist
        if (IsBlacklisted(currUri))
        {
            continue;
        }
        if (!currUri.Contains(".") || currTitle.Equals(""))
        {
            continue;
        }
        // If we're in slow mode, we don't want cached results within the live results
        if (_proxy.NetworkStatus == RCProxy.NetworkStatusCode.Slow &&
            _proxy.ProxyCacheManager.IsCached(CacheManager.GetRelativeCacheFileName(currUri, "GET")))
        {
            continue;
        }

        // get the content snippet (in next split)
        string nextLine = lines[i + 1];
        if ((pos = nextLine.LastIndexOf(snippetMarker)) >= 0)
        {
            // cut start
            currSnippet = nextLine.Substring(pos + snippetMarker.Length);
            if ((pos = currSnippet.IndexOf("</span>")) >= 0)
            {
                // cut end
                currSnippet = currSnippet.Substring(0, pos);
                currSnippet = HtmlUtils.StripTagsCharArray(currSnippet, false);
                currSnippet = RegExs.MULTIPLE_SPACES_REGEX.Replace(currSnippet.Trim(), " ");
            }
        }

        // Create request and save anchorText and snippet
        RCRequest currRCRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(currUri));
        currRCRequest.AnchorText = currTitle;
        currRCRequest.ContentSnippet = currSnippet;
        resultLinks.AddLast(currRCRequest);
    }

    return resultLinks;
}
/// <summary>
/// Extracts the result links from a cached google results page.
/// XXX: Probably broken all the time due to Google's constantly changing HTML format.
/// </summary>
/// <param name="rcRequest">Request whose cache file holds the results page.</param>
/// <returns>List of requests, one per accepted result, with the anchor text set.</returns>
public LinkedList<RCRequest> ExtractGoogleResults(RCRequest rcRequest)
{
    LinkedList<RCRequest> links = new LinkedList<RCRequest>();

    // Split the page at every opening cite tag: the text before a cite holds the
    // result's title anchor, the text after it starts with the displayed address.
    string pageContent = Util.ReadFileAsString(rcRequest.CacheFileName);
    string[] chunks = pageContent.Split(new string[] { "<cite>" }, StringSplitOptions.RemoveEmptyEntries);

    for (int index = 1; index < chunks.Length; index++)
    {
        string titleChunk = chunks[index - 1];
        string uriChunk = chunks[index];

        // Title: text of the last anchor before the cite tag.
        string title = "";
        int anchorAt = titleChunk.LastIndexOf("<a href=");
        if (anchorAt >= 0)
        {
            title = titleChunk.Substring(anchorAt);
            int tagEnd = title.IndexOf(">");
            if (tagEnd >= 0)
            {
                title = title.Substring(tagEnd + 1);
                int anchorClose = title.IndexOf("</a>");
                if (anchorClose >= 0)
                {
                    title = Util.StripTagsCharArray(title.Substring(0, anchorClose)).Trim();
                }
            }
        }

        // Address: everything up to the closing cite tag.
        int citeEnd = uriChunk.IndexOf("</cite>");
        if (citeEnd <= 0)
        {
            continue;
        }

        string uri = uriChunk.Substring(0, citeEnd);
        int dashAt = uri.IndexOf(" - ");
        if (dashAt > 0)
        {
            // Drop whatever follows the " - " separator.
            uri = uri.Substring(0, dashAt);
        }

        // Instead of translating to absolute, prepend http:// to make the
        // web request constructor happy.
        uri = "http://" + Util.StripTagsCharArray(uri).Trim();

        // Skip invalid or blacklisted addresses, addresses without a dot and
        // results without a title.
        if (!Util.IsValidUri(uri) || IsBlacklisted(uri) || !uri.Contains(".") || title.Equals(""))
        {
            continue;
        }

        RCRequest link = new RCRequest(this, uri);
        link.AnchorText = title;
        links.AddLast(link);
    }

    return links;
}
/// <summary>
/// Downloads the embedded objects of a page, filtered by the richness setting.
/// </summary>
/// <param name="rcRequest">Request page to start from.</param>
/// <param name="richness">
/// Richness setting: "normal" keeps every object, "low" keeps only objects that are not
/// image pages and are possibly text pages; any other value keeps nothing.
/// </param>
/// <returns>List of RCRequests of embedded objects downloaded</returns>
private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, string richness)
{
    // Not enough quota left: download nothing.
    if (_quota < DEFAULT_LOW_WATERMARK)
    {
        return new LinkedList<RCRequest>();
    }

    // XXX: refactor into filter class/method.
    LinkedList<RCRequest> accepted = new LinkedList<RCRequest>();
    int nextChildNumber = 0;
    foreach (RCRequest candidate in ExtractEmbeddedObjects(rcRequest))
    {
        // ignore blacklisted domains (they do not receive a child number either)
        if (IsBlacklisted(candidate.Uri))
        {
            continue;
        }

        // XXX: the "low" logic is ugly and not perfect, since the implementation
        // XXX: of PossiblyATextPage is incomplete
        bool wanted = richness.Equals("normal")
            || (richness.Equals("low")
                && !IsImagePage(candidate.Uri)
                && PossiblyATextPage(candidate.Uri));
        if (wanted)
        {
            accepted.AddLast(candidate);
        }

        // Every non-blacklisted object is numbered, whether it was accepted or not.
        candidate.ChildNumber = nextChildNumber;
        nextChildNumber++;
    }

    return DownloadObjectsInParallel(rcRequest, accepted);
}