/// <summary>
/// Reads the richness CSV file line by line, adds one <see cref="Richness"/> entity
/// per usable line to the context and saves all of them in a single call at the end.
/// </summary>
/// <param name="dbContext">Database context that receives the new entities.</param>
/// <returns>A task that completes once the changes have been saved.</returns>
private static async Task SeedRichnessAsync(EveEchoesPlanetaryProductionApiDbContext dbContext)
{
    await foreach (var line in CsvFileService.ReadCsvDataLineByLineAsync(GlobalConstants.FilePaths.RichnessCsvFilePath))
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            continue;
        }

        var lineArgs = line.Split(GlobalConstants.CsvDelimiter, StringSplitOptions.RemoveEmptyEntries);

        // A non-blank line that consists only of delimiters splits into zero entries
        // (empty entries are removed), so indexing it would throw. Skip such lines.
        if (lineArgs.Length == 0)
        {
            continue;
        }

        // Only the first column is used: it holds the richness name.
        var richness = new Richness
        {
            Name = lineArgs[0],
        };

        await dbContext.AddAsync(richness);
    }

    await dbContext.SaveChangesAsync();
}
/// <summary>
/// Extracts the objects embedded in a page, keeps those allowed by the richness
/// setting and the blacklist, and downloads the remaining ones in parallel.
/// </summary>
/// <param name="rcRequest">Request page to start from.</param>
/// <param name="baseUri">The Uri of the website where to download embedded objects.</param>
/// <param name="htmlContent">The HTML content of the website.</param>
/// <param name="richness">Richness setting.</param>
/// <returns>List of RCRequests of embedded objects downloaded.</returns>
private LinkedList<RCRequest> DownloadEmbeddedObjects(RCRequest rcRequest, Uri baseUri, string htmlContent, Richness richness)
{
    // Bail out with an empty result when the handler was told to stop
    // or the remaining quota is below the low watermark.
    if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
    {
        return new LinkedList<RCRequest>();
    }

    LinkedList<Uri> candidates = HtmlUtils.ExtractEmbeddedObjects(baseUri, htmlContent);
    LinkedList<Uri> accepted = new LinkedList<Uri>();

    // XXX: refactor into filter class/method.
    foreach (Uri candidate in candidates)
    {
        string address = candidate.ToString();

        // Blacklisted domains are never downloaded.
        if (IsBlacklisted(address))
        {
            continue;
        }

        // Normal richness takes everything; low richness takes text pages only.
        bool keep = richness == Richness.Normal;
        if (!keep && richness == Richness.Low)
        {
            keep = IsATextPage(address);
        }

        if (keep)
        {
            accepted.AddLast(candidate);
        }
    }

    return DownloadObjectsInParallel(rcRequest, accepted);
}
/// <summary>
/// Recursively downloads a page and its embedded objects, and its outlinks.
/// </summary>
/// <param name="rcRequest">Requested page to start from.</param>
/// <param name="richness">Richness setting.</param>
/// <param name="depth">Current recursion depth; 0 is the top level, where failures are reported with an error page.</param>
/// <returns>
/// False when the request was aborted, the quota is below the low watermark,
/// or the download of <paramref name="rcRequest"/> failed; true otherwise.
/// </returns>
public bool RecursivelyDownloadPage(RCRequest rcRequest, Richness richness, int depth)
{
    if (_killYourself || _quota < DEFAULT_LOW_WATERMARK)
    {
        // Send error page if we're on top level
        if (depth == 0)
        {
            SendErrorPage(HttpStatusCode.InternalServerError, "Request aborted or it does not fit in quota.");
        }
        return false;
    }

    // Reduce the timer: only the time left until the overall deadline may be used.
    // NOTE(review): this sets the timeout on RCRequest, not on the rcRequest
    // parameter that is downloaded below - confirm that this is intended.
    DateTime currTime = DateTime.Now;
    DateTime endTime = StartTime.AddMilliseconds(RequestHandler.WEB_REQUEST_DEFAULT_TIMEOUT);
    if (endTime.CompareTo(currTime) > 0)
    {
        RCRequest.GenericWebRequest.Timeout = (int)(endTime.Subtract(currTime)).TotalMilliseconds;
    }
    else
    {
        RCRequest.GenericWebRequest.Timeout = 0;
    }

    // Only download for POST/... or not already existing items
    if (!IsGetOrHeadHeader() || !_proxy.ProxyCacheManager.IsCached(rcRequest.RelCacheFileName))
    {
        try
        {
            // There is no index on the remote side anyway
            rcRequest.DownloadToCache(false);
        }
        catch (Exception e)
        {
            Logger.Warn("[depth = " + depth + "] error downloading: " + rcRequest.Uri + " " + e.Message);

            // Send error page if we're on top level
            if (depth == 0)
            {
                // Forward the status code of the remote response when there is one,
                // otherwise report an internal server error.
                HttpStatusCode status = HttpStatusCode.InternalServerError;
                WebException webException = e as WebException;
                if (webException != null)
                {
                    HttpWebResponse response = webException.Response as HttpWebResponse;
                    if (response != null)
                    {
                        status = response.StatusCode;
                    }
                }
                SendErrorPage(status, e.Message);
            }
            return false;
        }
    }
    else
    {
        Logger.Debug("Already existed: " + rcRequest.Uri);
    }

    // add to the package
    if (_package.Pack(this, rcRequest, ref _quota))
    {
        Logger.Debug("[depth = " + depth + "] packed: " + rcRequest.Uri + " " + rcRequest.FileSize + " bytes, " + _quota + " left");
    }

    // add a new request for the old location if it was redirected. This will then
    // get the 301 file from the cache, so the local proxy does not need to send
    // another request to the remote proxy to find that out.
    if (rcRequest.UriBeforeRedirect != null)
    {
        Logger.Debug("Redirected: Also packing old URI with a 301 file.");
        RCRequest rc301 = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(rcRequest.UriBeforeRedirect));
        _package.Pack(this, rc301, ref _quota);
    }

    // Getting embedded objects and recursing only makes sense for html pages.
    if (!_proxy.ProxyCacheManager.IsHTMLFile(rcRequest.RelCacheFileName))
    {
        return true;
    }

    Uri baseUri = new Uri(rcRequest.Uri);
    // Lower-case with the invariant culture so the result does not depend on the
    // machine's locale (culture-sensitive casing maps 'I' differently under
    // e.g. Turkish, which would change the markup handed to the extractors).
    string htmlContent = Utils.ReadFileAsString(rcRequest.CacheFileName).ToLowerInvariant();

    // get the embedded content of the search result page
    DownloadEmbeddedObjects(rcRequest, baseUri, htmlContent, richness);

    // Don't recurse if we're on (or beyond) the deepest layer allowed. Using >=
    // instead of == keeps the recursion bounded even when the configured depth
    // is zero or negative, or when the caller starts past the limit.
    if (depth >= Properties.Settings.Default.DEFAULT_DEPTH - 1)
    {
        return true;
    }

    // recurse into every outgoing link of the page
    LinkedList<Uri> resultLinkUris = HtmlUtils.ExtractLinks(baseUri, htmlContent);
    foreach (Uri uri in resultLinkUris)
    {
        RCRequest currRequest = new RCRequest(_proxy, (HttpWebRequest)WebRequest.Create(uri));
        RecursivelyDownloadPage(currRequest, richness, depth + 1);
    }
    return true;
}