//request.doContentCheck = false;
//request.doCoolOff = false;
//request.doRetryExecution = false;
//request.doSubdomainVariations = false;
//request.doTimeoutLimiter = true;
//request.doLogCacheLoaded = imbWEMManager.settings.executionLog.doPageLoadedFromCache;
//request.doLogNewLoad = imbWEMManager.settings.executionLog.doPageLoadedLog;
//request.doLogRequestError = imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog;
//request.htmlSettings.doTransliterateToLat = false;
//request.htmlSettings.doRemoveHtmlEntities = true;
//request.htmlSettings.doUpperCase = true;
//request.htmlSettings.doAutocloseOnEnd = true;

/// <summary>
/// Performs the web request for the given URL and wraps the outcome in a <see cref="crawledPage"/>.
/// </summary>
/// <remarks>
/// The URL is first passed through <c>controler.GetDuplicateUrl</c>. URLs that <c>controler.CheckFail</c>
/// reports are not executed; the request is marked as executed with status <c>ExpectationFailed</c> instead.
/// Any executed request that does not end with <c>HttpStatusCode.OK</c> is registered via <c>controler.SetFailUrl</c>.
/// </remarks>
/// <param name="url">The URL to load.</param>
/// <param name="pRecord">The page record handed on to <c>makeCrawledPage</c>.</param>
/// <returns>The page produced by <c>makeCrawledPage</c> for the request.</returns>
internal crawledPage doWebRequest(string url, modelSpiderPageRecord pRecord)
{
    url = controler.GetDuplicateUrl(url);

    // An empty URL is only reported (log + beep); processing continues regardless.
    if (url.isNullOrEmpty())
    {
        imbWEMManager.log.log("EMPTY URL PASSED TO THE WEB LOADER");
        imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(2200);
    }

    loaderRequest request = new loaderRequest(url);
    bool alreadyFailed = controler.CheckFail(request.url);

    if (!alreadyFailed)
    {
        request = loaderSubsystem.ExecuteRequest(request);

        bool loadedOk = request.statusCode == System.Net.HttpStatusCode.OK;
        if (!loadedOk)
        {
            controler.SetFailUrl(request.url);
        }
    }
    else
    {
        // Skip the actual load and flag the request as executed-but-failed.
        request.executed = true;
        request.statusCode = System.Net.HttpStatusCode.ExpectationFailed;
    }

    if (dataLoad != null)
    {
        dataLoad.AddBytes(request.byteSize);
    }

    return makeCrawledPage(request, pRecord);
}
// public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, crawlerAgentContext crawlerContext, Boolean __doTokenization, modelSpiderSiteRecord wRecord)

/// <summary>
/// Loads the page behind a single spider link and records the outcome in a result item.
/// </summary>
/// <param name="ln">The link to load; its <c>url</c> is passed to <c>doWebRequest</c>.</param>
/// <param name="__doTokenization">Not read by this method.</param>
/// <param name="pRecord">The page record; it accepts the page and is initialised only when the load did not fail.</param>
/// <returns>The result item for <paramref name="ln"/>, finished with the loaded page.</returns>
public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
{
    spiderTaskResultItem resultItem = new spiderTaskResultItem(ln);

    crawledPage loadedPage = doWebRequest(ln.url, pRecord);
    resultItem.finish(loadedPage, pRecord.iteration);

    // Failed pages are reported back as-is, without touching the page record.
    bool loadSucceeded = loadedPage.status != pageStatus.failed;
    if (loadSucceeded)
    {
        pRecord.acceptPage(loadedPage);
        pRecord.init(resultItem.sPage);
    }

    return resultItem;
}
/// <summary>
/// Builds a <see cref="crawledPage"/> from a web result: collects the anchor links, reads the
/// <c>keywords</c> and <c>description</c> meta tags and takes the caption from the first <c>title</c> element.
/// </summary>
/// <param name="result">The web result; its <c>responseUrl</c> and <c>HtmlDocument</c> are read.</param>
/// <param name="pRecord">The page record; <c>pRecord.wRecord.domainInfo.domainName</c> becomes the page domain.</param>
/// <returns>
/// The populated page with status <c>pageStatus.loaded</c>, or a page with status <c>pageStatus.failed</c>
/// when <paramref name="result"/> carries no HTML document to parse.
/// </returns>
/// <exception cref="aceGeneralException">Error in link processing</exception>
internal crawledPage makeCrawledPage(IWebResult result, modelSpiderPageRecord pRecord)
{
    crawledPage page = new crawledPage(result.responseUrl, 0);
    page.result = result;
    page.domain = pRecord.wRecord.domainInfo.domainName;

    // A result without a document cannot be parsed; report it as failed
    // instead of throwing a NullReferenceException on the first DOM access.
    var document = result.HtmlDocument;
    if (document == null || document.DocumentNode == null)
    {
        page.status = pageStatus.failed;
        return page;
    }

    var root = document.DocumentNode;

    foreach (HtmlNode hn in root.Descendants("a"))
    {
        try
        {
            var ndv = hn.CreateNavigator();
            link l = new link(ndv);

            // Links flagged as the default home page are not collected.
            if (!l.isDefaultHomePage)
            {
                page.links.Add(l);
            }
        }
        catch (Exception ex)
        {
            throw new aceGeneralException(ex.Message, ex, page, "Error in link processing");
        }
    }

    foreach (HtmlNode hn in root.Descendants("meta"))
    {
        String name = hn.GetAttributeValue("name", "none");
        String content = hn.GetAttributeValue("content", "");

        switch (name)
        {
            case "keywords":
                page.pageKeywords = content.SplitSmart(",", "", true, true);
                break;

            case "description":
                page.pageDescription = content;
                break;
        }
    }

    var title = root.Descendants("title").FirstOrDefault();
    if (title != null)
    {
        page.pageCaption = title.InnerText;
    }

    page.links.deployCollection(page);
    page.isCrawled = true;
    page.status = pageStatus.loaded;

    return page;
}
/// <summary>
/// Runs the spider task: loads every link of the task, attaches each non-failed page to the
/// site record's targets and collects the per-link results.
/// </summary>
/// <remarks>
/// Links are processed with <c>Parallel.ForEach</c> when
/// <c>imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads</c> is set, otherwise sequentially.
/// Exceptions raised while processing are logged and do not propagate to the caller.
/// Once the running load counter exceeds <c>loadCountForGC</c>, a garbage collection is forced and the counter is reset.
/// </remarks>
/// <param name="sTask">The task whose links are loaded.</param>
/// <param name="wRecord">The site record that supplies the child page records and the target collection.</param>
/// <returns>The finished task result.</returns>
public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
{
    spiderTaskResult sResult = sTask.createResult();

    // Shared per-link work for both the parallel and the sequential path.
    Action<spiderLink> processLink = link =>
    {
        modelSpiderPageRecord pRecord = wRecord.getChildRecord(link, link.url);

        spiderTaskResultItem rItem = runSpiderTaskItem(link, sTask.doTokenization, pRecord);

        if (rItem.status != pageStatus.failed)
        {
            wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
        }

        sResult.AddResult(rItem);
    };

    try
    {
        if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
        {
            Parallel.ForEach(sTask, ln => processLink(ln));
        }
        else
        {
            foreach (spiderLink ln in sTask)
            {
                processLink(ln);
            }
        }
    }
    catch (Exception ex)
    {
        // Parallel.ForEach wraps worker failures in an AggregateException whose own
        // message is generic; log every inner exception so the real cause is visible.
        AggregateException aggregate = ex as AggregateException;
        if (aggregate != null)
        {
            foreach (Exception inner in aggregate.Flatten().InnerExceptions)
            {
                imbWEMManager.log.log("runSpiderTask exception: " + inner.Message);
            }
        }
        else
        {
            imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
        }
    }

    loadIndex = loadIndex + sResult.Count();

    if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
    {
        long mem = GC.GetTotalMemory(false);

        GC.Collect();
        // NOTE(review): WaitForFullGCComplete reports on a registered full-GC notification;
        // no registration is visible here - confirm whether GC.WaitForPendingFinalizers was intended.
        GC.WaitForFullGCComplete();

        long dmem = GC.GetTotalMemory(false);

        aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
        loadIndex = 0;
    }

    sResult.finish();

    return sResult;
}