/// <summary>
/// Attaches a loaded page to its spider target and registers the target's content blocks
/// in this collection's <c>blocks</c> list.
/// </summary>
/// <param name="pg">Task result item supplying the target link (<c>pg.target</c>) and the loaded page (<c>pg.sPage</c>).</param>
/// <param name="response">Log builder forwarded to the target's own <c>AttachPage</c> call.</param>
/// <param name="targetBlockCount">Target block count forwarded to the target's own <c>AttachPage</c> call.</param>
/// <returns>The target obtained from <c>GetOrCreateTarget</c>, after the page was attached to it.</returns>
public spiderTarget AttachPage(spiderTaskResultItem pg, ILogBuilder response, int targetBlockCount = 3)
{
    // The hash is computed but never read in this method; the call is kept as it was.
    string urlHash = GetHash(pg.target.url);

    spiderTarget attachedTarget = GetOrCreateTarget(pg.target, true, false);

    // Presumably the content decomposition happens inside this call - confirm in spiderTarget.
    attachedTarget.AttachPage(pg.sPage, response, targetBlockCount);

    // Nothing to register when the target exposes no content blocks.
    if (!attachedTarget.contentBlocks.Any())
    {
        return attachedTarget;
    }

    foreach (var contentBlock in attachedTarget.contentBlocks)
    {
        blocks.Add(contentBlock);
    }

    return attachedTarget;
}
/// <summary>
/// Runs a single spider task item: requests the page behind the link, stores the outcome in a
/// new result item and, unless the load failed, hands the page over to the page record.
/// </summary>
/// <param name="ln">The link to load; its <c>url</c> is passed to <c>doWebRequest</c>.</param>
/// <param name="__doTokenization">Tokenization flag; not read inside this method.</param>
/// <param name="pRecord">Page record passed to <c>doWebRequest</c>; it supplies the iteration and receives the loaded page.</param>
/// <returns>The result item created for <paramref name="ln"/>, finished with the loaded page.</returns>
public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
{
    spiderTaskResultItem resultItem = new spiderTaskResultItem(ln);

    // Debug marker kept from the original (translated): failures were observed at this call.
    crawledPage loadedPage = doWebRequest(ln.url, pRecord);
    resultItem.finish(loadedPage, pRecord.iteration);

    // Only a page that did not fail is accepted by the record and used to initialize it.
    if (loadedPage.status != pageStatus.failed)
    {
        pRecord.acceptPage(loadedPage);
        pRecord.init(resultItem.sPage);
    }

    return resultItem;
}
/// <summary>
/// Runs the spider task: loads every link of the task, attaches each page that did not fail to the
/// targets of the site record's context, and collects the per-link results.
/// </summary>
/// <param name="sTask">The task whose links are loaded; also supplies the tokenization flag and creates the result object.</param>
/// <param name="wRecord">Site record that provides the per-link child records and the target collection (<c>wRecord.context.targets</c>).</param>
/// <returns>The finished task result containing one item per processed link.</returns>
/// <remarks>
/// Exceptions raised while processing links are logged and swallowed; links that were not yet
/// processed when the exception surfaced are absent from the result.
/// </remarks>
public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
{
    spiderTaskResult sResult = sTask.createResult();

    // Per-link work shared by the parallel and the sequential path.
    Action<spiderLink> processLink = link =>
    {
        modelSpiderPageRecord pRecord = wRecord.getChildRecord(link, link.url);
        spiderTaskResultItem rItem = runSpiderTaskItem(link, sTask.doTokenization, pRecord);

        if (rItem.status != pageStatus.failed)
        {
            wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
        }

        sResult.AddResult(rItem);
    };

    try
    {
        if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
        {
            // NOTE(review): AddResult and AttachPage are invoked concurrently on this path;
            // their thread-safety is not visible from here - confirm.
            Parallel.ForEach(sTask, ln => processLink(ln));
        }
        else
        {
            foreach (spiderLink ln in sTask)
            {
                processLink(ln);
            }
        }
    }
    catch (AggregateException aggregate)
    {
        // Parallel.ForEach wraps worker failures; the wrapper's own message is generic,
        // so each inner exception is logged to keep the real cause visible.
        foreach (Exception inner in aggregate.Flatten().InnerExceptions)
        {
            imbWEMManager.log.log("runSpiderTask exception: " + inner.Message);
        }
    }
    catch (Exception ex)
    {
        imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
    }

    loadIndex = loadIndex + sResult.Count();

    // Once enough loads have accumulated, force a collection and log how much memory was released.
    if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
    {
        long mem = GC.GetTotalMemory(false);
        GC.Collect();
        // NOTE(review): WaitForFullGCComplete reports the status of a registered full-GC notification;
        // no registration is visible here - confirm whether GC.WaitForPendingFinalizers was intended.
        GC.WaitForFullGCComplete();
        long dmem = GC.GetTotalMemory(false);
        aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
        loadIndex = 0;
    }

    sResult.finish();
    return sResult;
}