コード例 #1
0
        /// <summary>
        /// Attaches a loaded page to its spider target and appends the target's content blocks to <c>blocks</c>.
        /// </summary>
        /// <param name="pg">Task result item; its <c>target</c> selects the spider target and its <c>sPage</c> is the page that gets attached.</param>
        /// <param name="response">Log builder forwarded to the target's own <c>AttachPage</c> call.</param>
        /// <param name="targetBlockCount">Target block count forwarded to the target's own <c>AttachPage</c> call.</param>
        /// <returns>The spider target returned by <c>GetOrCreateTarget</c>, after the page was attached to it.</returns>
        public spiderTarget AttachPage(spiderTaskResultItem pg, ILogBuilder response, int targetBlockCount = 3)
        {
            spiderTarget target = GetOrCreateTarget(pg.target, true, false);

            // The target does the actual page handling (content decomposition, per the original documentation).
            target.AttachPage(pg.sPage, response, targetBlockCount);

            // Every block the target exposes after the attach is appended; an empty sequence simply adds nothing.
            // NOTE(review): no de-duplication happens here - confirm whether blocks of a target that
            // already had a page attached can be appended more than once.
            foreach (var bl in target.contentBlocks)
            {
                blocks.Add(bl);
            }

            return target;
        }
コード例 #2
0
        //  public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, crawlerAgentContext crawlerContext, Boolean __doTokenization, modelSpiderSiteRecord wRecord)

        /// <summary>
        /// Loads the page behind a spider link and wraps the outcome into a task result item.
        /// </summary>
        /// <param name="ln">Link to load; its <c>url</c> is passed to <c>doWebRequest</c>.</param>
        /// <param name="__doTokenization">Not read by this method.</param>
        /// <param name="pRecord">Page record passed to the web request; it supplies the iteration value and, unless the page failed, receives the loaded page.</param>
        /// <returns>The result item created for <paramref name="ln"/>, finished with the loaded page.</returns>
        public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
        {
            var resultItem = new spiderTaskResultItem(ln);
            crawledPage loadedPage = doWebRequest(ln.url, pRecord);

            resultItem.finish(loadedPage, pRecord.iteration);

            // A failed page is returned as-is; only a non-failed page is handed over to the page record.
            if (loadedPage.status != pageStatus.failed)
            {
                pRecord.acceptPage(loadedPage);
                pRecord.init(resultItem.sPage);
            }

            return resultItem;
        }
コード例 #3
0
        /// <summary>
        /// Runs the spider task: every link of the task is loaded and, unless loading failed, attached to the crawl targets.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while processing links are logged and swallowed; the result returned then holds
        /// only the items that were added before the failure.
        /// </remarks>
        /// <param name="sTask">Task to run; it creates the result object and is enumerated as the sequence of links to load.</param>
        /// <param name="wRecord">Site record that supplies the child page record for each link and the target collection the pages are attached to.</param>
        /// <returns>The finished task result with the result items added for the processed links.</returns>
        public spiderTaskResult runSpiderTask(spiderTask sTask, modelSpiderSiteRecord wRecord)
        {
            spiderTaskResult sResult = sTask.createResult();

            // Per-link work, shared by the parallel and the sequential path so both stay in sync.
            Action<spiderLink> processLink = ln =>
            {
                modelSpiderPageRecord pRecord = wRecord.getChildRecord(ln, ln.url);

                spiderTaskResultItem rItem = runSpiderTaskItem(ln, sTask.doTokenization, pRecord);

                if (rItem.status != pageStatus.failed)
                {
                    wRecord.context.targets.AttachPage(rItem, pRecord.logBuilder, blockCount);
                }

                sResult.AddResult(rItem);
            };

            try
            {
                if (imbWEMManager.settings.crawlerJobEngine.crawlerDoParallelTaskLoads)
                {
                    // NOTE(review): processLink runs concurrently here and writes to sResult and to the
                    // shared target collection - confirm that AddResult and AttachPage are thread-safe.
                    Parallel.ForEach(sTask, processLink);
                }
                else
                {
                    foreach (spiderLink ln in sTask)
                    {
                        processLink(ln);
                    }
                }
            }
            catch (AggregateException aex)
            {
                // Parallel.ForEach wraps worker failures; log each underlying cause instead of the
                // generic aggregate message.
                foreach (Exception inner in aex.Flatten().InnerExceptions)
                {
                    imbWEMManager.log.log("runSpiderTask exception: " + inner.Message);
                }
            }
            catch (Exception ex)
            {
                imbWEMManager.log.log("runSpiderTask exception: " + ex.Message);
            }

            loadIndex += sResult.Count();

            // After enough loads (threshold comes from settings) force a collection and log how much memory it released.
            if (loadIndex > imbWEMManager.settings.crawlerJobEngine.loadCountForGC)
            {
                long mem = GC.GetTotalMemory(false);
                GC.Collect();
                // NOTE(review): GC.WaitForFullGCComplete reports on a notification registered through
                // GC.RegisterForFullGCNotification; no registration is visible in this method, so this call
                // may not block at all - confirm whether GC.WaitForPendingFinalizers was intended.
                GC.WaitForFullGCComplete();
                long dmem = GC.GetTotalMemory(false);

                aceLog.log("Memory allocation reduction [after " + loadIndex + " tasks]: " + (mem - dmem).getMByteCountFormated());
                loadIndex = 0;
            }

            sResult.finish();

            return sResult;
        }