Example #1
 /// <summary>
 /// Finishes the result item
 /// </summary>
 /// <param name="__page">The page.</param>
 public void finish(crawledPage __page, int __iteration)
 {
     page               = __page;
     status             = page.status;
     duration           = DateTime.Now.Subtract(startTime);
     sPage              = new spiderPage(page, target.iterationDiscovery, __iteration); // wRecord.iteration);
     sPage.spiderResult = this;
 }
Example #2
        public void acceptPage(crawledPage __page)
        {
            if (__page != null)
            {
                page = __page;
            }

            log("Page [" + page.caption + "] loaded [" + page.url + "]");
        }
Example #3
 public spiderPage(crawledPage __webpage, int __iteration, int __iterationLoad)
 {
     webpage            = __webpage;
     url                = webpage.url.toStringSafe("");
     iterationDiscovery = __iteration;
     name               = __webpage.name;
     captions.Add(__webpage.pageCaption);
     description = __webpage.description;
 }
Example #4
        /// <summary>
        /// Basic HTML Metrics
        /// </summary>
        /// <param name="page"></param>
        /// <param name="output"></param>
        /// <returns></returns>
        public static metricsReport getHtmlMetrics(crawledPage page, metricsSettings settings,
                                                   metricsReport output = null)
        {
            if (output == null)
            {
                output = new metricsReport(page.result.HtmlDocument as IXPathNavigable);
            }


            getMetaReport(page, output);

            int linkInner = page.links.byScope[linkScope.inner].Count;



            output.report("FV01_linkOuter", page.links.byScope[linkScope.outer].Count);
            output.report("FV02_linkInner", linkInner);

            output.report("FV31_cListStructures", htmlDefinitions.HTMLTags_listStructureTags, true);

            output.report("FV31_cListStructures", htmlDefinitions.HTMLTags_listStructureTags, true);
            output.report("FV32_cTableStructures", htmlDefinitions.HTMLTags_tableStructureTags, true);

            output.report("FV41_cHeadingTags", htmlDefinitions.HTMLTags_headingTags, true);
            output.report("FV42_cStructureTags", htmlDefinitions.HTMLTags_allStructureTags, true);


            reportEntryBase _entry = output.report("FV43_cMultiMediaTags", htmlDefinitions.HTMLTags_multimediaTags, true);

            if (settings.flags.HasFlag(metricsFlag.downloadPluginLinkAsMultimediaTag))
            {
                var c = (int)_entry.Value;
                foreach (link l in page.links.byScope[linkScope.outer])
                {
                    if (l.domain == "www.adobe.com")
                    {
                        c = c + 1;
                    }
                }
                _entry.Value = c;
            }

            //Int32 cMultiMediaTags = output["FV43_cMultiMediaTags"].Value.imbToNumber(typeof (Int32));


            output.report("FV44_cImageTags", htmlDefinitions.HTMLTags_multimediaTags, true);


            return(output);
        }
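
The FV3x/FV4x entries above count whole tag groups (list, table, heading and multimedia structures). Below is a self-contained sketch of that kind of tag-group count with HtmlAgilityPack, assuming the report(key, tagSet, true) overload simply counts matching descendants; the htmlDefinitions tag sets are not shown in these examples, so the array is a stand-in.

    using System;
    using System.Linq;
    using HtmlAgilityPack;

    class TagGroupCountSketch
    {
        static void Main()
        {
            // Stand-in for one of the htmlDefinitions tag groups (here: list structures);
            // the real sets are defined elsewhere in the library.
            string[] listStructureTags = { "ul", "ol", "li", "dl", "dt", "dd" };

            var doc = new HtmlDocument();
            doc.LoadHtml("<html><body><ul><li>a</li><li>b</li></ul><p>text</p></body></html>");

            // Count every descendant whose tag name belongs to the group.
            int count = doc.DocumentNode.Descendants()
                           .Count(n => listStructureTags.Contains(n.Name, StringComparer.OrdinalIgnoreCase));

            Console.WriteLine(count); // 3 (one ul + two li)
        }
    }
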
Example #5
        public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
        {
            List <indexPage> pages = domain.getPageSet();
            // wRecord.web.setSeedUrl(domain.url);
            //spiderPage sp = new spiderPage()

            crawledPage cpage = new crawledPage(domain.url, 0);

            spiderPage spage = new spiderPage(cpage, 0, 0);

            foreach (indexPage p in pages)
            {
                link l = new link(p.url);
                wRecord.context.processLink(l, spage, false);
            }
        }
Example #6
        /// <summary>
        /// Executes the imbBasic template detection method
        /// </summary>
        /// <param name="source">The list of loaded pages</param>
        /// <param name="settings">The settings</param>
        /// <returns>The assembled template definition</returns>
        public static templateDefinition detectTemplate_imbBasic(crawledPage[] source, imbWebTemplateSettings settings)
        {
            templateDefinition output = new templateDefinition();

            List <string> xPathList;

            // COMMON TREE DETECTION
            switch (settings.commonTreeDetection)
            {
            default:
            case commonTreeMethod.imbEndNodePathFrequency:
                output.xPathStruktura = templateOperations.commonTree_imbENPF(source, settings);
                break;
            }

            crawledPage c = source.First();

            //imbNamespaceSetup nsSetup = new imbNamespaceSetup(c.xmlDocument);


            // COMMON CONTENT CHECK
            output.xPathStruktura = templateOperations.commonContentCheck(source, output.xPathStruktura, settings);
            templateExtensions.prepareContent(output, settings);

            // page track
            if (settings.doSavePageUrls)
            {
                foreach (crawledPage p in source)
                {
                    output.relatedPages.Add(p.url);
                }
            }


            output.score = source.Length;

            if (output.xPathStruktura.Count == 0)
            {
                logSystem.log("Template detection failed!", logType.Notification);
            }

            return(output);
        }
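
The common-tree step itself lives in templateOperations.commonTree_imbENPF and is not shown in these examples. The sketch below is only one plausible reading of the imbEndNodePathFrequency name: leaf-node XPaths that occur on every loaded page are treated as candidates for the shared template. The helper name and logic are hypothetical, not the library's implementation.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using HtmlAgilityPack;

    // Hypothetical illustration only: keep the XPaths of leaf elements that appear on every page,
    // on the assumption that such repeated end-node paths belong to the shared template.
    static class EndNodePathFrequencySketch
    {
        public static List<string> CommonLeafPaths(IEnumerable<string> htmlPages)
        {
            var counts = new Dictionary<string, int>();
            int pageCount = 0;

            foreach (string html in htmlPages)
            {
                pageCount++;
                var doc = new HtmlDocument();
                doc.LoadHtml(html);

                // Distinct XPaths of leaf element nodes on this page.
                var leafPaths = doc.DocumentNode.Descendants()
                                   .Where(n => n.NodeType == HtmlNodeType.Element &&
                                               !n.ChildNodes.Any(c => c.NodeType == HtmlNodeType.Element))
                                   .Select(n => n.XPath)
                                   .Distinct();

                foreach (string p in leafPaths)
                {
                    counts[p] = counts.TryGetValue(p, out int c) ? c + 1 : 1;
                }
            }

            // Paths present on every page are candidates for the template structure.
            return counts.Where(kv => kv.Value == pageCount).Select(kv => kv.Key).ToList();
        }
    }
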
Example #7
        public spiderLink setSeedUrl(string rootUrl)
        {
            link lnk       = new link(rootUrl, linkProcessFlags.standard);
            Uri  __rootUrl = new Uri(rootUrl);

            crawledPage cpage = new crawledPage(ORIGIN_OF_ROOTURL, 0);

            spiderPage spage = new spiderPage(cpage, 0, 0);
            // webPages.Add(spage);
            spiderLink splink = new spiderLink(spage, lnk, 1);

            //splink.li = lnk;//allLinks.AddSpiderLink(lnk);
            seedLink           = splink;
            name               = rootUrl;
            splink.domain      = __rootUrl.Host;
            domain             = __rootUrl.Host;
            splink.link.domain = domain;
            //webLinks.Add(splink);
            //webTargets.Add(splink);
            return(splink);
        }
Example #8
        //request.doContentCheck = false;
        //request.doCoolOff = false;
        //request.doRetryExecution = false;
        //request.doSubdomainVariations = false;
        //request.doTimeoutLimiter = true;

        //request.doLogCacheLoaded = imbWEMManager.settings.executionLog.doPageLoadedFromCache;
        //request.doLogNewLoad = imbWEMManager.settings.executionLog.doPageLoadedLog;
        //request.doLogRequestError = imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog;

        //request.htmlSettings.doTransliterateToLat = false;
        //request.htmlSettings.doRemoveHtmlEntities = true;
        //request.htmlSettings.doUpperCase = true;
        //request.htmlSettings.doAutocloseOnEnd = true;

        /// <summary>
        /// Does the web request
        /// </summary>
        /// <param name="url">The URL.</param>
        /// <param name="crawlerContext">The crawler context.</param>
        /// <returns></returns>
        internal crawledPage doWebRequest(string url, modelSpiderPageRecord pRecord)
        {
            url = controler.GetDuplicateUrl(url);

            if (url.isNullOrEmpty())
            {
                imbWEMManager.log.log("EMPTY URL PASSED TO THE WEB LOADER");
                imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(2200);
            }

            loaderRequest wemRequest = new loaderRequest(url);

            if (controler.CheckFail(wemRequest.url))
            {
                wemRequest.executed   = true;
                wemRequest.statusCode = System.Net.HttpStatusCode.ExpectationFailed;
            }
            else
            {
                wemRequest = loaderSubsystem.ExecuteRequest(wemRequest);   // <-----------------------------
                if (wemRequest.statusCode != System.Net.HttpStatusCode.OK)
                {
                    controler.SetFailUrl(wemRequest.url);
                }
            }


            if (dataLoad != null)
            {
                dataLoad.AddBytes(wemRequest.byteSize);
            }


            crawledPage page = makeCrawledPage(wemRequest, pRecord); // <-----------------------------[ execution reaches this point



            return(page); // <---- passes
        }
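
controler.CheckFail and controler.SetFailUrl are not shown in these examples; below is a minimal sketch of the fail-URL cache role they appear to play, assuming a simple in-memory set. The type and its internals are an assumption, not the library's controler.

    using System;
    using System.Collections.Generic;

    // Minimal sketch of the fail-URL cache behaviour suggested by CheckFail / SetFailUrl.
    class FailUrlCacheSketch
    {
        private readonly HashSet<string> failed = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        public bool CheckFail(string url) => failed.Contains(url);   // true if the URL already failed once

        public void SetFailUrl(string url) => failed.Add(url);       // remember a failed URL

        static void Main()
        {
            var cache = new FailUrlCacheSketch();
            cache.SetFailUrl("http://example.com/broken");
            Console.WriteLine(cache.CheckFail("http://example.com/broken"));   // True
        }
    }
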
Example #9
        //  public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, crawlerAgentContext crawlerContext, Boolean __doTokenization, modelSpiderSiteRecord wRecord)

        /// <summary>
        /// Runs the spider task item.
        /// </summary>
        /// <param name="ln">The ln.</param>
        /// <param name="crawlerContext">The crawler context.</param>
        /// <param name="__doTokenization">if set to <c>true</c> [do tokenization].</param>
        /// <param name="wRecord">The w record.</param>
        /// <returns></returns>
        public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
        {
            spiderTaskResultItem rItem = new spiderTaskResultItem(ln);

            crawledPage page = null;

            page = doWebRequest(ln.url, pRecord); // < ----------------------- crashes here

            rItem.finish(page, pRecord.iteration);

            if (page.status == pageStatus.failed)
            {
                return(rItem);
            }


            pRecord.acceptPage(page);


            pRecord.init(rItem.sPage);

            return(rItem); // <---------------------------------------------- [ passes
        }
Example #10
        /// <summary>
        /// Makes the crawled page.
        /// </summary>
        /// <param name="result">The result.</param>
        /// <param name="pRecord">The p record.</param>
        /// <returns></returns>
        /// <exception cref="aceGeneralException">Error in link processing</exception>
        internal crawledPage makeCrawledPage(IWebResult result, modelSpiderPageRecord pRecord)
        {
            crawledPage page = new crawledPage(result.responseUrl, 0);

            page.result = result;
            page.domain = pRecord.wRecord.domainInfo.domainName;

            var links = result.HtmlDocument.DocumentNode.Descendants("a");

            foreach (HtmlNode hn in links)
            {
                try
                {
                    var  ndv = hn.CreateNavigator();
                    link l   = new link(ndv);
                    if (!l.isDefaultHomePage)
                    {
                        page.links.Add(l);
                    }
                }
                catch (Exception ex)
                {
                    throw new aceGeneralException(ex.Message, ex, page, "Error in link processing");
                }
            }

            var meta = result.HtmlDocument.DocumentNode.Descendants("meta");

            foreach (HtmlNode hn in meta)
            {
                String name    = hn.GetAttributeValue("name", "none");
                String content = hn.GetAttributeValue("content", "");

                switch (name)
                {
                case "keywords":
                    page.pageKeywords = content.SplitSmart(",", "", true, true);
                    break;

                case "description":
                    page.pageDescription = content;
                    break;
                }
            }

            var title = result.HtmlDocument.DocumentNode.Descendants("title").FirstOrDefault();

            if (title != null)
            {
                page.pageCaption = title.InnerText;
            }


            page.links.deployCollection(page);

            page.isCrawled = true;
            page.status    = pageStatus.loaded;

            return(page);
        }
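
The link, meta and title extraction above is plain HtmlAgilityPack traversal wrapped in imb* types; here is a self-contained sketch of the same traversal without those types (the sample HTML string is a stand-in for result.HtmlDocument).

    using System;
    using HtmlAgilityPack;

    class LinkExtractionSketch
    {
        static void Main()
        {
            var doc = new HtmlDocument();
            doc.LoadHtml("<html><head><title>Demo</title></head>" +
                         "<body><a href='/about'>About</a> <a href='https://example.com/'>External</a></body></html>");

            // Same traversal as above: every <a> descendant of the document node.
            foreach (HtmlNode a in doc.DocumentNode.Descendants("a"))
            {
                Console.WriteLine(a.InnerText + " -> " + a.GetAttributeValue("href", ""));
            }

            // Title extraction, mirroring the FirstOrDefault() lookup above.
            HtmlNode title = doc.DocumentNode.SelectSingleNode("//title");
            Console.WriteLine(title?.InnerText);
        }
    }
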
Example #11
        /// <summary>
        /// Builds a report with the non-standard meta information, while the standard ones are stored into the page object. Called automatically from crawlerAgentContextOperations
        /// </summary>
        /// <param name="page"></param>
        /// <param name="output"></param>
        /// <returns></returns>
        public static metricsReport getMetaReport(this crawledPage page, metricsReport output = null)
        {
            if (output == null)
            {
                output = new metricsReport(page.result.HtmlDocument as IXPathNavigable);
            }
            var rt = output.report("META_metanodes", htmlDefinitions.HTMLTags_metaTags);


            // var _allMetaTags = page.xmlDocument.queryXPath(imbXmlXPathTools.makeXPathForAllNodes(htmlDefinitions.HTMLTags_metaTags));
            //  XmlNode old = null;
            string _name = "";

            foreach (IXPathNavigable Ixn in rt.nodes)
            {
                XPathNavigator xn;
                if (Ixn is XPathNavigator)
                {
                    xn = Ixn as XPathNavigator;
                }
                else
                {
                    xn = Ixn.CreateNavigator();
                }


                switch (xn.Name.ToLower())
                {
                case "title":
                    page.pageCaption = xn.Value;

                    output.report("title", page.pageCaption, reportEntryGroups.META);
                    break;

                case "meta":

                    _name = xn.getAttributeValue("name").ToLower();
                    switch (_name)
                    {
                    case "application-name":
                    case "generator":
                    case "author":
                    case "google-site-verification":
                    default:
                        if (!string.IsNullOrEmpty(_name))
                        {
                            output.report(_name, xn.getAttributeValue("content"), reportEntryGroups.META);
                        }
                        break;

                    case "keywords":
                        page.pageKeywords = xn.getAttributeValue("content")
                                              .Split(htmlDefinitions.HTMLMeta_keywordsSepparators, StringSplitOptions.RemoveEmptyEntries)
                                              .ToList();
                        break;

                    case "description":
                        page.pageDescription = xn.getAttributeValue("content");
                        break;
                    }
                    break;
                }
            }
            return(output);
        }
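
The meta walk above runs over an IXPathNavigable view of the HTML; below is a self-contained sketch of the same title/meta dispatch using a plain XPathNavigator over a small XHTML fragment (standard System.Xml.XPath only, no imb* extension methods, and the fragment is a stand-in for the crawled document).

    using System;
    using System.IO;
    using System.Xml.XPath;

    class MetaWalkSketch
    {
        static void Main()
        {
            string xhtml = "<head><title>Demo</title>" +
                           "<meta name='description' content='A demo page'/>" +
                           "<meta name='keywords' content='demo,page'/></head>";

            XPathNavigator nav = new XPathDocument(new StringReader(xhtml)).CreateNavigator();

            // Same dispatch as above: title becomes the caption, meta entries are read by name.
            foreach (XPathNavigator node in nav.Select("//title | //meta"))
            {
                switch (node.Name.ToLower())
                {
                    case "title":
                        Console.WriteLine("caption: " + node.Value);
                        break;

                    case "meta":
                        string name = node.GetAttribute("name", "");
                        string content = node.GetAttribute("content", "");
                        Console.WriteLine(name + ": " + content);
                        break;
                }
            }
        }
    }
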