/// <summary>
/// Completes this result item: stores the supplied page, copies its status, records the time
/// elapsed since <c>startTime</c> and wraps the page into a new <c>spiderPage</c> that points
/// back to this result.
/// </summary>
/// <param name="__page">The crawled page this result item is finished with.</param>
/// <param name="__iteration">Iteration value passed as the third constructor argument of <c>spiderPage</c>.</param>
public void finish(crawledPage __page, int __iteration)
{
    page = __page;
    status = page.status;

    // Elapsed time is measured against the local clock, exactly when the page is handed over.
    DateTime finishedAt = DateTime.Now;
    duration = finishedAt.Subtract(startTime);

    // The second constructor argument is taken from the target, the third from the caller.
    sPage = new spiderPage(page, target.iterationDiscovery, __iteration);
    sPage.spiderResult = this;
}
/// <summary>
/// Accepts a loaded page: replaces the stored <c>page</c> when a non-null page is supplied,
/// then logs the caption and URL of the stored page.
/// </summary>
/// <param name="__page">The loaded page; when <c>null</c> the currently stored page is kept.</param>
public void acceptPage(crawledPage __page)
{
    if (__page != null)
    {
        page = __page;
    }

    // NOTE(review): when both __page and the stored page are null this dereference fails — confirm callers never do that.
    string message = "Page [" + page.caption + "] loaded [" + page.url + "]";
    log(message);
}
/// <summary>
/// Creates a spider page around a crawled page, copying its URL, name, caption and description.
/// </summary>
/// <param name="__webpage">The crawled page to wrap.</param>
/// <param name="__iteration">Value stored into <c>iterationDiscovery</c>.</param>
/// <param name="__iterationLoad">
/// NOTE(review): accepted but never read by this constructor — confirm whether it should be stored.
/// </param>
public spiderPage(crawledPage __webpage, int __iteration, int __iterationLoad)
{
    this.webpage = __webpage;

    // An empty string is the fallback value handed to toStringSafe.
    this.url = this.webpage.url.toStringSafe("");
    this.iterationDiscovery = __iteration;

    this.name = __webpage.name;
    this.captions.Add(__webpage.pageCaption);
    this.description = __webpage.description;
}
/// <summary>
/// Collects the basic HTML metrics of a crawled page into a metrics report.
/// The meta report (<c>getMetaReport</c>) is run first, then link counts and tag-group
/// counts are reported under their <c>FVnn_*</c> keys.
/// </summary>
/// <param name="page">The crawled page whose document and links are measured.</param>
/// <param name="settings">
/// Metric settings; only <c>flags</c> is read here. May be <c>null</c>, in which case no
/// optional adjustment is applied.
/// </param>
/// <param name="output">
/// Report to fill. When <c>null</c>, a new report is created over <c>page.result.HtmlDocument</c>.
/// </param>
/// <returns>The report that received the entries.</returns>
public static metricsReport getHtmlMetrics(crawledPage page, metricsSettings settings, metricsReport output = null)
{
    if (output == null)
    {
        output = new metricsReport(page.result.HtmlDocument as IXPathNavigable);
    }

    getMetaReport(page, output);

    int linkInner = page.links.byScope[linkScope.inner].Count;
    output.report("FV01_linkOuter", page.links.byScope[linkScope.outer].Count);
    output.report("FV02_linkInner", linkInner);

    // Each tag group is reported exactly once under its own key.
    output.report("FV31_cListStructures", htmlDefinitions.HTMLTags_listStructureTags, true);
    output.report("FV32_cTableStructures", htmlDefinitions.HTMLTags_tableStructureTags, true);
    output.report("FV41_cHeadingTags", htmlDefinitions.HTMLTags_headingTags, true);
    output.report("FV42_cStructureTags", htmlDefinitions.HTMLTags_allStructureTags, true);

    reportEntryBase multimediaEntry = output.report("FV43_cMultiMediaTags", htmlDefinitions.HTMLTags_multimediaTags, true);

    if (settings != null && settings.flags.HasFlag(metricsFlag.downloadPluginLinkAsMultimediaTag))
    {
        // Every outer link pointing to www.adobe.com is added to the multimedia tag count.
        int multimediaCount = (int)multimediaEntry.Value;
        foreach (link outerLink in page.links.byScope[linkScope.outer])
        {
            if (outerLink.domain == "www.adobe.com")
            {
                multimediaCount++;
            }
        }

        multimediaEntry.Value = multimediaCount;
    }

    // NOTE(review): FV44_cImageTags is computed from the multimedia tag set, the same set as FV43 —
    // confirm whether an image-specific tag set was intended.
    output.report("FV44_cImageTags", htmlDefinitions.HTMLTags_multimediaTags, true);

    return output;
}
/// <summary>
/// Passes every page of the given index domain to the site record's context as a link.
/// A spider page built from the domain URL is supplied as the second argument of each
/// <c>processLink</c> call.
/// </summary>
/// <param name="wRecord">Site record whose <c>context</c> processes the links.</param>
/// <param name="domain">Index domain providing the page set and the domain URL.</param>
public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
{
    List<indexPage> knownPages = domain.getPageSet();

    // One shared spider page, created from the domain URL, accompanies all links.
    spiderPage domainPage = new spiderPage(new crawledPage(domain.url, 0), 0, 0);

    foreach (indexPage knownPage in knownPages)
    {
        wRecord.context.processLink(new link(knownPage.url), domainPage, false);
    }
}
/// <summary>
/// Runs the imbBasic template detection method.
/// </summary>
/// <remarks>
/// The common tree is detected first, then filtered by the common content check, and the
/// resulting definition is prepared through <c>templateExtensions.prepareContent</c>.
/// An empty <paramref name="source"/> is no longer rejected by this method itself; it is
/// handed to the detection helpers like any other input.
/// </remarks>
/// <param name="source">The loaded pages.</param>
/// <param name="settings">The template detection settings.</param>
/// <returns>The prepared template definition.</returns>
public static templateDefinition detectTemplate_imbBasic(crawledPage[] source, imbWebTemplateSettings settings)
{
    templateDefinition output = new templateDefinition();

    // COMMON TREE DETECTION
    switch (settings.commonTreeDetection)
    {
        default:
        case commonTreeMethod.imbEndNodePathFrequency:
            output.xPathStruktura = templateOperations.commonTree_imbENPF(source, settings);
            break;
    }

    // COMMON CONTENT CHECK
    output.xPathStruktura = templateOperations.commonContentCheck(source, output.xPathStruktura, settings);

    templateExtensions.prepareContent(output, settings);

    // Page tracking: remember which URLs contributed to this template.
    if (settings.doSavePageUrls)
    {
        foreach (crawledPage p in source)
        {
            output.relatedPages.Add(p.url);
        }
    }

    output.score = source.Length;

    if (output.xPathStruktura.Count == 0)
    {
        logSystem.log("Template detection failed!", logType.Notification);
    }

    return output;
}
/// <summary>
/// Registers the seed URL: builds the seed spider link, stores it in <c>seedLink</c>,
/// uses the URL as <c>name</c> and the URL's host as <c>domain</c>.
/// </summary>
/// <param name="rootUrl">The root URL; it is parsed with <see cref="Uri"/>.</param>
/// <returns>The newly created seed link.</returns>
public spiderLink setSeedUrl(string rootUrl)
{
    link seed = new link(rootUrl, linkProcessFlags.standard);
    Uri parsedRoot = new Uri(rootUrl);

    // The seed link is attached to a spider page created from ORIGIN_OF_ROOTURL.
    spiderPage originPage = new spiderPage(new crawledPage(ORIGIN_OF_ROOTURL, 0), 0, 0);
    spiderLink result = new spiderLink(originPage, seed, 1);

    seedLink = result;
    name = rootUrl;

    string host = parsedRoot.Host;
    result.domain = host;
    domain = host;
    result.link.domain = domain;

    return result;
}
// Disabled request options, kept for reference:
//request.doContentCheck = false;
//request.doCoolOff = false;
//request.doRetryExecution = false;
//request.doSubdomainVariations = false;
//request.doTimeoutLimiter = true;
//request.doLogCacheLoaded = imbWEMManager.settings.executionLog.doPageLoadedFromCache;
//request.doLogNewLoad = imbWEMManager.settings.executionLog.doPageLoadedLog;
//request.doLogRequestError = imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog;
//request.htmlSettings.doTransliterateToLat = false;
//request.htmlSettings.doRemoveHtmlEntities = true;
//request.htmlSettings.doUpperCase = true;
//request.htmlSettings.doAutocloseOnEnd = true;

/// <summary>
/// Performs the web request for a URL and converts the outcome into a crawled page.
/// URLs that <c>controler.CheckFail</c> reports are not requested again; they are marked
/// as executed with status <c>ExpectationFailed</c>.
/// </summary>
/// <param name="url">The URL; it is first passed through <c>controler.GetDuplicateUrl</c>.</param>
/// <param name="pRecord">The page record forwarded to <c>makeCrawledPage</c>.</param>
/// <returns>The page built by <c>makeCrawledPage</c> from the request.</returns>
internal crawledPage doWebRequest(string url, modelSpiderPageRecord pRecord)
{
    url = controler.GetDuplicateUrl(url);

    if (url.isNullOrEmpty())
    {
        // NOTE(review): the empty URL is only reported (log + beep); execution continues below — confirm this is intended.
        imbWEMManager.log.log("EMPTY URL PASSED TO THE WEB LOADER");
        imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(2200);
    }

    loaderRequest wemRequest = new loaderRequest(url);

    bool knownFailure = controler.CheckFail(wemRequest.url);
    if (!knownFailure)
    {
        wemRequest = loaderSubsystem.ExecuteRequest(wemRequest);

        // Any status other than OK registers the URL as failed.
        if (wemRequest.statusCode != System.Net.HttpStatusCode.OK)
        {
            controler.SetFailUrl(wemRequest.url);
        }
    }
    else
    {
        wemRequest.executed = true;
        wemRequest.statusCode = System.Net.HttpStatusCode.ExpectationFailed;
    }

    if (dataLoad != null)
    {
        dataLoad.AddBytes(wemRequest.byteSize);
    }

    // Original trace note: execution reaches this point and passes.
    return makeCrawledPage(wemRequest, pRecord);
}
/// <summary>
/// Runs one spider task item: loads the link's URL, finishes the result item with the loaded
/// page and, unless the page status is <c>failed</c>, hands the page to the page record.
/// </summary>
/// <param name="ln">The link to load.</param>
/// <param name="__doTokenization">Not read by this method.</param>
/// <param name="pRecord">The page record that receives the page and supplies the iteration.</param>
/// <returns>The finished result item.</returns>
public spiderTaskResultItem runSpiderTaskItem(spiderLink ln, bool __doTokenization, modelSpiderPageRecord pRecord)
{
    spiderTaskResultItem rItem = new spiderTaskResultItem(ln);

    // Original trace note: this call was marked as the place where it breaks.
    crawledPage loadedPage = doWebRequest(ln.url, pRecord);

    rItem.finish(loadedPage, pRecord.iteration);

    if (loadedPage.status != pageStatus.failed)
    {
        pRecord.acceptPage(loadedPage);
        pRecord.init(rItem.sPage);
    }

    return rItem;
}
/// <summary>
/// Makes the crawled page from a web result: collects the anchor links, reads the keywords and
/// description meta tags and the title, then deploys the link collection.
/// </summary>
/// <remarks>
/// When the result carries no HTML document, the page is returned with status
/// <c>pageStatus.failed</c> and without links or meta data, instead of failing on a null document.
/// </remarks>
/// <param name="result">The web result; <c>responseUrl</c> and <c>HtmlDocument</c> are read.</param>
/// <param name="pRecord">The page record; supplies the domain name through <c>wRecord.domainInfo</c>.</param>
/// <returns>The crawled page, with status <c>loaded</c> or <c>failed</c>.</returns>
/// <exception cref="aceGeneralException">Error in link processing</exception>
internal crawledPage makeCrawledPage(IWebResult result, modelSpiderPageRecord pRecord)
{
    crawledPage page = new crawledPage(result.responseUrl, 0);
    page.result = result;
    page.domain = pRecord.wRecord.domainInfo.domainName;

    // Without a document there is nothing to parse; report the page as failed.
    if (result.HtmlDocument == null)
    {
        page.status = pageStatus.failed;
        return page;
    }

    var documentNode = result.HtmlDocument.DocumentNode;

    foreach (HtmlNode anchor in documentNode.Descendants("a"))
    {
        try
        {
            var navigator = anchor.CreateNavigator();
            link l = new link(navigator);

            // Links flagged as the default home page are not collected.
            if (!l.isDefaultHomePage)
            {
                page.links.Add(l);
            }
        }
        catch (Exception ex)
        {
            throw new aceGeneralException(ex.Message, ex, page, "Error in link processing");
        }
    }

    foreach (HtmlNode metaNode in documentNode.Descendants("meta"))
    {
        string name = metaNode.GetAttributeValue("name", "none");
        string content = metaNode.GetAttributeValue("content", "");

        switch (name)
        {
            case "keywords":
                page.pageKeywords = content.SplitSmart(",", "", true, true);
                break;

            case "description":
                page.pageDescription = content;
                break;
        }
    }

    var title = documentNode.Descendants("title").FirstOrDefault();
    if (title != null)
    {
        page.pageCaption = title.InnerText;
    }

    page.links.deployCollection(page);
    page.isCrawled = true;
    page.status = pageStatus.loaded;

    return page;
}
/// <summary>
/// Builds a report from the page's meta nodes: the title, keywords and description are stored
/// on the page object, every other named meta tag is written to the report.
/// According to the original note, this is called automatically from crawlerAgentContextOperations.
/// </summary>
/// <remarks>
/// Tag and attribute names are matched after invariant-culture lower-casing, so matching the
/// fixed tokens ("title", "meta", "keywords", "description") does not depend on the current culture.
/// </remarks>
/// <param name="page">The crawled page; receives caption, keywords and description.</param>
/// <param name="output">
/// Report to fill. When <c>null</c>, a new report is created over <c>page.result.HtmlDocument</c>.
/// </param>
/// <returns>The report that received the entries.</returns>
public static metricsReport getMetaReport(this crawledPage page, metricsReport output = null)
{
    if (output == null)
    {
        output = new metricsReport(page.result.HtmlDocument as IXPathNavigable);
    }

    var metaNodes = output.report("META_metanodes", htmlDefinitions.HTMLTags_metaTags);

    foreach (IXPathNavigable navigable in metaNodes.nodes)
    {
        // Use the node directly when it already is a navigator, otherwise create one.
        XPathNavigator xn = (navigable as XPathNavigator) ?? navigable.CreateNavigator();

        switch (xn.Name.ToLowerInvariant())
        {
            case "title":
                page.pageCaption = xn.Value;
                output.report("title", page.pageCaption, reportEntryGroups.META);
                break;

            case "meta":
            {
                string metaName = xn.getAttributeValue("name").ToLowerInvariant();

                switch (metaName)
                {
                    case "keywords":
                        page.pageKeywords = xn.getAttributeValue("content")
                            .Split(htmlDefinitions.HTMLMeta_keywordsSepparators, StringSplitOptions.RemoveEmptyEntries)
                            .ToList();
                        break;

                    case "description":
                        page.pageDescription = xn.getAttributeValue("content");
                        break;

                    default:
                        // Every other meta tag with a non-empty name goes into the report.
                        if (!string.IsNullOrEmpty(metaName))
                        {
                            output.report(metaName, xn.getAttributeValue("content"), reportEntryGroups.META);
                        }
                        break;
                }

                break;
            }
        }
    }

    return output;
}