// var matrix = wRecord.context.targets.GetLinkMatrix(); // pageRank = new PageRank(matrix, alpha, convergence, checkSteps); // Double[] dbl = pageRank.ComputePageRank(); // List<Int32> pri = new List<int>(); // foreach (Double db in dbl) // { // pri.Add(Convert.ToInt32(db* scoreUnit)); //} //ranks = wRecord.context.targets.linkMatrix.MapToY(pri); // public HITSRank(linkMatrix public HITSScore this [ISpiderTarget target] { get { return(targetToScore[target.targetHash]); } }
/// <summary>
/// Counts the outflow links of <paramref name="itemX"/>'s page that target <paramref name="itemY"/>.
/// </summary>
/// <param name="itemY">Target whose <c>targetHash</c> is used to select the links.</param>
/// <param name="itemX">Target whose page supplies the outflow links.</param>
/// <returns>
/// Number of matching outflow links; <c>0</c> when the page of <paramref name="itemX"/> is not a
/// <c>spiderPage</c> (or is missing) or when no link list is returned.
/// </returns>
public int getLinkCountRotated(ISpiderTarget itemY, ISpiderTarget itemX)
{
    spiderPage page = itemX.page as spiderPage;
    if (page == null)
    {
        // The "as" cast yields null for a missing or foreign page type: such a page contributes no links.
        return 0;
    }

    List<spiderLink> links = page.relationship.outflowLinks.GetTargetingTo(itemY.targetHash);
    if (links == null)
    {
        return 0;
    }

    return links.Count;
}
/// <summary>
/// Stores the page carried by the event arguments into the active repository,
/// provided the repository accepts the target (<c>isTargetProper</c>).
/// </summary>
/// <param name="__wRecord">Site record whose <c>domain</c> selects the web site repository.</param>
/// <param name="__args">Event arguments supplying the target and its HTML/XML source.</param>
private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
{
    imbMCRepository repository = mcm.activeRepository;
    imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];
    ISpiderTarget attached = __args.Target;

    if (!repository.isTargetProper(attached))
    {
        // Target rejected by the repository: nothing to store.
        return;
    }

    imbMCWebPage pageRepo = repository.BuildWebPage(attached, siteRepo, loger);

    // Attach the index entry and the raw sources, then save the page structure with them included.
    pageRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(attached.url);
    pageRepo.HtmlSourceCode = __args.sourceHtml;
    pageRepo.XmlSourceCode = __args.sourceXml;
    pageRepo.SaveDataStructure(siteRepo.folder, loger);
}
/// <summary>
/// Creates event arguments for the given target and event type.
/// </summary>
/// <remarks>
/// When <paramref name="__target"/> is a <c>spiderTarget</c>, the source code and HTML document of its
/// loaded web page are copied in. <c>sourceXml</c> is set (to that same source code) only if the
/// HTML document exists and has a document node.
/// </remarks>
/// <param name="__target">Target the event refers to.</param>
/// <param name="__type">Kind of event; defaults to <c>DLCTargetPageAttached</c>.</param>
public modelSpiderSiteRecordEventArgs(ISpiderTarget __target, modelSpiderSiteRecordEventType __type = modelSpiderSiteRecordEventType.DLCTargetPageAttached)
{
    Target = __target;

    spiderTarget crawled = __target as spiderTarget;
    if (crawled != null)
    {
        var result = crawled.page.webpage.result;

        sourceHtml = result.sourceCode;
        htmlDoc = result.HtmlDocument;

        bool hasDocumentNode = htmlDoc != null && htmlDoc.DocumentNode != null;
        if (hasDocumentNode)
        {
            sourceXml = result.sourceCode;
        }
    }

    type = __type;
}
/// <summary>
/// Tells whether the target passes the exclusion settings of this repository
/// (<c>doExcludeDuplicatePages</c> and <c>doExcludeIrrelevantPages</c>).
/// </summary>
/// <param name="target">The target to evaluate.</param>
/// <returns>
/// <c>true</c> if the target is not null and is not excluded by an enabled setting; otherwise, <c>false</c>.
/// </returns>
public bool isTargetProper(ISpiderTarget target)
{
    // Each target flag is read only when its setting is enabled.
    bool rejectedAsDuplicate = doExcludeDuplicatePages && target.isDuplicate;
    bool rejectedAsIrrelevant = doExcludeIrrelevantPages && !target.IsRelevant;

    return target != null && !rejectedAsDuplicate && !rejectedAsIrrelevant;
}
/// <summary>
/// Fills the web page repository of <paramref name="site"/> with the crawl information of an
/// <see cref="ISpiderTarget"/> and saves it into the site folder.
/// </summary>
/// <param name="target">Crawled target supplying URL, page, text, tokens and content blocks</param>
/// <param name="site">The site repository the page belongs to</param>
/// <param name="output">Optional console/log output handed to the repository calls</param>
/// <returns>The web page repository after it was saved</returns>
public imbMCWebPage BuildWebPage(ISpiderTarget target, imbMCWebSite site, ILogBuilder output = null)
{
    imbMCWebPage webPage = GetWebPage(site, target.url, true, output);
    ISpiderPage crawledPage = target.page;

    // Page table entry: anchor captions, click depth and domain-relative URL.
    webPage.entry.AnchorTextAll = crawledPage.captions.toCsvInLine(",");
    webPage.entry.ClickDepth = crawledPage.iterationDiscovery;
    webPage.entry.ResolvedRelativeURL = site.domainInfo.GetURLWithoutDomainName(target.url);
    webPage.deploy(webPage.entry);

    // Content taken directly from the target.
    webPage.indexEntry = target.GetIndexPage();
    webPage.TextContent = target.pageText;
    webPage.name = target.page.name;

    // HTML source is only set when the target can provide a document.
    var document = target.GetHtmlDocument();
    if (document != null)
    {
        webPage.HtmlSourceCode = document.DocumentNode.OuterHtml;
    }

    webPage.Blocks = new List<imbCommonModels.contentBlock.nodeBlock>();
    webPage.TermTable = target.tokens.GetCompiledTable(output);
    target.contentBlocks.ForEach(block => webPage.Blocks.Add(block));

    site.pageTable.AddOrUpdate(webPage.entry);
    webPage.SaveDataStructure(site.folder, output);

    return webPage;
}
/// <summary>
/// Explicit <c>ISpiderTargetCollection</c> implementation: casts both targets to
/// <c>spiderTarget</c> and forwards to <c>getLinkCountRotated</c>.
/// </summary>
/// <param name="itemY">First argument of the forwarded call; must be a <c>spiderTarget</c>.</param>
/// <param name="itemX">Second argument of the forwarded call; must be a <c>spiderTarget</c>.</param>
/// <returns>The value returned by the forwarded call.</returns>
int ISpiderTargetCollection.getLinkCountRotated(ISpiderTarget itemY, ISpiderTarget itemX)
{
    spiderTarget targetY = (spiderTarget)itemY;
    spiderTarget targetX = (spiderTarget)itemX;

    return getLinkCountRotated(targetY, targetX);
}
/// <summary>
/// Explicit <c>ISpiderTargetCollection</c> implementation: casts both targets to
/// <c>spiderTarget</c> and forwards to <c>getLinkCount</c>.
/// </summary>
/// <param name="itemX">First argument of the forwarded call; must be a <c>spiderTarget</c>.</param>
/// <param name="itemY">Second argument of the forwarded call; must be a <c>spiderTarget</c>.</param>
/// <returns>The value returned by the forwarded call.</returns>
int ISpiderTargetCollection.getLinkCount(ISpiderTarget itemX, ISpiderTarget itemY)
{
    spiderTarget targetX = (spiderTarget)itemX;
    spiderTarget targetY = (spiderTarget)itemY;

    return getLinkCount(targetX, targetY);
}