/// <summary>
/// Counts the outflow links of the page attached to <paramref name="itemX"/> that target <paramref name="itemY"/>.
/// </summary>
/// <param name="itemY">Target whose <c>targetHash</c> is looked up among the outflow links.</param>
/// <param name="itemX">Target whose attached page supplies the outflow links.</param>
/// <returns>
/// Number of links returned by <c>GetTargetingTo</c>; <c>0</c> when <paramref name="itemX"/> has no
/// <c>spiderPage</c> attached.
/// </returns>
public int getLinkCountRotated(ISpiderTarget itemY, ISpiderTarget itemX)
{
    // The "as" cast yields null when no page is attached (or it is not a spiderPage);
    // without this guard the dereference below would throw NullReferenceException.
    spiderPage page = itemX.page as spiderPage;
    if (page == null)
    {
        return 0;
    }

    List<spiderLink> links = page.relationship.outflowLinks.GetTargetingTo(itemY.targetHash);
    return links.Count;
}
/// <summary>
/// Creates a spider link from a <c>link</c> found on the given page of origin.
/// </summary>
/// <param name="__home">Page of origin; stored in <c>originPage</c>. Must not be <c>null</c>.</param>
/// <param name="__link">Source link; its url, caption and domain are copied. Must not be <c>null</c>.</param>
/// <param name="__iteracija">Iteration number stored in <c>iterationDiscovery</c>.</param>
/// <exception cref="aceGeneralException">Thrown when <paramref name="__home"/> or <paramref name="__link"/> is <c>null</c>.</exception>
public spiderLink(spiderPage __home, link __link, int __iteracija)
{
    if (__home == null)
    {
        throw new aceGeneralException("Page of origin for this link never provided", null, this, "Bad arguments at constructor");
    }

    // __link is dereferenced right below; fail with the same descriptive exception type
    // as the __home check instead of an anonymous NullReferenceException.
    if (__link == null)
    {
        throw new aceGeneralException("Link instance for this spiderLink never provided", null, this, "Bad arguments at constructor");
    }

    link = __link;
    url = link.url.ToString();

    originPage = __home;
    iterationDiscovery = __iteracija;

    name = link.caption;
    domain = link.domain;

    // Register the first caption and url seen for this link
    captions.Add(link.caption);
    urls.AddInstance(url, "Link urls @ spiderLink");
}
/// <summary>
/// Attaches the page to this target. When an HTML document is available, extracts and hashes the page text,
/// optionally builds the block tree and the term document, evaluates the page and fires the page-attached callback.
/// </summary>
/// <param name="__page">The page to attach.</param>
/// <param name="response">Log builder handed to <c>content.AddTokens</c>.</param>
/// <param name="targetBlockCount">Block count handed to <c>contentTree.getBlocks</c>.</param>
/// <returns><c>false</c> if <paramref name="__page"/> was already the attached page; otherwise <c>true</c>.</returns>
public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
{
    // Same page as before: nothing to do
    if (page == __page)
    {
        return false;
    }

    page = __page;
    HtmlDocument document = GetHtmlDocument();
    iterationLoaded = parent.wRecord.iteration;

    // Without a document the page is attached but no content processing takes place
    if (document == null)
    {
        return true;
    }

    // Retrieve the page text, decode HTML entities and hash the result
    XPathNavigator navigator = document.DocumentNode.CreateNavigator();
    pageText = navigator.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);
    pageText = WebUtility.HtmlDecode(pageText);
    pageHash = md5.GetMd5Hash(pageText);

    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
    {
        contentTree = document.buildTree(page.webpage.domain);
        contentBlocks = contentTree.getBlocks(targetBlockCount);
        contentBlocks.CalculateScores();
    }

    var domainTokens = parent.wRecord.domainInfo.domainWords;
    var tokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);

    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
    {
        content = parent.dlTargetPageTokens.AddTable(key) as termDocument;
        content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
        content.AddTokens(tokens.ToList(), response);
    }

    bool evaluationOk = false;

    // The index is consulted only in full-trust mode; otherwise the page is treated as not indexed
    indexPageEvaluationEntryState indexState;
    if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
    {
        indexState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
    }
    else
    {
        indexState = indexPageEvaluationEntryState.notInTheIndex;
    }

    if (indexState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
    {
        // NOTE(review): result_language is copied from evaluatedLanguage BEFORE evaluatedLanguage is set
        // two statements below, and evaluationOk is never read after being assigned - confirm this is intended.
        evaluation = new multiLanguageEvaluation();
        evaluation.result_language = evaluatedLanguage;
        evaluationOk = indexState.HasFlag(indexPageEvaluationEntryState.isRelevant);
        evaluatedLanguage = basicLanguageEnum.serbian;
    }
    else
    {
        // No index entry to rely on: run the evaluator over the page text
        evaluation = parent.wRecord.tRecord.evaluator.evaluate(pageText, domainTokens, tokens.ToList());
        evaluatedLanguage = evaluation.result_language;
    }

    // Term and relevant-page collections are updated under RelevantPageLock
    lock (RelevantPageLock)
    {
        if (IsRelevant)
        {
            parent.wRecord.context.targets.termSerbian.AddRange(tokens);
            parent.wRecord.relevantPages.AddUnique(__page.url);
            parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
        }
        else
        {
            parent.wRecord.context.targets.termOther.AddRange(tokens);
        }

        parent.wRecord.context.targets.termsAll.AddRange(tokens);
    }

    // Notify the subscriber, if any, that a page was attached to this target
    if (parent.wRecord.context.OnTargetPageAttached != null)
    {
        var eventArgs = new modelSpiderSiteRecordEventArgs(this);
        parent.wRecord.context.OnTargetPageAttached(parent.wRecord, eventArgs);
    }

    return true;
}
/// <summary>
/// Processes loader result: flags content duplicates, registers each loaded page with its target and
/// in <c>webPages</c>, and harvests links from pages that did not fail to load.
/// </summary>
/// <param name="stResult">The loader task result to iterate over.</param>
/// <param name="doLinkResolver">Forwarded to <c>processLink</c>: performs LinkResolver component tasks over each harvested link</param>
/// <param name="doLinkHarvest">Extract designated linkNature and linkScope from the content</param>
/// <param name="nature">The nature of links to harvest - flags</param>
/// <param name="scope">The scope of links to harvest - flags</param>
/// <returns>Number of result items whose status was <c>pageStatus.failed</c>.</returns>
public int processLoaderResult(spiderTaskResult stResult, bool doLinkResolver = true, bool doLinkHarvest = true, linkNature nature = linkNature.navigation, linkScope scope = linkScope.inner)
{
    int failedCount = 0;

    // Iterate through all loaded pages
    foreach (spiderTaskResultItem cresult in stResult)
    {
        spiderPage pg = cresult.sPage;
        bool loaded = cresult.status != pageStatus.failed;

        if (loaded)
        {
            // Count this content hash; a count above one means the same content was already seen
            web.webPageContentHashList.AddInstance(pg.contentHash, 1);
            var occurrences = web.webPageContentHashList[pg.contentHash];

            if (occurrences > 1)
            {
                if (imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog)
                {
                    aceLog.log("Page [" + pg.url + "] - is content duplicate ");
                }

                wRecord.listOfDuplicatedPages.Add(new contentHashAndAddressEntry(pg.url, pg.contentHash, occurrences));
                wRecord.duplicateCount++;

                var duplicateTarget = targets.GetByTarget(cresult.target);
                if (duplicateTarget != null)
                {
                    duplicateTarget.isDuplicate = true;
                }

                // NOTE(review): duplicates leave the loop without cresult.dispose() being called,
                // unlike every other item - confirm whether that is intended.
                continue;
            }
        }

        // Write the page reference into the originating link/target
        cresult.target.targetedPage = pg;

        // Register the page in the webPages set
        if (!wRecord.web.webPages.Add(pg))
        {
            wRecord.logBuilder.log("Web page [" + pg.url + "] was loaded before - check the algorithm");
        }

        if (loaded)
        {
            if (doLinkHarvest)
            {
                // Extract the links matching the requested nature and scope from the page
                List<link> links = cresult.page.links.Where<link>(x => (x.nature.HasFlag(nature) && x.scope.HasFlag(scope))).ToList();

                foreach (link ln in links)
                {
                    // doLinkResolver is forwarded so the caller's choice is honored
                    processLink(ln, pg, doLinkResolver);
                }
            }
        }
        else
        {
            failedCount++;
        }

        cresult.dispose();
    }

    if (OnLoaderTaskProcessed != null)
    {
        OnLoaderTaskProcessed(wRecord, new modelSpiderSiteRecordEventArgs(stResult));
    }

    return failedCount;
}
/// <summary>
/// Processes the link into Targets: optionally normalizes its url, wraps it in a <c>spiderLink</c>,
/// records it in the web link/target sets and creates a new target when none exists for it yet.
/// </summary>
/// <param name="ln">The link to process; its <c>url</c> is rewritten in place when <paramref name="doLinkResolver"/> is <c>true</c>.</param>
/// <param name="parentNode">The page the link was found on.</param>
/// <param name="doLinkResolver">if set to <c>true</c> the url normalization steps are applied.</param>
/// <returns><c>true</c> if a new target was created for the link; otherwise <c>false</c>.</returns>
public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
{
    bool isNewLink = false;

    // ---- Link normalization
    if (doLinkResolver)
    {
        ln.url = ln.getAbsoluteUrl(parentNode.webpage);
        ln.url = ln.url.httpsToHttpShema();
        ln.url = ln.url.equalizeUrlWithIndexFilenames();
        ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

        try
        {
            domainAnalysis da = new domainAnalysis(ln.url);

            // Urls are not linguistic text: search ordinally, and only once
            int domainAt = ln.url.IndexOf(da.domainName, StringComparison.Ordinal);
            if (domainAt > -1)
            {
                // Exactly one character after the domain name: replace the url with da.urlProper
                int trailingLength = ln.url.Length - (domainAt + da.domainName.Length);
                if (trailingLength == 1)
                {
                    ln.url = da.urlProper;
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failed domain analysis is logged and the url is kept as resolved so far
            imbWEMManager.log.log("Process link exception: " + ex.Message);
        }
    }

    // Record the origin reference: page, link and iteration
    spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

    if (!spider.approveUrl(sln.link))
    {
        // The link is not desirable / allowed
        sln.flags |= spiderLinkFlags.urlNotSupported;
        return isNewLink;
    }

    // Look up an existing target before the link is registered in the web sets
    spiderTarget target = targets.GetByTarget(sln);

    if (wRecord.web.webLinks.Add(sln))
    {
        sln.flags |= spiderLinkFlags.newlinkVector;
    }
    else
    {
        sln.flags |= spiderLinkFlags.oldlinkVector;
    }

    if (wRecord.web.webTargets.Add(sln))
    {
        sln.flags |= spiderLinkFlags.newlinkTarget;
    }
    else
    {
        sln.flags |= spiderLinkFlags.oldlinkTarget;
    }

    // A target is created only when none was found; the newlinkTarget flag alone never led to creation
    if (target == null)
    {
        isNewLink = true;
        targets.GetOrCreateTarget(sln, true, true);

        // Add to the list of active links
        wRecord.web.webActiveLinks.Add(sln);
    }

    return isNewLink;
}