/// <summary>
/// Scores a link by walking the <c>crossLinks</c> of its origin page: every cross-linked page
/// whose outflow links contain the same target adds <c>scoreUnit</c> to the result.
/// </summary>
/// <param name="link">The link to evaluate; its <c>originPage</c> must be set.</param>
/// <returns>The rule result carrying the accumulated score (0 when no cross-linked page shares the target).</returns>
/// <exception cref="aceGeneralException">Thrown when <c>link.originPage</c> is <c>null</c>.</exception>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    result.score = 0;

    // Without an origin page there are no cross links to inspect.
    if (link.originPage == null)
    {
        throw new aceGeneralException("Link origin page not set!", null, link, "ruleActiveCrossLink->evaluate()");
    }

    foreach (var crossEntry in link.originPage.relationship.crossLinks)
    {
        // Resolve the cross-linked page through the web page registry by its target hash.
        spiderPage neighbour = wRecord.web.webPages[crossEntry.Value.targetHash];

        if (neighbour.relationship.outflowLinks.ContainsAsTarget(link.targetHash))
        {
            result.score += scoreUnit;
        }
    }

    return result;
}
/// <summary>
/// Registers the page under its signature key when that key is not known yet.
/// When the key already exists, the stored entry is replaced only if it is not crawled
/// and the supplied page is crawled; otherwise the stored entry is kept and a note is logged.
/// </summary>
/// <param name="page">The page to register.</param>
/// <returns>
/// <c>true</c> only when the page was added under a new key; <c>false</c> whenever the key
/// already existed, including the case where the stored entry was replaced.
/// </returns>
public bool Add(spiderPage page)
{
    string key = page.getPageSignature(useCription);

    if (!items.ContainsKey(key))
    {
        items.Add(key, page);
        hashList.Add(key);
        return true;
    }

    // Capture the stored entry up front so the log below names the page that gets replaced,
    // not the new instance that has just been written into the same slot.
    var existing = items[key];

    if (existing.webpage.isCrawled)
    {
        aceLog.log("---- shouldn't replace existing page if it is crawled yet ----");
    }
    else if (page.webpage.isCrawled)
    {
        items[key] = page;
        imbWEMManager.log.log("Temp. spiderPage [" + existing.name + "] replaced by new instance for domain [" + page.webpage.domain + "]");
    }
    else
    {
        aceLog.log("---- This page is not crawled yet and it cannot substitute registrated one ----");
    }

    // The key was already present, so nothing new was added to hashList.
    return false;
}
/// <summary>
/// Decides whether the page should be aborted based on how its score compares to the first quartile.
/// </summary>
/// <param name="element">The page being evaluated.</param>
/// <param name="sRecord">The site record (not used by this implementation).</param>
/// <param name="resources">Additional resources (not used by this implementation).</param>
/// <returns>
/// <c>null</c> when <c>max</c> is below <c>treshold</c>; an <c>aborted</c> solution when the page has
/// more cross links than <c>treshold</c> and its score is at or below <c>q1</c>; otherwise a default solution.
/// </returns>
public override spiderObjectiveSolution evaluate(spiderPage element, modelSpiderSiteRecord sRecord, params object[] resources)
{
    spiderObjectiveSolution solution = new spiderObjectiveSolution();

    if (max < treshold)
    {
        return null;
    }

    bool overThreshold = element.relationship.crossLinks.Count() > treshold;
    if (!overThreshold)
    {
        return solution;
    }

    // Quartiles are computed only while q1 still holds int.MinValue (presumably its "unset" marker - confirm).
    if (q1 == int.MinValue)
    {
        double lowerQuartile;
        double upperQuartile;
        Measures.Quartiles(scoreList.ToArray(), out lowerQuartile, out upperQuartile, false);
        q1 = Convert.ToInt32(lowerQuartile);
        q3 = Convert.ToInt32(upperQuartile);
    }

    if (element.marks.score <= q1)
    {
        solution = new spiderObjectiveSolution(element, spiderObjectiveStatus.aborted);
    }

    return solution;
}
/// <summary>
/// Records the page's cross-link count: widens the running <c>min</c>/<c>max</c> and appends the count to <c>scoreList</c>.
/// </summary>
/// <param name="element">The page to take the cross-link count from.</param>
/// <param name="sRecord">The site record (not used by this implementation).</param>
/// <param name="resources">Additional resources (not used by this implementation).</param>
public override void learn(spiderPage element, modelSpiderSiteRecord sRecord, params object[] resources)
{
    int crossLinkCount = element.relationship.crossLinks.Count();

    min = Math.Min(min, crossLinkCount);
    max = Math.Max(max, crossLinkCount);

    scoreList.Add((double)crossLinkCount);
}
/// <summary>
/// Completes this result item: stores the crawled page and its status, measures the time
/// elapsed since <c>startTime</c>, and creates the <c>sPage</c> that points back to this result.
/// </summary>
/// <param name="__page">The crawled page produced for this result.</param>
/// <param name="__iteration">Iteration number handed to the new <c>spiderPage</c>.</param>
public void finish(crawledPage __page, int __iteration)
{
    page = __page;
    status = page.status;

    // Elapsed wall-clock time since the result item was started.
    duration = DateTime.Now - startTime;

    sPage = new spiderPage(page, target.iterationDiscovery, __iteration);
    sPage.spiderResult = this;
}
/// <summary>
/// Learns from the page before evaluation: raises <c>linkScoreMax</c> to the score of the
/// link target that led to this page, when the page has a spider result attached.
/// </summary>
/// <param name="page">The page to learn from.</param>
public override void learn(spiderPage page)
{
    // Pages without a spider result contribute nothing.
    if (page.spiderResult == null)
    {
        return;
    }

    linkScoreMax = Math.Max(page.spiderResult.target.marks.score, linkScoreMax);
}
/// <summary>
/// Returns the signature used to key the page, which is the page's <c>originHash</c>.
/// </summary>
/// <param name="sPage">The page to get the signature for.</param>
/// <param name="cripted">Currently not consulted; the signature is the same for both values.</param>
/// <returns>The value of <c>sPage.originHash</c>.</returns>
public static string getPageSignature(this spiderPage sPage, bool cripted = false)
{
    return sPage.originHash;
}
/// <summary>
/// Initializes the record for the given page instance and, when a general record is available,
/// registers this record in its side record sets.
/// </summary>
/// <param name="__pageInstance">The page instance this record is bound to.</param>
public void init(spiderPage __pageInstance)
{
    state = modelRecordStateEnum.initiated;
    pageInstance = __pageInstance;

    // Nothing to register with when there is no general record.
    if (pGeneralRecord == null)
    {
        return;
    }

    pGeneralRecord.sideRecordSets.AddRecord(instance, this);
}
/// <summary>
/// Passes every page URL from the domain's page set to <c>wRecord.context.processLink</c>,
/// together with a <c>spiderPage</c> built from the domain URL.
/// </summary>
/// <param name="wRecord">The site record whose context processes the links.</param>
/// <param name="domain">The indexed domain supplying the URL and the page set.</param>
public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
{
    List<indexPage> knownPages = domain.getPageSet();

    // One shared page instance, created from the domain URL, accompanies every processed link.
    crawledPage domainCrawledPage = new crawledPage(domain.url, 0);
    spiderPage domainSpiderPage = new spiderPage(domainCrawledPage, 0, 0);

    foreach (indexPage knownPage in knownPages)
    {
        link pageLink = new link(knownPage.url);
        wRecord.context.processLink(pageLink, domainSpiderPage, false);
    }
}
/// <summary>
/// Evaluates the page by its number of inflow links relative to <c>maxInboundLinks</c>.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>
/// A passive rule result. When <c>maxInboundLinks</c> is positive, the score is <c>scoreUnit</c> scaled by
/// the ratio and converted with <c>Convert.ToInt32</c>, and the comment holds the ratio formatted as "P2";
/// otherwise the result is returned as constructed.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderPage page)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this, spiderEvalRuleResultEnum.passive);
    int inboundCount = page.relationship.inflowLinks.Count;

    // No positive maximum means there is nothing to scale against.
    if (!(maxInboundLinks > 0))
    {
        return result;
    }

    decimal share = Convert.ToDecimal(inboundCount) / Convert.ToDecimal(maxInboundLinks);
    decimal weighted = scoreUnit * share;

    result.comment = share.ToString("P2");
    result.score = Convert.ToInt32(weighted);

    return result;
}
/// <summary>
/// Evaluates the page by its caption tokens: the first token that has a count of 1 in
/// <c>titleWords</c> and is a known word of <c>language</c> earns <c>scoreUnit</c>;
/// when no token qualifies the page gets <c>penaltyUnit</c>.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>
/// A passive rule result. The comment names the last token found with a count of 1
/// (with "&amp; IsKnown" appended when that token produced the score).
/// </returns>
public override spiderEvalRuleResult evaluate(spiderPage page)
{
    spiderEvalRuleResult output = new spiderEvalRuleResult(this, spiderEvalRuleResultEnum.passive);

    List<string> words = page.webpage.pageCaption.getStringTokensMinLength();

    foreach (string wrd in words)
    {
        // Only tokens counted exactly once in titleWords are of interest.
        if (titleWords[wrd] != 1)
        {
            continue;
        }

        output.comment = " [" + wrd + "]:IsUni ";

        if (language.isKnownWord(wrd))
        {
            output.comment += "& IsKnown";
            output.score = scoreUnit;
            return output;
        }
    }

    // No token qualified. The former "score" flag was never set to true, so the penalty
    // was the only reachable outcome at this point.
    output.score = penaltyUnit;
    return output;
}
/// <summary>
/// Creates the seed link for the given root URL, stores it in <c>seedLink</c>, and sets
/// <c>name</c> and <c>domain</c> (the URL's host) on this instance.
/// </summary>
/// <param name="rootUrl">The root URL; it is parsed with <c>new Uri(rootUrl)</c>, which throws for a null or malformed value.</param>
/// <returns>The newly created seed <c>spiderLink</c>.</returns>
public spiderLink setSeedUrl(string rootUrl)
{
    link seed = new link(rootUrl, linkProcessFlags.standard);
    Uri rootUri = new Uri(rootUrl);

    // The seed link hangs off a page instance created from ORIGIN_OF_ROOTURL.
    crawledPage originCrawledPage = new crawledPage(ORIGIN_OF_ROOTURL, 0);
    spiderPage originSpiderPage = new spiderPage(originCrawledPage, 0, 0);
    spiderLink seedSpiderLink = new spiderLink(originSpiderPage, seed, 1);

    seedLink = seedSpiderLink;
    name = rootUrl;

    string host = rootUri.Host;
    seedSpiderLink.domain = host;
    domain = host;
    seedSpiderLink.link.domain = domain;

    return seedSpiderLink;
}
/// <summary>
/// Evaluates the page by the score of the link target that led to it, relative to <c>linkScoreMax</c>.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>
/// An active rule result. The score and comment are filled in only when the page has a spider result
/// and <c>linkScoreMax</c> is positive; otherwise the result is returned as constructed.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderPage page)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this, spiderEvalRuleResultEnum.active);

    if (page.spiderResult != null && linkScoreMax > 0)
    {
        int targetScore = page.spiderResult.target.marks.score;

        decimal share = Convert.ToDecimal(targetScore) / Convert.ToDecimal(linkScoreMax);
        decimal weighted = scoreUnit * share;

        result.comment = share.ToString("P2");
        result.score = Convert.ToInt32(weighted);
    }

    return result;
}
/// <summary>
/// Indexes the page and the link under the URL where no entry exists yet, and adds the URL
/// to <c>urlRegistar</c> once both indexes contain it. Existing index entries are never overwritten.
/// </summary>
/// <param name="url">The URL used as key in both indexes.</param>
/// <param name="page">The page to index; ignored when <c>null</c> or when the URL is already in <c>pageIndex</c>.</param>
/// <param name="link">The link to index; ignored when <c>null</c> or when the URL is already in <c>linkIndex</c>.</param>
/// <returns><c>true</c> when both indexes contain the URL after the call; otherwise <c>false</c>.</returns>
public bool Add(string url, spiderPage page, spiderLink link)
{
    if (!pageIndex.ContainsKey(url) && page != null)
    {
        pageIndex.Add(url, page);
    }

    if (!linkIndex.ContainsKey(url) && link != null)
    {
        linkIndex.Add(url, link);
    }

    bool fullyIndexed = pageIndex.ContainsKey(url) && linkIndex.ContainsKey(url);
    if (!fullyIndexed)
    {
        return false;
    }

    urlRegistar.Add(url);
    return true;
}
/// <summary>
/// Evaluates the page by its number of cross links relative to <c>maxCrosslinkScore</c>.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>
/// The rule result. When <c>maxCrosslinkScore</c> is positive, it carries the scaled score and the ratio
/// formatted as "P2"; otherwise only the comment " [maxCLS=0] " is set.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderPage page)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    int crossLinkCount = page.relationship.crossLinks.Count();

    if (!(maxCrosslinkScore > 0))
    {
        // No positive maximum was learned, so the ratio cannot be formed.
        result.comment = " [maxCLS=0] ";
        return result;
    }

    decimal share = Convert.ToDecimal(crossLinkCount) / Convert.ToDecimal(maxCrosslinkScore);
    decimal weighted = scoreUnit * share;

    result.comment = share.ToString("P2");
    result.score = Convert.ToInt32(weighted);

    return result;
}
/// <summary>
/// Evaluates the page by how often its caption occurs in <c>pageTitleCount</c>:
/// a count of exactly 1 earns <c>scoreUnit</c>, any other count earns <c>penaltyUnit</c>.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>The rule result with the score set and the caption count recorded in the comment.</returns>
public override spiderEvalRuleResult evaluate(spiderPage page)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this, mode);

    result.comment = " freq:[" + pageTitleCount[page.webpage.pageCaption] + "] ";

    if (pageTitleCount[page.webpage.pageCaption] == 1)
    {
        result.score = scoreUnit;
    }
    else
    {
        result.score = penaltyUnit;
    }

    return result;
}
/// <summary>
/// Learns from the page before evaluation: adds the tokens of the page caption to <c>titleWords</c>.
/// </summary>
/// <param name="page">The page whose caption is tokenized.</param>
public override void learn(spiderPage page)
{
    var captionTokens = page.webpage.pageCaption.getStringTokensMinLength();
    titleWords.AddInstanceRange(captionTokens);
}
/// <summary>
/// Learns from the page before evaluation: raises <c>maxInboundLinks</c> to the page's
/// inflow link count when that count is larger.
/// </summary>
/// <param name="page">The page whose inflow links are counted.</param>
public override void learn(spiderPage page)
{
    var inboundCount = page.relationship.inflowLinks.Count;
    maxInboundLinks = Math.Max(inboundCount, maxInboundLinks);
}
/// <summary>
/// Tells whether <c>crossLinks</c> contains an entry keyed by the given page's signature.
/// </summary>
/// <param name="whom">The page to look up.</param>
/// <returns><c>true</c> when the page's signature is a key of <c>crossLinks</c>; otherwise <c>false</c>.</returns>
public bool HasCrossLinkWith(spiderPage whom)
{
    var signature = whom.getPageSignature(true);
    return crossLinks.ContainsKey(signature);
}
/// <summary>
/// Evaluates the specified page and produces this rule's result for it.
/// Implemented by each concrete page rule.
/// </summary>
/// <param name="page">The page to evaluate.</param>
/// <returns>The rule result for the page.</returns>
public abstract spiderEvalRuleResult evaluate(spiderPage page);
/// <summary>
/// Takes information from the page; called before evaluation.
/// Implemented by each concrete page rule.
/// </summary>
/// <param name="page">The page to learn from.</param>
public abstract void learn(spiderPage page);
/// <summary>
/// Learns from the page before evaluation: raises <c>maxCrosslinkScore</c> to the page's
/// cross link count when that count is larger.
/// </summary>
/// <param name="page">The page whose cross links are counted.</param>
public override void learn(spiderPage page)
{
    var crossLinkCount = page.relationship.crossLinks.Count();
    maxCrosslinkScore = Math.Max(crossLinkCount, maxCrosslinkScore);
}
/// <summary>
/// Learns from the page before evaluation: registers one more occurrence of the page caption
/// in <c>pageTitleCount</c>.
/// </summary>
/// <param name="page">The page whose caption is counted.</param>
public override void learn(spiderPage page)
{
    var caption = page.webpage.pageCaption;
    pageTitleCount.AddInstance(caption, "webpage.pageCaption @ learn() in pageruleTitleUnique");
}