/// <summary>
/// Scores the link by how many pages cross-linked from its origin page also link to the same target.
/// </summary>
/// <param name="link">The link to evaluate; its origin page must be set.</param>
/// <returns>A result whose score grows by <c>scoreUnit</c> for every matching cross-linked page.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    result.score = 0;

    // A link without an origin page cannot be evaluated by this rule.
    if (link.originPage == null)
    {
        throw new aceGeneralException("Link origin page not set!", null, link, "ruleActiveCrossLink->evaluate()");
    }

    foreach (var crossLink in link.originPage.relationship.crossLinks)
    {
        spiderPage linkedPage = wRecord.web.webPages[crossLink.Value.targetHash];
        bool sharesTarget = linkedPage.relationship.outflowLinks.ContainsAsTarget(link.targetHash);
        if (sharesTarget)
        {
            result.score += scoreUnit;
        }
    }

    return result;
}
/// <summary>
/// Scores the link in proportion to its level in the link hierarchy, relative to the <c>max - min</c> level range.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// An untouched result when the link is not in the hierarchy, is at level 0, or the range is not positive;
/// otherwise a result whose score is <c>penaltyUnit</c> scaled by level / range.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);

    linknodeElement hierarchyNode = wRecord.linkHierarchy.GetByOriginalPath(link.url);
    if (hierarchyNode == null)
    {
        wRecord.log("Link not found in the hierarchy");
        return result;
    }

    if (node_isRoot(hierarchyNode.level))
    {
        return result;
    }

    int levelSpan = max - min;
    if (levelSpan <= 0)
    {
        // No usable range: leave the score as constructed.
        return result;
    }

    double ratio = ((double)hierarchyNode.level) / levelSpan;
    result.score = Convert.ToInt32(penaltyUnit * ratio);
    return result;

    // Local helper: level 0 is left unscored by this rule.
    bool node_isRoot(int level)
    {
        return level == 0;
    }
}
/// <summary>
/// Evaluates the specified link by assigning it the constant <c>scoreUnit</c>.
/// </summary>
/// <param name="link">The link (not inspected by this rule).</param>
/// <returns>A result whose score equals <c>scoreUnit</c>.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    return new spiderEvalRuleResult(this)
    {
        score = scoreUnit
    };
}
/// <summary>
/// Registers the element in <c>Gt</c> under its URL when the element is a <c>spiderLink</c>.
/// </summary>
/// <param name="element">The element to learn from; anything that is not a <c>spiderLink</c> is ignored.</param>
public void learn(ISpiderElement element)
{
    spiderLink asLink = element as spiderLink;
    if (asLink == null)
    {
        return;
    }

    Gt.Add(asLink.url, asLink);
}
/// <summary>
/// Scores the link by the depth of its XPath relative to <c>max</c>: the score is
/// <c>scoreUnit * (1 - depth / max)</c>, converted to an integer.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// The scored result, or an untouched result when <c>max</c> is 0 and no ratio can be computed.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    // With max == 0 the ratio below is Infinity (or NaN for depth 0), and
    // Convert.ToInt32 throws OverflowException on such values. Return the
    // untouched result instead of failing.
    if (max == 0)
    {
        return output;
    }

    // The XPath separators are switched to backslashes before it is split into path parts.
    int depth = link.link.html.XPath.Replace("/", "\\").getPathParts().Count();

    double k = 1 - ((double)depth) / ((double)max);
    output.score = Convert.ToInt32(((double)scoreUnit) * k);

    return output;
}
/// <summary>
/// Takes information from the link before evaluation: widens the <c>min</c>/<c>max</c>
/// level bounds with the link's level in the link hierarchy.
/// </summary>
/// <param name="link">The link to learn from.</param>
public override void learn(spiderLink link)
{
    linknodeElement hierarchyNode = wRecord.linkHierarchy.GetByOriginalPath(link.url);
    if (hierarchyNode == null)
    {
        wRecord.log("Link not found in the hierarchy");
        return;
    }

    var level = hierarchyNode.level;
    min = Math.Min(level, min);
    max = Math.Max(level, max);
}
/// <summary>
/// Adds the specified link unless a link with the same target signature is already stored.
/// </summary>
/// <param name="link">The link to add.</param>
/// <returns><c>true</c> when the link was added; <c>false</c> when its signature was already known.</returns>
public bool Add(spiderLink link)
{
    string signature = link.getTargetSignature(useCription);

    bool isNew = !items.ContainsKey(signature);
    if (isNew)
    {
        items.Add(signature, link);
    }

    return isNew;
}
/// <summary>
/// Evaluates the specified link: awards <c>scoreUnit</c> when a token of its path-and-query,
/// or failing that of its caption, is contained in <c>languageNames</c>.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// The result; on a match its comment is "url_found" or "caption_found" and its score is <c>scoreUnit</c>.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);

    if (link.link.pathAndQuery.isNullOrEmpty())
    {
        return result;
    }

    // Remembers where the first match was found; null means no match so far.
    string matchSource = null;

    List<string> tokens = link.link.pathAndQuery.getStringTokens();
    foreach (string token in tokens)
    {
        if (languageNames.Contains(token.ToLower()))
        {
            matchSource = "url_found";
            break;
        }
    }

    // The caption is consulted only when the URL gave no match.
    if (matchSource == null && !link.link.caption.isNullOrEmpty())
    {
        tokens = link.link.caption.getStringTokens();
        foreach (string token in tokens)
        {
            if (languageNames.Contains(token.ToLower()))
            {
                matchSource = "caption_found";
                break;
            }
        }
    }

    if (matchSource != null)
    {
        result.comment = matchSource;
        result.score = scoreUnit;
    }

    return result;
}
/// <summary>
/// Evaluates the specified link: awards <c>scoreUnit</c> for a path not seen before and again for a
/// directory path not seen before, and adds <c>penaltyUnit</c> when either was already known.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>The scored result; new paths and directory paths are remembered for later calls.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    bool seenBefore = false;

    var path = link.link.path;
    if (path.isNullOrEmpty())
    {
        result.comment = "path null";
        return result;
    }

    if (!paths.Contains(path))
    {
        paths.Add(path);
        result.score += scoreUnit;
    }
    else
    {
        result.comment = "";
        seenBefore = true;
    }

    var folder = link.link.pathDirectoryPath;
    if (folder.isNullOrEmpty())
    {
        // NOTE(review): this early exit skips the penalty even when the path itself
        // was already known - confirm that this is intended.
        return result;
    }

    if (!folderPaths.Contains(folder))
    {
        folderPaths.Add(folder);
        result.score += scoreUnit;
    }
    else
    {
        seenBefore = true;
    }

    if (seenBefore)
    {
        result.score += penaltyUnit;
    }

    return result;
}
/// <summary>
/// Evaluates the specified link by its caption: a caption not seen before earns <c>scoreUnit</c>,
/// a repeated one gets <c>penaltyUnit</c>.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// The result; empty, numeric and symbolic-only captions are left unscored with an explanatory comment.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    string caption = link.link.caption;

    if (caption.isNullOrEmpty())
    {
        result.comment = "caption is empty";
        return result;
    }

    if (caption.isNumber())
    {
        result.comment = "caption is numeric";
        return result;
    }

    if (caption.isSymbolicContentOnly())
    {
        result.comment = "caption is symbolic content";
        return result;
    }

    if (!knownCaptions.Contains(caption))
    {
        // First occurrence: remember it and reward the link.
        knownCaptions.Add(caption);
        result.score = scoreUnit;
    }
    else
    {
        result.score = penaltyUnit;
    }

    return result;
}
/// <summary>
/// Assigns the link to a layer according to where its node sits relative to <c>tree.bestNode</c>.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// A result with <c>layerID</c> when the node is the best node or one of its items, <c>layer2ID</c> when it
/// is an item of one of those items, and <c>layer3ID</c> otherwise (including when <c>tree.Gd</c> or the
/// node is unavailable).
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);

    if (tree.Gd == null)
    {
        // Build on demand; give up if that still yields nothing.
        tree.buildGd();
        if (tree.Gd == null)
        {
            result.layer = layer3ID;
            return result;
        }
    }

    linknodeElement linkNode = tree.GetLinkNode(link.url);
    if (linkNode == null)
    {
        result.layer = layer3ID;
        return result;
    }

    if (tree.bestNode == linkNode || tree.bestNode.items.Values.Contains(linkNode))
    {
        result.layer = layerID;
    }
    else if (tree.bestNode.items.Values.Any(child => child.items.Values.Contains(linkNode)))
    {
        result.layer = layer2ID;
    }
    else
    {
        result.layer = layer3ID;
    }

    return result;
}
/// <summary>
/// Creates the seed link for the given root URL and stores it as <c>seedLink</c>; also sets
/// <c>name</c> to the URL and <c>domain</c> to the URL's host.
/// </summary>
/// <param name="rootUrl">The root URL; it is parsed with <see cref="Uri"/>.</param>
/// <returns>The newly created seed link.</returns>
public spiderLink setSeedUrl(string rootUrl)
{
    link seedTarget = new link(rootUrl, linkProcessFlags.standard);
    Uri rootUri = new Uri(rootUrl);

    // The seed link hangs off a placeholder origin page.
    crawledPage originCrawl = new crawledPage(ORIGIN_OF_ROOTURL, 0);
    spiderPage originPage = new spiderPage(originCrawl, 0, 0);

    spiderLink seed = new spiderLink(originPage, seedTarget, 1);

    seedLink = seed;
    name = rootUrl;

    seed.domain = rootUri.Host;
    domain = rootUri.Host;
    seed.link.domain = domain;

    return seed;
}
/// <summary>
/// Evaluates the specified link: awards <c>scoreUnit</c> when any token of its path-and-query is a
/// word known to <c>language</c>, and <c>penaltyUnit</c> when none is.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>
/// The scored result; when there is no path-and-query or it yields no tokens, the result is left
/// unscored with an explanatory comment.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);

    if (link.link.pathAndQuery.isNullOrEmpty())
    {
        result.comment = "no pathAndQuery";
        return result;
    }

    List<string> tokens = link.link.pathAndQuery.getStringTokensMinLength();
    if (!tokens.Any())
    {
        result.comment = "no words in url";
        return result;
    }

    // Stops at the first known word, like an explicit loop with break.
    bool hasKnownWord = tokens.Any(token => language.isKnownWord(token));

    result.score = hasKnownWord ? scoreUnit : penaltyUnit;
    return result;
}
/// <summary>
/// Evaluates the specified link: places it in <c>layerID</c> when any token of its target has a
/// nominal form contained in <c>needles</c>, and in <c>layer2ID</c> otherwise.
/// </summary>
/// <param name="link">The link to evaluate.</param>
/// <returns>A result carrying the chosen layer.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderTarget linkTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);

    spiderEvalRuleResult result = new spiderEvalRuleResult(this);
    result.layer = layer2ID;

    foreach (IWeightTableTerm token in linkTarget.tokens)
    {
        if (!needles.Contains(token.nominalForm))
        {
            continue;
        }

        // The first matching term is enough.
        result.layer = layerID;
        break;
    }

    return result;
}