/// <summary>
/// Writes a per-page report for a crawled target into a new sub-folder of <paramref name="fn"/>.
/// </summary>
/// <remarks>
/// The sub-folder name is built from the zero-padded page ordinal and the target's relevance flag.
/// The page text goes to <c>content.txt</c>, the URL of every outflow link (when the target has a page)
/// is appended to <c>links.txt</c>, and the evaluation, when present, is serialized to <c>relevance.xml</c>.
/// </remarks>
/// <param name="t">Target whose content, links and evaluation are reported.</param>
/// <param name="fn">Folder node that receives the page sub-folder.</param>
/// <param name="c">Page ordinal used in the folder name and in the folder caption.</param>
private void reportTarget(spiderTarget t, folderNode fn, int c)
{
    string folderName = "P" + c.ToString("D3") + "_" + t.IsRelevant.ToString();
    string folderCaption = "Page " + c.ToString();

    // addLine is invoked on the "." literal only; everything before it is concatenated in front of its result.
    string folderDescription = "Report on page " + t.url + " crawled by " + name + ". Target.IsRelevant: " + t.IsRelevant + ".".addLine(pageDescription);

    folderNode pageNode = fn.Add(folderName, folderCaption, folderDescription);

    fileunit contentFile = new fileunit(pageNode.pathFor("content.txt"), false);
    fileunit linksFile = new fileunit(pageNode.pathFor("links.txt"), false);

    if (t.evaluation != null)
    {
        t.evaluation.saveObjectToXML(pageNode.pathFor("relevance.xml"));
    }

    contentFile.setContent(t.pageText);

    if (t.page != null)
    {
        foreach (spiderLink outflow in t.page.relationship.outflowLinks.items.Values)
        {
            linksFile.Append(outflow.url);
        }
    }

    contentFile.Save();
    linksFile.Save();
}
/// <summary>
/// Chooses a layer for the link from the TF-IDF balance of its target's tokens:
/// tokens that <c>language</c> reports as known words add their TF-IDF, all others subtract it.
/// </summary>
/// <param name="link">Link to evaluate.</param>
/// <returns>
/// Result whose <c>weightScore</c> is the balance and whose <c>layer</c> is <c>layerID</c>
/// when the balance is greater than zero, otherwise <c>layer2ID</c>.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    double balance = 0;
    foreach (IWeightTableTerm token in target.tokens)
    {
        bool isKnown = language.isKnownWord(token.nominalForm);
        var tfIdf = target.tokens.GetTF_IDF(token);

        if (isKnown)
        {
            balance += tfIdf;
        }
        else
        {
            balance -= tfIdf;
        }
    }

    output.layer = (balance > 0) ? layerID : layer2ID;
    output.weightScore = balance;

    return output;
}
/// <summary>
/// Scores a link by how much its target's query differs from the aggregate link and page token documents.
/// </summary>
/// <remarks>
/// The query built for the target is matched against the aggregate link-token and page-token documents.
/// With no matches in either, the link receives the full <c>scoreUnit</c>. Otherwise the two semantic
/// similarities are scaled by <c>target_sd</c> and <c>page_sd</c>, summed, and the score becomes
/// <c>scoreUnit - sum * scoreUnit</c>. When <c>doAdjustScoreByLanguageDetection</c> is set, that score is
/// additionally multiplied by the squared <c>ratioA</c> of a language evaluation of the query terms.
/// </remarks>
/// <param name="link">Link to evaluate.</param>
/// <returns>Result carrying the computed score.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult result = new spiderEvalRuleResult(this);

    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);

    termQueryDocument query = target.getQuery(expansionSteps, wRecord.logBuilder);

    wRecord.logBuilder.AppendLine("Target [" + link.url + "] query => [" + query.GetAllTermString().toCsvInLine(",") + "]");

    weightTableMatchCollection<termSpark, termSpark> matchLinks = query.GetSparkMatchAgainst<termSpark>((termDocument)wRecord.context.targets.dlTargetLinkTokens.AggregateDocument);
    weightTableMatchCollection<termSpark, termSpark> matchPage = query.GetSparkMatchAgainst<termSpark>((termDocument)wRecord.context.targets.dlTargetPageTokens.AggregateDocument);

    if ((!matchLinks.Any()) && (!matchPage.Any()))
    {
        // Nothing in common with what was already seen: award the full unit.
        result.score = scoreUnit;
        wRecord.logBuilder.AppendLine("D[" + link.url + "][" + target.tokens.GetAllTermString().toCsvInLine(",") + "] = no matches with query");
        return result;
    }

    wRecord.logBuilder.AppendLine("matchLinks => " + matchLinks.ToString());
    wRecord.logBuilder.AppendLine("matchPage => " + matchPage.ToString());

    double pLSim = matchLinks.GetSemanticSimilarity() * target_sd;
    double pPSim = matchPage.GetSemanticSimilarity() * page_sd;

    double sim = (pLSim + pPSim);
    double sc = sim * (double)scoreUnit;
    double score = ((double)scoreUnit) - sc;

    if (doAdjustScoreByLanguageDetection)
    {
        // Modification of the diversity score: scale it by the language evaluation of the query terms.
        List<string> tkns = new List<string>();
        foreach (IWeightTableTerm spark in query)
        {
            tkns.Add(spark.nominalForm);
        }

        textEvaluation evaluation = new textEvaluation(wRecord.aJob.langTextEvaluator, null);
        evaluation.evaluateTokens(tkns, null, false);

        double evalAdj = Math.Pow(evaluation.ratioA, 2);
        result.score = Convert.ToInt32((double)score * evalAdj);

        wRecord.logBuilder.AppendLine();
        wRecord.logBuilder.AppendLine("Score is adjusted by language evaluation ratioA ^ 2: " + evalAdj);
    }
    else
    {
        // Fix: previously the computed score was dropped when no language adjustment was requested,
        // leaving result.score at its initial value.
        result.score = Convert.ToInt32(score);
    }

    wRecord.logBuilder.AppendLine("D[" + link.url + "][" + target.tokens.GetAllTermString().toCsvInLine(",") + "]=[pL:" + pLSim.ToString("P2") + "][pP:" + pPSim.ToString("P2") + "]=" + sim.ToString("#0.0000") + " (" + result.score + ")");

    return result;
}
/// <summary>
/// Scores the link as the value stored in <c>hits</c> for its target multiplied by <c>scoreUnit</c>.
/// </summary>
/// <param name="link">Link to evaluate.</param>
/// <returns>Result whose <c>score</c> is the product converted to <c>int</c> by an explicit cast.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);

    // Go through double first, as the original does, before narrowing to int.
    double weightedHits = hits[target] * scoreUnit;
    output.score = (int)weightedHits;

    return output;
}
/// <summary>
/// Copies the state of a crawled target into its page index entry and stores the entry.
/// </summary>
/// <remarks>
/// URL, token list and domain name are always written. Language statistics, relevancy and byte size
/// are written only for loaded targets; language statistics additionally require an evaluation whose
/// result language is not <c>unknown</c>.
/// </remarks>
/// <param name="target">Target whose data is deployed to the index.</param>
/// <param name="wRecord">Site record supplying the domain name.</param>
/// <param name="idomain">Domain index entry; looked up by domain name when <c>null</c>.</param>
/// <returns>The page index entry that was added or updated.</returns>
public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
{
    indexPage page = pageIndexTable.GetPageForUrl(target.url);

    if (idomain == null)
    {
        // The looked-up entry is not read again below; the lookup itself is kept as in the original flow.
        idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
    }

    page.url = target.url;
    page.tst = target.tokens.ToList().toCsvInLine(",");
    page.domain = wRecord.domainInfo.domainName;

    if (target.isLoaded)
    {
        var languageEvaluation = target.evaluation;
        if (languageEvaluation != null && languageEvaluation.result_language != basicLanguageEnum.unknown)
        {
            page.langTestRatio = languageEvaluation.result_ratio;
            page.singleMatchTokens = languageEvaluation.singleLanguageTokens.toCsvInLine(",");
            page.multiMatchTokens = languageEvaluation.multiLanguageTokens.toCsvInLine(",");
            page.wordCount = languageEvaluation.allContentTokens.Count();
            page.AllWords = languageEvaluation.allContentTokens.toCsvInLine();
            page.language = languageEvaluation.result_language.ToString();
        }

        // Relevant targets win; otherwise an unknown evaluated language means the relevancy is unknown too.
        page.relevancy = target.IsRelevant
            ? indexPageRelevancyEnum.isRelevant
            : (target.evaluatedLanguage == basicLanguageEnum.unknown
                ? indexPageRelevancyEnum.unknown
                : indexPageRelevancyEnum.notRelevant);

        page.byteSize = target.page.spiderResult.page.result.byteSize;
    }

    pageIndexTable.AddOrUpdate(page);

    return page;
}
/// <summary>
/// Creates event arguments for the given target and event type.
/// </summary>
/// <remarks>
/// When the target is a <c>spiderTarget</c>, the source code and HTML document of its web page result
/// are captured; <c>sourceXml</c> is set to the same source code only if the HTML document has a document node.
/// </remarks>
/// <param name="__target">Target the event refers to.</param>
/// <param name="__type">Kind of event being reported.</param>
public modelSpiderSiteRecordEventArgs(ISpiderTarget __target, modelSpiderSiteRecordEventType __type = modelSpiderSiteRecordEventType.DLCTargetPageAttached)
{
    Target = __target;

    spiderTarget concreteTarget = __target as spiderTarget;
    if (concreteTarget != null)
    {
        sourceHtml = concreteTarget.page.webpage.result.sourceCode;
        htmlDoc = concreteTarget.page.webpage.result.HtmlDocument;

        bool hasDocumentNode = (htmlDoc != null) && (htmlDoc.DocumentNode != null);
        if (hasDocumentNode)
        {
            sourceXml = concreteTarget.page.webpage.result.sourceCode;
        }
    }

    type = __type;
}
/// <summary>
/// Assigns the link to <c>layerID</c> when any token of its target is contained in <c>needles</c>,
/// and to <c>layer2ID</c> otherwise.
/// </summary>
/// <param name="link">The link.</param>
/// <returns>Result with only the layer decided by this rule.</returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    bool needleFound = false;
    foreach (IWeightTableTerm token in target.tokens)
    {
        if (needles.Contains(token.nominalForm))
        {
            // One hit is enough; stop scanning the remaining tokens.
            needleFound = true;
            break;
        }
    }

    output.layer = needleFound ? layerID : layer2ID;

    return output;
}
/// <summary>
/// Chooses a layer for the link from the TF-IDF balance of its target's tokens, using the lexicon cache:
/// tokens answered with a <c>cachedLexicon</c>, <c>lexicon</c> or <c>askingLexiconContext</c> response
/// add their TF-IDF, every other response type subtracts it.
/// </summary>
/// <param name="link">Link to evaluate.</param>
/// <returns>
/// Result whose <c>weightScore</c> is the balance and whose <c>layer</c> is <c>layerID</c>
/// when the balance is greater than zero, otherwise <c>layer2ID</c>.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    double balance = 0;
    foreach (IWeightTableTerm token in target.tokens)
    {
        var response = semanticLexiconManager.lexiconCache.getLexiconItems(token.nominalForm, wRecord.logBuilder);
        var responseKind = response.type;

        bool foundInLexicon =
            responseKind == lexiconResponse.responseType.cachedLexicon ||
            responseKind == lexiconResponse.responseType.lexicon ||
            responseKind == lexiconResponse.responseType.askingLexiconContext;

        var tfIdf = target.tokens.GetTF_IDF(token);
        if (foundInLexicon)
        {
            balance += tfIdf;
        }
        else
        {
            balance -= tfIdf;
        }
    }

    output.layer = (balance > 0) ? layerID : layer2ID;
    output.weightScore = balance;

    return output;
}
/// <summary>
/// Assigns the link to <c>layerID</c> when the content block it was found in has the role
/// <c>semanticRole</c>; in every other case the link goes to <c>layer2ID</c>.
/// </summary>
/// <remarks>
/// The link falls back to <c>layer2ID</c> when no target is registered as its origin, or when the
/// origin target has no content block at the link's XPath.
/// </remarks>
/// <param name="link">The link.</param>
/// <returns>Result with only the layer decided by this rule.</returns>
/// <exception cref="aceGeneralException">Wraps any exception raised during the evaluation.</exception>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    try
    {
        spiderEvalRuleResult result = new spiderEvalRuleResult(this);

        spiderTarget target = wRecord.context.targets.GetByOrigin(link);

        // The block lookup is only attempted when the origin target is known.
        nodeBlock bl = (target == null) ? null : target.contentBlocks.GetBlockByXPath(link.link.xPath);

        bool roleMatches = (bl != null) && (bl.role == semanticRole);
        result.layer = roleMatches ? layerID : layer2ID;

        return result;
    }
    catch (Exception ex)
    {
        throw new aceGeneralException(ex.Message, ex, this, "layerBlockRolePRule broken");
    }
    // The former trailing return was unreachable: the try block always returns and the catch always throws.
}
/// <summary>
/// Scores the link from the <c>ranks</c> table of its target.
/// </summary>
/// <param name="link">Link to evaluate.</param>
/// <returns>
/// Result whose <c>score</c> is <c>scoreUnit</c> while <c>ranks</c> is empty, the stored rank when the
/// target has one, and <c>penaltyUnit</c> when it has none.
/// </returns>
public override spiderEvalRuleResult evaluate(spiderLink link)
{
    spiderEvalRuleResult output = new spiderEvalRuleResult(this);

    spiderTarget target = wRecord.context.targets.GetOrCreateTarget(link, false, false);

    if (!ranks.Any())
    {
        // No ranking available yet: every link gets the plain unit.
        output.score = scoreUnit;
        return output;
    }

    if (!ranks.ContainsKey(target))
    {
        output.score = penaltyUnit;
        return output;
    }

    output.score = ranks[target];
    return output;
}
/// <summary>
/// Returns the term list for a page, either from a previously saved word-list file or by evaluating the page text.
/// </summary>
/// <remarks>
/// A saved word list is returned as-is when <c>doUseSavedPageWordlists</c> is on and the file exists.
/// Otherwise the transliterated page text is evaluated; terms are produced only when the detected language
/// is <c>serbian</c>, from the single-language tokens plus, unless <c>doUseOnlySingleMatch</c> is on, the
/// multi-language tokens. Tokens that are not clean words or are symbolic-only are discarded before encoding.
/// The resulting list (possibly empty) is written to the word-list file when <c>doSavePageWordlist</c> is on.
/// </remarks>
/// <param name="target">Target whose page text is analysed.</param>
/// <param name="idomain">Domain index entry; resolved from the target's site record when <c>null</c>.</param>
/// <param name="ipage">Page index entry; resolved from the MD5 hash of the target URL when <c>null</c>.</param>
/// <param name="evaluator">Language evaluator; taken from the target's site record when <c>null</c>.</param>
/// <param name="loger">Not read by this method.</param>
/// <returns>The list of terms for the page.</returns>
public List<string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
{
    if (idomain == null)
    {
        idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
    }

    if (ipage == null)
    {
        ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
    }

    FileInfo wordListFile = GetWordList_File(idomain, ipage);

    if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
    {
        return wordListFile.FullName.openFileToList(true);
    }

    string pageContent = target.pageText.transliterate();

    if (evaluator == null)
    {
        evaluator = target.parent.wRecord.tRecord.evaluator;
    }

    multiLanguageEvaluation evaluation = evaluator.evaluate(pageContent);

    List<string> terms = new List<string>();

    if (evaluation.result_language == basicLanguageEnum.serbian)
    {
        List<string> candidates = new List<string>(evaluation.singleLanguageTokens);

        if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
        {
            candidates.AddRange(evaluation.multiLanguageTokens);
        }

        // Single pass: drop anything that is not a clean word, then anything clean but symbolic-only.
        candidates.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

        terms.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
    }

    if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
    {
        terms.saveContentOnFilePath(wordListFile.FullName);
    }

    return terms;
}