Пример #1
0
        /// <summary>
        /// Writes a report sub-folder for one crawled target: the page text, the URLs of its
        /// outgoing links and, when an evaluation is attached, the evaluation serialized to XML.
        /// </summary>
        /// <param name="t">Target being reported.</param>
        /// <param name="fn">Folder node under which the page sub-folder is added.</param>
        /// <param name="c">Page ordinal; used zero-padded (three digits) in the folder name and as-is in the caption.</param>
        private void reportTarget(spiderTarget t, folderNode fn, int c)
        {
            string     pageFolder = "P" + c.ToString("D3") + "_" + t.IsRelevant.ToString();
            folderNode pfn        = fn.Add(pageFolder, "Page " + c.ToString(), "Report on page " + t.url + " crawled by " + name + ". Target.IsRelevant: " + t.IsRelevant + ".".addLine(pageDescription));

            fileunit content = new fileunit(pfn.pathFor("content.txt"), false);
            fileunit links   = new fileunit(pfn.pathFor("links.txt"), false);

            if (t.evaluation != null)
            {
                t.evaluation.saveObjectToXML(pfn.pathFor("relevance.xml"));
            }

            content.setContent(t.pageText);

            // A target without an attached page has no outgoing links; links.txt is still saved below.
            if (t.page != null)
            {
                foreach (spiderLink ln in t.page.relationship.outflowLinks.items.Values)
                {
                    links.Append(ln.url);
                }
            }

            content.Save();
            links.Save();
        }
Пример #2
0
        /// <summary>
        /// Chooses a layer for the link from the TF-IDF balance of its target's tokens:
        /// tokens accepted by <c>language.isKnownWord</c> add their TF-IDF, all others subtract it.
        /// </summary>
        /// <param name="link">Link whose target tokens are weighed.</param>
        /// <returns>
        /// Result whose <c>weightScore</c> is the balance and whose <c>layer</c> is <c>layerID</c>
        /// for a positive balance, <c>layer2ID</c> otherwise.
        /// </returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderTarget         linkTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);
            spiderEvalRuleResult result     = new spiderEvalRuleResult(this);

            double balance = 0;

            foreach (IWeightTableTerm token in linkTarget.tokens)
            {
                bool   isKnown = language.isKnownWord(token.nominalForm);
                double tfIdf   = linkTarget.tokens.GetTF_IDF(token);

                balance = isKnown ? balance + tfIdf : balance - tfIdf;
            }

            if (balance > 0)
            {
                result.layer = layerID;
            }
            else
            {
                result.layer = layer2ID;
            }

            result.weightScore = balance;
            return result;
        }
Пример #3
0
        /// <summary>
        /// Scores a link by how little its target's query overlaps with the aggregate link-token and
        /// page-token documents of the crawl context: the more similar, the lower the score.
        /// </summary>
        /// <remarks>
        /// The base score is <c>scoreUnit - (similarity * scoreUnit)</c>, where similarity is the sum of
        /// the link-match similarity weighted by <c>target_sd</c> and the page-match similarity weighted
        /// by <c>page_sd</c>. When <c>doAdjustScoreByLanguageDetection</c> is set, the base score is
        /// multiplied by the squared <c>ratioA</c> of a language evaluation over the query terms.
        /// </remarks>
        /// <param name="link">Link to score.</param>
        /// <returns>
        /// Result with <c>score</c> set to <c>scoreUnit</c> when neither aggregate document matches the
        /// query, otherwise to the (optionally language-adjusted) score described above.
        /// </returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderEvalRuleResult result = new spiderEvalRuleResult(this);
            spiderTarget         target = wRecord.context.targets.GetOrCreateTarget(link, false, false);

            termQueryDocument query = target.getQuery(expansionSteps, wRecord.logBuilder);

            wRecord.logBuilder.AppendLine("Target [" + link.url + "] query => [" + query.GetAllTermString().toCsvInLine(",") + "]");

            weightTableMatchCollection<termSpark, termSpark> matchLinks = query.GetSparkMatchAgainst<termSpark>((termDocument)wRecord.context.targets.dlTargetLinkTokens.AggregateDocument);
            weightTableMatchCollection<termSpark, termSpark> matchPage  = query.GetSparkMatchAgainst<termSpark>((termDocument)wRecord.context.targets.dlTargetPageTokens.AggregateDocument);

            // Nothing in common with what was already collected: award the full unit and stop.
            if ((!matchLinks.Any()) && (!matchPage.Any()))
            {
                result.score = scoreUnit;
                wRecord.logBuilder.AppendLine("D[" + link.url + "][" + target.tokens.GetAllTermString().toCsvInLine(",") + "] = no matches with query");
                return(result);
            }

            wRecord.logBuilder.AppendLine("matchLinks => " + matchLinks.ToString());
            wRecord.logBuilder.AppendLine("matchPage => " + matchPage.ToString());

            double pLSim = matchLinks.GetSemanticSimilarity() * target_sd;
            double pPSim = matchPage.GetSemanticSimilarity() * page_sd;

            double sim = (pLSim + pPSim);

            double sc = sim * (double)scoreUnit;

            double score = ((double)scoreUnit) - sc;

            if (doAdjustScoreByLanguageDetection)
            {
                // Scale the diversity score by the language evaluation of the query terms.
                List<string> tkns = new List<string>();
                foreach (IWeightTableTerm spark in query)
                {
                    tkns.Add(spark.nominalForm);
                }
                textEvaluation evaluation = new textEvaluation(wRecord.aJob.langTextEvaluator, null);
                evaluation.evaluateTokens(tkns, null, false);

                double evalAdj = Math.Pow(evaluation.ratioA, 2);
                result.score = Convert.ToInt32((double)score * evalAdj);
                wRecord.logBuilder.AppendLine();
                wRecord.logBuilder.AppendLine("Score is adjusted by language evaluation ratioA ^ 2: " + evalAdj);
            }
            else
            {
                // Previously the computed score was discarded on this path and result.score was
                // never assigned; publish the unadjusted score instead.
                result.score = Convert.ToInt32(score);
            }

            wRecord.logBuilder.AppendLine("D[" + link.url + "][" + target.tokens.GetAllTermString().toCsvInLine(",") + "]=[pL:" + pLSim.ToString("P2") + "][pP:" + pPSim.ToString("P2") + "]=" + sim.ToString("#0.0000") + " (" + result.score + ")");

            return(result);
        }
Пример #4
0
        /// <summary>
        /// Scores the link as the <c>hits</c> entry of its target multiplied by <c>scoreUnit</c>.
        /// </summary>
        /// <param name="link">Link to score.</param>
        /// <returns>Result whose <c>score</c> is the product, truncated to an integer.</returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderEvalRuleResult result    = new spiderEvalRuleResult(this);
            spiderTarget         hitTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);

            // Kept as a double intermediate so the final cast truncates exactly as before.
            double weightedHits = hits[hitTarget] * scoreUnit;
            result.score = (int)weightedHits;

            return result;
        }
Пример #5
0
        /// <summary>
        /// Copies the state of a crawled target into its page-index entry and stores the entry.
        /// </summary>
        /// <param name="target">Target supplying URL, tokens, evaluation and page size.</param>
        /// <param name="wRecord">Site record supplying the domain name.</param>
        /// <param name="idomain">
        /// Domain index entry; when <c>null</c> it is looked up via <c>domainIndexTable.GetDomain</c>.
        /// The value is not otherwise read by this method.
        /// </param>
        /// <returns>The page-index entry after it was passed to <c>pageIndexTable.AddOrUpdate</c>.</returns>
        public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
        {
            indexPage entry = pageIndexTable.GetPageForUrl(target.url);

            if (idomain == null)
            {
                idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
            }

            entry.url    = target.url;
            entry.tst    = target.tokens.ToList().toCsvInLine(",");
            entry.domain = wRecord.domainInfo.domainName;

            if (target.isLoaded)
            {
                // Language statistics are copied only when an evaluation exists and detected a language.
                if (target.evaluation != null && target.evaluation.result_language != basicLanguageEnum.unknown)
                {
                    var evaluation = target.evaluation;

                    entry.langTestRatio     = evaluation.result_ratio;
                    entry.singleMatchTokens = evaluation.singleLanguageTokens.toCsvInLine(",");
                    entry.multiMatchTokens  = evaluation.multiLanguageTokens.toCsvInLine(",");
                    entry.wordCount         = evaluation.allContentTokens.Count();
                    entry.AllWords          = evaluation.allContentTokens.toCsvInLine();
                    entry.language          = evaluation.result_language.ToString();
                }

                entry.relevancy = target.IsRelevant
                    ? indexPageRelevancyEnum.isRelevant
                    : (target.evaluatedLanguage == basicLanguageEnum.unknown
                        ? indexPageRelevancyEnum.unknown
                        : indexPageRelevancyEnum.notRelevant);

                entry.byteSize = target.page.spiderResult.page.result.byteSize;
            }

            pageIndexTable.AddOrUpdate(entry);
            return entry;
        }
 /// <summary>
 /// Creates event arguments for a target. When the target is a <c>spiderTarget</c>, the HTML source
 /// and parsed document of its web page result are captured as well.
 /// </summary>
 /// <param name="__target">Target the event refers to.</param>
 /// <param name="__type">Kind of event being raised.</param>
 public modelSpiderSiteRecordEventArgs(ISpiderTarget __target, modelSpiderSiteRecordEventType __type = modelSpiderSiteRecordEventType.DLCTargetPageAttached)
 {
     Target = __target;

     spiderTarget concrete = __target as spiderTarget;
     if (concrete != null)
     {
         var pageResult = concrete.page.webpage.result;

         sourceHtml = pageResult.sourceCode;
         htmlDoc    = pageResult.HtmlDocument;

         // sourceXml is filled (with the same source code) only when a parsed document with a root node exists.
         if (htmlDoc != null && htmlDoc.DocumentNode != null)
         {
             sourceXml = pageResult.sourceCode;
         }
     }

     type = __type;
 }
Пример #7
0
        /// <summary>
        /// Assigns the link to <c>layerID</c> when any token of its target appears in <c>needles</c>,
        /// and to <c>layer2ID</c> otherwise.
        /// </summary>
        /// <param name="link">The link to evaluate.</param>
        /// <returns>Result carrying only the selected layer.</returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderTarget         linkTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);
            spiderEvalRuleResult result     = new spiderEvalRuleResult(this);

            // Default layer; overridden by the first matching token.
            result.layer = layer2ID;

            foreach (IWeightTableTerm token in linkTarget.tokens)
            {
                if (!needles.Contains(token.nominalForm))
                {
                    continue;
                }

                result.layer = layerID;
                break;
            }

            return result;
        }
Пример #8
0
        /// <summary>
        /// Chooses a layer for the link from the TF-IDF balance of its target's tokens, using the
        /// lexicon cache response type of each token to decide whether it adds or subtracts weight.
        /// </summary>
        /// <param name="link">Link whose target tokens are weighed.</param>
        /// <returns>
        /// Result whose <c>weightScore</c> is the balance and whose <c>layer</c> is <c>layerID</c>
        /// for a positive balance, <c>layer2ID</c> otherwise.
        /// </returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderTarget         linkTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);
            spiderEvalRuleResult result     = new spiderEvalRuleResult(this);

            double balance = 0;

            foreach (IWeightTableTerm token in linkTarget.tokens)
            {
                var response = semanticLexiconManager.lexiconCache.getLexiconItems(token.nominalForm, wRecord.logBuilder);

                lexiconResponse.responseType kind = response.type;
                bool countsInFavour = kind == lexiconResponse.responseType.cachedLexicon
                                      || kind == lexiconResponse.responseType.lexicon
                                      || kind == lexiconResponse.responseType.askingLexiconContext;

                double tfIdf = linkTarget.tokens.GetTF_IDF(token);
                balance = countsInFavour ? balance + tfIdf : balance - tfIdf;
            }

            if (balance > 0)
            {
                result.layer = layerID;
            }
            else
            {
                result.layer = layer2ID;
            }

            result.weightScore = balance;
            return result;
        }
Пример #9
0
        /// <summary>
        /// Assigns the link to <c>layerID</c> when the content block it was found in has the role
        /// <c>semanticRole</c>; in every other case the link goes to <c>layer2ID</c>.
        /// </summary>
        /// <param name="link">The link to evaluate; its XPath is used to locate the containing block.</param>
        /// <returns>
        /// Result carrying the selected layer. <c>layer2ID</c> is used when no target is found for the
        /// link's origin, when no block is found for the XPath, or when the block has a different role.
        /// </returns>
        /// <exception cref="aceGeneralException">Wraps any exception raised during evaluation.</exception>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            try
            {
                spiderEvalRuleResult result = new spiderEvalRuleResult(this);
                spiderTarget         target = wRecord.context.targets.GetByOrigin(link);

                // The block is looked up only when the originating target is known.
                nodeBlock bl = null;
                if (target != null)
                {
                    bl = target.contentBlocks.GetBlockByXPath(link.link.xPath);
                }

                if (bl != null && bl.role == semanticRole)
                {
                    result.layer = layerID;
                }
                else
                {
                    result.layer = layer2ID;
                }

                return(result);
            }
            catch (Exception ex)
            {
                throw new aceGeneralException(ex.Message, ex, this, "layerBlockRolePRule broken");
            }
        }
Пример #10
0
        /// <summary>
        /// Scores the link from the <c>ranks</c> table entry of its target.
        /// </summary>
        /// <param name="link">Link to score.</param>
        /// <returns>
        /// Result whose <c>score</c> is <c>scoreUnit</c> when <c>ranks</c> is empty, the target's entry
        /// when one exists, and <c>penaltyUnit</c> when the target has no entry.
        /// </returns>
        public override spiderEvalRuleResult evaluate(spiderLink link)
        {
            spiderEvalRuleResult result       = new spiderEvalRuleResult(this);
            spiderTarget         rankedTarget = wRecord.context.targets.GetOrCreateTarget(link, false, false);

            if (!ranks.Any())
            {
                result.score = scoreUnit;
                return result;
            }

            if (!ranks.ContainsKey(rankedTarget))
            {
                result.score = penaltyUnit;
                return result;
            }

            result.score = ranks[rankedTarget];
            return result;
        }
Пример #11
0
        /// <summary>
        /// Returns the term list for a page, either from a previously saved word-list file or by
        /// evaluating the transliterated page text.
        /// </summary>
        /// <remarks>
        /// A saved list is returned as-is when <c>TFIDF.doUseSavedPageWordlists</c> is set and the file
        /// exists. Otherwise terms are collected only when the evaluation reports
        /// <c>basicLanguageEnum.serbian</c>; for any other language the returned list is empty. The
        /// result (including an empty one) is written to the word-list file when
        /// <c>TFIDF.doSavePageWordlist</c> is set.
        /// </remarks>
        /// <param name="target">Target whose page text is processed.</param>
        /// <param name="idomain">Domain index entry; resolved from the target's site record when <c>null</c>.</param>
        /// <param name="ipage">Page index entry; resolved from the MD5 hash of the target URL when <c>null</c>.</param>
        /// <param name="evaluator">Language evaluator; taken from the target's test record when <c>null</c>.</param>
        /// <param name="loger">Not used by this method.</param>
        /// <returns>The terms of the page; never <c>null</c> on the freshly computed path.</returns>
        public List <string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
        {
            if (idomain == null)
            {
                idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
            }
            if (ipage == null)
            {
                ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
            }

            FileInfo file = GetWordList_File(idomain, ipage);

            // Cache hit: reuse the saved word list without touching the page text.
            if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && file.Exists)
            {
                return(file.FullName.openFileToList(true));
            }

            List <string> output = new List <string>();

            string cont = target.pageText.transliterate();

            if (evaluator == null)
            {
                evaluator = target.parent.wRecord.tRecord.evaluator;
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(cont);

            if (evaluation.result_language == basicLanguageEnum.serbian)
            {
                List <string> pt = new List <string>();

                pt.AddRange(evaluation.singleLanguageTokens);

                // Tokens matched by several languages are included unless single-match-only is configured.
                if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
                {
                    pt.AddRange(evaluation.multiLanguageTokens);
                }

                pt.RemoveAll(x => !x.isCleanWord());
                pt.RemoveAll(x => x.isSymbolicContentOnly());

                output.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(pt));
            }

            if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
            {
                output.saveContentOnFilePath(file.FullName);
            }

            return(output);
        }