Exemplo n.º 1
0
 /// <summary>
 /// Initializes the module's name, short code, description and parent evaluator.
 /// </summary>
 /// <param name="__name">Module name; its first character, upper-cased, becomes the module <c>code</c>.</param>
 /// <param name="__desc">Module description.</param>
 /// <param name="__parent">Evaluator stored as the parent of this module.</param>
 protected spiderModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent)
 {
     name        = __name;
     description = __desc;
     _parent     = __parent;

     // Indexing name[0] throws on a null or empty name, so fall back to an empty code in that case.
     // ToUpperInvariant keeps the code independent of the current thread culture.
     code = string.IsNullOrEmpty(name) ? string.Empty : name.Substring(0, 1).ToUpperInvariant();
 }
Exemplo n.º 2
0
 /// <summary>
 /// Relays the DLC-initiated event to every plug-in found in <c>allPlugins</c>.
 /// </summary>
 /// <param name="__spider">Spider evaluator supplied by the caller; it is not forwarded to the plug-ins.</param>
 /// <param name="__task">Domain task forwarded to each plug-in.</param>
 /// <param name="__wRecord">Site record forwarded to each plug-in.</param>
 public void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
 {
     // NOTE(review): plug-ins receive null in place of __spider - confirm this is intentional.
     foreach (IPlugInCommonBase<crawlReportingStageEnum, directReporterBase> plugin in allPlugins)
     {
         plugin.eventDLCInitiated(null, __task, __wRecord);
     }
 }
Exemplo n.º 3
0
 /// <summary>
 /// Creates the language module for the given pair of languages.
 /// </summary>
 /// <param name="__parent">Evaluator passed on to the base module constructor.</param>
 /// <param name="__langA">Language stored in <c>languageA</c>.</param>
 /// <param name="__langB">Language stored in <c>languageB</c>.</param>
 public languageModule(ISpiderEvaluatorBase __parent, basicLanguage __langA, basicLanguage __langB)
     : base("Language Module", "The Targets are distributed into layers by the Passive rules and Active rules testing the tokens of the Target.", __parent)
 {
     // setup() is intentionally not invoked from the constructor (the call was disabled in the original).
     languageB = __langB;
     languageA = __langA;
 }
Exemplo n.º 4
0
 /// <summary>
 /// Stores the rule's identity (name, description), its parent evaluator and the two layer IDs.
 /// </summary>
 /// <param name="__name">Rule name.</param>
 /// <param name="__description">Rule description.</param>
 /// <param name="__layerID">Value assigned to <c>layerID</c>.</param>
 /// <param name="__parent">Evaluator stored as the rule's parent.</param>
 /// <param name="__layer2ID">Value assigned to <c>layer2ID</c>; defaults to 0.</param>
 public layerDistributionRuleBase(string __name, string __description, int __layerID, ISpiderEvaluatorBase __parent, int __layer2ID = 0) : base()
 {
     // Identity
     name        = __name;
     description = __description;

     // Layer assignment
     layerID  = __layerID;
     layer2ID = __layer2ID;

     // Owner
     parent = __parent;
 }
 /// <summary>
 /// Stores the link rule's name, description, score/penalty units and parent evaluator.
 /// </summary>
 /// <param name="__name">Rule name.</param>
 /// <param name="__description">Rule description.</param>
 /// <param name="__scoreUnit">Value assigned to <c>scoreUnit</c>.</param>
 /// <param name="__penaltyUnit">Value assigned to <c>penaltyUnit</c>.</param>
 /// <param name="__parent">Evaluator stored as the rule's parent.</param>
 public spiderEvalRuleForLinkBase(string __name, string __description, int __scoreUnit, int __penaltyUnit, ISpiderEvaluatorBase __parent)
 {
     // Identity
     name        = __name;
     description = __description;

     // Scoring units
     scoreUnit   = __scoreUnit;
     penaltyUnit = __penaltyUnit;

     // Owner
     parent = __parent;
 }
Exemplo n.º 6
0
 /// <summary>
 /// Creates the language TF-IDF rule and fills the name/description templates inherited from the base constructor.
 /// </summary>
 /// <param name="__language">Language stored in <c>language</c>; its English name is inserted into the name and description.</param>
 /// <param name="__layerID">Layer ID passed to the base constructor.</param>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 /// <param name="__layerID2">Second layer ID passed to the base constructor; defaults to -1.</param>
 public layerLanguageTFIDF_ARule(basicLanguage __language, int __layerID, ISpiderEvaluatorBase __parent, int __layerID2 = -1)
     : base("Language TF-IDF Test ({0})", "Tests Target tokens against the specified language [{0}], sets layerID [{1}] and calculates layer weight score as sum of matched Target token TF-IDFs minus sum of unmatched."
            + "If resulting weight score is more than 0 the layerID [{1}] is assigned, if it's less than 0 then the layer2ID [{2}] is assigned", __layerID, __parent, __layerID2)
 {
     language = __language;

     // The base constructor stored format templates; resolve their placeholders now.
     var englishName = language.languageEnglishName;

     name        = string.Format(name, englishName);
     description = string.Format(description, englishName, layerID, layer2ID);
 }
Exemplo n.º 7
0
        /// <summary>
        /// Applies the data returned by <c>GetData</c> for the given engine settings and spider to a string template.
        /// </summary>
        /// <param name="spider">Spider evaluator passed to <c>GetData</c>.</param>
        /// <param name="crawlerJobEngineSettings">Engine settings passed to <c>GetData</c>.</param>
        /// <param name="templateString">Template text used to construct the <c>stringTemplate</c>.</param>
        /// <returns>The result of applying the template to the collected data.</returns>
        public static String GetCrawlFolderName(ISpiderEvaluatorBase spider, crawlerDomainTaskMachineSettings crawlerJobEngineSettings, String templateString)
        {
            var folderNameTemplate = new stringTemplate(templateString);

            return folderNameTemplate.applyToContent(GetData(crawlerJobEngineSettings, spider));
        }
Exemplo n.º 8
0
        /// <summary>
        /// Applies the data returned by <c>GetData</c> for the given crawl job state and spider to a string template.
        /// </summary>
        /// <param name="spider">Spider evaluator passed to <c>GetData</c>.</param>
        /// <param name="state">Crawl job context passed to <c>GetData</c>.</param>
        /// <param name="templateString">Template text used to construct the <c>stringTemplate</c>.</param>
        /// <returns>The result of applying the template to the collected data.</returns>
        public static String GetCrawlFolderName(ISpiderEvaluatorBase spider, ICrawlJobContext state, String templateString)
        {
            var folderNameTemplate = new stringTemplate(templateString);

            return folderNameTemplate.applyToContent(GetData(state, spider));
        }
 /// <summary>
 /// Stores the page rule's name, description, score/penalty units and parent evaluator, and sets the rule mode to active.
 /// </summary>
 /// <param name="__name">Rule name.</param>
 /// <param name="__description">Rule description.</param>
 /// <param name="__scoreUnit">Value assigned to <c>scoreUnit</c>.</param>
 /// <param name="__penaltyUnit">Value assigned to <c>penaltyUnit</c>.</param>
 /// <param name="__parent">Evaluator stored as the rule's parent.</param>
 public spiderEvalRuleForPageBase(string __name, string __description, int __scoreUnit, int __penaltyUnit, ISpiderEvaluatorBase __parent)
 {
     // Identity
     name        = __name;
     description = __description;

     // Scoring units
     penaltyUnit = __penaltyUnit;
     scoreUnit   = __scoreUnit;

     // Owner and result mode
     parent = __parent;
     mode   = spiderEvalRuleResultEnum.active;
 }
Exemplo n.º 10
0
 /// <summary>
 /// Creates the PageRank rule, storing the algorithm parameters and inserting the score unit into the description.
 /// </summary>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 /// <param name="__alpha">Value stored in <c>alpha</c>; defaults to 0.85.</param>
 /// <param name="__convergence">Value stored in <c>convergence</c>; defaults to 0.0001.</param>
 /// <param name="__checkSteps">Value stored in <c>checkSteps</c>; defaults to 10.</param>
 public rankPageRank(ISpiderEvaluatorBase __parent, double __alpha = 0.85, double __convergence = 0.0001, int __checkSteps = 10)
     : base("PageRankRule", "Integrates PageRank algorithm included from C# open source project [https://github.com/jeffersonhwang/pagerank]. Dumping value (d) is set to:" + __alpha.ToString() + ", convergence (c): "
            + __convergence + ", check steps: " + __checkSteps + ". Targets are ranked by associated page rank multiplied to integer factor: {0}. When no pages were loaded it assigns maximum score."
            , 1000, 10, __parent)
 {
     // Algorithm parameters
     alpha       = __alpha;
     convergence = __convergence;
     checkSteps  = __checkSteps;

     // The description handed to the base constructor still carries the {0} placeholder for the score unit.
     description = string.Format(description, scoreUnit);
 }
Exemplo n.º 11
0
        /// <summary>
        /// Creates the diversity module and stores its two diversity factors and the term expansion step count.
        /// </summary>
        /// <param name="ttd">Value stored in <c>tt_diversityFactor</c>.</param>
        /// <param name="ptd">Value stored in <c>pt_diversityFactor</c>.</param>
        /// <param name="__parent">Evaluator passed to the base module constructor.</param>
        /// <param name="expansionSteps">Value stored in <c>termExpansionSteps</c>.</param>
        public diversityModule(double ttd, double ptd, ISpiderEvaluatorBase __parent, int expansionSteps) :
            base("Diversity Module", "Use inversed semantic similarity between a target and Target link url/title corpus and Target page content corpus", __parent)
        {
            // setup() is intentionally not invoked from the constructor (the call was disabled in the original).
            termExpansionSteps = expansionSteps;

            pt_diversityFactor = ptd;
            tt_diversityFactor = ttd;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Creates the template module; in alternative mode a note is appended to the description.
        /// </summary>
        /// <param name="isRankingAlternative">Value stored in <c>alternative</c>.</param>
        /// <param name="__parent">Evaluator passed to the base module constructor.</param>
        public templateModule(bool isRankingAlternative, ISpiderEvaluatorBase __parent)
            : base("Template Module", "The Targets are distributed into layers according to the semantic role of the host content block. ", __parent)
        {
            alternative = isRankingAlternative;

            // setup() is intentionally not invoked from the constructor (the call was disabled in the original).
            if (!alternative)
            {
                return;
            }

            description += "This is alternative implementation with XPath depth based ranking";
        }
Exemplo n.º 13
0
        /// <summary>
        /// Creates the structure module; in alternative mode a note is appended to the description.
        /// </summary>
        /// <param name="isRankingAlternative">Value stored in <c>alternative</c>.</param>
        /// <param name="__parent">Evaluator passed to the base module constructor.</param>
        public structureModule(bool isRankingAlternative, ISpiderEvaluatorBase __parent)
            : base("Structure Module", "The Targets are distributed into layers according to the URL-structure graph tree", __parent)
        {
            alternative = isRankingAlternative;

            // setup() is intentionally not invoked from the constructor (the call was disabled in the original).
            if (!alternative)
            {
                return;
            }

            description += "This is alternative implementation with URL structure graph based ranking";
        }
Exemplo n.º 14
0
        /// <summary>
        /// Logs the stage header (name, description, codename, iteration limits) to the site record's log,
        /// then prepares every objective of the stage and logs one line per objective.
        /// </summary>
        /// <param name="wRecord">Site record whose <c>logBuilder</c> receives the log lines.</param>
        /// <param name="sEvaluator">Not used by this method.</param>
        public void EnterStage(modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase sEvaluator)
        {
            var stageLog = wRecord.logBuilder;

            // Stage header
            stageLog.log("-- entering stage [" + name + "] with " + objectives.Count() + " objectives.");
            stageLog.log("> " + description + " (codename:" + codename + ")");
            stageLog.log("> stage iteration limit: " + stageIterationLimit + " (global limit:" + GLOBAL_stageIterationLimit + ")");

            // Objectives are prepared before they are listed in the log.
            foreach (spiderObjective current in objectives)
            {
                current.prepare();
                stageLog.log("> Objective [" + current.name + "] t:" + current.supervisor + " ");
            }
        }
Exemplo n.º 15
0
        /// <summary>
        /// Creates the HITS ranking rule: stores the algorithm parameters, creates the <c>HITSRank</c> instance
        /// and inserts the score unit into the description.
        /// </summary>
        /// <param name="__parent">Evaluator passed to the base constructor.</param>
        /// <param name="__convergence">Value stored in <c>convergence</c>; defaults to 0.0001.</param>
        /// <param name="__checkSteps">Value stored in <c>checkSteps</c>; defaults to 10.</param>
        public rankHITS(ISpiderEvaluatorBase __parent, double __convergence = 0.0001, int __checkSteps = 10)
            : base("rankTargetsHITS", "Adaptation of HITS algorithm for inner crawl frontier ranking. Convergence (c): "
                   + __convergence + ", iteration steps: " + __checkSteps + ". Targets are ranked by associated Authority + Hub score multiplied by integer factor: {0}."
                   , 1000, 0, __parent)
        {
            // Algorithm parameters
            checkSteps  = __checkSteps;
            convergence = __convergence;

            hits = new HITSRank();

            // The description handed to the base constructor still carries the {0} placeholder for the score unit.
            description = string.Format(description, scoreUnit);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Creates the token-diversity link rule, stores its parameters, marks the rule as active and
        /// switches on <c>doEnableDLC_TFIDF</c> in the parent evaluator's settings.
        /// </summary>
        /// <param name="__parent">Evaluator passed to the base constructor; its settings are modified here.</param>
        /// <param name="__target_sd">Value stored in <c>target_sd</c>; defaults to 0.5.</param>
        /// <param name="__page_sd">Value stored in <c>page_sd</c>; defaults to 0.5.</param>
        /// <param name="__expansionSteps">Value stored in <c>expansionSteps</c>; defaults to 3.</param>
        public rankDiversityALink(ISpiderEvaluatorBase __parent, double __target_sd = 0.5, double __page_sd = 0.5, int __expansionSteps = 3)
            : base("TokenDiversity", "Ranks links by semantic diversity (inverted similarity) against Target tokens (" + __target_sd.ToString("P") + ") and crawled pages tokens (" + __page_sd.ToString("P") + ")."
                   + "Target tokens are semantically expanded in [" + __expansionSteps + "] steps using Semantic Lexicon"
                   , 100000, 0, __parent)
        {
            // Rule parameters
            target_sd      = __target_sd;
            page_sd        = __page_sd;
            expansionSteps = __expansionSteps;

            // Rule behaviour
            mode = spiderEvalRuleResultEnum.active;
            doAdjustScoreByLanguageDetection = imbWEMManager.settings.crawlAdHok.FLAG_doAdjustDiversityScore;

            // This rule switches the TF-IDF flag on in the parent's settings.
            __parent.settings.doEnableDLC_TFIDF = true;
        }
Exemplo n.º 17
0
        /// <summary>
        /// Builds the property collection used by name templates: crawler class name, title, file-friendly name,
        /// page-load and new-link limits from the crawler settings, and the sample list size from the job state.
        /// </summary>
        /// <param name="state">Crawl job context providing <c>sampleList</c>.</param>
        /// <param name="crawler">Spider evaluator providing its type, name and settings.</param>
        /// <returns>A new <c>PropertyCollection</c> keyed by <c>nameComposerFields</c> values.</returns>
        public static PropertyCollection GetData(ICrawlJobContext state, ISpiderEvaluatorBase crawler)
        {
            PropertyCollection fields = new PropertyCollection();

            // Crawler identity
            fields[nameComposerFields.crawlerClassName]        = crawler.GetType().Name;
            fields[nameComposerFields.crawlerTitleName]        = crawler.name;
            fields[nameComposerFields.crawlerFileFriendlyName] = crawler.name.getCleanFilepath().Replace("-", "");

            // Crawler limits
            var crawlerSettings = crawler.settings;
            fields[nameComposerFields.variablePLmax] = crawlerSettings.limitTotalPageLoad;
            fields[nameComposerFields.variableLT]    = crawlerSettings.limitIterationNewLinks;

            // Sample information (variableTCmax, sampleFileSource and sampleName are not populated here)
            fields[nameComposerFields.sampleSize] = state.sampleList.Count();

            return fields;
        }
Exemplo n.º 18
0
        /// <summary>
        /// Fills <c>Crawler</c>, <c>TestSignature</c> and <c>TestID</c> of a benchmark report.
        /// </summary>
        /// <remarks>
        /// With an evaluator, the signature is composed from the evaluator name and settings, the sample count and the
        /// current index/session identifiers, and the test ID is derived from hashes of the global settings, the crawler
        /// and the signature. Without an evaluator, the test ID falls back to the setup hashes of <paramref name="state"/>,
        /// or, when that is also missing, to a hash of the target's hash code; the signature is left untouched in those cases.
        /// </remarks>
        /// <param name="target">Report object being filled.</param>
        /// <param name="evaluator">Spider evaluator; may be <c>null</c>.</param>
        /// <param name="state">Crawl job context; may be <c>null</c>.</param>
        /// <param name="tRecord">Test record whose <c>instance.name</c> becomes the crawler name.</param>
        public static void SetTestIDAndSignature(this IReportBenchmark target, ISpiderEvaluatorBase evaluator, ICrawlJobContext state, modelSpiderTestRecord tRecord)
        {
            target.Crawler = tRecord.instance.name;

            // The null checks must come before any dereference of evaluator/state; previously the signature was
            // composed first, so a null evaluator crashed and the fallback branches below were unreachable.
            if (evaluator != null)
            {
                // NOTE(review): with no job state the sample count is reported as 0 - confirm this is acceptable.
                string sampleCount = (state != null) ? state.sampleList.Count.ToString() : "0";

                target.TestSignature = evaluator.name + "|DC:" + sampleCount + "|PL:" + evaluator.settings.limitTotalPageLoad + "|LT:" + evaluator.settings.limitIterationNewLinks + "|IID:" + imbWEMManager.index.current_indexID + "|SID:" + imbWEMManager.index.indexSessionEntry.SessionID.add(evaluator.settings.SignatureSufix);

                target.TestID = md5.GetMd5Hash(objectSerialization.ObjectToXML(imbWEMManager.settings)) + "-" + evaluator.crawlerHash + "-" + md5.GetMd5Hash(target.TestSignature).toWidthMaximum(3, "");
            }
            else if (state != null)
            {
                target.TestID = state.setupHash_global + "-" + state.setupHash_crawler;
            }
            else
            {
                target.TestID = md5.GetMd5Hash(target.GetHashCode().ToString());
            }
        }
Exemplo n.º 19
0
        /// <summary>
        /// Relays the iteration event to every plug-in in <c>allPlugins</c> when <c>IsEnabled</c> is set.
        /// An exception thrown by one plug-in is logged and saved as an error log; the remaining plug-ins still run.
        /// </summary>
        /// <param name="__spider">Spider evaluator forwarded to each plug-in.</param>
        /// <param name="__task">Domain task forwarded to each plug-in.</param>
        /// <param name="__wRecord">Site record forwarded to each plug-in.</param>
        public void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            if (IsEnabled)
            {
                foreach (reportPlugIn_base plugin in allPlugins)
                {
                    try
                    {
                        plugin.eventIteration(__spider, __task, __wRecord);
                    }
                    catch (Exception failure)
                    {
                        // Log the failure and persist it as XML instead of letting it propagate.
                        string message = "Reporting Plugin [" + plugin.name + "]:" + plugin.GetType().Name + " at status report execution crashed: " + failure.Message;
                        aceLog.log(message);

                        crawlerErrorLog errorEntry = new crawlerErrorLog(failure, null, null, crawlerErrorEnum.indexPlugin);
                        errorEntry.SaveXML();
                    }
                }
            }
        }
Exemplo n.º 20
0
        /// <summary>
        /// Writes a description of the crawler to the output: heading, class name, description,
        /// its modules and/or active link rules (depending on the evaluator's type) and the settings manual.
        /// </summary>
        /// <param name="evaluator">Spider evaluator being described.</param>
        /// <param name="output">Text renderer receiving the description.</param>
        public static void Describe(this ISpiderEvaluatorBase evaluator, ITextRender output)
        {
            // Header
            output.AppendHeading("Crawler [" + evaluator.name + "]");
            output.AppendPair("Class name", evaluator.GetType().Name, true, ": ");
            output.AppendPair("Description", evaluator.description, true, ": ");

            // Modular evaluators: describe each module.
            var modular = evaluator as spiderModularEvaluatorBase;
            if (modular != null)
            {
                foreach (var module in modular.modules)
                {
                    module.DescribeModule(output);
                }
            }

            // Simple evaluators: describe each active link rule (iterating over a snapshot of the collection).
            var simple = evaluator as spiderEvaluatorSimpleBase;
            if (simple != null)
            {
                foreach (var rule in simple.linkActiveRules.ToList())
                {
                    rule.DescribeRule(output);
                }
            }

            output.AppendHorizontalLine();

            // General configuration section
            output.open("div", "General configuration", "Crawler configuration properties declared in common settings class");
            evaluator.settings.GetUserManual(output, "", true, true);
            output.close();
        }
Exemplo n.º 21
0
 /// <summary>
 /// Iteration event hook; derived classes must provide the implementation.
 /// </summary>
 /// <param name="__spider">The spider evaluator associated with the event.</param>
 /// <param name="__task">The crawler domain task associated with the event.</param>
 /// <param name="__wRecord">The site record associated with the event.</param>
 public abstract void eventIteration(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
Exemplo n.º 22
0
        /// <summary>
        /// Creates the block-role layer rule, fills the name/description templates inherited from the base constructor
        /// and switches on <c>doEnableDLC_BlockTree</c> in the parent evaluator's settings.
        /// </summary>
        /// <param name="__semanticRole">Semantic role stored in <c>semanticRole</c> and inserted into the name and description.</param>
        /// <param name="__layerID">Layer ID passed to the base constructor.</param>
        /// <param name="__parent">Evaluator passed to the base constructor; its settings are modified here.</param>
        /// <param name="__layer2ID">Second layer ID passed to the base constructor; defaults to -1.</param>
        public layerBlockRolePRule(nodeBlockSemanticRoleEnum __semanticRole, int __layerID, ISpiderEvaluatorBase __parent, int __layer2ID = -1)
            : base("Block Role ({0})", "Links found in a block with semantic role [{0}] are set to layerID [{1}], otherwise to [{2}].", __layerID, __parent, __layer2ID)
        {
            semanticRole = __semanticRole;

            // The base constructor stored format templates; resolve their placeholders now.
            description = string.Format(description, semanticRole, layerID, layer2ID);
            name        = string.Format(name, semanticRole);

            // This rule switches the block-tree flag on in the parent's settings.
            __parent.settings.doEnableDLC_BlockTree = true;
        }
Exemplo n.º 23
0
 /// <summary>
 /// Event hook raised just when a new DLC thread has been prepared to run; derived classes must provide the implementation.
 /// </summary>
 /// <param name="__spider">The spider evaluator associated with the event.</param>
 /// <param name="__task">The crawler domain task associated with the event.</param>
 /// <param name="__wRecord">The site record associated with the event.</param>
 public abstract void eventDLCInitiated(ISpiderEvaluatorBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord);
Exemplo n.º 24
0
 /// <summary>
 /// Generic event hook parameterized by the types of its task and resource arguments; derived classes must provide the implementation.
 /// </summary>
 /// <typeparam name="TFirst">Type of the <paramref name="__task"/> argument.</typeparam>
 /// <typeparam name="TSecond">Type of the <paramref name="__resource"/> argument.</typeparam>
 /// <param name="stage">The iteration phase the event is raised for.</param>
 /// <param name="__parent">The spider evaluator associated with the event.</param>
 /// <param name="__task">The task object passed with the event.</param>
 /// <param name="__resource">The resource object passed with the event.</param>
 public abstract void eventUniversal <TFirst, TSecond>(crawlerDomainTaskIterationPhase stage, ISpiderEvaluatorBase __parent, TFirst __task, TSecond __resource);
Exemplo n.º 25
0
 /// <summary>
 /// Event hook raised when the plug-in is installed, i.e. when a new spider instance is being constructed;
 /// derived classes must provide the implementation.
 /// </summary>
 /// <param name="__spider">The spider.</param>
 public abstract void eventPluginInstalled(ISpiderEvaluatorBase __spider);
Exemplo n.º 26
0
 /// <summary>
 /// Creates the target URL graph rule and inserts the score unit into the description template.
 /// </summary>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 /// <param name="__scoreUnit">Score unit passed to the base constructor and inserted into the description; defaults to 100.</param>
 public rankTargetUrlGraph(ISpiderEvaluatorBase __parent, int __scoreUnit = 100)
     : base("Target URL Graph",
            "In the Learning phase creates link-path graph out of all links sent for evaluation, according to normalized score of matched node it will assign proportion of [{0}].",
            __scoreUnit,
            0,
            __parent)
 {
     // Resolve the {0} placeholder left in the description by the base constructor call.
     string descriptionTemplate = description;
     description = string.Format(descriptionTemplate, __scoreUnit);
 }
Exemplo n.º 27
0
 /// <summary>
 /// Forwards the name, description and parent evaluator to the base constructor; adds no initialization of its own.
 /// </summary>
 /// <param name="__name">Module name passed to the base constructor.</param>
 /// <param name="__desc">Module description passed to the base constructor.</param>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 protected spiderLayerModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent) : base(__name, __desc, __parent)
 {
 }
Exemplo n.º 28
0
 /// <summary>
 /// Forwards the name, description and parent evaluator to the base constructor; adds no initialization of its own.
 /// </summary>
 /// <param name="__name">Module name passed to the base constructor.</param>
 /// <param name="__desc">Module description passed to the base constructor.</param>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 public spiderRankingModuleBase(string __name, string __desc, ISpiderEvaluatorBase __parent) : base(__name, __desc, __parent)
 {
 }
Exemplo n.º 29
0
 /// <summary>
 /// Creates the random noise rule, creates its random number generator and inserts the two bounds into the description.
 /// </summary>
 /// <param name="maximum">Passed to the base constructor as the first unit argument and inserted as {0} in the description.</param>
 /// <param name="minimum">Passed to the base constructor as the second unit argument and inserted as {1} in the description.</param>
 /// <param name="__parent">Evaluator passed to the base constructor.</param>
 public ruleRandomNoiseScore(int maximum, int minimum, ISpiderEvaluatorBase __parent)
     : base("NoiseScore", "Passive rule adding random score between {0} and {1} points - used to create initial noise in the frontier. ", maximum, minimum, __parent)
 {
     rnd = new Random();

     // Resolve the {0}/{1} placeholders left in the description by the base constructor call.
     string descriptionTemplate = description;
     description = string.Format(descriptionTemplate, maximum, minimum);
 }
Exemplo n.º 30
0
        /// <summary>
        /// Records the performance of the current iteration and, when both the iteration report and the domain report
        /// are enabled in the settings, writes the per-iteration report files into the iteration folder.
        /// Finally hands over to the crawler instance's own <c>reportIteration</c>.
        /// </summary>
        /// <param name="dataUnit">Data unit whose <c>iteration</c> number selects the report folder and the file-name prefix.</param>
        /// <param name="wRecord">Site record being reported on.</param>
        /// <param name="evaluator">Not used by this method.</param>
        public void reportIteration(dataUnitSpiderIteration dataUnit, modelSpiderSiteRecord wRecord, ISpiderEvaluatorBase evaluator)
        {
            iterationPerformanceRecord ip_record = new iterationPerformanceRecord(wRecord);

            wRecord.iterationTableRecord.Add(ip_record);

            if (imbWEMManager.settings.directReportEngine.doIterationReport && imbWEMManager.settings.directReportEngine.doDomainReport)
            {
                folderNode fn = getIterationFolder(dataUnit.iteration, wRecord);

                if (REPORT_WRECORD_LOG)
                {
                    wRecord.logBuilder.getLastLine().saveStringToFile(fn.pathFor("wrecord.txt"));
                }

                textByIteration url_loaded   = urlsLoaded[wRecord];
                textByIteration url_detected = urlsDetected[wRecord];

                if (REPORT_MODULES
                    && imbWEMManager.settings.directReportEngine.DR_ReportModules_XMLIteration
                    && wRecord.tRecord.instance is spiderModularEvaluatorBase)
                {
                    wRecord.frontierDLC.reportIterationOut(wRecord, fn);
                }

                // Three-digit iteration number used as the prefix of the report file names.
                string its = dataUnit.iteration.ToString("D3");

                if (REPORT_TIMELINE)
                {
                    objectSerialization.saveObjectToXML(ip_record, fn.pathFor("performance.xml"));
                }

                if (REPORT_ITERATION_URLS)
                {
                    if (wRecord.iteration > 0)
                    {
                        // Report on the targets loaded in the previous iteration.
                        builderForMarkdown now_loaded = new builderForMarkdown();

                        List <spiderTarget> targets_loaded = wRecord.context.targets.GetLoadedInIteration(wRecord.iteration - 1);

                        int tc = 0;
                        foreach (spiderTarget t in targets_loaded)
                        {
                            reportTarget(t, fn, tc);
                            now_loaded.AppendLine(t.url);
                            now_loaded.AppendHorizontalLine();
                            now_loaded.Append(t.marks.GetActiveResults());
                            now_loaded.AppendHorizontalLine();
                            now_loaded.Append(t.marks.GetPassiveResults());
                            now_loaded.AppendHorizontalLine();

                            var dt = t.marks.getHistory(t.url, wRecord.tRecord.instance.name);
                            dt.Save(fn, imbWEMManager.authorNotation, its + "_loadedNow");

                            now_loaded.AppendTable(dt, false);

                            tc++;
                        }

                        now_loaded.ToString().saveStringToFile(fn.pathFor(its + "_loadedNow.txt"));

                        spiderTaskResult loadResults = wRecord.spiderTaskResults[wRecord.iteration - 1];
                        loadResults.getDataTable().GetReportAndSave(fn, notation, "loadResults", true);
                    }

                    fileunit detected = new fileunit(fn.pathFor(its + "_dt.txt"), false);
                    fileunit loaded   = new fileunit(fn.pathFor(its + "_ld.txt"), false);

                    // NOTE(review): unlike the other file units, relp is never passed to Save() here -
                    // confirm that Append(..., true) persists the content.
                    fileunit relp = new fileunit(fn.pathFor(its + "_srb_ld.txt"), false);
                    relp.Append(wRecord.relevantPages, true);

                    // Targets that have a page go to the "loaded" list, all others to the "detected" list.
                    foreach (spiderTarget t in wRecord.context.targets)
                    {
                        if (t.page != null)
                        {
                            loaded.Append(t.url);
                            url_loaded[dataUnit.iteration].Add(t.url);
                        }
                        else
                        {
                            detected.Append(t.url);
                            url_detected[dataUnit.iteration].Add(t.url);
                        }
                    }

                    // Columns: ordinal, URL, score. The score placeholder must be {2}; it previously repeated {1},
                    // which printed the URL twice and never printed the score.
                    string lineFormat = "{0,5} {1,30} [s:{2,6}]" + Environment.NewLine;

                    fileunit active = new fileunit(fn.pathFor(its + "_act.txt"), false);
                    int      c      = 1;

                    foreach (var lnk in wRecord.web.webActiveLinks)
                    {
                        active.Append(string.Format(lineFormat, c, lnk.url, lnk.marks.score));
                        active.Append(lnk.marks.GetLayerAssociation());
                        c++;
                    }

                    detected.Save();
                    loaded.Save();
                    active.Save();
                }
            }

            wRecord.tRecord.instance.reportIteration(this, wRecord);
        }