예제 #1
0
        /// <summary>
        /// Runs when a DLC is finished for one domain: writes the enabled per-domain report files,
        /// appends the last touched iteration entry to <c>wRecord.tRecord.lastDomainIterationTable</c>,
        /// notifies the crawler instance and finally disposes <paramref name="wRecord"/>.
        /// </summary>
        /// <remarks>
        /// The term tables, per-page reports, crawl log and URL lists are written only when
        /// <c>doDomainReport</c> is enabled in the direct report engine settings. The module report
        /// and the iteration performance table are governed by their own flags and are produced
        /// regardless of that setting.
        /// </remarks>
        /// <param name="wRecord">Record of the domain that finished; it is disposed before this method returns.</param>
        public void reportDomainFinished(modelSpiderSiteRecord wRecord)
        {
            // Stays null when the domain report is disabled; reportDomainOut() below then receives null.
            folderNode fn = null;

            // Cleaned form of the domain root name, shared by the folder and by every file written below.
            string fileprefix = wRecord.domainInfo.domainRootName.getCleanFilepath();

            if (imbWEMManager.settings.directReportEngine.doDomainReport)
            {
                // Reuse fileprefix instead of computing the cleaned domain root name a second time.
                fn = folder[DRFolderEnum.sites].Add(fileprefix, "Report on " + wRecord.domainInfo.domainName, "Records on domain " + wRecord.domainInfo.domainName + " crawled by " + name);

                if (REPORT_DOMAIN_TERMS)
                {
                    // Page tokens are exported only when DLC TF-IDF is enabled for the crawler instance.
                    if (wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
                    {
                        if (wRecord.context.targets.dlTargetPageTokens != null)
                        {
                            wRecord.context.targets.dlTargetPageTokens.GetDataSet(true).serializeDataSet("token_ptkn", fn, dataTableExportEnum.excel, notation);
                        }
                    }

                    // Link tokens are exported whenever they exist, independent of the TF-IDF switch.
                    if (wRecord.context.targets.dlTargetLinkTokens != null)
                    {
                        wRecord.context.targets.dlTargetLinkTokens.GetDataSet(true).serializeDataSet("token_ltkn", fn, dataTableExportEnum.excel, notation);
                    }
                }

                if (REPORT_DOMAIN_PAGES)
                {
                    // One report per loaded target, numbered from 1 in load order.
                    int pageOrdinal = 1;
                    foreach (spiderTarget loadedTarget in wRecord.context.targets.GetLoadedInOrderOfLoad())
                    {
                        reportTarget(loadedTarget, fn, pageOrdinal);
                        pageOrdinal++;
                    }
                }

                // Dump the record's log into the shared logs folder.
                fileunit wLog = new fileunit(folder[DRFolderEnum.logs].pathFor(fileprefix + ".txt"), false);
                wLog.setContent(wRecord.logBuilder.ContentToString(true));
                wLog.Save();

                if (REPORT_ITERATION_URLS)
                {
                    textByIteration url_loaded   = urlsLoaded[wRecord];
                    textByIteration url_detected = urlsDetected[wRecord];

                    fileunit url_ld_out  = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_ld.txt"), false);
                    fileunit url_dt_out  = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_dt.txt"), false);
                    fileunit url_srb_out = new fileunit(folder[DRFolderEnum.sites].pathFor(fileprefix + "_url_srb_ld.txt"), false);

                    url_ld_out.setContentLines(url_loaded.GetAllUnique());
                    url_dt_out.setContentLines(url_detected.GetAllUnique());
                    url_srb_out.setContentLines(wRecord.relevantPages);

                    url_ld_out.Save();
                    url_dt_out.Save();
                    url_srb_out.Save();
                }
            }

            if (REPORT_MODULES)
            {
                // Only modular evaluators get a frontier module report.
                if (wRecord.tRecord.instance is spiderModularEvaluatorBase)
                {
                    wRecord.frontierDLC.reportDomainOut(wRecord, fn, fileprefix);
                }
            }

            if (REPORT_TIMELINE)
            {
                wRecord.iterationTableRecord.GetDataTable(null, "iteration_performace_" + fileprefix).GetReportAndSave(folder[DRFolderEnum.it], notation, "iteration_performace_" + fileprefix);
            }

            wRecord.tRecord.lastDomainIterationTable.Add(wRecord.iterationTableRecord.GetLastEntryTouched());

            wRecord.tRecord.instance.reportDomainFinished(this, wRecord);

            // The record is not usable after this point.
            wRecord.Dispose();
        }
예제 #2
0
        /// <summary>
        /// Performs full domain reevaluation: deploys every loaded target of the record into the page
        /// index, recomputes (or reuses) its words, lemmas and InfoPrize, and optionally refreshes the
        /// domain index entry with the aggregated values.
        /// </summary>
        /// <remarks>
        /// Every loaded target and <paramref name="__wRecord"/> itself are disposed by this method.
        /// </remarks>
        /// <param name="settings">Index engine configuration; <c>plugIn_indexDBUpdater_optimizedMode</c> is read from it to decide whether already evaluated pages may be skipped.</param>
        /// <param name="loger">Log builder receiving one line per page and one summary line per domain. It is null-checked before each logging call made here; it is also forwarded to <c>GetTermsForPage</c>, whose handling of <c>null</c> is not visible from this method.</param>
        /// <param name="__wRecord">Site record whose loaded targets are evaluated.</param>
        /// <param name="evaluator">Evaluator forwarded to <c>GetTermsForPage</c> for pages that have no stored words.</param>
        /// <param name="mTFIDF">Compiled weight table used to match page terms; it is dereferenced for every page that is (re)evaluated, so it must not be <c>null</c> in that case.</param>
        public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);

            idomain.url = __wRecord.domain;

            double dIP = 0; // sum of page InfoPrize values over the whole domain
            int    p   = 0; // number of processed pages

            List<string> dDistinctTerms = new List<string>();
            List<string> dLemmas        = new List<string>();
            List<string> dWords         = new List<string>();
            List<string> urls           = new List<string>();

            // Turns false as soon as one page reuses its stored values instead of being evaluated.
            bool doEvalD = true;

            foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

                bool doEval = true;

                if (settings.plugIn_indexDBUpdater_optimizedMode)
                {
                    if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
                    {
                        // An already scored, relevant page is skipped unless its stored word or lemma list is missing.
                        doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
                    }
                }

                if (doEval)
                {
                    // Words: reuse the stored CSV when present, otherwise extract them from the target.
                    List<string> terms;
                    if (ipage.AllWords.isNullOrEmpty())
                    {
                        terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
                    }
                    else
                    {
                        terms = ipage.AllWords.SplitSmart(",", "", true);
                    }

                    ipage.AllWords = terms.toCsvInLine();

                    List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

                    // Lemmas: reuse the stored CSV when present, otherwise take the nominal forms of the matches.
                    List<string> lemmas;
                    if (ipage.AllLemmas.isNullOrEmpty())
                    {
                        lemmas = new List<string>();
                        lemmas.AddRange(mchl.Select(x => x.nominalForm));
                    }
                    else
                    {
                        lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
                    }

                    double IP = 0;

                    foreach (weightTableTermCompiled cterm in mchl)
                    {
                        IP += cterm.tf_idf;

                        // Terms with a document frequency of exactly 1 are collected as distinct for the domain.
                        if (cterm.df == 1)
                        {
                            dDistinctTerms.AddUnique(cterm.nominalForm);
                        }
                    }

                    ipage.InfoPrize = IP;
                    dIP += IP;

                    ipage.Lemmas    = lemmas.Count;
                    ipage.AllLemmas = lemmas.toCsvInLine();

                    dWords.AddRange(terms);
                    dLemmas.AddRange(lemmas);

                    ipage.Note = "indexUpdate" + SessionID;

                    imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
                }
                else
                {
                    // Page kept as stored: only its InfoPrize contributes to the domain total.
                    dIP    += ipage.InfoPrize;
                    doEvalD = false;
                }

                urls.Add(ipage.url);

                p++;

                // Guarded like the domain summary below, so a null logger no longer throws here.
                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
                }

                target.Dispose();
            }

            if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
            {
                if (!doEvalD)
                {
                    // At least one page was skipped, so the aggregated lists are incomplete:
                    // the lemma count is taken from the domain's DLC TF-IDF table instead.
                    var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
                    int dlc_c  = dlc_tf.Count;

                    idomain.TFIDFcompiled = (dlc_c > 0);
                    idomain.Lemmas        = dlc_c;
                }
                else
                {
                    idomain.Lemmas         = dLemmas.Count;
                    idomain.Words          = dWords.Count;
                    idomain.TFIDFcompiled  = (dLemmas.Count > 0);
                    idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
                    idomain.AllLemmas      = dLemmas.toCsvInLine();
                    idomain.AllWords       = dWords.toCsvInLine();
                }

                idomain.InfoPrize = dIP;

                var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

                idomain.relevantPages        = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
                idomain.notRelevantPages     = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
                idomain.detected             = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
                idomain.Crawled              = urlAssert.certainty;
                idomain.RelevantContentRatio = urlAssert.relevant;

                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7")));
                }

                imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
            }

            imbWEMManager.index.wRecordsDeployed++;

            __wRecord.Dispose();
        }