Exemple #1
0
        /// <summary>
        /// Wraps the URL of <paramref name="page"/> in a <c>link</c> and passes it to the context of <paramref name="wRecord"/>.
        /// </summary>
        /// <param name="wRecord">Site record whose context processes the link.</param>
        /// <param name="page">Index page that supplies the URL.</param>
        public void SetTarget(modelSpiderSiteRecord wRecord, indexPage page)
        {
            link targetLink   = new link(page.url);
            var  crawlContext = wRecord.context;

            // The value of the first entry in the site's web page collection accompanies the link.
            var firstPage = wRecord.web.webPages.items.FirstOrDefault().Value;

            crawlContext.processLink(targetLink, firstPage, false);
        }
Exemple #2
0
 /// <summary>
 /// Gets <see cref="indexPage"/> entry from current Index database instance.
 /// The entry is looked up by <c>url</c> when <c>indexPageInfo</c> is still <c>null</c> and is kept there for later calls.
 /// </summary>
 /// <returns>The value held in <c>indexPageInfo</c> after the lookup (if one was needed).</returns>
 public indexPage GetIndexPage()
 {
     if (indexPageInfo != null)
     {
         return indexPageInfo;
     }

     indexPageInfo = imbWEMManager.index.pageIndexTable.GetPageForUrl(url);
     return indexPageInfo;
 }
Exemple #3
0
        /// <summary>
        /// Copies the crawl data of <paramref name="target"/> into its page index entry and stores the entry in the page index table.
        /// </summary>
        /// <param name="target">Spider target providing the URL, tokens and - when loaded - evaluation results.</param>
        /// <param name="wRecord">Site record providing the domain name.</param>
        /// <param name="idomain">Domain index entry; looked up by domain name when <c>null</c>.</param>
        /// <returns>The page index entry that was passed to <c>AddOrUpdate</c>.</returns>
        public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
        {
            indexPage entry = pageIndexTable.GetPageForUrl(target.url);

            if (idomain == null)
            {
                // NOTE(review): the resolved domain entry is not read again inside this method.
                idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
            }

            entry.url    = target.url;
            entry.tst    = target.tokens.ToList().toCsvInLine(",");
            entry.domain = wRecord.domainInfo.domainName;

            if (target.isLoaded)
            {
                var evaluation = target.evaluation;

                // Language statistics are copied only when an evaluation exists and detected a language.
                if ((evaluation != null) && (evaluation.result_language != basicLanguageEnum.unknown))
                {
                    entry.langTestRatio     = evaluation.result_ratio;
                    entry.singleMatchTokens = evaluation.singleLanguageTokens.toCsvInLine(",");
                    entry.multiMatchTokens  = evaluation.multiLanguageTokens.toCsvInLine(",");
                    entry.wordCount         = evaluation.allContentTokens.Count();
                    entry.AllWords          = evaluation.allContentTokens.toCsvInLine();
                    entry.language          = evaluation.result_language.ToString();
                }

                if (target.IsRelevant)
                {
                    entry.relevancy = indexPageRelevancyEnum.isRelevant;
                }
                else
                {
                    // Not relevant: distinguish "language unknown" from "known, but not relevant".
                    entry.relevancy = (target.evaluatedLanguage == basicLanguageEnum.unknown)
                        ? indexPageRelevancyEnum.unknown
                        : indexPageRelevancyEnum.notRelevant;
                }

                entry.byteSize = target.page.spiderResult.page.result.byteSize;
            }

            pageIndexTable.AddOrUpdate(entry);

            return entry;
        }
Exemple #4
0
        /// <summary>
        /// Closes the session.
        /// </summary>
        /// <remarks>
        /// When enabled in settings, URLs that were queried but not found in the index are stored first.
        /// If a session entry exists, its performance fields are filled and optionally saved.
        /// Finally the call is forwarded to <c>experimentManager.CloseSession</c>.
        /// </remarks>
        /// <param name="tRecords">Test records forwarded to <c>experimentManager.CloseSession</c>.</param>
        public void CloseSession(IEnumerable <modelSpiderTestRecord> tRecords)
        {
            if (imbWEMManager.settings.indexEngine.doSaveFailedURLQueries)
            {
                pageIndexTable.ReadOnlyMode = false;

                int sinceLastReport = 0;
                int processed       = 0;
                int total           = pageIndexTable.urlsNotInIndex.Count;
                int reportInterval  = total / 10;

                aceLog.log("Deploying queried URLs that were not in the index (" + total.ToString() + ")");

                foreach (string url in pageIndexTable.urlsNotInIndex)
                {
                    sinceLastReport++;
                    processed++;

                    indexPage page = pageIndexTable.GetPageForUrl(url);
                    page.url    = url;
                    page.domain = new domainAnalysis(url).domainName;

                    pageIndexTable.AddOrUpdate(page);

                    // Progress is logged once per reportInterval processed URLs.
                    if (sinceLastReport < reportInterval)
                    {
                        continue;
                    }

                    aceLog.log("URL processed: " + processed.GetRatio(total).ToString("P2") + " (" + processed + ")");
                    sinceLastReport = 0;
                }
            }

            if (indexSessionEntry != null)
            {
                aceLog.log("Saving index engine performance : ... ");

                if (!SKIP_INDEXUPDATE)
                {
                    var domainAssertion = imbWEMManager.index.domainIndexTable.GetDomainIndexAssertion(null, true);

                    aceLog.log("Saving index engine performance : DomainAssetion done ");

                    indexSessionEntry.Domains        = domainIndexTable.Count;
                    indexSessionEntry.Pages          = pageIndexTable.Count;
                    indexSessionEntry.PagesEvaluated = pageIndexTable.Where(x => !collectionExtensions.isNullOrEmpty(x.relevancyText)).Count();

                    var sessionState = experimentManager.CurrentSession.state;
                    indexSessionEntry.CrawlerHash     = sessionState.setupHash_crawler;
                    indexSessionEntry.GlobalSetupHash = sessionState.setupHash_global;

                    // Duration is expressed in minutes elapsed since the session entry's Start.
                    indexSessionEntry.Duration = DateTime.Now.Subtract(indexSessionEntry.Start).TotalMinutes;

                    aceLog.log("Saving index engine performance : PagesEvaluated counted ");

                    indexSessionEntry.CertainityPP        = domainAssertion.certainty;
                    indexSessionEntry.MasterTFIDFCoverage = domainAssertion.masterTFIDFApplied;
                    indexSessionEntry.DomainTFIDFs        = domainAssertion[indexDomainContentEnum.completeDomainTFIDF].Count;
                }

                aceLog.log("Saving index engine performance : Saving index ");

                if (imbWEMManager.settings.directReportEngine.doPublishIndexPerformanceTable)
                {
                    indexSessionRecords.AddOrUpdate(indexSessionEntry);
                }
            }

            experimentManager.CloseSession(tRecords);
        }
Exemple #5
0
 /// <summary>
 /// Adds the given page to <c>pageSet</c>.
 /// </summary>
 /// <param name="page">Page index entry to add.</param>
 public void addToPageSet(indexPage page) => pageSet.Add(page);
        /// <summary>
        /// Deploys information from wRecord, including the key
        /// </summary>
        /// <remarks>
        /// Fills, in this order: timing values, iteration, block and term counts, the key, frontier ranking
        /// runtime, CPU rate, page counters (relevant / irrelevant / loaded / duplicate), recall ratios,
        /// the description of the last spider task result and the efficiency ratios (E_PP, E_TP, E_TH).
        /// </remarks>
        /// <param name="wRecord">The w record.</param>
        public void deploy(modelSpiderSiteRecord wRecord)
        {
            double i_pi_nominal = 0;

            dataUnitSpiderIteration spi_first   = wRecord.timeseries.GetData().FirstOrDefault() as dataUnitSpiderIteration;
            dataUnitSpiderIteration spi_current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

            // Durations are measured in seconds relative to creationTime; 0 when the reference entry is missing.
            if (spi_current != null)
            {
                time_duration_s = creationTime.Subtract(spi_current.rowCreated).TotalSeconds;
            }
            else
            {
                time_duration_s = 0;
            }

            if (spi_first != null)
            {
                time_sincefirst_s = creationTime.Subtract(spi_first.rowCreated).TotalSeconds;
            }
            else
            {
                time_sincefirst_s = 0;
            }

            indexDomain idomain = wRecord.GetIndexInfo();

            iteration = wRecord.iteration;

            blocks_all      = wRecord.context.targets.blocks.Count(false);
            blocks_relevant = wRecord.context.targets.blocks.Count(true);

            terms_all      = wRecord.context.targets.termsAll.Count();
            terms_relevant = wRecord.context.targets.termSerbian.Count();

            var TFIDF = wRecord.MasterTFIDF;

            var mchs = TFIDF.GetMatches(wRecord.context.targets.termSerbian);

            key = wRecord.domainInfo.domainName + iteration.ToString("D3");

            int relCount  = 0;
            int irelCount = 0;
            int lCount    = 0;
            int rCount    = 0;
            int dCount    = 0;

            double fraDuration      = 0;
            int    modulesContained = 0;

            int rec = 0;

            foreach (frontierRankingAlgorithmIterationRecord gen in wRecord.frontierDLC.generalRecords)
            {
                rec++;
                fraDuration += gen.duration;
            }

            // Ratio of summed frontier-ranking duration to the number of general records.
            FRA_SummaryRuntime = fraDuration.GetRatio((double)rec);

            FRA_TimePercent = FRA_SummaryRuntime.GetRatio(time_duration_s);

            var rtake = wRecord.tRecord.measureTaker.GetLastTake();

            if (rtake != null)
            {
                CPU = rtake.cpuRateOfProcess;
            }

            if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
            {
                // NOTE(review): modulesContained is accumulated here but never stored anywhere in this method.
                foreach (moduleDLCRecord mod in wRecord.frontierDLC)
                {
                    if (mod != null)
                    {
                        modulesContained += mod.GetLastEntry().accumulated;
                    }
                }
            }

            // Page hashes already seen among targets flagged as duplicate; a set keeps the membership test O(1).
            HashSet <string> seenHashes = new HashSet <string>();

            foreach (spiderTarget t in wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = t.GetIndexPage();

                if (ipage != null)
                {
                    i_pi_nominal += ipage.InfoPrize;
                }

                bool isDuplicate = t.isDuplicate;

                if (isDuplicate)
                {
                    // The first target carrying a given page hash is still counted as a regular page;
                    // only later targets with the same hash are counted as duplicates.
                    if (seenHashes.Add(t.pageHash))
                    {
                        isDuplicate = false;
                    }
                }

                if (!isDuplicate)
                {
                    if (t.IsRelevant)
                    {
                        relCount++;
                    }
                    else
                    {
                        irelCount++;
                    }
                    lCount++;
                }
                else
                {
                    dCount++;
                }
            }
            relevantPageCount   = relCount;
            irrelevantPageCount = irelCount;
            loadedPageCount     = lCount;
            duplicateCount      = dCount;

            int mchs_c  = 0;
            int id_lm_c = 0;

            if (idomain != null)
            {
                id_lm_c = idomain.Lemmas;
            }
            if (mchs != null)
            {
                mchs_c = mchs.Count();
            }

            double i_lm_harvest = mchs_c.GetRatio(loadedPageCount);

            IP = TFIDF.GetScoreForMatch(wRecord.context.targets.termSerbian);

            // Lemma recall is capped at 1.
            double i_lm_recall = mchs_c.GetRatio(id_lm_c);
            if (i_lm_recall > 1)
            {
                i_lm_recall = 1;
            }

            // Domain-relative recalls need the domain index entry; they are left untouched without it.
            if (idomain != null)
            {
                IP_recall   = i_pi_nominal.GetRatio(idomain.InfoPrize).ClipToK();
                Term_recall = wRecord.context.targets.termSerbian.Count().GetRatio(idomain.Words).ClipToK();
            }
            Page_recall = relevantPageCount.GetRatio(wRecord.pageRecallTarget).ClipToK();

            // From here on i_pi_nominal holds the per-loaded-page ratio instead of the sum.
            i_pi_nominal = i_pi_nominal.GetRatio(loadedPageCount);
            double i_pi_harvest = IP.GetRatio((double)lCount);

            spiderTaskResult lastResult = null;

            foreach (spiderTaskResult r in wRecord.spiderTaskResults)
            {
                lastResult = r;
                rCount    += r.Count;
            }

            realLoadsCount = rCount;

            if (lastResult != null)
            {
                targetUrl       = "";
                targetLanguage  = "";
                targetEvalRatio = "";

                foreach (spiderTaskResultItem item in lastResult.items.Values)
                {
                    targetUrl = targetUrl.add(item.target.url, ",");

                    var t = wRecord.context.targets.GetByTarget(item.target);
                    if (t != null)
                    {
                        if (t.evaluation != null)
                        {
                            targetLanguage  = targetLanguage.add(t.evaluatedLanguage.ToString(), ";");
                            targetEvalRatio = targetEvalRatio.add(t.evaluation.result_ratio.ToString(), ";");
                        }
                        else
                        {
                            // No evaluation available: record why ("duplicate") or fall back to "unknown".
                            if (t.isDuplicate)
                            {
                                targetLanguage  = targetLanguage.add("duplicate", ";");
                                targetEvalRatio = targetEvalRatio.add("duplicate", ";");
                            }
                            else
                            {
                                targetLanguage  = targetLanguage.add("unknown", ";");
                                targetEvalRatio = targetEvalRatio.add("unknown", ";");
                            }
                        }
                    }
                }
            }

            // Explicit zero checks avoid dividing by zero in the ratios below.
            if ((relevantPageCount == 0) || (loadedPageCount == 0))
            {
                E_PP = 0;
            }
            else
            {
                E_PP = (double)relevantPageCount / (double)loadedPageCount;
            }
            if ((wRecord.context.targets.termSerbian.Count == 0) || (wRecord.context.targets.termsAll.Count == 0) || (loadedPageCount == 0))
            {
                E_TP = 0;
                E_TH = 0;
            }
            else
            {
                E_TP = (double)wRecord.context.targets.termSerbian.Count / (double)wRecord.context.targets.termsAll.Count;
                E_TH = (double)wRecord.context.targets.termSerbian.Count / (double)loadedPageCount;
            }

            IPnominal    = i_pi_nominal;
            IP_collected = i_pi_harvest;

            Lm_collected = i_lm_harvest;
            Lm_recall    = i_lm_recall;
        }
        /// <summary>
        /// Performs full domain reevaluation
        /// </summary>
        /// <remarks>
        /// Every loaded target of the record is deployed to the page index; unless the optimized mode decides the
        /// stored entry is complete, its words, lemmas and InfoPrize are recomputed and saved. Depending on
        /// settings the domain index entry is then updated. The targets and the record are disposed.
        /// </remarks>
        /// <param name="settings">The settings.</param>
        /// <param name="loger">The loger; may be <c>null</c>, in which case nothing is logged.</param>
        /// <param name="__wRecord">The w record.</param>
        /// <param name="evaluator">The evaluator.</param>
        /// <param name="mTFIDF">Compiled weight table used to match page terms.</param>
        public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);

            idomain.url = __wRecord.domain;

            double dIP = 0;
            int    p   = 0;

            List <string> dDistinctTerms = new List <string>();
            List <string> dLemmas        = new List <string>();
            List <string> dWords         = new List <string>();
            List <string> urls           = new List <string>();

            // Stays true only if every page was re-evaluated; otherwise the domain totals collected here are incomplete.
            bool doEvalD = true;

            foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

                bool doEval = true;

                if (settings.plugIn_indexDBUpdater_optimizedMode)
                {
                    if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
                    {
                        // An already scored relevant page is skipped only when both its word and lemma lists are stored.
                        doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
                    }
                }

                if (doEval)
                {
                    List <string> terms;

                    if (ipage.AllWords.isNullOrEmpty())
                    {
                        terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
                    }
                    else
                    {
                        terms = ipage.AllWords.SplitSmart(",", "", true);
                    }

                    ipage.AllWords = terms.toCsvInLine();

                    double IP = 0;

                    List <string> lemmas = new List <string>();

                    List <IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

                    if (ipage.AllLemmas.isNullOrEmpty())
                    {
                        lemmas.AddRange(mchl.Select(x => x.nominalForm));
                    }
                    else
                    {
                        lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
                    }

                    foreach (weightTableTermCompiled cterm in mchl)
                    {
                        IP += cterm.tf_idf;

                        // Terms with df == 1 are collected as the domain's distinct lemmas.
                        if (cterm.df == 1)
                        {
                            dDistinctTerms.AddUnique(cterm.nominalForm);
                        }
                    }

                    ipage.InfoPrize = IP;

                    dIP += IP;

                    ipage.Lemmas = lemmas.Count;

                    ipage.AllLemmas = lemmas.toCsvInLine();

                    dWords.AddRange(terms);
                    dLemmas.AddRange(lemmas);

                    ipage.Note = "indexUpdate" + SessionID;

                    imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
                }
                else
                {
                    dIP    += ipage.InfoPrize;
                    doEvalD = false;
                }

                urls.Add(ipage.url);

                p++;

                // The logger is optional: guard here exactly like the domain summary line below does.
                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
                }
                target.Dispose();
            }

            if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
            {
                if (!doEvalD)
                {
                    // At least one page was skipped, so the lemma count is taken from the domain's TF-IDF table instead.
                    var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
                    int dlc_c  = dlc_tf.Count;

                    idomain.TFIDFcompiled = (dlc_c > 0);
                    idomain.Lemmas        = dlc_c;
                }
                else
                {
                    idomain.Lemmas         = dLemmas.Count;
                    idomain.Words          = dWords.Count;
                    idomain.TFIDFcompiled  = (dLemmas.Count > 0);
                    idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
                    idomain.AllLemmas      = dLemmas.toCsvInLine();
                    idomain.AllWords       = dWords.toCsvInLine();
                }
                idomain.InfoPrize = dIP;

                var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

                idomain.relevantPages        = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
                idomain.notRelevantPages     = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
                idomain.detected             = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
                idomain.Crawled              = urlAssert.certainty;
                idomain.RelevantContentRatio = urlAssert.relevant;

                if (loger != null)
                {
                    string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
                    loger.AppendLine(rpp);
                }

                imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
            }

            imbWEMManager.index.wRecordsDeployed++;

            __wRecord.Dispose();
        }
        /// <summary>
        /// Returns the term list for the page behind <paramref name="target"/>, either from a saved word-list file
        /// or by evaluating the transliterated page text.
        /// </summary>
        /// <param name="target">Spider target whose page text is evaluated.</param>
        /// <param name="idomain">Domain index entry; resolved from the target's record when <c>null</c>.</param>
        /// <param name="ipage">Page index entry; resolved from the MD5 hash of the target URL when <c>null</c>.</param>
        /// <param name="evaluator">Language evaluator; taken from the target's test record when <c>null</c>.</param>
        /// <param name="loger">Not used by this method.</param>
        /// <returns>The terms; empty when the text is not evaluated as Serbian and no saved list was used.</returns>
        public List <string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
        {
            if (idomain == null)
            {
                idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
            }
            if (ipage == null)
            {
                ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
            }

            FileInfo wordListFile = GetWordList_File(idomain, ipage);

            // A previously saved word list short-circuits the evaluation when settings allow it.
            if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
            {
                return wordListFile.FullName.openFileToList(true);
            }

            string content = target.pageText.transliterate();

            if (evaluator == null)
            {
                evaluator = target.parent.wRecord.tRecord.evaluator;
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(content);

            List <string> result = new List <string>();

            if (evaluation.result_language == basicLanguageEnum.serbian)
            {
                List <string> candidates = new List <string>(evaluation.singleLanguageTokens);

                if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
                {
                    candidates.AddRange(evaluation.multiLanguageTokens);
                }

                // Drop tokens that are not clean words, and clean words that consist of symbolic content only.
                candidates.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

                result.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
            }

            if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
            {
                result.saveContentOnFilePath(wordListFile.FullName);
            }

            return result;
        }
 /// <summary>
 /// Resolves the word-list file of a domain/page pair inside <c>TFIDF_ConstructFolder</c>.
 /// </summary>
 /// <param name="idomain">Domain index entry whose <c>HashCode</c> forms the first part of the file name.</param>
 /// <param name="ipage">Page index entry whose <c>HashCode</c> forms the second part of the file name.</param>
 /// <returns>File info obtained with <c>getWritableFileMode.newOrExisting</c>.</returns>
 public FileInfo GetWordList_File(indexDomain idomain, indexPage ipage)
 {
     string fileName = "p_words_" + idomain.HashCode + "-" + ipage.HashCode + ".txt";
     var    location = TFIDF_ConstructFolder.pathFor(fileName);

     return location.getWritableFile(getWritableFileMode.newOrExisting);
 }