Ejemplo n.º 1
0
        /// <summary>
        /// Handles the DLC-initiated event. When the crawler setting <c>FRONTIER_doLinkHarvest</c> is off,
        /// every page stored in the page index for the record's domain is handed to the crawl context
        /// via <c>processLink</c>, together with the page of the first loaded target (if any).
        /// </summary>
        /// <param name="__session">The experiment session entry (not read by this implementation).</param>
        /// <param name="__task">The crawler domain task (not read by this implementation).</param>
        /// <param name="__wRecord">The site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // With link harvesting enabled there is nothing to pre-schedule here.
            if (__wRecord.tRecord.instance.settings.FRONTIER_doLinkHarvest)
            {
                return;
            }

            // The domain entry is looked up but not used further in this method.
            indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
            List<indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

            var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
            var seedPage = firstLoaded?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            foreach (indexPage indexed in indexedPages)
            {
                link pageLink = new link(indexed.url);
                __wRecord.context.processLink(pageLink, seedPage, false);
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Returns the index entry for this record's domain. The entry is fetched from the
 /// domain index table the first time it is needed and kept in <c>indexDomainInfo</c> afterwards.
 /// </summary>
 /// <returns>The value held in <c>indexDomainInfo</c> after the lookup.</returns>
 public indexDomain GetIndexInfo()
 {
     // Already resolved on an earlier call: reuse it.
     if (indexDomainInfo != null)
     {
         return indexDomainInfo;
     }

     indexDomainInfo = imbWEMManager.index.domainIndexTable.GetDomain(domainInfo.domainName);
     return indexDomainInfo;
 }
        /// <summary>
        /// Handles the DLC-initiated event. Unless a DLC file already exists for the domain and
        /// <c>doSchedulePagesWithDLCTable</c> is off, every page stored in the page index for the domain is
        /// handed to the crawl context via <c>processLink</c>. Pages whose URL does not contain the domain
        /// root name are logged (with a console beep) but are still processed.
        /// </summary>
        /// <param name="__session">The experiment session entry; supplies the DLC file location.</param>
        /// <param name="__task">The crawler domain task (not read by this implementation).</param>
        /// <param name="__wRecord">The site record whose context receives the links.</param>
        public override void eventDLCInitiated(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            // Read but not used further in this method.
            var sessionState = __session.state;

            indexDomain domainEntry = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);
            List<indexPage> indexedPages = imbWEMManager.index.pageIndexTable.GetPagesForDomain(__wRecord.domainInfo.domainName);

            var firstLoaded = __wRecord.context.targets.GetLoaded().FirstOrDefault();
            var seedPage = firstLoaded?.page;

            if (seedPage != null)
            {
                loger.AppendLine(__wRecord.domain + " seed page selected -> " + seedPage.url);
            }

            FileInfo dlcFile = __session.GetTFIDF_DLC_File(domainEntry);

            // An existing DLC file is enough, unless scheduling is explicitly requested as well.
            if (dlcFile.Exists && !imbWEMManager.settings.TFIDF.doSchedulePagesWithDLCTable)
            {
                loger.AppendLine(__wRecord.domain + " -> DLC cache found: " + dlcFile.FullName);
                return;
            }

            foreach (indexPage indexed in indexedPages)
            {
                link pageLink = new link(indexed.url);

                bool urlMatchesDomain = indexed.url.Contains(__wRecord.domainInfo.domainRootName);
                if (!urlMatchesDomain)
                {
                    loger.AppendLine(__wRecord.domain + " -X-> " + indexed.url + " Wrong link association?");
                    aceTerminalInput.doBeepViaConsole(1600, 200, 3);
                }

                __wRecord.context.processLink(pageLink, seedPage, false);
            }

            loger.AppendLine(__wRecord.domain + " -> " + __wRecord.web.webActiveLinks.Count + " targets set for load");
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Hands every page of <paramref name="domain"/> to the record's crawl context via <c>processLink</c>.
        /// A spider page built from the domain URL is passed along with each link.
        /// </summary>
        /// <param name="wRecord">The site record whose context receives the links.</param>
        /// <param name="domain">The index domain supplying the page set and the domain URL.</param>
        public void SetActiveTargets(modelSpiderSiteRecord wRecord, indexDomain domain)
        {
            List<indexPage> domainPages = domain.getPageSet();

            // Spider page wrapping a crawled page for the domain URL; shared by all links below.
            spiderPage originPage = new spiderPage(new crawledPage(domain.url, 0), 0, 0);

            foreach (var indexed in domainPages)
            {
                link pageLink = new link(indexed.url);
                wRecord.context.processLink(pageLink, originPage, false);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Returns the compiled DLC TF-IDF table for the record's domain, built with the heuristic procedure:
        /// the domain's terms are collected in one pass and added to a single term document, which is then compiled.
        /// When the DLC table file exists and <paramref name="__useExisting"/> is set, the table is loaded from that file instead.
        /// This method does not write the compiled table to disk itself.
        /// </summary>
        /// <param name="__wRecord">Site record supplying the domain and, when <paramref name="evaluator"/> is null, the evaluator.</param>
        /// <param name="loger">Log builder receiving progress messages.</param>
        /// <param name="__useExisting">If set to <c>true</c>, existing cache files are used when present.</param>
        /// <param name="__saveToCache">Forwarded to the term collection step, where it gates saving of the domain word list.</param>
        /// <param name="evaluator">Language evaluator; defaults to <c>__wRecord.tRecord.evaluator</c> when null.</param>
        /// <returns>The compiled table, either loaded from the existing file or freshly compiled.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);

            FileInfo TFIDF_DLC_File = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            if (TFIDF_DLC_File.Exists && __useExisting)
            {
                weightTableCompiled cached = new weightTableCompiled(TFIDF_DLC_File.FullName, true, idomain.domain + "_DLC_TF_IDF");

                loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
                return cached;
            }

            // <--------------- evaluator selection
            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

            termDocument domainTable = new termDocument();
            domainTable.expansion = 1;

            // Fix: the fourth argument is the save-to-cache flag. The original passed __useExisting twice,
            // so the caller's __saveToCache was silently ignored.
            var DLCTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __saveToCache, evaluator, idomain);

            domainTable.AddTokens(DLCTerms, loger);

            // Ratio of entries in the term document to the number of input terms.
            double tp = domainTable.Count().GetRatio(DLCTerms.Count);

            loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainTable.Count() + "] - Semantic compression: " + tp.ToString("P2"));

            weightTableCompiled TFIDF_DLC = domainTable.GetCompiledTable(loger);
            TFIDF_DLC.name = "DLC-TFIDF " + idomain.domain;

            return TFIDF_DLC;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Copies the state of a spider target into the page index entry for the target's URL
        /// and stores that entry in the page index table.
        /// </summary>
        /// <param name="target">The spider target to read from.</param>
        /// <param name="wRecord">The site record supplying the domain name.</param>
        /// <param name="idomain">Index domain; looked up from the domain index table when null.</param>
        /// <returns>The updated page index entry.</returns>
        public indexPage deployTarget(spiderTarget target, modelSpiderSiteRecord wRecord, indexDomain idomain)
        {
            indexPage entry = pageIndexTable.GetPageForUrl(target.url);

            if (idomain == null)
            {
                idomain = domainIndexTable.GetDomain(wRecord.domainInfo.domainName);
            }

            entry.url    = target.url;
            entry.tst    = target.tokens.ToList().toCsvInLine(",");
            entry.domain = wRecord.domainInfo.domainName;

            if (target.isLoaded)
            {
                var evaluation = target.evaluation;

                // Language details are copied only when an evaluation exists and produced a known language.
                if ((evaluation != null) && (evaluation.result_language != basicLanguageEnum.unknown))
                {
                    entry.langTestRatio     = evaluation.result_ratio;
                    entry.singleMatchTokens = evaluation.singleLanguageTokens.toCsvInLine(",");
                    entry.multiMatchTokens  = evaluation.multiLanguageTokens.toCsvInLine(",");
                    entry.wordCount         = evaluation.allContentTokens.Count();
                    entry.AllWords          = evaluation.allContentTokens.toCsvInLine();
                    entry.language          = evaluation.result_language.ToString();
                }

                // Relevant wins; otherwise an unknown evaluated language means "unknown", anything else "notRelevant".
                entry.relevancy = target.IsRelevant
                    ? indexPageRelevancyEnum.isRelevant
                    : (target.evaluatedLanguage == basicLanguageEnum.unknown
                        ? indexPageRelevancyEnum.unknown
                        : indexPageRelevancyEnum.notRelevant);

                entry.byteSize = target.page.spiderResult.page.result.byteSize;
            }

            pageIndexTable.AddOrUpdate(entry);

            return entry;
        }
        /// <summary>
        /// Populates this entry from the current state of <paramref name="wRecord"/>: timing relative to the
        /// time-series entries, block and term counts, page counts (relevant, irrelevant, loaded, duplicate),
        /// TF-IDF based scores and recall ratios, the targets of the last spider task result, and the
        /// E_PP / E_TP / E_TH ratios. Also sets <c>key</c> to the domain name followed by the iteration
        /// number formatted as "D3".
        /// </summary>
        /// <param name="wRecord">The site record to read from.</param>
        public void deploy(modelSpiderSiteRecord wRecord)
        {
            // --- timing -------------------------------------------------------------------------
            dataUnitSpiderIteration spi_first   = wRecord.timeseries.GetData().FirstOrDefault() as dataUnitSpiderIteration;
            // NOTE(review): spi_last is read but not used below; kept so the property access is unchanged.
            dataUnitSpiderIteration spi_last    = wRecord.timeseries.lastEntry as dataUnitSpiderIteration;
            dataUnitSpiderIteration spi_current = wRecord.timeseries.currentEntry as dataUnitSpiderIteration;

            if (spi_current != null)
            {
                time_duration_s = creationTime.Subtract(spi_current.rowCreated).TotalSeconds;
            }
            else
            {
                time_duration_s = 0;
            }

            if (spi_first != null)
            {
                time_sincefirst_s = creationTime.Subtract(spi_first.rowCreated).TotalSeconds;
            }
            else
            {
                time_sincefirst_s = 0;
            }

            indexDomain idomain = wRecord.GetIndexInfo();

            iteration = wRecord.iteration;

            // --- block and term counts ----------------------------------------------------------
            var targets = wRecord.context.targets;

            blocks_all      = targets.blocks.Count(false);
            blocks_relevant = targets.blocks.Count(true);

            terms_all      = targets.termsAll.Count();
            terms_relevant = targets.termSerbian.Count();

            var TFIDF = wRecord.MasterTFIDF;

            var mchs = TFIDF.GetMatches(targets.termSerbian);

            key = wRecord.domainInfo.domainName + iteration.ToString("D3");

            // --- frontier ranking algorithm runtime ---------------------------------------------
            double fraDuration = 0;
            int    rec         = 0;

            foreach (frontierRankingAlgorithmIterationRecord gen in wRecord.frontierDLC.generalRecords)
            {
                rec++;
                fraDuration += gen.duration;
            }

            FRA_SummaryRuntime = fraDuration.GetRatio((double)rec);

            FRA_TimePercent = FRA_SummaryRuntime.GetRatio(time_duration_s);

            var rtake = wRecord.tRecord.measureTaker.GetLastTake();

            if (rtake != null)
            {
                CPU = rtake.cpuRateOfProcess;
            }

            // NOTE(review): modulesContained is accumulated but never stored anywhere in this method;
            // the loop is kept as-is because the intended destination is not visible here.
            int modulesContained = 0;

            if (imbWEMManager.settings.directReportEngine.DR_ReportModules)
            {
                foreach (moduleDLCRecord mod in wRecord.frontierDLC)
                {
                    if (mod != null)
                    {
                        modulesContained += mod.GetLastEntry().accumulated;
                    }
                }
            }

            // --- page counts --------------------------------------------------------------------
            int relCount  = 0;
            int irelCount = 0;
            int lCount    = 0;
            int dCount    = 0;

            double i_pi_nominal = 0;

            // Hash set instead of a list: membership test and insert in one O(1) call.
            HashSet<string> seenHashes = new HashSet<string>();

            foreach (spiderTarget t in targets.GetLoaded())
            {
                indexPage ipage = t.GetIndexPage();

                if (ipage != null)
                {
                    i_pi_nominal += ipage.InfoPrize;
                }

                // A target flagged as duplicate is still counted as a regular page the first time
                // its page hash is seen; only later targets with the same hash count as duplicates.
                bool isDuplicate = t.isDuplicate;

                if (isDuplicate && seenHashes.Add(t.pageHash))
                {
                    isDuplicate = false;
                }

                if (isDuplicate)
                {
                    dCount++;
                    continue;
                }

                if (t.IsRelevant)
                {
                    relCount++;
                }
                else
                {
                    irelCount++;
                }
                lCount++;
            }

            relevantPageCount   = relCount;
            irrelevantPageCount = irelCount;
            loadedPageCount     = lCount;
            duplicateCount      = dCount;

            // --- lemma and information-prize ratios -----------------------------------------------
            int mchs_c  = 0;
            int id_lm_c = 0;

            if (idomain != null)
            {
                id_lm_c = idomain.Lemmas;
            }
            if (mchs != null)
            {
                mchs_c = mchs.Count();
            }

            double i_lm_harvest = mchs_c.GetRatio(loadedPageCount);

            IP = TFIDF.GetScoreForMatch(targets.termSerbian);

            double i_lm_recall = mchs_c.GetRatio(id_lm_c);
            if (i_lm_recall > 1)
            {
                i_lm_recall = 1;
            }

            // Domain-relative recalls are only updated when an index entry for the domain is available.
            if (idomain != null)
            {
                IP_recall   = i_pi_nominal.GetRatio(idomain.InfoPrize).ClipToK();
                Term_recall = targets.termSerbian.Count().GetRatio(idomain.Words).ClipToK();
            }
            Page_recall = relevantPageCount.GetRatio(wRecord.pageRecallTarget).ClipToK();

            // From here on i_pi_nominal holds the per-loaded-page average rather than the sum.
            i_pi_nominal = i_pi_nominal.GetRatio(loadedPageCount);
            double i_pi_harvest = IP.GetRatio((double)lCount);

            // --- spider task results ------------------------------------------------------------
            int              rCount     = 0;
            spiderTaskResult lastResult = null;

            foreach (spiderTaskResult r in wRecord.spiderTaskResults)
            {
                lastResult = r;
                rCount     = rCount + r.Count;
            }

            realLoadsCount = rCount;

            if (lastResult != null)
            {
                targetUrl       = "";
                targetLanguage  = "";
                targetEvalRatio = "";

                foreach (spiderTaskResultItem item in lastResult.items.Values)
                {
                    targetUrl = targetUrl.add(item.target.url, ",");

                    var t = targets.GetByTarget(item.target);
                    if (t == null)
                    {
                        continue;
                    }

                    if (t.evaluation != null)
                    {
                        targetLanguage  = targetLanguage.add(t.evaluatedLanguage.ToString(), ";");
                        targetEvalRatio = targetEvalRatio.add(t.evaluation.result_ratio.ToString(), ";");
                    }
                    else if (t.isDuplicate)
                    {
                        targetLanguage  = targetLanguage.add("duplicate", ";");
                        targetEvalRatio = targetEvalRatio.add("duplicate", ";");
                    }
                    else
                    {
                        targetLanguage  = targetLanguage.add("unknown", ";");
                        targetEvalRatio = targetEvalRatio.add("unknown", ";");
                    }
                }
            }

            // --- E_PP / E_TP / E_TH (guarded against zero numerators and denominators) ------------
            if ((relevantPageCount == 0) || (loadedPageCount == 0))
            {
                E_PP = 0;
            }
            else
            {
                E_PP = (double)relevantPageCount / (double)loadedPageCount;
            }

            if ((targets.termSerbian.Count == 0) || (targets.termsAll.Count == 0) || (loadedPageCount == 0))
            {
                E_TP = 0;
                E_TH = 0;
            }
            else
            {
                E_TP = (double)targets.termSerbian.Count / (double)targets.termsAll.Count;
                E_TH = (double)targets.termSerbian.Count / (double)loadedPageCount;
            }

            IPnominal    = i_pi_nominal;
            IP_collected = i_pi_harvest;

            Lm_collected = i_lm_harvest;
            Lm_recall    = i_lm_recall;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Performs full domain reevaluation: every loaded target of the record is deployed to the page index,
        /// (re)evaluated against <paramref name="mTFIDF"/> where needed, and the domain index entry is updated
        /// when the <c>plugIn_indexDBUpdater_updateDomainEntry</c> setting is on. Each target and the record
        /// itself are disposed once processed.
        /// </summary>
        /// <param name="settings">The settings; <c>plugIn_indexDBUpdater_optimizedMode</c> allows already evaluated pages to be skipped.</param>
        /// <param name="loger">The loger; may be null, in which case nothing is logged by this method.</param>
        /// <param name="__wRecord">The w record.</param>
        /// <param name="evaluator">The evaluator, passed on to <c>GetTermsForPage</c>.</param>
        /// <param name="mTFIDF">Compiled weight table against which page terms are matched.</param>
        public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);

            idomain.url = __wRecord.domain;

            double dIP = 0; // sum of InfoPrize over all processed pages
            int    p   = 0; // number of processed targets

            List<string> dDistinctTerms = new List<string>();
            List<string> dLemmas        = new List<string>();
            List<string> dWords         = new List<string>();
            List<string> urls           = new List<string>();

            // Turns false as soon as one page is taken over without re-evaluation.
            bool doEvalD = true;

            foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

                bool doEval = true;

                if (settings.plugIn_indexDBUpdater_optimizedMode)
                {
                    if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
                    {
                        // Already evaluated page: redo it only when its stored word or lemma list is missing.
                        doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
                    }
                }

                if (doEval)
                {
                    List<string> terms;

                    if (ipage.AllWords.isNullOrEmpty())
                    {
                        terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
                    }
                    else
                    {
                        terms = ipage.AllWords.SplitSmart(",", "", true);
                    }

                    ipage.AllWords = terms.toCsvInLine();

                    List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

                    List<string> lemmas = new List<string>();

                    if (ipage.AllLemmas.isNullOrEmpty())
                    {
                        lemmas.AddRange(mchl.Select(x => x.nominalForm));
                    }
                    else
                    {
                        lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
                    }

                    double IP = 0;

                    foreach (weightTableTermCompiled cterm in mchl)
                    {
                        IP += cterm.tf_idf;

                        // Terms with df == 1 are collected as the domain's distinct lemmas.
                        if (cterm.df == 1)
                        {
                            dDistinctTerms.AddUnique(cterm.nominalForm);
                        }
                    }

                    ipage.InfoPrize = IP;

                    dIP += IP;

                    ipage.Lemmas = lemmas.Count;

                    ipage.AllLemmas = lemmas.toCsvInLine();

                    dWords.AddRange(terms);
                    dLemmas.AddRange(lemmas);

                    ipage.Note = "indexUpdate" + SessionID;

                    imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
                }
                else
                {
                    dIP    += ipage.InfoPrize;
                    doEvalD = false;
                }

                urls.Add(ipage.url);

                p++;

                // Fix: the logger is optional (the domain summary below already checks for null),
                // so the per-page line must be guarded as well.
                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
                }
                target.Dispose();
            }

            // NOTE(review): this reads the global indexEngine settings rather than the `settings` parameter - confirm intended.
            if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
            {
                if (!doEvalD)
                {
                    // At least one page was not re-evaluated, so the collected lemma lists are incomplete:
                    // take the lemma count from the stored DLC TF-IDF table instead.
                    var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
                    int dlc_c  = dlc_tf.Count;

                    idomain.TFIDFcompiled = (dlc_c > 0);
                    idomain.Lemmas        = dlc_c;
                }
                else
                {
                    idomain.Lemmas         = dLemmas.Count;
                    idomain.Words          = dWords.Count;
                    idomain.TFIDFcompiled  = (dLemmas.Count > 0);
                    idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
                    idomain.AllLemmas      = dLemmas.toCsvInLine();
                    idomain.AllWords       = dWords.toCsvInLine();
                }
                idomain.InfoPrize = dIP;

                var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

                idomain.relevantPages        = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
                idomain.notRelevantPages     = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
                idomain.detected             = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
                idomain.Crawled              = urlAssert.certainty;
                idomain.RelevantContentRatio = urlAssert.relevant;

                string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
                if (loger != null)
                {
                    loger.AppendLine(rpp);
                }

                imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
            }

            imbWEMManager.index.wRecordsDeployed++;

            __wRecord.Dispose();
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Returns the compiled DLC TF-IDF table for the record's domain. An existing table file is loaded when
        /// present and <paramref name="__useExisting"/> is set. Otherwise the table is built either by the heuristic
        /// procedure (setting <c>doUseHeuristicDLCTFIDFConstruction</c>) or page by page from the relevant loaded targets.
        /// A freshly built table updates the domain's lemma count, which is written back to the domain index table.
        /// </summary>
        /// <param name="__wRecord">The w record.</param>
        /// <param name="loger">The loger.</param>
        /// <param name="__useExisting">if set to <c>true</c>, an existing table file is used when present.</param>
        /// <param name="__saveToCache">if set to <c>true</c>, the freshly built table is saved to the table file.</param>
        /// <param name="evaluator">The evaluator; defaults to <c>__wRecord.tRecord.evaluator</c> when null.</param>
        /// <returns>The loaded or freshly compiled table.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            indexDomain idomain   = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
            FileInfo    cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            // Cache hit: load and return without touching the index.
            if (cacheFile.Exists && __useExisting)
            {
                weightTableCompiled cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");

                loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
                return cached;
            }

            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            weightTableCompiled compiled;

            if (imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
            {
                compiled = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
            }
            else
            {
                loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

                termDocumentSet domainSet = new termDocumentSet("DomainTFIDF_source");

                var loadedTargets = __wRecord.context.targets.GetLoaded();
                int totalTargets  = loadedTargets.Count;
                int processed     = 0;
                int sinceReport   = 0;

                int termsIn  = 0;
                int termsOut = 0;

                foreach (spiderTarget target in loadedTargets)
                {
                    processed++;
                    sinceReport++;
                    double progress = processed.GetRatio(totalTargets);

                    // Only relevant targets contribute a term document.
                    if (target.IsRelevant)
                    {
                        var pageWords = GetTermsForPage(target, idomain, null, evaluator, loger);
                        termsIn += pageWords.Count;

                        termDocument pageDocument = domainSet.AddTable(target.pageHash) as termDocument;
                        pageDocument.expansion = 1;
                        pageDocument.AddTokens(pageWords, loger);

                        termsOut += pageDocument.Count();
                    }

                    // Progress line after every 11th target.
                    if (sinceReport > 10)
                    {
                        sinceReport = 0;
                        double compression = termsOut.GetRatio(termsIn);
                        aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
                    }
                }

                loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainSet.CountAllDocuments() + "]");

                compiled      = domainSet.AggregateDocument.GetCompiledTable(loger);
                compiled.name = "DLC-TFIDF " + idomain.domain;
            }

            idomain.Lemmas = compiled.Count;

            if (__saveToCache)
            {
                bool saved = compiled.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite);

                if (saved)
                {
                    loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName);
                }
                else
                {
                    loger.log("[" + idomain.domain + "] DLC TF-IDF compiled table save failed");
                }
            }

            imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);

            return compiled;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Returns the term list for a target page. A saved word list file is used when present and the
        /// <c>doUseSavedPageWordlists</c> setting is on. Otherwise the transliterated page text is evaluated and,
        /// if the result language is Serbian, its tokens are filtered and passed through <c>encodeTwins</c>.
        /// For any other result language the list is empty. The result is written to the word list file
        /// when <c>doSavePageWordlist</c> is on.
        /// </summary>
        /// <param name="target">The spider target supplying URL and page text.</param>
        /// <param name="idomain">Index domain; fetched (or created) from the domain index table when null.</param>
        /// <param name="ipage">Index page; fetched (or created) from the page index table by the MD5 hash of the URL when null.</param>
        /// <param name="evaluator">Language evaluator; defaults to the evaluator of the target's record when null.</param>
        /// <param name="loger">Log builder (not used by this method).</param>
        /// <returns>The list of terms for the page.</returns>
        public List <string> GetTermsForPage(spiderTarget target, indexDomain idomain = null, indexPage ipage = null, multiLanguageEvaluator evaluator = null, builderForLog loger = null)
        {
            if (idomain == null)
            {
                idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(target.parent.wRecord.domain);
            }
            if (ipage == null)
            {
                ipage = imbWEMManager.index.pageIndexTable.GetOrCreate(md5.GetMd5Hash(target.url));
            }

            FileInfo wordListFile = GetWordList_File(idomain, ipage);

            // Saved word list available and allowed: no evaluation needed.
            if (imbWEMManager.settings.TFIDF.doUseSavedPageWordlists && wordListFile.Exists)
            {
                return wordListFile.FullName.openFileToList(true);
            }

            string pageContent = target.pageText.transliterate();

            if (evaluator == null)
            {
                evaluator = target.parent.wRecord.tRecord.evaluator;
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(pageContent);

            List<string> pageTerms = new List<string>();

            if (evaluation.result_language == basicLanguageEnum.serbian)
            {
                List<string> candidates = new List<string>(evaluation.singleLanguageTokens);

                if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
                {
                    candidates.AddRange(evaluation.multiLanguageTokens);
                }

                // Drop tokens that are not clean words, and clean words that are symbolic content only.
                candidates.RemoveAll(x => !x.isCleanWord() || x.isSymbolicContentOnly());

                pageTerms.AddRange(semanticLexiconManager.lexiconCache.encodeTwins(candidates));
            }

            if (imbWEMManager.settings.TFIDF.doSavePageWordlist)
            {
                pageTerms.saveContentOnFilePath(wordListFile.FullName);
            }

            return pageTerms;
        }
Ejemplo n.º 11
0
        //public webSitePageTFSet GetTFIDF_MasterConstruct()
        //{
        //    if (globalTFIDFSet == null)
        //    {
        //        globalTFIDFSet = new webSitePageTFSet(SessionID, "Temporary TF-IDF table for master table construction");

        //    }
        //    return globalTFIDFSet;
        //}

        /// <summary>
        /// Collects the DLC term list for a domain in one pass. The domain word list file is loaded when it exists
        /// and <paramref name="__useExisting"/> is set. Otherwise the text of every relevant loaded target is
        /// tokenized, all tokens are evaluated together, and the resulting tokens are passed through <c>encodeTwins</c>.
        /// The list is saved to the word list file when both the <c>doSaveDomainWordList</c> setting and
        /// <paramref name="__saveToCache"/> are on.
        /// </summary>
        /// <param name="__wRecord">Site record supplying the loaded targets.</param>
        /// <param name="loger">Log builder passed to the console progress output.</param>
        /// <param name="__useExisting">Allows use of an existing word list file.</param>
        /// <param name="__saveToCache">Allows saving of the word list file.</param>
        /// <param name="evaluator">Language evaluator applied to the collected tokens.</param>
        /// <param name="idomain">Index domain that determines the word list file.</param>
        /// <returns>The DLC term list.</returns>
        private List <string> GetDLCTerms_Heuristics(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator, indexDomain idomain)
        {
            FileInfo wordListFile = GetWordList_File(idomain);

            if (wordListFile.Exists && __useExisting)
            {
                return wordListFile.FullName.openFileToList(true);
            }

            List<string> rawTokens = new List<string>();

            var loadedTargets = __wRecord.context.targets.GetLoaded();
            int totalTargets  = loadedTargets.Count;
            int processed     = 0;
            int sinceReport   = 0;

            foreach (spiderTarget target in loadedTargets)
            {
                processed++;
                sinceReport++;
                double progress = processed.GetRatio(totalTargets);

                if (target.IsRelevant)
                {
                    string pageContent = WebUtility.HtmlDecode(target.pageText.transliterate());

                    rawTokens.AddRange(pageContent.getTokens(true, true, true, true, 4));
                }

                // Progress line after every 11th target.
                if (sinceReport > 10)
                {
                    sinceReport = 0;
                    aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "]", loger, false, 0);
                }
            }

            multiLanguageEvaluation evaluation = evaluator.evaluate(rawTokens, null, null);

            List<string> dlcTerms = new List<string>();
            dlcTerms.AddRange(evaluation.singleLanguageTokens);

            if (!imbWEMManager.settings.TFIDF.doUseOnlySingleMatch)
            {
                dlcTerms.AddRange(evaluation.multiLanguageTokens);
            }

            dlcTerms = semanticLexiconManager.lexiconCache.encodeTwins(dlcTerms);

            if (imbWEMManager.settings.TFIDF.doSaveDomainWordList && __saveToCache)
            {
                dlcTerms.saveContentOnFilePath(wordListFile.FullName);
            }

            return dlcTerms;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Creates a compiled weight table from the domain's DLC file ("dlc_" + hash code + ".xml")
        /// located in the TF-IDF construct folder.
        /// </summary>
        /// <param name="idomain">Index domain supplying the hash code and the domain name.</param>
        /// <param name="mode">Not used by this method.</param>
        /// <returns>A new <c>weightTableCompiled</c> constructed from the file's full path.</returns>
        public weightTableCompiled GetTFIDF_DLC(indexDomain idomain, getWritableFileMode mode = getWritableFileMode.newOrExisting)
        {
            var tablePath = TFIDF_ConstructFolder.pathFor("dlc_" + idomain.HashCode + ".xml");

            FileInfo tableFile = new FileInfo(tablePath);

            return new weightTableCompiled(tableFile.FullName, true, idomain.domain);
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Resolves the domain's DLC file ("dlc_" + hash code + ".xml") inside the TF-IDF construct folder.
 /// </summary>
 /// <param name="idomain">Index domain supplying the hash code.</param>
 /// <param name="mode">File mode passed on to <c>getWritableFile</c>.</param>
 /// <returns>The file information returned by <c>getWritableFile</c>.</returns>
 public FileInfo GetTFIDF_DLC_File(indexDomain idomain, getWritableFileMode mode = getWritableFileMode.newOrExisting)
 {
     var fileName = "dlc_" + idomain.HashCode + ".xml";
     var filePath = TFIDF_ConstructFolder.pathFor(fileName);

     return filePath.getWritableFile(mode);
 }
Ejemplo n.º 14
0
 /// <summary>
 /// Resolves the domain's word list file ("d_words_" + hash code + ".txt") inside the TF-IDF construct folder,
 /// using the <c>newOrExisting</c> file mode.
 /// </summary>
 /// <param name="idomain">Index domain supplying the hash code.</param>
 /// <returns>The file information returned by <c>getWritableFile</c>.</returns>
 public FileInfo GetWordList_File(indexDomain idomain)
 {
     var fileName = "d_words_" + idomain.HashCode + ".txt";
     var filePath = TFIDF_ConstructFolder.pathFor(fileName);

     return filePath.getWritableFile(getWritableFileMode.newOrExisting);
 }