/// <summary>
        /// Loads every cached DLC TF-IDF table, i.e. each file matching <c>dlc_*.xml</c> found through <c>TFIDF_ConstructFolder</c>.
        /// </summary>
        /// <param name="loger">Optional log builder; when supplied it receives the number of detected files and a progress percentage per loaded file.</param>
        /// <returns>One compiled weight table per detected file, in the order the files were returned by the folder search.</returns>
        public List <weightTableCompiled> GetTFIDF_DLC_AllCached(builderForLog loger = null)
        {
            List<string> cacheFiles = TFIDF_ConstructFolder.findFiles("dlc_*.xml");
            bool hasLog = loger != null;

            if (hasLog)
            {
                loger.log("[" + cacheFiles.Count + "] DLC TFIDF files detected in the cache folder [" + TFIDF_ConstructFolder.path + "]");
            }

            int total = cacheFiles.Count;
            var tables = new List<weightTableCompiled>();

            for (int index = 0; index < total; index++)
            {
                // 1-based position of the file; formatted as five digits for the third constructor argument
                int ordinal = index + 1;

                tables.Add(new weightTableCompiled(cacheFiles[index], true, ordinal.ToString("D5")));

                double progress = ordinal.GetRatio(total);
                if (hasLog)
                {
                    aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
                }
            }

            return tables;
        }
        /// <summary>
        /// Starts a session: stores the identifiers and crawl state, creates the session and crawl report folders,
        /// binds the index / TF-IDF folders, loads the global TF-IDF table if none is held yet and prepares the sample list.
        /// </summary>
        /// <param name="__CrawlID">Crawl identifier; stored in <c>CrawlID</c> and used as the name of the crawl report sub-folder.</param>
        /// <param name="__indexID">Not used by the active code of this method (only referenced in a commented-out call).</param>
        /// <param name="__SessionID">Session identifier; stored in <c>SessionID</c> and used as the name of the session report folder.</param>
        /// <param name="__state">Crawl job context; stored in <c>state</c>.</param>
        public void StartSession(string __CrawlID, indexPerformanceEntry __indexID, string __SessionID, ICrawlJobContext __state)
        {
            CrawlID   = __CrawlID;
            SessionID = __SessionID;
            state     = __state;

            // Root node under which the session report folder is created
            var tmpFolder = new folderNode("reportOutput", "reporting module", "");

            sessionReportFolder = tmpFolder.createDirectory(SessionID, "", imbWEMManager.settings.directReportEngine.doAutoRenameSessionFolder); //  Directory.CreateDirectory(path);

            // Test identifier is the crawl and session identifiers joined by a dash
            TestID = CrawlID + "-" + SessionID;

            ReportPath = sessionReportFolder.path;

            sessionCrawlerFolder = sessionReportFolder.createDirectory(__CrawlID, "Report folder for Crawl [" + __CrawlID + "] - part of session: " + SessionID);



            // Both the index sub-folder and the TF-IDF construct folder point to the shared index folder;
            // the per-index / per-session sub-directory creation is commented out.
            indexSubFolder = imbWEMManager.index.folder;        //.createDirectory(__indexID.IndexRepository, "Index folder fo sub index", false);

            TFIDF_ConstructFolder = imbWEMManager.index.folder; //.createDirectory(SessionID, "TFIDF cache files for this session", false);

            FileInfo master_file = GetTFIDF_Master_File();

            // Load the global table only when no instance is held yet, and mark it read-only
            if (globalTFIDFCompiled == null)
            {
                globalTFIDFCompiled = new weightTableCompiled(master_file.FullName, true, SessionID);
                globalTFIDFCompiled.ReadOnlyMode = true;
            }

            SampleRandomOrder = imbWEMManager.settings.crawlerJobEngine.doRandomizeSampleOrder;
            //SampleSource = state.sampleTags.add(state.sampleFile, ";");

            SampleListHash = randomizeSample();
        }
Esempio n. 3
0
        // <--------------------------- data getters

        /// <summary>
        /// Drops the references held by this instance by setting each of the members below to <c>null</c>.
        /// No other clean-up is performed here.
        /// </summary>
        public void Dispose()
        {
            timeseries        = null;
            lastInput         = null;
            lastOutput        = null;
            currentModuleData = null;
            web      = null;
            wProfile = null;
            listOfDuplicatedPages = null;
            resultPageSet         = null;
            spiderTaskResults     = null;
            crossLinkStats        = null;
            // The crawler context reference is intentionally left untouched (assignment disabled):
            //_crawlerContext = null;
            linkHierarchy = null;
            _MasterTFIDF  = null;
        }
        /// <summary>
        /// Returns the DLC TF-IDF table for the domain of <paramref name="__wRecord"/>: the cached file when it exists and
        /// <paramref name="__useExisting"/> is set, otherwise a table compiled from the terms returned by <c>GetDLCTerms_Heuristics</c>.
        /// </summary>
        /// <param name="__wRecord">Site record whose domain is processed.</param>
        /// <param name="loger">Log builder; it is called without a null check, so it must not be <c>null</c>.</param>
        /// <param name="__useExisting">When <c>true</c>, an existing cache file is loaded instead of building a new table.</param>
        /// <param name="__saveToCache">Not read by this method.</param>
        /// <param name="evaluator">Evaluator to use; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
        /// <returns>The loaded or newly compiled table. This method itself does not write the table to disk.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC_Heuristic(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
            FileInfo cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            // Fast path: reuse the cached table
            if (cacheFile.Exists && __useExisting)
            {
                var cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");

                loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
                return cached;
            }

            // Evaluator selection: fall back to the one attached to the site record
            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            loger.log("DLC TF-IDF heuristic construction for: " + idomain.domain + " initiated.");

            termDocument domainTable = new termDocument();
            domainTable.expansion = 1;

            // NOTE(review): __useExisting is passed for two consecutive arguments while __saveToCache is never used;
            // the second one was possibly meant to be __saveToCache - confirm against the signature of GetDLCTerms_Heuristics.
            var heuristicTerms = GetDLCTerms_Heuristics(__wRecord, loger, __useExisting, __useExisting, evaluator, idomain);

            domainTable.AddTokens(heuristicTerms, loger);

            // Ratio of table entries to supplied terms, reported as "semantic compression"
            double compression = domainTable.Count().GetRatio(heuristicTerms.Count);

            loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainTable.Count() + "] - Semantic compression: " + compression.ToString("P2"));

            weightTableCompiled compiled = domainTable.GetCompiledTable(loger);
            compiled.name = "DLC-TFIDF " + idomain.domain;

            return compiled;
        }
        /// <summary>
        /// Provides a master TF-IDF table read from the master file.
        /// </summary>
        /// <param name="loadNew">
        /// When <c>true</c> (default) a fresh, read-only instance is created on every call and the shared instance is left untouched.
        /// When <c>false</c> the shared <c>globalTFIDFCompiled</c> instance is returned, being loaded first if it is still <c>null</c>.
        /// </param>
        /// <returns>The freshly loaded table or the shared instance, depending on <paramref name="loadNew"/>.</returns>
        public weightTableCompiled LoadNewTFIDF_Master(bool loadNew = true)
        {
            FileInfo masterFile = GetTFIDF_Master_File();

            if (!loadNew)
            {
                // Shared instance: load it once and reuse it afterwards
                if (globalTFIDFCompiled == null)
                {
                    globalTFIDFCompiled = new weightTableCompiled(masterFile.FullName, true, SessionID);
                    globalTFIDFCompiled.ReadOnlyMode = true;
                }

                return globalTFIDFCompiled;
            }

            // Independent instance for the caller
            return new weightTableCompiled(masterFile.FullName, true, SessionID)
            {
                ReadOnlyMode = true
            };
        }
        /// <summary>
        /// Handler for the "DLC finished" event: asks the session to load or build the DLC TF-IDF table for the given site record,
        /// using the cache options from <c>imbWEMManager.settings.TFIDF</c>. The returned table is not used further here.
        /// </summary>
        /// <param name="__session">Session entry that owns the TF-IDF construction.</param>
        /// <param name="__task">Not used by this handler.</param>
        /// <param name="__wRecord">Site record passed on to the session.</param>
        public override void eventDLCFinished(experimentSessionEntry __session, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
        {
            var tfidfSettings = imbWEMManager.settings.TFIDF;

            __session.GetOrCreateTFIDF_DLC(
                __wRecord,
                loger,
                tfidfSettings.doUseCachedDLCTables,
                tfidfSettings.doSaveCacheOfDLCTables,
                evaluator);

            // A disabled (commented-out) legacy implementation used to follow this call. It iterated the index pages of the
            // domain, stored distinct lemmas and relevant terms on each page entry, saved a per-page table as XML, added the
            // collected terms to the global TF-IDF construct and updated the lemma count of the domain entry.
            // It was not executed and has been left out here.
        }
        /// <summary>
        /// Performs full domain reevaluation: deploys every loaded target of the site record into the page index,
        /// (re)computes words, lemmas and InfoPrize for pages that need it, and optionally updates the domain index entry.
        /// </summary>
        /// <remarks>
        /// Every processed target and, at the end, <paramref name="__wRecord"/> itself are disposed by this method.
        /// </remarks>
        /// <param name="settings">The settings; <c>plugIn_indexDBUpdater_optimizedMode</c> allows already evaluated pages to be skipped.</param>
        /// <param name="loger">The loger. May be <c>null</c> for the progress lines written by this method; it is also handed to <c>GetTermsForPage</c>.</param>
        /// <param name="__wRecord">The w record.</param>
        /// <param name="evaluator">The evaluator, passed to <c>GetTermsForPage</c>.</param>
        /// <param name="mTFIDF">Master TF-IDF table used to match page terms. It is dereferenced without a null check whenever a page is evaluated.</param>
        public void doDomainEvaluation(IndexEngineConfiguration settings, builderForLog loger, modelSpiderSiteRecord __wRecord, multiLanguageEvaluator evaluator, weightTableCompiled mTFIDF)
        {
            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetDomain(__wRecord.domainInfo.domainName);

            idomain.url = __wRecord.domain;

            double dIP = 0; // sum of InfoPrize over all pages of the domain
            int    p   = 0; // number of pages processed

            List<string> dDistinctTerms = new List<string>();
            List<string> dLemmas        = new List<string>();
            List<string> dWords         = new List<string>();
            List<string> urls           = new List<string>();

            // Becomes false as soon as one page is taken over without reevaluation
            bool doEvalD = true;

            foreach (spiderTarget target in __wRecord.context.targets.GetLoaded())
            {
                indexPage ipage = imbWEMManager.index.deployTarget(target, __wRecord, idomain);

                bool doEval = true;

                if (settings.plugIn_indexDBUpdater_optimizedMode)
                {
                    if ((ipage.InfoPrize > 0) && (ipage.Lemmas > 0) && (ipage.relevancyText == nameof(indexPageRelevancyEnum.isRelevant)))
                    {
                        // Page already carries results: reevaluate only when the word or lemma list is missing
                        doEval = ipage.AllWords.isNullOrEmpty() || ipage.AllLemmas.isNullOrEmpty();
                    }
                }

                if (doEval)
                {
                    // Words: reuse the stored list when present, otherwise extract them from the page
                    List<string> terms;
                    if (ipage.AllWords.isNullOrEmpty())
                    {
                        terms = GetTermsForPage(target, idomain, ipage, evaluator, loger);
                    }
                    else
                    {
                        terms = ipage.AllWords.SplitSmart(",", "", true);
                    }

                    ipage.AllWords = terms.toCsvInLine();

                    List<IWeightTableTerm> mchl = mTFIDF.GetMatches(terms);

                    // Lemmas: reuse the stored list when present, otherwise take the nominal forms of the matches
                    List<string> lemmas;
                    if (ipage.AllLemmas.isNullOrEmpty())
                    {
                        lemmas = mchl.Select(x => x.nominalForm).ToList();
                    }
                    else
                    {
                        lemmas = ipage.AllLemmas.SplitSmart(",", "", true);
                    }

                    // InfoPrize of the page is the sum of tf-idf over the matched terms
                    double IP = 0;
                    foreach (weightTableTermCompiled cterm in mchl)
                    {
                        IP += cterm.tf_idf;

                        if (cterm.df == 1)
                        {
                            dDistinctTerms.AddUnique(cterm.nominalForm);
                        }
                    }

                    ipage.InfoPrize = IP;
                    dIP += IP;

                    ipage.Lemmas    = lemmas.Count;
                    ipage.AllLemmas = lemmas.toCsvInLine();

                    dWords.AddRange(terms);
                    dLemmas.AddRange(lemmas);

                    ipage.Note = "indexUpdate" + SessionID;

                    imbWEMManager.index.pageIndexTable.AddOrUpdate(ipage);
                }
                else
                {
                    // Existing page results are kept as they are
                    dIP    += ipage.InfoPrize;
                    doEvalD = false;
                }

                urls.Add(ipage.url);

                p++;

                // Guarded like the domain summary below, so a missing logger does not abort the evaluation
                if (loger != null)
                {
                    loger.AppendLine(string.Format("[{0,25}] [{1,70}] IP[{2,7}] LM[{3,6}]", idomain.domain, ipage.url.toWidthMaximum(60), ipage.InfoPrize.ToString("F4"), ipage.Lemmas.ToString("D5")));
                }

                target.Dispose();
            }

            if (imbWEMManager.settings.indexEngine.plugIn_indexDBUpdater_updateDomainEntry)
            {
                if (!doEvalD)
                {
                    // At least one page was not reevaluated: take the lemma count from the DLC table of the domain
                    var dlc_tf = imbWEMManager.index.experimentEntry.GetTFIDF_DLC(idomain);
                    int dlc_c  = dlc_tf.Count;

                    idomain.TFIDFcompiled = (dlc_c > 0);
                    idomain.Lemmas        = dlc_c;
                }
                else
                {
                    idomain.Lemmas         = dLemmas.Count;
                    idomain.Words          = dWords.Count;
                    idomain.TFIDFcompiled  = (dLemmas.Count > 0);
                    idomain.DistinctLemmas = dDistinctTerms.toCsvInLine();
                    idomain.AllLemmas      = dLemmas.toCsvInLine();
                    idomain.AllWords       = dWords.toCsvInLine();
                }

                idomain.InfoPrize = dIP;

                var urlAssert = imbWEMManager.index.pageIndexTable.GetUrlAssertion(urls);

                idomain.relevantPages        = urlAssert[indexPageEvaluationEntryState.isRelevant].Count;
                idomain.notRelevantPages     = urlAssert[indexPageEvaluationEntryState.notRelevant].Count;
                idomain.detected             = urlAssert[indexPageEvaluationEntryState.haveNoEvaluationEntry].Count;
                idomain.Crawled              = urlAssert.certainty;
                idomain.RelevantContentRatio = urlAssert.relevant;

                string rpp = string.Format("[{0,25}] Pages [{1,10}] IP[{2,10}] LM[{3,10}]", idomain.domain, p, idomain.InfoPrize.ToString("F5"), idomain.Lemmas.ToString("D7"));
                if (loger != null)
                {
                    loger.AppendLine(rpp);
                }

                imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);
            }

            imbWEMManager.index.wRecordsDeployed++;

            __wRecord.Dispose();
        }
        /// <summary>
        /// Returns the DLC TF-IDF table of the domain behind <paramref name="__wRecord"/>. A cached file is loaded when present and allowed;
        /// otherwise the table is built (heuristically or from all relevant loaded targets, depending on
        /// <c>imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction</c>), optionally saved, and the domain entry is updated.
        /// </summary>
        /// <param name="__wRecord">The w record.</param>
        /// <param name="loger">The loger; it is called without a null check.</param>
        /// <param name="__useExisting">if set to <c>true</c>, an existing cache file is loaded and returned directly.</param>
        /// <param name="__saveToCache">if set to <c>true</c>, a newly built table is written to the cache file.</param>
        /// <param name="evaluator">The evaluator; when <c>null</c>, the evaluator of <c>__wRecord.tRecord</c> is used.</param>
        /// <returns>The loaded or newly built compiled table.</returns>
        public weightTableCompiled GetOrCreateTFIDF_DLC(modelSpiderSiteRecord __wRecord, builderForLog loger, bool __useExisting, bool __saveToCache, multiLanguageEvaluator evaluator = null)
        {
            // The in-loop progress line is written once this many targets have been exceeded since the previous one
            const int reportEvery = 10;

            indexDomain idomain = imbWEMManager.index.domainIndexTable.GetOrCreate(__wRecord.domain);
            FileInfo cacheFile = GetTFIDF_DLC_File(idomain, getWritableFileMode.existing);

            if (cacheFile.Exists && __useExisting)
            {
                var cached = new weightTableCompiled(cacheFile.FullName, true, idomain.domain + "_DLC_TF_IDF");

                loger.log("DLC TF-IDF[" + cached.Count + "] cache found for: " + idomain.domain);
                return cached;
            }

            // Evaluator selection
            if (evaluator == null)
            {
                evaluator = __wRecord.tRecord.evaluator;
            }

            weightTableCompiled compiled;

            if (!imbWEMManager.settings.TFIDF.doUseHeuristicDLCTFIDFConstruction)
            {
                loger.log("DLC TF-IDF construction for: " + idomain.domain + " initiated.");

                termDocumentSet domainSet = new termDocumentSet("DomainTFIDF_source");

                var loadedTargets = __wRecord.context.targets.GetLoaded();
                int totalTargets  = loadedTargets.Count;
                int processed     = 0;
                int sinceReport   = 0;
                int termsIn       = 0;
                int termsOut      = 0;

                foreach (spiderTarget target in loadedTargets)
                {
                    processed++;
                    sinceReport++;
                    double progress = processed.GetRatio(totalTargets);

                    if (target.IsRelevant)
                    {
                        var wordlist = GetTermsForPage(target, idomain, null, evaluator, loger);
                        termsIn += wordlist.Count;

                        termDocument pageTF = domainSet.AddTable(target.pageHash) as termDocument;
                        pageTF.expansion = 1;
                        pageTF.AddTokens(wordlist, loger);

                        termsOut += pageTF.Count();
                    }

                    if (sinceReport <= reportEvery)
                    {
                        continue;
                    }

                    sinceReport = 0;
                    double compression = termsOut.GetRatio(termsIn);
                    aceLog.consoleControl.writeToConsole("Pages processed [" + progress.ToString("P2") + "] Semantic compression rate: " + compression.ToString("P2"), loger, false, 0);
                }

                loger.log("[" + idomain.domain + "] preprocess finished. DLC TF-IDF terms [" + domainSet.CountAllDocuments() + "]");

                compiled      = domainSet.AggregateDocument.GetCompiledTable(loger);
                compiled.name = "DLC-TFIDF " + idomain.domain;
            }
            else
            {
                compiled = GetOrCreateTFIDF_DLC_Heuristic(__wRecord, loger, __useExisting, __saveToCache, evaluator);
            }

            idomain.Lemmas = compiled.Count;

            if (__saveToCache)
            {
                bool saved = compiled.SaveAs(cacheFile.FullName, getWritableFileMode.overwrite);

                string outcome = saved
                    ? "[" + idomain.domain + "] DLC TF-IDF compiled table cache saved to: " + cacheFile.FullName
                    : "[" + idomain.domain + "] DLC TF-IDF compiled table save failed";

                loger.log(outcome);
            }

            imbWEMManager.index.domainIndexTable.AddOrUpdate(idomain);

            return compiled;
        }
        /// <summary>
        /// Gets the tfidf master: loads from file or returns any existing instance. The table is rebuilt from all cached
        /// DLC TF-IDF tables when <paramref name="__useExisting"/> is <c>false</c> or when the held table is empty.
        /// </summary>
        /// <param name="loger">Optional log builder; every use inside this method is null-guarded.</param>
        /// <param name="__useExisting">When <c>false</c>, a rebuild is forced; the value is also passed as the second constructor argument when the table is first created.</param>
        /// <param name="__saveToCache">When <c>true</c>, the resulting table is written to the master file, whether or not it was rebuilt.</param>
        /// <returns>The shared master table (<c>globalTFIDFCompiled</c>).</returns>
        public weightTableCompiled GetTFIDF_Master(builderForLog loger, bool __useExisting = true, bool __saveToCache = true)
        {
            bool     rebuild     = !__useExisting;
            FileInfo master_file = GetTFIDF_Master_File();

            if (globalTFIDFCompiled == null)
            {
                globalTFIDFCompiled = new weightTableCompiled(master_file.FullName, __useExisting, SessionID);
                globalTFIDFCompiled.ReadOnlyMode = true;
            }

            if (globalTFIDFCompiled.Count == 0)
            {
                // Nothing usable was loaded: the table has to be constructed
                rebuild = true;
            }
            else if (loger != null)
            {
                loger.log("Master table loaded [" + globalTFIDFCompiled.Count + "]");
            }

            if (rebuild)
            {
                int input_c = 0;

                List<weightTableCompiled> allDLC_TFIDFs = GetTFIDF_DLC_AllCached(loger);

                if (loger != null)
                {
                    loger.log("Rebuilding Master Table ");
                }

                termDocumentSet construct = new termDocumentSet(SessionID, "Temporary TF-IDF construct table for session: " + SessionID);

                int tc = allDLC_TFIDFs.Count;
                int c  = 0;

                foreach (weightTableCompiled dlc in allDLC_TFIDFs)
                {
                    c++;

                    termDocument td = construct.Add(dlc) as termDocument;
                    input_c += td.Count();

                    if (loger != null)
                    {
                        double progress = c.GetRatio(tc);
                        aceLog.consoleControl.writeToConsole(progress.ToString("P2") + " ", loger, false, 0);
                    }
                }

                globalTFIDFCompiled = construct.AggregateDocument.GetCompiledTable(loger);
                int output_c = construct.AggregateDocument.Count();

                if (loger != null)
                {
                    // Output over input, the same direction as the other compression-rate logs in this file
                    double compression = output_c.GetRatio(input_c);
                    loger.log("Master Table - final semantic compression rate: [" + compression.ToString("P2") + "]");
                }
            }

            if (__saveToCache)
            {
                // Report the outcome only after the save was attempted
                bool saved = globalTFIDFCompiled.SaveAs(master_file.FullName, getWritableFileMode.overwrite);

                if (loger != null)
                {
                    if (saved)
                    {
                        loger.log("Master Table saved to:[" + master_file.FullName + "]");
                    }
                    else
                    {
                        loger.log("Master Table save failed:[" + master_file.FullName + "]");
                    }
                }
            }

            return globalTFIDFCompiled;
        }
Esempio n. 10
0
 /// <summary>
 /// After loading, creates <c>TermTable</c> from the file at <c>TermTablePath</c>.
 /// </summary>
 public override void OnLoaded() =>
     TermTable = new weightTableCompiled(TermTablePath, true, nameof(TermTable));