/// <summary>
        /// Loads the web page repositories for the entries registered in the <see cref="imbMCWebSite.pageTable" /> of the given site.
        /// When <c>takeSettings</c> is supplied, only the entries selected by the <c>sampleTake</c> built from it are loaded.
        /// </summary>
        /// <param name="site">The site repo to take pages for; must not be <c>null</c>.</param>
        /// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used.</param>
        /// <param name="takeSettings">If specified, only a fraction of the web pages is returned, according to the sampling settings.</param>
        /// <returns>
        /// The web pages that could be loaded. Entries whose load returns <c>null</c> are skipped; entries whose load throws are logged and skipped.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="site" /> is <c>null</c>.</exception>
        public List<imbMCWebPage> GetAllWebPages(imbMCWebSite site, ILogBuilder output = null, samplingSettings takeSettings = null)
        {
            // Fail with a descriptive exception instead of a NullReferenceException further down.
            if (site == null)
            {
                throw new ArgumentNullException("site");
            }

            output = output ?? aceLog.loger;

            var entries = site.pageTable.GetList();

            if (takeSettings != null)
            {
                entries = new sampleTake<imbMCWebPageEntry>(entries, takeSettings);
            }

            List<imbMCWebPage> pages = new List<imbMCWebPage>();

            foreach (var entry in entries)
            {
                // Same best-effort policy as GetAllWebSites: a single page that cannot be
                // loaded is reported to the log and skipped, so it does not abort the whole load.
                try
                {
                    var page = entry.HashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);
                    if (page != null)
                    {
                        pages.Add(page);
                    }
                }
                catch (Exception ex)
                {
                    output.log("Problem loading [" + entry.HashCode + "] -> exception:" + ex.Message);
                }
            }

            return pages;
        }
        /// <summary>
        /// Loads a repository instance for each web site entry registered in <see cref="imbMCRepository.siteTable" />.
        /// When <c>takeSettings</c> is given, only the entries picked by the <c>sampleTake</c> built from it are loaded.
        /// </summary>
        /// <param name="output">The log output; <c>aceLog.loger</c> is used when <c>null</c>.</param>
        /// <param name="takeSettings">Optional sampling settings; when specified, only a fraction of the web sites is returned.</param>
        /// <returns>
        /// The site repositories that were loaded. Entries whose load returned <c>null</c> are left out,
        /// and entries whose load threw are logged and left out.
        /// </returns>
        public List<imbMCWebSite> GetAllWebSites(ILogBuilder output = null, samplingSettings takeSettings = null)
        {
            output = output ?? aceLog.loger;

            CheckSiteTable(output);

            List<imbMCWebSiteEntry> entries = siteTable.GetList();
            if (takeSettings != null)
            {
                entries = new sampleTake<imbMCWebSiteEntry>(entries, takeSettings);
            }

            var loadedSites = new List<imbMCWebSite>();

            foreach (imbMCWebSiteEntry entry in entries)
            {
                try
                {
                    var siteRepo = entry.domain.LoadDataStructure<imbMCWebSite>(folder, output);
                    if (siteRepo == null)
                    {
                        // Nothing was loaded for this domain; move on to the next entry.
                        continue;
                    }

                    loadedSites.Add(siteRepo);
                }
                catch (Exception ex)
                {
                    // Best effort: report the failing domain and keep loading the remaining sites.
                    output.log("Problem loading [" + entry.domain + "] -> exception:" + ex.Message);
                }
            }

            return loadedSites;
        }
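        // Example no. 3 (listing marker left over from the source page; its trailing "0" counter is kept here for reference)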
        /// <summary>
        /// Builds the k-fold validation cases: one case per fold, each holding a training list and an
        /// evaluation list of web site samples for every class returned by <c>GetClasses()</c>.
        /// </summary>
        /// <param name="basename">Name used for the collection folder and as the prefix of each case name (followed by the three-digit fold index).</param>
        /// <param name="k">Number of folds; also assigned to <c>samplingSettings.parts</c>. When it is 1, the whole sample is used for both training and evaluation.</param>
        /// <param name="debug">if set to <c>true</c>, a line with the training and evaluation counts is appended to <paramref name="output" /> for every case and class.</param>
        /// <param name="output">The log output; nothing is logged when <c>null</c>.</param>
        /// <param name="folderOverride">Folder to create the collection folder in; when <c>null</c>, <c>folderRoot</c> is used.</param>
        /// <param name="randomize">if set to <c>true</c>, <c>Randomize()</c> is called on each class sample list before the folds are taken.</param>
        /// <returns>The populated validation collection, after <c>OnBeforeSave</c> has been called on it.</returns>
        public kFoldValidationCollection BuildValidationCases(String basename, Int32 k, Boolean debug, ILogBuilder output = null, folderNode folderOverride = null, Boolean randomize = false)
        {
            var collection = new kFoldValidationCollection();

            // The default root is read first, then replaced by the override when one was supplied.
            folderNode defaultFolder = folderRoot;
            folderNode parentFolder = folderOverride ?? defaultFolder;

            collection.folder = parentFolder.Add(basename, basename, basename + " " + k + "-fold validation");
            collection.Clear();

            var classes = GetClasses();

            // Snapshot of every class sample; the folds below are all taken from these lists.
            collection.sampleMatrix = new Dictionary<IDocumentSetClass, List<string>>();
            foreach (IDocumentSetClass docClass in classes)
            {
                collection.sampleMatrix.Add(docClass, docClass.WebSiteSample.ToList());
            }

            samplingSettings foldSettings = new samplingSettings
            {
                parts = k,
                takeOrder = samplingOrderEnum.ordinal
            };

            if (randomize)
            {
                foreach (IDocumentSetClass docClass in classes)
                {
                    collection.sampleMatrix[docClass].Randomize();
                }
            }

            for (int foldIndex = 0; foldIndex < k; foldIndex++)
            {
                var foldCase = collection.CreateNew(basename + foldIndex.ToString("D3"));

                foreach (IDocumentSetClass docClass in classes)
                {
                    // Work on a copy so the stored sample list is not handed out directly.
                    List<String> classSample = collection.sampleMatrix[docClass].ToList();

                    if (k <= 1)
                    {
                        // Single fold: the complete sample serves as both training and evaluation set.
                        foldCase.trainingCases.Add(docClass.name, classSample);
                        foldCase.evaluationCases.Add(docClass.name, classSample);
                    }
                    else
                    {
                        // The fold index is passed as the skip setting; the take becomes the
                        // evaluation set and whatever GetRestOfSource() returns becomes the training set.
                        foldSettings.skip = foldIndex;

                        var evaluationPart = new sampleTake<String>(classSample, foldSettings);
                        List<String> trainingPart = evaluationPart.GetRestOfSource();

                        foldCase.trainingCases.Add(docClass.name, trainingPart);
                        foldCase.evaluationCases.Add(docClass.name, evaluationPart);
                    }

                    if (output != null && debug)
                    {
                        output.AppendLine("Case [" + foldCase.name + "] for [" + docClass.name + "] have training[" + foldCase.trainingCases[docClass.name].Count + "] and eval[" + foldCase.evaluationCases[docClass.name].Count + "]");
                    }
                }

                if (output != null)
                {
                    output.log("k-fold validation case [" + foldCase.name + "] created for [" + foldCase.trainingCases.Count + "] industries");
                }
            }

            collection.OnBeforeSave(output);

            return collection;
        }