/// <summary>
/// Loads the web page repositories for every entry registered in the <see cref="imbMCWebSite.pageTable" /> of the given site.
/// When <paramref name="takeSettings"/> is supplied, only the entries selected by a <c>sampleTake</c> built from those settings are loaded.
/// </summary>
/// <param name="site">The site repo to take pages for.</param>
/// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used instead.</param>
/// <param name="takeSettings">If specified, only a sample of the registered web pages is loaded, according to the sampling settings.</param>
/// <returns>
/// The pages that were loaded successfully. Entries whose load returned <c>null</c> or threw an exception are left out;
/// a thrown exception is reported to the log output.
/// </returns>
public List<imbMCWebPage> GetAllWebPages(imbMCWebSite site, ILogBuilder output = null, samplingSettings takeSettings = null)
{
    if (output == null)
    {
        output = aceLog.loger;
    }

    var all = site.pageTable.GetList();
    if (takeSettings != null)
    {
        all = new sampleTake<imbMCWebPageEntry>(all, takeSettings);
    }

    List<imbMCWebPage> pages = new List<imbMCWebPage>();
    foreach (var pe in all)
    {
        // Same per-entry handling as GetAllWebSites: one entry that fails to load
        // is logged and skipped instead of aborting the whole page list.
        try
        {
            var repo = pe.HashCode.LoadDataStructure<imbMCWebPage>(site.folder, output);
            if (repo != null)
            {
                pages.Add(repo);
            }
        }
        catch (Exception ex)
        {
            output.log("Problem loading [" + pe.HashCode + "] -> exception:" + ex.Message);
        }
    }

    return pages;
}
/// <summary>
/// Loads a repository instance for each web site entry registered in <see cref="imbMCRepository.siteTable" />.
/// When <paramref name="takeSettings"/> is supplied, only the entries selected by a <c>sampleTake</c> built from those settings are loaded.
/// </summary>
/// <param name="output">The log output; when <c>null</c>, <c>aceLog.loger</c> is used instead.</param>
/// <param name="takeSettings">If specified, only a sample of the registered web sites is loaded, according to the sampling settings.</param>
/// <returns>
/// The site repositories that were loaded successfully. Entries whose load returned <c>null</c> or threw an exception are left out;
/// a thrown exception is reported to the log output.
/// </returns>
public List<imbMCWebSite> GetAllWebSites(ILogBuilder output = null, samplingSettings takeSettings = null)
{
    ILogBuilder logger = output ?? aceLog.loger;

    CheckSiteTable(logger);

    List<imbMCWebSiteEntry> entries = siteTable.GetList();
    if (takeSettings != null)
    {
        entries = new sampleTake<imbMCWebSiteEntry>(entries, takeSettings);
    }

    var loadedSites = new List<imbMCWebSite>();

    foreach (imbMCWebSiteEntry entry in entries)
    {
        try
        {
            var siteRepo = entry.domain.LoadDataStructure<imbMCWebSite>(folder, logger);
            if (siteRepo == null)
            {
                continue;
            }

            loadedSites.Add(siteRepo);
        }
        catch (Exception ex)
        {
            // A failing entry is reported and skipped so the remaining sites still load.
            logger.log("Problem loading [" + entry.domain + "] -> exception:" + ex.Message);
        }
    }

    return loadedSites;
}
/// <summary>
/// Builds the k-fold validation cases: for every fold and every class returned by <c>GetClasses()</c>,
/// the class web site sample is split into an evaluation part (a <c>sampleTake</c> for that fold) and a training part (the rest of the sample).
/// </summary>
/// <param name="basename">Name of the validation folder; also the prefix of each case name, followed by the three-digit fold index.</param>
/// <param name="k">Number of folds, one validation case per fold. With a value of 1 the whole sample is used for both training and evaluation; values below 1 create no cases.</param>
/// <param name="debug">if set to <c>true</c> and <paramref name="output"/> is given, a line with the training and evaluation counts is appended for each case and class.</param>
/// <param name="output">Optional log output; nothing is logged when <c>null</c>.</param>
/// <param name="folderOverride">Folder in which the validation folder is created; <c>folderRoot</c> is used when <c>null</c>.</param>
/// <param name="randomize">if set to <c>true</c>, <c>Randomize()</c> is called on the sample list of each class before the folds are taken.</param>
/// <returns>The populated validation collection, after its <c>OnBeforeSave</c> was called.</returns>
public kFoldValidationCollection BuildValidationCases(String basename, Int32 k, Boolean debug, ILogBuilder output = null, folderNode folderOverride = null, Boolean randomize = false)
{
    kFoldValidationCollection result = new kFoldValidationCollection();

    folderNode parentFolder = folderRoot;
    if (folderOverride != null)
    {
        parentFolder = folderOverride;
    }

    String folderDescription = basename + " " + k + "-fold validation";
    result.folder = parentFolder.Add(basename, basename, folderDescription);
    result.Clear();

    var classes = GetClasses();

    // Snapshot of the web site sample of every class; the folds below are cut from these lists.
    result.sampleMatrix = new Dictionary<IDocumentSetClass, List<string>>();
    foreach (IDocumentSetClass documentClass in classes)
    {
        result.sampleMatrix.Add(documentClass, documentClass.WebSiteSample.ToList());
    }

    samplingSettings sampling = new samplingSettings
    {
        parts = k,
        takeOrder = samplingOrderEnum.ordinal
    };

    if (randomize)
    {
        foreach (IDocumentSetClass documentClass in classes)
        {
            result.sampleMatrix[documentClass].Randomize();
        }
    }

    Boolean hasOutput = output != null;
    Boolean logDetails = hasOutput && debug;

    for (int fold = 0; fold < k; fold++)
    {
        var foldCase = result.CreateNew(basename + fold.ToString("D3"));

        foreach (IDocumentSetClass documentClass in classes)
        {
            // Work on a copy so the stored sample list is never handed out directly.
            List<String> classSample = result.sampleMatrix[documentClass].ToList();

            if (k <= 1)
            {
                // Nothing to split: the same list serves as training and evaluation set.
                foldCase.trainingCases.Add(documentClass.name, classSample);
                foldCase.evaluationCases.Add(documentClass.name, classSample);
            }
            else
            {
                // The shared settings instance is re-pointed at the current fold before each take.
                sampling.skip = fold;
                var evaluationPart = new sampleTake<String>(classSample, sampling);
                List<String> trainingPart = evaluationPart.GetRestOfSource();

                foldCase.trainingCases.Add(documentClass.name, trainingPart);
                foldCase.evaluationCases.Add(documentClass.name, evaluationPart);
            }

            if (logDetails)
            {
                output.AppendLine("Case [" + foldCase.name + "] for [" + documentClass.name + "] have training[" + foldCase.trainingCases[documentClass.name].Count + "] and eval[" + foldCase.evaluationCases[documentClass.name].Count + "]");
            }
        }

        if (hasOutput)
        {
            output.log("k-fold validation case [" + foldCase.name + "] created for [" + foldCase.trainingCases.Count + "] industries");
        }
    }

    result.OnBeforeSave(output);

    return result;
}