/// <summary>
/// Loads the dataset referenced by <paramref name="datasetSettings"/>, vets every web site set in it
/// and wraps the result in a <see cref="WebDataSetImportContext"/>.
/// </summary>
/// <param name="datasetSettings">Supplies the dataset path, the hierarchy-flattening flag and the min/max page limits used during vetting.</param>
/// <param name="dsLoadLogger">Logger handed to the load and vetting calls; ignored (replaced with <c>null</c>) when <paramref name="silentDatasetLoad"/> is <c>true</c>.</param>
/// <param name="silentDatasetLoad">When <c>true</c> (the default), no logger is passed to the load and vetting calls.</param>
/// <returns>Import context created from the dataset path and the vetted web site sets.</returns>
/// <exception cref="ArgumentNullException"><paramref name="datasetSettings"/> is <c>null</c>.</exception>
/// <exception cref="NotImplementedException">The settings request a non-flattened category hierarchy, which is not supported.</exception>
public WebDataSetImportContext GetImportContext(becDataSetSettings datasetSettings, ILogBuilder dsLoadLogger, Boolean silentDatasetLoad = true)
{
    if (datasetSettings == null)
    {
        throw new ArgumentNullException(nameof(datasetSettings));
    }

    // Only the flattened layout is implemented; reject the request before paying for the dataset load.
    if (!datasetSettings.flattenCategoryHierarchy)
    {
        throw new NotImplementedException("Only flattened category hierarchies are supported (flattenCategoryHierarchy must be true).");
    }

    // Silent mode is realised by passing a null logger to every downstream call.
    ILogBuilder loadLogger = silentDatasetLoad ? null : dsLoadLogger;

    WebKBDatasetAdapter adapter = new WebKBDatasetAdapter();
    WebDocumentsCategory category = adapter.LoadDataset(datasetSettings.path, WebDomainCategoryFormatOptions.normalizeDomainname, loadLogger);

    List<WebSiteDocumentsSet> dataset = category.GetFirstLevelCategories();

    // vetting the dataset
    foreach (WebSiteDocumentsSet ds in dataset)
    {
        ds.RemoveEmptyDocuments(loadLogger, datasetSettings.minPageLimit, datasetSettings.maxPageLimit);
        ds.AssignID(loadLogger);
    }

    return new WebDataSetImportContext(datasetSettings.path, dataset);
}
/// <summary>Performs dataset reduction by removing designated html nodes and attributes</summary>
/// <remarks>
/// <para>Loads the dataset from <paramref name="inputPath"/>, runs <see cref="WebSiteDataSetReductionEngine"/> over it with the
/// loaded reduction settings, then saves the reduced dataset and the reduction report under the output location.</para>
/// <para><paramref name="marksToRemove"/>, <paramref name="minPageLimit"/>, <paramref name="maxPageLimit"/> and
/// <paramref name="removeEmptyPages"/> are accepted but not read by this method body.</para>
/// </remarks>
/// <param name="inputPath">Path leading to source dataset</param>
/// <param name="outputPath">Output for reduced dataset; <c>"*"</c> means the same location as <paramref name="inputPath"/>.</param>
/// <param name="reductionSetup">Filename of reduction setup; any value other than <c>"*"</c> is resolved through <c>folder.pathMake</c> before loading.</param>
/// <param name="marksToRemove">Sites to remove (currently unused by this method).</param>
/// <param name="minPageLimit">Currently unused by this method.</param>
/// <param name="maxPageLimit">Currently unused by this method.</param>
/// <param name="removeEmptyPages">Currently unused by this method.</param>
/// <exception cref="ArgumentException">The effective output path has no parent directory (it is <c>null</c> or a root directory).</exception>
/// <seealso cref="aceOperationSetExecutorBase"/>
public void aceOperation_runDataSetReduction(
    [Description("Path leading to source dataset")] String inputPath = @"G:\imbWBI\datasets\7sectors_2018b",
    [Description("Output for reduced dataset")] String outputPath = @"G:\imbWBI\datasets\7sectors_2018c",
    [Description("Filename of reduction setup")] String reductionSetup = "*",
    [Description("Sites to remove")] WebSiteGraphDiagnosticMark marksToRemove = WebSiteGraphDiagnosticMark.none,
    Int32 minPageLimit = -1,
    Int32 maxPageLimit = -1,
    Boolean removeEmptyPages = false)
{
    // "*" redirects the output to the input location.
    if (outputPath == "*")
    {
        outputPath = inputPath;
    }

    // Resolve the output location up front so an unusable path fails before the expensive load/reduce work.
    // Path.GetDirectoryName returns null for a null path or a root directory.
    String out_pathRoot = Path.GetDirectoryName(outputPath);
    if (out_pathRoot == null)
    {
        throw new ArgumentException("Output path [" + outputPath + "] has no parent directory to save the reduced dataset into.", nameof(outputPath));
    }

    // NOTE(review): the remainder keeps any leading directory separator; presumably the
    // `out_pathRoot + output_category.path` concatenation below relies on that - confirm.
    String out_dataset = outputPath.Substring(out_pathRoot.Length);

    WebKBDatasetAdapter adapter = new WebKBDatasetAdapter();
    WebDocumentsCategory category = adapter.LoadDataset(inputPath, WebDomainCategoryFormatOptions.normalizeDomainname, output);

    if (reductionSetup != "*")
    {
        reductionSetup = folder.pathMake(reductionSetup);
    }

    var settings = WebSiteDataSetReductionSettings.LoadOrDefault(reductionSetup, output);

    List<WebSiteDocumentsSet> dataset = null;
    if (settings.LimitSettings.flattenCategoryHierarchy)
    {
        dataset = category.GetFirstLevelCategories();
    }
    else
    {
        dataset = category.GetAllCategories();
    }

    // Carry over extensions from the dataset already held by the main context, when there is one.
    if (mainContext?.dataset != null)
    {
        mainContext.dataset.TransferExtensionsTo(dataset);
    }

    WebSiteDataSetReductionEngine engine = new WebSiteDataSetReductionEngine();
    engine.ReduceDataset(dataset, settings, output);

    if (!out_dataset.isNullOrEmpty())
    {
        category.name = out_dataset;
    }

    WebDocumentsCategory output_category = new WebDocumentsCategory(out_dataset);
    output_category.description = "Version of dataset [" + inputPath + "], reduced in size [" + engine.reductionScore.ToString("F2") + "] by HTML node filtration and WebGraph consistency rules.";
    output_category.SetCategoryByDataset(dataset);

    adapter.SaveDataset(
        output_category,
        out_pathRoot,
        WebDomainCategoryFormatOptions.normalizeDomainname | WebDomainCategoryFormatOptions.saveDomainList | WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate,
        output);

    engine.SaveReport(output, out_pathRoot + output_category.path, settings);
}
/// <summary>Loads WebKB web site datasets</summary>
/// <remarks>
/// <para>Creates a <see cref="WebKBDatasetAdapter"/> and asks it to load the dataset found at <paramref name="path"/>,
/// passing <c>response</c> along to the loader. The loaded dataset is not stored anywhere by this method.</para>
/// </remarks>
/// <param name="path">Path to the 7Sectors dataset</param>
/// <seealso cref="aceOperationSetExecutorBase"/>
public void aceOperation_runLoadWebKB(
    [Description("Path to 7Secotrs dataset")] String path = "G:\\_DOKTORAT_MAIN\\SM03_Datasets\\7sectors")
{
    var adapter = new WebKBDatasetAdapter();

    // The return value is intentionally left unused, exactly as before.
    adapter.LoadDataset(path, response);
}