Beispiel #1
0
        /// <summary>
        /// Loads the dataset found at <c>datasetSettings.path</c>, vets every web site document set in it
        /// and wraps the result into a <see cref="WebDataSetImportContext"/>.
        /// </summary>
        /// <param name="datasetSettings">Supplies the dataset path, the hierarchy-flattening switch and the page limits used while vetting.</param>
        /// <param name="dsLoadLogger">Logger handed to the load and vetting calls; replaced by <c>null</c> when <paramref name="silentDatasetLoad"/> is <c>true</c>.</param>
        /// <param name="silentDatasetLoad">When <c>true</c> (default), a <c>null</c> logger is passed to every call instead of <paramref name="dsLoadLogger"/>.</param>
        /// <returns>Import context created from the dataset path and the vetted document sets.</returns>
        /// <exception cref="NotImplementedException">
        /// Thrown, after the dataset has been loaded, when <c>datasetSettings.flattenCategoryHierarchy</c> is <c>false</c>.
        /// </exception>
        public WebDataSetImportContext GetImportContext(becDataSetSettings datasetSettings, ILogBuilder dsLoadLogger, Boolean silentDatasetLoad = true)
        {
            var loader = new WebKBDatasetAdapter();

            // In silent mode none of the calls below receives a logger.
            ILogBuilder effectiveLogger = silentDatasetLoad ? null : dsLoadLogger;

            WebDocumentsCategory rootCategory = loader.LoadDataset(
                datasetSettings.path,
                WebDomainCategoryFormatOptions.normalizeDomainname,
                effectiveLogger);

            // Only the flattened (first level) view of the category hierarchy is supported here.
            if (!datasetSettings.flattenCategoryHierarchy)
            {
                throw new NotImplementedException();
            }

            List<WebSiteDocumentsSet> siteSets = rootCategory.GetFirstLevelCategories();

            // Vetting: apply the page limits to each set, then assign IDs.
            foreach (WebSiteDocumentsSet siteSet in siteSets)
            {
                siteSet.RemoveEmptyDocuments(effectiveLogger, datasetSettings.minPageLimit, datasetSettings.maxPageLimit);
                siteSet.AssignID(effectiveLogger);
            }

            return new WebDataSetImportContext(datasetSettings.path, siteSets);
        }
Beispiel #2
0
        /// <summary>Performs dataset reduction by removing designated html nodes and attributes</summary>
        /// <remarks>
        /// <para>
        /// Loads the dataset at <paramref name="inputPath"/>, loads the reduction settings, runs
        /// <c>WebSiteDataSetReductionEngine.ReduceDataset</c> over the selected document sets, saves the reduced
        /// dataset under the parent directory of <paramref name="outputPath"/> and finally saves the engine report.
        /// </para>
        /// <para>
        /// The output path is resolved and validated before any loading starts, so an unusable path is
        /// reported without first running the whole reduction.
        /// </para>
        /// </remarks>
        /// <param name="inputPath">Path leading to source dataset</param>
        /// <param name="outputPath">Output for reduced dataset; the value <c>*</c> means: use <paramref name="inputPath"/>.</param>
        /// <param name="reductionSetup">Filename of reduction setup; any value other than <c>*</c> is resolved through <c>folder.pathMake</c> before the settings are loaded.</param>
        /// <param name="marksToRemove">Sites to remove. NOTE(review): not read anywhere in this method body - confirm whether it should be applied.</param>
        /// <param name="minPageLimit">NOTE(review): not read anywhere in this method body.</param>
        /// <param name="maxPageLimit">NOTE(review): not read anywhere in this method body.</param>
        /// <param name="removeEmptyPages">NOTE(review): not read anywhere in this method body.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when the resolved <paramref name="outputPath"/> has no parent directory
        /// (<c>Path.GetDirectoryName</c> returned <c>null</c>, e.g. for a root directory or a null path).
        /// </exception>
        /// <seealso cref="aceOperationSetExecutorBase"/>
        public void aceOperation_runDataSetReduction(
            [Description("Path leading to source dataset")] String inputPath          = @"G:\imbWBI\datasets\7sectors_2018b",
            [Description("Output for reduced dataset")] String outputPath             = @"G:\imbWBI\datasets\7sectors_2018c",
            [Description("Filename of reduction setup")] String reductionSetup        = "*",
            [Description("Sites to remove")] WebSiteGraphDiagnosticMark marksToRemove = WebSiteGraphDiagnosticMark.none,
            Int32 minPageLimit       = -1,
            Int32 maxPageLimit       = -1,
            Boolean removeEmptyPages = false)
        {
            // "*" is the placeholder for "write the reduced dataset to the input location".
            if (outputPath == "*")
            {
                outputPath = inputPath;
            }

            // Split the target into parent directory and dataset part up front: GetDirectoryName returns
            // null for a root directory or a null path, which previously surfaced as a NullReferenceException
            // only after the complete load and reduction had already been performed.
            String out_pathRoot = Path.GetDirectoryName(outputPath);
            if (out_pathRoot == null)
            {
                throw new ArgumentException("Output path [" + outputPath + "] has no parent directory to save the reduced dataset into.", nameof(outputPath));
            }

            // NOTE(review): the remainder keeps whatever follows the parent directory, including the
            // directory separator - left unchanged because the save calls below may rely on it.
            String out_dataset = outputPath.Substring(out_pathRoot.Length);

            WebKBDatasetAdapter        adapter = new WebKBDatasetAdapter();
            List <WebSiteDocumentsSet> dataset = null;

            WebDocumentsCategory category = adapter.LoadDataset(inputPath, WebDomainCategoryFormatOptions.normalizeDomainname, output);

            // "*" is passed to LoadOrDefault as-is; every other value is first resolved via folder.pathMake.
            if (reductionSetup != "*")
            {
                reductionSetup = folder.pathMake(reductionSetup);
            }

            var settings = WebSiteDataSetReductionSettings.LoadOrDefault(reductionSetup, output);

            // Choose between the first-level view and the complete list of categories.
            if (settings.LimitSettings.flattenCategoryHierarchy)
            {
                dataset = category.GetFirstLevelCategories();
            }
            else
            {
                dataset = category.GetAllCategories();
            }

            // Carry over extensions from the dataset of the main context, when there is one.
            if (mainContext?.dataset != null)
            {
                mainContext.dataset.TransferExtensionsTo(dataset);
            }

            WebSiteDataSetReductionEngine engine = new WebSiteDataSetReductionEngine();
            engine.ReduceDataset(dataset, settings, output);

            if (!out_dataset.isNullOrEmpty())
            {
                category.name = out_dataset;
            }

            // The reduced sets are attached to a fresh category, which is what gets saved.
            WebDocumentsCategory output_category = new WebDocumentsCategory(out_dataset);

            output_category.description = "Version of dataset [" + inputPath + "], reduced in size [" + engine.reductionScore.ToString("F2") + "] by HTML node filtration and WebGraph consistency rules.";

            output_category.SetCategoryByDataset(dataset);

            adapter.SaveDataset(output_category, out_pathRoot, WebDomainCategoryFormatOptions.normalizeDomainname | WebDomainCategoryFormatOptions.saveDomainList
                                | WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate, output);

            engine.SaveReport(output, out_pathRoot + output_category.path, settings);
        }