コード例 #1
0
        /// <summary>
        /// Renders every web site of the given set and collects the resulting <see cref="TextDocumentSet"/>s, one per site.
        /// </summary>
        /// <param name="input">Set of web sites to render; each site is passed to <c>RenderSiteDocuments</c>.</param>
        /// <param name="logger">Logger handed to the per-site rendering; the progress ratio is appended to it after each site.</param>
        /// <returns>List holding one <see cref="TextDocumentSet"/> per site, in enumeration order.</returns>
        public List <TextDocumentSet> RenderDocumentSet(WebSiteDocumentsSet input, ILogBuilder logger)
        {
            List<TextDocumentSet> rendered = new List<TextDocumentSet>();
            Int32 siteCount = input.Count;
            Int32 sitesDone = 0;

            foreach (WebSiteDocuments webSite in input)
            {
                // Pages of a site are always rendered together as one set
                // (a per-page rendering alternative used to live here, but it is disabled).
                rendered.Add(RenderSiteDocuments(webSite, logger));

                // Append the share of sites processed so far, formatted as a percentage.
                sitesDone += 1;
                Double progress = sitesDone.GetRatio(siteCount);
                String progressTag = " [" + progress.ToString("P2") + "] ";
                logger.Append(progressTag);
            }

            return rendered;
        }
コード例 #2
0
        /// <summary>
        /// Builds a dataset list with one entry per direct sub-category of this category.
        /// Each entry holds the sites returned by the sub-category's <c>GetAllSites()</c> and carries the sub-category name.
        /// </summary>
        /// <returns>One <see cref="WebSiteDocumentsSet"/> per sub-category, in enumeration order.</returns>
        public List <WebSiteDocumentsSet> GetFirstLevelCategories()
        {
            List<WebSiteDocumentsSet> firstLevel = new List<WebSiteDocumentsSet>();

            foreach (WebDocumentsCategory child in this)
            {
                WebSiteDocumentsSet childSet = new WebSiteDocumentsSet();

                // Sites are collected through GetAllSites(), then the set is labelled with the sub-category name.
                childSet.AddRange(child.GetAllSites());
                childSet.name = child.name;

                firstLevel.Add(childSet);
            }

            return firstLevel;
        }
コード例 #3
0
        /// <summary>
        /// Flattens the category tree below this category into a list of datasets.
        /// Each dataset is named with its path: the parent name, <c>pathSeparator</c> and the sub-category name.
        /// </summary>
        /// <param name="parentCatName">
        /// Path prefix for the produced names; leave blank when this category is the root.
        /// With a blank prefix the produced names begin with the separator.
        /// </param>
        /// <returns>Datasets in depth-first order: each sub-category is followed by its own descendants.</returns>
        public List <WebSiteDocumentsSet> GetAllCategories(String parentCatName = "")
        {
            List<WebSiteDocumentsSet> flattened = new List<WebSiteDocumentsSet>();

            foreach (WebDocumentsCategory child in this)
            {
                WebSiteDocumentsSet childSet = new WebSiteDocumentsSet();
                childSet.AddRange(child.siteDocuments);
                childSet.name = parentCatName + pathSeparator + child.name;
                flattened.Add(childSet);

                // Recurse with the name just assigned, so descendants extend this path.
                List<WebSiteDocumentsSet> descendants = child.GetAllCategories(childSet.name);
                flattened.AddRange(descendants);
            }

            return flattened;
        }
コード例 #4
0
        /// <summary>
        /// Builds a structure report describing a single category.
        /// </summary>
        /// <param name="category">Category (set of web sites) to describe.</param>
        /// <returns>
        /// Report with the category name, site count and total document count filled in,
        /// returned after <c>Compute()</c> was called on it.
        /// </returns>
        public static DatasetStructureReport MakeStructureReport(WebSiteDocumentsSet category)
        {
            DatasetStructureReport report = new DatasetStructureReport
            {
                name    = category.name,
                classes = 1, // the report covers exactly one category
                sites   = category.Count,
                pages   = category.CountDocumentsTotal()
            };

            report.Compute();

            return report;
        }
コード例 #5
0
        /// <summary>
        /// Builds an overview table of the document set with one row per web site:
        /// order number, domain, page count, distinct term count and total token count.
        /// </summary>
        /// <param name="docSet">Document set to describe; its name and description become the table title and description.</param>
        /// <param name="docModels">Document models keyed by web site domain; must contain an entry for every site in <paramref name="docSet"/>.</param>
        /// <returns>The populated table, with extra lines and additional info entries summarizing the set.</returns>
        /// <exception cref="KeyNotFoundException">A site domain has no entry in <paramref name="docModels"/>.</exception>
        public static DataTable MakeTable(this WebSiteDocumentsSet docSet, Dictionary <String, SpaceDocumentModel> docModels)
        {
            const String presentationGroup = "Presentation";
            var importance = imbSCI.Core.enums.dataPointImportance.normal;

            DataTable output = new DataTable();
            output.SetTitle(docSet.name);
            output.SetDescription(docSet.description);

            // Per-site identification columns.
            DataColumn colRank   = output.Add("Nr", "Order number", "Nr", typeof(Int32), importance).SetWidth(10);
            DataColumn colDomain = output.Add("Domain", "Web site domain", "Domain", typeof(String), importance).SetWidth(40);
            DataColumn colPages  = output.Add("Pages", "Number of pages for the website", "Pages", typeof(Int32), importance).SetWidth(20);

            // Columns filled from the document model of the site, shown as one highlighted group.
            DataColumn colTerms  = output.Add("Terms", "Number of distinct terms", "Terms", typeof(Int32), importance).SetWidth(20).SetGroup(presentationGroup).SetDefaultBackground("#FF6633");
            DataColumn colTokens = output.Add("Tokens", "Total number of tokens", "Tokens", typeof(Int32), importance).SetWidth(20).SetGroup(presentationGroup).SetDefaultBackground("#FF6633");

            Int32 rowNumber = 0; // order numbers start at 1, see the increment below
            Int32 pageTotal = 0;

            foreach (var site in docSet)
            {
                rowNumber++;

                var row = output.NewRow();
                var pageCount = site.documents.Count;

                row[colRank]   = rowNumber;
                row[colDomain] = site.domain;
                row[colPages]  = pageCount;

                // The indexer throws when the domain has no model.
                var model = docModels[site.domain];
                row[colTerms]  = model.terms.Count;
                row[colTokens] = model.terms.GetSumFrequency();

                pageTotal += pageCount;
                output.Rows.Add(row);
            }

            output.AddExtra("Category name [" + docSet.name + "]");
            output.AddExtra("Category description [" + docSet.description + "]");

            output.SetAdditionalInfoEntry("Websites", docSet.Count, "Number of websites in the set");
            output.SetAdditionalInfoEntry("Web pages", pageTotal, "Total count of pages");

            return output;
        }
コード例 #6
0
        /// <summary>
        /// Reduces every web site of the category with <c>ReduceDocumentSet</c> and reports the average of the per-site results.
        /// </summary>
        /// <param name="dataSet">Category whose sites are reduced.</param>
        /// <param name="settings">Reduction settings, passed on to the per-site reduction; <c>logCategoryLevel</c> controls the summary log line.</param>
        /// <param name="logger">Logger passed on to the per-site reduction and used for the category summary.</param>
        /// <returns>Arithmetic mean of the per-site results, or 1 when the category contains no sites.</returns>
        public Double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
        {
            // Nothing to reduce (and nothing to average): report 1, i.e. 100%.
            if (dataSet.Count == 0)
            {
                return 1;
            }

            List<Double> siteRatios = new List<Double>();

            foreach (WebSiteDocuments site in dataSet)
            {
                Double siteRatio = ReduceDocumentSet(site, settings, logger);
                siteRatios.Add(siteRatio);
            }

            Double meanRatio = siteRatios.Average();

            if (!settings.logCategoryLevel)
            {
                return meanRatio;
            }

            logger.log("_ [" + dataSet.name + "] _ reduced to (avg): " + meanRatio.ToString("P2"));

            return meanRatio;
        }
コード例 #7
0
        /// <summary>
        /// Reduces the dataset category in place: removes web sites selected by web graph diagnostic marks,
        /// then calls <c>RemoveEmptyDocuments</c> with the configured minimum and maximum page limits.
        /// </summary>
        /// <param name="dataSet">The data set; sites are removed from it directly.</param>
        /// <param name="settings">
        /// The settings. <c>marksToRemove</c> selects the diagnostic marks that disqualify a site
        /// (no diagnostic filtering when it is <c>none</c>); <c>LimitSettings</c> supplies the page limits;
        /// <c>logSiteLevel</c> and <c>logCategoryLevel</c> control logging.
        /// </param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// Total document count after the reduction relative to the count before it, as computed by <c>GetRatio</c>.
        /// </returns>
        public double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
        {
            Int32 total_input = dataSet.CountDocumentsTotal();

            List <WebSiteDocuments> toRemove = new List <WebSiteDocuments>();

            // Diagnostic filtering applies only when at least one mark is selected.
            if (settings.marksToRemove != WebSiteGraphDiagnosticMark.none)
            {
                List <WebSiteGraphDiagnosticMark> marks = settings.marksToRemove.getEnumListFromFlags <WebSiteGraphDiagnosticMark>();

                foreach (WebSiteDocuments site in dataSet)
                {
                    if (site.extensions.graph == null)
                    {
                        if (settings.logSiteLevel)
                        {
                            logger.log("Site _ [" + site.domain + "] _ flaged for removal because not having graph declared");
                        }

                        // Fix: such sites were reported as flagged for removal but never queued,
                        // so they silently stayed in the data set.
                        toRemove.Add(site);
                        continue;
                    }

                    foreach (WebSiteGraphDiagnosticMark mark in marks)
                    {
                        if (site.extensions.graph.diagnosticResults.HasFlag(mark))
                        {
                            if (settings.logSiteLevel)
                            {
                                logger.log("Site _ [" + site.domain + "] _ flaged for removal because of [" + mark.ToString() + "] web graph diagnostic mark");
                            }

                            toRemove.Add(site);
                        }
                    }
                }
            }

            // Removal is deferred until here so the data set is not modified while it is being enumerated.
            foreach (WebSiteDocuments site in toRemove)
            {
                // A site matching several marks is queued once per mark; the Contains check skips the repeats.
                if (dataSet.Contains(site))
                {
                    dataSet.Remove(site);
                }
            }

            dataSet.RemoveEmptyDocuments(logger, settings.LimitSettings.minPageLimit, settings.LimitSettings.maxPageLimit);

            Int32 total_output = dataSet.CountDocumentsTotal();

            Double average = total_output.GetRatio(total_input);

            if (settings.logCategoryLevel)
            {
                logger.log("Document count in _ [" + dataSet.name + "] _ reduced to: " + average.ToString("P2"));
            }

            return(average);
        }
コード例 #8
0
        /// <summary>
        /// Deploys the specified cross-validation settings and populates this collection with experiment folds.
        /// </summary>
        /// <remarks>
        /// With <c>SingleFold</c> set, one fold containing the whole dataset is added and nothing else is done.
        /// Otherwise every category is sliced into K parts and, for each fold, the distribution matrix decides
        /// whether a slice goes to the fold's copy of the category (training) or to the fold's
        /// <c>SpaceLabel.UNKNOWN</c> category (test).
        /// </remarks>
        /// <param name="_settings">The settings.</param>
        /// <param name="_dataset">Un-folded dataset, without having the unknown class defined. It is enumerated more than once.</param>
        /// <param name="logger">The logger (not used by this method).</param>
        public void Deploy(CrossValidationModel _settings, IEnumerable <WebSiteDocumentsSet> _dataset, ILogBuilder logger)
        {
            settings = _settings;

            if (settings.SingleFold)
            {
                name = "1-fold -- single fold override";
                ExperimentDataSetFold fold = new ExperimentDataSetFold();
                fold.name = "SingleFold";
                fold.AddRange(_dataset);
                Add(fold);
                return;
            }

            // Keep a reference to the source only when it already is a fold instance.
            if (_dataset is ExperimentDataSetFold foldInstance)
            {
                dataset = foldInstance;
            }

            name = settings.K + "-fold Tr[" + _settings.TrainingFolds + "] Ts[" + _settings.TestFolds + "]";

            // Slice every category into K parts, keyed by the category it was cut from.
            Dictionary <WebSiteDocumentsSet, CategorySlicedFolds> slicedFolds = new Dictionary <WebSiteDocumentsSet, CategorySlicedFolds>();

            foreach (WebSiteDocumentsSet cat in _dataset)
            {
                CategorySlicedFolds fold = new CategorySlicedFolds();
                fold.Deploy(cat, settings.K, settings.randomFolds);
                slicedFolds.Add(cat, fold);
            }

            // --------------------------------------------------------- //

            var distributionMatrix = settings.GetDistributionMatrix();

            Int32 foldsToCreate = settings.K;

            if (settings.LimitFoldsExecution > 0)
            {
                // The limit may only lower the number of folds: the distribution matrix is indexed by fold
                // below, so going past K would read rows that do not belong to any fold
                // (assumes the matrix holds K rows - confirm against GetDistributionMatrix).
                foldsToCreate = Math.Min(settings.K, settings.LimitFoldsExecution);
            }

            for (int i = 0; i < foldsToCreate; i++)
            {
                ExperimentDataSetFold setFold = new ExperimentDataSetFold();
                setFold.name = settings.K + "-fold[" + i + "]";

                setFold.CopyLabelNames(_dataset);

                // Category that receives every slice not used for training in this fold.
                WebSiteDocumentsSet unknownCat = new WebSiteDocumentsSet(SpaceLabel.UNKNOWN, "Test category - " + setFold.name);

                setFold.Add(unknownCat);

                foreach (KeyValuePair <WebSiteDocumentsSet, CategorySlicedFolds> catPair in slicedFolds)
                {
                    // Looks up the fold's own category by name; presumably created by CopyLabelNames above.
                    WebSiteDocumentsSet cat = setFold.First(x => x.name == catPair.Key.name);

                    for (int s = 0; s < settings.K; s++)
                    {
                        bool toTraining = distributionMatrix[i][s];

                        if (toTraining)
                        {
                            cat.AddRange(catPair.Value[s].WeakClone());
                        }
                        else
                        {
                            unknownCat.AddRange(catPair.Value[s].WeakClone());
                        }
                    }
                }

                Add(setFold);
            }
        }