/// <summary>
/// Renders every web site contained in <paramref name="input"/> into its own <see cref="TextDocumentSet"/>.
/// </summary>
/// <param name="input">Web sites to render; each one is handed to <c>RenderSiteDocuments</c>.</param>
/// <param name="logger">Forwarded to <c>RenderSiteDocuments</c>; also receives a progress percentage after each site.</param>
/// <returns>One <see cref="TextDocumentSet"/> per web site, in the order the sites were enumerated.</returns>
public List<TextDocumentSet> RenderDocumentSet(WebSiteDocumentsSet input, ILogBuilder logger)
{
    var renderedSets = new List<TextDocumentSet>();
    Int32 siteTotal = input.Count;
    Int32 sitesDone = 0;

    foreach (WebSiteDocuments site in input)
    {
        renderedSets.Add(RenderSiteDocuments(site, logger));

        // Progress is reported as the share of sites processed so far.
        sitesDone++;
        Double progress = sitesDone.GetRatio(siteTotal);
        logger.Append(" [" + progress.ToString("P2") + "] ");
    }

    return renderedSets;
}
/// <summary>
/// Creates a dataset list with one <see cref="WebSiteDocumentsSet"/> per category enumerated from this instance.
/// Each set is filled with the sites returned by that category's <c>GetAllSites()</c> and carries the category's name.
/// </summary>
/// <returns>The list of per-category sets, in enumeration order.</returns>
public List<WebSiteDocumentsSet> GetFirstLevelCategories()
{
    var result = new List<WebSiteDocumentsSet>();

    foreach (WebDocumentsCategory category in this)
    {
        var categorySet = new WebSiteDocumentsSet();
        categorySet.AddRange(category.GetAllSites());
        categorySet.name = category.name;

        result.Add(categorySet);
    }

    return result;
}
/// <summary>
/// Flattens the category hierarchy into a single list, naming every set with its path in the hierarchy.
/// </summary>
/// <remarks>
/// A set's name is <paramref name="parentCatName"/> + <c>pathSeparator</c> + the category name, so with a blank
/// parent name the resulting path starts with the separator. Each category is followed in the list by the
/// flattened result of its own sub-categories.
/// </remarks>
/// <param name="parentCatName">Path of the parent category; leave blank when this category is the root.</param>
/// <returns>Flat list of sets, each holding only the <c>siteDocuments</c> of its own category.</returns>
public List<WebSiteDocumentsSet> GetAllCategories(String parentCatName = "")
{
    var flattened = new List<WebSiteDocumentsSet>();

    foreach (WebDocumentsCategory category in this)
    {
        var categorySet = new WebSiteDocumentsSet();
        categorySet.AddRange(category.siteDocuments);
        categorySet.name = parentCatName + pathSeparator + category.name;
        flattened.Add(categorySet);

        // Descend using the name just assigned, so child paths extend this category's path.
        List<WebSiteDocumentsSet> descendants = category.GetAllCategories(categorySet.name);
        flattened.AddRange(descendants);
    }

    return flattened;
}
/// <summary>
/// Builds a <see cref="DatasetStructureReport"/> describing a single category.
/// </summary>
/// <param name="category">The category to describe; supplies the name, site count and total document count.</param>
/// <returns>The report, after <c>Compute()</c> has been called on it. The class count is always 1.</returns>
public static DatasetStructureReport MakeStructureReport(WebSiteDocumentsSet category)
{
    var report = new DatasetStructureReport
    {
        name = category.name,
        classes = 1,
        sites = category.Count,
        pages = category.CountDocumentsTotal()
    };

    report.Compute();
    return report;
}
/// <summary>
/// Builds a <see cref="DataTable"/> with one row per web site in <paramref name="docSet"/>, listing its order number,
/// domain, page count, distinct term count and total token count.
/// </summary>
/// <param name="docSet">The web sites to tabulate; its name and description become the table title and description.</param>
/// <param name="docModels">Document models keyed by web site domain; must contain an entry for every site in <paramref name="docSet"/>.</param>
/// <returns>The populated table, including extra lines and additional info entries for site and page totals.</returns>
/// <exception cref="KeyNotFoundException">A site's domain has no entry in <paramref name="docModels"/>.</exception>
public static DataTable MakeTable(this WebSiteDocumentsSet docSet, Dictionary<String, SpaceDocumentModel> docModels)
{
    DataTable table = new DataTable();
    table.SetTitle(docSet.name);
    table.SetDescription(docSet.description);

    var importance = imbSCI.Core.enums.dataPointImportance.normal;
    String presentationGroup = "Presentation";

    DataColumn rankColumn = table.Add("Nr", "Order number", "Nr", typeof(Int32), importance).SetWidth(10);
    DataColumn domainColumn = table.Add("Domain", "Web site domain", "Domain", typeof(String), importance).SetWidth(40);
    DataColumn pagesColumn = table.Add("Pages", "Number of pages for the website", "Pages", typeof(Int32), importance).SetWidth(20);
    DataColumn termsColumn = table.Add("Terms", "Number of distinct terms", "Terms", typeof(Int32), importance)
        .SetWidth(20)
        .SetGroup(presentationGroup)
        .SetDefaultBackground("#FF6633");
    DataColumn tokensColumn = table.Add("Tokens", "Total number of tokens", "Tokens", typeof(Int32), importance)
        .SetWidth(20)
        .SetGroup(presentationGroup)
        .SetDefaultBackground("#FF6633");

    Int32 pageTotal = 0;
    Int32 rowNumber = 1;

    foreach (var site in docSet)
    {
        var row = table.NewRow();
        row[rankColumn] = rowNumber;
        row[domainColumn] = site.domain;
        row[pagesColumn] = site.documents.Count;

        // The model lookup throws if the domain is missing, before the row is added to the table.
        var model = docModels[site.domain];
        row[termsColumn] = model.terms.Count;
        row[tokensColumn] = model.terms.GetSumFrequency();

        pageTotal += site.documents.Count;
        rowNumber++;
        table.Rows.Add(row);
    }

    table.AddExtra("Category name [" + docSet.name + "]");
    table.AddExtra("Category description [" + docSet.description + "]");
    table.SetAdditionalInfoEntry("Websites", docSet.Count, "Number of websites in the set");
    table.SetAdditionalInfoEntry("Web pages", pageTotal, "Total count of pages");

    return table;
}
/// <summary>
/// Applies <c>ReduceDocumentSet</c> to every web site in the category and averages the returned values.
/// </summary>
/// <param name="dataSet">The category whose web sites are reduced.</param>
/// <param name="settings">Reduction settings forwarded to <c>ReduceDocumentSet</c>; <c>logCategoryLevel</c> controls the summary log line.</param>
/// <param name="logger">Forwarded to <c>ReduceDocumentSet</c>; receives the category summary when enabled.</param>
/// <returns>The arithmetic mean of the per-site values, or 1 when the category contains no sites.</returns>
public Double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
{
    // Nothing to reduce: report 1 and avoid averaging an empty sequence.
    if (dataSet.Count == 0)
    {
        return 1;
    }

    var siteRatios = new List<Double>();
    foreach (WebSiteDocuments site in dataSet)
    {
        Double siteRatio = ReduceDocumentSet(site, settings, logger);
        siteRatios.Add(siteRatio);
    }

    Double meanRatio = siteRatios.Average();

    if (settings.logCategoryLevel)
    {
        logger.log("_ [" + dataSet.name + "] _ reduced to (avg): " + meanRatio.ToString("P2"));
    }

    return meanRatio;
}
/// <summary>
/// Removes web sites from the category based on web graph diagnostic marks and page-count limits,
/// and reports how much of the original document count remains.
/// </summary>
/// <remarks>
/// When <c>settings.marksToRemove</c> is not <c>none</c>, a site is removed if it has no graph
/// (<c>site.extensions.graph</c> is null) or if its <c>diagnosticResults</c> carry any of the configured marks.
/// Afterwards <c>RemoveEmptyDocuments</c> is applied with the minimum and maximum page limits from
/// <c>settings.LimitSettings</c>. The data set is modified in place.
/// </remarks>
/// <param name="dataSet">The category to reduce; sites are removed from this instance.</param>
/// <param name="settings">Marks to remove, page limits and logging switches.</param>
/// <param name="logger">Receives per-site messages (<c>logSiteLevel</c>) and the category summary (<c>logCategoryLevel</c>).</param>
/// <returns>The result of <c>GetRatio</c> between the total document count after and before the reduction.</returns>
public double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
{
    Int32 total_input = dataSet.CountDocumentsTotal();

    Boolean filterByMarks = settings.marksToRemove != WebSiteGraphDiagnosticMark.none;

    List<WebSiteGraphDiagnosticMark> marks = new List<WebSiteGraphDiagnosticMark>();
    if (filterByMarks)
    {
        marks = settings.marksToRemove.getEnumListFromFlags<WebSiteGraphDiagnosticMark>();
    }

    // Removal is deferred to a second pass so the data set is not modified while it is enumerated.
    List<WebSiteDocuments> toRemove = new List<WebSiteDocuments>();

    if (filterByMarks)
    {
        foreach (WebSiteDocuments site in dataSet)
        {
            Boolean flagged = false;

            if (site.extensions.graph == null)
            {
                if (settings.logSiteLevel)
                {
                    logger.log("Site _ [" + site.domain + "] _ flaged for removal because not having graph declared");
                }

                // A site without a graph cannot be checked against the marks; remove it, as the message states.
                flagged = true;
            }
            else
            {
                // Every matching mark is logged, but the site is queued for removal only once.
                foreach (WebSiteGraphDiagnosticMark mark in marks)
                {
                    if (site.extensions.graph.diagnosticResults.HasFlag(mark))
                    {
                        if (settings.logSiteLevel)
                        {
                            logger.log("Site _ [" + site.domain + "] _ flaged for removal because of [" + mark.ToString() + "] web graph diagnostic mark");
                        }

                        flagged = true;
                    }
                }
            }

            if (flagged)
            {
                toRemove.Add(site);
            }
        }
    }

    foreach (WebSiteDocuments site in toRemove)
    {
        dataSet.Remove(site);
    }

    dataSet.RemoveEmptyDocuments(logger, settings.LimitSettings.minPageLimit, settings.LimitSettings.maxPageLimit);

    Int32 total_output = dataSet.CountDocumentsTotal();
    Double average = total_output.GetRatio(total_input);

    if (settings.logCategoryLevel)
    {
        logger.log("Document count in _ [" + dataSet.name + "] _ reduced to: " + average.ToString("P2"));
    }

    return average;
}
/// <summary>
/// Deploys the cross-validation settings and builds the experiment folds from the supplied dataset.
/// </summary>
/// <remarks>
/// With <c>SingleFold</c> set, one fold named "SingleFold" containing the whole dataset is added and the method returns.
/// Otherwise every category is sliced into <c>settings.K</c> parts, and for each fold the slices are distributed
/// according to <c>settings.GetDistributionMatrix()</c>: slices marked for training go to the fold's copy of the
/// category, all others go to the fold's <c>SpaceLabel.UNKNOWN</c> test category.
/// </remarks>
/// <param name="_settings">The cross-validation settings; stored in <c>settings</c>.</param>
/// <param name="_dataset">Un-folded dataset, without the unknown class defined. It is enumerated more than once.</param>
/// <param name="logger">The logger. Not used by this method.</param>
public void Deploy(CrossValidationModel _settings, IEnumerable<WebSiteDocumentsSet> _dataset, ILogBuilder logger)
{
    settings = _settings;

    if (settings.SingleFold)
    {
        name = "1-fold -- single fold override";

        ExperimentDataSetFold fold = new ExperimentDataSetFold();
        fold.name = "SingleFold";
        fold.AddRange(_dataset);
        Add(fold);
        return;
    }

    if (_dataset is ExperimentDataSetFold foldInstance)
    {
        dataset = foldInstance;
    }

    name = settings.K + "-fold Tr[" + _settings.TrainingFolds + "] Ts[" + _settings.TestFolds + "]";

    // Slice every category into K parts, remembering which slices belong to which category.
    Dictionary<WebSiteDocumentsSet, CategorySlicedFolds> slicedFolds = new Dictionary<WebSiteDocumentsSet, CategorySlicedFolds>();

    foreach (WebSiteDocumentsSet cat in _dataset)
    {
        CategorySlicedFolds fold = new CategorySlicedFolds();
        fold.Deploy(cat, settings.K, settings.randomFolds);
        slicedFolds.Add(cat, fold);
    }

    var distributionMatrix = settings.GetDistributionMatrix();

    // LimitFoldsExecution may only reduce the number of folds. Each fold i reads row i of the
    // distribution matrix (presumably one row per fold - verify against GetDistributionMatrix),
    // so the count is clamped to K to keep the row index in range.
    Int32 foldsToCreate = settings.K;
    if (settings.LimitFoldsExecution > 0)
    {
        foldsToCreate = Math.Min(settings.K, settings.LimitFoldsExecution);
    }

    for (int i = 0; i < foldsToCreate; i++)
    {
        ExperimentDataSetFold setFold = new ExperimentDataSetFold();
        setFold.name = settings.K + "-fold[" + i + "]";
        setFold.CopyLabelNames(_dataset);

        WebSiteDocumentsSet unknownCat = new WebSiteDocumentsSet(SpaceLabel.UNKNOWN, "Test category - " + setFold.name);
        setFold.Add(unknownCat);

        foreach (KeyValuePair<WebSiteDocumentsSet, CategorySlicedFolds> catPair in slicedFolds)
        {
            // Locate this fold's counterpart of the source category by name.
            WebSiteDocumentsSet cat = setFold.First(x => x.name == catPair.Key.name);

            for (int s = 0; s < settings.K; s++)
            {
                bool toTraining = distributionMatrix[i][s];

                if (toTraining)
                {
                    cat.AddRange(catPair.Value[s].WeakClone());
                }
                else
                {
                    unknownCat.AddRange(catPair.Value[s].WeakClone());
                }
            }
        }

        Add(setFold);
    }
}