protected void BuildChunksForClass(classifierTools tools, IDocumentSetClass documentSetClass)
{
    var context = items[documentSetClass.name];

    //lemmaTable.SaveAs(experimentContext.folder.pathFor("master_table_" + documentSetClass.name + ".xml", imbSCI.Data.enums.getWritableFileMode.overwrite));

    experimentContext.chunkComposer.reset();

    experimentContext.notes.log("Chunk construction... [" + documentSetClass.name + "]");

    ConcurrentBag<IPipelineTaskSubject> MCStreams = context.exitByLevel[cnt_level.mcTokenStream];
    //context.exitSubjects.GetSubjectChildrenTokenType<pipelineTaskSubjectContentToken, IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcTokenStream, cnt_level.mcChunk }, true);
    // sites.getAllChildren();
    //context.exitSubjects.GetSubjectsOfLevel<IPipelineTaskSubject>(cnt_level.mcTokenStream);

    streamsByCategory.Add(documentSetClass, MCStreams.ToList());

    List<pipelineTaskSubjectContentToken> Chunks = experimentContext.chunkComposer.process(MCStreams.ToSubjectToken(), experimentContext.logger);

    chunksByCategory.Add(documentSetClass, Chunks);

    if (Chunks.Count == 0)
    {
        experimentContext.logger.log("-- no chunks produced for [" + documentSetClass.name + "] -- stream input count [" + MCStreams.Count + "]");
    }
    else
    {
        experimentContext.notes.log("[" + Chunks.Count + "] chunks constructed for class [" + documentSetClass.name + "]");
    }
}
/// <summary>
/// Performs post-processing of the feature vector (FV) knowledge.
/// </summary>
/// <param name="validationCase">The validation case.</param>
/// <param name="tools">The tools.</param>
/// <param name="logger">The logger.</param>
public override void DoFVPostProcessing(kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
{
    List<lemmaSemanticCloud> clouds = new List<lemmaSemanticCloud>();

    foreach (var docClass in validationCase.context.classes.GetClasses())
    {
        var knowledge = validationCase.knowledgeLibrary.GetKnowledgeInstance<semanticFVExtractorKnowledge>(docClass, validationCase, logger);

        knowledge.semanticCloudFiltered = knowledge.semanticCloud.Clone();
        clouds.Add(knowledge.semanticCloudFiltered);

        knowledge.semanticCloud.className = docClass.name;
        knowledge.semanticCloudFiltered.className = docClass.name + "flt";

        if (settings.semanticCloudFilter.isActive)
        {
            knowledge.semanticCloudFiltered.description = "Semantic cloud filtered with the cloud matrix";
        }
        else
        {
            knowledge.semanticCloudFiltered.description = "Semantic cloud filter is off - this is the initial cloud";
        }
    }

    if (settings.semanticCloudFilter.isActive)
    {
        logger.log(validationCase.name + ": Cloud matrix creation starts...");

        cloudMatrix matrix = new cloudMatrix(validationCase.name, "Cloud overlap matrix of [" + clouds.Count + "] clouds for fold [" + validationCase.name + "] of experiment [" + validationCase.context.setup.name + "]");
        matrix.build(clouds, logger);
        matrix.TransformClouds(settings.semanticCloudFilter, logger);

        if (tools.operation.doMakeGraphForClassClouds)
        {
            foreach (var cloud in clouds)
            {
                if (tools.operation.doUseSimpleGraphs)
                {
                    cloud.GetSimpleGraph(true).Save(validationCase.caseFolder.pathFor("class_" + cloud.className + "_reducedCloud", getWritableFileMode.overwrite));
                }
                else
                {
                    var converter = lemmaSemanticCloud.GetDGMLConverter();
                    converter.ConvertToDMGL(cloud).Save(validationCase.caseFolder.pathFor("class_" + cloud.className + "_reducedCloud", getWritableFileMode.overwrite));
                }
            }
        }

        //logger.log(validationCase.name + ": Cloud matrix report creation ...");
        //matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_norm_initial");
        //matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_abs_initial");
        //matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.normalizedValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_norm_reduced");
        //matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_abs_reduced");
        //matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.normalizedValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_CF_norm_reduced");
        //logger.log(validationCase.name + ": Cloud matrix report done.");
    }
    else
    {
        logger.log(validationCase.name + ": Cloud matrix is not active");
    }
}
protected pipelineModelExecutionContext GetContextForPipeline(classifierTools tools, IDocumentSetClass documentSetClass)
{
    if (!items.ContainsKey(documentSetClass.name))
    {
        pipelineModelExecutionContext context = machine.run(tools.model, documentSetClass.MCRepositoryName, documentSetClass, new List<String>());
        items.Add(documentSetClass.name, context);
    }
    return items[documentSetClass.name];
}
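// Illustrative sketch (not part of the original code): the memoization above uses a plain
// Dictionary, which is unsafe if classes were ever resolved concurrently (other steps in this
// class run under Parallel.ForEach). A minimal thread-safe variant, assuming a hypothetical
// ConcurrentDictionary<String, pipelineModelExecutionContext>-backed cache:
private static pipelineModelExecutionContext GetOrRunPipelineSketch(
    ConcurrentDictionary<String, pipelineModelExecutionContext> cache,
    String className,
    Func<pipelineModelExecutionContext> runPipeline)
{
    // GetOrAdd may invoke the factory more than once under contention,
    // but only one resulting context is ever stored for the key.
    return cache.GetOrAdd(className, key => runPipeline());
}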
public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
{
    var state = states.SetState(trainingSet, GetExperimentSufix());

    _distance = new SquareEuclidean();

    var kNearest = new KNearestNeighbors<Double[]>(k: setup.kNN_k, distance: _distance);
    kNearest.Learn(state.data.inputs, state.data.outputs);

    state.machine = kNearest;

    state.SaveState();
}
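// Illustrative sketch (not part of the original class): once Learn() has run, the Accord.NET
// kNN model trained above can be queried with Decide(). The toy vectors here are hypothetical.
private static void KnnUsageSketch()
{
    double[][] inputs =
    {
        new double[] { 0.0, 0.1 },
        new double[] { 0.9, 1.0 }
    };
    int[] outputs = { 0, 1 };

    // Same construction pattern as DoTraining above
    var knn = new KNearestNeighbors<double[]>(k: 1, distance: new SquareEuclidean());
    knn.Learn(inputs, outputs);

    // Decide() returns the predicted class label for a new feature vector
    int predicted = knn.Decide(new double[] { 0.8, 0.9 }); // -> 1
}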
protected webLemmaTermTable BuildLemmaTableForClass(classifierTools tools, IDocumentSetClass documentSetClass, List<pipelineTaskMCSiteSubject> sites)
{
    var context = items[documentSetClass.name];

    experimentContext.notes.log("Master TF-IDF table construction (used for POS flagging)... [" + documentSetClass.name + "]");

    webLemmaTermTable lemmaTable = knowledgeByClass[documentSetClass].WLTableOfIndustryClass;
    // new webLemmaTermTable(experimentContext.folder.pathFor("master_table_" + documentSetClass.name + ".xml"), true, "master_table_" + documentSetClass.name);
    lemmaTable.Clear();

    experimentContext.masterConstructor.process(GetTokensForSites<IPipelineTaskSubject>(sites), cnt_level.mcPage, lemmaTable, tools.GetLemmaResource(), context.logger, false);

    //lemmaTableByClass.TryAdd(documentSetClass, lemmaTable);

    return lemmaTable;
}
public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
{
    var state = states.SetState(trainingSet, GetExperimentSufix());

    if (isMultinominal)
    {
        NaiveBayesLearning<GeneralizedBetaDistribution> teacher = new NaiveBayesLearning<GeneralizedBetaDistribution>();

        // Set options for the component distributions
        teacher.Options.InnerOption = new NormalOptions
        {
            Regularization = 1e-5 // to avoid zero variances
        };

        // The following line is only needed to ensure reproducible results.
        // Remove, comment out, or change it to enable full parallelization.
        teacher.ParallelOptions.MaxDegreeOfParallelism = 1;

        _teacher = teacher;

        // Learn a machine
        // state.machine = teacher.Learn(state.data.inputs, state.data.outputs);
    }
    else
    {
        NaiveBayesLearning<NormalDistribution> teacher = new NaiveBayesLearning<NormalDistribution>();

        // Set options for the component distributions
        teacher.Options.InnerOption = new NormalOptions
        {
            Regularization = 1e-5 // to avoid zero variances
        };

        // The following line is only needed to ensure reproducible results.
        // Remove, comment out, or change it to enable full parallelization.
        teacher.ParallelOptions.MaxDegreeOfParallelism = 1;

        _teacher = teacher;

        // Learn a machine
        state.machine = teacher.Learn(state.data.inputs, state.data.outputs);
    }

    state.SaveState();
}
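// Illustrative sketch (not part of the original class): the same Accord.NET Naive Bayes pattern
// as the NormalDistribution branch above, shown end to end on hypothetical toy data. Note that
// the Learn() call is commented out in the multinominal branch above, so that branch never
// assigns state.machine; this sketch shows what a completed train/predict cycle looks like.
private static void NaiveBayesUsageSketch()
{
    double[][] inputs =
    {
        new double[] { 1.0, 0.0 },
        new double[] { 0.0, 1.0 }
    };
    int[] outputs = { 0, 1 };

    var teacher = new NaiveBayesLearning<NormalDistribution>();
    teacher.Options.InnerOption = new NormalOptions { Regularization = 1e-5 }; // avoid zero variances

    var nb = teacher.Learn(inputs, outputs);
    int predicted = nb.Decide(new double[] { 0.9, 0.1 }); // -> 0
}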
/// <summary>
/// Sets the execution context.
/// </summary>
/// <param name="_manager">The manager.</param>
/// <param name="_setup">The setup.</param>
/// <param name="_tools">The tools.</param>
/// <param name="_classes">The classes.</param>
/// <param name="sufix">The suffix.</param>
/// <param name="chunker">The chunker.</param>
/// <param name="_masterExtractor">The master extractor.</param>
/// <param name="_logger">The logger.</param>
public void SetExecutionContext(experimentManager _manager, experimentSetup _setup, classifierTools _tools, DocumentSetClasses _classes, String sufix, chunkComposerBasic chunker, semanticFVExtractor _masterExtractor, ILogBuilder _logger = null)
{
    if (_logger == null)
    {
        _logger = new builderForLog();
        aceLog.consoleControl.setAsOutput(_logger, _setup.name);
    }
    logger = _logger;

    chunkComposer = chunker;
    setup = _setup;
    tools = _tools;
    tools.context = this;
    classes = _classes;

    // masterConstructor = _masterExtractor.termTableConstructor;
    masterExtractor = _setup.featureVectorExtractors_semantic.First();
    masterConstructor = masterExtractor.termTableConstructor;

    manager = _manager;

    String expContextName = "exp_" + setup.name.add(sufix, "_");

    folder = manager.folder.Add(expContextName, "Experiment " + setup.name, "Directory with all information on the experiment [" + setup.name + "]");

    errorNotesFolder = folder.Add("errors", "Error logs", "Directory with error reports produced if an exception occurs. Normally, if everything went well, this folder should contain only two files: directory_readme.txt and an empty note.txt.");
    errorNotes = new experimentNotes(errorNotesFolder, "Notes (logs) about critical and non-critical errors that happen during experiment execution. If everything went well, this file should remain empty.");

    notes = new experimentNotes(folder, "Notes on experiment setup and execution log");
    aceLog.consoleControl.setAsOutput(notes, "Notes");

    notes.log("Experiment [" + expContextName + "] initiated");
    notes.AppendLine("About: " + setup.description);
    notes.AppendHorizontalLine();
    notes.SaveNote();

    notes.AppendHeading("Feature extraction models");

    var lnsc = chunkComposer.DescribeSelf();
    lnsc.ForEach(x => notes.AppendLine(x));
    notes.AppendLine(" - ");

    // make duplicated model names distinct before building feature vector definitions
    List<String> mdn = new List<string>();
    foreach (var md in setup.models)
    {
        if (mdn.Contains(md.name))
        {
            md.name += "_" + mdn.Count.ToString();
        }
        else
        {
            mdn.Add(md.name);
        }
    }

    foreach (var md in setup.models)
    {
        String prefix = md.name;
        md.classes = classes;
        md.BuildFeatureVectorDefinition();

        var lns = md.DescribeSelf();
        lns.ForEach(x => notes.AppendLine(x));

        kFoldValidationCollection validationCases = classes.BuildValidationCases(prefix, setup.validationSetup.k, tools.DoDebug, logger, folder, setup.validationSetup.randomize);
        validationCases.pipelineCollection = pipelineCollection;
        validationCases.connectContext(this, md);

        validationCollections.Add(md.name, validationCases);

        //md.postClassifiers = setup.classifiers;
    }
}
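// Illustrative sketch (hypothetical helper, not in the original code): the duplicate-name loop in
// SetExecutionContext suffixes a repeated model name with the count of distinct names seen so far,
// and never records the renamed result, so three models named "svm" would come out as "svm",
// "svm_1", and "svm_1" again. A collision-free alternative:
private static String MakeUniqueNameSketch(String name, ISet<String> seen)
{
    String candidate = name;
    int i = 1;
    while (!seen.Add(candidate)) // ISet<T>.Add returns false if the name is already taken
    {
        candidate = name + "_" + i;
        i++;
    }
    return candidate;
}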
public abstract void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger);
public override void DoMakeKnowledge(List<pipelineTaskMCSiteSubject> subjects, classifierTools tools, tfidfFVExtractorKnowledge knowledge, ILogBuilder logger)
{
    knowledge.WLTableOfIndustryClass.Clear();
    knowledge.WLTableOfIndustryClass = constructor.process(subjects, cnt_level.mcPage, knowledge.WLTableOfIndustryClass, tools.GetLemmaResource(), logger, false);

    logger.log("TF-IDF built for [" + knowledge.name + "]");
}
public override void DoFVPostProcessing(kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger) { }
public override tfidfFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
{
    throw new NotImplementedException();
}
/// <summary>
/// Gets the pipeline execution context for the given document set class.
/// </summary>
/// <param name="tools">The tools.</param>
/// <param name="documentSetClass">The document set class.</param>
/// <returns>The pipeline model execution context for the class.</returns>
public pipelineModelExecutionContext GetContext(classifierTools tools, IDocumentSetClass documentSetClass)
{
    return GetContextForPipeline(tools, documentSetClass);
}
/// <summary>
/// Prepares for parallel execution.
/// </summary>
/// <param name="tools">The tools.</param>
/// <param name="_context">The context.</param>
public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
{
    if (caseKnowledgeSet == null)
    {
        caseKnowledgeSet = new webProjectKnowledgeSet();
    }

    if (items.Any())
    {
        experimentContext.notes.log("Mining Context was already prepared.");
        return caseKnowledgeSet;
    }

    DateTime startTime = DateTime.Now;

    experimentContext = _context;

    List<webCaseKnowledge> cases = new List<webCaseKnowledge>();

    folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysed industries (categories), web sites and other diagnostic data");

    // <---------------------------------------------------------------------------------------------------------------- [ performing pipeline ]
    experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");

    foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
    {
        var pipelineContext = GetContextForPipeline(tools, classSet);
        sitesByCategory.Add(classSet, new List<pipelineTaskMCSiteSubject>());

        if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
        {
            throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
        }

        var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)];

        // <----- preparing
        foreach (var site in sitesForContext)
        {
            tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag<pipelineTaskSubjectContentToken>());
            sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

            webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

            caseKnowledgeSet.Add(webCase);
            cases.Add(webCase);
        }

        semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
        kn.name = classSet.name + "_general";
        kn.relatedItemPureName = classSet.name;
        kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
        kn.Deploy(classReportFolder, experimentContext.logger);
        knowledgeByClass.TryAdd(classSet, kn);
    }

    experimentContext.notes.log("Sorting tokens for all sites [in parallel]");

    Parallel.ForEach(tokenBySite.Keys, site =>
    {
        var leafs = site.getAllLeafs();
        foreach (var leaf in leafs)
        {
            pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
            if (token != null)
            {
                tokenBySite[site].Add(token);
            }
        }
    });

    foreach (var c in cases)
    {
        c.tokens = tokenBySite[c.MCSiteSubject];
    }

    experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");

    Boolean useIntegratedApproach = false;

    if (useIntegratedApproach)
    {
        var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);

        Parallel.ForEach(sitesByCategory, pair =>
        {
            knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
        });
    }
    else
    {
        Parallel.ForEach(sitesByCategory, pair =>
        {
            IDocumentSetClass category = pair.Key;
            List<pipelineTaskMCSiteSubject> sites = pair.Value;

            var lt = BuildLemmaTableForClass(tools, category, sites);
            lt.Save();
            // lt.SaveAs(classReportFolder.pathFor(lt.info.Name), imbSCI.Data.enums.getWritableFileMode.overwrite);
        });
    }

    experimentContext.notes.log("Saving lexical resource cache subset - for later reuse in case of repeated experiment run");
    tools.SaveCache();

    if (!useIntegratedApproach)
    {
        experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            BuildChunksForClass(tools, classSet);
        }

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
        }
    }

    if (tools.operation.doCreateDiagnosticMatrixAtStart)
    {
        experimentContext.notes.log("Performing diagnostic analysis on all categories... [doCreateDiagnosticMatrixAtStart=true]");

        folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, various matrix derivatives, combined cloud and other diagnostic data");

        List<lemmaSemanticCloud> clouds = new List<lemmaSemanticCloud>();
        List<lemmaSemanticCloud> filteredClouds = new List<lemmaSemanticCloud>();

        var converter = lemmaSemanticCloud.GetDGMLConverter();

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            // experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);

            var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
            knowledgeByClass[classSet].semanticCloud.className = classSet.name;
            clouds.Add(cloud);

            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of the full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }
            else
            {
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of the full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }

            knowledgeByClass[classSet].semanticCloudFiltered = knowledgeByClass[classSet].semanticCloud.CloneIntoType<lemmaSemanticCloud>(true);
            knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
            filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
        }

        cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
        matrix.build(filteredClouds, experimentContext.logger);

        lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
        mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of the Semantic Cloud built as a union of the full-sample set Semantic Clouds of all categories"));

        var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

        var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on the Cloud Matrix transformation process");
        File.WriteAllLines(p, reductions);

        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);
        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);
        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);

        matrix.ExportTextReports(matrixReport, true, "matrix_cf");
        matrix.ExportTextReports(matrixReport, false, "matrix_cf");

        lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
        mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Version of the all-categories diagnostic Semantic Cloud, after the Cloud Matrix filter was applied"));

        if (experimentContext.tools.operation.doUseSimpleGraphs)
        {
            mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied (open this in Visual Studio)"));
        }
        else
        {
            converter = lemmaSemanticCloud.GetDGMLConverter();
            converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied (open this in Visual Studio)"));
        }

        // <-------- analysis -----------------------------------------------------------------------------------
        DataTableTypeExtended<freeGraphReport> cloudReports = new DataTableTypeExtended<freeGraphReport>();
        foreach (var cl in filteredClouds)
        {
            freeGraphReport fgReport = new freeGraphReport(cl);
            fgReport.Save(matrixReport);
            cloudReports.AddRow(fgReport);
        }
        freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
        unifiedReport.Save(matrixReport);
        cloudReports.AddRow(unifiedReport);

        cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
        // <-------- analysis -----------------------------------------------------------------------------------

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var cloud = knowledgeByClass[classSet].semanticCloudFiltered;

            // save each class-specific reduced cloud under its own file name
            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after the Cloud Matrix filter was applied (open this in Visual Studio)"));
            }
            else
            {
                converter = lemmaSemanticCloud.GetDGMLConverter();
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after the Cloud Matrix filter was applied (open this in Visual Studio)"));
            }
        }

        instanceCountCollection<String> tfcounter = new instanceCountCollection<string>();
        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
            wlt.DefaultView.Sort = "termFrequency desc";
            var sorted = wlt.DefaultView.ToTable();

            var tbl = wlt.GetClonedShema<DataTable>(true);
            tbl.CopyRowsFrom(sorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

            var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
            cht.DefaultView.Sort = "termFrequency desc";
            var csorted = cht.DefaultView.ToTable();

            tbl = cht.GetClonedShema<DataTable>(true);
            tbl.CopyRowsFrom(csorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

            tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);

            knowledgeByClass[classSet].OnBeforeSave();
        }

        List<String> countSorted = tfcounter.getSorted();
        StringBuilder sb = new StringBuilder();
        foreach (String s in countSorted)
        {
            sb.AppendLine(String.Format("{1} : {0}", s, tfcounter[s]));
        }

        String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
        File.WriteAllText(pt, sb.ToString());
    }

    if (tools.operation.doFullDiagnosticReport)
    {
        experimentContext.notes.log("Generating full diagnostic report on classes...");

        DataTable rep = null;
        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            rep = this.GetClassKnowledgeReport(classSet, rep);
        }

        rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);
        rep.AddExtra("Experiment: " + experimentContext.setup.name);
        rep.AddExtra("Info: " + experimentContext.setup.description);
        rep.SetDescription("Structural report for all classes in the experiment");

        rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
    }

    classReportFolder.generateReadmeFiles(appManager.AppInfo);

    experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");

    return caseKnowledgeSet;
}
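// Illustrative sketch (hypothetical helper, not in the original code): the report block above takes
// the top 100 rows of a DataTable by sorting its DefaultView and copying rows into a schema clone.
// A plain ADO.NET equivalent, without the imbSCI extension methods:
private static DataTable TopNBySortSketch(DataTable source, String sortExpression, int n)
{
    source.DefaultView.Sort = sortExpression;   // e.g. "termFrequency desc"
    DataTable sorted = source.DefaultView.ToTable();

    DataTable result = source.Clone();          // same schema, no rows
    for (int i = 0; i < Math.Min(n, sorted.Rows.Count); i++)
    {
        result.ImportRow(sorted.Rows[i]);
    }
    return result;
}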
public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
{
    var state = states.SetState(trainingSet, GetExperimentSufix());

    if (activationFunction == null)
    {
        activationFunction = new BipolarSigmoidFunction(setup.neuralnetwork.alpha);
    }

    var neurons = setup.neuralnetwork.HiddenLayersNeuronCounts.ToList();

    ActivationNetwork machine = null;
    switch (neurons.Count)
    {
        case 0:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, state.data.NumberOfClasses);
            break;

        case 1:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], state.data.NumberOfClasses);
            break;

        case 2:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], state.data.NumberOfClasses);
            break;

        case 3:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], state.data.NumberOfClasses);
            break;

        case 4:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], state.data.NumberOfClasses);
            break;

        case 5:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], state.data.NumberOfClasses);
            break;

        case 6:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], neurons[5], state.data.NumberOfClasses);
            break;

        case 7:
            machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], neurons[5], neurons[6], state.data.NumberOfClasses);
            break;

        default:
            throw new aceGeneralException("In the current implementation, an NN with [" + neurons.Count + "] hidden layers is not allowed.", null, this, "Too many hidden layers");
    }

    new NguyenWidrow(machine).Randomize();
    state.machine = machine;

    // BackPropagationLearning teacher = new BackPropagationLearning(machine);
    LevenbergMarquardtLearning teacher = new LevenbergMarquardtLearning(machine);
    teacher.LearningRate = setup.neuralnetwork.learningRate;

    var outputs = Accord.Statistics.Tools.Expand(state.data.outputs, state.data.NumberOfClasses, -1, 1);
    //teacher.Momentum = momentum;

    Int32 itOfSameError = 0;
    Int32 itOfSameErrorLimit = setup.neuralnetwork.learningIterationsMax / 10;
    Double errorSignificantSpan = setup.neuralnetwork.errorLowerLimit * setup.neuralnetwork.errorLowerLimit;

    for (int i = 0; i < setup.neuralnetwork.learningIterationsMax; i++)
    {
        double error = teacher.RunEpoch(state.data.inputs, outputs);

        if (Math.Abs(error - state.errorRate) < errorSignificantSpan)
        {
            itOfSameError++;
        }

        if (itOfSameError > itOfSameErrorLimit)
        {
            logger.log("Stopping training at [" + i.ToString("D3") + "] because the error rate had no significant change [" + errorSignificantSpan.ToString("F8") + "] over the last [" + itOfSameError + "] iterations [" + error.ToString("F8") + "]");
            break;
        }

        if (i % 10 == 0)
        {
            logger.log("Learning Neural Network [" + i.ToString("D3") + "] Error rate: " + error.ToString("F5"));
        }

        if (error < state.errorRate)
        {
            state.errorRate = error;
        }

        if (error < setup.neuralnetwork.errorLowerLimit)
        {
            break;
        }
    }

    if (teacherRef == null)
    {
        teacherRef = teacher;
    }

    state.SaveState();
}
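// Illustrative sketch (not part of the original class): AForge's ActivationNetwork also exposes a
// params int[] constructor, so the eight-way switch above could be collapsed into a single call.
// The sketch shows that collapsed construction plus how a prediction is read out after training:
// the predicted class is the argmax over the per-class outputs (the network is trained against
// bipolar targets in [-1, 1]). Layer sizes and the input vector here are hypothetical.
private static int BuildAndClassifySketch(double alpha, int inputCount, List<int> hiddenNeurons, int classCount, double[] featureVector)
{
    // hidden layer sizes followed by the output layer size
    int[] layers = hiddenNeurons.Concat(new[] { classCount }).ToArray();
    var network = new ActivationNetwork(new BipolarSigmoidFunction(alpha), inputCount, layers);

    double[] output = network.Compute(featureVector);

    // argmax over class outputs
    int best = 0;
    for (int c = 1; c < output.Length; c++)
    {
        if (output[c] > output[best])
        {
            best = c;
        }
    }
    return best;
}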
/// <summary>
/// Builds the extractor knowledge for the given subjects.
/// </summary>
/// <param name="subjects">The subjects.</param>
/// <param name="tools">The tools.</param>
/// <param name="knowledge">The knowledge.</param>
/// <param name="logger">The logger.</param>
public override void DoMakeKnowledge(List<pipelineTaskMCSiteSubject> subjects, classifierTools tools, semanticFVExtractorKnowledge knowledge, ILogBuilder logger)
{
    Boolean report = tools.DoReport;

    if (knowledge.doBuildTermTable)
    {
        knowledge.WLTableOfIndustryClass.Clear();
        knowledge.WLTableOfIndustryClass = termTableConstructor.process(tools.context.pipelineCollection.GetTokensForSites<IPipelineTaskSubject>(subjects), cnt_level.mcPage, knowledge.WLTableOfIndustryClass, tools.GetLemmaResource(), logger, subjects.Count == 1);
    }
    else
    {
        if (subjects.Count == 1)
        {
            // logger.log("Using existing Web Lemma Table on [" + subjects.First().name + "]");
        }
    }

    if (knowledge.doBuildChunkTable)
    {
        if ((subjects.Count > 1) || SVMChunkSimilarity.isActive)
        {
            if (semanticSimilarity.isActive || SVMChunkSimilarity.isActive || cosineSemanticSimilarity.isActive)
            {
                List<IPipelineTaskSubject> MCChunks = subjects.GetSubjectChildrenTokenType<IPipelineTaskSubject, IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcChunk }, true);
                // sites.getAllChildren();
                //context.exitSubjects.GetSubjectsOfLevel<IPipelineTaskSubject>(cnt_level.mcTokenStream);

                if (!MCChunks.Any())
                {
                    throw new aceScienceException("No chunks found from [" + subjects.Count + "] web sites", null, subjects, "FVE Chunk construction :: Pipeline context returned no chunks");
                }

                knowledge.WLChunkTableOfIndustryClass.Clear();
                knowledge.WLChunkTableOfIndustryClass = chunkTableConstructor.process(MCChunks, cnt_level.mcPage, knowledge.WLChunkTableOfIndustryClass, null, logger, subjects.Count == 1);
            }
        }
    }

    if (knowledge.doBuildSemanticCloud)
    {
        if (subjects.Count > 1)
        {
            if (knowledge.WLChunkTableOfIndustryClass.Count > 0)
            {
                if (knowledge.semanticCloud.Any())
                {
                    if (tools.operation.doUseExistingKnowledge)
                    {
                        logger.log(" ::: Rebuilding semantic cloud for [" + subjects.Count + "] subjects, although the cloud already has [" + knowledge.semanticCloud.CountNodes() + "] nodes and doUseExistingKnowledge=true!");
                        logger.log(" ::: This is not proper behaviour - the code probably has a bug.");
                    }
                }

                knowledge.semanticCloud.Clear();
                knowledge.semanticCloud = CloudConstructor.process(knowledge.WLChunkTableOfIndustryClass, knowledge.WLTableOfIndustryClass, knowledge.semanticCloud, logger, subjects, tools.GetLemmaResource());
                knowledge.semanticCloud.name = knowledge.name;
                knowledge.semanticCloud.description = "Original semantic cloud, extracted from chunks";

                if (tools.operation.doUseSimpleGraphs)
                {
                    knowledge.semanticCloud.GetSimpleGraph(true).Save(knowledge.folder.pathFor("class_" + knowledge.semanticCloud.className + "_initialCloud", getWritableFileMode.overwrite));
                }
                else
                {
                    var converter = lemmaSemanticCloud.GetDGMLConverter();
                    converter.ConvertToDMGL(knowledge.semanticCloud).Save(knowledge.folder.pathFor("class_" + knowledge.semanticCloud.className + "_initialCloud", getWritableFileMode.overwrite));
                }

                //if (tools.operation.doMakeGraphForClassClouds)
                //{
                //    var converter = lemmaSemanticCloud.GetDGMLConverter();
                //    converter.ConvertToDMGL(knowledge.semanticCloud).Save(knowledge.folder.pathFor(knowledge.name + "_initialCloud", getWritableFileMode.overwrite, "Semantic cloud in initial state - before Cloud Matrix filter applied"));
                //}

                if (knowledge.semanticCloud.CountNodes() == 0)
                {
                    throw new aceScienceException("Semantic cloud [" + knowledge.name + "] construction failed -- zero nodes produced!", null, knowledge, "Semantic cloud construction failed", subjects);
                }
            }
        }
    }

    if (tools.DoReport)
    {
        // knowledge.WLTableOfIndustryClass.GetDataTable().GetReportAndSave(knowledge.folder, appManager.AppInfo, "wfl_" + knowledge.name);
        // knowledge.WLChunkTableOfIndustryClass.GetDataTable().GetReportAndSave(knowledge.folder, appManager.AppInfo, "wfl_" + knowledge.name + "_chunks");
    }
}
public override semanticFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
{
    semanticFVExtractorKnowledge knowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance<semanticFVExtractorKnowledge>(documentSetClass, vCaseColl.kFoldCase, logger);
    knowledge.SetRebuild(!tools.DoUseExistingKnowledge);

    if (knowledge.ShouldBuildAny())
    {
        DocumentSetCaseCollection dSetCol = new DocumentSetCaseCollection(documentSetClass);

        var context = tools.context.pipelineCollection.GetContext(tools, documentSetClass);

        //var sites = context.exitByType[typeof(pipelineTaskMCSiteSubject)].ConvertList<IPipelineTaskSubject, pipelineTaskMCSiteSubject>().ToList();
        var sites = context.exitByType[typeof(pipelineTaskMCSiteSubject)].ToList();

        List<pipelineTaskMCSiteSubject> ISites = sites.ConvertList<IPipelineTaskSubject, pipelineTaskMCSiteSubject>().ToList();
        List<pipelineTaskMCSiteSubject> fSites = vCaseColl.FilterSites(ISites);

        dSetCol.deploy(vCaseColl, validationCase, fSites, classes);

        List<webLemmaTermTable> tables = new List<webLemmaTermTable>();
        //List<webLemmaTermTable> chunkTables = new List<webLemmaTermTable>();

        foreach (DocumentSetCase vc in dSetCol)
        {
            semanticFVExtractorKnowledge cKnowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance<semanticFVExtractorKnowledge>(vc, validationCase, logger);
            DoMakeKnowledgeForCase(vc, tools, dSetCol, logger);
            tables.Add(cKnowledge.WLTableOfIndustryClass);
        }

        var tbl = tables.GetMergedLemmaTable(knowledge.name, logger);
        termTableConstructor.recompute(knowledge.WLTableOfIndustryClass, logger, false, tbl.GetList());

        DoMakeKnowledge(fSites, tools, knowledge, logger);
    }

    // SetKnowledge(knowledge);
    //knowledge.OnBeforeSave();

    logger.log("[ALTPROC] Feature Extraction by [" + name + "][" + vCaseColl.kFoldCase.name + "][" + documentSetClass.name + "] done for " + vCaseColl.className);

    return knowledge;
}