        /// <summary>
        /// Returns the textMap for the given render mode, rendering and caching it on first request
        /// </summary>
        /// <param name="subject">The token subject to render.</param>
        /// <param name="mode">The render mode.</param>
        /// <returns></returns>
        public textMap <pipelineTaskSubjectContentToken> render(pipelineTaskSubjectContentToken subject, contentTokenSubjectRenderMode mode)
        {
            if (layers.ContainsKey(mode))
            {
                return(layers[mode]);
            }
            layers.Add(mode, subject.render(mode));
            return(layers[mode]);
        }
        /// <summary>
        /// Renders the textMap from the specified token subject
        /// </summary>
        /// <param name="token">The token.</param>
        /// <param name="mode">The mode.</param>
        /// <returns></returns>
        public static textMap <pipelineTaskSubjectContentToken> render(this pipelineTaskSubjectContentToken token, contentTokenSubjectRenderMode mode)
        {
            textMap <pipelineTaskSubjectContentToken> output = new textMap <pipelineTaskSubjectContentToken>();

            renderSub(output, token, mode);

            output.AddPlainRender(textMapBase.SEPARATOR);

            return(output);
        }
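
For context, a minimal, self-contained sketch of the caching idiom the instance method above relies on, with standard types standing in for textMap and the render modes (LayerCache, Render and the string keys are illustrative stand-ins, not part of the imb* API):

using System;
using System.Collections.Generic;

class LayerCache
{
    // One rendered layer per mode; each mode is computed at most once.
    private readonly Dictionary<string, string> layers = new Dictionary<string, string>();

    public string Render(string mode, Func<string, string> renderer)
    {
        if (layers.TryGetValue(mode, out string cached))
        {
            return cached;                  // reuse the layer rendered earlier
        }
        string rendered = renderer(mode);
        layers.Add(mode, rendered);
        return rendered;
    }
}

A caller passes the actual rendering delegate once, e.g. cache.Render("lemmaForm", m => ExpensiveRender(m)); any later request for "lemmaForm" returns the stored result without re-rendering.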
Example #3
        /// <summary>
        /// Task builder for page level of subject. Sends the task to next node if it does not carry a <see cref="pipelineTaskMCPageSubject"/>
        /// </summary>
        /// <param name="task">The task.</param>
        /// <returns></returns>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask <pipelineTaskMCPageSubject> realTask = task as pipelineTask <pipelineTaskMCPageSubject>;

            if (realTask == null)
            {
                return(next);
            }

            pipelineTaskMCPageSubject realSubject = realTask.subject;

            HtmlDocument html = new HtmlDocument();

            html.LoadHtml(realSubject.MCPage.HtmlSourceCode);

            pipelineTaskMCSiteSubject siteSubject = realSubject.parent as pipelineTaskMCSiteSubject;

            realSubject.htmlDocument = html;

            List <imbMCBlock> blocks = blockComposer.process(html, realSubject.name);

            if (!blocks.Any())
            {
                task.context.logger.log("Block composer returned zero blocks for [" + siteSubject?.name + "]");
            }


            foreach (imbMCBlock block in blocks)
            {
                pipelineTaskSubjectContentToken tokenSubject = new pipelineTaskSubjectContentToken();
                tokenSubject.name             = block.name;
                tokenSubject.contentLevelType = flags.token.cnt_level.mcBlock;
                tokenSubject.mcElement        = block;
                tokenSubject.currentForm      = block.content;
                realSubject.mcElement.Add(tokenSubject.mcElement);
                realSubject.Add(tokenSubject);


                pipelineTask <pipelineTaskSubjectContentToken> taskForElement = new pipelineTask <pipelineTaskSubjectContentToken>(tokenSubject);


                task.context.scheduledTasks.Push(taskForElement);
            }



            return(forward);
        }
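
Stripped of the imb* types, the node above follows a dispatch-or-forward contract: cast the task to the expected subject, hand it to the next node on a type mismatch, otherwise decompose the subject and schedule one child task per produced element. A compact sketch of that contract, with illustrative stand-in types (ITask, PageTask, BlockTask and PipelineNode are assumptions, not framework names):

using System.Collections.Generic;

interface ITask { }

class PageTask : ITask
{
    public string Html = "";
}

class BlockTask : ITask
{
    public string Block = "";
}

class PipelineNode
{
    public PipelineNode next;                          // tried when the task type does not match
    public PipelineNode forward;                       // continues the chain after processing
    public Stack<ITask> scheduledTasks = new Stack<ITask>();

    public PipelineNode Process(ITask task)
    {
        PageTask pageTask = task as PageTask;
        if (pageTask == null)
        {
            return next;                               // not our level of subject - pass it on
        }

        // Decompose the page and push one child task per extracted block,
        // mirroring the per-block token subjects scheduled above.
        foreach (string block in pageTask.Html.Split('\n'))
        {
            scheduledTasks.Push(new BlockTask { Block = block });
        }
        return forward;
    }
}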
Example #4
        /// <summary>
        /// Renders the token into string form
        /// </summary>
        /// <param name="token">The token.</param>
        /// <param name="mode">The mode.</param>
        /// <returns></returns>
        private static String renderString(pipelineTaskSubjectContentToken token, contentTokenSubjectRenderMode mode)
        {
            StringBuilder sb = new StringBuilder();

            if (token == null)
            {
                sb.Append(textMapBase.SEPARATOR);
                return(sb.ToString());
            }

            switch (mode)
            {
            default:
            case contentTokenSubjectRenderMode.currentForm:
                sb.Append(token.currentForm);
                break;

            case contentTokenSubjectRenderMode.lemmaForm:
                if (token.graph != null)
                {
                    sb.Append(token.graph.lemmaForm);
                }
                else
                {
                    sb.Append(token.currentForm);
                }
                break;

            case contentTokenSubjectRenderMode.descriptive:
                sb.Append(token.currentForm);
                if (token.graph != null)
                {
                    sb.Append(textMapBase.MAINLEVEL_COMMA);
                    sb.Append(token.graph.lemmaForm);
                }
                sb.Append(textMapBase.MAINLEVEL_COMMA);
                renderGramCase(sb, token.flagBag, false);
                break;

            case contentTokenSubjectRenderMode.flagsForm:
                renderGramCase(sb, token.flagBag, false);
                break;

            case contentTokenSubjectRenderMode.flagsFullForm:
                renderGramCase(sb, token.flagBag, true);
                break;

            case contentTokenSubjectRenderMode.initialForm:
                sb.Append(token.initialForm);
                break;

            case contentTokenSubjectRenderMode.none:
                break;

            case contentTokenSubjectRenderMode.posTypeAndGramTagForm:
                //sb.Append("[");
                sb.Append(renderString(token, contentTokenSubjectRenderMode.posTypeTagForm));
                //sb.Backspace(textMapBase.SEPARATOR);

                if (token.graph != null)
                {
                    for (int i = 0; i < token.graph.Count(); i++)
                    {
                        lexicGrammarCase pt = token.graph[i] as lexicGrammarCase;

                        renderGramCase(sb, pt.tags.GetTags(), false);

                        if (i < token.graph.Count() - 1)
                        {
                            sb.Append(textMapBase.MAINLEVEL_COMMA);
                        }
                    }
                }

                //sb.Append("]");
                break;

            case contentTokenSubjectRenderMode.posTypeTagForm:

                List <pos_type> posTypeTags = new List <pos_type>();
                Boolean         ok          = false;

                if (token.graph != null)
                {
                    var pst = token.graph.GetTagFromGramTags <pos_type>();
                    foreach (var ps in pst)
                    {
                        posTypeTags.AddUnique(ps);
                    }

                    if (posTypeTags.Any())
                    {
                        ok = true;
                    }
                }

                if (ok == false)
                {
                    var pst = token.flagBag.getAllOfType <pos_type>(false);
                    foreach (var ps in pst)
                    {
                        posTypeTags.AddUnique(ps);
                    }
                }

                if (!posTypeTags.Any())
                {
                    posTypeTags.Add(pos_type.none);
                }

                foreach (pos_type pt in posTypeTags)
                {
                    if (pt != pos_type.none)
                    {
                        sb.Append(pt.ToString());
                        if (pt != posTypeTags.Last())
                        {
                            sb.Append(textMapBase.SUBLEVEL_COMMA);
                        }
                    }
                }

                break;
            }
            sb.Append(" ");
            return(sb.ToString());
        }
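
Two details of the switch above are worth calling out: lemmaForm degrades gracefully to the surface form when no lexical graph was resolved, and posTypeTagForm resolves POS tags in two stages, with graph tags taking precedence, the token's flag bag as fallback, and pos_type.none as the default when both are empty. A minimal sketch of that two-stage lookup, with string tags standing in for pos_type (PosResolve and ResolveTags are illustrative names):

using System.Collections.Generic;
using System.Linq;

static class PosResolve
{
    public static List<string> ResolveTags(IEnumerable<string> graphTags, IEnumerable<string> flagBagTags)
    {
        // Graph tags take precedence; the flag bag is the fallback source.
        List<string> result = graphTags.Distinct().ToList();
        if (!result.Any())
        {
            result = flagBagTags.Distinct().ToList();
        }
        if (!result.Any())
        {
            result.Add("none");    // default when neither source yields a tag
        }
        return result;
    }
}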
Example #5
        private static void renderSub(textMap <pipelineTaskSubjectContentToken> output, pipelineTaskSubjectContentToken token, contentTokenSubjectRenderMode mode)
        {
            switch (token.contentLevelType)
            {
            //output.Add(token, renderString(token, mode));
            //break;
            case flags.token.cnt_level.mcToken:
                output.Add(token, renderString(token, mode));
                break;

            case flags.token.cnt_level.mcChunk:
            case flags.token.cnt_level.mcBlock:
            case flags.token.cnt_level.mcTokenStream:
                output.AddOpen(token, token.contentLevelType.renderOpen(mode));
                foreach (pipelineTaskSubjectContentToken tkn in token)
                {
                    renderSub(output, tkn, mode);
                }
                output.AddClose(token.contentLevelType.renderClose(mode));
                break;
            }
        }
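
renderSub is a classic composite traversal: leaf tokens (mcToken) are rendered directly, while chunk, block and token-stream containers emit an opening marker, recurse into their children, then emit a closing marker. A self-contained sketch of the same traversal over a plain tree (Node, TreeRenderer and the bracket markers are illustrative):

using System.Collections.Generic;
using System.Text;

class Node
{
    public bool IsLeaf;
    public string Text = "";
    public List<Node> Children = new List<Node>();
}

static class TreeRenderer
{
    public static void Render(Node node, StringBuilder output)
    {
        if (node.IsLeaf)
        {
            output.Append(node.Text).Append(' ');    // leaf: emit the rendered token
            return;
        }
        output.Append("[ ");                         // container: open marker
        foreach (Node child in node.Children)
        {
            Render(child, output);                   // depth-first into children
        }
        output.Append("] ");                         // container: close marker
    }
}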
Example #6
        /// <summary>
        /// Prepares for parallel execution.
        /// </summary>
        /// <param name="tools">The tools.</param>
        /// <param name="_context">The context.</param>
        public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
        {
            if (caseKnowledgeSet == null)
            {
                caseKnowledgeSet = new webProjectKnowledgeSet();
            }

            if (items.Any())
            {
                experimentContext.notes.log("Mining Context was ready already.");
                return(caseKnowledgeSet);
            }
            DateTime startTime = DateTime.Now;

            experimentContext = _context;



            List <webCaseKnowledge> cases = new List <webCaseKnowledge>();

            folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysed industries (categories), web sites and other diagnostic data");

            // <----------------------------------------------------------------------------------------------------------------        [ performing pipeline ]
            experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");
            foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
            {
                var pipelineContext = GetContextForPipeline(tools, classSet);
                sitesByCategory.Add(classSet, new List <pipelineTaskMCSiteSubject>());

                if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
                {
                    throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
                }

                var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)]; // <----- preparing
                foreach (var site in sitesForContext)
                {
                    tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag <pipelineTaskSubjectContentToken>());
                    sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

                    webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

                    caseKnowledgeSet.Add(webCase);
                    cases.Add(webCase);
                }

                semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
                kn.name = classSet.name + "_general";
                kn.relatedItemPureName = classSet.name;
                kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
                kn.Deploy(classReportFolder, experimentContext.logger);
                knowledgeByClass.TryAdd(classSet, kn);
            }

            experimentContext.notes.log("Sorting tokens for all sites [in parallel]");
            Parallel.ForEach(tokenBySite.Keys, site =>
            {
                var leafs = site.getAllLeafs();
                foreach (var leaf in leafs)
                {
                    pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
                    if (token != null)
                    {
                        tokenBySite[site].Add(token);
                    }
                }
            });

            foreach (var c in cases)
            {
                c.tokens = tokenBySite[c.MCSiteSubject];
            }


            experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");


            Boolean useIntegratedApproach = false;



            if (useIntegratedApproach)
            {
                var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
                });
            }
            else
            {
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    IDocumentSetClass category             = pair.Key;
                    List <pipelineTaskMCSiteSubject> sites = pair.Value;

                    var lt = BuildLemmaTableForClass(tools, category, sites);
                    lt.Save();
                    // lt.SaveAs(classReportFolder.pathFor(lt.info.Name), imbSCI.Data.enums.getWritableFileMode.overwrite);
                });
            }

            experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
            tools.SaveCache();


            if (!useIntegratedApproach)
            {
                experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    BuildChunksForClass(tools, classSet);
                }



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
                }
            }

            if (tools.operation.doCreateDiagnosticMatrixAtStart)
            {
                experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");



                folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, various matrix derivatives, combined cloud and other diagnostic data");

                List <lemmaSemanticCloud> clouds         = new List <lemmaSemanticCloud>();
                List <lemmaSemanticCloud> filteredClouds = new List <lemmaSemanticCloud>();

                var converter = lemmaSemanticCloud.GetDGMLConverter();

                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    // experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);


                    var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
                    knowledgeByClass[classSet].semanticCloud.className = classSet.name;
                    clouds.Add(cloud);

                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }
                    else
                    {
                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }



                    knowledgeByClass[classSet].semanticCloudFiltered           = knowledgeByClass[classSet].semanticCloud.CloneIntoType <lemmaSemanticCloud>(true);
                    knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
                    filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
                }

                cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
                matrix.build(filteredClouds, experimentContext.logger);

                lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
                mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));


                var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

                var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
                File.WriteAllLines(p, reductions);



                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);


                matrix.ExportTextReports(matrixReport, true, "matrix_cf");
                matrix.ExportTextReports(matrixReport, false, "matrix_cf");

                lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
                mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Version of all-categories diagnostic Semantic Cloud, after Cloud Matrix filter was applied"));

                if (experimentContext.tools.operation.doUseSimpleGraphs)
                {
                    mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in Visual Studio"));
                }
                else
                {
                    converter = lemmaSemanticCloud.GetDGMLConverter();

                    converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in Visual Studio"));
                }


                // <-------- analysis -----------------------------------------------------------------------------------
                DataTableTypeExtended <freeGraphReport> cloudReports = new DataTableTypeExtended <freeGraphReport>();
                foreach (var cl in filteredClouds)
                {
                    freeGraphReport fgReport = new freeGraphReport(cl);
                    fgReport.Save(matrixReport);
                    cloudReports.AddRow(fgReport);
                }
                freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
                unifiedReport.Save(matrixReport);
                cloudReports.AddRow(unifiedReport);


                cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
                // <-------- analysis -----------------------------------------------------------------------------------



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var cloud = knowledgeByClass[classSet].semanticCloudFiltered; // .WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList());


                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after Cloud Matrix filter was applied - Open this in Visual Studio"));
                    }
                    else
                    {
                        converter = lemmaSemanticCloud.GetDGMLConverter();

                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after Cloud Matrix filter was applied - Open this in Visual Studio"));
                    }



                    //converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "DirectedGraphML file - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories (Open this with VS)"), imbSCI.Data.enums.getWritableFileMode.overwrite);
                }

                instanceCountCollection <String> tfcounter = new instanceCountCollection <string>();
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
                    wlt.DefaultView.Sort = "termFrequency desc";
                    var sorted = wlt.DefaultView.ToTable();
                    var tbl    = wlt.GetClonedShema <DataTable>(true);

                    tbl.CopyRowsFrom(sorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

                    var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
                    cht.DefaultView.Sort = "termFrequency desc";
                    var csorted = cht.DefaultView.ToTable();

                    tbl = cht.GetClonedShema <DataTable>(true);
                    tbl.CopyRowsFrom(csorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

                    tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);


                    knowledgeByClass[classSet].OnBeforeSave();
                }

                List <String> countSorted = tfcounter.getSorted();
                StringBuilder sb          = new StringBuilder();
                foreach (String s in countSorted)
                {
                    sb.AppendLine(String.Format("{1}  :  {0}", s, tfcounter[s]));
                }
                String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
                File.WriteAllText(pt, sb.ToString());
            }


            if (tools.operation.doFullDiagnosticReport)
            {
                experimentContext.notes.log("Generating full diagnostic report on classes...");
                DataTable rep = null;
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    rep = this.GetClassKnowledgeReport(classSet, rep);
                }
                rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);

                rep.AddExtra("Experiment: " + experimentContext.setup.name);

                rep.AddExtra("Info: " + experimentContext.setup.description);

                rep.SetDescription("Structural report for all classes in the experiment");
                rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
            }

            classReportFolder.generateReadmeFiles(appManager.AppInfo);


            experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");
            return(caseKnowledgeSet);
        }
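
One pattern in the method above is worth noting for reuse: the per-site ConcurrentBag instances are allocated serially before Parallel.ForEach starts, so the workers only ever call the thread-safe Add, and the Dictionary itself is never mutated concurrently. A minimal sketch of that collection pattern, with string keys standing in for the site subjects (ParallelCollect and CollectTokens are illustrative names):

using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Threading.Tasks;

static class ParallelCollect
{
    public static Dictionary<string, ConcurrentBag<string>> CollectTokens(
        Dictionary<string, List<string>> leafsBySite)
    {
        var tokenBySite = new Dictionary<string, ConcurrentBag<string>>();

        // Allocate every bag before the parallel loop: only the
        // thread-safe bags are written to from multiple threads.
        foreach (string site in leafsBySite.Keys)
        {
            tokenBySite.Add(site, new ConcurrentBag<string>());
        }

        Parallel.ForEach(leafsBySite.Keys, site =>
        {
            foreach (string leaf in leafsBySite[site])
            {
                tokenBySite[site].Add(leaf);    // safe: ConcurrentBag.Add is thread-safe
            }
        });

        return tokenBySite;
    }
}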