/// <summary>
/// Weaves additional links into the specified semantic cloud, based on n-gram word similarity
/// (and, when re-enabled, dictionary synonyms).
/// </summary>
/// <param name="cloud">The semantic cloud to enrich with new links (modified in place).</param>
/// <param name="logger">Optional logger; may be null, in which case nothing is logged.</param>
/// <returns>Result object describing the weaving session (link ratios, similarity pairs).</returns>
public lemmaSemanticWeaverResult Process(lemmaSemanticCloud cloud, ILogBuilder logger)
{
    // Remember the initial link count so we can report how many links were added.
    Int32 c = cloud.links.Count;

    lemmaSemanticWeaverResult weaverResult = new lemmaSemanticWeaverResult(cloud);

    List<String> words = cloud.nodes.Select(x => x.name).ToList();

    if (useSimilarity)
    {
        // Only allocate/compute the similarity result set when similarity weaving is enabled.
        wordSimilarityResultSet output = similarWords.GetResult(words);

        foreach (var pair in output)
        {
            // Add a link only when the pair is not already connected, to avoid duplicates.
            var link = cloud.GetLink(pair.wordA, pair.wordB);
            if (link == null)
            {
                cloud.AddLink(pair.wordA, pair.wordB, pair.score, LINK_OF_SIMILARWORDS);
            }
        }

        weaverResult.linkRatioAfterWS = cloud.GetLinkPerNodeRatio();
        weaverResult.similarWords = output;
    }

    /* NOT WORKING ---- TEMPORARILY DISABLED
     * NOTE(review): dictionary-synonym weaving kept for reference until the Apertium query issue is resolved.
     * if (useDictionary)
     * {
     *     Stack<String> wordsToTest = new Stack<string>();
     *     words.ForEach(x => wordsToTest.Push(x));
     *
     *     while (wordsToTest.Any())
     *     {
     *         String word = wordsToTest.Pop();
     *         apertiumDictionaryResult result = apertium.queryForSynonyms(word, apertiumDictQueryScope.exact);
     *
     *         var synonims = result.GetNativeWords();
     *
     *         var synonimNodes = wordsToTest.Where(x => synonims.Contains(x));
     *
     *         foreach (String syn in synonimNodes)
     *         {
     *             wordsToTest.RemoveAll(x => x == syn);
     *             cloud.AddLink(word, syn, 1, LINK_OF_DICTIONARYSYNONIMS);
     *
     *             weaverResult.appertiumNotes.Add(word + " -> " + syn);
     *         }
     *         weaverResult.linkRatioAfterDS = cloud.GetLinkPerNodeRatio();
     *     }
     * }*/

    if (c != cloud.links.Count)
    {
        // Null-conditional call: logger is optional.
        logger?.log("Weaver created [" + (cloud.links.Count - c) + "] new links in the cloud [" + cloud.className + "]");
    }

    return weaverResult;
}
/// <summary>
/// Builds a single cloud that is the union of all clouds registered as first-level keys of this matrix.
/// </summary>
/// <param name="nameForCloud">Name for the resulting cloud; defaults to "UnifiedCloud" when empty.</param>
/// <returns>A new <see cref="lemmaSemanticCloud"/> containing the merged content of all member clouds.</returns>
public lemmaSemanticCloud GetUnifiedCloud(String nameForCloud = "")
{
    lemmaSemanticCloud unified = new lemmaSemanticCloud();

    // Fall back to a default name when the caller supplied none.
    unified.name = nameForCloud.isNullOrEmpty() ? "UnifiedCloud" : nameForCloud;

    unified.description = "Created as union of clouds: ";

    var descriptionBuilder = new StringBuilder();
    descriptionBuilder.Append(unified.description);

    foreach (lemmaSemanticCloud member in this.Get1stKeys())
    {
        // Index is rebuilt before each merge step (and once more at the end).
        unified.RebuildIndex();
        unified.AddCloud(member);
        descriptionBuilder.Append(member.name);
        descriptionBuilder.Append(";");
    }

    unified.description = descriptionBuilder.ToString();
    unified.RebuildIndex();

    return unified;
}
/// <summary>
/// Creates a weaver result bound to the given cloud, capturing its class name and
/// initial link-per-node ratio before any weaving is performed.
/// </summary>
/// <param name="_cloud">The cloud this result describes.</param>
public lemmaSemanticWeaverResult(lemmaSemanticCloud _cloud)
{
    cloud = _cloud;
    cloudClassName = cloud.className;
    linkRatioInitial = cloud.GetLinkPerNodeRatio();
}
/// <summary>
/// Gets the numeric value for the matrix cell addressed by clouds <paramref name="x"/> and <paramref name="y"/>,
/// interpreted according to the flags in <paramref name="type"/>: overlap weight sum, overlap size,
/// or min/max cloud frequency — each either absolute or normalized, at initial or current state.
/// </summary>
/// <param name="x">The row cloud.</param>
/// <param name="y">The column cloud.</param>
/// <param name="type">Flag combination selecting which metric to compute and how to scale it.</param>
/// <param name="counter">Term-frequency counter used for the min/max cloud-frequency metrics.</param>
/// <returns>The computed cell value; 0 for diagonal cells in overlap-size modes.</returns>
public Double GetCellNumber(lemmaSemanticCloud x, lemmaSemanticCloud y, cloudMatrixDataTableType type, instanceCountCollection <String> counter)
{
    Double output = 0;

    // Nodes currently stored in this cell of the matrix.
    List <freeGraphNodeBase> selected = this[x, y];

    // Seeds deliberately inverted (min starts at the maximum, max at the minimum)
    // so the Math.Min/Math.Max folds below converge onto the observed range.
    Double min = MaxCloudFrequency;
    Double max = MinCloudFrequency;

    // Overlap-value metric: sum of node weights, either from the initially
    // recorded cell content or from a freshly computed overlap.
    if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
    {
        if (type.HasFlag(cloudMatrixDataTableType.initialState))
        {
            output = selected.Sum(s => s.weight);
        }
        else
        {
            output = x.GetOverlap(y).Sum(s => s.weight);
        }
    }

    // The remaining metrics only apply when the overlap-value branch produced nothing
    // (either not requested, or its sum happened to be zero).
    if (output == 0)
    {
        if (type.HasFlag(cloudMatrixDataTableType.normalizedValues))
        {
            if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
            {
                if (type.HasFlag(cloudMatrixDataTableType.initialState))
                {
                    // Overlap size normalized against the largest overlap in the matrix.
                    output = selected.Count.GetRatio(MaxOverlap);
                }
                else
                {
                    if (x == y)
                    {
                        // Diagonal cells report 0 rather than full self-overlap.
                        output = 0;
                    }
                    else
                    {
                        // Current overlap size relative to the initially recorded cell size.
                        output = x.GetOverlap(y).Count.GetRatio(selected.Count);
                    }
                }
            }
            else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency) || type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
            {
                // Fold the counter values of all cell nodes into the min/max range.
                for (int i = 0; i < selected.Count; i++)
                {
                    freeGraphNodeBase ne = selected[i];
                    min = Math.Min(min, (Double)counter[ne.name]);
                    max = Math.Max(max, (Double)counter[ne.name]);
                }
                if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency))
                {
                    output = max.GetRatio(MaxCloudFrequency);
                }
                else
                {
                    output = min.GetRatio(MinCloudFrequency);
                }
            }
        }
        else
        {
            // Absolute (non-normalized) variants of the same two metrics.
            if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
            {
                if (type.HasFlag(cloudMatrixDataTableType.initialState))
                {
                    output = selected.Count;
                }
                else
                {
                    if (x == y)
                    {
                        // Diagonal cells report 0 rather than full self-overlap.
                        output = 0;
                    }
                    else
                    {
                        output = x.GetOverlap(y).Count;
                    }
                }
            }
            else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency) || type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
            {
                for (int i = 0; i < selected.Count; i++)
                {
                    freeGraphNodeBase ne = selected[i];
                    min = Math.Min(min, (Double)counter[ne.name]);
                    max = Math.Max(max, (Double)counter[ne.name]);
                }
                if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency))
                {
                    output = max;
                }
                else
                {
                    output = min;
                }
            }
        }
    }
    return(output);
}
/// <summary>
/// Prepares the mining context for parallel execution: runs the decomposition pipeline per class,
/// collects tokens per site, builds lemma/chunk tables, and optionally produces diagnostic
/// matrix reports and a full structural class report.
/// </summary>
/// <param name="tools">The classifier tools used by the pipeline and for caching.</param>
/// <param name="_context">The experiment execution context (stored into <c>experimentContext</c>).</param>
/// <returns>The populated case knowledge set; returned early unchanged when items already exist.</returns>
public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
{
    if (caseKnowledgeSet == null) { caseKnowledgeSet = new webProjectKnowledgeSet(); }

    // Already prepared on an earlier call — nothing to do.
    if (items.Any())
    {
        experimentContext.notes.log("Mining Context was ready already.");
        return(caseKnowledgeSet);
    }

    DateTime startTime = DateTime.Now;
    experimentContext = _context;

    List <webCaseKnowledge> cases = new List <webCaseKnowledge>();

    folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysied industries (categories), web sites and other diagnostic data");

    // <---------------------------------------------------------------------------------------------------------------- [ performing pipeline ]
    experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");

    foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
    {
        // Run the decomposition pipeline for this class and register its site subjects.
        var pipelineContext = GetContextForPipeline(tools, classSet);
        sitesByCategory.Add(classSet, new List <pipelineTaskMCSiteSubject>());

        if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
        {
            throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
        }

        var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)];

        // <----- preparing: one token bag and one knowledge case per site.
        foreach (var site in sitesForContext)
        {
            tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag <pipelineTaskSubjectContentToken>());
            sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

            webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

            caseKnowledgeSet.Add(webCase);
            cases.Add(webCase);
        }

        // Per-class "general" (whole-category) knowledge container.
        semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
        kn.name = classSet.name + "_general";
        kn.relatedItemPureName = classSet.name;
        kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
        kn.Deploy(classReportFolder, experimentContext.logger);
        knowledgeByClass.TryAdd(classSet, kn);
    }

    experimentContext.notes.log("Sorting tokens for all sites [in parallel]");

    // Each parallel body only writes to its own site's ConcurrentBag.
    Parallel.ForEach(tokenBySite.Keys, site =>
    {
        var leafs = site.getAllLeafs();
        foreach (var leaf in leafs)
        {
            pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
            if (token != null)
            {
                tokenBySite[site].Add(token);
            }
        }
    });

    foreach (var c in cases)
    {
        c.tokens = tokenBySite[c.MCSiteSubject];
    }

    experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");

    // Hard-coded switch between the integrated extraction path and the lemma-table path.
    Boolean useIntegratedApproach = false;

    if (useIntegratedApproach)
    {
        var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);

        Parallel.ForEach(sitesByCategory, pair =>
        {
            knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
        });
    }
    else
    {
        Parallel.ForEach(sitesByCategory, pair =>
        {
            IDocumentSetClass category = pair.Key;
            List <pipelineTaskMCSiteSubject> sites = pair.Value;

            var lt = BuildLemmaTableForClass(tools, category, sites);

            lt.Save();
            // lt.SaveAs(classReportFolder.pathFor(lt.info.Name), imbSCI.Data.enums.getWritableFileMode.overwrite);
        });
    }

    experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
    tools.SaveCache();

    if (!useIntegratedApproach)
    {
        experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            BuildChunksForClass(tools, classSet);
        }

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
        }
    }

    if (tools.operation.doCreateDiagnosticMatrixAtStart)
    {
        experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");

        folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, varous matrix derivates, combined cloud and other diagnostic things");

        List <lemmaSemanticCloud> clouds = new List <lemmaSemanticCloud>();
        List <lemmaSemanticCloud> filteredClouds = new List <lemmaSemanticCloud>();

        var converter = lemmaSemanticCloud.GetDGMLConverter();

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            // experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);

            // Build the full-sample diagnostic cloud for this class.
            var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
            knowledgeByClass[classSet].semanticCloud.className = classSet.name;
            clouds.Add(cloud);

            // Export either a simple graph or a DGML rendering, depending on configuration.
            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }
            else
            {
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }

            // Keep a filtered clone that the matrix transformation below will reduce.
            knowledgeByClass[classSet].semanticCloudFiltered = knowledgeByClass[classSet].semanticCloud.CloneIntoType <lemmaSemanticCloud>(true);
            knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
            filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
        }

        cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
        matrix.build(filteredClouds, experimentContext.logger);

        lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
        mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));

        // Apply the semantic cloud filter and record which nodes were reduced.
        var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

        var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
        File.WriteAllLines(p, reductions);

        // Initial-state matrix tables: max cloud frequency, overlap size, overlap value.
        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);

        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);

        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);

        matrix.ExportTextReports(matrixReport, true, "matrix_cf");
        matrix.ExportTextReports(matrixReport, false, "matrix_cf");

        lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
        mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object -Version of all-categories diagnostic Semantic Cloud, after Cloud Matrix filter was applied"));

        if (experimentContext.tools.operation.doUseSimpleGraphs)
        {
            mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
        }
        else
        {
            converter = lemmaSemanticCloud.GetDGMLConverter();
            converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
        }

        // <-------- analysis -----------------------------------------------------------------------------------
        // Per-cloud structural reports plus one for the unified (reduced) cloud.
        DataTableTypeExtended <freeGraphReport> cloudReports = new DataTableTypeExtended <freeGraphReport>();

        foreach (var cl in filteredClouds)
        {
            freeGraphReport fgReport = new freeGraphReport(cl);
            fgReport.Save(matrixReport);
            cloudReports.AddRow(fgReport);
        }

        freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
        unifiedReport.Save(matrixReport);
        cloudReports.AddRow(unifiedReport);

        cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
        // <-------- analysis -----------------------------------------------------------------------------------

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var cloud = knowledgeByClass[classSet].semanticCloudFiltered; // .WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList());

            // NOTE(review): output filename "unified_initial_cloud" is reused for every class here —
            // looks like a copy-paste of the block above; verify against intended per-class output.
            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
            }
            else
            {
                converter = lemmaSemanticCloud.GetDGMLConverter();
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
            }

            //converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "DirectedGraphML file - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories (Open this with VS)"), imbSCI.Data.enums.getWritableFileMode.overwrite);
        }

        // Aggregate unresolved-token frequencies across all classes while exporting top-100 term tables.
        instanceCountCollection <String> tfcounter = new instanceCountCollection <string>();

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
            wlt.DefaultView.Sort = "termFrequency desc";
            var sorted = wlt.DefaultView.ToTable();

            var tbl = wlt.GetClonedShema <DataTable>(true);

            tbl.CopyRowsFrom(sorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

            var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
            cht.DefaultView.Sort = "termFrequency desc";
            var csorted = cht.DefaultView.ToTable();

            tbl = cht.GetClonedShema <DataTable>(true);
            tbl.CopyRowsFrom(csorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

            tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);

            knowledgeByClass[classSet].OnBeforeSave();
        }

        List <String> countSorted = tfcounter.getSorted();
        StringBuilder sb = new StringBuilder();
        foreach (String s in countSorted)
        {
            sb.AppendLine(String.Format("{1} : {0}", s, tfcounter[s]));
        }

        String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
        File.WriteAllText(pt, sb.ToString());
    }

    if (tools.operation.doFullDiagnosticReport)
    {
        experimentContext.notes.log("Generating full diagnostic report on classes...");
        DataTable rep = null;
        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            // Each call appends this class's rows to the accumulating report table.
            rep = this.GetClassKnowledgeReport(classSet, rep);
        }
        rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);
        rep.AddExtra("Experiment: " + experimentContext.setup.name);
        rep.AddExtra("Info: " + experimentContext.setup.description);
        rep.SetDescription("Structural report for all classes in the experiment");
        rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
    }

    classReportFolder.generateReadmeFiles(appManager.AppInfo);

    experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");

    return(caseKnowledgeSet);
}
/// <summary>
/// Exercises the cloud weaver over all test clouds using three similarity equations
/// (Dice, Jaccard, continual overlap), saving a report for each run.
/// </summary>
public void TestCloudWeaver()
{
    // Folder layout: output, cloud fixtures, and shared resources.
    folderNode rootFolder = new folderNode();
    folderNode weaverFolder = rootFolder.Add("NLP\\CloudWeaver", "Cloud Weaver", "Folder with results of cloud weaver tests");
    folderNode cloudFolder = rootFolder.Add("Clouds", "Test resources", "");
    folderNode resourceFolder = rootFolder.Add("resources", "Test resources", "");

    // Configure the weaver: similarity on, dictionary off.
    lemmaSemanticWeaver semanticWeaver = new lemmaSemanticWeaver();
    semanticWeaver.prepare(resourceFolder, null);
    semanticWeaver.useSimilarity = true;
    semanticWeaver.similarWords.N = 2;
    semanticWeaver.similarWords.gramConstruction = nGramsModeEnum.overlap;
    semanticWeaver.similarWords.treshold = 0.6;
    semanticWeaver.similarWords.equation = nGramsSimilarityEquationEnum.DiceCoefficient;
    semanticWeaver.useDictionary = false;

    var cloudPaths = cloudFolder.findFiles("*_initialCloud.xml", SearchOption.TopDirectoryOnly);

    // Pass 1: Dice coefficient — also exports each cloud's initial graph.
    foreach (String cloudPath in cloudPaths)
    {
        lemmaSemanticCloud testCloud = lemmaSemanticCloud.Load <lemmaSemanticCloud>(cloudPath);
        testCloud.GetSimpleGraph(false).Save(weaverFolder.pathFor(testCloud.className + "_initial.dgml", imbSCI.Data.enums.getWritableFileMode.overwrite), imbSCI.Data.enums.getWritableFileMode.overwrite);
        var weaveReport = semanticWeaver.Process(testCloud, null);
        weaveReport.Save(weaverFolder, "DiceCoefficient");
    }

    // Pass 2: Jaccard index.
    semanticWeaver.similarWords.equation = nGramsSimilarityEquationEnum.JaccardIndex;
    foreach (String cloudPath in cloudPaths)
    {
        lemmaSemanticCloud testCloud = lemmaSemanticCloud.Load <lemmaSemanticCloud>(cloudPath);
        var weaveReport = semanticWeaver.Process(testCloud, null);
        weaveReport.Save(weaverFolder, "JaccardIndex");
    }

    // Pass 3: continual overlap ratio — also serializes and exports the weaved clouds.
    semanticWeaver.similarWords.equation = nGramsSimilarityEquationEnum.continualOverlapRatio;
    foreach (String cloudPath in cloudPaths)
    {
        lemmaSemanticCloud testCloud = lemmaSemanticCloud.Load <lemmaSemanticCloud>(cloudPath);
        var weaveReport = semanticWeaver.Process(testCloud, null);
        weaveReport.Save(weaverFolder, "ContinualOverlap");
        objectSerialization.saveObjectToXML(testCloud, weaverFolder.pathFor(testCloud.className + "_weaved.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Processed cloud"));
        testCloud.GetSimpleGraph(false).Save(weaverFolder.pathFor(testCloud.className + "_weaved.dgml", imbSCI.Data.enums.getWritableFileMode.overwrite), imbSCI.Data.enums.getWritableFileMode.overwrite);
    }

    //weaver.similarWords.equation = nGramsSimilarityEquationEnum.continualOverlapRatio;
    //weaver.useDictionary = true;

    //foreach (String path in cloudPaths)
    //{
    //    lemmaSemanticCloud testCloud = lemmaSemanticCloud.Load<lemmaSemanticCloud>(path);

    //    var report = weaver.Process(testCloud, null);

    //    report.Save(weaverFolder, "JaccardIndexAndApertium");
    //}

    rootFolder.generateReadmeFiles(new imbSCI.Core.data.aceAuthorNotation());
}