/// <summary>
/// Feeds the feature vector values of <paramref name="target"/> to the decision machine of the
/// state resolved for <paramref name="caseSet"/>, and stores the decision in this classifier's
/// entry of the target data.
/// </summary>
/// <param name="target">Case whose feature vector values are passed to the machine.</param>
/// <param name="caseSet">Case collection set used, together with the experiment suffix, to look up the state.</param>
/// <param name="logger">Not used by this implementation.</param>
public override void DoSelect(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
{
    var experimentState = states.GetState(caseSet, GetExperimentSufix());
    var inputValues = target.data.featureVectors.GetValues().ToArray();

    Int32 decision = experimentState.machine.Decide(inputValues);
    target.data[this].SetValues(decision);
}
/// <summary>
/// For every class collection in <paramref name="caseSet"/>, adds the cosine similarity between the
/// class term table and the case term table to the SVMSimilarity feature of <paramref name="target"/>.
/// Nothing is added for a class while SVMSimilarity is not active.
/// </summary>
/// <param name="target">Case to score; when null, a message is logged and null is returned.</param>
/// <param name="caseSet">Class collections the case is compared against.</param>
/// <param name="logger">Receives the null-target message and is forwarded to the similarity computation.</param>
/// <returns><c>target.data.featureVectors</c>, or null when <paramref name="target"/> is null.</returns>
public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
{
    if (target == null)
    {
        logger.log("-- target is null -- [GetClassification]");
        return null;
    }

    // NOTE(review): both "as" casts are used without a null check; a knowledge object of another
    // type would surface as a NullReferenceException further down.
    var caseKnowledge = target.caseKnowledge as tfidfFVExtractorKnowledge;

    foreach (DocumentSetCaseCollection classCases in caseSet.Values)
    {
        var classKnowledge = classCases.classKnowledge as tfidfFVExtractorKnowledge;

        if (!SVMSimilarity.isActive)
        {
            continue;
        }

        webLemmaTermPairCollection sharedTerms = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
        target.data.featureVectors[classCases.setClass.classID][SVMSimilarity] += sharedTerms.GetCosineSimilarity(logger);
    }

    // No class is selected here; only the scores are accumulated.
    return target.data.featureVectors;
}
/// <summary>
/// Gets the knowledge instance for a document set case. The instance is requested under the name
/// "case_" + subject name with type <c>WebFVExtractorKnowledgeType.aboutDocumentSet</c>, and its
/// <c>relatedItemPureName</c> is set to the subject name.
/// </summary>
/// <typeparam name="T">Knowledge type to return.</typeparam>
/// <param name="setCase">Case whose subject name identifies the knowledge.</param>
/// <param name="validationCase">Validation case forwarded to the name-based overload.</param>
/// <param name="logger">Logger forwarded to the name-based overload.</param>
/// <returns>The knowledge instance returned by the name-based overload.</returns>
public T GetKnowledgeInstance<T>(DocumentSetCase setCase, kFoldValidationCase validationCase, ILogBuilder logger) where T : class, IWebFVExtractorKnowledge, new()
{
    String knowledgeName = "case_" + setCase.subject.name;

    T instance = GetKnowledgeInstance<T>(knowledgeName, validationCase, WebFVExtractorKnowledgeType.aboutDocumentSet, logger);
    instance.relatedItemPureName = setCase.subject.name;

    return instance;
}
/// <summary>
/// Registers the knowledge instance of <paramref name="setCase"/> in <c>savedKnowledge</c> and calls
/// its <c>OnBeforeSave</c>, unless the instance is already registered.
/// </summary>
/// <remarks>
/// Membership is checked once before taking <c>savedKnowledgeLock</c> and again while holding it.
/// NOTE(review): the first check runs outside the lock; confirm that the collection behind
/// <c>savedKnowledge</c> tolerates a read that is concurrent with <c>Add</c>.
/// </remarks>
/// <typeparam name="T">Knowledge type to resolve.</typeparam>
/// <param name="setCase">Case whose knowledge is registered.</param>
/// <param name="validationCase">Validation case used to resolve the knowledge instance.</param>
/// <param name="logger">Logger forwarded to the knowledge lookup.</param>
public void SaveCaseKnowledge<T>(DocumentSetCase setCase, kFoldValidationCase validationCase, ILogBuilder logger) where T : class, IWebFVExtractorKnowledge, new()
{
    IWebFVExtractorKnowledge instance = GetKnowledgeInstance<T>(setCase, validationCase, logger);

    if (savedKnowledge.Contains(instance))
    {
        return;
    }

    lock (savedKnowledgeLock)
    {
        // Another caller may have registered the instance while this one waited for the lock.
        if (savedKnowledge.Contains(instance))
        {
            return;
        }

        savedKnowledge.Add(instance);
        instance.OnBeforeSave();
    }
}
/// <summary>
/// Builds one row describing <paramref name="setCase"/> and appends it to <paramref name="output"/>.
/// </summary>
/// <remarks>
/// Columns written:
/// "name" (validation case name + "_" + case subject name), "Origin" (class name of the host
/// collection, only when the table has that column) and "Case" (case subject name).
/// Unless <paramref name="isTrainingCollection"/> is set: per classifier "ClassResultName" + classifier name
/// (name of the selected class, or "- not set -") and "EvalTrue" + classifier name (1 when the selected
/// class id equals <c>host.rightClassID</c>, otherwise 0), plus "Correct" (the hit count passed through
/// <c>GetRatio</c> with the classifier count).
/// Per class and active feature vector: feature name + "_" + class acronym, holding the case's value.
/// When <paramref name="doFVAnalysis"/> is set, per active feature vector: "FVRange" + name (standard
/// deviation of the values over all classes) and "CFV_Ratio" + name (position of the host class's
/// value within the range of those values).
/// </remarks>
/// <param name="host">Collection the case belongs to; supplies the class, the validation case and the expected class id.</param>
/// <param name="setCase">Case being reported.</param>
/// <param name="output">Table that creates and receives the row; the columns above must already exist.</param>
/// <param name="isTrainingCollection">When true, the classifier result columns are not written.</param>
/// <param name="doFVAnalysis">When true, the per-feature range columns are written.</param>
/// <returns>The row that was added to <paramref name="output"/>.</returns>
public static DataRow BuildRow(this DocumentSetCaseCollection host, DocumentSetCase setCase, DataTable output, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
{
    var setClass = host.setClass;
    var validationCase = host.validationCase;

    DataRow dr = output.NewRow();
    dr["name"] = validationCase.name + "_" + setCase.subject.name;

    if (output.Columns.Contains("Origin"))
    {
        dr["Origin"] = setClass.name;
    }

    dr["Case"] = setCase.subject.name;

    if (!isTrainingCollection)
    {
        Int32 correctCount = 0;

        foreach (var cl in validationCase.context.setup.classifiers)
        {
            String selectedName = "- not set -";
            Int32 isCorrect = 0;

            // Resolve the classifier's selection once instead of once per use.
            var selected = setCase.data[cl].selected;
            if (selected != null)
            {
                selectedName = selected.name;
                if (selected.classID == host.rightClassID)
                {
                    isCorrect = 1;
                }
            }

            dr["ClassResultName" + cl.name] = selectedName;
            dr["EvalTrue" + cl.name] = isCorrect;
            correctCount += isCorrect;
        }

        dr["Correct"] = correctCount.GetRatio(validationCase.context.setup.classifiers.Count);
    }

    foreach (var cl in setCase.data.setClassCollection.GetClasses())
    {
        foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
        {
            if (fv.isActive)
            {
                dr[fv.name + "_" + cl.treeLetterAcronim] = setCase.data.featureVectors[cl.classID][fv];
            }
        }
    }

    if (doFVAnalysis)
    {
        // One range finder per active feature vector, fed with the case's value for every class.
        var rangers = new Dictionary<String, rangeFinderWithData>();

        foreach (var cl in setCase.data.setClassCollection.GetClasses())
        {
            foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
            {
                if (!fv.isActive)
                {
                    continue;
                }

                rangeFinderWithData ranger;
                if (!rangers.TryGetValue(fv.name, out ranger))
                {
                    ranger = new rangeFinderWithData(fv.name);
                    rangers.Add(fv.name, ranger);
                }

                ranger.Learn(setCase.data.featureVectors[cl.classID][fv]);
            }
        }

        foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
        {
            if (!fv.isActive)
            {
                continue;
            }

            rangeFinderWithData ranger;
            if (!rangers.TryGetValue(fv.name, out ranger))
            {
                // No class contributed a value (empty class collection), so there is no range
                // to report; leave the analysis columns unset instead of failing on a missing key.
                continue;
            }

            dr["FVRange" + fv.name] = ranger.doubleEntries.GetStdDeviation(false);
            dr["CFV_Ratio" + fv.name] = ranger.GetPositionInRange(setCase.data.featureVectors[setClass.classID][fv]);
        }
    }

    output.Rows.Add(dr);
    return dr;
}
/// <summary>
/// Selection step of a classifier. The declaration is abstract and has no body, so the behavior is
/// supplied entirely by the overriding class.
/// </summary>
/// <param name="target">Document set case handed to the implementation.</param>
/// <param name="caseSet">Case collection set handed to the implementation.</param>
/// <param name="logger">Log builder handed to the implementation.</param>
public abstract void DoSelect(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger);
/// <summary>
/// Adds the similarity scores of every active feature to the feature vectors of
/// <paramref name="target"/>, one class collection of <paramref name="caseSet"/> at a time.
/// </summary>
/// <remarks>
/// Per class collection, each step runs only while its feature is active:
/// semanticSimilarity (<c>GetSSRM</c> on the class cloud expanded with the case terms; also increases
/// <c>termMatched</c> by the number of matching terms),
/// cosineSemanticSimilarity (cosine similarity of the class cloud against the expanded case terms),
/// SVMSimilarity (cosine similarity between the class and case term tables) and
/// SVMChunkSimilarity (sum of <c>documentFrequency.GetRatio(meta.maxDF)</c> for class-cloud chunks found
/// in the case chunk table, with secondary chunks multiplied by 0.25).
/// When detailed reporting applies, a DGML graph, the SSRM trace text and a cosine similarity table
/// are written to the case sample folder. No class is selected here; scores are only accumulated.
/// </remarks>
/// <param name="target">Case to score; when null, a message is logged and null is returned.</param>
/// <param name="caseSet">Class collections to compare against; also supplies the report settings and folder.</param>
/// <param name="logger">Receives log messages and is forwarded to the similarity computations.</param>
/// <returns><c>target.data.featureVectors</c>, or null when <paramref name="target"/> is null.</returns>
public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
{
    if (target == null)
    {
        logger.log("-- target is null -- [GetClassification]");
        return null;
    }

    // NOTE(review): the "as" casts here and in the loop are used without a null check.
    var caseKnowledge = target.caseKnowledge as semanticFVExtractorKnowledge;
    List<webLemmaTerm> caseTerms = caseKnowledge.WLTableOfIndustryClass.GetList();

    foreach (DocumentSetCaseCollection classCases in caseSet.Values)
    {
        var operation = caseSet.validationCase.context.tools.operation;

        // Detailed reports are produced when enabled; in random mode only for a random share of the classes.
        Boolean reportInDetail = operation.doMakeClassificationReportForCases;
        if (operation.DoRandomCaseGraphReportMode && reportInDetail)
        {
            // NOTE(review): if rnd is a System.Random, Next(100) yields 0..99, so "<=" lets
            // (threshold + 1) out of 100 draws through - confirm this is intended.
            Int32 roll = rnd.Next(100);
            reportInDetail = roll <= operation.In100RandomCaseGraphReport;
        }

        var classKnowledge = classCases.classKnowledge as semanticFVExtractorKnowledge;

        if (semanticSimilarity.isActive)
        {
            var expandedCloud = classKnowledge.semanticCloudFiltered.ExpandTermsToCloud(caseTerms, settings.caseTermExpansionSteps, true, settings.caseTermExpansionOptions);
            webLemmaTermPairCollection matchedTerms = classKnowledge.semanticCloudFiltered.GetMatchingTerms(caseTerms, true);

            // The trace object exists only when a detailed report was requested.
            SSRMComputation trace = reportInDetail ? new SSRMComputation(classKnowledge.name, caseKnowledge.name) : null;

            Double ssrmScore = expandedCloud.GetSSRM(matchedTerms, logger, trace);
            target.data.featureVectors[classCases.setClass.classID][semanticSimilarity] += ssrmScore;
            target.data.featureVectors[classCases.setClass.classID].termMatched += matchedTerms.Count;

            if (reportInDetail)
            {
                var converter = new freeGraphToDMGL();
                String dgmlFileName = "expandedCloud_" + caseKnowledge.name + "_" + classKnowledge.name + ".dgml";
                var dgmlGraph = converter.ConvertToDMGL(expandedCloud);

                if (trace != null)
                {
                    // Attach the traced similarity and its two fraction parts to the exported graph.
                    var similarityNode = dgmlGraph.Nodes.AddNode("sim", "Sim(d,t) = " + trace.similarity.ToString("F5"));
                    var upperNode = dgmlGraph.Nodes.AddNode("up", trace.upper.ToString("F5"));
                    var lowerNode = dgmlGraph.Nodes.AddNode("low", trace.lower.ToString("F5"));
                    dgmlGraph.Links.AddLink(similarityNode, upperNode, "Above fraction");
                    dgmlGraph.Links.AddLink(similarityNode, lowerNode, "Below fraction");
                }

                var dgmlPath = caseSet.validationCase.caseSampleFolder.pathFor(
                    dgmlFileName,
                    getWritableFileMode.autoRenameThis,
                    "DGML export of expanded terms for [" + caseKnowledge.name + "] against class cloud [" + classKnowledge.name + "]");
                dgmlGraph.Save(dgmlPath);

                logger.log("DGML Saved [" + dgmlFileName + "]");
            }

            if (trace != null)
            {
                var tracePath = caseSet.validationCase.caseSampleFolder.pathFor(trace.GetFilename(), imbSCI.Data.enums.getWritableFileMode.overwrite);
                File.WriteAllText(tracePath, trace.sb.ToString());
            }
        }

        if (cosineSemanticSimilarity.isActive)
        {
            // NOTE(review): this dictionary is never read in this method; the call is kept because
            // its side effects, if any, are not visible from here.
            var caseLemmaDictionary = lemmaSemanticCloud.GetWebLemmaDictionary(caseKnowledge.semanticCloud.nodes);

            List<webLemmaTerm> expandedCaseTerms = classKnowledge.semanticCloudFiltered.ExpandTerms(caseTerms, settings.caseTermExpansionSteps, settings.caseTermExpansionOptions);
            var expandedOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(expandedCaseTerms);

            if (reportInDetail)
            {
                var overlapTable = expandedOverlap.GetDataTable();
                overlapTable.GetReportAndSave(
                    caseSet.validationCase.caseSampleFolder,
                    appManager.AppInfo,
                    "cosine_similarity_" + caseKnowledge.name + "_" + classKnowledge.name,
                    true,
                    operation.doReportsInParalell);
            }

            target.data.featureVectors[classCases.setClass.classID][cosineSemanticSimilarity] += expandedOverlap.GetCosineSimilarity(logger);
        }

        if (SVMSimilarity.isActive)
        {
            webLemmaTermPairCollection tableOverlap = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
            target.data.featureVectors[classCases.setClass.classID][SVMSimilarity] += tableOverlap.GetCosineSimilarity(logger);
        }

        if (SVMChunkSimilarity.isActive)
        {
            Double chunkScore = 0;

            foreach (var primaryChunk in classKnowledge.semanticCloudFiltered.primaryChunks)
            {
                if (!caseKnowledge.WLChunkTableOfIndustryClass.ContainsKey(primaryChunk))
                {
                    continue;
                }

                chunkScore += caseKnowledge.WLChunkTableOfIndustryClass[primaryChunk].documentFrequency.GetRatio(caseKnowledge.WLChunkTableOfIndustryClass.meta.maxDF);
            }

            foreach (var secondaryChunk in classKnowledge.semanticCloudFiltered.secondaryChunks)
            {
                if (!caseKnowledge.WLChunkTableOfIndustryClass.ContainsKey(secondaryChunk))
                {
                    continue;
                }

                // Secondary chunks contribute a quarter of their ratio.
                chunkScore += (caseKnowledge.WLChunkTableOfIndustryClass[secondaryChunk].documentFrequency.GetRatio(caseKnowledge.WLChunkTableOfIndustryClass.meta.maxDF)) * 0.25;
            }

            target.data.featureVectors[classCases.setClass.classID][SVMChunkSimilarity] += chunkScore;
        }
    }

    // Original author's note (translated): the call to the classifier should happen at this point.
    // The class selection is not performed here; only the scores are returned.
    return target.data.featureVectors;
}