/// <summary>
/// Pairs the supplied lemmas with the lemma terms derived from the nodes returned by <c>GetNodes</c>,
/// keeping only the entries whose dictionary key is present on both sides.
/// </summary>
/// <param name="lemmas">Lemma terms to look up; they are first converted with <c>GetLemmaDictionary()</c>.</param>
/// <param name="reverse">
/// When <c>true</c>, each pair is added as (node-derived term, supplied lemma);
/// when <c>false</c> (default), as (supplied lemma, node-derived term).
/// </param>
/// <returns>A collection holding one pair per key found in both dictionaries; empty when no key matches.</returns>
public webLemmaTermPairCollection GetMatchingTerms(IEnumerable<webLemmaTerm> lemmas, Boolean reverse = false)
{
    webLemmaTermPairCollection pairs = new webLemmaTermPairCollection();

    // Index the input, fetch the nodes for those keys and index the nodes the same way.
    Dictionary<String, webLemmaTerm> inputByKey = lemmas.GetLemmaDictionary();
    List<freeGraphNodeBase> matchedNodes = GetNodes(inputByKey.Keys);
    Dictionary<String, webLemmaTerm> nodeTermsByKey = GetWebLemmaDictionary(matchedNodes);

    foreach (KeyValuePair<String, webLemmaTerm> nodeEntry in nodeTermsByKey)
    {
        webLemmaTerm inputTerm;
        if (!inputByKey.TryGetValue(nodeEntry.Key, out inputTerm))
        {
            // Key exists only on the node side - nothing to pair it with.
            continue;
        }

        if (reverse)
        {
            pairs.Add(nodeEntry.Value, inputTerm);
        }
        else
        {
            pairs.Add(inputTerm, nodeEntry.Value);
        }
    }

    return pairs;
}
/// <summary>
/// Accumulates, for every class collection in <paramref name="caseSet"/>, the cosine similarity between
/// the class lemma table and the lemma table of <paramref name="target"/> into the target's feature vectors.
/// The score is only computed while the <c>SVMSimilarity</c> feature is active.
/// </summary>
/// <param name="target">Case being scored; when <c>null</c>, a message is logged and <c>null</c> is returned.</param>
/// <param name="caseSet">Set whose values provide the per-class knowledge to compare against.</param>
/// <param name="logger">Log output; also passed on to the cosine similarity computation.</param>
/// <returns><c>target.data.featureVectors</c> after accumulation, or <c>null</c> when <paramref name="target"/> is <c>null</c>.</returns>
public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
{
    if (target == null)
    {
        logger.log("-- target is null -- [GetClassification]");
        return null;
    }

    tfidfFVExtractorKnowledge caseKnowledge = target.caseKnowledge as tfidfFVExtractorKnowledge;

    foreach (DocumentSetCaseCollection classCases in caseSet.Values)
    {
        tfidfFVExtractorKnowledge classKnowledge = classCases.classKnowledge as tfidfFVExtractorKnowledge;

        if (!SVMSimilarity.isActive)
        {
            continue;
        }

        webLemmaTermPairCollection overlap = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
        target.data.featureVectors[classCases.setClass.classID][SVMSimilarity] += overlap.GetCosineSimilarity(logger);
    }

    // Class selection is not performed here; the former call to
    // target.result.GetClassWithHighestScore() remains disabled.
    return target.data.featureVectors;
}
/// <summary>
/// Computes the SSRM similarity for the given term pairs.
/// Only pairs whose <c>entryA.name</c> is a node of this graph (per <c>ContainsNode</c>) contribute:
/// the numerator sums <c>entryA.weight * entryB.weight * node.weight</c>, the denominator sums
/// <c>entryA.weight * entryB.weight</c>, and the result is <c>numerator.GetRatio(denominator)</c>.
/// </summary>
/// <param name="lemmas">Term pairs to evaluate.</param>
/// <param name="logger">Optional log output; may be <c>null</c>. Used only to report a zero score.</param>
/// <param name="debug">
/// Optional computation trace; when supplied, every contributing term is printed to it and the
/// final numerator, denominator, similarity and term count are stored on it.
/// </param>
/// <returns>The ratio returned by <c>GetRatio</c> for the two accumulated sums.</returns>
public Double GetSSRM(webLemmaTermPairCollection lemmas, ILogBuilder logger = null, SSRMComputation debug = null)
{
    Double upper = 0;
    Double lowerA = 0;
    Int32 i = 0; // number of pairs that were found among the nodes

    foreach (webLemmaTermPair wlta in lemmas)
    {
        if (!ContainsNode(wlta.entryA.name))
        {
            continue;
        }

        i++;
        var node = GetNode(wlta.entryA.name);

        upper += wlta.entryA.weight * wlta.entryB.weight * node.weight;
        lowerA += wlta.entryA.weight * wlta.entryB.weight;

        if (debug != null)
        {
            debug.printTerm(i, wlta.entryA.name, wlta.entryA.weight, wlta.entryB.weight, node.weight, upper, lowerA);
        }
    }

    // NOTE(review): handling of a zero denominator is up to GetRatio, which is not visible here.
    Double output = upper.GetRatio(lowerA);

    if (debug != null)
    {
        debug.upper = upper;
        debug.lower = lowerA;
        debug.similarity = output;
        debug.terms = i;
        debug.printFinale();
    }

    // logger defaults to null, so it must be checked before use; previously a zero score
    // combined with the default argument ended in a NullReferenceException.
    if (output == 0 && logger != null)
    {
        logger.log("Semantic similarity returned 0 score!");
    }

    return output;
}
/// <summary>
/// Fills the feature vectors of <paramref name="target"/> with one score per class collection in
/// <paramref name="caseSet"/> for each active feature: <c>semanticSimilarity</c> (SSRM over the expanded cloud),
/// <c>cosineSemanticSimilarity</c>, <c>SVMSimilarity</c> and <c>SVMChunkSimilarity</c>.
/// When detailed reporting is enabled for the iteration, DGML, SSRM trace and data table reports are written
/// to the validation case sample folder.
/// </summary>
/// <param name="target">Case being scored; when <c>null</c>, a message is logged and <c>null</c> is returned.</param>
/// <param name="caseSet">Set whose values provide the per-class knowledge; also supplies report settings and folders.</param>
/// <param name="logger">Log output; also passed on to the similarity computations.</param>
/// <returns><c>target.data.featureVectors</c> after accumulation, or <c>null</c> when <paramref name="target"/> is <c>null</c>.</returns>
public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
{
    if (target == null)
    {
        logger.log("-- target is null -- [GetClassification]");
        return null;
    }

    semanticFVExtractorKnowledge caseKnowledge = target.caseKnowledge as semanticFVExtractorKnowledge;
    List<webLemmaTerm> caseTerms = caseKnowledge.WLTableOfIndustryClass.GetList();

    foreach (DocumentSetCaseCollection classCases in caseSet.Values)
    {
        var operation = caseSet.validationCase.context.tools.operation;

        // Detailed reports are produced when enabled; in random mode only for the
        // iterations whose draw from [0, 100) does not exceed the configured threshold.
        Boolean detailedReport = operation.doMakeClassificationReportForCases;
        if (operation.DoRandomCaseGraphReportMode && detailedReport)
        {
            detailedReport = rnd.Next(100) <= operation.In100RandomCaseGraphReport;
        }

        semanticFVExtractorKnowledge classKnowledge = classCases.classKnowledge as semanticFVExtractorKnowledge;

        if (semanticSimilarity.isActive)
        {
            var expandedCloud = classKnowledge.semanticCloudFiltered.ExpandTermsToCloud(caseTerms, settings.caseTermExpansionSteps, true, settings.caseTermExpansionOptions);
            webLemmaTermPairCollection termOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(caseTerms, true);

            // The SSRM trace exists exactly when a detailed report is requested.
            SSRMComputation ssrmTrace = detailedReport ? new SSRMComputation(classKnowledge.name, caseKnowledge.name) : null;

            Double Similarity = expandedCloud.GetSSRM(termOverlap, logger, ssrmTrace);

            target.data.featureVectors[classCases.setClass.classID][semanticSimilarity] += Similarity;
            target.data.featureVectors[classCases.setClass.classID].termMatched += termOverlap.Count;

            if (detailedReport)
            {
                freeGraphToDMGL converter = new freeGraphToDMGL();
                String dgmlOutput = "expandedCloud_" + caseKnowledge.name + "_" + classKnowledge.name + ".dgml";
                var dgml = converter.ConvertToDMGL(expandedCloud);

                // Attach the similarity fraction to the exported graph.
                var simNode = dgml.Nodes.AddNode("sim", "Sim(d,t) = " + ssrmTrace.similarity.ToString("F5"));
                var simUp = dgml.Nodes.AddNode("up", ssrmTrace.upper.ToString("F5"));
                var simLow = dgml.Nodes.AddNode("low", ssrmTrace.lower.ToString("F5"));
                dgml.Links.AddLink(simNode, simUp, "Above fraction");
                dgml.Links.AddLink(simNode, simLow, "Below fraction");

                dgml.Save(caseSet.validationCase.caseSampleFolder.pathFor(dgmlOutput, getWritableFileMode.autoRenameThis, "DGML export of expanded terms for [" + caseKnowledge.name + "] against class cloud [" + classKnowledge.name + "]"));
                logger.log("DGML Saved [" + dgmlOutput + "]");

                // Dump the textual SSRM trace next to the DGML file.
                File.WriteAllText(caseSet.validationCase.caseSampleFolder.pathFor(ssrmTrace.GetFilename(), imbSCI.Data.enums.getWritableFileMode.overwrite), ssrmTrace.sb.ToString());
            }
        }

        if (cosineSemanticSimilarity.isActive)
        {
            // NOTE(review): the result of this call is not used below; it is kept so that
            // behaviour stays identical - confirm whether it can be removed.
            var caseLemmaDictionary = lemmaSemanticCloud.GetWebLemmaDictionary(caseKnowledge.semanticCloud.nodes);

            List<webLemmaTerm> expandedTerms = classKnowledge.semanticCloudFiltered.ExpandTerms(caseTerms, settings.caseTermExpansionSteps, settings.caseTermExpansionOptions);
            var cloudOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(expandedTerms);

            if (detailedReport)
            {
                cloudOverlap.GetDataTable().GetReportAndSave(caseSet.validationCase.caseSampleFolder, appManager.AppInfo, "cosine_similarity_" + caseKnowledge.name + "_" + classKnowledge.name, true, operation.doReportsInParalell);
            }

            target.data.featureVectors[classCases.setClass.classID][cosineSemanticSimilarity] += cloudOverlap.GetCosineSimilarity(logger);
        }

        if (SVMSimilarity.isActive)
        {
            webLemmaTermPairCollection tableOverlap = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
            target.data.featureVectors[classCases.setClass.classID][SVMSimilarity] += tableOverlap.GetCosineSimilarity(logger);
        }

        if (SVMChunkSimilarity.isActive)
        {
            Double chunkScore = ComputeChunkSimilarity(classKnowledge, caseKnowledge);
            target.data.featureVectors[classCases.setClass.classID][SVMChunkSimilarity] += chunkScore;
        }
    }

    // The call to the classifier (class selection via GetClassWithHighestScore) should happen
    // at this point; it is currently disabled, as is the per-case text log.
    return target.data.featureVectors;
}

/// <summary>
/// Sums, over the chunks of the class cloud that are also present in the case chunk table, the ratio of
/// the chunk's document frequency to the table's maximum document frequency. Primary chunks count in
/// full, secondary chunks are weighted by 0.25.
/// </summary>
/// <param name="classKnowledge">Class knowledge supplying the primary and secondary chunks.</param>
/// <param name="caseKnowledge">Case knowledge supplying the chunk table that is looked up.</param>
/// <returns>The accumulated chunk similarity; 0 when no chunk is found in the case table.</returns>
private static Double ComputeChunkSimilarity(semanticFVExtractorKnowledge classKnowledge, semanticFVExtractorKnowledge caseKnowledge)
{
    var chunkTable = caseKnowledge.WLChunkTableOfIndustryClass;
    Double total = 0;

    foreach (var primary in classKnowledge.semanticCloudFiltered.primaryChunks)
    {
        if (chunkTable.ContainsKey(primary))
        {
            total += chunkTable[primary].documentFrequency.GetRatio(chunkTable.meta.maxDF);
        }
    }

    foreach (var secondary in classKnowledge.semanticCloudFiltered.secondaryChunks)
    {
        if (chunkTable.ContainsKey(secondary))
        {
            total += (chunkTable[secondary].documentFrequency.GetRatio(chunkTable.meta.maxDF)) * 0.25;
        }
    }

    return total;
}