Ejemplo n.º 1
0
        /// <summary>
        /// Pairs the supplied lemmas with the terms derived from the nodes that <c>GetNodes</c> returns for the lemma keys.
        /// </summary>
        /// <remarks>
        /// A pair is produced for every key that is present both in the dictionary built from
        /// <paramref name="lemmas"/> and in the dictionary built from the retrieved nodes.
        /// </remarks>
        /// <param name="lemmas">The lemmas to match against the nodes.</param>
        /// <param name="reverse">
        /// When <c>true</c>, the node-derived term is added as the first member of each pair and the supplied
        /// lemma as the second; when <c>false</c> (default) the supplied lemma comes first.
        /// </param>
        /// <returns>Collection holding one pair per key shared by both dictionaries.</returns>
        public webLemmaTermPairCollection GetMatchingTerms(IEnumerable <webLemmaTerm> lemmas, Boolean reverse = false)
        {
            Dictionary <String, webLemmaTerm> suppliedTerms = lemmas.GetLemmaDictionary();
            List <freeGraphNodeBase>          matchedNodes  = GetNodes(suppliedTerms.Keys);
            Dictionary <String, webLemmaTerm> nodeTerms     = GetWebLemmaDictionary(matchedNodes);

            webLemmaTermPairCollection pairs = new webLemmaTermPairCollection();

            foreach (KeyValuePair <String, webLemmaTerm> nodeEntry in nodeTerms)
            {
                webLemmaTerm suppliedTerm;

                // Node terms without a counterpart among the supplied lemmas produce no pair.
                if (!suppliedTerms.TryGetValue(nodeEntry.Key, out suppliedTerm))
                {
                    continue;
                }

                webLemmaTerm first  = reverse ? nodeEntry.Value : suppliedTerm;
                webLemmaTerm second = reverse ? suppliedTerm : nodeEntry.Value;

                pairs.Add(first, second);
            }

            return pairs;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Adds, for every class collection in <paramref name="caseSet"/>, the cosine similarity between the
        /// class term table and the case term table to the <c>SVMSimilarity</c> feature of <paramref name="target"/>.
        /// </summary>
        /// <remarks>
        /// Nothing is computed while <c>SVMSimilarity.isActive</c> is <c>false</c>. A class is skipped (with a log
        /// message) when either the case knowledge or the class knowledge is not a
        /// <c>tfidfFVExtractorKnowledge</c>, instead of failing with a null dereference.
        /// </remarks>
        /// <param name="target">The case to classify; when <c>null</c> a message is logged and <c>null</c> is returned.</param>
        /// <param name="caseSet">The set of class collections the case is compared against.</param>
        /// <param name="logger">The logger that receives diagnostic messages.</param>
        /// <returns><c>target.data.featureVectors</c>, or <c>null</c> when <paramref name="target"/> is <c>null</c>.</returns>
        public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
        {
            if (target == null)
            {
                logger.log("-- target is null -- [GetClassification]");
                return(null);
            }

            tfidfFVExtractorKnowledge caseKnowledge = target.caseKnowledge as tfidfFVExtractorKnowledge;

            foreach (DocumentSetCaseCollection caseColl in caseSet.Values)
            {
                tfidfFVExtractorKnowledge knowledge = caseColl.classKnowledge as tfidfFVExtractorKnowledge;

                if (SVMSimilarity.isActive)
                {
                    // The "as" casts yield null on a type mismatch; skip the class rather than dereference null.
                    if (caseKnowledge == null || knowledge == null)
                    {
                        logger.log("-- knowledge is not tfidfFVExtractorKnowledge, class skipped -- [GetClassification]");
                        continue;
                    }

                    webLemmaTermPairCollection lemmaOverlap = knowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
                    target.data.featureVectors[caseColl.setClass.classID][SVMSimilarity] += lemmaOverlap.GetCosineSimilarity(logger);
                }
            }

            return(target.data.featureVectors);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Computes the SSRM similarity for the given term pairs.
        /// </summary>
        /// <remarks>
        /// Only pairs whose <c>entryA.name</c> passes <c>ContainsNode</c> contribute. For those pairs the method
        /// accumulates <c>entryA.weight * entryB.weight * node.weight</c> (upper sum) and
        /// <c>entryA.weight * entryB.weight</c> (lower sum); the result is <c>upper.GetRatio(lower)</c>.
        /// NOTE(review): how <c>GetRatio</c> treats a zero lower sum is not visible here - confirm.
        /// </remarks>
        /// <param name="lemmas">The term pairs to score.</param>
        /// <param name="logger">Optional logger; when supplied it receives a message if the score is 0.</param>
        /// <param name="debug">
        /// Optional computation trace; when supplied it receives every contributing term, the final sums,
        /// the similarity and the number of contributing terms.
        /// </param>
        /// <returns>The computed similarity.</returns>
        public Double GetSSRM(webLemmaTermPairCollection lemmas, ILogBuilder logger = null, SSRMComputation debug = null)
        {
            Double upper  = 0;
            Double lowerA = 0;

            // Number of pairs that had a matching node and therefore contributed to the sums.
            Int32 i = 0;

            foreach (webLemmaTermPair wlta in lemmas)
            {
                if (ContainsNode(wlta.entryA.name))
                {
                    i++;
                    var node = GetNode(wlta.entryA.name);

                    upper  += wlta.entryA.weight * wlta.entryB.weight * node.weight;
                    lowerA += wlta.entryA.weight * wlta.entryB.weight;

                    if (debug != null)
                    {
                        debug.printTerm(i, wlta.entryA.name, wlta.entryA.weight, wlta.entryB.weight, node.weight, upper, lowerA);
                    }
                }
            }

            Double output = upper.GetRatio(lowerA);

            if (debug != null)
            {
                debug.upper      = upper;
                debug.lower      = lowerA;
                debug.similarity = output;
                debug.terms      = i;

                debug.printFinale();
            }

            // logger is optional (defaults to null), so it must be checked before use.
            if (output == 0 && logger != null)
            {
                logger.log("Semantic similarity returned 0 score!");
            }

            return(output);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Fills the feature vectors of <paramref name="target"/> with similarity scores against every class
        /// collection in <paramref name="caseSet"/>.
        /// </summary>
        /// <remarks>
        /// For each class, every active feature adds its score to the class entry of
        /// <c>target.data.featureVectors</c>:
        /// <c>semanticSimilarity</c> (SSRM over the expanded cloud), <c>cosineSemanticSimilarity</c>
        /// (cosine similarity over expanded terms), <c>SVMSimilarity</c> (cosine similarity of the term tables)
        /// and <c>SVMChunkSimilarity</c> (document-frequency ratios of the class' primary and secondary chunks).
        /// When detailed reporting is enabled, a DGML graph, the SSRM computation trace and a cosine similarity
        /// table are written to the case sample folder.
        /// A case or class whose knowledge is not a <c>semanticFVExtractorKnowledge</c> is logged and skipped.
        /// </remarks>
        /// <param name="target">The case to classify; when <c>null</c> a message is logged and <c>null</c> is returned.</param>
        /// <param name="caseSet">The set of class collections the case is compared against.</param>
        /// <param name="logger">The logger that receives diagnostic messages.</param>
        /// <returns><c>target.data.featureVectors</c>, or <c>null</c> when <paramref name="target"/> is <c>null</c>.</returns>
        public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
        {
            // Weight applied to matches on secondary chunks, relative to primary chunks.
            const Double secondaryChunkWeight = 0.25;

            if (target == null)
            {
                logger.log("-- target is null -- [GetClassification]");
                return(null);
            }

            semanticFVExtractorKnowledge caseKnowledge = target.caseKnowledge as semanticFVExtractorKnowledge;

            // The "as" cast yields null on a type mismatch; report it instead of dereferencing null below.
            if (caseKnowledge == null)
            {
                logger.log("-- case knowledge is not semanticFVExtractorKnowledge -- [GetClassification]");
                return(target.data.featureVectors);
            }

            List <webLemmaTerm> caseTerms = caseKnowledge.WLTableOfIndustryClass.GetList();

            foreach (DocumentSetCaseCollection caseColl in caseSet.Values)
            {
                var operation = caseSet.validationCase.context.tools.operation;

                Boolean doReportInDetail = operation.doMakeClassificationReportForCases;

                if (doReportInDetail && operation.DoRandomCaseGraphReportMode)
                {
                    // rnd.Next(100) yields 0..99, so the strict comparison selects exactly
                    // In100RandomCaseGraphReport outcomes out of 100 (and none when the setting is 0).
                    // NOTE(review): assumes the setting means "N cases in 100" - confirm.
                    doReportInDetail = rnd.Next(100) < operation.In100RandomCaseGraphReport;
                }

                semanticFVExtractorKnowledge classKnowledge = caseColl.classKnowledge as semanticFVExtractorKnowledge;

                if (classKnowledge == null)
                {
                    logger.log("-- class knowledge is not semanticFVExtractorKnowledge, class skipped -- [GetClassification]");
                    continue;
                }

                if (semanticSimilarity.isActive)
                {
                    var expandedCloud = classKnowledge.semanticCloudFiltered.ExpandTermsToCloud(caseTerms, settings.caseTermExpansionSteps, true, settings.caseTermExpansionOptions);

                    webLemmaTermPairCollection lemmaOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(caseTerms, true);

                    // The computation trace exists only when a detailed report was requested for this class.
                    SSRMComputation debug = null;
                    if (doReportInDetail)
                    {
                        debug = new SSRMComputation(classKnowledge.name, caseKnowledge.name);
                    }

                    Double ssrmSimilarity = expandedCloud.GetSSRM(lemmaOverlap, logger, debug);

                    target.data.featureVectors[caseColl.setClass.classID][semanticSimilarity] += ssrmSimilarity;

                    target.data.featureVectors[caseColl.setClass.classID].termMatched += lemmaOverlap.Count;

                    if (debug != null)
                    {
                        freeGraphToDMGL converter = new freeGraphToDMGL();

                        String dgmlOutput = "expandedCloud_" + caseKnowledge.name + "_" + classKnowledge.name + ".dgml";

                        var dgml = converter.ConvertToDMGL(expandedCloud);

                        // Attach the similarity and both parts of its fraction to the exported graph.
                        var simNode = dgml.Nodes.AddNode("sim", "Sim(d,t) = " + debug.similarity.ToString("F5"));
                        var simUp   = dgml.Nodes.AddNode("up", debug.upper.ToString("F5"));
                        var simLow  = dgml.Nodes.AddNode("low", debug.lower.ToString("F5"));

                        dgml.Links.AddLink(simNode, simUp, "Above fraction");
                        dgml.Links.AddLink(simNode, simLow, "Below fraction");

                        dgml.Save(caseSet.validationCase.caseSampleFolder.pathFor(dgmlOutput, getWritableFileMode.autoRenameThis, "DGML export of expanded terms for [" + caseKnowledge.name + "] against class cloud [" + classKnowledge.name + "]"));

                        logger.log("DGML Saved [" + dgmlOutput + "]");

                        File.WriteAllText(caseSet.validationCase.caseSampleFolder.pathFor(debug.GetFilename(), imbSCI.Data.enums.getWritableFileMode.overwrite), debug.sb.ToString());
                    }
                }

                if (cosineSemanticSimilarity.isActive)
                {
                    List <webLemmaTerm> expandedTerms = classKnowledge.semanticCloudFiltered.ExpandTerms(caseTerms, settings.caseTermExpansionSteps, settings.caseTermExpansionOptions);

                    var cloudOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(expandedTerms);

                    if (doReportInDetail)
                    {
                        var dt = cloudOverlap.GetDataTable();
                        dt.GetReportAndSave(caseSet.validationCase.caseSampleFolder, appManager.AppInfo, "cosine_similarity_" + caseKnowledge.name + "_" + classKnowledge.name, true, operation.doReportsInParalell);
                    }

                    target.data.featureVectors[caseColl.setClass.classID][cosineSemanticSimilarity] += cloudOverlap.GetCosineSimilarity(logger);
                }

                if (SVMSimilarity.isActive)
                {
                    webLemmaTermPairCollection tableOverlap = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);
                    target.data.featureVectors[caseColl.setClass.classID][SVMSimilarity] += tableOverlap.GetCosineSimilarity(logger);
                }

                if (SVMChunkSimilarity.isActive)
                {
                    var chunkTable = caseKnowledge.WLChunkTableOfIndustryClass;

                    Double chunkSimilarity = 0;

                    // Primary chunks contribute their full document-frequency ratio.
                    foreach (var primChunk in classKnowledge.semanticCloudFiltered.primaryChunks)
                    {
                        if (chunkTable.ContainsKey(primChunk))
                        {
                            chunkSimilarity += chunkTable[primChunk].documentFrequency.GetRatio(chunkTable.meta.maxDF);
                        }
                    }

                    // Secondary chunks contribute a reduced share of the same ratio.
                    foreach (var secChunk in classKnowledge.semanticCloudFiltered.secondaryChunks)
                    {
                        if (chunkTable.ContainsKey(secChunk))
                        {
                            chunkSimilarity += (chunkTable[secChunk].documentFrequency.GetRatio(chunkTable.meta.maxDF)) * secondaryChunkWeight;
                        }
                    }

                    target.data.featureVectors[caseColl.setClass.classID][SVMChunkSimilarity] += chunkSimilarity;
                }
            }

            // TODO: this is where the call to the classifier should happen
            // (e.g. target.result.selected = target.result.GetClassWithHighestScore()).

            return(target.data.featureVectors);
        }