Beispiel #1
0
        /// <summary>
        /// Feeds the feature vector values of the target case to the decision machine of the
        /// current experiment state and stores the decision as this classifier's result for the case.
        /// </summary>
        /// <param name="target">Case whose feature vector values are evaluated and whose result entry is updated.</param>
        /// <param name="caseSet">Case collection set used, together with the experiment suffix, to look up the state.</param>
        /// <param name="logger">Logger supplied by the caller; this implementation does not write to it.</param>
        public override void DoSelect(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
        {
            var experimentState = states.GetState(caseSet, GetExperimentSufix());
            var decisionMachine = experimentState.machine;
            var inputValues     = target.data.featureVectors.GetValues().ToArray();

            Int32 decision = decisionMachine.Decide(inputValues);

            target.data[this].SetValues(decision);
        }
Beispiel #2
0
        /// <summary>
        /// Compares the lemma table of the target case with the lemma table of every class collection
        /// in the set and, when the SVM similarity feature vector is active, adds the cosine similarity
        /// of the matching terms to the corresponding feature vector entry of the target.
        /// </summary>
        /// <param name="target">Case being classified; a <c>null</c> value is logged and yields <c>null</c>.</param>
        /// <param name="caseSet">Set whose class collections are compared against the target.</param>
        /// <param name="logger">Receives the null-target message and is passed on to the similarity computation.</param>
        /// <returns>The feature vectors of the target, or <c>null</c> when no target was supplied.</returns>
        public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
        {
            if (target == null)
            {
                logger.log("-- target is null -- [GetClassification]");
                return null;
            }

            var caseKnowledge = target.caseKnowledge as tfidfFVExtractorKnowledge;

            foreach (DocumentSetCaseCollection classCollection in caseSet.Values)
            {
                var classKnowledge = classCollection.classKnowledge as tfidfFVExtractorKnowledge;

                if (!SVMSimilarity.isActive)
                {
                    continue;
                }

                // Terms shared by the class table and the case table drive the similarity score.
                webLemmaTermPairCollection sharedTerms = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);

                target.data.featureVectors[classCollection.setClass.classID][SVMSimilarity] += sharedTerms.GetCosineSimilarity(logger);
            }

            return target.data.featureVectors;
        }
Beispiel #3
0
        /// <summary>
        /// Obtains the document-set knowledge instance for the given case, registered under the
        /// name "case_" followed by the subject name, and tags it with that subject name.
        /// </summary>
        /// <typeparam name="T">Concrete knowledge type to obtain.</typeparam>
        /// <param name="setCase">Case whose subject name identifies the knowledge instance.</param>
        /// <param name="validationCase">Validation case passed on to the named-instance overload.</param>
        /// <param name="logger">Logger passed on to the named-instance overload.</param>
        /// <returns>The knowledge instance with <c>relatedItemPureName</c> set to the subject name.</returns>
        public T GetKnowledgeInstance <T>(DocumentSetCase setCase, kFoldValidationCase validationCase, ILogBuilder logger) where T : class, IWebFVExtractorKnowledge, new()
        {
            var subjectName = setCase.subject.name;

            T instance = GetKnowledgeInstance<T>(
                "case_" + subjectName,
                validationCase,
                WebFVExtractorKnowledgeType.aboutDocumentSet,
                logger);

            instance.relatedItemPureName = subjectName;

            return instance;
        }
Beispiel #4
0
        /// <summary>
        /// Resolves the knowledge instance of the given case and, the first time that instance is
        /// encountered, registers it in <c>savedKnowledge</c> and invokes its <c>OnBeforeSave</c> hook.
        /// </summary>
        /// <typeparam name="T">Concrete knowledge type to resolve.</typeparam>
        /// <param name="setCase">Case whose knowledge is being saved.</param>
        /// <param name="validationCase">Validation case used to resolve the knowledge instance.</param>
        /// <param name="logger">Logger passed on to the knowledge lookup.</param>
        public void SaveCaseKnowledge <T>(DocumentSetCase setCase, kFoldValidationCase validationCase, ILogBuilder logger) where T : class, IWebFVExtractorKnowledge, new()
        {
            IWebFVExtractorKnowledge knowledge = GetKnowledgeInstance <T>(setCase, validationCase, logger);

            // The membership test is made only while holding the lock: reading the collection
            // without it could overlap with an Add performed by another thread.
            // NOTE(review): if savedKnowledge is a concurrent collection an unlocked fast path
            // would be safe as well - confirm its type before reintroducing one.
            lock (savedKnowledgeLock)
            {
                if (savedKnowledge.Contains(knowledge))
                {
                    return;
                }

                savedKnowledge.Add(knowledge);
                knowledge.OnBeforeSave();
            }
        }
Beispiel #5
0
        /// <summary>
        /// Builds one row describing <paramref name="setCase"/>, appends it to <paramref name="output"/>
        /// and returns it.
        /// </summary>
        /// <remarks>
        /// The row always receives the "name" and "Case" columns, "Origin" when the table has such a
        /// column, and one column per active feature vector and class. For non-training collections the
        /// per-classifier "ClassResultName…" / "EvalTrue…" columns and the "Correct" ratio are filled as
        /// well. With <paramref name="doFVAnalysis"/> the "FVRange…" and "CFV_Ratio…" columns are added
        /// for every active feature vector.
        /// </remarks>
        /// <param name="host">Collection providing the class, the validation case and the expected class id.</param>
        /// <param name="setCase">Case whose results and feature vectors are written into the row.</param>
        /// <param name="output">Table that creates and receives the row.</param>
        /// <param name="isTrainingCollection">When <c>true</c>, the classifier evaluation columns are skipped.</param>
        /// <param name="doFVAnalysis">When <c>true</c>, the feature vector range columns are computed.</param>
        /// <returns>The row that was added to <paramref name="output"/>.</returns>
        public static DataRow BuildRow(this DocumentSetCaseCollection host, DocumentSetCase setCase, DataTable output, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
        {
            var setClass       = host.setClass;
            var validationCase = host.validationCase;

            DataRow row = output.NewRow();

            var caseName = setCase.subject.name;

            row["name"] = validationCase.name + "_" + caseName;

            if (output.Columns.Contains("Origin"))
            {
                row["Origin"] = setClass.name;
            }

            row["Case"] = caseName;

            if (!isTrainingCollection)
            {
                var   classifiers  = validationCase.context.setup.classifiers;
                Int32 correctCount = 0;

                foreach (var classifier in classifiers)
                {
                    var     selectedClass = setCase.data[classifier].selected;
                    Boolean hasSelection  = selectedClass != null;

                    // A classifier counts as correct only when it selected the expected class.
                    Int32 isCorrect = (hasSelection && selectedClass.classID == host.rightClassID) ? 1 : 0;

                    row["ClassResultName" + classifier.name] = hasSelection ? selectedClass.name : "- not set -";
                    row["EvalTrue" + classifier.name]        = isCorrect;

                    correctCount += isCorrect;
                }

                row["Correct"] = correctCount.GetRatio(classifiers.Count);
            }

            // One column per (active feature vector, class) pair.
            foreach (var documentClass in setCase.data.setClassCollection.GetClasses())
            {
                foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                {
                    if (!fv.isActive)
                    {
                        continue;
                    }

                    row[fv.name + "_" + documentClass.treeLetterAcronim] = setCase.data.featureVectors[documentClass.classID][fv];
                }
            }

            if (doFVAnalysis)
            {
                var fvDefinitions = validationCase.extractor.settings.featureVectors.serialization;
                var rangeByVector = new Dictionary<String, rangeFinderWithData>();

                // Collect, per feature vector, the values observed across all classes.
                foreach (var documentClass in setCase.data.setClassCollection.GetClasses())
                {
                    foreach (var fv in fvDefinitions)
                    {
                        if (!fv.isActive)
                        {
                            continue;
                        }

                        rangeFinderWithData ranger;
                        if (!rangeByVector.TryGetValue(fv.name, out ranger))
                        {
                            ranger = new rangeFinderWithData(fv.name);
                            rangeByVector.Add(fv.name, ranger);
                        }

                        ranger.Learn(setCase.data.featureVectors[documentClass.classID][fv]);
                    }
                }

                // Spread of each feature vector and the position of the host class value within it.
                foreach (var fv in fvDefinitions)
                {
                    if (!fv.isActive)
                    {
                        continue;
                    }

                    var learnedRange = rangeByVector[fv.name];

                    row["FVRange" + fv.name]   = learnedRange.doubleEntries.GetStdDeviation(false);
                    row["CFV_Ratio" + fv.name] = learnedRange.GetPositionInRange(setCase.data.featureVectors[setClass.classID][fv]);
                }
            }

            output.Rows.Add(row);
            return row;
        }
 /// <summary>
 /// Performs the class selection step for the target case. The member is abstract, so the
 /// actual selection logic is supplied by each derived classifier.
 /// </summary>
 /// <param name="target">The case for which a selection is made.</param>
 /// <param name="caseSet">The case collection set available to the implementation.</param>
 /// <param name="logger">The logger available to the implementation.</param>
 /// <remarks>
 /// NOTE(review): implementations presumably record the selected class on <paramref name="target"/>
 /// rather than returning it (the method returns void) - confirm against the overrides.
 /// </remarks>
 public abstract void DoSelect(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger);
        /// <summary>
        /// Computes every active similarity feature vector of the target case against each class
        /// collection in <paramref name="caseSet"/> and accumulates the values into the feature
        /// vectors of the target.
        /// </summary>
        /// <remarks>
        /// Per class collection, and only for feature vectors that are active:
        /// the SSRM similarity between the expanded class cloud and the matching case terms,
        /// the cosine similarity of the class cloud against the expanded case terms,
        /// the cosine similarity of the class and case lemma tables, and a chunk similarity in which
        /// secondary chunks contribute a quarter of the weight of primary chunks.
        /// When detailed reporting applies to the case, a DGML export of the expanded cloud, the SSRM
        /// trace and the cosine similarity table are written to the case sample folder.
        /// </remarks>
        /// <param name="target">The case being classified; a <c>null</c> value is logged and yields <c>null</c>.</param>
        /// <param name="caseSet">The case set whose class collections are compared against the target.</param>
        /// <param name="logger">The logger used for messages and passed on to the similarity computations.</param>
        /// <returns>The feature vectors of the target, or <c>null</c> when <paramref name="target"/> is <c>null</c>.</returns>
        public override WebSiteClassifierResult GetClassification(DocumentSetCase target, DocumentSetCaseCollectionSet caseSet, ILogBuilder logger)
        {
            // Weight applied to matches on secondary chunks, relative to primary chunks.
            const Double secondaryChunkWeight = 0.25;

            if (target == null)
            {
                logger.log("-- target is null -- [GetClassification]");
                return null;
            }

            // NOTE(review): the "as" cast yields null when the case knowledge is of another type; the
            // next statement would then throw NullReferenceException - confirm callers always supply
            // semantic knowledge here.
            semanticFVExtractorKnowledge caseKnowledge = target.caseKnowledge as semanticFVExtractorKnowledge;

            List <webLemmaTerm> caseTerms = caseKnowledge.WLTableOfIndustryClass.GetList();

            foreach (DocumentSetCaseCollection caseColl in caseSet.Values)
            {
                var operation = caseSet.validationCase.context.tools.operation;

                Boolean doReportInDetail = operation.doMakeClassificationReportForCases;

                if (operation.DoRandomCaseGraphReportMode && doReportInDetail)
                {
                    // rnd.Next(100) yields 0..99 (assuming rnd is a System.Random), so the strict
                    // comparison reports In100RandomCaseGraphReport cases out of 100 - and none at 0.
                    doReportInDetail = rnd.Next(100) < operation.In100RandomCaseGraphReport;
                }

                semanticFVExtractorKnowledge classKnowledge = caseColl.classKnowledge as semanticFVExtractorKnowledge;

                if (semanticSimilarity.isActive)
                {
                    var expandedCloud = classKnowledge.semanticCloudFiltered.ExpandTermsToCloud(caseTerms, settings.caseTermExpansionSteps, true, settings.caseTermExpansionOptions);

                    webLemmaTermPairCollection lemmaOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(caseTerms, true);

                    // The SSRM trace object exists only for cases that are reported in detail.
                    SSRMComputation debug = doReportInDetail ? new SSRMComputation(classKnowledge.name, caseKnowledge.name) : null;

                    Double ssrmSimilarity = expandedCloud.GetSSRM(lemmaOverlap, logger, debug);

                    target.data.featureVectors[caseColl.setClass.classID][semanticSimilarity] += ssrmSimilarity;

                    target.data.featureVectors[caseColl.setClass.classID].termMatched += lemmaOverlap.Count;

                    if (debug != null)
                    {
                        freeGraphToDMGL converter = new freeGraphToDMGL();

                        String dgmlOutput = "expandedCloud_" + caseKnowledge.name + "_" + classKnowledge.name + ".dgml";

                        var dgml = converter.ConvertToDMGL(expandedCloud);

                        // Attach the similarity value and both fraction parts to the exported graph.
                        var simNode = dgml.Nodes.AddNode("sim", "Sim(d,t) = " + debug.similarity.ToString("F5"));
                        var simUp   = dgml.Nodes.AddNode("up", debug.upper.ToString("F5"));
                        var simLow  = dgml.Nodes.AddNode("low", debug.lower.ToString("F5"));

                        dgml.Links.AddLink(simNode, simUp, "Above fraction");
                        dgml.Links.AddLink(simNode, simLow, "Below fraction");

                        dgml.Save(caseSet.validationCase.caseSampleFolder.pathFor(dgmlOutput, getWritableFileMode.autoRenameThis, "DGML export of expanded terms for [" + caseKnowledge.name + "] against class cloud [" + classKnowledge.name + "]"));

                        logger.log("DGML Saved [" + dgmlOutput + "]");

                        File.WriteAllText(caseSet.validationCase.caseSampleFolder.pathFor(debug.GetFilename(), imbSCI.Data.enums.getWritableFileMode.overwrite), debug.sb.ToString());
                    }
                }

                if (cosineSemanticSimilarity.isActive)
                {
                    List <webLemmaTerm> expandedTerms = classKnowledge.semanticCloudFiltered.ExpandTerms(caseTerms, settings.caseTermExpansionSteps, settings.caseTermExpansionOptions);

                    var cloudOverlap = classKnowledge.semanticCloudFiltered.GetMatchingTerms(expandedTerms);

                    if (doReportInDetail)
                    {
                        var dt = cloudOverlap.GetDataTable();
                        dt.GetReportAndSave(caseSet.validationCase.caseSampleFolder, appManager.AppInfo, "cosine_similarity_" + caseKnowledge.name + "_" + classKnowledge.name, true, operation.doReportsInParalell);
                    }

                    target.data.featureVectors[caseColl.setClass.classID][cosineSemanticSimilarity] += cloudOverlap.GetCosineSimilarity(logger);
                }

                if (SVMSimilarity.isActive)
                {
                    webLemmaTermPairCollection tableOverlap = classKnowledge.WLTableOfIndustryClass.GetMatchingTerms(caseKnowledge.WLTableOfIndustryClass);

                    target.data.featureVectors[caseColl.setClass.classID][SVMSimilarity] += tableOverlap.GetCosineSimilarity(logger);
                }

                if (SVMChunkSimilarity.isActive)
                {
                    var    caseChunks      = caseKnowledge.WLChunkTableOfIndustryClass;
                    Double chunkSimilarity = 0;

                    foreach (var primChunk in classKnowledge.semanticCloudFiltered.primaryChunks)
                    {
                        if (caseChunks.ContainsKey(primChunk))
                        {
                            chunkSimilarity += caseChunks[primChunk].documentFrequency.GetRatio(caseChunks.meta.maxDF);
                        }
                    }

                    foreach (var secChunk in classKnowledge.semanticCloudFiltered.secondaryChunks)
                    {
                        if (caseChunks.ContainsKey(secChunk))
                        {
                            chunkSimilarity += caseChunks[secChunk].documentFrequency.GetRatio(caseChunks.meta.maxDF) * secondaryChunkWeight;
                        }
                    }

                    target.data.featureVectors[caseColl.setClass.classID][SVMChunkSimilarity] += chunkSimilarity;
                }
            }

            // TODO: this is where the call to the classifier should happen.

            return target.data.featureVectors;
        }