/*
         * public static WeightDictionary GetChildrenWT(WeightDictionary output, SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List<string> FV)
         * {
         *  if (output == null)
         *  {
         *      output = new WeightDictionary(model.name, "");
         *  }
         *  if (model.Children.Any())
         *  {
         *      foreach (var child in model.Children)
         *      {
         *          GetChildrenWT(output, child, weightModel, space, FV);
         *
         *      }
         *
         *  }
         *  else
         *  {
         *      var wd = weightModel.GetWeights(FV, model, space);
         *      output.Merge(wd.index.Values, model.weight);
         *  }
         *  return output;
         *
         * }*/


        /// <summary>
        /// Blends the term weights of all leaf documents of <paramref name="model"/> into a single vector document.
        /// </summary>
        /// <typeparam name="T">Vector document type to construct</typeparam>
        /// <param name="model">Document model whose leafs are weighted and merged</param>
        /// <param name="weightModel">Term weighting model used to compute per-leaf weights</param>
        /// <param name="space">Space model providing corpus statistics for weighting</param>
        /// <param name="FV">Feature (term) list to compute weights for</param>
        /// <returns>New <typeparamref name="T"/> instance, named after <paramref name="model"/>, with merged leaf term weights</returns>
        public static T BlendToVector <T>(this SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List <string> FV) where T : VectorDocument, new()
        {
            T output = new T();

            output.name = model.name;

            var leafs = model.GetLeafs();

            foreach (var leaf in leafs)
            {
                // BUG FIX: the original computed weights for [model] on every iteration,
                // ignoring the loop variable entirely — weights are now computed per leaf,
                // matching the recursive per-child logic of the commented-out GetChildrenWT above.
                var wd = weightModel.GetWeights(FV, leaf, space);
                output.terms.Merge(wd);
            }

            return(output);
        }
        /// <summary>
        /// Prepares the specified context: loads and deploys the term weight model, restores
        /// stored model data and selected features when available, and stems the query terms.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="log">The log.</param>
        /// <exception cref="ArgumentException">context — when no SpaceModel is declared in the context</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            // resolve model definition and model data file paths inside the context folder
            String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
            String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);

            TermWeightModel.Deploy(log);

            if (context.spaceModel == null)
            {
                String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
                throw new ArgumentException(msg, nameof(context));
            }

            if (File.Exists(p_d) && useStoredData)
            {
                // stored data exists: restore the precomputed weighting data set instead of recomputing
                WeightingModelDataSet data = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);
                TermWeightModel.LoadModelDataSet(data, log);

                if (useSelectedFeatures)
                {
                    SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
                }
            }
            else
            {
                TermWeightModel.PrepareTheModel(context.spaceModel, log);
            }

            // BUG FIX: the original condition was inverted — it entered the branch when the query
            // WAS null/empty and then dereferenced QueryTerms, which fails exactly in that case.
            // Query tokens are stemmed only when a query actually exists.
            if (!context.query.isNullOrEmpty())
            {
                context.query.QueryTerms = context.query.QueryTerms.Trim();

                List <String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);

                foreach (String tkn in tkns)
                {
                    queryTerms.Add(context.stemmingContext.Stem(tkn));
                }
            }
        }
// 예제 #3  (scraping artifact: "example #3" marker from the snippet source — commented out so the file compiles)
// 0       (scraping artifact: vote count)
 /// <summary>
 /// Records the signatures of the classifier, the feature selection filter and the
 /// feature weighting model into the report's data fields.
 /// </summary>
 /// <param name="report">Report receiving the data fields</param>
 /// <param name="classifier">Classification algorithm whose signature is recorded</param>
 /// <param name="filter">Feature selection filter whose signature is recorded</param>
 /// <param name="featureWeight">Feature weighting model whose signature is recorded</param>
 public static void SetReportDataFields(this classificationReport report, IClassifier classifier, FeatureFilter filter, FeatureWeightModel featureWeight)
 {
     var dataFields = report.data;

     dataFields.Add(nameof(ReportDataFieldEnum.Classifier), classifier.GetSignature(), "Signature of the classification algorithm");
     dataFields.Add(nameof(ReportDataFieldEnum.FeatureSelection), filter.GetSignature(), "Signature of feature selection filter model");
     dataFields.Add(nameof(ReportDataFieldEnum.FeatureWeighting), featureWeight.GetSignature(), "Signature of feature weight model");
 }
        /// <summary>
        /// Prepares the context: loads and deploys the term weight model (from file if not preset),
        /// restores or recomputes its data set, then builds the feature-vector dictionary
        /// according to the selected <c>computation</c> mode.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="log">The log.</param>
        /// <exception cref="InvalidOperationException">when no model is preset and the definition file is missing</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            String p_m = "";

            String p_d = "";

            // wildcard characters are stripped before building file paths
            modelDefinitionFile = modelDefinitionFile.Replace("*", "");

            if (!modelDefinitionFile.isNullOrEmpty())
            {
                p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
                p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);
            }

            if (TermWeightModel == null)
            {
                log.log("Loading model from [" + p_m + "]");

                if (File.Exists(p_m))
                {
                    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
                }
            }

            // ROBUSTNESS FIX: the original proceeded to Deploy() with a null model when no model
            // was preset and the definition file did not exist, producing an opaque
            // NullReferenceException — fail with a clear message instead.
            if (TermWeightModel == null)
            {
                throw new InvalidOperationException("TermWeightModel is not set and model definition file [" + p_m + "] was not found");
            }

            TermWeightModel.Deploy(log);

            if (File.Exists(p_d) && UseModelData)
            {
                log.log("Loading model data from [" + p_d + "]");

                var dataset = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);

                TermWeightModel.LoadModelDataSet(dataset, log);
            }
            else
            {
                log.log("Preparing model ...");
                TermWeightModel.PrepareTheModel(context.spaceModel, log);
            }

            // dispatch to the vector-dictionary construction matching the computation mode;
            // NOTE(review): flags are tested in fixed priority order — a combination of flags
            // resolves to the first match, and an unmatched mode leaves vectorDictionary unset
            if (computation.HasFlag(ScoreComputationModeEnum.category))
            {
                vectorDictionary = context.TransformToFVDictionaryAsCategorySimilarity(TermWeightModel, function, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.site))
            {
                vectorDictionary = context.TransformToFVDictionaryAsSiteSimilarity(TermWeightModel, function, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pageDivergence))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.site, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfCategory))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.category, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfDataset))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.dataset, log);
            }

            log.log("Category similarity ready ... [" + computation.ToString() + "]");
        }
// 예제 #5  (scraping artifact: "example #5" marker from the snippet source — commented out so the file compiles)
// 0       (scraping artifact: vote count)
        /// <summary>
        /// Builds the default <c>PlanesMethodSettings</c> configuration: entity render
        /// instructions, corpus tokenization pipeline, TF-IDF term weighting, a feature
        /// count filter and a multi-class SVM classifier.
        /// </summary>
        /// <returns>Fully initialized default settings instance</returns>
        public static PlanesMethodSettings GetDefaultSettings()
        {
            var settings = new PlanesMethodSettings();

            // entity plane: render description, title and body text of each document
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetDescriptionInstruction());
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetTitleInstruction());
            settings.entityMethod.instructions.Add(DocumentRenderInstruction.GetBodyTextInstruction());

            settings.entityMethod.filterFunctionName = ""; // no entity filter function by default
            settings.entityMethod.filterLimit = 5;

            // corpus plane: English stemming, basic tokenization, no transliteration
            settings.corpusMethod.stemmer = nameof(EnglishStemmer);
            settings.corpusMethod.tokenizer = nameof(TokenizerBasic);
            settings.corpusMethod.transliterationRuleSetId = "";

            // term weighting: term frequency (local) combined with an IDF global factor
            var tfIdfModel = new FeatureWeightModel();
            tfIdfModel.LocalFunction = new Weighting.Local.TermFrequencyFunction();

            var idfFactor = new FeatureWeightFactor();
            idfFactor.Settings.functionName = nameof(IDFElement);
            tfIdfModel.GlobalFactors.Add(idfFactor);

            settings.corpusMethod.WeightModel = tfIdfModel;

            // feature selection: keep the top 8000 terms, no scoring function configured
            var topTermFilter = new FeatureFilter();
            topTermFilter.limit = 8000;
            settings.corpusMethod.filter = topTermFilter;

            // classification: multi-class SVM trained with L2 loss
            settings.featureMethod.classifierSettings.type = Classifiers.ClassifierType.multiClassSVM;
            settings.featureMethod.classifierSettings.lossFunctionForTraining = Accord.MachineLearning.VectorMachines.Learning.Loss.L2;

            return settings;
        }
        /// <summary>
        /// Prepares the specified context: loads the score dictionary and, when machine learning
        /// is enabled, prepares the term weight model, builds labeled feature vectors for the
        /// context items and trains the classifier on them.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="log">The log.</param>
        /// <exception cref="ArgumentException">context — when the score dictionary is missing or no SpaceModel is declared</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            //context.folder.GetOrFindFiles("*", dictionaryFile + "*.xml");

            // score dictionary provides the label assignments used to build the training set below
            scoreDictionary = FeatureVectorDictionaryWithDimensions.LoadFile(context.folder, dictionaryFile, log); // WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

            if (scoreDictionary == null)
            {
                String msg = "Error: Failed to find score dictionary [" + dictionaryFile + "] in " + context.folder.path;
                throw new ArgumentException(msg, nameof(context));
            }

            if (useMachineLearning)
            {
                #region --------------- PREPARING TERM WEIGHT MODEL

                // model definition / model data file paths within the context folder
                String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
                String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

                // a preset model takes precedence; otherwise load from the definition file
                if (TermWeightModel == null)
                {
                    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
                }


                TermWeightModel.Deploy(log);

                if (context.spaceModel == null)
                {
                    String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
                    throw new ArgumentException(msg, nameof(context));
                }


                // restore stored weighting data when available, otherwise compute it from the space model
                if (File.Exists(p_d) && useStoredData)
                {
                    WeightingModelDataSet data = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);
                    TermWeightModel.LoadModelDataSet(data, log);

                    if (useSelectedFeatures)
                    {
                        // "_sf" suffix marks the stored selected-features dictionary
                        SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
                    }
                }
                else
                {
                    TermWeightModel.PrepareTheModel(context.spaceModel, log);
                }

                // fall back to the context's selected features, then to all known-label terms,
                // so the token list used for vector construction is never empty if any terms exist
                if (SelectedTerms.Count == 0)
                {
                    SelectedTerms = context.selectedFeatures;
                }
                List <String> sel_tkns = new List <String>();

                sel_tkns.AddRange(SelectedTerms.index.Values.Select(x => x.name));

                if (!sel_tkns.Any())
                {
                    sel_tkns.AddRange(context.spaceModel.terms_known_label.GetTokens());
                }


                #endregion

                fvConstructor.Deploy(featureMethod.constructor, sel_tkns);



                classifier = featureMethod.classifierSettings.GetClassifier();

                // map: document AssignedID -> numeric label ID, taken from the score dictionary
                sc_id = scoreDictionary.GetVectorsWithLabelID(null, criterion).ToNameVsLabelID();

                // build the training set: one labeled feature vector per context item that has a label ID
                List <FeatureVectorWithLabelID> trainingSet = new List <FeatureVectorWithLabelID>();
                foreach (var item in context.items)
                {
                    if (sc_id.ContainsKey(item.AssignedID))
                    {
                        WeightDictionary dc_vec = TermWeightModel.GetWeights(sel_tkns, item.spaceDocument, context.spaceModel);


                        var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, item.AssignedID);

                        FeatureVectorWithLabelID id_vec = new FeatureVectorWithLabelID(n_vec, sc_id[item.AssignedID]);

                        trainingSet.Add(id_vec);
                    }
                }


                log.log("Training [" + classifier.name + "] with [" + sc_id.Count + "] feature vectors.");
                classifier.DoTraining(trainingSet, log);
            }
        }
        /// <summary>
        /// Transforms the context into a feature-vector dictionary where each document gets one
        /// dimension per label: the similarity between the document's weight vector and the
        /// label's category weight vector. Vectors are grouped by the document's DomainID.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The similarity function.</param>
        /// <param name="log">The log.</param>
        /// <returns>Feature vector sets keyed by domain ID</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Category Similarity ...");

            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            // precompute one weight dictionary per label, from the label's category model
            Dictionary <String, WeightDictionary> categoryDictionarties = new Dictionary <string, WeightDictionary>();

            foreach (SpaceLabel label in context.spaceModel.labels)
            {
                Relationship <SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();

                var c = TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label);
                categoryDictionarties.Add(label.name, c);
            }

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // BUG FIX: progress-report step is clamped to at least 1 — the original used
            // (context.Count / 20) directly as a modulo divisor, which threw
            // DivideByZeroException for contexts with fewer than 20 items.
            Int32 p = Math.Max(1, context.Count / 20);

            foreach (var entry in context.items)
            {
                i++;

                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

                FeatureVector fv = new FeatureVector(entry.AssignedID);
                fv.dimensions = new double[context.spaceModel.labels.Count];

                // one dimension per label: similarity of this document to that category;
                // each parallel task writes a distinct index, so no synchronization is needed
                Parallel.ForEach(context.spaceModel.labels, (label) =>
                {
                    var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[label.name], documentWeights);
                    fv.dimensions[context.spaceModel.labels.IndexOf(label)] = docToClassSimilarity;
                });

                // periodic progress output
                if (i % p == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }

                dict.GetOrAdd(entry.DomainID).Add(fv, -1);
            }

            foreach (KeyValuePair <string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation done...");
            return(dict);
        }
        ///// <summary>
        ///// Transforms to fv dictionary.
        ///// </summary>
        ///// <param name="context">The context.</param>
        ///// <param name="TermWeightModel">The term weight model.</param>
        ///// <param name="function">The function.</param>
        ///// <returns></returns>
        //public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageInCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        //{
        //    log.log("... Page Similarity ...");

        //    List<string> selectedTerms = context.selectedFeatures.GetKeys();



        //    var ByDomain = context.GetByDomain(log);

        //    Dictionary<string, List<string>> assignIDByLabel = context.featureSpace.labelToDocumentAssociations.GetAllRelationShipByName(true);

        //    var ByCategory = context.GetByAssignIDCategory(assignIDByLabel,log);

        //    Dictionary<String, List<DocumentSelectResultEntry>> EntryByLabel = new Dictionary<string, List<DocumentSelectResultEntry>>();



        //    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();


        //    foreach (var entry in context.items)
        //    {

        //        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        //        documentDictionarties.Add(entry.AssignedID, documentWeights);
        //    }


        //    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();



        //    Double total = context.Count;
        //    Int32 i = 0;
        //    Int32 p = (context.Count / 10);

        //    //List<List<Double>> matrix = new List<List<double>>();

        //    //foreach (var entry in context.items)
        //    //{
        //    //    matrix.Add(new List<double>());
        //    //}


        //    //for (int x = 0; x < context.items.Count; x++)
        //    //{

        //    //    for (int y = 0; y < context.items.Count; x++)
        //    //    {



        //    //    }

        //    //}

        //    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();


        //    foreach (var domainPair in ByCategory)
        //    {
        //        List<DocumentSelectResultEntry> relatives = ByCategory[domainPair.Key].ToList();


        //        foreach (var entry in relatives)
        //        {

        //            i++;
        //            FeatureVector fv = new FeatureVector(entry.AssignedID);

        //            // List<Double> d = new List<>();

        //            fv.dimensions = new double[relatives.Count - 1];


        //            // List<String> keys = documentDictionarties.Keys.ToList();

        //            Int32 hostInd = relatives.IndexOf(entry);

        //            Int32 c = 0;


        //            //foreach (var pair in documentDictionarties)
        //            //{

        //            Parallel.ForEach(relatives, (pair) =>
        //            {

        //                Int32 ind = relatives.IndexOf(pair); // keys.IndexOf(pair.AssignedID);
        //                if (ind >= hostInd)
        //                {
        //                    ind = ind - 1;
        //                }

        //                if (pair.AssignedID != entry.AssignedID)
        //                {
        //                    Double docToClassSimilarity = 0;

        //                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
        //                    }
        //                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
        //                    }
        //                    else
        //                    {
        //                        var vecA = documentDictionarties[pair.AssignedID];
        //                        var vecB = documentDictionarties[entry.AssignedID];
        //                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
        //                        if (docToClassSimilarity > 0)
        //                        {

        //                        }
        //                        if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
        //                            //computedPairs.AddOrUpdate(entry.AssignedID + pair.Key, docToClassSimilarity);
        //                        }
        //                        else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
        //                        }

        //                    }

        //                    fv.dimensions[ind] = docToClassSimilarity;

        //                }
        //            });



        //            Int32 r = i % p;
        //            if (r == 0)
        //            {
        //                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        //            }


        //            dict.GetOrAdd(entry.DomainID).Add(fv, -1);
        //        }



        //    }


        //    //foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
        //    //{
        //    //    pair.Value.CloseDeploy();
        //    //}

        //    log.log("... Preparation finished ...");

        //    return dict;


        //}



        /// <summary>
        /// Transforms the context into a feature-vector dictionary of site similarity: documents
        /// are grouped by domain, each domain's documents are flattened into a site-level model,
        /// and each document gets the similarity between its own weights and the site's weights.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The similarity function.</param>
        /// <param name="log">The log.</param>
        /// <returns>Feature vector sets keyed by domain</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Site Similarity ...");

            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            Dictionary <String, WeightDictionary> categoryDictionarties = new Dictionary <string, WeightDictionary>();
            Dictionary <String, WeightDictionary> documentDictionarties = new Dictionary <string, WeightDictionary>();

            var byDomain = context.GetByDomain(log);

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // BUG FIX: progress-report step is clamped to at least 1 — the original used
            // (context.Count / 10) directly as a modulo divisor, which threw
            // DivideByZeroException for contexts with fewer than 10 items.
            Int32 p = Math.Max(1, context.Count / 10);

            foreach (var pair in byDomain)
            {
                i++;

                // aggregate all documents of the domain into one site-level model
                SpaceDocumentModel siteModel = new SpaceDocumentModel();

                foreach (var ent in pair.Value)
                {
                    WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
                    documentDictionarties.Add(ent.AssignedID, documentWeights);
                    siteModel.Children.Add(ent.spaceDocument);
                }

                siteModel.Flatten(false);

                categoryDictionarties.Add(pair.Key, TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel));

                foreach (var ent in pair.Value)
                {
                    FeatureVector fv = new FeatureVector(ent.AssignedID);

                    // NOTE(review): the vector is sized by the label count but only dimension [0]
                    // (document-to-site similarity) is populated — presumably for downstream
                    // compatibility with label-dimensioned vectors; confirm against consumers.
                    fv.dimensions = new double[context.spaceModel.labels.Count];

                    var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[pair.Key], documentDictionarties[ent.AssignedID]);

                    fv.dimensions[0] = docToClassSimilarity;

                    dict.GetOrAdd(pair.Key).Add(fv, -1);
                }

                // periodic progress output
                if (i % p == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }
            }

            foreach (KeyValuePair <string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation finished ...");

            return(dict);
        }
        /// <summary>
        /// Transforms the context into a feature-vector dictionary of page-to-page similarity:
        /// documents are grouped (by category, site or whole dataset, per <paramref name="groupmode"/>),
        /// and each document's dimensions are its similarities to every other document in its group.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The similarity function.</param>
        /// <param name="groupmode">Grouping mode: category, site or dataset.</param>
        /// <param name="log">The log.</param>
        /// <returns>Feature vector sets keyed by group name</returns>
        /// <exception cref="ArgumentException">groupmode — when the mode is not category, site or dataset</exception>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
        {
            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            // weight vector for every document, keyed by its assigned ID
            Dictionary <String, WeightDictionary> documentDictionarties = new Dictionary <string, WeightDictionary>();

            foreach (var entry in context.items)
            {
                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
                documentDictionarties.Add(entry.AssignedID, documentWeights);
            }

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // BUG FIX: progress-report step is clamped to at least 1 — the original used
            // (context.Count / 10) directly as a modulo divisor, which threw
            // DivideByZeroException for contexts with fewer than 10 items.
            Int32 p = Math.Max(1, context.Count / 10);

            Dictionary <string, List <DocumentSelectResultEntry> > relative_groups = null;

            if (groupmode == ScoreComputationModeEnum.category)
            {
                Dictionary <string, List <string> > assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);

                // BUG FIX: the UNKNOWN label is removed BEFORE building the groups — the original
                // removed it after GetByAssignIDCategory, which had no effect on the groups already built.
                if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
                {
                    assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
                }

                relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);
                log.log("... Page Similarity ... Groups by category");
            }
            else if (groupmode == ScoreComputationModeEnum.site)
            {
                relative_groups = context.GetByDomain(log);
                log.log("... Page Similarity ... Groups by site");
            }
            else if (groupmode == ScoreComputationModeEnum.dataset)
            {
                relative_groups = new Dictionary <string, List <DocumentSelectResultEntry> >();
                relative_groups.Add("dataset", context.items);
                log.log("... Page Similarity ... dataset");
            }
            else
            {
                // ROBUSTNESS FIX: an unsupported mode previously left relative_groups null and
                // caused a NullReferenceException in the loop below.
                throw new ArgumentException("Unsupported group mode [" + groupmode.ToString() + "]", nameof(groupmode));
            }

            // cache of already-computed pair similarities, shared across parallel workers;
            // NOTE(review): keys are plain ID concatenations — distinct pairs could collide
            // if IDs are prefix-ambiguous (e.g. "A"+"BC" vs "AB"+"C"); confirm ID format.
            ConcurrentDictionary <String, Double> computedPairs = new ConcurrentDictionary <string, double>();

            foreach (var domainPair in relative_groups)
            {
                List <DocumentSelectResultEntry> relatives = domainPair.Value;

                foreach (var entry in relatives)
                {
                    i++;
                    FeatureVector fv = new FeatureVector(entry.AssignedID);

                    // one dimension for every OTHER member of the group
                    fv.dimensions = new double[relatives.Count - 1];

                    Int32 hostInd = relatives.IndexOf(entry);

                    Parallel.ForEach(relatives, (pair) =>
                    {
                        // dimension index: group index, skipping the host document itself
                        Int32 ind = relatives.IndexOf(pair);
                        if (ind >= hostInd)
                        {
                            ind = ind - 1;
                        }

                        if (pair.AssignedID != entry.AssignedID)
                        {
                            Double docToClassSimilarity = 0;

                            // similarity is symmetric: reuse a cached value under either key order
                            if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                            {
                                docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
                            }
                            else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                            {
                                docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
                            }
                            else
                            {
                                var vecA             = documentDictionarties[pair.AssignedID];
                                var vecB             = documentDictionarties[entry.AssignedID];
                                docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);

                                // GetOrAdd is a no-op when the key already exists, so the
                                // check-then-add sequence is safe under parallel execution
                                if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                                {
                                    computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
                                }
                                else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                                {
                                    computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
                                }
                            }

                            fv.dimensions[ind] = docToClassSimilarity;
                        }
                    });

                    // periodic progress output
                    if (i % p == 0)
                    {
                        log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                    }

                    dict.GetOrAdd(domainPair.Key).Add(fv, -1);
                }
            }

            log.log("... Preparation finished ...");

            return(dict);
        }
        /// <summary>
        /// Runs the weight-model preparation pipeline for the current fold: renders text, populates the
        /// space model, selects features, trains and persists the weight model, then generates reports.
        /// The whole computation is skipped when all output artifacts already exist and
        /// <c>setup.skipIfExisting</c> is set.
        /// </summary>
        /// <param name="logger">Log builder receiving progress messages.</param>
        /// <param name="executionContextMain">Optional pre-existing operation context paired with the fold.</param>
        /// <param name="executionContextExtra">Extra experiment-level context (unused here).</param>
        /// <returns>The fold paired with the (possibly freshly populated) operation context.</returns>
        public override ExperimentDataSetFoldContextPair <OperationContext> Execute(ILogBuilder logger, OperationContext executionContextMain = null, ExperimentModelExecutionContext executionContextExtra = null)
        {
            var result = new ExperimentDataSetFoldContextPair <OperationContext>(fold, executionContextMain);

            Open();

            // Paths of the three artifacts this operation produces.
            String modelDefinitionPath = FeatureWeightModel.GetModelDefinitionFilename(setup.OutputFilename, fold_notes.folder);
            String modelDataPath       = FeatureWeightModel.GetModelDataFilename(setup.OutputFilename, fold_notes.folder);
            String weightTablePath     = WeightDictionary.GetDictionaryFilename(setup.OutputFilename, fold_notes.folder);

            // Skip only when skipping is enabled AND every artifact is already on disk.
            Boolean alreadyComputed = setup.skipIfExisting &&
                                      File.Exists(modelDefinitionPath) &&
                                      File.Exists(modelDataPath) &&
                                      File.Exists(weightTablePath);

            if (alreadyComputed)
            {
                logger.log("WeightTable [" + modelDataPath + "] found, skipping the operation");
            }
            else
            {
                result.context.DeployDataSet(fold, logger);

                entityOperation.TextRendering(result.context, notes);

                /*
                 * entityOperation.TextPreblendFilter(output.context, notes);
                 *
                 * entityOperation.TextBlending(output.context, notes);
                 */

                corpusOperation.SpaceModelPopulation(result.context, notes);
                corpusOperation.SpaceModelCategories(result.context, notes);
                corpusOperation.FeatureSelection(result.context, notes, requirements.MayUseSelectedFeatures);

                // Persist the selected feature set alongside the fold notes.
                result.context.SelectedFeatures.Save(fold_notes.folder, notes, setup.OutputFilename + "_fs");

                // Train the weight model on the populated space, then persist factors and model.
                corpusOperation.weightModel.PrepareTheModel(result.context.spaceModel, logger);

                var elementFactors = corpusOperation.weightModel.GetElementFactors(result.context.SelectedFeatures.GetKeys(), result.context.spaceModel);
                elementFactors.Save(fold_notes.folder, notes, setup.OutputFilename);

                corpusOperation.weightModel.Save(setup.OutputFilename, fold_notes.folder, notes);

                // Produce the configured reports for this context.
                OperationContextReport reportOperation = new OperationContextReport();
                reportOperation.DeploySettingsBase(notes);
                reportOperation.GenerateReports(result.context, setup.reportOptions, notes);
            }

            Close();

            return(result);
        }
        /// <summary>
        /// Picks specified number of sample documents and constructs a demo table, showing all term weight components
        /// </summary>
        /// <param name="space">The space.</param>
        /// <param name="weightModel">The weight model.</param>
        /// <param name="selectedFeatures">Selected-feature dictionary; only terms contained in it become table rows.</param>
        /// <param name="sampleDocuments">The sample documents.</param>
        /// <param name="name">The name.</param>
        /// <param name="description">The description.</param>
        /// <param name="termLimit">Maximum number of term rows to render (default 500). The previous
        /// implementation checked the limit after adding and could emit 501 rows; the cap is now exact.</param>
        /// <returns>Table with one row per term: local factors per document, global factors per function, total weights per document</returns>
        public static DataTable MakeWeightModelDemoTable(this SpaceModel space, FeatureWeightModel weightModel, WeightDictionary selectedFeatures, Int32 sampleDocuments, String name, String description, Int32 termLimit = 500)
        {
            DataTable table = new DataTable();

            table.SetTitle(name);
            table.SetDescription(description);

            table.SetAdditionalInfoEntry("Documents", space.documents.Count, "Total count of document vectors");
            table.SetAdditionalInfoEntry("Local function", weightModel.LocalFunction.GetSignature(), weightModel.LocalFunction.description);

            // Take at most [sampleDocuments] documents; ToList() materializes the sample once
            // (replaces the former element-by-element copy into a second list).
            List <SpaceDocumentModel> sample = space.documents.Take(Math.Min(sampleDocuments, space.documents.Count)).ToList();

            // Robustness: with no documents there is nothing to demonstrate — return the titled,
            // empty table instead of throwing on sample.First().
            if (!sample.Any())
            {
                return(table);
            }

            // Collect up to [termLimit] terms of the first sample document that are among the selected features.
            List <String> terms = new List <String>();

            var terms_in = sample.First().GetTerms(true, true).GetTokens();

            foreach (var t in terms_in)
            {
                if (terms.Count >= termLimit)
                {
                    break; // exact cap — checked before adding, unlike the original off-by-one
                }
                if (selectedFeatures.ContainsKey(t))
                {
                    terms.Add(t);
                }
            }

            DataColumn column_token = table.Add("Name", "Name of the document vector", "Name", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(50);

            // One local-weight column per sample document.
            List <DataColumn> localColumns = new List <DataColumn>();

            for (int i = 0; i < sample.Count; i++)
            {
                var doc = sample[i];
                localColumns.Add(
                    table.Add(weightModel.LocalFunction.shortName + i.ToString(),
                              weightModel.LocalFunction.GetSignature() + " for document: " + doc.name,
                              weightModel.LocalFunction.shortName, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5",
                              weightModel.LocalFunction.GetSignature() + "[" + i.ToString("D2") + "]").SetGroup("Local"));
            }

            // One column per configured global weighting factor.
            Int32             c             = 0;
            List <DataColumn> globalColumns = new List <DataColumn>();

            foreach (FeatureWeightFactor gl in weightModel.GlobalFactors)
            {
                globalColumns.Add(
                    table.Add(gl.GlobalFunction.shortName + c.ToString(),
                              gl.GlobalFunction.shortName + " at w= " + gl.weight,
                              gl.GlobalFunction.shortName, typeof(Double), imbSCI.Core.enums.dataPointImportance.important, "F5",
                              gl.Settings.GetSignature() + "[" + c.ToString("D2") + "]").SetGroup("Global"));

                c++;
            }

            // One total-score column per sample document.
            Int32             ct           = 0;
            List <DataColumn> totalColumns = new List <DataColumn>();

            foreach (var doc in sample)
            {
                totalColumns.Add(
                    table.Add("TotalScore" + ct.ToString(),
                              weightModel.LocalFunction.GetSignature() + " for document: " + doc.name,
                              weightModel.LocalFunction.shortName, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5",
                              weightModel.LocalFunction.GetSignature() + "[" + ct.ToString("D2") + "]").SetGroup("Total"));

                ct++;
            }

            // One row per term: local factor per document, global factor per function, total weight per document.
            foreach (String term in terms)
            {
                var dr = table.NewRow();

                dr[column_token] = term;

                Int32 li = 0;
                foreach (DataColumn col in localColumns)
                {
                    dr[col] = weightModel.LocalFunction.GetElementFactor(term, sample[li]);
                    li++;
                }

                li = 0;
                foreach (DataColumn col in globalColumns)
                {
                    dr[col] = weightModel.GlobalFactors[li].GlobalFunction.GetElementFactor(term, space);
                    li++;
                }

                li = 0;
                foreach (DataColumn col in totalColumns)
                {
                    dr[col] = weightModel.GetWeight(term, sample[li], space);
                    li++;
                }

                table.Rows.Add(dr);
            }

            return(table);
        }