コード例 #1
0
        /// <summary>
        /// Registers a <see cref="ScoreCategorySimilarity"/> factor with the document score model.
        /// </summary>
        /// <param name="remove">When <c>true</c>, the <c>Factors</c> and <c>SerializedFactors</c> collections of the ranking model are cleared before the new factor is added.</param>
        /// <param name="modelName">Stored on the factor as its model definition file. The value <c>*</c> additionally assigns the weight model of <c>data.corpusForEvaluation</c> to the factor.</param>
        /// <param name="computation">Computation mode assigned to the factor.</param>
        /// <param name="weight">Weight assigned to the factor.</param>
        /// <param name="normalize">Value assigned to the factor's <c>doNormalize</c> flag.</param>
        /// <remarks>
        /// The factor is always appended to <c>data.ranking.model.Factors</c>; <paramref name="remove"/> only
        /// controls whether previously registered factors are discarded first.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_setAddSimilarityFactor(
            [Description("If true it will remove any existing ScoreModel Factors")] Boolean remove = true,
            [Description("Model definition name, leave * to use current weighting model from docSelection setup")] String modelName = "*",
            [Description("Computation mode")] ScoreComputationModeEnum computation = ScoreComputationModeEnum.none,
            [Description("Weight associated with the term weight based document score factor")] Double weight = 1.0,
            [Description("Normalize score value on range from 0.0 to 1.0, across the sample")] Boolean normalize = true)
        {
            var scoreModel = data.ranking.model;

            if (remove)
            {
                // Start from a clean model: drop both the live and the serialized factor lists.
                scoreModel.Factors.Clear();
                scoreModel.SerializedFactors.Clear();
            }

            var similarityFactor = new ScoreCategorySimilarity
            {
                weight              = weight,
                computation         = computation,
                doNormalize         = normalize,
                modelDefinitionFile = modelName
            };

            // "*" means: reuse the weight model already attached to the evaluation corpus.
            Boolean useCorpusWeightModel = (modelName == "*");
            if (useCorpusWeightModel)
            {
                similarityFactor.TermWeightModel = data.corpusForEvaluation.WeightModel;
            }

            scoreModel.Factors.Add(similarityFactor);
        }
コード例 #2
0
        /// <summary>
        /// Builds, for every document of the selection, a feature vector whose dimensions are the similarities
        /// between that document and each of the other documents in the same group.
        /// </summary>
        /// <param name="context">Selection result supplying the documents, the selected features and the space model.</param>
        /// <param name="TermWeightModel">Weight model used to compute the term weights of each document.</param>
        /// <param name="function">Similarity function applied to the weight dictionaries of two documents.</param>
        /// <param name="groupmode">
        /// Grouping of the documents: <c>category</c>, <c>site</c> or <c>dataset</c> (all documents in one group
        /// named "dataset").
        /// </param>
        /// <param name="log">Receives progress messages.</param>
        /// <returns>
        /// Dictionary to which each document's vector is added under the key of its group. A vector has one
        /// dimension per other member of the group (group size minus one).
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="groupmode"/> is not <c>category</c>, <c>site</c> or <c>dataset</c>.
        /// </exception>
        /// <remarks>
        /// Similarities are cached per unordered pair of document IDs, so the similarity function is treated as
        /// symmetric: the value computed for the first encountered ordering is reused for the reverse one.
        /// </remarks>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
        {
            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            // Term weights of every document, keyed by the document's assigned ID.
            Dictionary <String, WeightDictionary> documentDictionarties = new Dictionary <string, WeightDictionary>();

            foreach (var entry in context.items)
            {
                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
                documentDictionarties.Add(entry.AssignedID, documentWeights);
            }

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // Progress is reported roughly every 10% of the documents. The step is clamped to 1 so that
            // selections with fewer than 10 documents do not cause a division by zero in "i % p".
            Int32 p = Math.Max(1, context.Count / 10);

            Dictionary <string, List <DocumentSelectResultEntry> > relative_groups = null;

            if (groupmode == ScoreComputationModeEnum.category)
            {
                Dictionary <string, List <string> > assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);

                relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);

                // NOTE(review): the UNKNOWN label is removed only after the groups were built, so this may have
                // no effect on relative_groups - confirm whether the removal was meant to happen before grouping.
                if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
                {
                    assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
                }
                log.log("... Page Similarity ... Groups by category");
            }
            else if (groupmode == ScoreComputationModeEnum.site)
            {
                relative_groups = context.GetByDomain(log);
                log.log("... Page Similarity ... Groups by site");
            }
            else if (groupmode == ScoreComputationModeEnum.dataset)
            {
                relative_groups = new Dictionary <string, List <DocumentSelectResultEntry> >();
                relative_groups.Add("dataset", context.items);
                log.log("... Page Similarity ... dataset");
            }
            else
            {
                // Previously this fell through with relative_groups == null and failed with a NullReferenceException.
                throw new ArgumentOutOfRangeException("groupmode", groupmode, "Page similarity supports only the category, site and dataset group modes.");
            }

            // Cache of already computed similarities. The key is the pair of IDs in ordinal order, which makes
            // it independent of the lookup direction and immune to the collisions that plain concatenation of
            // two IDs can produce (e.g. "a" + "bc" versus "ab" + "c").
            ConcurrentDictionary <Tuple <String, String>, Double> computedPairs = new ConcurrentDictionary <Tuple <String, String>, Double>();

            foreach (var domainPair in relative_groups)
            {
                List <DocumentSelectResultEntry> relatives = domainPair.Value;

                for (Int32 hostInd = 0; hostInd < relatives.Count; hostInd++)
                {
                    i++;

                    DocumentSelectResultEntry entry = relatives[hostInd];
                    String  hostId    = entry.AssignedID;
                    Int32   hostIndex = hostInd; // stable copy for the lambda below

                    FeatureVector fv = new FeatureVector(hostId);
                    fv.dimensions = new double[relatives.Count - 1];

                    // Iterating by index avoids a linear IndexOf search for every pair.
                    Parallel.For(0, relatives.Count, (pairIndex) =>
                    {
                        DocumentSelectResultEntry pair = relatives[pairIndex];
                        String pairId = pair.AssignedID;

                        // A document is not compared with itself.
                        if (pairId == hostId)
                        {
                            return;
                        }

                        // The host has no dimension of its own, so members after it shift down by one.
                        Int32 slot = (pairIndex > hostIndex) ? pairIndex - 1 : pairIndex;

                        Tuple <String, String> key = (String.CompareOrdinal(hostId, pairId) <= 0)
                            ? Tuple.Create(hostId, pairId)
                            : Tuple.Create(pairId, hostId);

                        Double docToClassSimilarity;
                        if (!computedPairs.TryGetValue(key, out docToClassSimilarity))
                        {
                            var vecA             = documentDictionarties[pairId];
                            var vecB             = documentDictionarties[hostId];
                            docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);

                            // Losing a race against another thread is harmless: the existing value is kept.
                            computedPairs.TryAdd(key, docToClassSimilarity);
                        }

                        // Each iteration writes a distinct slot, so no synchronization is required here.
                        fv.dimensions[slot] = docToClassSimilarity;
                    });

                    if (i % p == 0)
                    {
                        log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                    }

                    dict.GetOrAdd(domainPair.Key).Add(fv, -1);
                }
            }

            log.log("... Preparation finished ...");

            return(dict);
        }