/// <summary>
/// Registers a <see cref="ScoreCategorySimilarity"/> factor in the document ScoreModel.
/// </summary>
/// <param name="remove">When <c>true</c>, the model's existing factors (and serialized factors) are cleared before the new factor is added.</param>
/// <param name="modelName">Model definition name stored on the factor; <c>*</c> makes the factor use the weight model of the corpus under evaluation.</param>
/// <param name="computation">Computation mode assigned to the factor.</param>
/// <param name="weight">Weight assigned to the factor.</param>
/// <param name="normalize">Value assigned to the factor's normalization switch.</param>
/// <remarks>
/// The factor is appended to <c>data.ranking.model.Factors</c>, i.e. a TermWeight based similarity score
/// becomes part of the document score computation model.
/// </remarks>
/// <seealso cref="aceOperationSetExecutorBase" />
public void aceOperation_setAddSimilarityFactor(
    [Description("If true it will remove any existing ScoreModel Factors")] Boolean remove = true,
    [Description("Model definition name, leave * to use current weighting model from docSelection setup")] String modelName = "*",
    [Description("Computation mode")] ScoreComputationModeEnum computation = ScoreComputationModeEnum.none,
    [Description("Weight associated with the term weight based document score factor")] Double weight = 1.0,
    [Description("Normalize score value on range from 0.0 to 1.0, across the sample")] Boolean normalize = true)
{
    Boolean keepExistingFactors = !remove;
    if (!keepExistingFactors)
    {
        // Start from a clean model: drop both the live and the serialized factor lists.
        data.ranking.model.Factors.Clear();
        data.ranking.model.SerializedFactors.Clear();
    }

    // Initializer members are assigned in the order written, matching the original statement order.
    ScoreCategorySimilarity similarityFactor = new ScoreCategorySimilarity
    {
        weight = weight,
        computation = computation,
        doNormalize = normalize,
        modelDefinitionFile = modelName
    };

    // "*" is the wildcard for "reuse the weight model already attached to the evaluation corpus".
    Boolean useCurrentWeightModel = (modelName == "*");
    if (useCurrentWeightModel)
    {
        similarityFactor.TermWeightModel = data.corpusForEvaluation.WeightModel;
    }

    data.ranking.model.Factors.Add(similarityFactor);
}
/// <summary>
/// Builds, for every document of the selection, a feature vector whose dimensions are the similarities
/// between that document and each of the other documents in the same group.
/// </summary>
/// <param name="context">Document selection supplying the entries, the selected features and the space model.</param>
/// <param name="TermWeightModel">Weight model used to compute one weight dictionary per document.</param>
/// <param name="function">Similarity function applied to pairs of document weight dictionaries.</param>
/// <param name="groupmode">How documents are grouped: <c>category</c>, <c>site</c> or <c>dataset</c> (a single group named "dataset").</param>
/// <param name="log">Log builder that receives progress messages.</param>
/// <returns>
/// Feature vector sets keyed by group name. Each vector has (group size - 1) dimensions: one per other
/// member of the group, in group order with the document itself skipped.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="groupmode"/> is not one of the supported grouping modes.
/// </exception>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
{
    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // One weight dictionary per document, keyed by the document's AssignedID.
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();
    foreach (var entry in context.items)
    {
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        documentDictionarties.Add(entry.AssignedID, documentWeights);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Int32 i = 0;

    // Progress is reported roughly every 10% of the selection. The step is clamped to 1 so that
    // selections with fewer than 10 documents do not cause a division by zero in the modulo below.
    Int32 p = Math.Max(1, context.Count / 10);

    Dictionary<string, List<DocumentSelectResultEntry>> relative_groups;

    switch (groupmode)
    {
        case ScoreComputationModeEnum.category:
            Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);
            relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);

            // NOTE(review): the UNKNOWN label is removed only after the groups were built, so it looks
            // like this has no influence on relative_groups - confirm whether it was meant to run first.
            if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
            {
                assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
            }
            log.log("... Page Similarity ... Groups by category");
            break;

        case ScoreComputationModeEnum.site:
            relative_groups = context.GetByDomain(log);
            log.log("... Page Similarity ... Groups by site");
            break;

        case ScoreComputationModeEnum.dataset:
            relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
            relative_groups.Add("dataset", context.items);
            log.log("... Page Similarity ... dataset");
            break;

        default:
            // Previously this case left the groups null and failed later with a NullReferenceException.
            throw new ArgumentOutOfRangeException("groupmode", groupmode, "Page similarity supports only category, site and dataset grouping modes.");
    }

    // Cache of already computed similarities. The key is the ordered pair of document IDs, so (a, b) and
    // (b, a) share one slot and IDs cannot collide the way plain string concatenation could.
    ConcurrentDictionary<Tuple<String, String>, Double> computedPairs = new ConcurrentDictionary<Tuple<String, String>, Double>();

    foreach (var groupPair in relative_groups)
    {
        List<DocumentSelectResultEntry> relatives = groupPair.Value;

        for (Int32 hostPosition = 0; hostPosition < relatives.Count; hostPosition++)
        {
            Int32 hostInd = hostPosition;
            DocumentSelectResultEntry entry = relatives[hostInd];
            String entryId = entry.AssignedID;
            i++;

            FeatureVector fv = new FeatureVector(entryId);
            fv.dimensions = new double[relatives.Count - 1];

            // Each iteration writes to its own slot of fv.dimensions, so no locking is required.
            Parallel.For(0, relatives.Count, otherInd =>
            {
                DocumentSelectResultEntry pair = relatives[otherInd];
                String pairId = pair.AssignedID;

                if (pairId == entryId)
                {
                    return; // a document is not compared with itself
                }

                Tuple<String, String> pairKey = (String.CompareOrdinal(entryId, pairId) <= 0)
                    ? Tuple.Create(entryId, pairId)
                    : Tuple.Create(pairId, entryId);

                Double docToClassSimilarity;
                if (!computedPairs.TryGetValue(pairKey, out docToClassSimilarity))
                {
                    var vecA = documentDictionarties[pairId];
                    var vecB = documentDictionarties[entryId];
                    docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
                    computedPairs.TryAdd(pairKey, docToClassSimilarity);
                }

                // Members placed after the host shift one slot left, because the host has no dimension.
                Int32 ind = (otherInd > hostInd) ? otherInd - 1 : otherInd;
                fv.dimensions[ind] = docToClassSimilarity;
            });

            if (i % p == 0)
            {
                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
            }

            dict.GetOrAdd(groupPair.Key).Add(fv, -1);
        }
    }

    log.log("... Preparation finished ...");

    return dict;
}