/// <summary>
/// Builds a feature vector for every item in <paramref name="context"/>, where each dimension holds the
/// similarity between the item's term weights and the term weights of one label's category model.
/// </summary>
/// <param name="context">The selection result; its <c>items</c>, <c>selectedFeatures</c> and <c>spaceModel</c> are read.</param>
/// <param name="TermWeightModel">The term weight model used to compute weights for categories and documents.</param>
/// <param name="function">The similarity function applied to (category weights, document weights).</param>
/// <param name="log">Receives the start/finish messages and periodic progress output.</param>
/// <returns>Feature vectors grouped by the <c>DomainID</c> of their entry; every set has <c>CloseDeploy()</c> called on it before returning.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Category Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys(); //.entries.Select(x => x.name)?.ToList();
    var labels = context.spaceModel.labels;

    // Term weights of each category, keyed by label name.
    Dictionary<String, WeightDictionary> categoryDictionarties = new Dictionary<string, WeightDictionary>();
    foreach (SpaceLabel label in labels)
    {
        // NOTE(review): FirstOrDefault() can come back empty when a label has no category link;
        // the NodeB access below would then fail - confirm every label is always linked.
        Relationship<SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();
        categoryDictionarties.Add(label.name, TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label));
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Int32 i = 0;
    // Progress is reported roughly every 5% of the items. The step is clamped to 1 because
    // "context.Count / 20" is 0 for fewer than 20 items and "i % 0" throws DivideByZeroException.
    Int32 p = Math.Max(1, context.Count / 20);
    Int32 labelCount = labels.Count;

    foreach (var entry in context.items)
    {
        i++;

        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

        FeatureVector fv = new FeatureVector(entry.AssignedID);
        fv.dimensions = new double[labelCount];

        // Dimension k corresponds to labels[k]. Iterating by index avoids a linear IndexOf()
        // search per label and every iteration writes to its own array slot.
        Parallel.For(0, labelCount, k =>
        {
            fv.dimensions[k] = function.ComputeSimilarity(categoryDictionarties[labels[k].name], documentWeights);
        });

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }

        // -1 is passed through unchanged from the original call; presumably "no label id" - TODO confirm.
        dict.GetOrAdd(entry.DomainID).Add(fv, -1);
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation done...");
    return dict;
}
/*
 * public static WeightDictionary GetChildrenWT(WeightDictionary output, SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List<string> FV)
 * {
 *     if (output == null)
 *     {
 *         output = new WeightDictionary(model.name, "");
 *     }
 *     if (model.Children.Any())
 *     {
 *         foreach (var child in model.Children)
 *         {
 *             GetChildrenWT(output, child, weightModel, space, FV);
 *         }
 *     }
 *     else
 *     {
 *         var wd = weightModel.GetWeights(FV, model, space);
 *         output.Merge(wd.index.Values, model.weight);
 *     }
 *     return output;
 * }
 */

/// <summary>
/// Creates a new <typeparamref name="T"/> named after <paramref name="model"/> and merges into its
/// <c>terms</c> the weights computed for each leaf returned by <c>model.GetLeafs()</c>.
/// </summary>
/// <typeparam name="T">The vector document type to create.</typeparam>
/// <param name="model">The document model whose leaves are blended.</param>
/// <param name="weightModel">The weight model used to compute the weights of each leaf.</param>
/// <param name="space">The space model passed through to <c>GetWeights</c>.</param>
/// <param name="FV">The feature (term) names for which weights are computed.</param>
/// <returns>The newly created vector document.</returns>
public static T BlendToVector<T>(this SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List<string> FV) where T : VectorDocument, new()
{
    T output = new T();
    output.name = model.name;

    foreach (var leaf in model.GetLeafs())
    {
        // Weights must be computed for the leaf itself. The previous code passed "model" here,
        // so the same root weights were merged once per leaf and the leaves were never used.
        var wd = weightModel.GetWeights(FV, leaf, space);
        output.terms.Merge(wd);
    }

    //output.terms =
    //output.Merge(wd.index.Values, model.weight);
    //WeightDictionary wd = new WeightDictionary(model.name, "");
    // GetChildrenWT(wd, model, weightModel, space, FV);

    return output;
}
///// <summary> ///// Transforms to fv dictionary. ///// </summary> ///// <param name="context">The context.</param> ///// <param name="TermWeightModel">The term weight model.</param> ///// <param name="function">The function.</param> ///// <returns></returns> //public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageInCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log) //{ // log.log("... Page Similarity ..."); // List<string> selectedTerms = context.selectedFeatures.GetKeys(); // var ByDomain = context.GetByDomain(log); // Dictionary<string, List<string>> assignIDByLabel = context.featureSpace.labelToDocumentAssociations.GetAllRelationShipByName(true); // var ByCategory = context.GetByAssignIDCategory(assignIDByLabel,log); // Dictionary<String, List<DocumentSelectResultEntry>> EntryByLabel = new Dictionary<string, List<DocumentSelectResultEntry>>(); // Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>(); // foreach (var entry in context.items) // { // WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel); // documentDictionarties.Add(entry.AssignedID, documentWeights); // } // FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary(); // Double total = context.Count; // Int32 i = 0; // Int32 p = (context.Count / 10); // //List<List<Double>> matrix = new List<List<double>>(); // //foreach (var entry in context.items) // //{ // // matrix.Add(new List<double>()); // //} // //for (int x = 0; x < context.items.Count; x++) // //{ // // for (int y = 0; y < context.items.Count; x++) // // { // // } // //} // ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>(); // foreach (var domainPair in ByCategory) // { // List<DocumentSelectResultEntry> relatives = ByCategory[domainPair.Key].ToList(); // foreach 
(var entry in relatives) // { // i++; // FeatureVector fv = new FeatureVector(entry.AssignedID); // // List<Double> d = new List<>(); // fv.dimensions = new double[relatives.Count - 1]; // // List<String> keys = documentDictionarties.Keys.ToList(); // Int32 hostInd = relatives.IndexOf(entry); // Int32 c = 0; // //foreach (var pair in documentDictionarties) // //{ // Parallel.ForEach(relatives, (pair) => // { // Int32 ind = relatives.IndexOf(pair); // keys.IndexOf(pair.AssignedID); // if (ind >= hostInd) // { // ind = ind - 1; // } // if (pair.AssignedID != entry.AssignedID) // { // Double docToClassSimilarity = 0; // if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID)) // { // docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID]; // } // else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID)) // { // docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID]; // } // else // { // var vecA = documentDictionarties[pair.AssignedID]; // var vecB = documentDictionarties[entry.AssignedID]; // docToClassSimilarity = function.ComputeSimilarity(vecA, vecB); // if (docToClassSimilarity > 0) // { // } // if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID)) // { // computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity); // //computedPairs.AddOrUpdate(entry.AssignedID + pair.Key, docToClassSimilarity); // } // else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID)) // { // computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity); // } // } // fv.dimensions[ind] = docToClassSimilarity; // } // }); // Int32 r = i % p; // if (r == 0) // { // log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] "); // } // dict.GetOrAdd(entry.DomainID).Add(fv, -1); // } // } // //foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict) // //{ // // pair.Value.CloseDeploy(); // //} // log.log("... 
// Preparation finished ..."); // return dict; //}

/// <summary>
/// Builds a feature vector for every entry in <paramref name="context"/>, grouped by site (domain).
/// For each site a temporary <c>SpaceDocumentModel</c> is assembled from the site's documents, and the
/// first dimension of each vector is the similarity between that site model and the document.
/// </summary>
/// <param name="context">The selection result; grouped via <c>GetByDomain</c>.</param>
/// <param name="TermWeightModel">The term weight model used to compute weights for sites and documents.</param>
/// <param name="function">The similarity function applied to (site weights, document weights).</param>
/// <param name="log">Receives the start/finish messages and periodic progress output.</param>
/// <returns>Feature vectors grouped by site key; every set has <c>CloseDeploy()</c> called on it before returning.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Site Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys(); //.entries.Select(x => x.name)?.ToList();

    // Document weights keyed by AssignedID; Add() throws if the same AssignedID occurs twice.
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

    var byDomain = context.GetByDomain(log);

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    // "i" counts processed sites, so progress is measured against the number of sites
    // (it was previously divided by the number of documents and never reached 100%).
    Int32 i = 0;
    Int32 siteCount = byDomain.Count;
    // Clamp the reporting step to 1: a step of 0 would make "i % p" throw DivideByZeroException.
    Int32 p = Math.Max(1, siteCount / 10);

    foreach (var pair in byDomain)
    {
        i++;

        SpaceDocumentModel siteModel = new SpaceDocumentModel();

        foreach (var ent in pair.Value)
        {
            WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
            documentDictionarties.Add(ent.AssignedID, documentWeights);

            siteModel.Children.Add(ent.spaceDocument);
            //siteModel.terms.MergeDictionary(ent.spaceDocument.terms);
        }

        siteModel.Flatten(false);

        WeightDictionary siteWeights = TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel);

        foreach (var ent in pair.Value)
        {
            FeatureVector fv = new FeatureVector(ent.AssignedID);

            // NOTE(review): the vector is sized by the label count although only dimension 0 is written;
            // kept as-is because consumers may rely on that length - confirm. An empty label list
            // would make the write below fail.
            fv.dimensions = new double[context.spaceModel.labels.Count];
            fv.dimensions[0] = function.ComputeSimilarity(siteWeights, documentDictionarties[ent.AssignedID]);

            dict.GetOrAdd(pair.Key).Add(fv, -1);
        }

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(siteCount).ToString("P2") + "] ");
        }
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation finished ...");
    return dict;
}
/// <summary>
/// Builds a feature vector for every entry in <paramref name="context"/> whose dimensions are the
/// similarities between that entry and every other entry of the same group. Groups are formed by
/// category, by site, or as one group containing the whole dataset, depending on <paramref name="groupmode"/>.
/// </summary>
/// <param name="context">The selection result; its <c>items</c>, <c>selectedFeatures</c> and <c>spaceModel</c> are read.</param>
/// <param name="TermWeightModel">The term weight model used to compute the weights of each document.</param>
/// <param name="function">The similarity function; results are cached per unordered document pair, so it is treated as symmetric.</param>
/// <param name="groupmode">How entries are grouped: <c>category</c>, <c>site</c> or <c>dataset</c>.</param>
/// <param name="log">Receives the grouping/finish messages and periodic progress output.</param>
/// <returns>Feature vectors keyed by group; each vector has (group size - 1) dimensions.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="groupmode"/> is not one of the supported modes.</exception>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
{
    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // Document weights keyed by AssignedID; Add() throws if the same AssignedID occurs twice.
    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();
    foreach (var entry in context.items)
    {
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        documentDictionarties.Add(entry.AssignedID, documentWeights);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Int32 i = 0;
    // Clamp the reporting step to 1: "context.Count / 10" is 0 for fewer than 10 items
    // and "i % 0" throws DivideByZeroException.
    Int32 p = Math.Max(1, context.Count / 10);

    Dictionary<string, List<DocumentSelectResultEntry>> relative_groups = null;

    if (groupmode == ScoreComputationModeEnum.category)
    {
        Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);

        relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);

        // NOTE(review): this removal runs after the groups were already built, so it looks like it
        // was meant to happen before GetByAssignIDCategory(). Order kept as-is - confirm intent.
        if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
        {
            assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
        }

        log.log("... Page Similarity ... Groups by category");
    }
    else if (groupmode == ScoreComputationModeEnum.site)
    {
        relative_groups = context.GetByDomain(log);
        log.log("... Page Similarity ... Groups by site");
    }
    else if (groupmode == ScoreComputationModeEnum.dataset)
    {
        relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
        relative_groups.Add("dataset", context.items);
        log.log("... Page Similarity ... dataset");
    }
    else
    {
        // Previously an unsupported mode left relative_groups null and failed later with a
        // NullReferenceException; fail early with a descriptive error instead.
        throw new ArgumentOutOfRangeException("groupmode", groupmode, "Unsupported grouping mode for page similarity.");
    }

    // Similarity cache keyed by the ordinally ordered pair of document IDs. A structured key
    // avoids the collisions of plain concatenation (e.g. "ab" + "c" versus "a" + "bc"), and
    // the ordering makes (a, b) and (b, a) share a single cache entry.
    ConcurrentDictionary<Tuple<string, string>, Double> computedPairs = new ConcurrentDictionary<Tuple<string, string>, double>();

    foreach (var domainPair in relative_groups)
    {
        List<DocumentSelectResultEntry> relatives = domainPair.Value;

        for (Int32 h = 0; h < relatives.Count; h++)
        {
            i++;

            Int32 hostInd = h;
            var entry = relatives[h];

            FeatureVector fv = new FeatureVector(entry.AssignedID);
            fv.dimensions = new double[relatives.Count - 1];

            Parallel.For(0, relatives.Count, k =>
            {
                var other = relatives[k];

                // The entry is not compared with itself.
                if (other.AssignedID == entry.AssignedID)
                {
                    return;
                }

                // The host entry has no slot of its own, so positions after it shift down by one.
                Int32 ind = (k > hostInd) ? k - 1 : k;

                Tuple<string, string> key = String.CompareOrdinal(entry.AssignedID, other.AssignedID) <= 0
                    ? Tuple.Create(entry.AssignedID, other.AssignedID)
                    : Tuple.Create(other.AssignedID, entry.AssignedID);

                fv.dimensions[ind] = computedPairs.GetOrAdd(key, unused =>
                    function.ComputeSimilarity(documentDictionarties[other.AssignedID], documentDictionarties[entry.AssignedID]));
            });

            if (i % p == 0)
            {
                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
            }

            dict.GetOrAdd(domainPair.Key).Add(fv, -1);
        }
    }

    log.log("... Preparation finished ...");
    return dict;
}