/// <summary>
/// Prepares the specified context.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">Thrown when <c>context.spaceModel</c> is not declared.</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    weightDictionary = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

    if (context.spaceModel == null)
    {
        String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
        throw new ArgumentException(msg, nameof(context));
    }

    // Tokenize and stem the query terms, if a query is present.
    if (!context.query.isNullOrEmpty())
    {
        context.query.QueryTerms = context.query.QueryTerms.Trim();

        List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);

        foreach (String tkn in tkns)
        {
            queryTerms.Add(context.stemmingContext.Stem(tkn));
        }
    }
}
/// <summary>
/// Scores the specified entry.
/// </summary>
/// <param name="entry">The entry.</param>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <returns>Score assigned to the entry.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    if (useMachineLearning)
    {
        // construct a feature vector for the entry and let the trained classifier score it
        WeightDictionary dc_vec = TermWeightModel.GetWeights(SelectedTerms.GetKeys(), entry.spaceDocument, context.spaceModel);
        var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, entry.AssignedID);

        Int32 l_id = -1;
        if (sc_id.ContainsKey(entry.AssignedID))
        {
            l_id = sc_id[entry.AssignedID];
        }

        Double score = classifier.DoScore(n_vec, log, l_id);
        return score;
    }

    // otherwise, look the score up in the precomputed score dictionary
    if (scoreDictionary.ContainsKey(entry.AssignedID))
    {
        var fv = scoreDictionary[entry.AssignedID];
        return fv.CompressNumericVector(vectorCompression);
    }

    return 0;
}
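// --- Usage sketch (illustrative, not part of the library) ---
// A minimal scoring loop, assuming `factor` is a configured instance of this
// ranking factor and `context` is an already populated DocumentSelectResult.
factor.Prepare(context, log);

var scores = new Dictionary<String, Double>();
foreach (DocumentSelectResultEntry entry in context.items)
{
    scores[entry.AssignedID] = factor.Score(entry, context, log);
}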
/// <summary>
/// Filters the space model features, removing all terms not present in <c>selectedFeatures</c>.
/// </summary>
/// <param name="spaceModel">The space model.</param>
/// <param name="selectedFeatures">The selected features.</param>
/// <param name="log">The log.</param>
/// <returns>Number of term occurrences filtered out.</returns>
public static Int32 FilterSpaceModelFeatures(this SpaceModel spaceModel, WeightDictionary selectedFeatures, ILogBuilder log)
{
    Int32 i = 0;
    Int32 s = spaceModel.documents.Count() / 5; // log progress roughly every 20%
    Int32 c_filter_out = 0;

    List<String> keys = selectedFeatures.GetKeys();
    List<String> termsToRemove = spaceModel.terms.GetTokensOtherThan(keys);

    for (int i2 = 0; i2 < spaceModel.documents.Count; i2++)
    {
        c_filter_out += spaceModel.documents[i2].FilterSelectedFeatures(termsToRemove, false);

        if (i > s)
        {
            Double r = i2.GetRatio(spaceModel.documents.Count());
            log.log("Filter SelectedFeatures [" + r.ToString("P2") + "]");
            i = 0;
        }
        i++;
    }

    spaceModel.terms_known_label.FilterTokens(termsToRemove, false);
    spaceModel.terms_unknown_label.FilterTokens(termsToRemove, false);

    return c_filter_out;
}
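// --- Usage sketch (illustrative, not part of the library) ---
// Prunes the space model down to a previously selected feature set; the return
// value is the number of term occurrences removed.
Int32 removedCount = spaceModel.FilterSpaceModelFeatures(selectedFeatures, log);
log.log("Removed [" + removedCount + "] term occurrences outside the selected features");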
/// <summary>
/// Transforms the context into a feature vector dictionary, where each dimension is the document's similarity to one category.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The similarity function.</param>
/// <param name="log">The log.</param>
/// <returns>Feature vector set dictionary, grouped by domain.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Category Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    // build one weight dictionary per category
    Dictionary<String, WeightDictionary> categoryDictionaries = new Dictionary<string, WeightDictionary>();

    foreach (SpaceLabel label in context.spaceModel.labels)
    {
        Relationship<SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();
        var c = TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label);
        categoryDictionaries.Add(label.name, c);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 20); // progress step; guarded against zero for small sets

    foreach (var entry in context.items)
    {
        i++;
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

        FeatureVector fv = new FeatureVector(entry.AssignedID);
        fv.dimensions = new double[context.spaceModel.labels.Count];

        Parallel.ForEach(context.spaceModel.labels, (label) =>
        {
            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionaries[label.name], documentWeights);
            fv.dimensions[context.spaceModel.labels.IndexOf(label)] = docToClassSimilarity;
        });

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }

        dict.GetOrAdd(entry.DomainID).Add(fv, -1);
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation done ...");
    return dict;
}
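// --- Usage sketch (illustrative, not part of the library) ---
// Assumes `weightModel` and a cosine-style IVectorSimilarityFunction instance named
// `similarityFunction`; the context must already carry selectedFeatures and spaceModel.
FeatureVectorSetDictionary byCategory =
    context.TransformToFVDictionaryAsCategorySimilarity(weightModel, similarityFunction, log);
// each resulting FeatureVector holds one dimension per label: document-to-category similarity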
public override double ComputeSimilarity(WeightDictionary vectorA, WeightDictionary vectorB)
{
    WeightDictionaryEntryPairs termPairs = new WeightDictionaryEntryPairs(vectorA, vectorB);

    if (termPairs.Count == 0)
    {
        return 0;
    }

    return Compute(termPairs);
}
/// <summary>
/// Gets the weights.
/// </summary>
/// <param name="termWhiteList">The term white list.</param>
/// <param name="document">The document.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary for the document.</returns>
public WeightDictionary GetWeights(List<String> termWhiteList, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionary output = new WeightDictionary();
    output.name = GetSignature() + "_" + document.name;
    output.description = "Feature weight table constructed by [" + GetSignature() + "] for features [" + termWhiteList.Count + "] in document [" + document.name + "]";
    output.nDimensions = nDimensions;

    if (KERNELOPTION_USE_WHITELISTTERMS)
    {
        // white-list driven construction is not supported yet
        foreach (String term in termWhiteList)
        {
            if (document.terms.Contains(term))
            {
                throw new NotImplementedException();
            }
        }
    }
    else
    {
        List<String> terms = document.terms.GetTokens();

        for (int i = 0; i < document.terms.Count; i++)
        {
            String term = terms[i];
            WeightDictionaryEntry entry = new WeightDictionaryEntry(term, 0);

            if (DoUseLocalFunction)
            {
                entry = LocalFunction.GetElementFactorEntry(term, document);
            }

            // multiply the local factor with each weighted global factor
            foreach (FeatureWeightFactor gf in GlobalFactors)
            {
                entry = entry * (gf.GlobalFunction.GetElementFactorEntry(term, space, label) * gf.weight);
            }

            if (document.weight != 1)
            {
                entry = entry * document.weight;
            }

            output.Merge(entry);
        }
    }

    return output;
}
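// --- Standalone sketch of the local-by-global composition applied above (illustrative) ---
// GetWeights multiplies a local factor (e.g. term frequency) by each weighted global
// factor (e.g. IDF). The toy code below reproduces that composition with plain TF and
// IDF stand-ins; it is not the library's implementation.
using System;
using System.Collections.Generic;
using System.Linq;

internal static class WeightCompositionSketch
{
    public static Dictionary<string, double> Compose(
        Dictionary<string, int> termFrequency,   // local factor: term counts in one document
        List<List<string>> corpus,               // global view: token lists of all documents
        double globalWeight = 1.0)               // corresponds to FeatureWeightFactor.weight
    {
        int n = corpus.Count;
        var weights = new Dictionary<string, double>();

        foreach (var pair in termFrequency)
        {
            int df = corpus.Count(doc => doc.Contains(pair.Key));
            double idf = Math.Log((double)n / Math.Max(1, df));

            // w(t, d) = L(t, d) * (G(t) * g), mirroring the entry multiplication above
            weights[pair.Key] = pair.Value * (idf * globalWeight);
        }

        return weights;
    }
}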
public FeatureCWPAnalysisEntryReport(String __name, String description, folderNode _folder, FeatureCWPAnalysisSettings.AnalysisPurpose _purpose)
{
    purpose = _purpose;
    name = __name;

    if (purpose != FeatureCWPAnalysisSettings.AnalysisPurpose.application)
    {
        EntryDictionary = new WeightDictionary("DictionaryFor" + name, description);
        EntryDictionary.nDimensions = fields().Count;
    }

    folder = _folder;
}
/// <summary>
/// Prepares the specified context.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">Thrown when <c>context.spaceModel</c> is not declared.</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
    String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
    TermWeightModel.Deploy(log);

    if (context.spaceModel == null)
    {
        String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
        throw new ArgumentException(msg, nameof(context));
    }

    if (File.Exists(p_d) && useStoredData)
    {
        // reuse the stored weighting model data set instead of recomputing it
        WeightingModelDataSet data = objectSerialization.loadObjectFromXML<WeightingModelDataSet>(p_d, log);
        TermWeightModel.LoadModelDataSet(data, log);

        if (useSelectedFeatures)
        {
            SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
        }
    }
    else
    {
        TermWeightModel.PrepareTheModel(context.spaceModel, log);
    }

    // Tokenize and stem the query terms, if a query is present.
    if (!context.query.isNullOrEmpty())
    {
        context.query.QueryTerms = context.query.QueryTerms.Trim();

        List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);

        foreach (String tkn in tkns)
        {
            queryTerms.Add(context.stemmingContext.Stem(tkn));
        }
    }
}
public void DeployAndRun(ILogBuilder log, SpaceModel _space, folderNode folder)
{
    filter.Deploy(log, folder);

    weightedFeatures = new WeightDictionary(name + "_weg" + filter.limit.ToString(), "weighted features, before filter");
    selectedFeatures = new WeightDictionary(name + "_sel" + filter.limit.ToString(), "selected weighted features");

    var selected = filter.SelectFeatures(_space, log, folder, weightedFeatures);

    foreach (var pair in selected)
    {
        selectedFeatures.AddEntry(pair.Key, pair.Value);
    }

    weightedFeatures.Save(folder, log, WeightDictionary.GetDictionaryFilename(weightedFeatures.name, folder));
    selectedFeatures.Save(folder, log, WeightDictionary.GetDictionaryFilename(selectedFeatures.name, folder));
}
/// <summary>
/// Constructs a global weight dictionary using global elements.
/// </summary>
/// <param name="terms">The terms.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary with one global factor per term.</returns>
public WeightDictionary GetElementFactors(IEnumerable<string> terms, SpaceModel space, SpaceLabel label = null)
{
    var output = new WeightDictionary();
    output.name = GetSignature() + "_globalOnly";

    foreach (String term in terms)
    {
        Double score = GetElementFactor(term, space, label);
        WeightDictionaryEntry entry = new WeightDictionaryEntry(term, score);
        output.AddEntry(entry, true);
    }

    output.description = "Global weights for [" + output.Count + "] terms.";
    return output;
}
private void SetWeightButton_Click(object sender, EventArgs e)
{
    this.allStringWeightList = new List<WeightDictionary>();

    Dictionary<int, int> weightRelation = convertColumnIndex(this.weightRelation);

    if (!weightAndColumnNameRelationIndexConverted)
    {
        convertWeightAndColumnNameRelationIndex();
        this.weightAndColumnNameRelationIndexConverted = true;
    }

    foreach (KeyValuePair<int, int> kvp in weightRelation)
    {
        WeightDictionary weightDict = new WeightDictionary();
        int weightValueColumn = kvp.Key;
        int weightNameColumn = kvp.Value;

        weightDict.columnName = weightAndColumnNameRelation[weightValueColumn];
        weightDict.variableName = selectedColumnVariableDict[weightAndColumnNameRelation[weightValueColumn]].ToString();

        for (int i = 0; i < this.selectedColumnDataGridView.Rows.Count; i++)
        {
            object nameCell = selectedColumnDataGridView.Rows[i].Cells[weightNameColumn].Value;
            object valueCell = selectedColumnDataGridView.Rows[i].Cells[weightValueColumn].Value;

            if (!nameCell.ToString().Equals("") && valueCell == null)
            {
                // a named row is missing its weight value: report the row number and abort
                MessageBox.Show(loadGlobalChineseCharacters.GlobalChineseCharactersDict["error_1"] + "," + weightAndColumnNameRelation[weightValueColumn] + loadGlobalChineseCharacters.GlobalChineseCharactersDict["weight"] + ":" + (i + 1) + loadGlobalChineseCharacters.GlobalChineseCharactersDict["row"], "message");
                return;
            }
            else if (nameCell.ToString().Equals("") && valueCell == null)
            {
                // end of the filled rows
                break;
            }
            else if (!nameCell.ToString().Equals("") && valueCell != null)
            {
                string valueStr = valueCell.ToString();
                float value = Convert.ToSingle(valueStr);
                string name = nameCell.ToString();
                weightDict.Add(name, value);
            }
        }

        allStringWeightList.Add(weightDict);
    }

    printAllWeightList();
}
/// <summary>
/// Builds a dictionary of global element factors.
/// </summary>
/// <param name="terms">The terms.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary with one entry per term.</returns>
public WeightDictionary GetElementFactors(IEnumerable<String> terms, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionary output = new WeightDictionary();

    switch (resultType)
    {
        case FunctionResultTypeEnum.numeric:
            output.nDimensions = 1;
            break;

        case FunctionResultTypeEnum.numericVectorForMultiClass:
            output.nDimensions = space.labels.Count;
            break;
    }

    foreach (String term in terms)
    {
        output.AddEntry(GetElementFactorEntry(term, space, label));
    }

    return output;
}
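// --- Usage sketch (illustrative, not part of the library) ---
// For FunctionResultTypeEnum.numeric each entry carries a single value; for
// numericVectorForMultiClass it carries one value per label in the space.
WeightDictionary factors = globalFunction.GetElementFactors(terms, spaceModel);
// factors.nDimensions == 1, or spaceModel.labels.Count in the multi-class case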
public void Deploy(ILogBuilder logger, folderNode folder = null)
{
    if (!outputFilename.isNullOrEmpty())
    {
        if (folder != null)
        {
            String p_m = folder.pathFor(outputFilename, imbSCI.Data.enums.getWritableFileMode.none, "", false);
            precompiledSelection = WeightDictionary.LoadFile(p_m, logger);
        }
    }

    if (WeightModel != null)
    {
        WeightModel.DoUseLocalFunction = false;
        WeightModel.Deploy(logger);
    }
}
/// <summary>
/// Constructs a feature vector, with dimension values set by <see cref="dimensionFunctionSet"/>.
/// </summary>
/// <param name="terms">The weight dictionary to read values from.</param>
/// <param name="name">The name assigned to the feature vector.</param>
/// <returns>The constructed feature vector.</returns>
public FeatureVector ConstructFeatureVector(WeightDictionary terms, String name)
{
    FeatureVector fv = new FeatureVector(name);
    Int32 c = 0;
    Int32 d = terms.nDimensions;

    fv.dimensions = new double[dimensionFunctionSet.Count * d];

    foreach (FeatureSpaceDimensionBase dimension in dimensionFunctionSet)
    {
        for (int i = 0; i < d; i++)
        {
            fv.dimensions[c] = dimension.ComputeDimension(terms, i);
            c++;
        }
    }

    return fv;
}
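// --- Usage sketch (illustrative, not part of the library) ---
// With F dimension functions over a WeightDictionary of D numeric dimensions, the
// vector has F * D slots, filled function-major: slot c = f * D + i for function f
// and dictionary dimension i, matching the nested loops above.
FeatureVector fv = constructor.ConstructFeatureVector(documentWeights, entry.AssignedID);
// fv.dimensions.Length == dimensionFunctionSet.Count * documentWeights.nDimensions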
/// <summary>
/// Transforms the context into a feature vector dictionary, scoring each page by its similarity to its own site model.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The similarity function.</param>
/// <param name="log">The log.</param>
/// <returns>Feature vector set dictionary, grouped by domain.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
{
    log.log("... Site Similarity ...");

    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    Dictionary<String, WeightDictionary> categoryDictionaries = new Dictionary<string, WeightDictionary>();
    Dictionary<String, WeightDictionary> documentDictionaries = new Dictionary<string, WeightDictionary>();

    var byDomain = context.GetByDomain(log);

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 10); // progress step; guarded against zero for small sets

    foreach (var pair in byDomain)
    {
        i++;

        // aggregate all documents of the domain into a single site model
        SpaceDocumentModel siteModel = new SpaceDocumentModel();

        foreach (var ent in pair.Value)
        {
            WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
            documentDictionaries.Add(ent.AssignedID, documentWeights);
            siteModel.Children.Add(ent.spaceDocument);
        }

        siteModel.Flatten(false);

        categoryDictionaries.Add(pair.Key, TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel));

        foreach (var ent in pair.Value)
        {
            FeatureVector fv = new FeatureVector(ent.AssignedID);
            fv.dimensions = new double[context.spaceModel.labels.Count];

            // only the first dimension is populated: page-to-site similarity
            var docToClassSimilarity = function.ComputeSimilarity(categoryDictionaries[pair.Key], documentDictionaries[ent.AssignedID]);
            fv.dimensions[0] = docToClassSimilarity;

            dict.GetOrAdd(pair.Key).Add(fv, -1);
        }

        if (i % p == 0)
        {
            log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        }
    }

    foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
    {
        pair.Value.CloseDeploy();
    }

    log.log("... Preparation finished ...");
    return dict;
}
public abstract double ComputeDimension(WeightDictionary vector, Int32 d = 0);
/// <summary>
/// Computes the similarity between two weight dictionaries.
/// </summary>
/// <param name="vectorA">The vector A.</param>
/// <param name="vectorB">The vector B.</param>
/// <returns>Similarity score.</returns>
public abstract double ComputeSimilarity(WeightDictionary vectorA, WeightDictionary vectorB);
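// --- Minimal cosine implementation sketch (illustrative, not the library's class) ---
// Assumes WeightDictionary exposes GetKeys() and GetValue(term, dimension), as used
// elsewhere in this codebase; a concrete subclass would override ComputeSimilarity.
public class CosineSimilaritySketch
{
    public double ComputeSimilarity(WeightDictionary vectorA, WeightDictionary vectorB)
    {
        double dot = 0, normA = 0, normB = 0;

        foreach (String term in vectorA.GetKeys())
        {
            double a = vectorA.GetValue(term, 0);
            double b = vectorB.GetValue(term, 0);
            dot += a * b;
            normA += a * a;
        }

        foreach (String term in vectorB.GetKeys())
        {
            double b = vectorB.GetValue(term, 0);
            normB += b * b;
        }

        if (normA == 0 || normB == 0) return 0;
        return dot / (Math.Sqrt(normA) * Math.Sqrt(normB));
    }
}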
/// <summary>
/// Picks the specified number of sample documents and constructs a demo table, showing all term weight components.
/// </summary>
/// <param name="space">The space.</param>
/// <param name="weightModel">The weight model.</param>
/// <param name="selectedFeatures">The selected features.</param>
/// <param name="sampleDocuments">Number of sample documents.</param>
/// <param name="name">The name.</param>
/// <param name="description">The description.</param>
/// <returns>Demo table with local, global and total weight columns per sample document.</returns>
public static DataTable MakeWeightModelDemoTable(this SpaceModel space, FeatureWeightModel weightModel, WeightDictionary selectedFeatures, Int32 sampleDocuments, String name, String description)
{
    DataTable table = new DataTable();
    table.SetTitle(name);
    table.SetDescription(description);

    table.SetAdditionalInfoEntry("Documents", space.documents.Count, "Total count of document vectors");
    table.SetAdditionalInfoEntry("Local function", weightModel.LocalFunction.GetSignature(), weightModel.LocalFunction.description);

    var sampleIn = space.documents.Take(Math.Min(sampleDocuments, space.documents.Count)).ToList();
    List<SpaceDocumentModel> sample = new List<SpaceDocumentModel>();
    foreach (var s in sampleIn)
    {
        sample.Add(s);
    }

    // collect up to 500 selected terms found in the first sample document
    List<String> terms = new List<String>();
    var terms_in = sample.First().GetTerms(true, true).GetTokens();
    foreach (var t in terms_in)
    {
        if (selectedFeatures.ContainsKey(t))
        {
            terms.Add(t);
        }
        if (terms.Count > 500)
        {
            break;
        }
    }

    DataColumn column_token = table.Add("Name", "Name of the document vector", "Name", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(50);

    // one local-function column per sample document
    List<DataColumn> localColumns = new List<DataColumn>();
    for (int i = 0; i < sample.Count; i++)
    {
        var doc = sample[i];
        localColumns.Add(
            table.Add(weightModel.LocalFunction.shortName + i.ToString(),
                      weightModel.LocalFunction.GetSignature() + " for document: " + doc.name,
                      weightModel.LocalFunction.shortName,
                      typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5",
                      weightModel.LocalFunction.GetSignature() + "[" + i.ToString("D2") + "]").SetGroup("Local"));
    }

    // one column per global factor
    Int32 c = 0;
    List<DataColumn> globalColumns = new List<DataColumn>();
    foreach (FeatureWeightFactor gl in weightModel.GlobalFactors)
    {
        globalColumns.Add(
            table.Add(gl.GlobalFunction.shortName + c.ToString(),
                      gl.GlobalFunction.shortName + " at w= " + gl.weight,
                      gl.GlobalFunction.shortName,
                      typeof(Double), imbSCI.Core.enums.dataPointImportance.important, "F5",
                      gl.Settings.GetSignature() + "[" + c.ToString("D2") + "]").SetGroup("Global"));
        c++;
    }

    // one total-score column per sample document
    Int32 ct = 0;
    List<DataColumn> totalColumns = new List<DataColumn>();
    foreach (var doc in sample)
    {
        totalColumns.Add(
            table.Add("TotalScore" + ct.ToString(),
                      weightModel.LocalFunction.GetSignature() + " for document: " + doc.name,
                      weightModel.LocalFunction.shortName,
                      typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5",
                      weightModel.LocalFunction.GetSignature() + "[" + ct.ToString("D2") + "]").SetGroup("Total"));
        ct++;
    }

    foreach (String term in terms)
    {
        var dr = table.NewRow();
        dr[column_token] = term;

        Int32 li = 0;
        foreach (DataColumn local in localColumns)
        {
            dr[local] = weightModel.LocalFunction.GetElementFactor(term, sample[li]);
            li++;
        }

        li = 0;
        foreach (DataColumn global in globalColumns)
        {
            dr[global] = weightModel.GlobalFactors[li].GlobalFunction.GetElementFactor(term, space);
            li++;
        }

        li = 0;
        foreach (DataColumn totalColumn in totalColumns)
        {
            dr[totalColumn] = weightModel.GetWeight(term, sample[li], space);
            li++;
        }

        table.Rows.Add(dr);
    }

    return table;
}
/// <summary>
/// Makes a ranked weight table.
/// </summary>
/// <param name="terms">The terms.</param>
/// <param name="name">The name.</param>
/// <param name="description">The description.</param>
/// <param name="dimension">Custom names of dimensions, for the case of a vector collection.</param>
/// <param name="limit">The limit; 0 for no limit.</param>
/// <param name="sortByDimension">Index of the dimension to sort by.</param>
/// <param name="distinctBlockSize">Maximum run of non-distinct sort values to include.</param>
/// <returns>Ranked weight table.</returns>
public static DataTable MakeTable(this WeightDictionary terms, String name, String description, List<String> dimension = null, Int32 limit = 0, Int32 sortByDimension = 0, Int32 distinctBlockSize = 25)
{
    // the default dimension naming must be resolved before `dimension` is dereferenced below
    if (dimension == null)
    {
        dimension = new List<string>();
        dimension.Add("Weight");
    }

    DataTable table = new DataTable();

    if (sortByDimension > 0)
    {
        if (sortByDimension < dimension.Count)
        {
            name = name + "_" + dimension[sortByDimension];
        }
        else
        {
            name = name + "_" + sortByDimension.ToString("D3");
        }
    }

    table.SetTitle(name);
    table.SetDescription(description);

    List<WeightDictionaryEntry> ranking = terms.index.Values.OrderByDescending(x => x.dimensions[sortByDimension]).ToList();

    String sortedByDimension = dimension[sortByDimension];

    table.SetAdditionalInfoEntry("Count", terms.Count, "Total weighted features in the dictionary");
    table.SetAdditionalInfoEntry("Dimensions", dimension.Count, "Number of dimensions");

    DataColumn column_rank = table.Add("Rank", "Rank by frequency", "R", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(6);
    DataColumn column_token = table.Add("Token", "Token", "t", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(20);

    List<DataColumn> dimensions = new List<DataColumn>();
    Dictionary<String, List<Double>> distinctValues = new Dictionary<string, List<Double>>();

    Int32 cd = 1;
    foreach (String dim in dimension)
    {
        var cn = table.Add(dim, "Associated dimension [" + cd.ToString() + "] " + dim, dim, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5", dim);
        cn.SetWidth(10);
        distinctValues.Add(dim, new List<Double>());
        dimensions.Add(cn);
        cd++;
    }

    var list = ranking;
    if (limit > 0)
    {
        list = ranking.Take(Math.Min(limit, ranking.Count)).ToList();
        if (list.Count < terms.Count)
        {
            table.AddExtra("Table contains top [" + list.Count + "] entries, out of [" + terms.Count + "] enumerated in the feature weighted dictionary");
        }
    }

    Int32 sortByDimensionNonDistinct = 0;
    Int32 c = 1;

    foreach (var pair in list)
    {
        // track runs of repeated values in the sort dimension, to cap non-distinct blocks
        if (distinctValues[sortedByDimension].Contains(pair.dimensions[sortByDimension]))
        {
            sortByDimensionNonDistinct++;
        }
        else
        {
            sortByDimensionNonDistinct = 0;
        }

        if (sortByDimensionNonDistinct < distinctBlockSize)
        {
            var dr = table.NewRow();
            dr[column_rank] = c;
            dr[column_token] = pair.name;

            Int32 ci = 0;
            foreach (DataColumn dimCol in dimensions)
            {
                if (ci < pair.dimensions.Length)
                {
                    var v = pair.dimensions[ci];
                    if (!distinctValues[dimCol.ColumnName].Contains(v))
                    {
                        distinctValues[dimCol.ColumnName].Add(v);
                    }
                    dr[dimCol] = v;
                }
                ci++;
            }

            c++;
            table.Rows.Add(dr);
        }
    }

    // drop dimension columns that carry no distinct values
    foreach (String dim in dimension)
    {
        if (dim != sortedByDimension)
        {
            if (distinctValues[dim].Count < 2)
            {
                table.Columns.Remove(dim);
                table.SetAdditionalInfoEntry(dim + " removed", "Removed as having no distinct values", "Automatically removed");
            }
        }
    }

    return table;
}
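// --- Usage sketch (illustrative, not part of the library) ---
// Renders the top 100 features into a report table, sorted by the single "Weight" dimension.
DataTable report = weightedFeatures.MakeTable(
    "FeatureWeights",
    "Top-ranked features by weight",
    new List<String> { "Weight" },
    limit: 100);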
/// <summary>
/// Selects the top <see cref="limit"/> terms, ranked by the weight model.
/// </summary>
/// <param name="space">The space.</param>
/// <param name="log">The log.</param>
/// <param name="folder">Optional output folder.</param>
/// <param name="featureScores">Optional dictionary to receive the computed feature scores.</param>
/// <returns>Term-score pairs for the selected features.</returns>
public List<KeyValuePair<string, double>> SelectFeatures(SpaceModel space, ILogBuilder log, folderNode folder = null, WeightDictionary featureScores = null)
{
    Dictionary<String, Double> rank = new Dictionary<string, double>();

    Boolean doAll = false;
    if (limit == -1)
    {
        doAll = true;
    }

    if (featureScores == null)
    {
        featureScores = new WeightDictionary();
    }

    var tokens = space.terms_known_label.GetTokens();

    if (precompiledSelection != null && precompiledSelection.Count > 0)
    {
        log.log("Using precompiled selection filter from [" + outputFilename + "]");
        featureScores.Merge(precompiledSelection);
    }
    else
    {
        WeightModel.PrepareTheModel(space, log);
        featureScores = WeightModel.GetElementFactors(tokens, space);
    }

    if (tokens.Count() <= limit)
    {
        doAll = true;
    }

    if (doAll)
    {
        // keep everything, with a uniform weight of 1
        List<KeyValuePair<string, double>> outAll = new List<KeyValuePair<string, double>>();
        foreach (String tkn in tokens)
        {
            outAll.Add(new KeyValuePair<string, double>(tkn, 1));
        }
        return outAll;
    }

    if (!outputFilename.isNullOrEmpty())
    {
        if (folder != null)
        {
            featureScores.Save(folder, log, outputFilename);
        }
    }

    foreach (WeightDictionaryEntry en in featureScores.index.Values)
    {
        // collapse multi-dimensional scores into a single ranking value
        Double v = 0;
        if (featureScores.nDimensions > 1)
        {
            v = en.CompressNumericVector(nVectorValueSelectionOperation);
        }
        else
        {
            v = en.weight;
        }

        Boolean ok = true;
        if (RemoveZero && v == 0)
        {
            ok = false;
        }

        if (ok)
        {
            rank.Add(en.name, v);
        }
    }

    var rankSorted = rank.OrderByDescending(x => x.Value).ToList();
    List<KeyValuePair<string, double>> top = rankSorted.Take(Math.Min(limit, rankSorted.Count)).ToList();
    return top;
}
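// --- Usage sketch (illustrative, not part of the library) ---
// Mirrors DeployAndRun above: rank terms, then keep the returned top pairs.
List<KeyValuePair<string, double>> top = filter.SelectFeatures(spaceModel, log, folder);
foreach (var pair in top)
{
    selectedFeatures.AddEntry(pair.Key, pair.Value);
}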
public static void SetReportDataFields(this classificationReport report, WeightDictionary selected)
{
    report.data.Add(nameof(ReportDataFieldEnum.SelectedFeatures), selected.Count.ToString(), "Number of selected features");
}
public override double ComputeDimension(WeightDictionary vector, Int32 d = 0)
{
    return vector.GetValue(term, d);
}
public override ExperimentDataSetFoldContextPair<OperationContext> Execute(ILogBuilder logger, OperationContext executionContextMain = null, ExperimentModelExecutionContext executionContextExtra = null)
{
    ExperimentDataSetFoldContextPair<OperationContext> output = new ExperimentDataSetFoldContextPair<OperationContext>(fold, executionContextMain);

    Open();

    String p_m = FeatureWeightModel.GetModelDefinitionFilename(setup.OutputFilename, fold_notes.folder);
    String p_d = FeatureWeightModel.GetModelDataFilename(setup.OutputFilename, fold_notes.folder);
    String w_t = WeightDictionary.GetDictionaryFilename(setup.OutputFilename, fold_notes.folder);

    Boolean skip = false;
    if (setup.skipIfExisting)
    {
        if (File.Exists(p_m) && File.Exists(p_d) && File.Exists(w_t))
        {
            logger.log("WeightTable [" + p_d + "] found, skipping the operation");
            skip = true;
        }
    }

    if (!skip)
    {
        output.context.DeployDataSet(fold, logger);

        entityOperation.TextRendering(output.context, notes);

        corpusOperation.SpaceModelPopulation(output.context, notes);
        corpusOperation.SpaceModelCategories(output.context, notes);
        corpusOperation.FeatureSelection(output.context, notes, requirements.MayUseSelectedFeatures);

        output.context.SelectedFeatures.Save(fold_notes.folder, notes, setup.OutputFilename + "_fs");

        corpusOperation.weightModel.PrepareTheModel(output.context.spaceModel, logger);

        var wt_s = corpusOperation.weightModel.GetElementFactors(output.context.SelectedFeatures.GetKeys(), output.context.spaceModel);
        wt_s.Save(fold_notes.folder, notes, setup.OutputFilename);

        corpusOperation.weightModel.Save(setup.OutputFilename, fold_notes.folder, notes);

        OperationContextReport reportOperation = new OperationContextReport();
        reportOperation.DeploySettingsBase(notes);
        reportOperation.GenerateReports(output.context, setup.reportOptions, notes);
    }

    Close();

    return output;
}
/// <summary>
/// Prepares the specified context.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">Thrown when the score dictionary cannot be found, or when <c>context.spaceModel</c> is not declared.</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    scoreDictionary = FeatureVectorDictionaryWithDimensions.LoadFile(context.folder, dictionaryFile, log);

    if (scoreDictionary == null)
    {
        String msg = "Error: Failed to find score dictionary [" + dictionaryFile + "] in " + context.folder.path;
        throw new ArgumentException(msg, nameof(context));
    }

    if (useMachineLearning)
    {
        #region --------------- PREPARING TERM WEIGHT MODEL

        String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
        String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

        if (TermWeightModel == null)
        {
            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
        }

        TermWeightModel.Deploy(log);

        if (context.spaceModel == null)
        {
            String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
            throw new ArgumentException(msg, nameof(context));
        }

        if (File.Exists(p_d) && useStoredData)
        {
            WeightingModelDataSet data = objectSerialization.loadObjectFromXML<WeightingModelDataSet>(p_d, log);
            TermWeightModel.LoadModelDataSet(data, log);

            if (useSelectedFeatures)
            {
                SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
            }
        }
        else
        {
            TermWeightModel.PrepareTheModel(context.spaceModel, log);
        }

        // fall back to the context's selected features when nothing was loaded
        if (SelectedTerms == null || SelectedTerms.Count == 0)
        {
            SelectedTerms = context.selectedFeatures;
        }

        List<String> sel_tkns = new List<String>();
        sel_tkns.AddRange(SelectedTerms.index.Values.Select(x => x.name));

        if (!sel_tkns.Any())
        {
            sel_tkns.AddRange(context.spaceModel.terms_known_label.GetTokens());
        }

        #endregion

        fvConstructor.Deploy(featureMethod.constructor, sel_tkns);

        classifier = featureMethod.classifierSettings.GetClassifier();

        sc_id = scoreDictionary.GetVectorsWithLabelID(null, criterion).ToNameVsLabelID();

        // build the training set from entries that have an assigned label ID
        List<FeatureVectorWithLabelID> trainingSet = new List<FeatureVectorWithLabelID>();
        foreach (var item in context.items)
        {
            if (sc_id.ContainsKey(item.AssignedID))
            {
                WeightDictionary dc_vec = TermWeightModel.GetWeights(sel_tkns, item.spaceDocument, context.spaceModel);
                var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, item.AssignedID);
                FeatureVectorWithLabelID id_vec = new FeatureVectorWithLabelID(n_vec, sc_id[item.AssignedID]);
                trainingSet.Add(id_vec);
            }
        }

        log.log("Training [" + classifier.name + "] with [" + sc_id.Count + "] feature vectors.");

        classifier.DoTraining(trainingSet, log);
    }
}
public override double ComputeDimension(WeightDictionary vector, Int32 d = 0)
{
    return similarityFunction.ComputeSimilarity(vector, classVector.terms);
}
public override ExperimentDataSetFoldContextPair<OperationContext> Execute(ILogBuilder logger, OperationContext executionContextMain = null, ExperimentModelExecutionContext executionContextExtra = null)
{
    ExperimentDataSetFoldContextPair<OperationContext> output = new ExperimentDataSetFoldContextPair<OperationContext>(fold, executionContextMain);

    Open();

    Boolean skip = false;

    String p_m = WeightDictionary.GetDictionaryFilename(setup.OutputFilename, fold_notes.folder);

    if (setup.skipIfExisting)
    {
        if (File.Exists(p_m))
        {
            logger.log("WeightTable [" + p_m + "] found, skipping the operation");
            skip = true;
        }
    }

    if (!skip)
    {
        notes.log("Rendering primary view");

        // ------------------- PRIMARY CONTEXT
        output.context.DeployDataSet(fold, logger);

        primaryEntityOperation.TextRendering(output.context, notes);

        corpusOperation.SpaceModelPopulation(output.context, notes);
        corpusOperation.SpaceModelCategories(output.context, notes);
        corpusOperation.FeatureSelection(output.context, notes, requirements.MayUseSelectedFeatures);

        OperationContext primaryContext = output.context;

        // ------------------- SECONDARY CONTEXT
        output.context = new OperationContext();

        notes.log("Rendering secondary view");

        output.context.DeployDataSet(fold, logger);

        secondaryEntityOperation.TextRendering(output.context, notes);

        corpusOperation.SpaceModelPopulation(output.context, notes);
        corpusOperation.SpaceModelCategories(output.context, notes);
        corpusOperation.FeatureSelection(output.context, notes, requirements.MayUseSelectedFeatures);

        OperationContext secondaryContext = output.context;

        // pair primary and secondary renders of the same documents
        ProjectionDictionary projectionPairs = DocumentRankingTools.ConstructPairDictionary(primaryContext.spaceModel.documents, secondaryContext.spaceModel.documents);

        DocumentSelectResult drmContext = output.context.PrepareContext(rankingOperation, fold_notes.folder, logger);
        drmContext = rankingOperation.ExecuteEvaluation(drmContext, logger);
        drmContext.description = "Document score assigned to the primary text render " + name;
        drmContext.saveObjectToXML(fold_notes.folder.pathFor("DS_" + name + "_projection_score.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Projection within [" + name + "] operation"));

        TokenFrequencyAndScoreDictionary tokenFrequencyAndScoreDictionary = ProjectionTools.ProjectPrimaryTermsToScores(projectionPairs, drmContext, logger);

        WeightDictionary wt = tokenFrequencyAndScoreDictionary.ConstructWeightDictionary();
        wt.name = setup.OutputFilename;
        wt.description = "Projected PrimaryView to ScoreTable - WeightTable, constructed from [" + projectionPairs.Count + "] render pairs. Document ranking: " + drmContext.description;
        wt.Save(fold_notes.folder, logger, setup.OutputFilename);
    }

    Close();

    return output;
}
/// <summary>
/// Transforms the context into a feature vector dictionary, where each page is scored by its similarity to the other pages of its group.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="TermWeightModel">The term weight model.</param>
/// <param name="function">The similarity function.</param>
/// <param name="groupmode">How pages are grouped: by category, by site, or over the whole dataset.</param>
/// <param name="log">The log.</param>
/// <returns>Feature vector set dictionary, grouped by group key.</returns>
public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
{
    List<string> selectedTerms = context.selectedFeatures.GetKeys();

    Dictionary<String, WeightDictionary> documentDictionaries = new Dictionary<string, WeightDictionary>();

    foreach (var entry in context.items)
    {
        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        documentDictionaries.Add(entry.AssignedID, documentWeights);
    }

    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

    Double total = context.Count;
    Int32 i = 0;
    Int32 p = Math.Max(1, context.Count / 10); // progress step; guarded against zero for small sets

    Dictionary<string, List<DocumentSelectResultEntry>> relative_groups = null;

    if (groupmode == ScoreComputationModeEnum.category)
    {
        Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);
        relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);
        if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
        {
            assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
        }
        log.log("... Page Similarity ... Groups by category");
    }
    else if (groupmode == ScoreComputationModeEnum.site)
    {
        relative_groups = context.GetByDomain(log);
        log.log("... Page Similarity ... Groups by site");
    }
    else if (groupmode == ScoreComputationModeEnum.dataset)
    {
        relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
        relative_groups.Add("dataset", context.items);
        log.log("... Page Similarity ... dataset");
    }

    // cache for pairwise similarities; keys are concatenated IDs in either order
    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();

    foreach (var domainPair in relative_groups)
    {
        List<DocumentSelectResultEntry> relatives = domainPair.Value;

        foreach (var entry in relatives)
        {
            i++;

            FeatureVector fv = new FeatureVector(entry.AssignedID);
            fv.dimensions = new double[relatives.Count - 1];

            Int32 hostInd = relatives.IndexOf(entry);

            Parallel.ForEach(relatives, (pair) =>
            {
                Int32 ind = relatives.IndexOf(pair);
                if (ind >= hostInd)
                {
                    ind = ind - 1; // skip the slot of the host entry itself
                }

                if (pair.AssignedID != entry.AssignedID)
                {
                    Double docToClassSimilarity = 0;

                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
                    }
                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                    {
                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
                    }
                    else
                    {
                        var vecA = documentDictionaries[pair.AssignedID];
                        var vecB = documentDictionaries[entry.AssignedID];
                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);

                        if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                        {
                            computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
                        }
                        else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                        {
                            computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
                        }
                    }

                    fv.dimensions[ind] = docToClassSimilarity;
                }
            });

            if (i % p == 0)
            {
                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
            }

            dict.GetOrAdd(domainPair.Key).Add(fv, -1);
        }
    }

    log.log("... Preparation finished ...");

    return dict;
}