/// <summary>
/// Gets the vectors with the label identifier set as dimension value.
/// </summary>
/// <remarks>
/// Label IDs are positions within <c>labels</c>: by default UNKNOWN = 0, INCORRECT = 1, CORRECT = 2.
/// A vector present in this dictionary is labeled CORRECT when its first dimension reaches
/// <c>criteria</c>, INCORRECT otherwise; an identifier not present in the dictionary is emitted
/// as an empty vector with the UNKNOWN label — but only if the UNKNOWN label exists in <c>labels</c>.
/// </remarks>
/// <param name="CompleteDataSet">The complete data set (identifiers); keys of this dictionary are appended to it.</param>
/// <param name="criteria">Threshold applied to the first dimension to decide correct vs. incorrect.</param>
/// <param name="labels">Label name list; when null a default list is constructed via <see cref="SpaceLabel.SetDefaultLabelList"/>.</param>
/// <returns>Set of feature vectors with assigned label IDs.</returns>
public FeatureVectorWithLabelIDSet GetVectorsWithLabelID(List <String> CompleteDataSet, Double criteria = 0.5, List <String> labels = null)
{
    if (CompleteDataSet == null)
    {
        CompleteDataSet = new List <string>();
    }

    // Default label list depends on whether external identifiers were supplied
    labels = SpaceLabel.SetDefaultLabelList(CompleteDataSet.Any(), labels);

    CompleteDataSet.AddRange(this.Select(x => x.Key));

    Int32 l_unknown = labels.IndexOf(SpaceLabel.UNKNOWN);
    Int32 l_correct = labels.IndexOf(SpaceLabel.CORRECT);
    Int32 l_incorrect = labels.IndexOf(SpaceLabel.INCORRECT);

    var output = new FeatureVectorWithLabelIDSet();
    output.DoAutoSetUnknownLabels = false;

    foreach (String id in CompleteDataSet)
    {
        FeatureVectorWithLabelID fv_id = null;

        if (ContainsKey(id))
        {
            // Known vector: threshold on the first dimension decides the label
            Int32 l = (this[id].dimensions[0] < criteria) ? l_incorrect : l_correct;
            fv_id = new FeatureVectorWithLabelID(this[id], l);
        }
        else if (l_unknown > -1)
        {
            // Unknown identifier: emit an empty vector under the UNKNOWN label
            fv_id = new FeatureVectorWithLabelID(new FeatureVector(id), l_unknown);
        }

        // BUG FIX: previously a null entry was added to the output when the vector
        // was not found and no UNKNOWN label index existed; skip such entries instead.
        if (fv_id != null)
        {
            output.Add(fv_id);
        }
    }
    return(output);
}
/// <summary>
/// Gets the element factor for a term, optionally restricted to a single label.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space (not consulted directly; the index holds the computed values).</param>
/// <param name="label">The label; when null the aggregate index value for the term is returned.</param>
/// <returns>The factor, or 0 when the term is not indexed.</returns>
public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
{
    // No label given: fall back to the precomputed aggregate value in the index.
    if (label == null)
    {
        if (index.ContainsKey(term))
        {
            return(index[term]);
        }
        return(0);
    }

    // Label given: gather per-label scores, then compress them into a single value.
    List <String> selectedLabelNames = new List <string>();
    selectedLabelNames.Add(label.name);

    List <Double> perLabelScores = new List <double>();
    foreach (String labelName in selectedLabelNames)
    {
        perLabelScores.Add(GetElementFactor(term, labelName));
    }

    return(operationExtensions.CompressNumericVector(perLabelScores.ToArray(), defaultOperation));
}
/// <summary>
/// Designates the space label for the given vector: per-document assignment takes
/// precedence over per-domain assignment; otherwise the model's UNKNOWN label is returned.
/// </summary>
/// <param name="context">The operation context holding the label registries.</param>
/// <param name="vector">The vector whose name is used as lookup key.</param>
/// <returns>The designated label, never null (falls back to <c>label_unknown</c>).</returns>
public SpaceLabel designateSpaceLabel(OperationContext context, IVector vector)
{
    SpaceLabel lab;

    // IMPROVEMENT: TryGetValue avoids the ContainsKey + indexer double lookup of the original.
    if (context.spaceLabelByDocAssignedID.TryGetValue(vector.name, out lab))
    {
        return(lab);
    }
    if (context.spaceLabelsDomains.TryGetValue(vector.name, out lab))
    {
        return(lab);
    }

    return(context.spaceModel.label_unknown);
}
/// <summary>
/// Gets the product of global element factors
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Product of every global function's factor multiplied by its weight; 1 when no factors are registered.</returns>
public double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
{
    Double product = 1;

    foreach (FeatureWeightFactor weightFactor in GlobalFactors)
    {
        // Each contribution is the global function's factor scaled by the factor weight.
        Double contribution = weightFactor.GlobalFunction.GetElementFactor(term, space, label) * weightFactor.weight;
        product *= contribution;
    }

    return(product);
}
/// <summary>
/// Deploys one dataset category: registers its label with the space model and fills
/// the domain/document lookup registries, dropping documents with duplicate AssignedIDs.
/// </summary>
/// <param name="set">The web-site document set representing one category; its name is the label name.</param>
private void DeployCategory(WebSiteDocumentsSet set)
{
    String labelName = set.name;
    // An unnamed set is treated as the UNKNOWN category
    if (labelName.isNullOrEmpty())
    {
        labelName = SpaceLabel.UNKNOWN;
    }
    SpaceLabel lab = new SpaceLabel(labelName);
    // UNKNOWN is kept apart from the regular label list on the space model
    if (labelName == SpaceLabel.UNKNOWN)
    {
        spaceModel.label_unknown = lab;
    }
    else
    {
        spaceModel.labels.Add(lab);
    }
    // NOTE(review): Dictionary.Add throws on duplicate keys — assumes each label name
    // and each domain appears only once across all deployed categories; confirm with callers.
    spaceLabels.Add(lab.name, lab);
    dataset.Add(labelName, set);
    foreach (WebSiteDocuments site in set)
    {
        spaceLabelsDomains.Add(site.domain, lab);
        webSiteByDomain.Add(site.domain, site);
        // Documents whose AssignedID was already registered are removed from the site
        // (deferred removal — the collection cannot be modified while enumerating it).
        List <WebSiteDocument> toRemove = new List <WebSiteDocument>();
        foreach (WebSiteDocument doc in site.documents)
        {
            if (webDocumentByAssignedID.ContainsKey(doc.AssignedID))
            {
                toRemove.Add(doc);
            }
            else
            {
                webDocumentByAssignedID.Add(doc.AssignedID, doc);
                spaceLabelByDocAssignedID.Add(doc.AssignedID, lab);
            }
        }
        toRemove.ForEach(x => site.documents.Remove(x));
    }
}
/// <summary>
/// Deploys custom truth table
/// </summary>
/// <param name="vectors">The vectors.</param>
/// <param name="logger">The logger.</param>
/// <param name="labels">Optional label names; defaults are built via <see cref="SpaceLabel.SetDefaultLabelList"/>.</param>
public void Deploy(IEnumerable <FeatureVectorWithLabelID> vectors, ILogBuilder logger, List <String> labels = null)
{
    // Label lists: one including UNKNOWN, one without it.
    label_index = SpaceLabel.SetDefaultLabelList(true, labels);
    labels_without_unknown = SpaceLabel.SetDefaultLabelList(false, labels);

    // Map each label's position to its name.
    index_to_label = new Dictionary <int, string>();
    Int32 position = 0;
    foreach (String labelName in label_index)
    {
        index_to_label.Add(position, labelName);
        position++;
    }

    // Record the label name for every vector (keyed by vector name).
    foreach (FeatureVectorWithLabelID labeledVector in vectors)
    {
        siteToLabel.Add(labeledVector.name, index_to_label[labeledVector.labelID]);
    }
}
/// <summary>
/// Gets the element factor from the index; a disabled function acts as a neutral (1) multiplier.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space (not consulted; the index holds the computed values).</param>
/// <param name="label">The label (unused by this variant).</param>
/// <returns>1 when disabled, 0 when the term is not indexed, otherwise the indexed score.</returns>
public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
{
    if (IsEnabled == false)
    {
        return(1);
    }

    if (index.ContainsKey(term) == false)
    {
        return(0);
    }

    Double indexedScore = index[term];

    // Track each distinct score value with an example term that produced it.
    if (DistinctReturns.ContainsKey(indexedScore) == false)
    {
        DistinctReturns.Add(indexedScore, term);
    }

    return(indexedScore);
}
/// <summary>
/// Gets the element factor: the indexed value when present, otherwise a freshly computed score.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space (not consulted; the index holds the computed values).</param>
/// <param name="label">The label (unused by this variant).</param>
/// <returns>The indexed or freshly computed score for the term.</returns>
public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
{
    // Prefer the cached index value; compute on demand when the term is not indexed.
    Double result = index.ContainsKey(term) ? index[term] : GetScore(term);

    // Track each distinct score value with an example term that produced it.
    if (!DistinctReturns.ContainsKey(result))
    {
        DistinctReturns.Add(result, term);
    }

    return(result);
}
/// <summary>
/// Designates the space label for the given vector. Precedence: per-document assignment,
/// then per-domain assignment, then the label-to-document link registry; falls back to
/// the model's UNKNOWN label when nothing matches.
/// </summary>
/// <param name="context">The operation context holding the label registries.</param>
/// <param name="vector">The vector whose name is used as lookup key.</param>
/// <returns>The designated label, never null (falls back to <c>label_unknown</c>).</returns>
public SpaceLabel designateSpaceLabel(OperationContext context, IVector vector)
{
    SpaceLabel lab;

    // IMPROVEMENT: TryGetValue replaces ContainsKey + indexer double lookups, and the
    // LabelToDocumentLinks query is only performed when neither registry matched
    // (the original always queried it, then discarded the result on a registry hit).
    if (!context.spaceLabelByDocAssignedID.TryGetValue(vector.name, out lab))
    {
        if (!context.spaceLabelsDomains.TryGetValue(vector.name, out lab))
        {
            lab = context.spaceModel.LabelToDocumentLinks.GetAllLinkedA(vector.name).FirstOrDefault();
        }
    }

    return(lab ?? context.spaceModel.label_unknown);
}
/// <summary>
/// Populates the space model: builds a document model for every text document and
/// links it to its labels, merging term dictionaries by known/unknown label.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
{
    log.log("Space model population");
    context.stemmContext = new StemmingContext(stemmer);
    // modelling the documents
    foreach (var pair in context.textDocuments)
    {
        var doc = pair.Value;
        // Build the document model from raw content using the shared stemming context and tokenizer
        SpaceDocumentModel model = spaceConstructor.ConstructDocument(doc.content, doc.name, context.spaceModel, context.stemmContext, tokenizer);
        foreach (String label in doc.labels)
        {
            SpaceLabel sLabel = null;
            // NOTE(review): indexer throws KeyNotFoundException if the label was never
            // deployed into context.spaceLabels — confirm all document labels are registered.
            sLabel = context.spaceLabels[label];
            context.spaceModel.LabelToDocumentLinks.Add(sLabel, model, 1);
        }
        context.spaceModel.documents.Add(model);
        // Terms from UNKNOWN-labeled documents are accumulated separately from labeled terms
        if (doc.labels.Contains(SpaceLabel.UNKNOWN))
        {
            context.spaceModel.terms_unknown_label.MergeDictionary(model.terms);
        }
        else
        {
            context.spaceModel.terms.MergeDictionary(model.terms);
        }
    }
    log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
}
/// <summary>
/// Populates the space model from rendered web sites: builds a three-level document
/// hierarchy (site → page → layer), links each site model to its label, then flattens it.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
{
    log.log("Space model population");
    context.stemmContext = new StemmingContext(stemmer);
    context.tokenizer = tokenizer;
    context.entityMetrics = new Dictionary <String, ContentMetrics>();
    // One site model per rendered domain
    foreach (KeyValuePair <String, TextDocumentSet> pair in context.renderSiteByDomain)
    {
        SpaceLabel spaceLabel = context.spaceLabelsDomains[pair.Key];
        SpaceDocumentModel modelOfSite = new SpaceDocumentModel();
        modelOfSite.name = pair.Key;
        modelOfSite.labels.Add(spaceLabel.name);
        // One page model per text layer collection of the site
        foreach (TextDocumentLayerCollection textLayer in pair.Value)
        {
            SpaceDocumentModel modelOfPage = new SpaceDocumentModel(textLayer.name);
            ContentMetrics metrics = null;
            // Metrics object is only allocated (and later registered) when collection is enabled
            if (DoKeepContentMetrics)
            {
                metrics = new ContentMetrics(textLayer.name);
            }
            // One layer model per render layer within the page
            foreach (var renderLayer in textLayer)
            {
                SpaceDocumentModel modelOfLayer = new SpaceDocumentModel(modelOfPage.name + renderLayer.name);
                // UNKNOWN-labeled content is flagged to the constructor (last Boolean argument)
                modelOfLayer = spaceConstructor.ConstructDocument(renderLayer.content, modelOfPage.name + renderLayer.name, context.spaceModel, context.stemmContext, tokenizer, spaceLabel.name != SpaceLabel.UNKNOWN, metrics);
                modelOfLayer.weight = renderLayer.layerWeight;
                modelOfLayer.documentScope = DocumentBlenderFunctionOptions.layerLevel;
                modelOfPage.Children.Add(modelOfLayer);
            }
            modelOfPage.documentScope = DocumentBlenderFunctionOptions.pageLevel;
            if (DoKeepContentMetrics)
            {
                context.entityMetrics.Add(metrics.Name, metrics);
            }
            // modelOfPage.Flatten(false);
            modelOfSite.Children.Add(modelOfPage);
        }
        modelOfSite.documentScope = DocumentBlenderFunctionOptions.siteLevel;
        context.spaceModel.documents.Add(modelOfSite);
        // Link the site model to each of its labels in the space model
        foreach (String label in modelOfSite.labels)
        {
            SpaceLabel sLabel = null;
            sLabel = context.spaceLabels[label];
            context.spaceModel.LabelToDocumentLinks.Add(sLabel, modelOfSite, 1);
        }
        modelOfSite.Flatten(false);
        /*
        if (modelOfSite.labels.Contains(SpaceLabel.UNKNOWN))
        {
            context.spaceModel.terms_unknown_label.MergeDictionary(modelOfSite.terms);
        }
        else
        {
            context.spaceModel.terms_known_label.MergeDictionary(modelOfSite.terms);
        }*/
        modelOfSite.PropagateLabels();
        // modelOfSite.SetLabel(spaceLabel, context.spaceModel);
        //context.spaceModel.LabelToDocumentLinks.Add(spaceLabel, modelOfSite, 1.0);
    }
    log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
}
/// <summary>
/// Gets the weights.
/// </summary>
/// <param name="termWhiteList">The term white list.</param>
/// <param name="document">The document.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary combining the local function entry with all weighted global factor entries.</returns>
public WeightDictionary GetWeights(List <String> termWhiteList, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionary output = new WeightDictionary();
    output.name = GetSignature() + "_" + document.name;
    output.description = "Feature weight table constructed by [" + GetSignature() + "] for features [" + termWhiteList.Count + "] in document [" + document.name + "]";
    output.nDimensions = nDimensions;
    if (KERNELOPTION_USE_WHITELISTTERMS)
    {
        // White-list mode is not implemented: reaching a whitelisted term that the
        // document contains is a hard failure by design.
        foreach (String term in termWhiteList)
        {
            if (document.terms.Contains(term))
            {
                throw new NotImplementedException();
                //output.entries.Add(entry);
            }
        }
    }
    else
    {
        List <String> terms = document.terms.GetTokens();
        for (int i = 0; i < document.terms.Count; i++)
        {
            String term = terms[i];
            // Start from a zero-weight entry; the local function (when enabled) replaces it,
            // then every global factor entry multiplies into it.
            WeightDictionaryEntry entry = new WeightDictionaryEntry(term, 0);
            if (DoUseLocalFunction)
            {
                entry = LocalFunction.GetElementFactorEntry(term, document);
            }
            foreach (FeatureWeightFactor gf in GlobalFactors)
            {
                entry = entry * (gf.GlobalFunction.GetElementFactorEntry(term, space, label) * gf.weight);
            }
            // Document-level weight is applied only when it differs from the neutral value
            if (document.weight != 1)
            {
                entry = entry * document.weight;
            }
            output.Merge(entry);
            //output.AddEntry(term, entry.dimensions, false);
        }
    }
    return(output);
}
/// <summary>
/// Builds a weight dictionary entry for the term: a scalar factor for numeric result type,
/// or a per-label vector for the multi-class result type.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label (used only for the numeric result type).</param>
/// <returns>The constructed entry; zero-weight when the result type matches neither case.</returns>
public WeightDictionaryEntry GetElementFactorEntry(string term, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionaryEntry entry = new WeightDictionaryEntry(term, 0);

    if (resultType == FunctionResultTypeEnum.numeric)
    {
        // Single scalar factor for the given (optional) label.
        entry = new WeightDictionaryEntry(term, GetElementFactor(term, space, label));
    }
    else if (resultType == FunctionResultTypeEnum.numericVectorForMultiClass)
    {
        // One factor per label in the space, in label-list order.
        Double[] perLabelScores = new double[space.labels.Count];
        Int32 position = 0;
        foreach (SpaceLabel classLabel in space.labels)
        {
            perLabelScores[position] = GetElementFactor(term, space, classLabel);
            position++;
        }
        entry = new WeightDictionaryEntry(term, perLabelScores);
    }

    // Track each distinct weight value with an example term that produced it.
    if (!DistinctReturns.ContainsKey(entry.weight))
    {
        DistinctReturns.Add(entry.weight, term);
    }

    return(entry);
}
/// <summary> /// Executes the plane method, invoking contained functions according to the settings /// </summary>
/// <param name="inputContext">The input context - related to this plane.</param>
/// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
/// <param name="logger">The logger.</param>
/// <returns>A <see cref="CorpusPlaneContext"/> with rendered, filtered and blended corpus documents.</returns>
public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
{
    if (notes != null)
    {
        notes.logStartPhase("[1] Entity Plane - execution", "");
    }
    IEntityPlaneContext context = inputContext as IEntityPlaneContext;
    CorpusPlaneContext outputContext = new CorpusPlaneContext();
    outputContext.provider.StoreAndReceive(context);
    outputContext.dataset = context.dataset;
    // ---------------- rendering procedure
    Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> > renderIndex = new Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> >();
    Dictionary <string, SpaceLabel> labels = new Dictionary <string, SpaceLabel>();
    Dictionary <WebSiteDocuments, TextDocumentSet> sitesToRenders = new Dictionary <WebSiteDocuments, TextDocumentSet>();
    Dictionary <String, WebSiteDocuments> inputSites = new Dictionary <string, WebSiteDocuments>();
    Dictionary <String, TextDocumentSet> inputTextRenders = new Dictionary <string, TextDocumentSet>();
    Dictionary <WebSiteDocuments, List <SpaceLabel> > inputSiteVsLabels = new Dictionary <WebSiteDocuments, List <SpaceLabel> >();
    Int32 c = 0;
    // rendering
    foreach (WebSiteDocumentsSet docSet in context.dataset)
    {
        // Unnamed / UNKNOWN-named sets define the space's UNKNOWN label; others are regular labels
        if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
        {
            outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
            labels.Add(SpaceLabel.UNKNOWN, outputContext.space.label_unknown);
        }
        else
        {
            SpaceLabel lab = new SpaceLabel(docSet.name);
            labels.Add(lab.name, lab);
            outputContext.space.labels.Add(lab);
        }
        String datasetSignature = context.dataset.GetDataSetSignature();
        // ---- render
        List <TextDocumentSet> textSetForLabel = new List <TextDocumentSet>();
        if (CacheProvider.IsReady)
        {
            // Cache-aware path: render only sites missing from the cache
            foreach (WebSiteDocuments site in docSet)
            {
                TextDocumentSet tds = CacheProvider.GetCached <TextDocumentSet>(setupSignature, datasetSignature, site.domain);
                if (tds == null)
                {
                    tds = render.RenderSiteDocuments(site, logger);
                    CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                }
                else
                {
                    tds.name = site.domain;
                }
                textSetForLabel.Add(tds);
            }
        }
        else
        {
            // No cache: render the whole set, then store results for future runs
            textSetForLabel = render.RenderDocumentSet(docSet, logger);
            foreach (TextDocumentSet ws in textSetForLabel)
            {
                CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
            }
        }
        // // <--- performs the rendering
        textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));
        // --- rest of indexing
        docSet.ForEach(x => inputSites.Add(x.domain, x));
        renderIndex.Add(docSet, textSetForLabel);
        foreach (WebSiteDocuments site in docSet)
        {
            inputSiteVsLabels.Add(site, new List <SpaceLabel>());
            inputSiteVsLabels[site].Add(labels[docSet.name]);
            c++;
        }
    }
    if (notes != null)
    {
        notes.log("Text document for [" + c + "] entities created");
    }
    // tmp index
    foreach (String key in inputSites.Keys)
    {
        sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
    }
    // page in site filtering
    if (filter.IsEnabled)
    {
        Dictionary <WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary <WebSiteDocuments, TextDocumentSet>();
        filter.Learn(inputTextRenders.Values);
        foreach (KeyValuePair <WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
        {
            renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
        }
        sitesToRenders = renderIndexFiltered;
    }
    Dictionary <String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary <string, TextDocumentSet>();
    foreach (var pair in sitesToRenders)
    {
        TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
    }
    // blending pages into single page per web site
    // DoBlendPagesIntoSingleEntity = blender.options.HasFlag(DocumentBlenderFunctionOptions.separatePages);
    Boolean keepSeparated = blender.DoKeepPagesSeparated;
    foreach (var pair in renderIndex)
    {
        foreach (TextDocumentSet entitySet in pair.Value)
        {
            TextDocumentSet selectedTexts = TextDocumentsByDomainName[entitySet.name];
            WebSiteDocuments web = inputSites[entitySet.name];
            IEnumerable <string> label = inputSiteVsLabels[web].Select(x => x.name);
            // NOTE(review): branch semantics look inverted relative to the variable name —
            // when keepSeparated is true a single blended document is produced, and the
            // separate-documents call runs when it is false; confirm intent against blender API.
            if (keepSeparated)
            {
                // filter function
                TextDocument doc = blender.blendToTextDocument(selectedTexts);
                doc.labels.AddRange(label);
                outputContext.corpus_documents.Add(doc);
            }
            else
            {
                var docs = blender.blendToSeparateTextDocuments(selectedTexts); //blender.blendToTextDocument(selectedTexts);
                foreach (TextDocument doc in docs)
                {
                    doc.labels.AddRange(label);
                    outputContext.corpus_documents.Add(doc);
                }
            }
        }
    }
    if (notes != null)
    {
        notes.logEndPhase();
    }
    return(outputContext);
}
/// <summary>
/// Gets the element (term) weighting factor within the given space, optionally for a specific label.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label; when null, implementations return a label-independent factor.</param>
/// <returns>The factor value for the term.</returns>
public abstract Double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null);
/// <summary>
/// Builds dictionary of global element factors
/// </summary>
/// <param name="terms">The terms.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary with one entry per term; dimensionality follows the result type.</returns>
public WeightDictionary GetElementFactors(IEnumerable <String> terms, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionary output = new WeightDictionary();

    // Dimensionality: 1 for scalar results, one per label for multi-class vectors.
    if (resultType == FunctionResultTypeEnum.numeric)
    {
        output.nDimensions = 1;
    }
    else if (resultType == FunctionResultTypeEnum.numericVectorForMultiClass)
    {
        output.nDimensions = space.labels.Count;
    }

    foreach (String term in terms)
    {
        output.AddEntry(GetElementFactorEntry(term, space, label));
    }

    return(output);
}
/// <summary>
/// Constructs global weight fictionary using global elements
/// </summary>
/// <param name="terms">The terms.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Weight dictionary holding the global factor for every supplied term.</returns>
public WeightDictionary GetElementFactors(IEnumerable <string> terms, SpaceModel space, SpaceLabel label = null)
{
    WeightDictionary globalWeights = new WeightDictionary();
    globalWeights.name = GetSignature() + "_globalOnly";

    foreach (String term in terms)
    {
        // One entry per term, weighted by the global element factor.
        WeightDictionaryEntry weightedTerm = new WeightDictionaryEntry(term, GetElementFactor(term, space, label));
        globalWeights.AddEntry(weightedTerm, true);
    }

    globalWeights.description = "Global weights for [" + globalWeights.Count + "] terms.";
    return(globalWeights);
}
/// <summary>
/// Wraps the scalar global element factor for the term into a weight dictionary entry.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="space">The space.</param>
/// <param name="label">The label.</param>
/// <returns>Entry pairing the term with its element factor.</returns>
public WeightDictionaryEntry GetElementFactorEntry(string term, SpaceModel space, SpaceLabel label = null)
{
    return(new WeightDictionaryEntry(term, GetElementFactor(term, space, label)));
}
/// <summary>
/// Gets the combined weight of a term: the local (document) factor multiplied by the global (space) factor.
/// </summary>
/// <param name="term">The term.</param>
/// <param name="document">The document supplying the local factor.</param>
/// <param name="space">The space supplying the global factor.</param>
/// <param name="label">The label.</param>
/// <returns>Product of the local and global element factors.</returns>
public Double GetWeight(String term, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
{
    Double localFactor = GetElementFactor(term, document);
    Double globalFactor = GetElementFactor(term, space, label);
    return(localFactor * globalFactor);
}