Beispiel #1
0
        /// <summary>
        /// Builds a labeled vector set covering every identifier in <paramref name="CompleteDataSet"/> plus every key of this dictionary.
        /// The label ID of a vector is the position of its label name inside the label list: identifiers found in this dictionary get
        /// INCORRECT when their first dimension is below <paramref name="criteria"/> and CORRECT otherwise; identifiers not found get UNKNOWN.
        /// </summary>
        /// <param name="CompleteDataSet">Identifiers that should be covered by the result; may be null. The supplied list is not modified.</param>
        /// <param name="criteria">Threshold applied to the first dimension of a stored vector; values below it are labeled INCORRECT.</param>
        /// <param name="labels">Optional label names; the list is passed through <c>SpaceLabel.SetDefaultLabelList</c> before use.</param>
        /// <returns>The set of vectors with label IDs; identifiers that cannot be labeled (not stored here and no UNKNOWN label available) are left out.</returns>
        public FeatureVectorWithLabelIDSet GetVectorsWithLabelID(List <String> CompleteDataSet, Double criteria = 0.5, List <String> labels = null)
        {
            // Work on a private copy: appending our own keys must not grow the caller's list.
            List <String> ids = (CompleteDataSet == null) ? new List <String>() : new List <String>(CompleteDataSet);

            // The default label list depends on whether the caller supplied any identifiers, so decide before own keys are appended.
            labels = SpaceLabel.SetDefaultLabelList(ids.Any(), labels);

            // NOTE(review): identifiers present both in the supplied list and in this dictionary are visited twice, as before.
            ids.AddRange(this.Select(x => x.Key));

            Int32 l_unknown   = labels.IndexOf(SpaceLabel.UNKNOWN);
            Int32 l_correct   = labels.IndexOf(SpaceLabel.CORRECT);
            Int32 l_incorrect = labels.IndexOf(SpaceLabel.INCORRECT);

            var output = new FeatureVectorWithLabelIDSet();

            output.DoAutoSetUnknownLabels = false;

            foreach (String id in ids)
            {
                if (ContainsKey(id))
                {
                    var   stored  = this[id];
                    Int32 labelID = (stored.dimensions[0] < criteria) ? l_incorrect : l_correct;

                    output.Add(new FeatureVectorWithLabelID(stored, labelID));
                }
                else if (l_unknown > -1)
                {
                    output.Add(new FeatureVectorWithLabelID(new FeatureVector(id), l_unknown));
                }
                // Otherwise there is no UNKNOWN label to assign: skip the identifier instead of adding a null entry.
            }

            return(output);
        }
Beispiel #2
0
        /// <summary>
        /// Gets the element factor for a term. Without a label the value stored in the term index is returned (0 when the term is not indexed);
        /// with a label, the label-specific factor is computed and compressed with the default operation.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space (not used by this implementation).</param>
        /// <param name="label">Optional label; when null the plain index lookup is used.</param>
        /// <returns>The factor for the term.</returns>
        public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
        {
            // No label: answer straight from the term index.
            if (label == null)
            {
                if (!index.ContainsKey(term))
                {
                    return(0);
                }

                return(index[term]);
            }

            // Label given: a single label-specific score is collected and compressed.
            List <Double> labelScores = new List <double>();

            var labelScore = GetElementFactor(term, label.name);
            labelScores.Add(labelScore);

            Double compressed = operationExtensions.CompressNumericVector(labelScores.ToArray(), defaultOperation);

            return(compressed);
        }
Beispiel #3
0
        /// <summary>
        /// Resolves the label of a vector by its name: the per-document assignment is checked first, then the per-domain assignment;
        /// when neither contains the name, the unknown label of the space model is returned.
        /// </summary>
        /// <param name="context">The operation context holding the label lookups.</param>
        /// <param name="vector">The vector whose name is looked up.</param>
        /// <returns>The designated label.</returns>
        public SpaceLabel designateSpaceLabel(OperationContext context, IVector vector)
        {
            SpaceLabel fallback = context.spaceModel.label_unknown;
            var        key      = vector.name;

            if (context.spaceLabelByDocAssignedID.ContainsKey(key))
            {
                return(context.spaceLabelByDocAssignedID[key]);
            }

            if (context.spaceLabelsDomains.ContainsKey(key))
            {
                return(context.spaceLabelsDomains[key]);
            }

            return(fallback);
        }
        /// <summary>
        /// Multiplies together the weighted element factors of all global factors for the given term.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space passed to every global function.</param>
        /// <param name="label">Optional label passed to every global function.</param>
        /// <returns>The product of (factor * weight) over all global factors; 1 when there are none.</returns>
        public double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
        {
            Double product = 1;

            foreach (FeatureWeightFactor factor in GlobalFactors)
            {
                var weighted = factor.GlobalFunction.GetElementFactor(term, space, label) * factor.weight;

                product = product * weighted;
            }

            return(product);
        }
Beispiel #5
0
        /// <summary>
        /// Registers one category: creates its label (falling back to the UNKNOWN name when the set has no name), indexes the set,
        /// its sites and their documents, and drops documents whose assigned ID is already registered.
        /// </summary>
        /// <param name="set">The set of web sites forming the category.</param>
        private void DeployCategory(WebSiteDocumentsSet set)
        {
            String labelName = set.name;

            if (labelName.isNullOrEmpty())
            {
                labelName = SpaceLabel.UNKNOWN;
            }

            SpaceLabel categoryLabel = new SpaceLabel(labelName);

            // The unknown label is kept apart from the regular label list.
            if (labelName != SpaceLabel.UNKNOWN)
            {
                spaceModel.labels.Add(categoryLabel);
            }
            else
            {
                spaceModel.label_unknown = categoryLabel;
            }

            spaceLabels.Add(categoryLabel.name, categoryLabel);
            dataset.Add(labelName, set);

            foreach (WebSiteDocuments site in set)
            {
                spaceLabelsDomains.Add(site.domain, categoryLabel);
                webSiteByDomain.Add(site.domain, site);

                // Documents with an already registered ID are collected first and removed after the enumeration.
                List <WebSiteDocument> duplicates = new List <WebSiteDocument>();

                foreach (WebSiteDocument doc in site.documents)
                {
                    if (!webDocumentByAssignedID.ContainsKey(doc.AssignedID))
                    {
                        webDocumentByAssignedID.Add(doc.AssignedID, doc);
                        spaceLabelByDocAssignedID.Add(doc.AssignedID, categoryLabel);
                        continue;
                    }

                    duplicates.Add(doc);
                }

                foreach (WebSiteDocument duplicate in duplicates)
                {
                    site.documents.Remove(duplicate);
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Deploys a custom truth table: prepares the label lists, maps every label position to its name
        /// and records the label name of each supplied vector under the vector's name.
        /// </summary>
        /// <param name="vectors">The vectors carrying label IDs.</param>
        /// <param name="logger">The logger (not used by this method).</param>
        /// <param name="labels">Optional label names, passed through <c>SpaceLabel.SetDefaultLabelList</c>.</param>
        public void Deploy(IEnumerable <FeatureVectorWithLabelID> vectors, ILogBuilder logger, List <String> labels = null)
        {
            label_index            = SpaceLabel.SetDefaultLabelList(true, labels);
            labels_without_unknown = SpaceLabel.SetDefaultLabelList(false, labels);

            // position -> label name
            index_to_label = new Dictionary <int, string>();

            int position = 0;
            while (position < label_index.Count)
            {
                index_to_label.Add(position, label_index[position]);
                position++;
            }

            foreach (FeatureVectorWithLabelID vector in vectors)
            {
                var labelName = index_to_label[vector.labelID];

                siteToLabel.Add(vector.name, labelName);
            }
        }
        /// <summary>
        /// Returns the indexed score of a term. A disabled function yields the neutral factor 1, an unknown term yields 0.
        /// Each distinct score is remembered together with the first term that produced it.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space (not used by this implementation).</param>
        /// <param name="label">The label (not used by this implementation).</param>
        /// <returns>The factor for the term.</returns>
        public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
        {
            if (IsEnabled)
            {
                if (index.ContainsKey(term))
                {
                    Double indexed = index[term];

                    if (!DistinctReturns.ContainsKey(indexed))
                    {
                        DistinctReturns.Add(indexed, term);
                    }

                    return(indexed);
                }

                return(0);
            }

            return(1);
        }
Beispiel #8
0
        /// <summary>
        /// Returns the score of a term: the indexed value when available, otherwise the result of <c>GetScore</c>.
        /// Each distinct score is remembered together with the first term that produced it.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space (not used by this implementation).</param>
        /// <param name="label">The label (not used by this implementation).</param>
        /// <returns>The score for the term.</returns>
        public override double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null)
        {
            Double result = 0;

            if (!index.ContainsKey(term))
            {
                result = GetScore(term);
            }
            else
            {
                result = index[term];
            }

            Boolean alreadySeen = DistinctReturns.ContainsKey(result);
            if (!alreadySeen)
            {
                DistinctReturns.Add(result, term);
            }

            return(result);
        }
        /// <summary>
        /// Resolves the label of a vector by its name. The first label linked to the name in the space model is the starting point;
        /// a per-document assignment, or failing that a per-domain assignment, overrides it. The unknown label is used when nothing was found.
        /// </summary>
        /// <param name="context">The operation context holding the space model and label lookups.</param>
        /// <param name="vector">The vector whose name is looked up.</param>
        /// <returns>The designated label.</returns>
        public SpaceLabel designateSpaceLabel(OperationContext context, IVector vector)
        {
            SpaceLabel resolved = context.spaceModel.LabelToDocumentLinks.GetAllLinkedA(vector.name).FirstOrDefault();

            var key = vector.name;

            if (context.spaceLabelByDocAssignedID.ContainsKey(key))
            {
                resolved = context.spaceLabelByDocAssignedID[key];
            }
            else if (context.spaceLabelsDomains.ContainsKey(key))
            {
                resolved = context.spaceLabelsDomains[key];
            }

            if (resolved == null)
            {
                return(context.spaceModel.label_unknown);
            }

            return(resolved);
        }
Beispiel #10
0
        /// <summary>
        /// Populates the space model: constructs a document model for every text document in the context, links it to its labels,
        /// adds it to the space model and merges its terms into the unknown-label or the regular term dictionary.
        /// </summary>
        /// <param name="context">The context providing the text documents, labels and the space model.</param>
        /// <param name="log">The log.</param>
        public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
        {
            log.log("Space model population");
            context.stemmContext = new StemmingContext(stemmer);

            foreach (var entry in context.textDocuments)
            {
                var textDocument = entry.Value;

                SpaceDocumentModel documentModel = spaceConstructor.ConstructDocument(textDocument.content, textDocument.name, context.spaceModel, context.stemmContext, tokenizer);

                // link the new model to every label the document carries
                foreach (String labelName in textDocument.labels)
                {
                    SpaceLabel linkedLabel = context.spaceLabels[labelName];

                    context.spaceModel.LabelToDocumentLinks.Add(linkedLabel, documentModel, 1);
                }

                context.spaceModel.documents.Add(documentModel);

                // terms of unlabeled documents are kept in a separate dictionary
                if (!textDocument.labels.Contains(SpaceLabel.UNKNOWN))
                {
                    context.spaceModel.terms.MergeDictionary(documentModel.terms);
                }
                else
                {
                    context.spaceModel.terms_unknown_label.MergeDictionary(documentModel.terms);
                }
            }

            log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
        }
        /// <summary>
        /// Populates the space model from rendered web sites. For every site a three-level document model is built
        /// (site, page, render layer), the site model is added to the space model, linked to its labels, flattened
        /// and finally its labels are propagated.
        /// </summary>
        /// <param name="context">The context providing rendered sites, label lookups and the space model; its stemming context, tokenizer and entity metrics are (re)initialized here.</param>
        /// <param name="log">The log.</param>
        public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
        {
            log.log("Space model population");
            context.stemmContext = new StemmingContext(stemmer);
            context.tokenizer    = tokenizer;

            context.entityMetrics = new Dictionary <String, ContentMetrics>();

            foreach (KeyValuePair <String, TextDocumentSet> pair in context.renderSiteByDomain)
            {
                SpaceLabel spaceLabel = context.spaceLabelsDomains[pair.Key];

                // Same for every layer of the site, so evaluated once.
                Boolean isKnownLabel = spaceLabel.name != SpaceLabel.UNKNOWN;

                SpaceDocumentModel modelOfSite = new SpaceDocumentModel();
                modelOfSite.name = pair.Key;
                modelOfSite.labels.Add(spaceLabel.name);

                foreach (TextDocumentLayerCollection textLayer in pair.Value)
                {
                    SpaceDocumentModel modelOfPage = new SpaceDocumentModel(textLayer.name);

                    // Metrics are only collected on request; null means "do not collect".
                    ContentMetrics metrics = null;
                    if (DoKeepContentMetrics)
                    {
                        metrics = new ContentMetrics(textLayer.name);
                    }

                    foreach (var renderLayer in textLayer)
                    {
                        String layerName = modelOfPage.name + renderLayer.name;

                        // The constructor call returns the finished layer model; no placeholder instance is needed.
                        SpaceDocumentModel modelOfLayer = spaceConstructor.ConstructDocument(renderLayer.content, layerName,
                                                                                             context.spaceModel, context.stemmContext, tokenizer,
                                                                                             isKnownLabel, metrics);

                        modelOfLayer.weight        = renderLayer.layerWeight;
                        modelOfLayer.documentScope = DocumentBlenderFunctionOptions.layerLevel;

                        modelOfPage.Children.Add(modelOfLayer);
                    }

                    modelOfPage.documentScope = DocumentBlenderFunctionOptions.pageLevel;

                    // Keyed on the metrics instance itself, so a setting change mid-run cannot dereference null.
                    if (metrics != null)
                    {
                        context.entityMetrics.Add(metrics.Name, metrics);
                    }

                    modelOfSite.Children.Add(modelOfPage);
                }

                modelOfSite.documentScope = DocumentBlenderFunctionOptions.siteLevel;

                context.spaceModel.documents.Add(modelOfSite);

                foreach (String label in modelOfSite.labels)
                {
                    SpaceLabel sLabel = context.spaceLabels[label];
                    context.spaceModel.LabelToDocumentLinks.Add(sLabel, modelOfSite, 1);
                }

                // Order matters here: flatten first, then propagate the labels.
                modelOfSite.Flatten(false);

                modelOfSite.PropagateLabels();
            }

            log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
        }
        /// <summary>
        /// Builds the weight table of a document: for every token of the document the local factor (when enabled)
        /// is multiplied by each weighted global factor and by the document weight, and the result is merged into the table.
        /// </summary>
        /// <param name="termWhiteList">The term white list; only consulted when the white-list kernel option is on. May be null.</param>
        /// <param name="document">The document whose terms are weighted.</param>
        /// <param name="space">The space passed to the global functions.</param>
        /// <param name="label">Optional label passed to the global functions.</param>
        /// <returns>The weight dictionary constructed for the document.</returns>
        /// <exception cref="NotImplementedException">The white-list kernel option is on and a white-listed term occurs in the document.</exception>
        public WeightDictionary GetWeights(List <String> termWhiteList, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
        {
            WeightDictionary output = new WeightDictionary();

            // A missing white list is reported as zero features instead of failing while the description is composed.
            Int32 whiteListCount = (termWhiteList == null) ? 0 : termWhiteList.Count;

            output.name        = GetSignature() + "_" + document.name;
            output.description = "Feature weight table constructed by [" + GetSignature() + "] for features [" + whiteListCount + "] in document [" + document.name + "]";
            output.nDimensions = nDimensions;

            if (KERNELOPTION_USE_WHITELISTTERMS)
            {
                if (termWhiteList != null)
                {
                    foreach (String term in termWhiteList)
                    {
                        if (document.terms.Contains(term))
                        {
                            // White-list based weighting has no implementation yet.
                            throw new NotImplementedException();
                        }
                    }
                }
            }
            else
            {
                List <String> terms = document.terms.GetTokens();

                // Iterate the token list itself, so the bound always matches the list being read.
                foreach (String term in terms)
                {
                    // NOTE(review): with the local function disabled the entry starts at 0 before the global factors are multiplied in - confirm this is intended.
                    WeightDictionaryEntry entry = new WeightDictionaryEntry(term, 0);

                    if (DoUseLocalFunction)
                    {
                        entry = LocalFunction.GetElementFactorEntry(term, document);
                    }

                    foreach (FeatureWeightFactor gf in GlobalFactors)
                    {
                        entry = entry * (gf.GlobalFunction.GetElementFactorEntry(term, space, label) * gf.weight);
                    }

                    if (document.weight != 1)
                    {
                        entry = entry * document.weight;
                    }

                    output.Merge(entry);
                }
            }

            return(output);
        }
Beispiel #13
0
        /// <summary>
        /// Builds the weight entry of a term according to the result type: a single factor for the numeric type,
        /// or one factor per label of the space for the multi-class vector type. The weight of each distinct entry
        /// is remembered together with the first term that produced it.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space; its labels define the vector for the multi-class type.</param>
        /// <param name="label">Optional label, used by the numeric type only.</param>
        /// <returns>The entry for the term; a zero entry when the result type is neither of the two handled types.</returns>
        public WeightDictionaryEntry GetElementFactorEntry(string term, SpaceModel space, SpaceLabel label = null)
        {
            WeightDictionaryEntry entry = new WeightDictionaryEntry(term, 0);

            var mode = resultType;

            if (mode == FunctionResultTypeEnum.numeric)
            {
                entry = new WeightDictionaryEntry(term, GetElementFactor(term, space, label));
            }
            else if (mode == FunctionResultTypeEnum.numericVectorForMultiClass)
            {
                // one slot per label, filled in label order
                Double[] perLabel = new double[space.labels.Count];
                Int32    slot     = 0;

                foreach (SpaceLabel spaceLabel in space.labels)
                {
                    perLabel[slot] = GetElementFactor(term, space, spaceLabel);
                    slot++;
                }

                entry = new WeightDictionaryEntry(term, perLabel);
            }

            Boolean alreadySeen = DistinctReturns.ContainsKey(entry.weight);
            if (!alreadySeen)
            {
                DistinctReturns.Add(entry.weight, term);
            }

            return(entry);
        }
Beispiel #14
0
        /// <summary>
        /// Executes the plane method, invoking contained functions according to the settings:
        /// renders every web site of the input dataset into text (using the cache when it is ready), optionally filters the pages,
        /// blends the rendered pages into text documents and attaches the label of the originating document set to each of them.
        /// </summary>
        /// <param name="inputContext">The input context - related to this plane; must be an <c>IEntityPlaneContext</c>.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /> (not used by this method).</param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// The corpus plane context holding the dataset, the labels and the labeled text documents.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="inputContext"/> is null or not an <c>IEntityPlaneContext</c>.</exception>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            if (notes != null)
            {
                notes.logStartPhase("[1] Entity Plane - execution", "");
            }

            IEntityPlaneContext context = inputContext as IEntityPlaneContext;
            if (context == null)
            {
                // Fail with a clear message instead of a null dereference further down.
                throw new ArgumentException("Input context must be a non-null IEntityPlaneContext", "inputContext");
            }

            CorpusPlaneContext outputContext = new CorpusPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            outputContext.dataset = context.dataset;

            // ---------------- rendering procedure
            Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> > renderIndex = new Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> >();
            Dictionary <string, SpaceLabel> labels = new Dictionary <string, SpaceLabel>();

            Dictionary <WebSiteDocuments, TextDocumentSet>    sitesToRenders    = new Dictionary <WebSiteDocuments, TextDocumentSet>();
            Dictionary <String, WebSiteDocuments>             inputSites        = new Dictionary <string, WebSiteDocuments>();
            Dictionary <String, TextDocumentSet>              inputTextRenders  = new Dictionary <string, TextDocumentSet>();
            Dictionary <WebSiteDocuments, List <SpaceLabel> > inputSiteVsLabels = new Dictionary <WebSiteDocuments, List <SpaceLabel> >();

            Int32 c = 0;

            // rendering
            foreach (WebSiteDocumentsSet docSet in context.dataset)
            {
                // Label created for this set. It is kept in a local because sets without a name are registered
                // under SpaceLabel.UNKNOWN, so a later lookup by docSet.name would not find them.
                SpaceLabel setLabel;

                if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
                {
                    outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
                    setLabel = outputContext.space.label_unknown;
                    labels.Add(SpaceLabel.UNKNOWN, setLabel);
                }
                else
                {
                    setLabel = new SpaceLabel(docSet.name);
                    labels.Add(setLabel.name, setLabel);
                    outputContext.space.labels.Add(setLabel);
                }

                String datasetSignature = context.dataset.GetDataSetSignature();

                // ---- render
                List <TextDocumentSet> textSetForLabel = new List <TextDocumentSet>();

                if (CacheProvider.IsReady)
                {
                    foreach (WebSiteDocuments site in docSet)
                    {
                        TextDocumentSet tds = CacheProvider.GetCached <TextDocumentSet>(setupSignature, datasetSignature, site.domain);

                        if (tds == null)
                        {
                            // cache miss: render now and store the result
                            tds = render.RenderSiteDocuments(site, logger);
                            CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                        }
                        else
                        {
                            tds.name = site.domain;
                        }

                        textSetForLabel.Add(tds);
                    }
                }
                else
                {
                    textSetForLabel = render.RenderDocumentSet(docSet, logger);
                    foreach (TextDocumentSet ws in textSetForLabel)
                    {
                        CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
                    }
                }

                // index the renders by name
                textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));

                // --- rest of indexing
                docSet.ForEach(x => inputSites.Add(x.domain, x));
                renderIndex.Add(docSet, textSetForLabel);

                foreach (WebSiteDocuments site in docSet)
                {
                    inputSiteVsLabels.Add(site, new List <SpaceLabel>());
                    inputSiteVsLabels[site].Add(setLabel);
                    c++;
                }
            }

            if (notes != null)
            {
                notes.log("Text document for [" + c + "] entities created");
            }

            // tmp index: site -> its render, matched by domain name
            foreach (String key in inputSites.Keys)
            {
                sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
            }

            // page in site filtering
            if (filter.IsEnabled)
            {
                Dictionary <WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary <WebSiteDocuments, TextDocumentSet>();

                filter.Learn(inputTextRenders.Values);

                foreach (KeyValuePair <WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
                {
                    renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
                }
                sitesToRenders = renderIndexFiltered;
            }

            Dictionary <String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary <string, TextDocumentSet>();

            foreach (var pair in sitesToRenders)
            {
                TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
            }

            // blending pages into text documents
            Boolean keepSeparated = blender.DoKeepPagesSeparated;

            foreach (var pair in renderIndex)
            {
                foreach (TextDocumentSet entitySet in pair.Value)
                {
                    TextDocumentSet      selectedTexts = TextDocumentsByDomainName[entitySet.name];
                    WebSiteDocuments     web           = inputSites[entitySet.name];
                    IEnumerable <string> label         = inputSiteVsLabels[web].Select(x => x.name);

                    // NOTE(review): the branches look swapped - with keepSeparated set, the pages are blended into ONE document,
                    // otherwise into separate documents. Behavior is left unchanged; confirm against DoKeepPagesSeparated's definition.
                    if (keepSeparated)
                    {
                        TextDocument doc = blender.blendToTextDocument(selectedTexts);
                        doc.labels.AddRange(label);
                        outputContext.corpus_documents.Add(doc);
                    }
                    else
                    {
                        var docs = blender.blendToSeparateTextDocuments(selectedTexts);
                        foreach (TextDocument doc in docs)
                        {
                            doc.labels.AddRange(label);
                            outputContext.corpus_documents.Add(doc);
                        }
                    }
                }
            }

            if (notes != null)
            {
                notes.logEndPhase();
            }

            return(outputContext);
        }
Beispiel #15
0
 /// <summary>
 /// Returns a numeric factor for <paramref name="term"/>; the computation is left to derived classes.
 /// </summary>
 /// <param name="term">The term to get the factor for.</param>
 /// <param name="space">The space model made available to the implementation.</param>
 /// <param name="label">Optional label; defaults to null. NOTE(review): presumably narrows the factor to a single label - confirm against the implementations.</param>
 /// <returns>The factor as a Double.</returns>
 public abstract Double GetElementFactor(string term, SpaceModel space, SpaceLabel label = null);
Beispiel #16
0
        /// <summary>
        /// Builds a dictionary of global element factors for the given terms. The number of dimensions follows the result type:
        /// 1 for the numeric type, the number of labels in the space for the multi-class vector type.
        /// </summary>
        /// <param name="terms">The terms.</param>
        /// <param name="space">The space.</param>
        /// <param name="label">The label.</param>
        /// <returns>The dictionary with one entry added per term.</returns>
        public WeightDictionary GetElementFactors(IEnumerable <String> terms, SpaceModel space, SpaceLabel label = null)
        {
            WeightDictionary factors = new WeightDictionary();

            var mode = resultType;

            if (mode == FunctionResultTypeEnum.numeric)
            {
                factors.nDimensions = 1;
            }
            else if (mode == FunctionResultTypeEnum.numericVectorForMultiClass)
            {
                factors.nDimensions = space.labels.Count;
            }

            foreach (String term in terms)
            {
                var termEntry = GetElementFactorEntry(term, space, label);

                factors.AddEntry(termEntry);
            }

            return(factors);
        }
        /// <summary>
        /// Constructs a global weight dictionary holding, for every given term, the product of the global element factors.
        /// </summary>
        /// <param name="terms">The terms.</param>
        /// <param name="space">The space.</param>
        /// <param name="label">The label.</param>
        /// <returns>The dictionary, named after the signature of this model with the suffix "_globalOnly".</returns>
        public WeightDictionary GetElementFactors(IEnumerable <string> terms, SpaceModel space, SpaceLabel label = null)
        {
            var globalWeights = new WeightDictionary();

            globalWeights.name = GetSignature() + "_globalOnly";

            foreach (String term in terms)
            {
                Double globalFactor = GetElementFactor(term, space, label);

                globalWeights.AddEntry(new WeightDictionaryEntry(term, globalFactor), true);
            }

            // the description reports the number of entries actually stored
            globalWeights.description = "Global weights for [" + globalWeights.Count + "] terms.";

            return(globalWeights);
        }
        /// <summary>
        /// Wraps the element factor of a term into a weight dictionary entry.
        /// </summary>
        /// <param name="term">The term.</param>
        /// <param name="space">The space.</param>
        /// <param name="label">The label.</param>
        /// <returns>A new entry carrying the term and its factor.</returns>
        public WeightDictionaryEntry GetElementFactorEntry(string term, SpaceModel space, SpaceLabel label = null)
        {
            Double factor = GetElementFactor(term, space, label);

            return(new WeightDictionaryEntry(term, factor));
        }
 /// <summary>
 /// Returns the weight of a term: its factor within the document multiplied by its factor within the space.
 /// </summary>
 /// <param name="term">The term.</param>
 /// <param name="document">The document used for the first factor.</param>
 /// <param name="space">The space used for the second factor.</param>
 /// <param name="label">Optional label passed to the space-level factor.</param>
 /// <returns>The product of both factors.</returns>
 public Double GetWeight(String term, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
 {
     var documentFactor = GetElementFactor(term, document);
     var spaceFactor    = GetElementFactor(term, space, label);

     return(documentFactor * spaceFactor);
 }