/*
         * public static WeightDictionary GetChildrenWT(WeightDictionary output, SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List<string> FV)
         * {
         *  if (output == null)
         *  {
         *      output = new WeightDictionary(model.name, "");
         *  }
         *  if (model.Children.Any())
         *  {
         *      foreach (var child in model.Children)
         *      {
         *          GetChildrenWT(output, child, weightModel, space, FV);
         *
         *      }
         *
         *  }
         *  else
         *  {
         *      var wd = weightModel.GetWeights(FV, model, space);
         *      output.Merge(wd.index.Values, model.weight);
         *  }
         *  return output;
         *
         * }*/


        /// <summary>
        /// Blends the leaf documents of <paramref name="model"/> into a single vector document of type <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Vector document type to create; must have a parameterless constructor.</typeparam>
        /// <param name="model">Root document model; its name is copied to the output and its leafs are blended.</param>
        /// <param name="weightModel">Weight model used to compute the weight dictionary of each leaf.</param>
        /// <param name="space">Space model passed through to <c>GetWeights</c>.</param>
        /// <param name="FV">Feature list passed through to <c>GetWeights</c>.</param>
        /// <returns>New vector document whose terms are the merged weights of all leafs.</returns>
        public static T BlendToVector <T>(this SpaceDocumentModel model, FeatureWeightModel weightModel, SpaceModel space, List <string> FV) where T : VectorDocument, new()
        {
            T output = new T();

            output.name = model.name;

            var leafs = model.GetLeafs();

            foreach (var leaf in leafs)
            {
                // Weights must be computed for the leaf itself. Passing the root model here (as the
                // previous code did) ignored the loop variable and merged the same root weights
                // once per leaf.
                var wd = weightModel.GetWeights(FV, leaf, space);
                output.terms.Merge(wd);
            }

            return(output);
        }
        /// <summary>
        /// Binds this entry to the given domain and documents, and derives the entry type flags and the assigned ID.
        /// </summary>
        /// <remarks>
        /// The assigned ID is overwritten by each non-null document in turn (text, then space, then web),
        /// so the web document ID wins when several documents are supplied.
        /// </remarks>
        /// <param name="_domainID">Domain identifier stored in <c>DomainID</c>.</param>
        /// <param name="_webDocument">Web document, or <c>null</c>.</param>
        /// <param name="_spaceDocument">Space document model, or <c>null</c>.</param>
        /// <param name="_textDocument">Text document, or <c>null</c>.</param>
        public void SetEntry(String _domainID, WebSiteDocument _webDocument, SpaceDocumentModel _spaceDocument, TextDocument _textDocument)
        {
            // Start from a clean state before any flag is accumulated.
            type     = DocumentSelectEntryType.unknown;
            DomainID = _domainID;

            webDocument   = _webDocument;
            spaceDocument = _spaceDocument;
            textDocument  = _textDocument;

            // Lowest precedence: text document.
            if (textDocument != null)
            {
                AssignedID = textDocument.name;
                type       = type | DocumentSelectEntryType.textDocument;
            }

            // Medium precedence: space document.
            if (spaceDocument != null)
            {
                AssignedID = spaceDocument.name;
                type       = type | DocumentSelectEntryType.spaceDocument;
            }

            // Highest precedence: web document.
            if (webDocument != null)
            {
                AssignedID = _webDocument.AssignedID;
                type       = type | DocumentSelectEntryType.webDocument;
            }
        }
        /// <summary>
        /// Builds the weight dictionary of a document by combining the local factor, all global factors and the document weight.
        /// </summary>
        /// <param name="termWhiteList">Term white list; its count is written into the description and it drives the white-list mode.</param>
        /// <param name="document">Document whose terms are weighted.</param>
        /// <param name="space">Space model handed to the global functions.</param>
        /// <param name="label">Optional label handed to the global functions.</param>
        /// <returns>Weight dictionary named after the model signature and the document name.</returns>
        /// <exception cref="NotImplementedException">
        /// Thrown in white-list mode as soon as a white-listed term is found in the document.
        /// </exception>
        public WeightDictionary GetWeights(List <String> termWhiteList, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
        {
            WeightDictionary output = new WeightDictionary();

            output.name        = GetSignature() + "_" + document.name;
            output.description = "Feature weight table constructed by [" + GetSignature() + "] for features [" + termWhiteList.Count + "] in document [" + document.name + "]";
            output.nDimensions = nDimensions;

            if (KERNELOPTION_USE_WHITELISTTERMS)
            {
                // White-list mode is not implemented: the first matching term aborts the call.
                foreach (String whiteListed in termWhiteList)
                {
                    if (document.terms.Contains(whiteListed))
                    {
                        throw new NotImplementedException();
                    }
                }

                return(output);
            }

            List <String> tokens = document.terms.GetTokens();

            for (int position = 0; position < document.terms.Count; position++)
            {
                String token = tokens[position];

                // Default entry, used when the local function is switched off.
                WeightDictionaryEntry weighted = new WeightDictionaryEntry(token, 0);

                if (DoUseLocalFunction)
                {
                    weighted = LocalFunction.GetElementFactorEntry(token, document);
                }

                // Each global factor scales the entry by its own weighted contribution.
                foreach (FeatureWeightFactor factor in GlobalFactors)
                {
                    weighted = weighted * (factor.GlobalFunction.GetElementFactorEntry(token, space, label) * factor.weight);
                }

                // A weight of exactly 1 leaves the entry untouched, so the multiplication is skipped.
                if (document.weight != 1)
                {
                    weighted = weighted * document.weight;
                }

                output.Merge(weighted);
            }

            return(output);
        }
        /// <summary>
        /// Returns the local factor of a term within a document.
        /// </summary>
        /// <param name="term">Term to evaluate.</param>
        /// <param name="document">Document in which the term is evaluated.</param>
        /// <returns>
        /// The value computed by the local function, or <c>1</c> when the local function is
        /// switched off, missing, or not enabled.
        /// </returns>
        public double GetElementFactor(String term, SpaceDocumentModel document)
        {
            // Neutral factor whenever the local function is not to be used.
            if (!DoUseLocalFunction || LocalFunction == null)
            {
                return(1);
            }

            if (!LocalFunction.IsEnabled)
            {
                return(1);
            }

            return(LocalFunction.GetElementFactor(term, document));
        }
        /// <summary>
        /// Creates a parent document model that has every model of the dictionary as a child and the union of their terms.
        /// </summary>
        /// <param name="distionary">Models to nest; each value becomes a child of the result.</param>
        /// <param name="key">Name given to the created parent model.</param>
        /// <param name="log">Log builder (not used by this method).</param>
        /// <returns>The newly created parent model.</returns>
        public static SpaceDocumentModel NestSpaceDocumentModel(this Dictionary <String, SpaceDocumentModel> distionary, String key, ILogBuilder log)
        {
            SpaceDocumentModel parent = new SpaceDocumentModel();
            parent.name = key;

            foreach (SpaceDocumentModel child in distionary.Values)
            {
                parent.Children.Add(child);

                // The parent dictionary accumulates the terms of every child.
                parent.terms.MergeDictionary(child.terms);
            }

            return(parent);
        }
        /// <summary>
        /// Builds a multi-dimensional entry for a term: the first dimension is the local factor,
        /// followed by one weighted dimension per global factor.
        /// </summary>
        /// <param name="term">Term to evaluate.</param>
        /// <param name="document">Document used by the local function.</param>
        /// <param name="space">Space model used by the global functions.</param>
        /// <returns>Entry whose <c>dimensions</c> hold the individual factor values.</returns>
        public WeightDictionaryEntry GetCompositeEntry(String term, SpaceDocumentModel document, SpaceModel space)
        {
            WeightDictionaryEntry composite = new WeightDictionaryEntry(term, 0);
            List <Double>         factors   = new List <double>();

            // Dimension 0: local factor.
            Double localFactor = LocalFunction.GetElementFactor(term, document);
            factors.Add(localFactor);

            // Dimensions 1..n: global factors, each scaled by its own weight.
            foreach (var globalFactor in GlobalFactors)
            {
                factors.Add(globalFactor.GlobalFunction.GetElementFactor(term, space) * globalFactor.weight);
            }

            composite.dimensions = factors.ToArray();

            return(composite);
        }
        /// <summary>
        /// Nests a three-level dictionary of document models into a single root model.
        /// </summary>
        /// <remarks>
        /// Each innermost dictionary is nested under its second-level key, those results are nested
        /// under their first-level key, and the first-level results are nested under <paramref name="name"/>.
        /// </remarks>
        /// <param name="nest">Three-level dictionary whose innermost values are the document models to nest.</param>
        /// <param name="name">Name given to the root model.</param>
        /// <param name="log">Log builder forwarded to <c>NestSpaceDocumentModel</c>.</param>
        /// <returns>Root model containing the complete nested structure.</returns>
        public static SpaceDocumentModel NestCompleteSpaceDocumentModel(this Dictionary <string, Dictionary <string, Dictionary <string, SpaceDocumentModel> > > nest, String name, ILogBuilder log)
        {
            Dictionary <String, SpaceDocumentModel> rootTmp = new Dictionary <string, SpaceDocumentModel>();

            foreach (var pairCategory in nest)
            {
                Dictionary <String, SpaceDocumentModel> catTmp = new Dictionary <string, SpaceDocumentModel>();

                foreach (var pairWebsite in pairCategory.Value)
                {
                    catTmp.Add(pairWebsite.Key, pairWebsite.Value.NestSpaceDocumentModel(pairWebsite.Key, log));
                }

                rootTmp.Add(pairCategory.Key, catTmp.NestSpaceDocumentModel(pairCategory.Key, log));
            }

            // The result is returned directly; the previous code first allocated a throwaway
            // SpaceDocumentModel that was immediately overwritten.
            return(rootTmp.NestSpaceDocumentModel(name, log));
        }
        /// <summary>
        /// Prepares the factor by collecting per-document term frequency statistics from the context.
        /// </summary>
        /// <remarks>
        /// Entries flagged as space documents contribute their stored token frequencies; otherwise,
        /// entries flagged as text documents are tokenized (and optionally stemmed) and counted.
        /// An entry with neither flag is registered with an empty collection.
        /// </remarks>
        /// <param name="context">Selection result whose items are processed.</param>
        /// <param name="log">Log builder (not used by this method).</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            statsByAssignedID.Clear();

            foreach (DocumentSelectResultEntry entry in context.items)
            {
                instanceCountCollection <string> frequencies = new instanceCountCollection <string>();

                if (entry.type.HasFlag(DocumentSelectEntryType.spaceDocument))
                {
                    // Frequencies are already available in the space document dictionary.
                    SpaceDocumentModel spaceDoc = entry.spaceDocument;

                    foreach (var token in spaceDoc.terms.GetTokens())
                    {
                        frequencies.AddInstance(token, spaceDoc.terms.GetTokenFrequency(token));
                    }
                }
                else if (entry.type.HasFlag(DocumentSelectEntryType.textDocument))
                {
                    // Raw text: tokenize, then count each (optionally stemmed) token once per occurrence.
                    String text = entry.textDocument.content;

                    List <String> rawTokens = text.getTokens(true, true, true, false, 4);

                    foreach (String rawToken in rawTokens)
                    {
                        String counted = rawToken;

                        if (useStems)
                        {
                            counted = context.stemmingContext.Stem(rawToken);
                        }

                        frequencies.AddInstance(counted);
                    }
                }

                statsByAssignedID.Add(entry.AssignedID, frequencies);

                assignedIDs.Add(entry.AssignedID);
            }
        }
Пример #9
0
        /// <summary>
        /// Populates the space model with one document model per text document in the context.
        /// </summary>
        /// <remarks>
        /// Every constructed model is linked to each of its labels and added to the space documents.
        /// Its terms are merged into the unknown-label dictionary when the document carries the
        /// <c>SpaceLabel.UNKNOWN</c> label, and into the main term dictionary otherwise.
        /// </remarks>
        /// <param name="context">Operation context providing the text documents, labels and space model.</param>
        /// <param name="log">Log builder receiving progress messages.</param>
        public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
        {
            log.log("Space model population");
            context.stemmContext = new StemmingContext(stemmer);

            foreach (var entry in context.textDocuments)
            {
                var textDoc = entry.Value;

                SpaceDocumentModel docModel = spaceConstructor.ConstructDocument(textDoc.content, textDoc.name, context.spaceModel, context.stemmContext, tokenizer);

                // Link the model with every label declared on the source document.
                foreach (String labelName in textDoc.labels)
                {
                    SpaceLabel linkedLabel = context.spaceLabels[labelName];

                    context.spaceModel.LabelToDocumentLinks.Add(linkedLabel, docModel, 1);
                }

                context.spaceModel.documents.Add(docModel);

                // Terms of labeled and unlabeled documents are kept in separate dictionaries.
                if (!textDoc.labels.Contains(SpaceLabel.UNKNOWN))
                {
                    context.spaceModel.terms.MergeDictionary(docModel.terms);
                }
                else
                {
                    context.spaceModel.terms_unknown_label.MergeDictionary(docModel.terms);
                }
            }

            log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
        }
Пример #10
0
        // public abstract WeightDictionaryEntry GetElementFactor(string term, SpaceDocumentModel document);

        /// <summary>
        /// Returns the numeric factor of a term for the given document; the computation is defined by the implementing class.
        /// </summary>
        /// <param name="term">Term to evaluate.</param>
        /// <param name="document">Document in which the term is evaluated.</param>
        /// <returns>Factor value as a double.</returns>
        public abstract double GetElementFactor(string term, SpaceDocumentModel document);
 /// <summary>
 /// Creates an entry that wraps a space document model.
 /// </summary>
 /// <param name="document">Space document; its name becomes the assigned ID of the entry.</param>
 public DocumentSelectResultEntry(SpaceDocumentModel document)
 {
     this.spaceDocument = document;

     // The entry is identified by the document name and flagged as a space document only.
     this.AssignedID = document.name;
     this.type       = DocumentSelectEntryType.spaceDocument;
 }
        /// <summary>
        /// Populates the space model with one hierarchical document model per rendered site.
        /// </summary>
        /// <remarks>
        /// For every domain in <c>context.renderSiteByDomain</c> a site-level model is created; it receives
        /// one page-level child per text layer collection, and every page receives one layer-level child
        /// per render layer. The site model is added to the space documents, linked to its labels,
        /// flattened and finally has its labels propagated.
        /// </remarks>
        /// <param name="context">Operation context providing rendered sites, labels and the space model.</param>
        /// <param name="log">Log builder receiving progress messages.</param>
        public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
        {
            log.log("Space model population");
            context.stemmContext = new StemmingContext(stemmer);
            context.tokenizer    = tokenizer;

            // Metrics dictionary is always recreated; it is filled only when DoKeepContentMetrics is set.
            context.entityMetrics = new Dictionary <String, ContentMetrics>();

            foreach (KeyValuePair <String, TextDocumentSet> pair in context.renderSiteByDomain)
            {
                // The label of the site is looked up by its domain key.
                SpaceLabel spaceLabel = context.spaceLabelsDomains[pair.Key];

                SpaceDocumentModel modelOfSite = new SpaceDocumentModel();
                modelOfSite.name = pair.Key;
                modelOfSite.labels.Add(spaceLabel.name);

                foreach (TextDocumentLayerCollection textLayer in pair.Value)
                {
                    SpaceDocumentModel modelOfPage = new SpaceDocumentModel(textLayer.name);

                    // Stays null (and is passed as null to ConstructDocument) when metrics are not kept.
                    ContentMetrics metrics = null;
                    if (DoKeepContentMetrics)
                    {
                        metrics = new ContentMetrics(textLayer.name);
                    }

                    foreach (var renderLayer in textLayer)
                    {
                        // NOTE(review): this instance is replaced by the ConstructDocument result right below.
                        SpaceDocumentModel modelOfLayer = new SpaceDocumentModel(modelOfPage.name + renderLayer.name);

                        // The sixth argument is true only for sites whose label is not UNKNOWN.
                        modelOfLayer = spaceConstructor.ConstructDocument(renderLayer.content, modelOfPage.name + renderLayer.name,
                                                                          context.spaceModel, context.stemmContext, tokenizer,
                                                                          spaceLabel.name != SpaceLabel.UNKNOWN, metrics);

                        modelOfLayer.weight = renderLayer.layerWeight;

                        modelOfLayer.documentScope = DocumentBlenderFunctionOptions.layerLevel;

                        modelOfPage.Children.Add(modelOfLayer);
                    }

                    modelOfPage.documentScope = DocumentBlenderFunctionOptions.pageLevel;

                    // One metrics object per page, registered under the metrics name.
                    if (DoKeepContentMetrics)
                    {
                        context.entityMetrics.Add(metrics.Name, metrics);
                    }

                    // modelOfPage.Flatten(false);

                    modelOfSite.Children.Add(modelOfPage);
                }

                modelOfSite.documentScope = DocumentBlenderFunctionOptions.siteLevel;

                context.spaceModel.documents.Add(modelOfSite);

                // Link the site model with each of its labels (link weight 1).
                foreach (String label in modelOfSite.labels)
                {
                    SpaceLabel sLabel = null;
                    sLabel = context.spaceLabels[label];
                    context.spaceModel.LabelToDocumentLinks.Add(sLabel, modelOfSite, 1);
                }

                // Flattening happens after the site is registered and linked.
                modelOfSite.Flatten(false);

                /*
                 * if (modelOfSite.labels.Contains(SpaceLabel.UNKNOWN))
                 * {
                 *  context.spaceModel.terms_unknown_label.MergeDictionary(modelOfSite.terms);
                 * }
                 * else
                 * {
                 *  context.spaceModel.terms_known_label.MergeDictionary(modelOfSite.terms);
                 * }*/

                modelOfSite.PropagateLabels();

                //    modelOfSite.SetLabel(spaceLabel, context.spaceModel);

                //context.spaceModel.LabelToDocumentLinks.Add(spaceLabel, modelOfSite, 1.0);
            }

            log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
        }
 /// <summary>
 /// Wraps the numeric element factor of a term into a weight dictionary entry.
 /// </summary>
 /// <param name="term">Term to evaluate.</param>
 /// <param name="document">Document in which the term is evaluated.</param>
 /// <returns>Entry for <paramref name="term"/> carrying the value returned by <c>GetElementFactor</c>.</returns>
 public override WeightDictionaryEntry GetElementFactorEntry(string term, SpaceDocumentModel document)
 {
     double factor = GetElementFactor(term, document);

     return(new WeightDictionaryEntry(term, factor));
 }
        /// <summary>
        /// Computes the term frequency factor of a term in a document, according to the configured computation mode.
        /// </summary>
        /// <param name="term">Term to evaluate.</param>
        /// <param name="document">Document whose term dictionary is used.</param>
        /// <returns>
        /// <c>1</c> when the function is not enabled. For the modified TF mode: <c>0</c> when the term is
        /// absent from the index, otherwise the modified TF ratio. For the other modes: the frequency
        /// divided by the divisor (normal/default), the square root of that ratio (squareRooted),
        /// or <c>log(TF + 1)</c> divided by the divisor (glasgow).
        /// </returns>
        public override double GetElementFactor(string term, SpaceDocumentModel document)
        {
            if (!IsEnabled)
            {
                return(1);
            }

            TokenDictionary tokens = document.GetTerms(true, true);

            Double frequency = tokens.GetTokenFrequency(term);

            // Modified TF relies on the index and never needs the divisor.
            if (computation == TFComputation.modifiedTF)
            {
                if (!index.ContainsKey(term))
                {
                    return(0);
                }

                Double indexedFrequency = index[term];
                Double documentLength   = tokens.Count;

                Double numerator   = frequency * Math.Log(SqrTc / indexedFrequency);
                Double lengthRatio = (documentLength * documentLength) / SqrTc;
                Double denominator = Math.Log(tokens.GetSumSquareFrequencies() * lengthRatio);

                return(numerator / denominator);
            }

            Double divisor = GetDivisor(tokens);

            switch (computation)
            {
            case TFComputation.squareRooted:
                return(Math.Sqrt(frequency / divisor));

            case TFComputation.glasgow:
                return(Math.Log(frequency + 1) / divisor);

            default:
                // Covers TFComputation.normal and any other value.
                return(frequency / divisor);
            }
        }
Пример #15
0
        /// <summary>
        /// Executes the plane method, invoking contained functions according to the settings
        /// </summary>
        /// <remarks>
        /// Steps, in order: document models are constructed for all corpus documents and linked to their
        /// labels; optional reports are saved; features are selected; category models are built for all
        /// labels other than <c>SpaceLabel.UNKNOWN</c>; the weight model is prepared; and finally vector
        /// representations are created for every document and category.
        /// </remarks>
        /// <param name="inputContext">The input context - related to this plane.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// A new <c>VectorPlaneContext</c> holding the vector space and the label-to-document links.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[2] Corpus Plane - execution", "");

            ICorpusPlaneContext context       = (ICorpusPlaneContext)inputContext;
            VectorPlaneContext  outputContext = new VectorPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            context.stemmContext = new StemmingContext(stemmer);

            Dictionary <String, SpaceDocumentModel> documentVsModel = new Dictionary <string, SpaceDocumentModel>();

            // modelling the documents
            foreach (TextDocument doc in context.corpus_documents)
            {
                SpaceDocumentModel model          = spaceConstructor.ConstructDocument(doc.content, doc.name, context.space, context.stemmContext, tokenizer);
                List <SpaceLabel>  labels         = spaceConstructor.GetLabels(doc.labels, context.space);
                Boolean            isUnknownLabel = true;
                foreach (SpaceLabel label in labels)
                {
                    if (label.name != SpaceLabel.UNKNOWN)
                    {
                        isUnknownLabel = false;
                    }
                    context.space.LabelToDocumentLinks.Add(label, model, 1);
                }
                context.space.documents.Add(model);

                // only documents with at least one known label contribute to the corpus dictionary
                if (!isUnknownLabel)
                {
                    context.space.terms.MergeDictionary(model.terms);
                }

                documentVsModel.Add(doc.name, model);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_textrender))
            {
                foreach (TextDocument doc in context.corpus_documents)
                {
                    // file name prefix is the first label, or UNKNOWN when the document has none
                    String prefix = doc.labels.FirstOrDefault();
                    if (prefix.isNullOrEmpty())
                    {
                        prefix = SpaceLabel.UNKNOWN;
                    }

                    String fn  = prefix + "_" + doc.name;
                    String pth = notes.folder_entity.pathFor(fn.getFilename("txt"), imbSCI.Data.enums.getWritableFileMode.overwrite, "Textual representation of website [" + doc.name + "], produced by rendering and blending settings", true);
                    doc.content.saveStringToFile(pth, imbSCI.Data.enums.getWritableFileMode.overwrite);
                }
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_stats))
            {
                foreach (WebSiteDocumentsSet ds in context.dataset)
                {
                    DataTable dt = ds.MakeTable(documentVsModel);
                    notes.SaveDataTable(dt, notes.folder_entity);
                }

                var dt_vsm = context.space.LabelToDocumentLinks.MakeTable("LabelToDocument", "Relationships between labels and documents in the primary Vector Space Model");
                notes.SaveDataTable(dt_vsm, notes.folder_corpus);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_corpusDictionary))
            {
                notes.SaveDataTable(context.space.terms.MakeTable("corpus_stats", "Training set dictionary, after stemming", generalContext.DictionaryReportLimit), notes.folder_corpus);
            }

            #region SELECTING THE FEATURES
            // forming corpus global weight
            context.SelectedFeatures = new WeightDictionary();
            List <KeyValuePair <string, double> > filter_result = filter.SelectFeatures(context.space);
            List <string> FV = new List <string>();
            FV.AddRange(filter_result.Select(x => x.Key));

            if (filter_result.Any())
            {
                foreach (var pair in filter_result)
                {
                    context.SelectedFeatures.AddEntry(pair.Key, pair.Value);
                }

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_selectedFeatures))
                {
                    notes.SaveDataTable(context.SelectedFeatures.MakeTable("selected_features", "Features selected for BoW construction", new List <string>()
                    {
                        filter.function.shortName
                    }, generalContext.DictionaryReportLimit), notes.folder_corpus);
                }
            }
            else
            {
                logger.log("-- Feature selection function returned zero set. All features [" + context.space.terms.Count + "] are therefore accepted as selected.");
                var tkns = context.space.terms.GetTokens();
                foreach (var tkn in tkns)
                {
                    context.SelectedFeatures.AddEntry(tkn, 1);
                }

                // Keep the feature list passed to the weight model consistent with SelectedFeatures:
                // previously FV stayed empty here although all features were reported as accepted.
                FV.AddRange(tkns);
            }
            #endregion

            notes.log("Selected features [" + context.SelectedFeatures.entries.Count + "] by [" + filter.functionSettings.functionName + "]");

            outputContext.vectorSpace = new Vectors.VectorSpace();

            // building category models for all known labels
            foreach (SpaceLabel label in context.space.labels)
            {
                var docs = context.space.LabelToDocumentLinks.GetAllLinked(label);
                if (label.name != SpaceLabel.UNKNOWN)
                {
                    SpaceCategoryModel categoryModel = new SpaceCategoryModel(label, docs);
                    context.space.LabelToCategoryLinks.Add(label, categoryModel, 1);

                    context.space.categories.Add(categoryModel);

                    notes.log("Class [" + categoryModel.name + "] BoW model created - terms[" + categoryModel.terms.Count + "] ");
                }
            }

            outputContext.LabelToDocumentLinks = context.space.LabelToDocumentLinks;

            // preparing the model
            weightModel.PrepareTheModel(context.space);

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.space.documents)
            {
                var            wd     = weightModel.GetWeights(FV, docModel, context.space);
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = wd;

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
                {
                    DataTable dt = wd.MakeTable("docVec_" + docModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
                outputContext.vectorSpace.documents.Add(docVec);
            }

            // building category VSM
            foreach (SpaceCategoryModel catModel in context.space.categories)
            {
                var         wd     = weightModel.GetWeights(FV, catModel, context.space);
                VectorLabel catVec = new VectorLabel(catModel.name);
                catVec.terms = wd;

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
                {
                    DataTable dt = wd.MakeTable("catVec_" + catModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }

                outputContext.vectorSpace.labels.Add(catVec);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
            {
                foreach (SpaceCategoryModel catModel in context.space.categories)
                {
                    var dt = catModel.terms.MakeTable("cat_" + catModel.name, "Vector Space BoW weighted model, representing a category");
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
            }

            notes.logEndPhase();

            return(outputContext);
        }
Пример #16
0
        /// <summary>
        /// Builds a histogram of the ranked token frequencies of a document model.
        /// </summary>
        /// <param name="dictionary">Document model whose terms are used; its name is passed on as the histogram name.</param>
        /// <param name="binCount">Number of bins handed to the histogram builder.</param>
        /// <returns>Histogram model produced from the ranked token frequencies.</returns>
        public static histogramModel GetHistogram(this SpaceDocumentModel dictionary, Int32 binCount = 50)
        {
            var rankedFrequencies = dictionary.terms.GetRankedTokenFrequency();

            return(rankedFrequencies.GetHistogramModel(dictionary.name, pair => pair.Value, binCount));
        }
 /// <summary>
 /// Computes the weight of a term as the product of its document-level and space-level element factors.
 /// </summary>
 /// <param name="term">Term to evaluate.</param>
 /// <param name="document">Document used for the document-level factor.</param>
 /// <param name="space">Space model used for the space-level factor.</param>
 /// <param name="label">Optional label passed to the space-level factor.</param>
 /// <returns>Product of the two factors.</returns>
 public Double GetWeight(String term, SpaceDocumentModel document, SpaceModel space, SpaceLabel label = null)
 {
     var documentFactor = GetElementFactor(term, document);
     var spaceFactor    = GetElementFactor(term, space, label);

     return(documentFactor * spaceFactor);
 }
        ///// <summary>
        ///// Transforms to fv dictionary.
        ///// </summary>
        ///// <param name="context">The context.</param>
        ///// <param name="TermWeightModel">The term weight model.</param>
        ///// <param name="function">The function.</param>
        ///// <returns></returns>
        //public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageInCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        //{
        //    log.log("... Page Similarity ...");

        //    List<string> selectedTerms = context.selectedFeatures.GetKeys();



        //    var ByDomain = context.GetByDomain(log);

        //    Dictionary<string, List<string>> assignIDByLabel = context.featureSpace.labelToDocumentAssociations.GetAllRelationShipByName(true);

        //    var ByCategory = context.GetByAssignIDCategory(assignIDByLabel,log);

        //    Dictionary<String, List<DocumentSelectResultEntry>> EntryByLabel = new Dictionary<string, List<DocumentSelectResultEntry>>();



        //    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();


        //    foreach (var entry in context.items)
        //    {

        //        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        //        documentDictionarties.Add(entry.AssignedID, documentWeights);
        //    }


        //    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();



        //    Double total = context.Count;
        //    Int32 i = 0;
        //    Int32 p = (context.Count / 10);

        //    //List<List<Double>> matrix = new List<List<double>>();

        //    //foreach (var entry in context.items)
        //    //{
        //    //    matrix.Add(new List<double>());
        //    //}


        //    //for (int x = 0; x < context.items.Count; x++)
        //    //{

        //    //    for (int y = 0; y < context.items.Count; x++)
        //    //    {



        //    //    }

        //    //}

        //    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();


        //    foreach (var domainPair in ByCategory)
        //    {
        //        List<DocumentSelectResultEntry> relatives = ByCategory[domainPair.Key].ToList();


        //        foreach (var entry in relatives)
        //        {

        //            i++;
        //            FeatureVector fv = new FeatureVector(entry.AssignedID);

        //            // List<Double> d = new List<>();

        //            fv.dimensions = new double[relatives.Count - 1];


        //            // List<String> keys = documentDictionarties.Keys.ToList();

        //            Int32 hostInd = relatives.IndexOf(entry);

        //            Int32 c = 0;


        //            //foreach (var pair in documentDictionarties)
        //            //{

        //            Parallel.ForEach(relatives, (pair) =>
        //            {

        //                Int32 ind = relatives.IndexOf(pair); // keys.IndexOf(pair.AssignedID);
        //                if (ind >= hostInd)
        //                {
        //                    ind = ind - 1;
        //                }

        //                if (pair.AssignedID != entry.AssignedID)
        //                {
        //                    Double docToClassSimilarity = 0;

        //                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
        //                    }
        //                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
        //                    }
        //                    else
        //                    {
        //                        var vecA = documentDictionarties[pair.AssignedID];
        //                        var vecB = documentDictionarties[entry.AssignedID];
        //                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
        //                        if (docToClassSimilarity > 0)
        //                        {

        //                        }
        //                        if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
        //                            //computedPairs.AddOrUpdate(entry.AssignedID + pair.Key, docToClassSimilarity);
        //                        }
        //                        else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
        //                        }

        //                    }

        //                    fv.dimensions[ind] = docToClassSimilarity;

        //                }
        //            });



        //            Int32 r = i % p;
        //            if (r == 0)
        //            {
        //                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        //            }


        //            dict.GetOrAdd(entry.DomainID).Add(fv, -1);
        //        }



        //    }


        //    //foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
        //    //{
        //    //    pair.Value.CloseDeploy();
        //    //}

        //    log.log("... Preparation finished ...");

        //    return dict;


        //}



        /// <summary>
        /// Builds a feature vector dictionary in which the first dimension of every document vector is the
        /// similarity between the document's term weights and the term weights of the site (domain) it belongs to.
        /// </summary>
        /// <param name="context">The selection context; supplies the selected features, the space model and the entries grouped by domain.</param>
        /// <param name="TermWeightModel">The term weight model used to compute weights for each document and for the merged site model.</param>
        /// <param name="function">The similarity function applied to the site weights and the document weights.</param>
        /// <param name="log">The log builder that receives the progress messages.</param>
        /// <returns>The feature vector sets keyed by domain, after <c>CloseDeploy()</c> was called on each set.</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Site Similarity ...");

            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            Dictionary <String, WeightDictionary> categoryDictionarties = new Dictionary <string, WeightDictionary>();
            Dictionary <String, WeightDictionary> documentDictionarties = new Dictionary <string, WeightDictionary>();

            var byDomain = context.GetByDomain(log);

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // Progress is reported every p-th domain. The integer division yields 0 when the context
            // holds fewer than 10 entries, which would make the modulo below throw DivideByZeroException.
            Int32 p = context.Count / 10;
            if (p < 1)
            {
                p = 1;
            }

            foreach (var pair in byDomain)
            {
                i++;
                SpaceDocumentModel siteModel = new SpaceDocumentModel();

                // First pass: compute the weights of every document and collect the documents under the site model.
                foreach (var ent in pair.Value)
                {
                    WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
                    documentDictionarties.Add(ent.AssignedID, documentWeights);
                    siteModel.Children.Add(ent.spaceDocument);
                }

                siteModel.Flatten(false);

                // The site weights are the same for every document of the domain, so they are looked up once.
                WeightDictionary siteWeights = TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel);
                categoryDictionarties.Add(pair.Key, siteWeights);

                // Second pass: one feature vector per document, holding the document-to-site similarity.
                foreach (var ent in pair.Value)
                {
                    FeatureVector fv = new FeatureVector(ent.AssignedID);

                    // NOTE(review): the vector is sized by the label count but only index 0 is written;
                    // this fails if the space model has no labels - confirm that cannot happen.
                    fv.dimensions = new double[context.spaceModel.labels.Count];

                    var docToClassSimilarity = function.ComputeSimilarity(siteWeights, documentDictionarties[ent.AssignedID]);

                    fv.dimensions[0] = docToClassSimilarity;

                    dict.GetOrAdd(pair.Key).Add(fv, -1);
                }

                if (i % p == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }
            }

            foreach (KeyValuePair <string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation finished ...");

            return(dict);
        }
Пример #19
0
 /// <summary>
 /// Abstract member that derived classes implement to produce a <c>WeightDictionaryEntry</c>
 /// for a term in the context of a document.
 /// NOTE(review): presumably the entry carries the element (local) factor of the term - confirm against implementations.
 /// </summary>
 /// <param name="term">The term for which the entry is requested.</param>
 /// <param name="document">The document model the term is evaluated against.</param>
 /// <returns>The entry produced by the implementing class.</returns>
 public abstract WeightDictionaryEntry GetElementFactorEntry(string term, SpaceDocumentModel document);
Пример #20
0
        /// <summary>
        /// Creates a <c>DocumentSelectResult</c> from the operation context: copies the shared models,
        /// creates one entry per web site document and, when a ranking method is given, prepares its model.
        /// </summary>
        /// <param name="context">The operation context supplying the stemming context, the space model, the selected features and the web sites by domain.</param>
        /// <param name="ranking">The ranking method; may be <c>null</c>, in which case name, query and description are not set and no model is prepared.</param>
        /// <param name="folder">The folder assigned to the created selection context.</param>
        /// <param name="log">The log; receives a message for every entry that has neither a text nor a space model document.</param>
        /// <returns>The populated selection context.</returns>
        public static DocumentSelectResult PrepareContext(this OperationContext context, DocumentRankingMethod ranking, folderNode folder, ILogBuilder log)
        {
            DocumentSelectResult selectContext = new DocumentSelectResult();

            selectContext.stemmingContext = context.stemmContext;
            selectContext.spaceModel      = context.spaceModel;
            selectContext.folder          = folder;

            // Must be assigned before the description is composed below; otherwise the description
            // would not describe the features actually selected in the operation context.
            selectContext.selectedFeatures = context.SelectedFeatures;

            if (ranking != null)
            {
                selectContext.name  = ranking.model.GetSignature();
                selectContext.query = ranking.query;

                builderForText builder = new builderForText();
                ranking.Describe(builder);

                builder.AppendLine("Selected features [" + selectContext.selectedFeatures?.description + "].");

                selectContext.description = builder.GetContent().Replace(Environment.NewLine, "");
            }

            foreach (KeyValuePair <string, WebSiteDocuments> pair in context.webSiteByDomain)
            {
                selectContext.domainNameToGraph.Add(pair.Key, pair.Value?.extensions?.graph);

                if (pair.Value == null)
                {
                    // The graph lookup above tolerates a missing site; there are no documents to process for it.
                    continue;
                }

                string dn = pair.Value.domain;

                foreach (WebSiteDocument doc in pair.Value.documents)
                {
                    DocumentSelectResultEntry entry = new DocumentSelectResultEntry();

                    // Text documents are not looked up here, so the entry is always created without one.
                    TextDocument text = null;

                    string err = "";

                    SpaceDocumentModel spaceDocument = context.spaceModel.documents.FirstOrDefault(x => x.name == doc.AssignedID);

                    if (spaceDocument == null)
                    {
                        err += "Failed to find space model document for [" + doc.AssignedID + "]";
                    }

                    entry.SetEntry(dn, doc, spaceDocument, text);

                    if (!entry.HasTextOrSpaceModel)
                    {
                        log.log(err);
                    }

                    // The entry is added even when no model was found for it.
                    selectContext.items.Add(entry);
                }
            }

            // PREPARATION OF MODEL
            if (ranking != null)
            {
                ranking.model.Prepare(selectContext, log);
            }
            return(selectContext);
        }
Пример #21
0
        /// <summary>
        /// Builds the dataset statistics from the space model: one category statistics model per label,
        /// merged into the dataset statistics model, and - when the flat site level scope is required by
        /// the settings - the flat statistics model learned from every document not labeled as unknown.
        /// </summary>
        /// <param name="spaceModel">The space model providing the labels and the documents.</param>
        /// <param name="log">The log; passed to the statistics models and, when not <c>null</c>, used for progress messages.</param>
        /// <param name="excludeUnknown">When <c>true</c>, the unknown label is removed from the labels processed per category.</param>
        public void Prepare(SpaceModel spaceModel, ILogBuilder log, bool excludeUnknown = true)
        {
            var categoryLabels = spaceModel.LabelToDocumentLinks.GetAllDistinctNames();
            if (excludeUnknown)
            {
                categoryLabels.Remove(SpaceLabel.UNKNOWN);
            }

            // Both statistics models work at the dataset level.
            datasetStatsModel = new SpaceDocumentStatsModel("Stats", log);
            datasetStatsModel.documentScope = Entity.DocumentBlenderFunctionOptions.datasetLevel;

            flatDataSetStatsModel = new SpaceDocumentStatsModel("FlatStats", log);
            flatDataSetStatsModel.documentScope = Entity.DocumentBlenderFunctionOptions.datasetLevel;

            foreach (string categoryLabel in categoryLabels)
            {
                // Collect all documents carrying the label under one category-level document and flatten it.
                var labeledDocuments = spaceModel.documents.Where(doc => doc.labels.Contains(categoryLabel));

                SpaceDocumentModel categoryDocument = new SpaceDocumentModel(categoryLabel);
                categoryDocument.documentScope = Entity.DocumentBlenderFunctionOptions.categoryLevel;
                categoryDocument.Children.AddRange(labeledDocuments);
                categoryDocument.Flatten(false);

                SpaceDocumentStatsModel categoryStats = new SpaceDocumentStatsModel(categoryDocument.name, log);
                categoryStats.LearnFrom(categoryDocument, log, true);
                datasetStatsModel.Children.Add(categoryStats);

                // Fold the category terms into the dataset-wide statistics.
                datasetStatsModel.terms.MergeDictionary(categoryDocument.terms);
                datasetStatsModel.termsChildCount.CountTokens(categoryDocument.terms.GetTokens());

                if (log != null)
                {
                    log.log("Statistics for category [" + categoryLabel + "]");
                }

                // Count every descendant of the category document by its scope.
                foreach (var descendant in categoryDocument.GetAllChildren())
                {
                    totalCounts.DirectCount(descendant.documentScope);
                }
            }

            if (!settings.RequiredScopes.HasFlag(CWPAnalysisScopeEnum.flatSiteLevel))
            {
                return;
            }

            if (log != null)
            {
                log.log("Creating flat report");
            }

            foreach (var document in spaceModel.documents)
            {
                if (document.labels.Contains(SpaceLabel.UNKNOWN))
                {
                    continue;
                }

                flatDataSetStatsModel.LearnFrom(document, log, true);
            }
        }