Exemplo n.º 1
0
 /// <summary>
 /// Forwards the preparation call to every factor contained in <c>Factors</c>.
 /// </summary>
 /// <param name="context">Selection context passed unchanged to each factor.</param>
 /// <param name="log">Log builder passed unchanged to each factor.</param>
 public void Prepare(DocumentSelectResult context, ILogBuilder log)
 {
     // Factors are prepared one by one, in the enumeration order of the collection.
     foreach (IScoreModelFactor scoreFactor in Factors)
     {
         scoreFactor.Prepare(context, log);
     }
 }
        /// <summary>
        /// Sums the dictionary weights of the entry's terms that pass the feature filter and the query filter.
        /// </summary>
        /// <param name="entry">Entry whose <c>spaceDocument</c> terms are evaluated.</param>
        /// <param name="context">Selection context; when its <c>selectedFeatures</c> is non-null and non-empty, only terms it contains are counted.</param>
        /// <param name="log">The log (not referenced by this implementation).</param>
        /// <returns>Sum of <c>weightDictionary.GetValue(term)</c> over the accepted terms; 0 when no term is accepted.</returns>
        public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            Double total = 0;

            foreach (String token in entry.spaceDocument.terms.GetTokens())
            {
                // Feature filter: applies only when a non-empty feature selection exists.
                if (context.selectedFeatures != null
                    && context.selectedFeatures.Count > 0
                    && !context.selectedFeatures.ContainsKey(token))
                {
                    continue;
                }

                // Query filter: applies only when query terms are present; otherwise every term counts.
                if (queryTerms.Any() && !queryTerms.Contains(token))
                {
                    continue;
                }

                total += weightDictionary.GetValue(token);
            }

            return total;
        }
        /// <summary>
        /// Loads the DS rankings found in <paramref name="folder"/> (by input names or search pattern)
        /// and transforms them into a single feature vector dictionary.
        /// </summary>
        /// <param name="folder">Folder queried through <c>GetOrFindFiles</c>.</param>
        /// <param name="inputNames">Input names handed to <c>GetOrFindFiles</c>.</param>
        /// <param name="output">Log builder passed to <c>DocumentSelectResult.LoadFromFile</c>.</param>
        /// <param name="searchPattern">File search pattern handed to <c>GetOrFindFiles</c>.</param>
        /// <returns>The result of <c>DocumentRankingExtensions.TransformToFVDictionary</c> over all loaded rankings.</returns>
        public static FeatureVectorDictionaryWithDimensions MergeDSRankings(folderNode folder, String inputNames, ILogBuilder output, String searchPattern = "DS_*_ranking.xml")
        {
            List<string> filepaths = folder.GetOrFindFiles(inputNames, searchPattern);

            List<DocumentSelectResult> results = new List<DocumentSelectResult>();
            List<String> existingNames = new List<string>();

            foreach (var fp in filepaths)
            {
                var lr = DocumentSelectResult.LoadFromFile(fp, output);

                // A ranking whose name was already seen is renamed after its file.
                // NOTE(review): the file name itself is not checked for a collision - confirm this is acceptable.
                if (existingNames.Contains(lr.name))
                {
                    lr.name = Path.GetFileNameWithoutExtension(fp);
                }
                existingNames.Add(lr.name);

                results.Add(lr);
            }

            return DocumentRankingExtensions.TransformToFVDictionary(results);
        }
        /// <summary>
        /// Loads the weight dictionary from the context folder and extracts stemmed query terms from the context query.
        /// </summary>
        /// <param name="context">Selection context supplying the folder, the space model, the query and the stemming context.</param>
        /// <param name="log">Log builder passed to <c>WeightDictionary.LoadFile</c>.</param>
        /// <exception cref="ArgumentException">Thrown when <c>context.spaceModel</c> is null.</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            weightDictionary = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

            if (context.spaceModel == null)
            {
                String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
                throw new ArgumentException(msg, nameof(context));
            }

            // Query terms can only be taken from a query that has content. The previous condition was
            // inverted: it entered this branch only for a null/empty query, so queryTerms was never
            // populated from a real query.
            if (!context.query.isNullOrEmpty())
            {
                // The trimmed text is written back to the query, as before.
                context.query.QueryTerms = context.query.QueryTerms.Trim();

                List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);

                foreach (String tkn in tkns)
                {
                    // Terms are stored in stemmed form.
                    queryTerms.Add(context.stemmingContext.Stem(tkn));
                }
            }
        }
        /// <summary>
        /// Scores the entry either with the classifier or from the precomputed score dictionary.
        /// </summary>
        /// <param name="entry">Entry to score; its <c>AssignedID</c> is the lookup key in both modes.</param>
        /// <param name="context">Selection context; its <c>spaceModel</c> is used in machine learning mode.</param>
        /// <param name="log">Log builder passed to the classifier.</param>
        /// <returns>
        /// In machine learning mode, the value of <c>classifier.DoScore</c>. Otherwise the compressed vector stored
        /// for the entry in <c>scoreDictionary</c>, or 0 when the dictionary has no such entry.
        /// </returns>
        public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            if (!useMachineLearning)
            {
                // Precomputed mode: unknown entries score 0.
                if (!scoreDictionary.ContainsKey(entry.AssignedID))
                {
                    return 0;
                }

                return scoreDictionary[entry.AssignedID].CompressNumericVector(vectorCompression);
            }

            // Machine learning mode: weights of the selected terms become the feature vector of the entry.
            WeightDictionary termWeights = TermWeightModel.GetWeights(SelectedTerms.GetKeys(), entry.spaceDocument, context.spaceModel);
            var featureVector = fvConstructor.ConstructFeatureVector(termWeights, entry.AssignedID);

            // -1 is passed when no id is registered for the entry.
            Int32 labelId = sc_id.ContainsKey(entry.AssignedID) ? sc_id[entry.AssignedID] : -1;

            return classifier.DoScore(featureVector, log, labelId);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Combines two or more precompiled document selection ranks
        /// </summary>
        /// <param name="inputNames">Space separated list of DS rank file names (wording of the command description; the names are resolved by <c>mainContext.resourceProvider.GetResourceFiles</c>), leave empty if search pattern is used</param>
        /// <param name="compression">vector dimensions compression operation, i.e. how scores should be combined into single dimension</param>
        /// <param name="outputName">Name of the output; passed through <c>DocumentSelectResult.CheckAndMakeFilename</c> before use.</param>
        /// <param name="doRankingFusion">if set to <c>true</c>, ranking fusion is requested from <c>Fusion</c> instead of score fusion.</param>
        /// <param name="searchPattern">file search pattern to select source files, leave * if no file search should be performed. NOTE(review): this parameter is not referenced by the method body.</param>
        /// <remarks>
        /// For every procedure created over <c>mainContext.folds</c>: the procedure is opened, the input files are resolved
        /// for its fold, the rankings are loaded and fused (the domain normalization argument of <c>Fusion</c> is always
        /// <c>true</c>), the result is saved as XML at the path returned by <c>SetResourceFilePath</c>, and the procedure is closed.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_makeCombineDSRanks(
            [Description("Space separated list of DS rank file names, leave empty if search pattern is used")] String inputNames = " ",
            [Description("vector dimensions compression operation, i.e. how scores should be combined into single dimension")] operation compression = operation.avg,
            [Description("Name of output Document Selection Rank file. Leave * to assign name as combination of input files")] String outputName     = "*",
            [Description("If true, it will perform ranking fusion instead of simple score fusion")] Boolean doRankingFusion = true,
            [Description("file search pattern to select source files, leave * if no file search should be performed")] String searchPattern = "*"
            )
        {
            SetupDocumentSelection setup = docSelection.data.CloneViaXML();

            ProceduralFolderFor<ProcedureCreateScoreSet, SetupDocumentSelection, OperationContext, ExperimentModelExecutionContext> procedures
                = new ProceduralFolderFor<ProcedureCreateScoreSet, SetupDocumentSelection, OperationContext, ExperimentModelExecutionContext>(mainContext.folds, setup, mainContext.notes, parent);

            outputName = DocumentSelectResult.CheckAndMakeFilename(outputName);

            foreach (var p in procedures)
            {
                p.Open();

                var fl = mainContext.resourceProvider.GetResourceFiles(inputNames, p.fold);

                List<DocumentSelectResult> results = DocumentRankingExtensions.LoadDSRankings(fl, p.notes);

                // The fused result is the only instance needed; no placeholder object is created up front.
                DocumentSelectResult resultOut = results.Fusion(compression, doRankingFusion, true, p.notes);

                String pt = mainContext.resourceProvider.SetResourceFilePath(outputName, p.fold);

                resultOut.saveObjectToXML(pt);

                p.Close();
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Scores the entry from the number of its inbound and/or outbound links in the graph of its domain.
        /// </summary>
        /// <remarks>
        /// The counts selected by <c>functionFlags</c> are summed first; a sum of 0 is returned immediately.
        /// Otherwise each division selected by <c>functionFlags</c> is applied in a fixed order:
        /// graph links, graph nodes, inbound count, outbound count, total link count.
        /// NOTE(review): the divisors are not checked for zero (e.g. dividing an outbound count by an
        /// inbound count of 0) - confirm how callers expect such entries to be scored.
        /// </remarks>
        /// <param name="entry">Entry whose <c>DomainID</c> selects the graph and whose <c>AssignedID</c> selects the node.</param>
        /// <param name="context">Selection context providing <c>domainNameToGraph</c>.</param>
        /// <param name="log">The log (not referenced by this implementation).</param>
        /// <returns>The link count sum after the selected divisions, or 0 when the sum is 0.</returns>
        public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            WebSiteGraph graph = context.domainNameToGraph[entry.DomainID];

            freeGraphNodeAndLinks outbound = graph.GetLinks(entry.AssignedID, true, false);
            freeGraphNodeAndLinks inbound = graph.GetLinks(entry.AssignedID, false, true);

            Double result = 0;

            // Numerator: sum of the link counts selected by the flags.
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.count_outbound))
            {
                result += outbound.Count;
            }
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.count_inbound))
            {
                result += inbound.Count;
            }

            // Nothing counted: no division is applied.
            if (result == 0)
            {
                return result;
            }

            // Denominators: every selected division is applied, one after another.
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_graphlinks))
            {
                result /= graph.CountLinks();
            }
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_graphnodes))
            {
                result /= graph.CountNodes();
            }
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_inbound))
            {
                result /= inbound.Count;
            }
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_outbound))
            {
                result /= outbound.Count;
            }
            if (functionFlags.HasFlag(GraphFactorFunctionEnum.divide_by_linkCount))
            {
                result /= (inbound.Count + outbound.Count);
            }

            return result;
        }
        /// <summary>
        /// Loads one <c>DocumentSelectResult</c> per given file path.
        /// </summary>
        /// <param name="filepaths">Paths passed, one by one, to <c>DocumentSelectResult.LoadFromFile</c>.</param>
        /// <param name="output">Log builder passed to <c>DocumentSelectResult.LoadFromFile</c>.</param>
        /// <returns>The loaded results, in the order of <paramref name="filepaths"/>.</returns>
        public static List<DocumentSelectResult> LoadDSRankings(IEnumerable<String> filepaths, ILogBuilder output)
        {
            List<DocumentSelectResult> loaded = new List<DocumentSelectResult>();

            foreach (String path in filepaths)
            {
                loaded.Add(DocumentSelectResult.LoadFromFile(path, output));
            }

            return loaded;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Builds the dataset model and its statistics model from the models found in the context.
        /// </summary>
        /// <param name="context">Selection context; supplies the nested models and the name of the dataset model.</param>
        /// <param name="log">Log builder passed to all called operations.</param>
        /// <param name="excludeUnknown">
        /// When <c>true</c> (default), the <c>SpaceLabel.UNKNOWN</c> group is removed before the dataset model is built.
        /// When <c>false</c>, the group is kept.
        /// </param>
        public void Prepare(DocumentSelectResult context, ILogBuilder log, bool excludeUnknown = true)
        {
            var nested_dict = context.GetModelsByCategoryDomainAssignedID(log);

            // The flag was previously ignored and the UNKNOWN group was always dropped;
            // the default value keeps that behaviour for existing callers.
            if (excludeUnknown && nested_dict.ContainsKey(SpaceLabel.UNKNOWN))
            {
                nested_dict.Remove(SpaceLabel.UNKNOWN);
            }

            datasetModel = nested_dict.NestCompleteSpaceDocumentModel(context.name, log);

            datasetStatsModel = new SpaceDocumentStatsModel(datasetModel.name, log);
            datasetStatsModel.LearnFrom(datasetModel, log, true);
        }
        /// <summary>
        /// Loads and deploys the term weight model, restores or rebuilds its data, and extracts stemmed query terms.
        /// </summary>
        /// <param name="context">Selection context supplying the folder, the space model, the query and the stemming context.</param>
        /// <param name="log">Log builder passed to all load and preparation calls.</param>
        /// <exception cref="ArgumentException">Thrown when <c>context.spaceModel</c> is null.</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
            String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);

            TermWeightModel.Deploy(log);

            if (context.spaceModel == null)
            {
                String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
                throw new ArgumentException(msg, nameof(context));
            }

            if (File.Exists(p_d) && useStoredData)
            {
                // Stored data is available and allowed: restore the model state from disk.
                WeightingModelDataSet data = objectSerialization.loadObjectFromXML<WeightingModelDataSet>(p_d, log);
                TermWeightModel.LoadModelDataSet(data, log);

                // Selected terms are loaded only on this stored-data path.
                if (useSelectedFeatures)
                {
                    SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
                }
            }
            else
            {
                // No usable stored data: prepare the model from the space model in the context.
                TermWeightModel.PrepareTheModel(context.spaceModel, log);
            }

            // Query terms can only be taken from a query that has content. The previous condition was
            // inverted: it entered this branch only for a null/empty query, so queryTerms was never
            // populated from a real query.
            if (!context.query.isNullOrEmpty())
            {
                // The trimmed text is written back to the query, as before.
                context.query.QueryTerms = context.query.QueryTerms.Trim();

                List<String> tkns = context.query.QueryTerms.getTokens(true, true, true, false, 4);

                foreach (String tkn in tkns)
                {
                    // Terms are stored in stemmed form.
                    queryTerms.Add(context.stemmingContext.Stem(tkn));
                }
            }
        }
Exemplo n.º 11
0
        /*
         * /// <summary>
         * /// Prepares the context.
         * /// </summary>
         * /// <param name="context">The context.</param>
         * /// <param name="log">The log.</param>
         * /// <returns></returns>
         * public DocumentSelectResult PrepareContext(OperationContext context, ILogBuilder log)
         * {
         *  DocumentSelectResult selectContext = new DocumentSelectResult();
         *  selectContext.stemmingContext = context.stemmContext;
         *  selectContext.spaceModel = context.spaceModel;
         *  selectContext.query = query;
         *
         *  selectContext.selectedFeatures = context.SelectedFeatures;
         *
         *  foreach (KeyValuePair<string, WebSiteDocuments> pair in context.webSiteByDomain)
         *  {
         *      selectContext.domainNameToGraph.Add(pair.Key, pair.Value.extensions.graph);
         *
         *      foreach (WebSiteDocument doc in pair.Value.documents)
         *      {
         *          DocumentSelectResultEntry entry = new DocumentSelectResultEntry();
         *          TextDocument text = context.textDocuments[doc.AssociatedID];
         *          SpaceDocumentModel spaceDocument = context.spaceModel.documents.FirstOrDefault(x => x.name == doc.AssociatedID);
         *
         *          string dn = pair.Value.domain;
         *          entry.SetEntry(dn, doc, spaceDocument, text);
         *          selectContext.Add(entry);
         *          //entry.SetEntry( context.context.webDocumentByAssignedID[pair.Key], webDocIDToDomain[aID], webDocumentRegistry[aID], spaceDocumentRegistry[aID], textDocumentRegistry[aID]);
         *      }
         *
         *  }
         *
         *  // PREPARATION OF MODEL
         *
         *  model.Prepare(selectContext, log);
         *
         *  return selectContext;
         *
         * }
         */
        /*
         * /// <summary>
         * /// Prepares the context.
         * /// </summary>
         * /// <param name="space">The space.</param>
         * /// <param name="sites">The sites.</param>
         * /// <param name="documents">The documents.</param>
         * /// <param name="stemmingContext">The stemming context.</param>
         * /// <returns></returns>
         * public DocumentSelectResult PrepareContext(SpaceModel space, IEnumerable<WebSiteDocuments> sites, IEnumerable<TextDocument> documents, StemmingContext stemmingContext)
         * {
         *  DocumentSelectResult context = new DocumentSelectResult();
         *  context.query = query;
         *
         *  context.stemmingContext = stemmingContext;
         *  context.spaceModel = space;
         *
         *  List<String> associatedIDs = new List<string>();
         *
         *  Dictionary<String, TextDocument> textDocumentRegistry = new Dictionary<string, TextDocument>();
         *  foreach (TextDocument textDocument in documents)
         *  {
         *      textDocumentRegistry.Add(textDocument.name, textDocument);
         *  }
         *
         *  Dictionary<String, SpaceDocumentModel> spaceDocumentRegistry = new Dictionary<string, SpaceDocumentModel>();
         *  foreach (var textDocument in space.documents)
         *  {
         *      spaceDocumentRegistry.Add(textDocument.name, textDocument);
         *  }
         *
         *
         *  Dictionary<String, String> webDocIDToDomain = new Dictionary<string, string>();
         *
         *  Dictionary<String, WebSiteDocument> webDocumentRegistry = new Dictionary<string, WebSiteDocument>();
         *
         *  foreach (WebSiteDocuments site in sites)
         *  {
         *      context.domainNameToGraph.Add(site.domain, site.extensions.graph);
         *
         *      foreach (WebSiteDocument webDocument in site.documents)
         *      {
         *          webDocumentRegistry.Add(webDocument.AssociatedID, webDocument);
         *          associatedIDs.Add(webDocument.AssociatedID);
         *          webDocIDToDomain.Add(webDocument.AssociatedID, site.domain);
         *      }
         *  }
         *
         *  foreach (String aID in associatedIDs)
         *  {
         *      DocumentSelectResultEntry entry = new DocumentSelectResultEntry();
         *      entry.SetEntry(webDocIDToDomain[aID], webDocumentRegistry[aID], spaceDocumentRegistry[aID], textDocumentRegistry[aID]);
         *      context.Add(entry);
         *  }
         *
         *  return context;
         * }
         */


        /// <summary>
        /// Computes, optionally normalizes and weights the score of every factor for every entry, then sums the factor scores per entry.
        /// </summary>
        /// <remarks>
        /// For factors with <c>doNormalize</c> set, scores are shifted by the learned minimum and divided by the learned range.
        /// NaN scores are excluded from range learning, and a zero range (all learned scores equal) skips the
        /// division so that the shifted score stays 0 instead of becoming 0/0.
        /// </remarks>
        /// <param name="context">Selection context whose <c>items</c> are scored in place.</param>
        /// <param name="log">Log builder passed to each factor.</param>
        /// <returns>The same <paramref name="context"/> instance.</returns>
        public DocumentSelectResult ExecuteEvaluation(DocumentSelectResult context, ILogBuilder log)
        {
            // SCORE COMPUTATION
            foreach (IScoreModelFactor factor in model.Factors)
            {
                rangeFinder ranger = new rangeFinder();

                // Pass 1: raw scores, and the value range for factors that normalize.
                foreach (DocumentSelectResultEntry entry in context.items)
                {
                    Double score = factor.Score(entry, context, log);
                    entry.SetScore(factor, score);

                    // NaN never compares equal to anything, so it must be detected with IsNaN.
                    if (factor.doNormalize && !Double.IsNaN(score))
                    {
                        ranger.Learn(score);
                    }
                }

                // Pass 2: normalization (when enabled) and weighting.
                foreach (DocumentSelectResultEntry entry in context.items)
                {
                    Double score = entry.GetScore(factor);

                    if (factor.doNormalize && !Double.IsNaN(ranger.Range))
                    {
                        score = score - ranger.Minimum;

                        // Zero range: dividing would produce NaN for every entry.
                        if (ranger.Range != 0)
                        {
                            score = score / ranger.Range;
                        }
                    }

                    score = score * factor.weight;

                    entry.SetScore(factor, score, false);
                }
            }

            foreach (DocumentSelectResultEntry entry in context.items)
            {
                entry.SumFactorScores();
            }

            return context;
        }
        /// <summary>
        /// Finds DS ranking files in the top level of <paramref name="folder"/> and loads each of them.
        /// </summary>
        /// <param name="folder">Folder queried through <c>GetOrFindFiles</c> with <c>SearchOption.TopDirectoryOnly</c>.</param>
        /// <param name="inputNames">Input names handed to <c>GetOrFindFiles</c>.</param>
        /// <param name="output">Log builder passed to <c>DocumentSelectResult.LoadFromFile</c>.</param>
        /// <param name="searchPattern">File search pattern handed to <c>GetOrFindFiles</c>.</param>
        /// <returns>One loaded result per file path, in the order the paths were returned.</returns>
        public static List<DocumentSelectResult> LoadDSRankings(folderNode folder, String inputNames, ILogBuilder output, String searchPattern = "DS_*_ranking.xml")
        {
            List<string> rankingFiles = folder.GetOrFindFiles(inputNames, searchPattern, SearchOption.TopDirectoryOnly);

            List<DocumentSelectResult> loaded = new List<DocumentSelectResult>();

            foreach (string path in rankingFiles)
            {
                loaded.Add(DocumentSelectResult.LoadFromFile(path, output));
            }

            return loaded;
        }
        /// <summary>
        /// Returns the statistic selected by <c>functionName</c> for the entry's term statistics.
        /// </summary>
        /// <remarks>
        /// The statistics of the entry are recalculated (all tasks) on every call before the value is read.
        /// <c>Ordinal</c> is computed as the size of <c>assignedIDs</c> minus the position of the entry's id in it.
        /// Any function name without a dedicated case yields the same value as <c>Count</c>.
        /// </remarks>
        /// <param name="entry">Entry whose <c>AssignedID</c> selects the statistics in <c>statsByAssignedID</c>.</param>
        /// <param name="context">The context (not referenced by this implementation).</param>
        /// <param name="log">The log (not referenced by this implementation).</param>
        /// <returns>The selected statistic as a double.</returns>
        public override Double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            var stats = statsByAssignedID[entry.AssignedID];

            stats.reCalculate(instanceCountCollection<string>.preCalculateTasks.all);

            switch (functionName)
            {
                case ScoreModelMetricFactorEnum.varianceFreq:
                    return stats.varianceFreq;

                case ScoreModelMetricFactorEnum.TotalScore:
                    return stats.TotalScore;

                case ScoreModelMetricFactorEnum.standardDeviation:
                    return stats.standardDeviation;

                case ScoreModelMetricFactorEnum.entropyFreq:
                    return stats.entropyFreq;

                case ScoreModelMetricFactorEnum.avgFreq:
                    return stats.avgFreq;

                case ScoreModelMetricFactorEnum.Ordinal:
                    return assignedIDs.Count - assignedIDs.IndexOf(entry.AssignedID);

                case ScoreModelMetricFactorEnum.Count:
                default:
                    return stats.Count;
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Fuses several document score sets into one result, by ranking fusion or by score fusion.
        /// </summary>
        /// <param name="scoreSet">The score sets to fuse; enumerated once and buffered.</param>
        /// <param name="operation">Operation passed to score fusion; also used in the name of the output.</param>
        /// <param name="doRankingFusion">if set to <c>true</c>, <c>rankFusion</c> is used; otherwise <c>ScoreFusion</c>.</param>
        /// <param name="doDomainNormalization">if set to <c>true</c>, the items of every set are first normalized within their domain (in place).</param>
        /// <param name="log">The log.</param>
        /// <returns>A new result holding the fused entries; its description lists the names of all fused sets.</returns>
        public static DocumentSelectResult Fusion(this IEnumerable<DocumentSelectResult> scoreSet, operation operation, Boolean doRankingFusion, Boolean doDomainNormalization, ILogBuilder log)
        {
            // Buffer the input: it is counted and traversed several times below, and the
            // source may be a lazily evaluated sequence.
            List<DocumentSelectResult> sets = scoreSet.ToList();

            if (doDomainNormalization)
            {
                log.log("Performing domain-level normalization over [" + sets.Count + "] document score sets");
                foreach (DocumentSelectResult set in sets)
                {
                    DocumentRankingExtensions.NormalizeWithinDomain(set.items, log);
                }
            }

            List<DocumentSelectResultEntry> fusioned = null;

            if (doRankingFusion)
            {
                log.log("Ranking fusion over [" + sets.Count + "] document score sets");

                fusioned = rankFusion(sets.Select(x => x.items), log);
            }
            else
            {
                log.log("Score fusion over [" + sets.Count + "] document score sets");

                fusioned = ScoreFusion(sets.Select(x => x.items), operation, log);
            }

            DocumentSelectResult output = new DocumentSelectResult();

            output.name        = "ScoreFusionBy" + operation.ToString();
            output.description = "Sets fusioned: ";

            foreach (var s in sets)
            {
                // Append each name; a plain assignment here kept only the last set name.
                output.description += s.name + " ";
            }

            output.items.AddRange(fusioned);

            return output;
        }
        /// <summary>
        /// Scores the entry from its feature vector in <c>vectorDictionary</c>, according to the <c>computation</c> flags.
        /// </summary>
        /// <remarks>
        /// Exactly one base computation is used, checked in this order: offset, variance, distance; when none of
        /// these flags is set, the first dimension of the vector is taken. The <c>inverse</c> flag negates the result.
        /// A NaN result is replaced by 0.
        /// </remarks>
        /// <param name="entry">Entry whose <c>DomainID</c> and <c>AssignedID</c> select the vector.</param>
        /// <param name="context">The context (not referenced by this implementation).</param>
        /// <param name="log">Receives a message when no vector is found for the entry.</param>
        /// <returns>The computed score; 0 when the vector is missing or the result is NaN.</returns>
        public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            FeatureVectorWithLabelID fv = vectorDictionary.Get(entry.DomainID, entry.AssignedID);

            if (fv == null)
            {
                log.log("Can't find vector dictionary entry for [" + entry.DomainID + "]>[" + entry.AssignedID + "]");
                return 0;
            }

            Double sc = 0;

            if (computation.HasFlag(ScoreComputationModeEnum.offset))
            {
                sc = fv.CompressByTrueDimension(fv.labelID);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.variance))
            {
                sc = fv.dimensions.GetVarianceCoefficient();
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.distance))
            {
                sc = fv.CompressNumericVector(imbSCI.Core.enums.operation.max);
            }
            else
            {
                sc = fv.dimensions[0];
            }

            if (computation.HasFlag(ScoreComputationModeEnum.inverse))
            {
                sc = -sc;
            }

            // "sc == Double.NaN" is always false, so NaN has to be detected with IsNaN.
            if (Double.IsNaN(sc))
            {
                sc = 0;
            }

            return sc;
        }
Exemplo n.º 16
0
        /// <summary>
        /// Precomputes, for every domain in the context, the ID matrix of its graph and the ranks of the selected algorithm.
        /// </summary>
        /// <remarks>
        /// The matrix of each domain is stored in <c>p_matrix</c>. For HITS, the computed <c>HITSRank</c> goes to
        /// <c>p_hits</c>. For PageRank, the computed values are multiplied by <c>scoreUnit</c>, converted with
        /// <c>Convert.ToInt32</c>, mapped through the matrix and stored in <c>p_rank</c>.
        /// </remarks>
        /// <param name="context">Selection context supplying the entries by domain and <c>domainNameToGraph</c>.</param>
        /// <param name="log">Log builder passed to <c>GetByDomain</c>.</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            foreach (var domainEntries in context.GetByDomain(log))
            {
                var domainKey = domainEntries.Key;

                WebSiteGraph graph = context.domainNameToGraph[domainKey];

                var idMatrix = graph.GetIDMatrix(scoreUnit);
                p_matrix.Add(domainKey, idMatrix);

                switch (algorithm)
                {
                    case GraphFactorAlgorithm.HITS:
                        HITSRank hitsRank = new HITSRank();
                        hitsRank.recalculate(idMatrix, convergence, steps);
                        p_hits.Add(domainKey, hitsRank);
                        break;

                    case GraphFactorAlgorithm.PageRank:
                        var rankComputer = new PageRank(idMatrix.GetMatrix(), alpha, convergence, steps);
                        double[] rawRanks = rankComputer.ComputePageRank();

                        // Scale every raw value by scoreUnit and store it as an integer.
                        List<Int32> scaledRanks = new List<Int32>();
                        for (int i = 0; i < rawRanks.Length; i++)
                        {
                            scaledRanks.Add(Convert.ToInt32(rawRanks[i] * scoreUnit));
                        }

                        Dictionary<String, Int32> ranks = idMatrix.MapToX(scaledRanks);
                        p_rank.Add(domainKey, ranks);
                        break;
                }
            }
        }
        /// <summary>
        /// Loads a saved DS ranking from <paramref name="filepath"/> and evaluates it with <c>EvaluateDSRanking</c>.
        /// </summary>
        /// <param name="filepath">Path of the ranking file; surrounding whitespace is trimmed. A null, empty or whitespace-only path is rejected.</param>
        /// <param name="logger">Receives the rejection messages; also passed to the loader and to the evaluation.</param>
        /// <param name="minDiversity">The minimum diversity, passed to <c>EvaluateDSRanking</c>.</param>
        /// <returns><c>false</c> when no path is given or the file does not exist; otherwise the result of <c>EvaluateDSRanking</c>.</returns>
        public static Boolean EvaluateSavedDSRanking(String filepath, ILogBuilder logger, Double minDiversity = 0.01)
        {
            // Validate before trimming: calling Trim() on a null path would throw
            // before the "no filepath" case could be reported.
            if (String.IsNullOrWhiteSpace(filepath))
            {
                logger.log("EvaluateSavedDSRanking -- no filepath specified");
                return false;
            }

            filepath = filepath.Trim();

            if (!File.Exists(filepath))
            {
                logger.log("Ranking scores not found at [" + filepath + "]");
                return false;
            }

            DocumentSelectResult ds_loaded = DocumentSelectResult.LoadFromFile(filepath, logger);

            return EvaluateDSRanking(ds_loaded, logger, filepath, minDiversity);
        }
        /// <summary>
        /// Prepares the factor by building term frequency statistics for every entry in the context.
        /// </summary>
        /// <remarks>
        /// Entries flagged as space documents contribute each token with its token frequency. Otherwise, entries
        /// flagged as text documents are tokenized and each token (stemmed when <c>useStems</c> is set) is added
        /// as a single instance. Entries with neither flag get an empty statistics collection.
        /// </remarks>
        /// <param name="context">Selection context whose <c>items</c> are processed; its stemming context is used when <c>useStems</c> is set.</param>
        /// <param name="log">The log (not referenced by this implementation).</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            // Both registries are rebuilt together. Clearing only the statistics left the ids of a
            // previous call in assignedIDs, which inflated its count on repeated preparation.
            statsByAssignedID.Clear();
            assignedIDs.Clear();

            foreach (DocumentSelectResultEntry docEntry in context.items)
            {
                instanceCountCollection<string> ft = new instanceCountCollection<string>();

                if (docEntry.type.HasFlag(DocumentSelectEntryType.spaceDocument))
                {
                    SpaceDocumentModel document = docEntry.spaceDocument;
                    foreach (var term in document.terms.GetTokens())
                    {
                        ft.AddInstance(term, document.terms.GetTokenFrequency(term));
                    }
                }
                else if (docEntry.type.HasFlag(DocumentSelectEntryType.textDocument))
                {
                    String content = docEntry.textDocument.content;

                    List<String> tkns = content.getTokens(true, true, true, false, 4);

                    foreach (String tkn in tkns)
                    {
                        String stem = tkn;
                        if (useStems)
                        {
                            stem = context.stemmingContext.Stem(tkn);
                        }
                        ft.AddInstance(stem);
                    }
                }

                statsByAssignedID.Add(docEntry.AssignedID, ft);

                assignedIDs.Add(docEntry.AssignedID);
            }
        }
        /// <summary>
        /// Decides whether a DS ranking is usable, based on the distinct scores of its items.
        /// </summary>
        /// <remarks>
        /// The ranking is refused, with a log message, when its distinct scores contain NaN, when there are fewer
        /// than two distinct scores, or when the ratio of distinct scores to items is below <paramref name="minDiversity"/>.
        /// </remarks>
        /// <param name="ds_loaded">The ranking to evaluate.</param>
        /// <param name="logger">Receives the acceptance or refusal message.</param>
        /// <param name="filepath">Label used in log messages; when empty, the name of <paramref name="ds_loaded"/> is used.</param>
        /// <param name="minDiversity">Minimum accepted ratio of distinct scores to items.</param>
        /// <returns><c>true</c> when the ranking is accepted, otherwise <c>false</c>.</returns>
        public static Boolean EvaluateDSRanking(DocumentSelectResult ds_loaded, ILogBuilder logger, String filepath = "", Double minDiversity = 0.01)
        {
            // The name of the ranking stands in as label when no path was given.
            String label = filepath == "" ? ds_loaded.name : filepath;
            String prefix = "Ranking scores [" + label + "]";

            var distinctScores = ds_loaded.items.GetDistinctScores();
            Int32 distinctCount = distinctScores.Count();

            if (distinctScores.Contains(Double.NaN))
            {
                logger.log(prefix + " is refused as it contains NaN entries");
                return false;
            }

            if (distinctCount < 2)
            {
                logger.log(prefix + " is refused as it contains [" + distinctCount + "] distinct values");
                return false;
            }

            Double diversity = distinctCount.GetRatio(ds_loaded.items.Count());

            if (diversity < minDiversity)
            {
                logger.log(prefix + " is refused for having [" + diversity.ToString("F5") + "] below criterion [" + minDiversity.ToString("F2") + "]");
                return false;
            }

            logger.log(prefix + " accepted d=[" + diversity.ToString("F5") + "] c=[" + distinctScores.Count + "] |e|=[" + ds_loaded.items.Count + "]");
            return true;
        }
Exemplo n.º 20
0
        /// <summary>
        /// Returns the graph rank of the entry for the configured <c>algorithm</c>, looked up by
        /// domain and assigned ID in <c>p_hits</c> or <c>p_rank</c>.
        /// </summary>
        /// <param name="entry">The entry whose <c>DomainID</c> and <c>AssignedID</c> are used for the lookup.</param>
        /// <param name="context">The context (not used by this implementation).</param>
        /// <param name="log">The log (not used by this implementation).</param>
        /// <returns>The rank found for the entry, or 0 when the domain or the entry is not registered.</returns>
        public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
        {
            Double score = 0;

            switch (algorithm)
            {
            case GraphFactorAlgorithm.HITS:
                if (p_hits.ContainsKey(entry.DomainID))
                {
                    HITSRank hits = p_hits[entry.DomainID];

                    if (hits.ContainsKey(entry.AssignedID))
                    {
                        score = hits[entry.AssignedID] * scoreUnit;
                    }
                }

                break;

            case GraphFactorAlgorithm.PageRank:

                // Guard the domain lookup the same way the HITS branch does, so an unknown
                // domain yields score 0 instead of a KeyNotFoundException.
                if (p_rank.ContainsKey(entry.DomainID))
                {
                    var ranks = p_rank[entry.DomainID];

                    if (ranks.ContainsKey(entry.AssignedID))
                    {
                        // NOTE(review): unlike HITS, this value is not multiplied by scoreUnit - confirm that is intended
                        score = ranks[entry.AssignedID];
                    }
                }

                break;
            }

            return(score);
        }
 /// <summary>
 /// Returns the entries of the selection result arranged per domain, by forwarding
 /// <c>context.items</c> to the <c>GetByDomain</c> overload for entry collections.
 /// </summary>
 /// <param name="context">The selection result whose items are arranged.</param>
 /// <param name="log">The log, passed through to the collection overload.</param>
 /// <returns>Dictionary of entry lists, as produced by the collection overload.</returns>
 public static Dictionary <String, List <DocumentSelectResultEntry> > GetByDomain(this DocumentSelectResult context, ILogBuilder log)
     => context.items.GetByDomain(log);
Exemplo n.º 22
0
        /// <summary>
        /// The graph registry
        /// </summary>
        // protected Dictionary<String, WebSiteGraph> GraphRegistry = new Dictionary<string, WebSiteGraph>();

        /// <summary>
        /// Prepares the specified context. This override has an empty body: nothing is computed or stored here.
        /// </summary>
        /// <param name="context">The context (not used by this implementation).</param>
        /// <param name="log">The log (not used by this implementation).</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
        }
        /// <summary>
        /// Builds nested dictionaries [category][domain][assignedID] that hold the space document model of each entry.
        /// </summary>
        /// <param name="context">The selection result whose entries are projected.</param>
        /// <param name="log">The log, passed on to <c>GetByCategoryDomainAssignedID</c>.</param>
        /// <returns>The same nesting as <c>GetByCategoryDomainAssignedID</c>, with every entry replaced by its <c>spaceDocument</c>.</returns>
        public static Dictionary <String, Dictionary <String, Dictionary <String, SpaceDocumentModel> > > GetModelsByCategoryDomainAssignedID(this DocumentSelectResult context, ILogBuilder log)
        {
            var entriesByCategory = context.GetByCategoryDomainAssignedID(log);
            var modelsByCategory  = new Dictionary <String, Dictionary <String, Dictionary <String, SpaceDocumentModel> > >();

            foreach (var category in entriesByCategory)
            {
                // Each level is registered in its parent first and then filled in place
                var modelsByDomain = new Dictionary <String, Dictionary <String, SpaceDocumentModel> >();
                modelsByCategory.Add(category.Key, modelsByDomain);

                foreach (var domain in category.Value)
                {
                    var modelsByPage = new Dictionary <String, SpaceDocumentModel>();
                    modelsByDomain.Add(domain.Key, modelsByPage);

                    foreach (var page in domain.Value)
                    {
                        modelsByPage.Add(page.Key, page.Value.spaceDocument);
                    }
                }
            }

            return modelsByCategory;
        }
 /// <summary>
 /// Prepares this instance for the given selection context. Must be implemented by derived classes.
 /// </summary>
 /// <param name="context">The selection context.</param>
 /// <param name="log">The log.</param>
 public abstract void Prepare(DocumentSelectResult context, ILogBuilder log);
 /// <summary>
 /// Computes the score for the given entry. Must be implemented by derived classes.
 /// </summary>
 /// <param name="entry">The entry to score.</param>
 /// <param name="context">The selection context.</param>
 /// <param name="log">The log.</param>
 /// <returns>The score of the entry.</returns>
 public abstract Double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log);
        /// <summary>
        /// Arranges the entries of the selection result into nested dictionaries: [category][domain][assignedID].
        /// Categories are the distinct names of <c>context.spaceModel.LabelToDocumentLinks</c>; the entries of a
        /// category are the documents linked to its label, resolved by document name through <c>GetByAssignedID</c>.
        /// </summary>
        /// <param name="context">The selection result providing the entries and the space model.</param>
        /// <param name="log">The log, passed on to the helper extensions.</param>
        /// <returns>The nested dictionaries; every label has an entry, even when nothing is linked to it.</returns>
        public static Dictionary <String, Dictionary <String, Dictionary <String, DocumentSelectResultEntry> > > GetByCategoryDomainAssignedID(this DocumentSelectResult context, ILogBuilder log)
        {
            var result = new Dictionary <String, Dictionary <String, Dictionary <String, DocumentSelectResultEntry> > >();

            var entryByAssignedID = context.GetByAssignedID(log);

            foreach (String label in context.spaceModel.LabelToDocumentLinks.GetAllDistinctNames())
            {
                var sitesOfLabel = new Dictionary <String, Dictionary <String, DocumentSelectResultEntry> >();
                result.Add(label, sitesOfLabel);

                // Resolve every document linked to the label to its selection entry
                var entriesOfLabel = new List <DocumentSelectResultEntry>();

                foreach (SpaceDocumentModel linkedDocument in context.spaceModel.LabelToDocumentLinks.GetAllLinkedB(label))
                {
                    entriesOfLabel.Add(entryByAssignedID[linkedDocument.name]);
                }

                foreach (var site in entriesOfLabel.GetByDomain(log))
                {
                    var pagesOfSite = new Dictionary <String, DocumentSelectResultEntry>();
                    sitesOfLabel.Add(site.Key, pagesOfSite);

                    foreach (var page in site.Value)
                    {
                        pagesOfSite.Add(page.AssignedID, page);
                    }
                }
            }

            return result;
        }
        //public static Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> GetByCategoryDomainAssignedID (this IEnumerable<DocumentSelectResultEntry> entries, SpaceModel model, ILogBuilder log)
        //{
        //    Dictionary<String, Dictionary<String, Dictionary<String, DocumentSelectResultEntry>>> output = new Dictionary<string, Dictionary<string, Dictionary<string, DocumentSelectResultEntry>>>();


        //    var byDomain = entries.GetByDomain(log);

        //    var byCategoryAssignedID = GetByAssignIDCategory()

        //}

        /// <summary>
        /// Resolves the assigned IDs listed per category in <paramref name="catIndex"/> to their entries,
        /// keeping the category grouping.
        /// </summary>
        /// <param name="context">The selection result whose entries are looked up by assigned ID.</param>
        /// <param name="catIndex">Category name mapped to the assigned IDs that belong to it.</param>
        /// <param name="log">The log, passed on to <c>GetByAssignedID</c>.</param>
        /// <returns>Category name mapped to the entries resolved for its assigned IDs, in the listed order.</returns>
        public static Dictionary <String, List <DocumentSelectResultEntry> > GetByAssignIDCategory(this DocumentSelectResult context, Dictionary <string, List <string> > catIndex, ILogBuilder log)
        {
            var entriesByCategory = new Dictionary <String, List <DocumentSelectResultEntry> >();

            Dictionary <String, DocumentSelectResultEntry> entryByAssignedID = context.GetByAssignedID(log);

            foreach (var category in catIndex)
            {
                var members = new List <DocumentSelectResultEntry>();
                entriesByCategory.Add(category.Key, members);

                foreach (var assignedID in category.Value)
                {
                    // The indexer throws KeyNotFoundException for an ID that has no entry
                    members.Add(entryByAssignedID[assignedID]);
                }
            }

            return entriesByCategory;
        }
Exemplo n.º 28
0
        /// <summary>
        /// Creates a <see cref="DocumentSelectResult"/> from the operation context: copies the stemming context,
        /// space model, folder and selected features, registers the graph of every web site, creates one entry
        /// per web site document and, when a ranking method is given, prepares its model.
        /// </summary>
        /// <param name="context">The operation context supplying the space model, selected features and web sites.</param>
        /// <param name="ranking">Optional ranking method; when not null its signature, query and description are copied and its model is prepared.</param>
        /// <param name="folder">The folder assigned to the created selection context.</param>
        /// <param name="log">The log.</param>
        /// <returns>The populated selection context.</returns>
        public static DocumentSelectResult PrepareContext(this OperationContext context, DocumentRankingMethod ranking, folderNode folder, ILogBuilder log)
        {
            DocumentSelectResult selectContext = new DocumentSelectResult();

            selectContext.stemmingContext = context.stemmContext;
            selectContext.spaceModel      = context.spaceModel;
            selectContext.folder          = folder;

            // Assigned before the description is built, so the description reports the
            // feature set that is actually used (previously it was read before assignment).
            selectContext.selectedFeatures = context.SelectedFeatures;

            if (ranking != null)
            {
                selectContext.name  = ranking.model.GetSignature();
                selectContext.query = ranking.query;

                builderForText builder = new builderForText();
                ranking.Describe(builder);

                builder.AppendLine("Selected features [" + selectContext.selectedFeatures?.description + "].");

                selectContext.description = builder.GetContent().Replace(Environment.NewLine, "");
            }

            foreach (KeyValuePair <string, WebSiteDocuments> pair in context.webSiteByDomain)
            {
                selectContext.domainNameToGraph.Add(pair.Key, pair.Value?.extensions?.graph);

                // The graph lookup above already tolerates a missing web site; do the same for its documents
                if (pair.Value == null)
                {
                    log.log("Web site [" + pair.Key + "] has no documents - skipped");
                    continue;
                }

                string dn = pair.Value.domain;

                foreach (WebSiteDocument doc in pair.Value.documents)
                {
                    DocumentSelectResultEntry entry = new DocumentSelectResultEntry();

                    // No text document is resolved here; the entry is created with a null text document
                    TextDocument text = null;

                    string err = "";

                    SpaceDocumentModel spaceDocument = context.spaceModel.documents.FirstOrDefault(x => x.name == doc.AssignedID);

                    if (spaceDocument == null)
                    {
                        err += "Failed to find space model document for [" + doc.AssignedID + "]";
                    }

                    entry.SetEntry(dn, doc, spaceDocument, text);

                    if (!entry.HasTextOrSpaceModel)
                    {
                        log.log(err);
                    }

                    selectContext.items.Add(entry);
                }
            }

            // PREPARATION OF MODEL
            if (ranking != null)
            {
                ranking.model.Prepare(selectContext, log);
            }
            return(selectContext);
        }
        /// <summary>
        /// Loads (when not already set) and deploys the term weight model, loads its stored data or prepares it
        /// from the space model, and builds <c>vectorDictionary</c> for the configured <c>computation</c> mode.
        /// </summary>
        /// <param name="context">The selection context supplying the folder and the space model.</param>
        /// <param name="log">The log.</param>
        /// <exception cref="ArgumentException">No term weight model is set and none could be loaded from the model definition file.</exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            String p_m = "";

            String p_d = "";

            // Null-safe: the null/empty check below shows that an unset definition file is an expected state
            modelDefinitionFile = modelDefinitionFile?.Replace("*", "");

            if (!modelDefinitionFile.isNullOrEmpty())
            {
                p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
                p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);
            }

            if (TermWeightModel == null)
            {
                log.log("Loading model from [" + p_m + "]");

                if (File.Exists(p_m))
                {
                    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
                }
            }

            // Fail with a descriptive error instead of a NullReferenceException on Deploy
            if (TermWeightModel == null)
            {
                String msg = "Error: Failed to load term weight model from [" + p_m + "]";
                throw new ArgumentException(msg, nameof(context));
            }

            TermWeightModel.Deploy(log);

            if (File.Exists(p_d) && UseModelData)
            {
                log.log("Loading model data from [" + p_d + "]");

                var dataset = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);

                TermWeightModel.LoadModelDataSet(dataset, log);
            }
            else
            {
                log.log("Preparing model ...");
                TermWeightModel.PrepareTheModel(context.spaceModel, log);
            }

            // The first matching flag wins; when no flag matches, vectorDictionary is left as it was
            if (computation.HasFlag(ScoreComputationModeEnum.category))
            {
                vectorDictionary = context.TransformToFVDictionaryAsCategorySimilarity(TermWeightModel, function, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.site))
            {
                vectorDictionary = context.TransformToFVDictionaryAsSiteSimilarity(TermWeightModel, function, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pageDivergence))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.site, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfCategory))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.category, log);
            }
            else if (computation.HasFlag(ScoreComputationModeEnum.pagesOfDataset))
            {
                vectorDictionary = context.TransformToFVDictionaryAsPageSimilarity(TermWeightModel, function, ScoreComputationModeEnum.dataset, log);
            }

            log.log("Category similarity ready ... [" + computation.ToString() + "]");
        }
        /// <summary>
        /// Loads the score dictionary and, when <c>useMachineLearning</c> is set, prepares the term weight model,
        /// selects the feature terms, builds a labeled feature vector for every context item that has a score
        /// label, and trains the classifier on those vectors.
        /// </summary>
        /// <param name="context">The selection context supplying the folder, space model, selected features and items.</param>
        /// <param name="log">The log.</param>
        /// <exception cref="ArgumentException">
        /// The score dictionary was not found, the term weight model could not be loaded,
        /// or the context has no space model.
        /// </exception>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            scoreDictionary = FeatureVectorDictionaryWithDimensions.LoadFile(context.folder, dictionaryFile, log);

            if (scoreDictionary == null)
            {
                String msg = "Error: Failed to find score dictionary [" + dictionaryFile + "] in " + context.folder.path;
                throw new ArgumentException(msg, nameof(context));
            }

            if (useMachineLearning)
            {
                #region --------------- PREPARING TERM WEIGHT MODEL

                String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
                String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

                if (TermWeightModel == null)
                {
                    TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
                }

                // Fail with a descriptive error instead of a NullReferenceException on Deploy
                if (TermWeightModel == null)
                {
                    String msg = "Error: Failed to load term weight model from [" + p_m + "]";
                    throw new ArgumentException(msg, nameof(context));
                }

                TermWeightModel.Deploy(log);

                if (context.spaceModel == null)
                {
                    String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
                    throw new ArgumentException(msg, nameof(context));
                }

                if (File.Exists(p_d) && useStoredData)
                {
                    WeightingModelDataSet data = objectSerialization.loadObjectFromXML <WeightingModelDataSet>(p_d, log);
                    TermWeightModel.LoadModelDataSet(data, log);

                    if (useSelectedFeatures)
                    {
                        SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
                    }
                }
                else
                {
                    TermWeightModel.PrepareTheModel(context.spaceModel, log);
                }

                // Fall back to the features of the context when no own selection is available (unset or empty)
                if (SelectedTerms == null || SelectedTerms.Count == 0)
                {
                    SelectedTerms = context.selectedFeatures;
                }

                List <String> sel_tkns = new List <String>();

                if (SelectedTerms != null)
                {
                    sel_tkns.AddRange(SelectedTerms.index.Values.Select(x => x.name));
                }

                // Last resort: use the tokens of terms_known_label from the space model
                if (!sel_tkns.Any())
                {
                    sel_tkns.AddRange(context.spaceModel.terms_known_label.GetTokens());
                }

                #endregion

                fvConstructor.Deploy(featureMethod.constructor, sel_tkns);

                classifier = featureMethod.classifierSettings.GetClassifier();

                sc_id = scoreDictionary.GetVectorsWithLabelID(null, criterion).ToNameVsLabelID();

                // Only items that have a label in sc_id become training vectors
                List <FeatureVectorWithLabelID> trainingSet = new List <FeatureVectorWithLabelID>();
                foreach (var item in context.items)
                {
                    if (sc_id.ContainsKey(item.AssignedID))
                    {
                        WeightDictionary dc_vec = TermWeightModel.GetWeights(sel_tkns, item.spaceDocument, context.spaceModel);

                        var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, item.AssignedID);

                        FeatureVectorWithLabelID id_vec = new FeatureVectorWithLabelID(n_vec, sc_id[item.AssignedID]);

                        trainingSet.Add(id_vec);
                    }
                }

                // Report the number of vectors actually handed to the classifier, not the number of labels
                log.log("Training [" + classifier.name + "] with [" + trainingSet.Count + "] feature vectors.");
                classifier.DoTraining(trainingSet, log);
            }
        }