コード例 #1
0
        /// <summary>
        /// Builds the ID matrix of every domain's web site graph and stores the ranking
        /// produced by the configured <c>algorithm</c> for that domain.
        /// </summary>
        /// <remarks>
        /// For each domain the matrix is added to <c>p_matrix</c>. With the HITS algorithm the
        /// recalculated <c>HITSRank</c> is added to <c>p_hits</c>; with PageRank the scores are
        /// multiplied by <c>scoreUnit</c>, converted to integers, mapped through the matrix and
        /// added to <c>p_rank</c>.
        /// </remarks>
        /// <param name="context">Selection result supplying the per-domain grouping and the <c>domainNameToGraph</c> lookup.</param>
        /// <param name="log">Log builder passed on to <c>GetByDomain</c>.</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            var domainGroups = context.GetByDomain(log);

            foreach (var domainGroup in domainGroups)
            {
                var domainName = domainGroup.Key;

                WebSiteGraph graph = context.domainNameToGraph[domainName];
                var idMatrix = graph.GetIDMatrix(scoreUnit);
                p_matrix.Add(domainName, idMatrix);

                if (algorithm == GraphFactorAlgorithm.HITS)
                {
                    HITSRank hitsRank = new HITSRank();
                    hitsRank.recalculate(idMatrix, convergence, steps);
                    p_hits.Add(domainName, hitsRank);
                }
                else if (algorithm == GraphFactorAlgorithm.PageRank)
                {
                    var ranker = new PageRank(idMatrix.GetMatrix(), alpha, convergence, steps);
                    double[] rawScores = ranker.ComputePageRank();

                    // Scale each raw score by scoreUnit and convert it to an integer.
                    List<Int32> scaledScores = new List<Int32>();
                    for (Int32 n = 0; n < rawScores.Length; n++)
                    {
                        scaledScores.Add(Convert.ToInt32(rawScores[n] * scoreUnit));
                    }

                    Dictionary<String, Int32> ranks = idMatrix.MapToX(scaledScores);
                    p_rank.Add(domainName, ranks);
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds, for every document of every group, a feature vector whose dimensions are the
        /// similarities between that document and each other document of the same group.
        /// </summary>
        /// <remarks>
        /// Computed similarities are cached per unordered pair of document IDs, so
        /// <paramref name="function"/> is treated as symmetric: a value computed for (a, b) is reused for (b, a).
        /// </remarks>
        /// <param name="context">Selection result providing the documents, the selected features and the space model.</param>
        /// <param name="TermWeightModel">Weight model used to build one weight dictionary per document over the selected features.</param>
        /// <param name="function">Similarity function applied to pairs of document weight dictionaries.</param>
        /// <param name="groupmode">Grouping of documents: <c>category</c>, <c>site</c>, or <c>dataset</c> (all documents in one group named "dataset").</param>
        /// <param name="log">Log builder receiving progress messages.</param>
        /// <returns>Feature vector sets keyed by group name; each vector has one dimension per other document in its group.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="groupmode"/> is not one of the supported grouping modes.</exception>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
        {
            List<string> selectedTerms = context.selectedFeatures.GetKeys();

            // Weight dictionary of every document, keyed by the document's assigned ID.
            Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

            foreach (var entry in context.items)
            {
                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
                documentDictionarties.Add(entry.AssignedID, documentWeights);
            }

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // Progress is reported every p processed documents. Clamped to 1 because
            // Count / 10 is 0 for fewer than 10 documents, which made "i % p" divide by zero.
            Int32 p = Math.Max(1, context.Count / 10);

            Dictionary<string, List<DocumentSelectResultEntry>> relative_groups = null;

            if (groupmode == ScoreComputationModeEnum.category)
            {
                Dictionary<string, List<string>> assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);

                relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);

                // NOTE(review): the UNKNOWN label is removed only after the groups were built from
                // assignIDByLabel, so it presumably has no effect on relative_groups. Confirm whether
                // the removal was meant to happen before GetByAssignIDCategory.
                if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
                {
                    assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
                }
                log.log("... Page Similarity ... Groups by category");
            }
            else if (groupmode == ScoreComputationModeEnum.site)
            {
                relative_groups = context.GetByDomain(log);
                log.log("... Page Similarity ... Groups by site");
            }
            else if (groupmode == ScoreComputationModeEnum.dataset)
            {
                relative_groups = new Dictionary<string, List<DocumentSelectResultEntry>>();
                relative_groups.Add("dataset", context.items);
                log.log("... Page Similarity ... dataset");
            }
            else
            {
                // Previously this fell through with relative_groups == null and failed
                // with a NullReferenceException at the loop below.
                throw new ArgumentOutOfRangeException("groupmode", groupmode, "Unsupported grouping mode for page similarity.");
            }

            // Cache of similarities keyed by the ordinally ordered pair of document IDs.
            // A pair key (instead of concatenated IDs) cannot collide for different ID pairs.
            ConcurrentDictionary<Tuple<String, String>, Double> computedPairs = new ConcurrentDictionary<Tuple<String, String>, Double>();

            foreach (var domainPair in relative_groups)
            {
                List<DocumentSelectResultEntry> relatives = domainPair.Value;

                for (Int32 hostIndex = 0; hostIndex < relatives.Count; hostIndex++)
                {
                    i++;

                    // Local copies so the parallel lambda does not capture the mutable loop variable.
                    Int32 hostInd = hostIndex;
                    String hostId = relatives[hostInd].AssignedID;

                    FeatureVector fv = new FeatureVector(hostId);

                    // One dimension per other document of the group (the host itself is skipped).
                    fv.dimensions = new double[relatives.Count - 1];

                    Parallel.For(0, relatives.Count, otherIndex =>
                    {
                        String otherId = relatives[otherIndex].AssignedID;

                        if (otherId == hostId)
                        {
                            return;
                        }

                        // Positions after the host shift down by one because the host has no slot.
                        Int32 ind = otherIndex >= hostInd ? otherIndex - 1 : otherIndex;

                        Tuple<String, String> key = String.CompareOrdinal(hostId, otherId) <= 0
                            ? Tuple.Create(hostId, otherId)
                            : Tuple.Create(otherId, hostId);

                        Double docToClassSimilarity;
                        if (!computedPairs.TryGetValue(key, out docToClassSimilarity))
                        {
                            var vecA = documentDictionarties[otherId];
                            var vecB = documentDictionarties[hostId];
                            docToClassSimilarity = computedPairs.GetOrAdd(key, function.ComputeSimilarity(vecA, vecB));
                        }

                        fv.dimensions[ind] = docToClassSimilarity;
                    });

                    if (i % p == 0)
                    {
                        log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                    }

                    dict.GetOrAdd(domainPair.Key).Add(fv, -1);
                }
            }

            log.log("... Preparation finished ...");

            return dict;
        }
コード例 #3
0
        ///// <summary>
        ///// Transforms to fv dictionary.
        ///// </summary>
        ///// <param name="context">The context.</param>
        ///// <param name="TermWeightModel">The term weight model.</param>
        ///// <param name="function">The function.</param>
        ///// <returns></returns>
        //public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageInCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        //{
        //    log.log("... Page Similarity ...");

        //    List<string> selectedTerms = context.selectedFeatures.GetKeys();



        //    var ByDomain = context.GetByDomain(log);

        //    Dictionary<string, List<string>> assignIDByLabel = context.featureSpace.labelToDocumentAssociations.GetAllRelationShipByName(true);

        //    var ByCategory = context.GetByAssignIDCategory(assignIDByLabel,log);

        //    Dictionary<String, List<DocumentSelectResultEntry>> EntryByLabel = new Dictionary<string, List<DocumentSelectResultEntry>>();



        //    Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();


        //    foreach (var entry in context.items)
        //    {

        //        WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
        //        documentDictionarties.Add(entry.AssignedID, documentWeights);
        //    }


        //    FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();



        //    Double total = context.Count;
        //    Int32 i = 0;
        //    Int32 p = (context.Count / 10);

        //    //List<List<Double>> matrix = new List<List<double>>();

        //    //foreach (var entry in context.items)
        //    //{
        //    //    matrix.Add(new List<double>());
        //    //}


        //    //for (int x = 0; x < context.items.Count; x++)
        //    //{

        //    //    for (int y = 0; y < context.items.Count; x++)
        //    //    {



        //    //    }

        //    //}

        //    ConcurrentDictionary<String, Double> computedPairs = new ConcurrentDictionary<string, double>();


        //    foreach (var domainPair in ByCategory)
        //    {
        //        List<DocumentSelectResultEntry> relatives = ByCategory[domainPair.Key].ToList();


        //        foreach (var entry in relatives)
        //        {

        //            i++;
        //            FeatureVector fv = new FeatureVector(entry.AssignedID);

        //            // List<Double> d = new List<>();

        //            fv.dimensions = new double[relatives.Count - 1];


        //            // List<String> keys = documentDictionarties.Keys.ToList();

        //            Int32 hostInd = relatives.IndexOf(entry);

        //            Int32 c = 0;


        //            //foreach (var pair in documentDictionarties)
        //            //{

        //            Parallel.ForEach(relatives, (pair) =>
        //            {

        //                Int32 ind = relatives.IndexOf(pair); // keys.IndexOf(pair.AssignedID);
        //                if (ind >= hostInd)
        //                {
        //                    ind = ind - 1;
        //                }

        //                if (pair.AssignedID != entry.AssignedID)
        //                {
        //                    Double docToClassSimilarity = 0;

        //                    if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
        //                    }
        //                    else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                    {
        //                        docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
        //                    }
        //                    else
        //                    {
        //                        var vecA = documentDictionarties[pair.AssignedID];
        //                        var vecB = documentDictionarties[entry.AssignedID];
        //                        docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
        //                        if (docToClassSimilarity > 0)
        //                        {

        //                        }
        //                        if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
        //                            //computedPairs.AddOrUpdate(entry.AssignedID + pair.Key, docToClassSimilarity);
        //                        }
        //                        else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
        //                        {
        //                            computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
        //                        }

        //                    }

        //                    fv.dimensions[ind] = docToClassSimilarity;

        //                }
        //            });



        //            Int32 r = i % p;
        //            if (r == 0)
        //            {
        //                log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
        //            }


        //            dict.GetOrAdd(entry.DomainID).Add(fv, -1);
        //        }



        //    }


        //    //foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
        //    //{
        //    //    pair.Value.CloseDeploy();
        //    //}

        //    log.log("... Preparation finished ...");

        //    return dict;


        //}



        /// <summary>
        /// Builds, for every document, a feature vector whose first dimension is the similarity
        /// between the document and the aggregated model of the site (domain) it belongs to.
        /// </summary>
        /// <remarks>
        /// For each domain a <c>SpaceDocumentModel</c> is assembled from the domain's documents,
        /// flattened, and weighted with <paramref name="TermWeightModel"/>; each document's own
        /// weights are then compared against the site weights using <paramref name="function"/>.
        /// Every resulting vector set has <c>CloseDeploy</c> called on it before the dictionary is returned.
        /// </remarks>
        /// <param name="context">Selection result providing the documents, the selected features and the space model.</param>
        /// <param name="TermWeightModel">Weight model used to build document-level and site-level weight dictionaries.</param>
        /// <param name="function">Similarity function applied to the site weights and the document weights.</param>
        /// <param name="log">Log builder receiving progress messages.</param>
        /// <returns>Feature vector sets keyed by domain name.</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Site Similarity ...");

            List<string> selectedTerms = context.selectedFeatures.GetKeys();

            // Weight dictionary of every document, keyed by the document's assigned ID.
            Dictionary<String, WeightDictionary> documentDictionarties = new Dictionary<string, WeightDictionary>();

            var byDomain = context.GetByDomain(log);

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32 i = 0;

            // Progress step. Clamped to 1 because Count / 10 is 0 for fewer than 10 documents,
            // which made "i % p" throw DivideByZeroException.
            // NOTE(review): i counts domains while the ratio below is taken against the document
            // count (context.Count); confirm which total the progress message should use.
            Int32 p = Math.Max(1, context.Count / 10);

            foreach (var pair in byDomain)
            {
                i++;
                SpaceDocumentModel siteModel = new SpaceDocumentModel();

                // First pass: weight every document and collect it into the site model.
                foreach (var ent in pair.Value)
                {
                    WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
                    documentDictionarties.Add(ent.AssignedID, documentWeights);
                    siteModel.Children.Add(ent.spaceDocument);
                }

                siteModel.Flatten(false);

                WeightDictionary siteWeights = TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel);

                // Second pass: compare each document against the complete site model.
                foreach (var ent in pair.Value)
                {
                    FeatureVector fv = new FeatureVector(ent.AssignedID);

                    // NOTE(review): the array is sized by the label count but only index 0 is
                    // written; this throws if the space model has no labels. Confirm intent.
                    fv.dimensions = new double[context.spaceModel.labels.Count];

                    var docToClassSimilarity = function.ComputeSimilarity(siteWeights, documentDictionarties[ent.AssignedID]);

                    fv.dimensions[0] = docToClassSimilarity;

                    dict.GetOrAdd(pair.Key).Add(fv, -1);
                }

                if (i % p == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }
            }

            foreach (KeyValuePair<string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation finished ...");

            return dict;
        }
コード例 #4
0
        /// <summary>
        /// Applies the query's normalization, score threshold and size limits to the selection result.
        /// </summary>
        /// <remarks>
        /// Steps, in order: optional per-domain score normalization; sorting of the entries by
        /// descending score; removal of entries scoring below <c>TrasholdLimit</c> (when it is non-zero);
        /// and, when <c>SizeLimit</c> is positive, truncation either per domain or over the whole result.
        /// With <c>ForceHomePage</c> the per-domain entry returned by <c>GetDocWithShortestID</c> is
        /// always kept and counts towards the domain's limit.
        /// </remarks>
        /// <param name="query">Query whose options, <c>TrasholdLimit</c> and <c>SizeLimit</c> drive the filtering.</param>
        /// <param name="context">Selection result that is modified in place.</param>
        /// <param name="log">The log.</param>
        /// <returns>The same <paramref name="context"/> instance, after filtering.</returns>
        public static DocumentSelectResult ExecuteLimit(this DocumentSelectQuery query, DocumentSelectResult context, ILogBuilder log)
        {
            if (query.options.HasFlag(DocumentSelectQueryOptions.DomainLevelNormalization))
            {
                log.log("DS Scores normalized on website / domain level");
                context.items.NormalizeWithinDomain(log);
            }

            // QUERY LIMITS
            // Re-populate the list in descending score order (OrderByDescending is a stable sort).
            List<DocumentSelectResultEntry> sortedList = context.items.OrderByDescending(x => x.score).ToList();
            context.items.Clear();
            context.items.AddRange(sortedList);

            // A threshold of exactly 0.0 means "no threshold".
            if (query.TrasholdLimit != 0.0)
            {
                // Single pass instead of collecting the entries and removing them one by one.
                context.items.RemoveAll(entry => entry.score < query.TrasholdLimit);
            }

            if (query.SizeLimit > 0)
            {
                if (query.options.HasFlag(DocumentSelectQueryOptions.ApplyDomainLevelLimits))
                {
                    List<DocumentSelectResultEntry> overLimit = new List<DocumentSelectResultEntry>();

                    var byDomain = context.GetByDomain(log);

                    foreach (var pair in byDomain)
                    {
                        Int32 count = 0;
                        List<DocumentSelectResultEntry> domainSortedList = pair.Value.OrderByDescending(x => x.score).ToList();

                        if (query.options.HasFlag(DocumentSelectQueryOptions.ForceHomePage))
                        {
                            // Taking the home entry out of the candidate list protects it from
                            // removal; it still occupies one slot of the domain's limit.
                            DocumentSelectResultEntry homeEntry = domainSortedList.GetDocWithShortestID();
                            if (domainSortedList.Remove(homeEntry))
                            {
                                count++;
                            }
                        }

                        foreach (DocumentSelectResultEntry entry in domainSortedList)
                        {
                            if (count >= query.SizeLimit)
                            {
                                overLimit.Add(entry);
                            }
                            count++;
                        }
                    }

                    foreach (DocumentSelectResultEntry entry in overLimit)
                    {
                        context.Remove(entry);
                    }
                }
                else if (context.Count > query.SizeLimit)
                {
                    context.RemoveRange(query.SizeLimit, context.Count - query.SizeLimit);
                }
            }

            return context;
        }