Ejemplo n.º 1
0
        /// <summary>
        /// Builds a semantic cloud from the given tables with the algorithm selected by <c>settings.algorithm</c>,
        /// then runs the configured cloud weaver over the result.
        /// </summary>
        /// <param name="chunkTable">The chunk table, forwarded to the selected algorithm.</param>
        /// <param name="termTable">The term table; its <c>name</c> becomes the <c>className</c> of a newly created cloud.</param>
        /// <param name="output">The cloud to populate. When <c>null</c>, a new cloud is created.</param>
        /// <param name="logger">The logger, forwarded to the algorithm and to the cloud weaver.</param>
        /// <param name="subjects">The subjects, forwarded to the selected algorithm.</param>
        /// <param name="resolver">The resolver, forwarded to the complex and the alternative algorithm.</param>
        /// <returns>The cloud, with <c>weaverReport</c> assigned and its index rebuilt.</returns>
        /// <exception cref="aceScienceException">Raised by the construction step when <c>assignChunkTableWeightToLink</c> is enabled,
        /// <c>doSumExistingLinkWeights</c> is disabled and two lemmas are found to be linked already.</exception>
        public lemmaSemanticCloud process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud { className = termTable.name };
            }

            var algorithm = settings.algorithm;

            if (algorithm == cloudConstructorAlgorithm.complex)
            {
                output = processPOSEnhanced(chunkTable, termTable, output, logger, subjects, resolver);
            }
            else if (algorithm == cloudConstructorAlgorithm.standard)
            {
                output = processStandard(chunkTable, termTable, output, logger, subjects);
            }
            else if (algorithm == cloudConstructorAlgorithm.alternative)
            {
                output = processAlternative(chunkTable, termTable, output, logger, subjects, resolver);
            }
            // any other algorithm value leaves the cloud as it was passed in

            // the index is rebuilt both before the weaver runs and after it has processed the cloud
            output.RebuildIndex();
            output.weaverReport = settings.cloudWeaver.Process(output, logger);
            output.RebuildIndex();

            return output;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Standard construction: every chunk is split into lemmas, each lemma becomes a node and
        /// the first lemma (in ordinal order) of a chunk is linked to every other lemma of the same chunk.
        /// </summary>
        /// <param name="chunkTable">The chunk table providing the chunks.</param>
        /// <param name="termTable">The term table, used to resolve node weights when <c>assignTermTableWeightToNode</c> is set.</param>
        /// <param name="output">The cloud to populate. When <c>null</c>, a new cloud is created.</param>
        /// <param name="logger">The logger, passed to the term weight resolution.</param>
        /// <param name="subjects">The subjects; the document set frequency filter is applied only when there is more than one.</param>
        /// <returns>The populated cloud.</returns>
        /// <exception cref="aceScienceException">Raised when a link already exists while <c>assignChunkTableWeightToLink</c> is enabled
        /// and <c>doSumExistingLinkWeights</c> is disabled.</exception>
        protected lemmaSemanticCloud processStandard(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }

            List<webLemmaTerm> chunks = chunkTable.GetList();

            // with a single subject every chunk is used, otherwise only those above the frequency limit
            IEnumerable<webLemmaTerm> candidates = chunks;
            if (subjects.Count > 1)
            {
                candidates = chunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
            }

            // keyed by list instance: the very same list objects are stored in lemmaGroups and used for lookup below
            var chunkByLemmas = new Dictionary<List<String>, webLemmaTerm>();
            var lemmaGroups   = new List<List<String>>();
            var names         = new List<String>();

            foreach (webLemmaTerm chunk in candidates)
            {
                // lemmas of two characters or less are ignored
                var parts = chunk.nominalForm.SplitSmart(" ", "", true, true).Where(x => x.Length > 2).ToList();

                if (parts.Count <= 1)
                {
                    continue;
                }

                parts.Sort(String.CompareOrdinal);

                lemmaGroups.Add(parts);
                chunkByLemmas.Add(parts, chunk);
                names.AddRange(parts, true);
            }

            // ---- nodes
            foreach (String nodeName in names)
            {
                Double nodeWeight = 1;
                if (settings.assignTermTableWeightToNode)
                {
                    nodeWeight = termTable.ResolveSingleTerm(nodeName, logger);
                }

                if (nodeWeight > 0)
                {
                    output.AddNode(nodeName, nodeWeight);
                }
            }

            // ---- links
            foreach (List<String> lemmaSet in lemmaGroups)
            {
                String anchor = lemmaSet[0];
                if (!output.ContainsNode(anchor, true))
                {
                    continue;
                }

                foreach (String other in lemmaSet)
                {
                    if (other == anchor || !output.ContainsNode(other, true))
                    {
                        continue;
                    }

                    Double linkWeight = 1;
                    if (settings.assignChunkTableWeightToLink)
                    {
                        linkWeight = chunkByLemmas[lemmaSet].weight;
                    }
                    else if (settings.doAdjustLinkWeightByChunkSize)
                    {
                        // NOTE(review): GetRatio is defined elsewhere; presumably receiver divided by argument - confirm
                        linkWeight = (lemmaSet.Count - 1).GetRatio(1);
                    }

                    var existing = output.GetLink(anchor, other);
                    if (existing == null)
                    {
                        output.AddLink(anchor, other, linkWeight);
                        continue;
                    }

                    if (settings.doSumExistingLinkWeights)
                    {
                        existing.weight += linkWeight;
                        continue;
                    }

                    // The existing link is kept as it is. With chunk table weights enabled that means the
                    // weight of the link depends on which chunk happened to be processed first.
                    if (settings.assignChunkTableWeightToLink)
                    {
                        throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" +
                                                      ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
                    }
                }
            }

            return output;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Common final stage of the construct-based algorithms: turns the node names and lemma lists collected in
        /// <paramref name="c"/> into nodes and links of <paramref name="output"/>.
        /// </summary>
        /// <param name="c">The construct holding node names, lemma lists, chunk weights, term groups and the trash bin.</param>
        /// <param name="chunkTable">The chunk table (not read by this method).</param>
        /// <param name="termTable">The term table, used for node weights when <c>assignTermTableWeightToNode</c> is set.</param>
        /// <param name="output">The cloud receiving the nodes, links and primary/secondary chunk names.</param>
        /// <param name="logger">The logger (not read by this method).</param>
        /// <param name="resolver">The resolver (not read by this method).</param>
        /// <returns>The same <paramref name="output"/> instance.</returns>
        /// <exception cref="aceScienceException">Raised when a link already exists while <c>assignChunkTableWeightToLink</c> is enabled
        /// and <c>doSumExistingLinkWeights</c> is disabled.</exception>
        protected lemmaSemanticCloud BuildCloud(lemmaSemanticConstruct c, webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, ITextResourceResolver resolver)
        {
            // trashed lemmas never become nodes
            foreach (String trashed in c.TrashBin)
            {
                c.nodeNames.Remove(trashed);
            }

            // ---- nodes
            foreach (String nodeName in c.nodeNames)
            {
                Double nodeWeight = 1;
                if (settings.assignTermTableWeightToNode)
                {
                    // a term missing from the table gets zero weight and is therefore skipped
                    nodeWeight = 0;
                    var entry = termTable[nodeName];
                    if (entry != null)
                    {
                        nodeWeight = entry.weight;
                    }
                }

                if (!(nodeWeight > 0))
                {
                    continue;
                }

                // case clouds and class clouds have separate switches but apply the factors the same way
                Boolean applyFactors = c.isCaseCloud ? settings.doFactorToCaseClouds : settings.doFactorToClassClouds;

                if (!applyFactors)
                {
                    output.AddNode(nodeName, nodeWeight);
                }
                else if (c.PrimaryTerms.Contains(nodeName))
                {
                    output.AddNode(nodeName, nodeWeight * settings.PrimaryTermWeightFactor, 2);
                }
                else if (c.SecondaryTerms.Contains(nodeName))
                {
                    output.AddNode(nodeName, nodeWeight * settings.SecondaryTermWeightFactor, 1);
                }
                else
                {
                    output.AddNode(nodeName, nodeWeight * settings.ReserveTermWeightFactor, 0);
                }
            }

            // ---- links: the first lemma of each list is linked to every other lemma of that list
            foreach (List<String> lemmaSet in c.lemmasList)
            {
                String anchor = lemmaSet[0];
                if (c.TrashBin.Contains(anchor))
                {
                    continue;
                }

                if (!output.ContainsNode(anchor, true))
                {
                    continue;
                }

                foreach (String other in lemmaSet)
                {
                    if (c.TrashBin.Contains(other) || other == anchor || !output.ContainsNode(other, true))
                    {
                        continue;
                    }

                    Double linkWeight = 1;
                    if (settings.assignChunkTableWeightToLink)
                    {
                        linkWeight = c.weightDict[lemmaSet].weight;
                    }
                    else if (settings.doAdjustLinkWeightByChunkSize)
                    {
                        // NOTE(review): GetRatio is defined elsewhere; presumably receiver divided by argument - confirm
                        linkWeight = (lemmaSet.Count - 1).GetRatio(1);
                    }

                    var existing = output.GetLink(anchor, other);
                    if (existing == null)
                    {
                        output.AddLink(anchor, other, linkWeight);
                        continue;
                    }

                    if (settings.doSumExistingLinkWeights)
                    {
                        existing.weight += linkWeight;
                        continue;
                    }

                    // The existing link is kept as it is. With chunk table weights enabled that means the
                    // weight of the link depends on which chunk happened to be processed first.
                    if (settings.assignChunkTableWeightToLink)
                    {
                        throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" +
                                                      ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
                    }
                }
            }

            c.primaryChunks.ForEach(chunk => output.primaryChunks.Add(chunk.nominalForm));
            c.secondaryChunks.ForEach(chunk => output.secondaryChunks.Add(chunk.nominalForm));

            return output;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Alternative construction: lemmas are ranked by how often they occur in the chunks, split into
        /// primary, secondary and reserve terms, and the cloud is then built from every chunk that contains a relevant term.
        /// </summary>
        /// <param name="chunkTable">The chunk table providing the chunks.</param>
        /// <param name="termTable">The term table, forwarded to <c>BuildCloud</c>.</param>
        /// <param name="output">The cloud to populate. When <c>null</c>, a new cloud is created.</param>
        /// <param name="logger">The logger, used to log the construct and forwarded to <c>BuildCloud</c>.</param>
        /// <param name="subjects">The subjects; the document set frequency filter is applied only when there is more than one.</param>
        /// <param name="resolver">The resolver, forwarded to <c>BuildCloud</c>.</param>
        /// <returns>The cloud returned by <c>BuildCloud</c>.</returns>
        protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }
            lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

            List<webLemmaTerm> allChunks = chunkTable.GetList();

            // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
            // with a single subject every chunk counts, otherwise only those above the frequency limit
            IEnumerable<webLemmaTerm> vipChunks = allChunks;
            if (subjects.Count > 1)
            {
                vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
            }

            instanceCountCollection<String> lemmaCounter = new instanceCountCollection<String>();

            foreach (webLemmaTerm chunk in vipChunks)
            {
                // lemmas of two characters or less are ignored
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();

                lemmaCounter.AddInstanceRange(lemmas);
            }

            c.RelevantTerms = lemmaCounter.getSorted();

            lemmaCounter.reCalculate();

            // terms at the maximum frequency are primary, terms above the minimum are secondary, the rest is reserve
            foreach (String term in c.RelevantTerms)
            {
                if (lemmaCounter[term] == lemmaCounter.maxFreq)
                {
                    c.PrimaryTerms.Add(term);
                }
                else if (lemmaCounter[term] > lemmaCounter.minFreq)
                {
                    c.SecondaryTerms.Add(term);
                }
                else
                {
                    c.ReserveTerms.Add(term);
                }
            }

            c.CollectRelevantTerms(settings.doReserveTermsForClass);
            c.LogConstruct(logger);

            // <--------------------------------- COLLECTING NODES AND LEMMA LISTS
            foreach (webLemmaTerm chunk in allChunks)
            {
                // the nominal form is split once per chunk; the result serves both the relevance test and the lemma list
                var parts = chunk.nominalForm.SplitSmart(" ", "", true, true);

                // the relevance test deliberately looks at all parts, including the short ones
                if (!c.RelevantTerms.Any(y => parts.Contains(y)))
                {
                    continue;
                }

                var lemmas = parts.Where(x => x.Length > 2).ToList();

                if (lemmas.Count > 1)
                {
                    lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
                    c.lemmasList.Add(lemmas);

                    c.weightDict.Add(lemmas, chunk);

                    c.nodeNames.AddRange(lemmas, true);
                }
            }

            return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// POS-enhanced construction: the construct is refined by repeated calls to
        /// <c>lemmaSemanticConstruct.NextIteration</c>, then the lemmas of every chunk containing a relevant term
        /// are filtered by their <c>pos_type</c> tags before the cloud is built.
        /// </summary>
        /// <param name="chunkTable">The chunk table providing the chunks.</param>
        /// <param name="termTable">The term table, forwarded to <c>BuildCloud</c>.</param>
        /// <param name="output">The cloud to populate. When <c>null</c>, a new cloud is created.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="subjects">The subjects, passed to the construct and to each iteration.</param>
        /// <param name="resolver">The resolver used to look up the lexic unit of a lemma.</param>
        /// <returns>The cloud returned by <c>BuildCloud</c>.</returns>
        protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            List<webLemmaTerm> chunks = chunkTable.GetList();

            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }

            // <----------------- PRIMARY TERM OPTIMIZATION
            // Iterates until NextIteration hands back the construct it was given, the iteration limit
            // is exceeded or the construct reports that optimization is done.
            lemmaSemanticConstruct previous = new lemmaSemanticConstruct(subjects);
            lemmaSemanticConstruct current  = new lemmaSemanticConstruct(subjects);

            while (previous != current)
            {
                previous = current;
                current  = lemmaSemanticConstruct.NextIteration(current, resolver, chunks, settings, subjects, logger);

                if (current.createdInIteration > settings.primaryTermOptimizationIterationLimit || current.OptimizationDone)
                {
                    break;
                }
            }

            // whatever ended the loop, the latest construct is the one that gets used
            lemmaSemanticConstruct construct = current;

            construct.CollectRelevantTerms(settings.doReserveTermsForClass);

            if (!construct.isCaseCloud)
            {
                construct.LogConstruct(logger);
            }

            // <----------------- COLLECTING NODES AND LEMMA LISTS
            var relevantChunks = chunks.Where(x => construct.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

            foreach (webLemmaTerm chunk in relevantChunks)
            {
                var parts = chunk.nominalForm.SplitSmart(" ", "", true, true);
                var kept  = new List<String>();

                foreach (String part in parts)
                {
                    if (!construct.NotProcessed(part))
                    {
                        // already classified: usable unless it ended up in the trash bin
                        if (!construct.TrashBin.Contains(part))
                        {
                            kept.Add(part);
                        }
                        continue;
                    }

                    var unit = resolver.GetLexicUnit(part, logger);

                    // accepted only with an N or A tag (presumably noun / adjective - confirm against pos_type)
                    Boolean accepted = false;
                    if (unit != null)
                    {
                        var tags = unit.GetTagFromGramTags<pos_type>(pos_type.none);
                        accepted = tags.ContainsAny(new pos_type[] { pos_type.N, pos_type.A });
                    }

                    if (accepted)
                    {
                        construct.ReserveTerms.AddUnique(part);
                        kept.Add(part);
                    }
                    else
                    {
                        // unknown to the resolver, or carrying none of the accepted tags
                        construct.TrashBin.AddUnique(part);
                    }
                }

                if (kept.Count <= 1)
                {
                    continue;
                }

                kept.Sort(String.CompareOrdinal);

                construct.lemmasList.Add(kept);
                construct.weightDict.Add(kept, chunk);
                construct.nodeNames.AddRange(kept, true);
            }

            return BuildCloud(construct, chunkTable, termTable, output, logger, resolver);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Returns a new cloud grown from the given lemmas: lemmas that are nodes of this cloud become the seed nodes,
        /// then the links of the newly reached terms are followed step by step.
        /// </summary>
        /// <remarks>
        /// The step limit is checked after each pass, so one expansion pass is performed whenever at least one lemma matched,
        /// even if <paramref name="expansionSteps"/> is less than 1.
        /// </remarks>
        /// <param name="lemmas">The lemmas to start from; those not contained in this cloud are ignored.</param>
        /// <param name="expansionSteps">The maximum number of expansion passes.</param>
        /// <param name="typeToMin">When a link already exists in the result: if <c>true</c> its type becomes the lower of the two types,
        /// otherwise the higher one. The value is also forwarded to <c>AddOrUpdateNode</c>.</param>
        /// <param name="options">Expansion options. <c>weightAsSemanticDistanceFromParent</c> makes the weight argument of <c>GetLinks</c>
        /// depend on the current step (<c>1.GetRatio(step)</c>) instead of being 1; <c>initialWeightFromParent</c> is forwarded to <c>GetLinks</c>;
        /// the whole value is forwarded to <c>AddOrUpdateNode</c>.</param>
        /// <returns>The expanded cloud, with checks re-enabled and index rebuilt.</returns>
        public lemmaSemanticCloud ExpandTermsToCloud(IEnumerable<String> lemmas, Int32 expansionSteps, Boolean typeToMin = true, lemmaExpansionOptions options = lemmaExpansionOptions.initialWeightFromParent | lemmaExpansionOptions.weightAsSemanticDistanceFromParent)
        {
            lemmaSemanticCloud output = new lemmaSemanticCloud();

            output.name         = name + "_subset_exp" + expansionSteps;
            output.DisableCheck = true;

            StringBuilder sb = new StringBuilder();
            sb.Append("Subset expanded from matched query lemmas [");

            List<String> nextTerms = new List<String>();

            // every term seen so far; only used for membership tests, hence a set
            HashSet<String> allTerms = new HashSet<String>();

            foreach (String t in lemmas)
            {
                if (ContainsNode(t))
                {
                    sb.Append(t).Append(" ");

                    var l = GetNode(t);

                    output.AddNode(l.name, l.weight, 0).distance = 1;
                    nextTerms.Add(t);
                    allTerms.Add(t);
                }
            }

            sb.Append("] using cloud [" + name + "]");
            output.description = sb.ToString();

            // the options do not change during the expansion
            Boolean weightFromParent   = options.HasFlag(lemmaExpansionOptions.initialWeightFromParent);
            Boolean distanceFromParent = options.HasFlag(lemmaExpansionOptions.weightAsSemanticDistanceFromParent);

            Int32 exp_i = 1;

            while (nextTerms.Any())
            {
                List<String> newNextTerms = new List<String>();

                foreach (String t in nextTerms)
                {
                    freeGraphNodeAndLinks links;

                    if (distanceFromParent)
                    {
                        links = GetLinks(t, true, false, 1.GetRatio(exp_i), exp_i, true, weightFromParent);
                    }
                    else
                    {
                        // covers weightAsSemanticDistanceThatIsSumOfLinkWeights as well: it uses the same call
                        links = GetLinks(t, true, false, 1, exp_i, true, weightFromParent);
                    }

                    foreach (freeGraphLink link in links)
                    {
                        // Add returns true only for a term not seen before - such terms are expanded in the next pass
                        if (allTerms.Add(link.nodeA.name))
                        {
                            newNextTerms.Add(link.nodeA.name);
                        }

                        if (link.nodeA.name != t)
                        {
                            output.AddOrUpdateNode(link.nodeA, link, links, typeToMin, options);
                        }

                        if (allTerms.Add(link.nodeB.name))
                        {
                            newNextTerms.Add(link.nodeB.name);
                        }

                        if (link.nodeB.name != t)
                        {
                            output.AddOrUpdateNode(link.nodeB, link, links, typeToMin, options);
                        }
                    }

                    foreach (freeGraphLink link in links)
                    {
                        if (!output.ContainsLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB))
                        {
                            // a new link gets a weight of at least 1
                            output.AddLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB, Math.Max(link.linkBase.weight, 1), link.linkBase.type);
                        }
                        else
                        {
                            // a link reached again accumulates weight and merges its type
                            var lnk = output.GetLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB);
                            lnk.weight += link.linkBase.weight;
                            if (typeToMin)
                            {
                                lnk.type = Math.Min(link.linkBase.type, lnk.type);
                            }
                            else
                            {
                                lnk.type = Math.Max(link.linkBase.type, lnk.type);
                            }
                        }
                    }
                }

                nextTerms = newNextTerms;
                exp_i++;
                if (exp_i > expansionSteps)
                {
                    break;
                }
            }

            output.DisableCheck = false;
            output.RebuildIndex();

            return output;
        }