Code Example #1
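processPOSEnhanced builds a lemmaSemanticCloud from chunk and term tables: it runs the primary-term extraction loop to convergence, collects the relevant terms, qualifies the remaining chunk lemmas by part of speech, and delegates to BuildCloud.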
        /// <summary>
        /// Builds a semantic cloud using POS-enhanced processing: primary terms are
        /// extracted iteratively until convergence, then the remaining chunk lemmas
        /// are qualified by part of speech before the cloud is built.
        /// </summary>
        /// <param name="chunkTable">Lemma table with multi-word chunks.</param>
        /// <param name="termTable">Lemma table with single terms.</param>
        /// <param name="output">Cloud to populate; a new instance is created when null.</param>
        /// <param name="logger">Log builder receiving progress messages.</param>
        /// <param name="subjects">Site subjects the construct is built for.</param>
        /// <param name="resolver">Resolver used to look up lexic units and their POS tags.</param>
        /// <returns>The populated semantic cloud.</returns>
        protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List <pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            List <webLemmaTerm> allChunks = chunkTable.GetList();

            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }



            // PRIMARY TERM EXTRACTION: iterate until NextIteration signals convergence
            // (OptimizationDone) or the iteration limit is exceeded
            lemmaSemanticConstruct c  = new lemmaSemanticConstruct(subjects);
            lemmaSemanticConstruct cl = new lemmaSemanticConstruct(subjects);

            while (c != cl)
            {
                c  = cl;
                cl = lemmaSemanticConstruct.NextIteration(cl, resolver, allChunks, settings, subjects, logger);

                if (cl.createdInIteration > settings.primaryTermOptimizationIterationLimit)
                {
                    c = cl;
                    break;
                }

                if (cl.OptimizationDone)
                {
                    break;
                }
            }

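            // Accept the last iteration's construct as the converged result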
            c = cl;



            // Collect the relevant terms from the converged construct

            c.CollectRelevantTerms(settings.doReserveTermsForClass);

            if (!c.isCaseCloud)
            {
                c.LogConstruct(logger);
            }



            // Qualify lemmas of chunks that contain a relevant term: nouns and
            // adjectives become reserve terms, the rest go to the trash bin

            var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

            foreach (webLemmaTerm chunk in docSetFreq)
            {
                var           lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                List <String> l_out  = new List <string>();
                foreach (String lm in lemmas)
                {
                    if (c.NotProcessed(lm))
                    {
                        var lu = resolver.GetLexicUnit(lm, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
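                            // Keep nouns and adjectives; all other tags go to the trash bin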
                            if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                            {
                                c.ReserveTerms.AddUnique(lm);
                                l_out.Add(lm);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                        }
                    }
                    else
                    {
                        if (!c.TrashBin.Contains(lm))
                        {
                            l_out.Add(lm);
                        }
                    }
                }


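                // Only chunks with at least two qualified lemmas form a node group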
                if (l_out.Count > 1)
                {
                    l_out.Sort((x, y) => String.CompareOrdinal(x, y));

                    c.lemmasList.Add(l_out);

                    c.weightDict.Add(l_out, chunk);

                    c.nodeNames.AddRange(l_out, true);
                }
            }

            return(BuildCloud(c, chunkTable, termTable, output, logger, resolver));
        }
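The convergence loop above relies on reference identity: NextIteration hands back a brand-new construct while optimization is still in progress, and an already-existing instance (flagged OptimizationDone) once a result is accepted. The following is a minimal, self-contained sketch of that fixed-point pattern with a toy state type standing in for lemmaSemanticConstruct; every name in it is illustrative, not part of the library API.

        using System;

        // Toy stand-in for lemmaSemanticConstruct (illustrative only)
        class IterationState
        {
            public int Value;               // stands in for PrimaryTerms.Count
            public int CreatedInIteration;  // iteration counter, as in the original
            public bool OptimizationDone;   // convergence flag, as in the original
        }

        static class FixedPointDemo
        {
            // Returns the SAME instance once the target is reached, so the
            // caller's reference comparison (c != cl) detects convergence.
            static IterationState NextIteration(IterationState last)
            {
                if (last.Value <= 3)
                {
                    last.OptimizationDone = true;
                    return last;
                }
                return new IterationState
                {
                    Value = last.Value / 2,
                    CreatedInIteration = last.CreatedInIteration + 1
                };
            }

            static void Main()
            {
                IterationState c  = new IterationState { Value = 100 };
                IterationState cl = new IterationState { Value = 100 };

                while (c != cl)  // reference comparison, as in processPOSEnhanced
                {
                    c  = cl;
                    cl = NextIteration(cl);

                    if (cl.CreatedInIteration > 10) { c = cl; break; }  // iteration cap
                    if (cl.OptimizationDone) { break; }
                }

                c = cl;
                Console.WriteLine("Converged at " + c.Value + " after " + c.CreatedInIteration + " iterations");
            }
        }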
Code Example #2
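NextIteration performs a single optimization step for the loop above: it selects the on-top chunks, extracts primary (noun) and secondary (adjective) terms, fills the reserve and trash collections, and decides whether the primary-term count has converged on the configured target.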
        public static lemmaSemanticConstruct NextIteration(lemmaSemanticConstruct lastIteration, ITextResourceResolver resolver, List <webLemmaTerm> allChunks, cloudConstructorSettings settings, List <pipelineTaskMCSiteSubject> subjects, ILogBuilder logger)
        {

            var c = new lemmaSemanticConstruct(subjects);

            c.createdInIteration = lastIteration.createdInIteration + 1;
            c.PTCountMin         = Math.Min(lastIteration.PTCountMin, lastIteration.PrimaryTerms.Count);
            c.PTCountMax         = Math.Max(lastIteration.PTCountMax, lastIteration.PrimaryTerms.Count);

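            // The document-set frequency threshold rises with each iteration, so fewer chunks stay on top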
            if (!c.isCaseCloud)
            {
                c.onTopChunks.AddRange(allChunks.Where(x => x.documentSetFrequency > (settings.documentSetFreqLowLimit + lastIteration.createdInIteration)));
            }
            else
            {
                if (!settings.doFactorToCaseClouds)
                {
                    c.OptimizationDone = true;
                }
                c.onTopChunks = allChunks;
            }

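            // Primary/secondary term extraction runs only for non-case clouds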
            if (!c.isCaseCloud)
            {
                instanceCountCollection <String> lemmaCounter     = new instanceCountCollection <string>();
                List <List <String> >            primaryLemmaList = new List <List <String> >();

                foreach (webLemmaTerm chunk in c.onTopChunks)
                {
                    var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                    lemmaCounter.AddInstanceRange(lemmas);
                }

                lemmaCounter.reCalculate();

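                // Keep lemmas that occur in more than one chunk (unless every lemma is unique)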
                foreach (String st in lemmaCounter)
                {
                    if (lemmaCounter.maxFreq == 1 || lemmaCounter[st] > 1)
                    {
                        var lu = resolver.GetLexicUnit(st, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(st);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                            if (tg.Contains(pos_type.N))
                            {
                                c.PrimaryTerms.AddUnique(st);
                            }
                            else if (tg.Contains(pos_type.A))
                            {
                                c.SecondaryTerms.AddUnique(st);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(st);
                            }
                        }
                    }
                }
                // Primary terms extracted

                if (c.PrimaryTerms.Count == 0)
                {
                    if (c.SecondaryTerms.Any())
                    {
                        logger.log(":: Moving Adjective terms [" + c.SecondaryTerms.Count + "] to Primary Terms category, as no Nouns were qualified to the cateogry");
                        c.PrimaryTerms.AddRange(c.SecondaryTerms);
                        c.SecondaryTerms.Clear();
                    }
                }
            }

            instanceCountCollection <String> secondCounter = new instanceCountCollection <string>();

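            // Count every lemma across all chunks for the reserve-term frequency gate below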
            foreach (webLemmaTerm chunk in allChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                secondCounter.AddInstanceRange(lemmas);
            }

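            // Chunks containing a primary term contribute secondary terms; the rest contribute reserve terms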
            foreach (webLemmaTerm chunk in allChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);

                if (lemmas.ContainsAny(c.PrimaryTerms))
                {
                    if (c.onTopChunks.Contains(chunk))
                    {
                        c.primaryChunks.Add(chunk);
                    }
                    else
                    {
                        c.secondaryChunks.Add(chunk);
                    }

                    foreach (String lm in lemmas)
                    {
                        if (c.NotProcessed(lm))
                        {
                            var lu = resolver.GetLexicUnit(lm, logger);
                            if (lu == null)
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                            else
                            {
                                var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                                if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                                {
                                    c.SecondaryTerms.AddUnique(lm);
                                }
                                else
                                {
                                    c.TrashBin.AddUnique(lm);
                                }
                            }
                        }
                    }
                }
                else
                {
                    foreach (String lm in lemmas)
                    {
                        if (secondCounter[lm] > settings.termInChunkLowerLimit)
                        {
                            if (c.NotProcessed(lm))
                            {
                                var lu = resolver.GetLexicUnit(lm, logger);
                                if (lu == null)
                                {
                                    c.TrashBin.AddUnique(lm);
                                }
                                else
                                {
                                    var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                                    if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                                    {
                                        c.ReserveTerms.AddUnique(lm);
                                    }
                                    else
                                    {
                                        c.TrashBin.AddUnique(lm);
                                    }
                                }
                            }
                        }
                        else
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                    }
                }
            }

            if (c.OptimizationDone)
            {
                return(c);
            }

            c.PTCountMin = Math.Min(lastIteration.PTCountMin, c.PrimaryTerms.Count);
            c.PTCountMax = Math.Max(lastIteration.PTCountMax, c.PrimaryTerms.Count);

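            // Convergence test: accept once the primary-term count drops to the configured target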
            if (c.PrimaryTerms.Count <= settings.primaryTermLowTargetCount)
            {
                if (lastIteration.PrimaryTerms.Count < c.PrimaryTerms.Count)
                {
                    logger.log("[" +
                               c.createdInIteration.ToString("D3") + "] PrimaryTerms count [" + c.PrimaryTerms.Count + "] after [" + c.createdInIteration + "] iterations optimized ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
                }
                else
                {
                    logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "]  --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "]  T:" + Thread.CurrentThread.Name);

                    logger.log("[" +
                               c.createdInIteration.ToString("D3") + "] previous PrimaryTerms count [" + lastIteration.PrimaryTerms.Count + "] accepted, after [" + c.createdInIteration + "]  ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
                    c = lastIteration;
                }

                c.OptimizationDone = true;
            }
            else
            {
                logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "]  --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "]  T:" + Thread.CurrentThread.Name);
            }

            return(c);
        }
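The POS gating repeated throughout NextIteration reduces to one triage rule: a lemma whose lexic unit resolves to a noun tag becomes a primary term, an adjective becomes a secondary term, and anything unresolvable or otherwise tagged lands in the trash bin, with adjectives promoted to primary when no noun qualifies. Below is a condensed, self-contained sketch of that rule; Resolve is a hypothetical stand-in for resolver.GetLexicUnit plus the GetTagFromGramTags lookup, and the sample lemmas are invented.

        using System;
        using System.Collections.Generic;
        using System.Linq;

        static class TriageDemo
        {
            enum PosType { None, N, A, V }

            // Hypothetical stand-in for the lexicon lookup; returns null for
            // lemmas the resolver cannot resolve (illustrative data only)
            static PosType? Resolve(string lemma)
            {
                switch (lemma)
                {
                    case "engine": return PosType.N;
                    case "fast":   return PosType.A;
                    case "run":    return PosType.V;
                    default:       return null;
                }
            }

            static void Main()
            {
                var primary   = new HashSet<string>();
                var secondary = new HashSet<string>();
                var trashBin  = new HashSet<string>();

                foreach (string lemma in new[] { "engine", "fast", "run", "xyzzy" })
                {
                    PosType? tag = Resolve(lemma);
                    if (tag == PosType.N)      primary.Add(lemma);    // nouns -> primary
                    else if (tag == PosType.A) secondary.Add(lemma);  // adjectives -> secondary
                    else                       trashBin.Add(lemma);   // unresolved or other tags
                }

                // Fallback from NextIteration: promote adjectives when no noun qualified
                if (primary.Count == 0 && secondary.Any())
                {
                    primary.UnionWith(secondary);
                    secondary.Clear();
                }

                Console.WriteLine("Primary:   " + string.Join(", ", primary));
                Console.WriteLine("Secondary: " + string.Join(", ", secondary));
                Console.WriteLine("Trash:     " + string.Join(", ", trashBin));
            }
        }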