/// <summary>
        /// Collects the chunk-level subjects found under <paramref name="streamSubject"/> and creates new chunk
        /// subjects from the token groups matched by each rule in <c>settings.rules</c>.
        /// </summary>
        /// <remarks>
        /// For every rule the stream is rendered with the rule's render mode, the rule's regex is run over the
        /// rendered text and the matches are mapped back to token groups. Tokens whose content level is not listed
        /// in <c>rule.contentLevel</c> are dropped from a group and empty groups are ignored. For each remaining
        /// group a new chunk subject and a new <c>imbMCChunk</c> are created, attached to the stream, and the
        /// group's tokens (and their <c>mcElement</c>s) are moved under them.
        /// </remarks>
        /// <param name="streamSubject">Subject to process; chunks are only created when its <c>mcElement</c> is an <c>imbMCDocumentElement</c>.</param>
        /// <returns>
        /// <paramref name="streamSubject"/> itself when it is a chunk, the chunk-level subjects returned by
        /// <c>pipelineSubjectTools.GetSubjectChildrenTokenType</c>, and every chunk subject created by this call.
        /// </returns>
        protected override List<pipelineTaskSubjectContentToken> processIteration(pipelineTaskSubjectContentToken streamSubject)
        {
            List<pipelineTaskSubjectContentToken> chunks = new List<pipelineTaskSubjectContentToken>();

            if (streamSubject.contentLevelType == cnt_level.mcChunk)
            {
                chunks.Add(streamSubject);
            }

            var existingChunks = pipelineSubjectTools.GetSubjectChildrenTokenType<pipelineTaskSubjectContentToken, IGraphNode>(streamSubject, new cnt_level[] { cnt_level.mcChunk }, true);
            chunks.AddRange(existingChunks);

            imbMCDocumentElement stream = streamSubject.mcElement as imbMCDocumentElement;
            if (stream == null)
            {
                // Without a document element there is nothing to attach new chunks to.
                return chunks;
            }

            subjectRenderLayers layers = new subjectRenderLayers();

            foreach (chunkMatchRule rule in settings.rules)
            {
                textMap<pipelineTaskSubjectContentToken> typeTagFormMap = layers.render(streamSubject, rule.renderMode);

                MatchCollection matches = rule.regex.Matches(typeTagFormMap.render);
                List<List<pipelineTaskSubjectContentToken>> matchedGroups = typeTagFormMap.Select(matches);

                // All groups are resolved before any token is moved, because the creation loop below
                // re-parents the tokens it consumes.
                List<List<pipelineTaskSubjectContentToken>> candidateGroups = new List<List<pipelineTaskSubjectContentToken>>();
                foreach (List<pipelineTaskSubjectContentToken> matchedGroup in matchedGroups)
                {
                    List<pipelineTaskSubjectContentToken> accepted = matchedGroup
                        .Where(m => rule.contentLevel.Contains(m.contentLevelType))
                        .ToList();

                    if (accepted.Any())
                    {
                        candidateGroups.Add(accepted);
                    }
                }

                foreach (List<pipelineTaskSubjectContentToken> mGroup in candidateGroups)
                {
                    // The flag criterion only vetoes a chunk when doCheckGramTagCriteria is enabled.
                    if (settings.doCheckGramTagCriteria && rule.flagTypesToMatch.Any() && !groupFlagsAgree(mGroup, rule))
                    {
                        continue;
                    }

                    // Chunk name: random 4-character prefix followed by the names of all member tokens.
                    String tkn = imbStringGenerators.getRandomString(4);
                    foreach (pipelineTaskSubjectContentToken s in mGroup)
                    {
                        tkn = tkn + s.name;
                    }

                    pipelineTaskSubjectContentToken chunkSubject = new pipelineTaskSubjectContentToken();
                    chunkSubject.name = tkn;

                    streamSubject.Add(chunkSubject);

                    imbMCChunk chunk = new imbMCChunk();
                    chunk.name = tkn;

                    chunkSubject.contentLevelType = cnt_level.mcChunk;
                    chunkSubject.flagBag.Add(rule.chunkType);
                    chunkSubject.mcElement = chunk;

                    chunk.htmlNode = stream.htmlNode;

                    // The chunk takes the position of its earliest member.
                    chunk.position = mGroup.Min(x => x.mcElement.position);

                    stream.Add(chunk);

                    List<Object> commonFlags = new List<object>();
                    List<Object> forbidenFlags = new List<object>();
                    Boolean isFirstSubject = true;

                    foreach (pipelineTaskSubjectContentToken s in mGroup)
                    {
                        // Move both the content element and the subject under the new chunk.
                        s.mcElement.removeFromParent();
                        chunk.Add(s.mcElement);

                        s.removeFromParent();
                        chunkSubject.Add(s);

                        if (isFirstSubject)
                        {
                            commonFlags.AddRange(s.flagBag, true);
                            isFirstSubject = false;
                            continue;
                        }

                        // NOTE(review): a flag reaches forbidenFlags only when it is absent from commonFlags,
                        // so the Remove branch appears never to remove anything and commonFlags ends up being
                        // the first subject's flags. Confirm whether an intersection was intended.
                        foreach (Object flag in s.flagBag)
                        {
                            if (forbidenFlags.Contains(flag))
                            {
                                commonFlags.Remove(flag);
                            }
                            else if (!commonFlags.Contains(flag))
                            {
                                forbidenFlags.AddUnique(flag);
                            }
                        }
                    }

                    chunkSubject.initialForm = chunkSubject.render(contentTokenSubjectRenderMode.currentForm).render;
                    var lemmaForm = chunkSubject.render(contentTokenSubjectRenderMode.lemmaForm);
                    chunkSubject.currentForm = lemmaForm.GetCleanRender();

                    chunk.content = chunkSubject.currentForm;

                    // NOTE(review): clearing the bag also drops rule.chunkType added above — confirm this is intended.
                    chunkSubject.flagBag.Clear();
                    chunkSubject.flagBag.AddRange(commonFlags, true);

                    chunks.Add(chunkSubject);
                }
            }

            return chunks;
        }

        /// <summary>
        /// Checks that, for every type in <c>rule.flagTypesToMatch</c>, all tokens of the group that carry a flag
        /// of that type carry an equal flag. Tokens without a flag of the type are ignored.
        /// </summary>
        /// <param name="group">Tokens that would form one chunk.</param>
        /// <param name="rule">Rule supplying the flag types to compare.</param>
        /// <returns><c>false</c> as soon as two tokens disagree on a flag of the same type; otherwise <c>true</c>.</returns>
        private static Boolean groupFlagsAgree(List<pipelineTaskSubjectContentToken> group, chunkMatchRule rule)
        {
            Dictionary<Type, Object> firstFlagByType = new Dictionary<Type, Object>();

            foreach (pipelineTaskSubjectContentToken s in group)
            {
                foreach (Type flagType in rule.flagTypesToMatch)
                {
                    Object fl = s.flagBag.FirstOrDefault(x => x.GetType() == flagType);
                    if (fl == null)
                    {
                        continue;
                    }

                    Object known;
                    if (!firstFlagByType.TryGetValue(flagType, out known))
                    {
                        firstFlagByType.Add(flagType, fl);
                        continue;
                    }

                    // Flags are held as Object, so '!=' would compare references and report two separately
                    // boxed but equal values as different; Object.Equals compares by value.
                    if (!Object.Equals(known, fl))
                    {
                        return false;
                    }
                }
            }

            return true;
        }
예제 #2
0
        /// <summary>
        /// Constructs the webLemmaTable: resolves the index forms held by <paramref name="counter"/> into lemmas
        /// and collects term / document frequencies for each accepted lemma.
        /// </summary>
        /// <param name="tableName">Name of the table; when not null or empty it is also assigned to <c>table.name</c>.</param>
        /// <param name="parser">Resolver asked for the lemma set of each index form.</param>
        /// <param name="counter">Supplies the index forms and the container of tokens for each inflected form.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="table">Table to fill; a new one is created when <c>null</c>. A table that already has items is returned unchanged.</param>
        /// <param name="forSingleWebSite">Passed on to <c>recompute</c> and reported in the "already has items" log message.</param>
        /// <returns>The supplied or newly created table.</returns>
        protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
        {
            if (table == null)
            {
                table = new webLemmaTermTable(tableName);
            }

            if (table.Count > 0)
            {
                logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return table;
            }

            List<String> tfdfList = counter.GetIndexForms();
            Int32 iterations = 0;
            Int32 consumedSinceReport = 0;
            Int32 limit = tfdfList.Count + 100;

            if (!tableName.isNullOrEmpty())
            {
                table.name = tableName;
            }

            List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

            Int32 startIndex = tfdfList.Count;
            Int32 cycleLength = startIndex / 5;

            while (tfdfList.Any())
            {
                String term = tfdfList.FirstOrDefault();
                Int32 consumed;

                if (term == null)
                {
                    // A null entry can never be resolved. Drop it so the loop keeps advancing instead of
                    // spinning on the same head element until the iteration limit aborts the build.
                    tfdfList.RemoveAt(0);
                    consumed = 1;
                }
                else
                {
                    Int32 countBefore = tfdfList.Count;

                    // The list is handed to the resolver; the shrinkage afterwards tells how many forms it consumed.
                    lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
                    consumed = countBefore - tfdfList.Count;

                    if (consumed == 0)
                    {
                        // Nothing was taken from the list: record the term as unresolved and remove it here.
                        table.unresolved.Add(term);
                        tfdfList.Remove(term);
                        consumed = 1;
                    }
                    else
                    {
                        Boolean ok = true;

                        if (settings.allowedLemmaTypes.Any())
                        {
                            var tps = inflectSet.GetTagsFromGramTags<pos_type>(pos_type.none);

                            // At least one tag must be allowed. Under the strict policy a set that also carries
                            // pos_type.V is rejected. (A former variant, which was commented out, required
                            // every tag to be allowed.)
                            ok = tps.ContainsAny(settings.allowedLemmaTypes)
                                 && !(settings.strictPosTypePolicy && tps.Contains(pos_type.V));
                        }

                        if (ok)
                        {
                            lemmas.Add(buildLemmaTerm(inflectSet, counter, logger));
                        }
                    }
                }

                iterations++;
                consumedSinceReport = consumedSinceReport + consumed;

                if (consumedSinceReport > cycleLength)
                {
                    Int32 processed = startIndex - tfdfList.Count;

                    consumedSinceReport = 0;
                    logger.AppendLine();
                    logger.log("TF-IDF processed: _" + processed.GetRatio(startIndex).ToString("P2") + "_");
                    logger.AppendLine();
                }

                if (iterations > limit)
                {
                    // Safety net against a list that never empties.
                    logger.log("Limit broken at processing WEB Lemma Frequency table at [" + iterations.ToString() + "]");
                    break;
                }
            }

            if (settings.doComputeTFIDF)
            {
                recompute(table, logger, forSingleWebSite, lemmas);
            }
            else
            {
                foreach (var le in lemmas)
                {
                    table.Add(le);
                }
            }

            //  table.ReadOnlyMode = true;

            return table;
        }

        /// <summary>
        /// Creates the <c>webLemmaTerm</c> for one resolved lemma set, accumulating weighted term frequency and
        /// the distinct documents / document sets of the tokens counted for each inflected form.
        /// </summary>
        /// <param name="inflectSet">Lemma set whose inflections are looked up in the counter.</param>
        /// <param name="counter">Supplies the token container for each inflected form.</param>
        /// <param name="logger">Receives a message for every token whose document set cannot be determined.</param>
        /// <returns>The populated lemma term.</returns>
        private webLemmaTerm buildLemmaTerm(lexicGraphSetWithLemma inflectSet, TFDFCounter counter, ILogBuilder logger)
        {
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();

            webLemmaTerm lemma = new webLemmaTerm();
            lemma.nominalForm = inflectSet.lemmaForm;
            lemma.name = inflectSet.lemmaForm;

            Double termFrequency = 0;

            foreach (lexicInflection inflect in inflectSet.Values)
            {
                TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
                if (cn == null)
                {
                    // Form was not counted anywhere; it is still remembered as an alternative form.
                    lemma.otherForms.AddUnique(inflect.inflectedForm);
                    continue;
                }

                lemma.AFreqPoints += cn.items.Count;

                foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                {
                    imbMCDocument document = cntPair.mcElement?.GetParentOfType<imbMCDocument>();
                    if (document != null)
                    {
                        // Only real documents count towards the document frequency.
                        documents.AddUnique(document);
                    }

                    imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                    if (docSet != null)
                    {
                        documentSet.AddUnique(docSet);
                    }
                    else
                    {
                        logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                    }

                    // Weight the occurrence by where it was found: link flag first, then title, else content.
                    if (cntPair.flagBag.Contains(cnt_containerType.link))
                    {
                        termFrequency += settings.anchorTextFactor;
                    }
                    else if (cntPair.flagBag.Contains(cnt_containerType.title))
                    {
                        termFrequency += settings.titleTextFactor;
                    }
                    else
                    {
                        termFrequency += settings.contentTextFactor;
                    }

                    cntPair.AddGraph(inflect);
                }

                lemma.otherForms.AddUnique(cn.indexForm);
            }

            lemma.documentSetFrequency = documentSet.Count;
            lemma.documentFrequency = documents.Count;
            lemma.termFrequency = termFrequency;

            return lemma;
        }