Esempio n. 1
0
        /// <summary>
        /// Builds the lemma term table from chunk-level subjects. Every subject whose content level is
        /// <c>cnt_level.mcChunk</c> is registered in a fresh <see cref="TFDFCounter"/> under its current form;
        /// the counter is then handed to the counter-based <c>process</c> overload.
        /// </summary>
        /// <param name="chunks">Subjects to scan. Each one is cast to <c>pipelineTaskSubjectContentToken</c>.</param>
        /// <param name="document_level">Not used by this overload.</param>
        /// <param name="table">The table forwarded to the counter-based overload.</param>
        /// <param name="parser">Not used by this overload.</param>
        /// <param name="logger">The logger forwarded to the counter-based overload.</param>
        /// <param name="forSingleWebSite">Forwarded unchanged to the counter-based overload.</param>
        /// <returns>The result of the counter-based <c>process</c> overload.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> chunks, cnt_level document_level, webLemmaTermTable table, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            TFDFCounter chunkCounter = new TFDFCounter();

            // Work on a private, ordinally sorted copy so the counter is fed in a fixed order.
            List <IPipelineTaskSubject> ordered = chunks.ToList();
            ordered.Sort((left, right) => String.CompareOrdinal(left.currentForm, right.currentForm));

            foreach (IPipelineTaskSubject subject in ordered)
            {
                pipelineTaskSubjectContentToken token = (pipelineTaskSubjectContentToken)subject;

                // Only chunk-level entries contribute to the counter.
                if (token.contentLevelType != cnt_level.mcChunk)
                {
                    continue;
                }

                chunkCounter.Add(token.currentForm, token);
            }

            return process(chunkCounter, logger, table, forSingleWebSite);
        }
Esempio n. 2
0
        /// <summary>
        /// Creates a <see cref="TFDFCounter"/> and registers in it, under its current form, every subject of
        /// <paramref name="source"/> whose content level is <c>cnt_level.mcTokenStream</c>.
        /// </summary>
        /// <param name="source">Subjects to scan. Each one is cast to <c>pipelineTaskSubjectContentToken</c>.</param>
        /// <returns>The populated counter.</returns>
        public TFDFCounter prepareCounter(IEnumerable <IPipelineTaskSubject> source)
        {
            TFDFCounter counter = new TFDFCounter();

            // Materialize once and sort ordinally by current form.
            var listChunks = source.ToList();

            listChunks.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

            // Iterate the sorted copy. Walking `source` here would discard the sort
            // and enumerate the incoming sequence a second time.
            foreach (pipelineTaskSubjectContentToken mcSubject in listChunks)
            {
                if (mcSubject.contentLevelType == cnt_level.mcTokenStream)
                {
                    counter.Add(mcSubject.currentForm, mcSubject);
                }
            }

            return(counter);
        }
Esempio n. 3
0
        /// <summary>
        /// Builds one <see cref="webLemmaTerm"/> for every non-null index form of <paramref name="counter"/> that
        /// has a container, then passes the collected terms to <c>recompute</c> together with
        /// <paramref name="table"/>.
        /// </summary>
        /// <remarks>
        /// For each term: <c>nominalForm</c> and <c>name</c> are the container's index form, <c>termFrequency</c>
        /// grows by 1 per container item (no weighting by container type is applied here), <c>AFreqPoints</c>
        /// is the item count, and <c>documentFrequency</c> / <c>documentSetFrequency</c> are the sizes of the
        /// document and document-set lists filled through <c>AddUnique</c>.
        /// </remarks>
        /// <param name="counter">The counter supplying index forms and their containers.</param>
        /// <param name="logger">
        /// Receives progress messages. May be <c>null</c>, in which case progress reporting is skipped;
        /// it is still handed to <c>recompute</c> as-is.
        /// </param>
        /// <param name="table">The table handed to <c>recompute</c> and returned.</param>
        /// <param name="forSingleWebSite">Passed on unchanged to <c>recompute</c>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
        {
            List <String> tfdfList = counter.GetIndexForms();

            tfdfList.Sort(String.CompareOrdinal);

            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 totalTerms      = tfdfList.Count;
            Int32 cycleLength     = totalTerms / 5;   // report each time more than a fifth of the list was handled
            Int32 processed       = 0;                // non-null terms handled so far
            Int32 sinceLastReport = 0;                // non-null terms handled since the last progress message

            foreach (String term in tfdfList)
            {
                if (term == null)
                {
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();

                    Double termFrequency = 0;

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name        = cn.indexForm;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        // Items without a resolvable parent document still count towards term frequency.
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        termFrequency += 1;
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints          = cn.items.Count();
                    lemma.documentFrequency    = documents.Count;
                    lemma.termFrequency        = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    // The logger is optional for callers of the sibling overloads, so guard every use.
                    logger?.AppendLine();
                    logger?.log("Chunk TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
                    logger?.AppendLine();
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return(table);
        }
Esempio n. 4
0
        /// <summary>
        /// Builds one <see cref="webLemmaTerm"/> for every non-null index form of the counter that has a
        /// container, then passes the collected terms to <c>recompute</c> together with <paramref name="table"/>.
        /// When no counter is supplied, one is created from <paramref name="source"/> via <c>prepareCounter</c>.
        /// </summary>
        /// <remarks>
        /// For each term: <c>nominalForm</c> and <c>name</c> are the container's index form, <c>termFrequency</c>
        /// grows by 1 per container item, <c>AFreqPoints</c> is the item count, and <c>documentFrequency</c> /
        /// <c>documentSetFrequency</c> are the sizes of the document and document-set lists filled through
        /// <c>AddUnique</c>.
        /// </remarks>
        /// <param name="source">Subjects used to build the counter when <paramref name="counter"/> is <c>null</c>.</param>
        /// <param name="document_level">Not used by this overload.</param>
        /// <param name="table">The table handed to <c>recompute</c> and returned.</param>
        /// <param name="parser">Not used by this overload.</param>
        /// <param name="logger">
        /// Receives progress messages. May be <c>null</c> (the default), in which case progress reporting is
        /// skipped; it is still handed to <c>recompute</c> as-is.
        /// </param>
        /// <param name="forSingleWebSite">Passed on unchanged to <c>recompute</c>.</param>
        /// <param name="counter">A ready-made counter, or <c>null</c> to build one from <paramref name="source"/>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
        {
            if (counter == null)
            {
                counter = prepareCounter(source);
            }

            List <String> tfdfList = counter.GetIndexForms();

            tfdfList.Sort(String.CompareOrdinal);

            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 totalTerms      = tfdfList.Count;
            Int32 cycleLength     = totalTerms / 5;   // report each time more than a fifth of the list was handled
            Int32 processed       = 0;                // non-null terms handled so far
            Int32 sinceLastReport = 0;                // non-null terms handled since the last progress message

            foreach (String term in tfdfList)
            {
                if (term == null)
                {
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();

                    Double termFrequency = 0;

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name        = cn.indexForm;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        // Items without a resolvable parent document still count towards term frequency.
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        termFrequency += 1;
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints          = cn.items.Count();
                    lemma.documentFrequency    = documents.Count;
                    lemma.termFrequency        = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    // logger defaults to null, so every use must be guarded.
                    logger?.AppendLine();
                    logger?.log("Token Streams TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
                    logger?.AppendLine();
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return(table);
        }
Esempio n. 5
0
        /// <summary>
        /// Builds the lemma term table from token-level subjects. Subjects of level <c>cnt_level.mcToken</c>
        /// whose flag bag contains <c>tkn_contains.onlyLetters</c> are registered in a fresh
        /// <see cref="TFDFCounter"/> under their lower-cased current form; the counter is then handed to the
        /// name-based <c>process</c> overload.
        /// </summary>
        /// <param name="source">Subjects to scan. The sequence is copied to a list under <c>getAllChildrenLock</c>.</param>
        /// <param name="document_level">Not used by this overload.</param>
        /// <param name="table">
        /// Target table. When it already contains items the build is halted and the table is returned untouched.
        /// May be <c>null</c> (the default); the name-based overload then receives a <c>null</c> name and table.
        /// </param>
        /// <param name="parser">Forwarded to the name-based overload.</param>
        /// <param name="logger">Receives the halt message when present; also forwarded to the name-based overload.</param>
        /// <param name="forSingleWebSite">Forwarded unchanged; also echoed in the halt message.</param>
        /// <returns>The untouched <paramref name="table"/> when it was already populated, otherwise the result of the name-based overload.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null,
                                         ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            // table and logger both default to null, so neither may be dereferenced unguarded.
            if (table != null && table.Count > 0)
            {
                logger?.log("THIS TABLE " + table.name + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }

            TFDFCounter counter = new TFDFCounter();

            // Snapshot the incoming sequence while holding the shared lock.
            lock (getAllChildrenLock)
            {
                var listSource = source.ToList();
                source = listSource;
            }

            List <IPipelineTaskSubject> rkns = source.GetSubjectsOfLevel <IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcToken });

            rkns.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

            foreach (var tkn in rkns)
            {
                if (tkn.flagBag.ContainsAll(tkn_contains.onlyLetters))
                {
                    counter.Add(tkn.currentForm.ToLower(), tkn);
                }
            }

            // NOTE(review): with a null table the name-based overload constructs a new table from a null
            // name - confirm that the webLemmaTermTable constructor accepts that.
            return(process(table?.name, parser, counter, logger, table, forSingleWebSite));
        }
Esempio n. 6
0
        /// <summary>
        /// Constructs the web lemma table: repeatedly takes the first remaining index form of
        /// <paramref name="counter"/>, asks <paramref name="parser"/> for its lemma set and, when the lemma's
        /// POS tags are allowed by <c>settings</c>, aggregates all inflected forms found in the counter into one
        /// <see cref="webLemmaTerm"/>.
        /// </summary>
        /// <remarks>
        /// A term counts as unresolved when the parser call leaves the term list at the same length; it is
        /// then added to <c>table.unresolved</c> and removed from the list here. Term frequency is weighted
        /// per item with <c>settings.anchorTextFactor</c>, <c>settings.titleTextFactor</c> or
        /// <c>settings.contentTextFactor</c>, depending on the item's flag bag. The collected lemmas go through
        /// <c>recompute</c> when <c>settings.doComputeTFIDF</c> is set; otherwise they are added to the table directly.
        /// </remarks>
        /// <param name="tableName">Name of the table. When not null or empty it is assigned to <c>table.name</c>.</param>
        /// <param name="parser">Resolver used to obtain the lemma set of each term. It is dereferenced, so it must not be <c>null</c>.</param>
        /// <param name="counter">The counter supplying index forms and their containers.</param>
        /// <param name="logger">Receives progress and diagnostic messages. May be <c>null</c>; it is still handed to the parser and to <c>recompute</c> as-is.</param>
        /// <param name="table">Target table. Created from <paramref name="tableName"/> when <c>null</c>; returned untouched when it already has items.</param>
        /// <param name="forSingleWebSite">Passed on unchanged to <c>recompute</c>; also echoed in the halt message.</param>
        /// <returns>The populated (or already populated) table.</returns>
        protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
        {
            if (table == null)
            {
                table = new webLemmaTermTable(tableName);
            }

            if (table.Count > 0)
            {
                logger?.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }


            List <String> tfdfList = counter.GetIndexForms();
            Int32         c        = 0;                     // terms consumed since the last progress message
            Int32         li       = 0;                     // loop iterations, checked against the safety limit
            Int32         limit    = tfdfList.Count + 100;


            if (!tableName.isNullOrEmpty())
            {
                table.name = tableName;
            }


            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 startIndex  = tfdfList.Count;
            Int32 cycleLength = startIndex / 5;

            while (tfdfList.Any())
            {
                String term = tfdfList.FirstOrDefault();
                Int32  d    = tfdfList.Count;

                if (term == null)
                {
                    // A null head entry can never be resolved. Drop it so the loop keeps making
                    // progress instead of spinning until the iteration limit is hit.
                    tfdfList.RemoveAt(0);
                    d = 1;
                }
                else
                {
                    lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);

                    // d becomes the number of list entries consumed by the parser call.
                    d = d - tfdfList.Count;
                    if (d == 0)
                    {
                        table.unresolved.Add(term);
                        tfdfList.Remove(term);
                        d = 1;
                    }
                    else
                    {
                        Boolean ok = true;

                        if (settings.allowedLemmaTypes.Any())
                        {
                            var tps = inflectSet.GetTagsFromGramTags <pos_type>(pos_type.none);

                            ok = tps.ContainsAny(settings.allowedLemmaTypes);

                            // Under the strict policy any lemma that also carries the verb tag is rejected.
                            if (ok && settings.strictPosTypePolicy && tps.Contains(pos_type.V))
                            {
                                ok = false;
                            }
                        }

                        if (ok)
                        {
                            List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                            List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();

                            webLemmaTerm lemma = new webLemmaTerm();
                            lemma.nominalForm = inflectSet.lemmaForm;
                            lemma.name        = inflectSet.lemmaForm;

                            Double termFrequency = 0;

                            foreach (lexicInflection inflect in inflectSet.Values)
                            {
                                TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
                                if (cn != null)
                                {
                                    lemma.AFreqPoints += cn.items.Count;
                                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                                    {
                                        // mcElement may be null (the diagnostic below anticipates it), so do not
                                        // dereference it directly and do not record a missing document.
                                        imbMCDocument document = cntPair.mcElement?.GetParentOfType <imbMCDocument>();
                                        if (document != null)
                                        {
                                            documents.AddUnique(document);
                                        }

                                        imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                        if (docSet != null)
                                        {
                                            documentSet.AddUnique(docSet);
                                        }
                                        else
                                        {
                                            logger?.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                        }

                                        if (cntPair.flagBag.Contains(cnt_containerType.link))
                                        {
                                            termFrequency += settings.anchorTextFactor;
                                        }
                                        else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                        {
                                            termFrequency += settings.titleTextFactor;
                                        }
                                        else
                                        {
                                            termFrequency += settings.contentTextFactor;
                                        }

                                        cntPair.AddGraph(inflect);
                                    }

                                    lemma.otherForms.AddUnique(cn.indexForm);
                                }
                                else
                                {
                                    lemma.otherForms.AddUnique(inflect.inflectedForm);
                                }
                            }
                            lemma.documentSetFrequency = documentSet.Count;
                            lemma.documentFrequency    = documents.Count;
                            lemma.termFrequency        = termFrequency;
                            lemmas.Add(lemma);
                        }
                    }
                }

                li++;
                c = c + d;

                // From here on d is the total number of entries consumed since the start.
                d = startIndex - tfdfList.Count;

                if (c > cycleLength)
                {
                    c = 0;
                    logger?.AppendLine();
                    logger?.log("TF-IDF processed: _" + d.GetRatio(startIndex).ToString("P2") + "_");
                    logger?.AppendLine();
                }

                // Safety net against a list that stops shrinking.
                if (li > limit)
                {
                    logger?.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
                    break;
                }
            }


            if (settings.doComputeTFIDF)
            {
                recompute(table, logger, forSingleWebSite, lemmas);
            }
            else
            {
                foreach (var le in lemmas)
                {
                    table.Add(le);
                }
            }

            return(table);
        }