Example #1
0
        //internal static String RenderRegexPosTypePattern()


        /// <summary>
        /// Returns the opening text that precedes rendered content of the given level.
        /// </summary>
        /// <param name="level">Content level whose opening text is requested.</param>
        /// <param name="mode">Render mode; it is only consulted when <paramref name="level"/> is <c>mcChunk</c>.</param>
        /// <returns>
        /// For <c>mcChunk</c>: a single space in <c>currentForm</c> or <c>lemmaForm</c> mode, otherwise <c>{</c>.
        /// For <c>mcBlock</c> and <c>mcTokenStream</c>: <see cref="Environment.NewLine"/>.
        /// For <c>mcToken</c> and every other level: an empty string.
        /// </returns>
        public static String renderOpen(this cnt_level level, contentTokenSubjectRenderMode mode)
        {
            if (level == flags.token.cnt_level.mcChunk)
            {
                Boolean isPlainForm = mode == contentTokenSubjectRenderMode.currentForm ||
                                      mode == contentTokenSubjectRenderMode.lemmaForm;

                return isPlainForm ? " " : "{";
            }

            if (level == flags.token.cnt_level.mcBlock || level == flags.token.cnt_level.mcTokenStream)
            {
                return Environment.NewLine;
            }

            // mcToken and all remaining levels have no opening text
            return "";
        }
Example #2
0
        /// <summary>
        /// Counts the current forms of all chunk-level subjects in <paramref name="chunks"/> and hands the
        /// resulting counter to the counter-based <c>process</c> overload.
        /// </summary>
        /// <param name="chunks">Subjects to scan; every element is cast to <c>pipelineTaskSubjectContentToken</c> during iteration.</param>
        /// <param name="document_level">The document level (not used by this method body).</param>
        /// <param name="table">The table passed through to the counter-based overload.</param>
        /// <param name="parser">The parser (not used by this method body).</param>
        /// <param name="logger">The logger passed through to the counter-based overload.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]; passed through unchanged.</param>
        /// <returns>Whatever the counter-based <c>process</c> overload returns.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> chunks, cnt_level document_level, webLemmaTermTable table, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            TFDFCounter counter = new TFDFCounter();

            // work on a private copy, ordered by the ordinal value of the current form
            var ordered = chunks.ToList();
            ordered.Sort((left, right) => String.CompareOrdinal(left.currentForm, right.currentForm));

            foreach (pipelineTaskSubjectContentToken token in ordered)
            {
                if (token.contentLevelType != cnt_level.mcChunk)
                {
                    continue;
                }

                counter.Add(token.currentForm, token);
            }

            return process(counter, logger, table, forSingleWebSite);
        }
        /// <summary>
        /// Collects the subjects from <paramref name="source"/> whose <c>contentLevelType</c> equals <paramref name="level"/>.
        /// </summary>
        /// <typeparam name="T">Subject type; must implement <c>IPipelineTaskSubject</c>.</typeparam>
        /// <param name="source">The subjects to filter.</param>
        /// <param name="level">The content level to select.</param>
        /// <returns>
        /// A new list with the matching subjects, in enumeration order. Items are inserted with <c>AddUnique</c>
        /// (presumably skipping items already present — confirm against that helper).
        /// </returns>
        public static List <T> GetSubjectsOfLevel <T>(this IEnumerable <T> source, cnt_level level) where T : IPipelineTaskSubject
        {
            var matches = new List <T>();

            foreach (T candidate in source)
            {
                if (candidate.contentLevelType != level)
                {
                    continue;
                }

                matches.AddUnique(candidate);
            }

            return matches;
        }
        /// <summary>
        /// Runs the specified model: schedules its primary tasks, starts <c>runSeparate</c> on a separate
        /// <see cref="Task"/>, polls until the schedule is drained, and finally groups the exit subjects
        /// by runtime type and by content level.
        /// </summary>
        /// <param name="__model">The model.</param>
        /// <param name="paramsForPrimaryTasks">The parameters for primary tasks creation, passed to the model</param>
        /// <returns>The execution context created for this run, with <c>exitByType</c> and <c>exitByLevel</c> filled in.</returns>
        public pipelineModelExecutionContext run(IPipelineModel __model, params Object[] paramsForPrimaryTasks)
        {
            var output = new pipelineModelExecutionContext(__model);

            // Let the model create its initial tasks and queue them for execution.
            var primTasks = __model.createPrimaryTasks(paramsForPrimaryTasks);

            foreach (var pTask in primTasks)
            {
                output.scheduledTasks.Push(pTask);
            }

            statusExplain();

            // The actual processing happens in runSeparate, on its own task; this method only monitors it.
            Task runMasterTask = new Task(() =>
            {
                runSeparate(output);
            });

            // The flag is raised before the task starts so the polling loop below is entered.
            machineRunning = true;

            statusUpdate(output);

            runMasterTask.Start();

            // Polling loop: periodic status report, sleep for one tick, then check the stop condition.
            // NOTE(review): runMasterTask is never waited on or inspected here, so a fault inside
            // runSeparate is not observed by this method — confirm this is handled elsewhere.
            while (machineRunning)
            {
                if (output.GetSinceLastStatusUpdate() > settings.StatusReportPeriod)
                {
                    output.lastStatusUpdate = DateTime.Now;

                    statusUpdate(output);
                }

                Thread.Sleep(settings.TickForCheck);

                // Stop once nothing is scheduled and the last take size is zero.
                // NOTE(review): this tests counters, not task completion — presumably runSeparate
                // is idle at that point; verify before relying on exitSubjects being final.
                if (output.scheduledTasks.Count == 0 && output.lastTakeSize == 0)
                {
                    machineRunning = false;
                    statusUpdate(output);
                }
            }

            logger.log("Sorting exit bin into dictionaries (by type and content level)");

            // Index every exit subject twice: under its runtime type and under its content level.
            foreach (var item in output.exitSubjects)
            {
                Type t = item.GetType();
                if (!output.exitByType.ContainsKey(t))
                {
                    output.exitByType.Add(t, new ConcurrentBag <IPipelineTaskSubject>());
                }
                cnt_level level = item.contentLevelType;
                if (!output.exitByLevel.ContainsKey(level))
                {
                    output.exitByLevel.Add(level, new ConcurrentBag <IPipelineTaskSubject>());
                }
                output.exitByType[t].Add(item);
                output.exitByLevel[level].Add(item);
            }

            logger.log("Exit bin sorted by [" + output.exitByType.Count + "] types and [" + output.exitByLevel.Count + "] levels");

            //imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);

            return(output);
        }
Example #5
0
        /// <summary>
        /// Builds one <c>webLemmaTerm</c> for every indexed form held by the TF/DF counter and passes the
        /// collected lemmas to <c>recompute</c> together with <paramref name="table"/>.
        /// </summary>
        /// <param name="source">The source; only used to prepare a counter when <paramref name="counter"/> is <c>null</c>.</param>
        /// <param name="document_level">The document level (not used by this method body).</param>
        /// <param name="table">The table handed to <c>recompute</c> and returned to the caller.</param>
        /// <param name="parser">The parser (not used by this method body).</param>
        /// <param name="logger">Optional progress logger; when <c>null</c>, progress reporting is skipped.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]; passed through to <c>recompute</c>.</param>
        /// <param name="counter">The counter; when <c>null</c>, one is created with <c>prepareCounter(source)</c>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
        {
            if (counter == null)
            {
                counter = prepareCounter(source);
            }

            List <String> tfdfList = counter.GetIndexForms();
            tfdfList.Sort(String.CompareOrdinal);

            Int32 totalForms      = tfdfList.Count;
            Int32 cycleLength     = totalForms / 5; // a progress line is written each time more than a fifth of the list has been handled
            Int32 processed       = 0;
            Int32 sinceLastReport = 0;

            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            // The former "limit" safety break is gone: its counter could never exceed the list length,
            // while the limit was the list length plus 500, so that branch was unreachable.
            foreach (String term in tfdfList)
            {
                if (term == null)
                {
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                    Double termFrequency = 0;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        // every item counts towards the term frequency, even when no parent document is found
                        termFrequency += 1;
                    }

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm          = cn.indexForm;
                    lemma.name                 = cn.indexForm;
                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints          = cn.items.Count();
                    lemma.documentFrequency    = documents.Count;
                    lemma.termFrequency        = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    // logger is optional (defaults to null), so it must not be dereferenced blindly
                    logger?.AppendLine();
                    logger?.log("Token Streams TF processing: _" + processed.GetRatio(totalForms).ToString("P2") + "_ ");
                    logger?.AppendLine();
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return table;
        }
        //process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table, ILogBuilder logger = null, Boolean forSingleWebSite = false)*
        /// <summary>
        /// Counts the lower-cased current forms of all letter-only <c>mcToken</c> subjects in
        /// <paramref name="source"/> and delegates the table build to the counter-based <c>process</c> overload.
        /// If <paramref name="table"/> already contains items, nothing is built and the table is returned as is.
        /// </summary>
        /// <param name="source">The subjects to scan; materialized to a list under <c>getAllChildrenLock</c>.</param>
        /// <param name="document_level">The document level (not used by this method body).</param>
        /// <param name="table">The table to populate. Required, despite the <c>null</c> default in the signature.</param>
        /// <param name="parser">The parser, passed through to the counter-based overload.</param>
        /// <param name="logger">Optional logger; when <c>null</c>, the "already has items" message is skipped.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]; passed through unchanged.</param>
        /// <returns>The untouched <paramref name="table"/> when it was not empty, otherwise the result of the counter-based overload.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="table"/> is <c>null</c>.</exception>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null,
                                         ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            // table.Count and table.name are read below, so a null table can never work; fail with a clear message
            if (table == null)
            {
                throw new ArgumentNullException(nameof(table));
            }

            if (table.Count > 0)
            {
                // logger is optional (defaults to null), so it must not be dereferenced blindly
                logger?.log("THIS TABLE " + table.name + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");

                // DoBeep is moved off 1 after the first beep, so this branch is not taken again
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return table;
            }

            TFDFCounter counter = new TFDFCounter();

            // Snapshot the source while holding the shared lock; all later work uses the private copy.
            lock (getAllChildrenLock)
            {
                source = source.ToList();
            }

            List <IPipelineTaskSubject> rkns = source.GetSubjectsOfLevel <IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcToken });

            rkns.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

            // The old minimum-length filter was disabled, so its "shorties" counter stayed at zero
            // and its log message never fired; both are removed.
            foreach (var tkn in rkns)
            {
                if (tkn.flagBag.ContainsAll(tkn_contains.onlyLetters))
                {
                    counter.Add(tkn.currentForm.ToLower(), tkn);
                }
            }

            return process(table.name, parser, counter, logger, table, forSingleWebSite);
        }