Beispiel #1
0
        /// <summary>
        /// Task builder for the <see cref="imbMCRepository"/> level of subject. Loads the repository named by the subject,
        /// attaches it to the subject and schedules one <see cref="pipelineTaskMCSiteSubject"/> task for every accepted web site.
        /// Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>.
        /// </summary>
        /// <param name="task">The task. Its <c>context.logger</c> is used for loading and reporting, and its <c>context.scheduledTasks</c> receives the site-level tasks.</param>
        /// <returns><c>next</c> when the task does not carry a <see cref="pipelineTaskMCRepoSubject"/>; otherwise <c>forward</c>.</returns>
        /// <exception cref="aceScienceException">
        /// Thrown when the subject lists site targets and at least one of them was not scheduled from the repository.
        /// </exception>
        public override IPipelineNode process(IPipelineTask task)
        {
            pipelineTask<pipelineTaskMCRepoSubject> realTask = task as pipelineTask<pipelineTaskMCRepoSubject>;

            if (realTask == null)
            {
                return next;
            }

            pipelineTaskMCRepoSubject realSubject = realTask.subject;

            folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

            imbMCRepository repo = realSubject.MCRepoName.LoadDataStructure<imbMCRepository>(repoFolder, task.context.logger);
            imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();

            docRepo.webRepository = repo;
            realSubject.mcElement = docRepo;
            realSubject.MCRepo = repo;

            List<imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

            if (!websites.Any())
            {
                // Only reported here: the loop below is simply skipped, and the target check at the end still runs.
                task.context.logger.log(this.name + " Failed --- no web sites loaded");
            }

            // Clean-case names of the requested targets, kept in request order for the error message.
            List<String> needle = new List<String>();
            realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));

            // An empty target list means "accept every site".
            Boolean filterByTargets = needle.Count > 0;
            HashSet<String> wanted = new HashSet<String>(needle);

            // Names of the sites already scheduled; used to skip repeated sites and to detect missing targets.
            HashSet<String> scheduled = new HashSet<String>();

            foreach (imbMCWebSite site in websites)
            {
                String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

                if (filterByTargets && !wanted.Contains(sName))
                {
                    continue;
                }

                if (scheduled.Contains(sName))
                {
                    continue;
                }

                imbMCDocumentSet docSet = new imbMCDocumentSet();
                docRepo.Add(docSet);

                pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
                mCSiteSubject.MCSite = site;
                mCSiteSubject.mcElement = docSet;
                mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
                mCSiteSubject.name = sName;
                mCSiteSubject.parent = realSubject;
                realSubject.Add(mCSiteSubject);

                scheduled.Add(mCSiteSubject.name);

                task.context.scheduledTasks.Push(new pipelineTask<pipelineTaskMCSiteSubject>(mCSiteSubject));
            }

            // Distinct() keeps a target that was listed twice from being reported as missing after it was scheduled once.
            List<String> missing = needle.Distinct().Where(x => !scheduled.Contains(x)).ToList();

            if (missing.Any())
            {
                // Every name is followed by a single space, the last one included.
                String nd = String.Join(" ", missing) + " ";

                throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
            }

            return forward;
        }
Beispiel #2
0
        /// <summary>
        /// Constructs the webLemmaTable: builds one <see cref="webLemmaTerm"/> for every index form of the counter that has a
        /// container, and hands the resulting list to <c>recompute</c> together with the table.
        /// </summary>
        /// <param name="counter">The counter. Supplies the index forms and, per form, the container with its items.</param>
        /// <param name="logger">The logger. Receives progress reports; the reports are skipped when it is <c>null</c>. It is also passed on to <c>recompute</c> as given.</param>
        /// <param name="table">The table. Passed to <c>recompute</c> and returned.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]. Passed through to <c>recompute</c>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
        {
            List<String> tfdfList = counter.GetIndexForms();
            tfdfList.Sort(String.CompareOrdinal);

            Int32 totalTerms = tfdfList.Count;

            // A progress report is written each time more than a fifth of the terms was handled since the last report.
            Int32 cycleLength = totalTerms / 5;
            Int32 processed = 0;
            Int32 sinceLastReport = 0;

            List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

            foreach (String term in tfdfList)
            {
                if (term == null)
                {
                    // Null forms are skipped and do not count towards the progress.
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
                    List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
                    Double termFrequency = 0;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();

                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        // Every item adds exactly 1, whether or not a parent document was resolved for it.
                        termFrequency += 1;
                    }

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name = cn.indexForm;
                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints = cn.items.Count();
                    lemma.documentFrequency = documents.Count;
                    lemma.termFrequency = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    if (logger != null)
                    {
                        logger.AppendLine();
                        logger.log("Chunk TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
                        logger.AppendLine();
                    }
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return table;
        }
Beispiel #3
0
        /// <summary>
        /// Processes the specified source: builds one <see cref="webLemmaTerm"/> for every index form of the counter that has a
        /// container, and hands the resulting list to <c>recompute</c> together with the table.
        /// </summary>
        /// <param name="source">The source. Only used to prepare a counter (via <c>prepareCounter</c>) when <paramref name="counter"/> is <c>null</c>.</param>
        /// <param name="document_level">The document level. Not read by this method body.</param>
        /// <param name="table">The table. Passed to <c>recompute</c> and returned.</param>
        /// <param name="parser">The parser. Not read by this method body.</param>
        /// <param name="logger">The logger. Receives progress reports; the reports are skipped when it is <c>null</c>. It is also passed on to <c>recompute</c> as given.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]. Passed through to <c>recompute</c>.</param>
        /// <param name="counter">The counter. When <c>null</c>, one is prepared from <paramref name="source"/>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
        {
            if (counter == null)
            {
                counter = prepareCounter(source);
            }

            List<String> tfdfList = counter.GetIndexForms();
            tfdfList.Sort(String.CompareOrdinal);

            Int32 totalTerms = tfdfList.Count;

            // A progress report is written each time more than a fifth of the terms was handled since the last report.
            Int32 cycleLength = totalTerms / 5;
            Int32 processed = 0;
            Int32 sinceLastReport = 0;

            List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

            foreach (String term in tfdfList)
            {
                if (term == null)
                {
                    // Null forms are skipped and do not count towards the progress.
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
                    List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
                    Double termFrequency = 0;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();

                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        // Every item adds exactly 1, whether or not a parent document was resolved for it.
                        termFrequency += 1;
                    }

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name = cn.indexForm;
                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints = cn.items.Count();
                    lemma.documentFrequency = documents.Count;
                    lemma.termFrequency = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    // The logger parameter defaults to null, so the report must not dereference it blindly.
                    if (logger != null)
                    {
                        logger.AppendLine();
                        logger.log("Token Streams TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
                        logger.AppendLine();
                    }
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return table;
        }