예제 #1
0
        /// <summary>
        /// Builds a semantic cloud from the chunk and term tables using the algorithm selected by <c>settings.algorithm</c>,
        /// then runs the configured cloud weaver over the result.
        /// </summary>
        /// <param name="chunkTable">Chunk table handed to the selected algorithm.</param>
        /// <param name="termTable">Term table handed to the selected algorithm; its name becomes the class name of a newly created cloud.</param>
        /// <param name="output">Cloud to populate. When <c>null</c>, a new cloud is created.</param>
        /// <param name="logger">Logger passed to the selected algorithm and to the cloud weaver.</param>
        /// <param name="subjects">Subjects passed to the selected algorithm.</param>
        /// <param name="resolver">Text resource resolver; forwarded to the complex and alternative algorithms only.</param>
        /// <returns>The populated cloud, with <c>weaverReport</c> assigned and its index rebuilt.</returns>
        /// <exception cref="aceScienceException">Raised by the link-building code when <c>assignChunkTableWeightToLink</c> is enabled,
        /// <c>doSumExistingLinkWeights</c> is disabled and a link between two lemmas already exists.</exception>
        public lemmaSemanticCloud process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud { className = termTable.name };
            }

            // Dispatch on the configured algorithm; any other value leaves the cloud as it is.
            var algorithm = settings.algorithm;
            if (algorithm == cloudConstructorAlgorithm.complex)
            {
                output = processPOSEnhanced(chunkTable, termTable, output, logger, subjects, resolver);
            }
            else if (algorithm == cloudConstructorAlgorithm.standard)
            {
                output = processStandard(chunkTable, termTable, output, logger, subjects);
            }
            else if (algorithm == cloudConstructorAlgorithm.alternative)
            {
                output = processAlternative(chunkTable, termTable, output, logger, subjects, resolver);
            }

            // The index is rebuilt both before and after the weaver runs.
            output.RebuildIndex();
            output.weaverReport = settings.cloudWeaver.Process(output, logger);
            output.RebuildIndex();

            return output;
        }
예제 #2
0
 /// <summary>
 /// Creates the term table bound to the "tfidfTable.xml" path resolved for this instance's name,
 /// and requests a build when that table contains no entries.
 /// </summary>
 public override void OnLoaded()
 {
     var tablePath = folder.pathFor(name + "tfidfTable.xml");
     WLTableOfIndustryClass = new webLemmaTermTable(tablePath, nameof(WLTableOfIndustryClass));

     bool tableIsEmpty = WLTableOfIndustryClass.Count == 0;
     if (tableIsEmpty)
     {
         doBuild = true;
     }
 }
예제 #3
0
        /// <summary>
        /// Clears the term table stored for the given document set class and refills it from the tokens of the supplied sites.
        /// </summary>
        /// <param name="tools">Classifier tools; supplies the lemma resource passed to the master constructor.</param>
        /// <param name="documentSetClass">Class whose table is rebuilt; its name selects the item context and appears in the log note.</param>
        /// <param name="sites">Sites whose tokens are fed to the master constructor.</param>
        /// <returns>The class's <c>WLTableOfIndustryClass</c> instance after processing.</returns>
        protected webLemmaTermTable BuildLemmaTableForClass(classifierTools tools, IDocumentSetClass documentSetClass, List<pipelineTaskMCSiteSubject> sites)
        {
            var className    = documentSetClass.name;
            var classContext = items[className];

            experimentContext.notes.log("Master TF-IDF table construction (used for POS flagging)... [" + className + "]");

            // The table instance owned by the class knowledge is reused: emptied here, then refilled below.
            webLemmaTermTable classTable = knowledgeByClass[documentSetClass].WLTableOfIndustryClass;
            classTable.Clear();

            var constructor   = experimentContext.masterConstructor;
            var siteTokens    = GetTokensForSites<IPipelineTaskSubject>(sites);
            var lemmaResource = tools.GetLemmaResource();
            constructor.process(siteTokens, cnt_level.mcPage, classTable, lemmaResource, classContext.logger, false);

            return classTable;
        }
예제 #4
0
        /// <summary>
        /// Creates the lemma and chunk term tables for this instance's name and, for each semantic cloud
        /// that is not set yet, loads it from its XML file and assigns its name.
        /// </summary>
        public override void OnLoaded()
        {
            var lemmaTablePath = folder.pathFor(name + "WLTable.xml", getWritableFileMode.none, "Web Lemma TF-IDF table for [" + name + "] - as XML Serialized object");
            WLTableOfIndustryClass = new webLemmaTermTable(lemmaTablePath, name + "Lemmas");

            var chunkTablePath = folder.pathFor(name + "WLChunkTable.xml", getWritableFileMode.none, "Chunks TF-IDF table for [" + name + "] - as XML Serialized object");
            WLChunkTableOfIndustryClass = new webLemmaTermTable(chunkTablePath, name + "Chunks");

            // Clouds that were already assigned are left untouched, including their names.
            if (semanticCloud == null)
            {
                var cloudPath = folder.pathFor(name + "Cloud.xml", getWritableFileMode.existing, "Initial version of the semantic cloud, extracted from the Chunk Table");
                semanticCloud = lemmaSemanticCloud.Load<lemmaSemanticCloud>(cloudPath, true);
                semanticCloud.name = name;
            }

            if (semanticCloudFiltered == null)
            {
                var filteredCloudPath = folder.pathFor(name + "CloudFiltered.xml", getWritableFileMode.existing, "Initial version of the semantic cloud, extracted from the Chunk Table");
                semanticCloudFiltered = lemmaSemanticCloud.Load<lemmaSemanticCloud>(filteredCloudPath, true);
                semanticCloudFiltered.name = name + "flt";
            }
        }
예제 #5
0
        /// <summary>
        /// Builds the cloud with the standard algorithm: every selected chunk is split into lemmas, each lemma
        /// becomes a node, and the first lemma (in ordinal order) of a chunk is linked to the chunk's other lemmas.
        /// </summary>
        /// <param name="chunkTable">Source of the chunks.</param>
        /// <param name="termTable">Used to resolve node weights when <c>settings.assignTermTableWeightToNode</c> is set.</param>
        /// <param name="output">Cloud to populate; a new one is created when <c>null</c>.</param>
        /// <param name="logger">Logger passed to the term weight lookup.</param>
        /// <param name="subjects">When more than one subject is given, chunks are filtered by document set frequency.</param>
        /// <returns>The populated cloud.</returns>
        /// <exception cref="aceScienceException">A link already exists while <c>assignChunkTableWeightToLink</c> is enabled
        /// and <c>doSumExistingLinkWeights</c> is disabled.</exception>
        protected lemmaSemanticCloud processStandard(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }

            List<webLemmaTerm> chunks = chunkTable.GetList();

            // With a single subject every chunk is used; otherwise only chunks above the document set frequency limit.
            IEnumerable<webLemmaTerm> selectedChunks = subjects.Count > 1
                ? chunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit)
                : chunks;

            // Keyed by list instance: the very same list objects are stored in lemmaGroups and used for the lookup below.
            var chunkByLemmas = new Dictionary<List<String>, webLemmaTerm>();
            var lemmaGroups   = new List<List<String>>();
            var nodeNames     = new List<String>();

            foreach (webLemmaTerm chunk in selectedChunks)
            {
                List<String> lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true).Where(x => x.Length > 2).ToList();

                // A chunk needs at least two usable lemmas to produce a link.
                if (lemmas.Count <= 1)
                {
                    continue;
                }

                lemmas.Sort(String.CompareOrdinal);
                lemmaGroups.Add(lemmas);
                chunkByLemmas.Add(lemmas, chunk);
                nodeNames.AddRange(lemmas, true);
            }

            // ---- nodes
            foreach (String nodeName in nodeNames)
            {
                Double nodeWeight = settings.assignTermTableWeightToNode
                    ? termTable.ResolveSingleTerm(nodeName, logger)
                    : 1.0;

                if (nodeWeight > 0)
                {
                    output.AddNode(nodeName, nodeWeight);
                }
            }

            // ---- links
            foreach (List<String> group in lemmaGroups)
            {
                String anchor = group[0];
                if (!output.ContainsNode(anchor, true))
                {
                    continue;
                }

                foreach (String other in group)
                {
                    if (other == anchor || !output.ContainsNode(other, true))
                    {
                        continue;
                    }

                    Double linkWeight;
                    if (settings.assignChunkTableWeightToLink)
                    {
                        linkWeight = chunkByLemmas[group].weight;
                    }
                    else if (settings.doAdjustLinkWeightByChunkSize)
                    {
                        linkWeight = (group.Count - 1).GetRatio(1);
                    }
                    else
                    {
                        linkWeight = 1;
                    }

                    var existingLink = output.GetLink(anchor, other);
                    if (existingLink == null)
                    {
                        output.AddLink(anchor, other, linkWeight);
                        continue;
                    }

                    if (settings.doSumExistingLinkWeights)
                    {
                        existingLink.weight += linkWeight;
                        continue;
                    }

                    // The link exists and weights are not summed: with chunk-table weights the surviving
                    // weight would depend on processing order, so this combination of settings is rejected.
                    if (settings.assignChunkTableWeightToLink)
                    {
                        throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" +
                                                      ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
                    }
                }
            }

            return output;
        }
예제 #6
0
        /// <summary>
        /// Shared final stage of the cloud algorithms: turns the node names and lemma groups collected in the
        /// construct into nodes and links of the output cloud, then copies the primary and secondary chunk forms.
        /// </summary>
        /// <param name="c">Construct holding node names, lemma groups, the chunk lookup, term categories and the trash bin.</param>
        /// <param name="chunkTable">Not used by this method.</param>
        /// <param name="termTable">Looked up for node weights when <c>settings.assignTermTableWeightToNode</c> is set.</param>
        /// <param name="output">Cloud receiving the nodes and links.</param>
        /// <param name="logger">Not used by this method.</param>
        /// <param name="resolver">Not used by this method.</param>
        /// <returns>The <paramref name="output"/> cloud.</returns>
        /// <exception cref="aceScienceException">A link already exists while <c>assignChunkTableWeightToLink</c> is enabled
        /// and <c>doSumExistingLinkWeights</c> is disabled.</exception>
        protected lemmaSemanticCloud BuildCloud(lemmaSemanticConstruct c, webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, ITextResourceResolver resolver)
        {
            // Trashed terms never become nodes.
            foreach (String trashed in c.TrashBin)
            {
                c.nodeNames.Remove(trashed);
            }

            // ---- nodes
            foreach (String nodeName in c.nodeNames)
            {
                Double nodeWeight = 0;
                if (!settings.assignTermTableWeightToNode)
                {
                    nodeWeight = 1;
                }
                else
                {
                    var tableEntry = termTable[nodeName];
                    if (tableEntry != null)
                    {
                        nodeWeight = tableEntry.weight;
                    }
                }

                // Terms missing from the table (weight stays 0) and non-positive weights produce no node.
                if (!(nodeWeight > 0))
                {
                    continue;
                }

                // Case clouds and class clouds use the same factoring rules; only the switch that enables them differs.
                bool applyFactors = c.isCaseCloud ? settings.doFactorToCaseClouds : settings.doFactorToClassClouds;

                if (!applyFactors)
                {
                    output.AddNode(nodeName, nodeWeight);
                }
                else if (c.PrimaryTerms.Contains(nodeName))
                {
                    output.AddNode(nodeName, nodeWeight * settings.PrimaryTermWeightFactor, 2);
                }
                else if (c.SecondaryTerms.Contains(nodeName))
                {
                    output.AddNode(nodeName, nodeWeight * settings.SecondaryTermWeightFactor, 1);
                }
                else
                {
                    output.AddNode(nodeName, nodeWeight * settings.ReserveTermWeightFactor, 0);
                }
            }

            // ---- links: the first lemma of each group is linked to the group's other lemmas
            foreach (List<String> group in c.lemmasList)
            {
                String anchor = group[0];
                if (c.TrashBin.Contains(anchor) || !output.ContainsNode(anchor, true))
                {
                    continue;
                }

                foreach (String other in group)
                {
                    if (c.TrashBin.Contains(other))
                    {
                        continue;
                    }

                    if (other == anchor || !output.ContainsNode(other, true))
                    {
                        continue;
                    }

                    Double linkWeight;
                    if (settings.assignChunkTableWeightToLink)
                    {
                        linkWeight = c.weightDict[group].weight;
                    }
                    else if (settings.doAdjustLinkWeightByChunkSize)
                    {
                        linkWeight = (group.Count - 1).GetRatio(1);
                    }
                    else
                    {
                        linkWeight = 1;
                    }

                    var existingLink = output.GetLink(anchor, other);
                    if (existingLink == null)
                    {
                        output.AddLink(anchor, other, linkWeight);
                        continue;
                    }

                    if (settings.doSumExistingLinkWeights)
                    {
                        existingLink.weight += linkWeight;
                        continue;
                    }

                    // The link exists and weights are not summed: with chunk-table weights the surviving
                    // weight would depend on processing order, so this combination of settings is rejected.
                    if (settings.assignChunkTableWeightToLink)
                    {
                        throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" +
                                                      ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
                    }
                }
            }

            foreach (var primaryChunk in c.primaryChunks)
            {
                output.primaryChunks.Add(primaryChunk.nominalForm);
            }

            foreach (var secondaryChunk in c.secondaryChunks)
            {
                output.secondaryChunks.Add(secondaryChunk.nominalForm);
            }

            return output;
        }
예제 #7
0
        /// <summary>
        /// Builds the cloud with the alternative algorithm: lemmas of the selected chunks are counted, ranked into
        /// primary, secondary and reserve terms by their frequency, and every chunk containing a relevant term
        /// contributes its lemmas to the cloud assembled by <c>BuildCloud</c>.
        /// </summary>
        /// <param name="chunkTable">Source of the chunks.</param>
        /// <param name="termTable">Forwarded to <c>BuildCloud</c>.</param>
        /// <param name="output">Cloud to populate; a new one is created when <c>null</c>.</param>
        /// <param name="logger">Receives the construct log and is forwarded to <c>BuildCloud</c>.</param>
        /// <param name="subjects">When more than one subject is given, term ranking only considers chunks above the document set frequency limit.</param>
        /// <param name="resolver">Forwarded to <c>BuildCloud</c>.</param>
        /// <returns>The cloud returned by <c>BuildCloud</c>.</returns>
        protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }
            lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

            List<webLemmaTerm> allChunks = chunkTable.GetList();

            // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
            IEnumerable<webLemmaTerm> vipChunks = null;

            if (subjects.Count > 1)
            {
                vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
            }
            else
            {
                vipChunks = allChunks;
            }

            instanceCountCollection<String> lemmaCounter = new instanceCountCollection<String>();

            foreach (webLemmaTerm chunk in vipChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();

                lemmaCounter.AddInstanceRange(lemmas);
            }

            c.RelevantTerms = lemmaCounter.getSorted();

            lemmaCounter.reCalculate();

            // Rank each term: most frequent => primary, above the minimum => secondary, the rest => reserve.
            foreach (String term in c.RelevantTerms)
            {
                var frequency = lemmaCounter[term];

                if (frequency == lemmaCounter.maxFreq)
                {
                    c.PrimaryTerms.Add(term);
                }
                else if (frequency > lemmaCounter.minFreq)
                {
                    c.SecondaryTerms.Add(term);
                }
                else
                {
                    c.ReserveTerms.Add(term);
                }
            }

            c.CollectRelevantTerms(settings.doReserveTermsForClass);
            c.LogConstruct(logger);

            // <--------------------------------- COLLECTING NODES AND LEMMA GROUPS
            foreach (webLemmaTerm chunk in allChunks)
            {
                // Split the chunk once and reuse the result for both the relevance test and the lemma list,
                // instead of re-splitting it for every relevant term.
                var chunkTerms = chunk.nominalForm.SplitSmart(" ", "", true, true);

                // Relevance is tested against the unfiltered split (short terms included).
                if (!c.RelevantTerms.Any(y => chunkTerms.Contains(y)))
                {
                    continue;
                }

                var lemmas = chunkTerms.Where(x => x.Length > 2).ToList();

                // A chunk needs at least two usable lemmas to produce a link.
                if (lemmas.Count > 1)
                {
                    lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
                    c.lemmasList.Add(lemmas);

                    c.weightDict.Add(lemmas, chunk);

                    c.nodeNames.AddRange(lemmas, true);
                }
            }

            return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
        }
예제 #8
0
        /// <summary>
        /// Builds the cloud with the POS-enhanced ("complex") algorithm: the term construct is refined iteratively via
        /// <c>lemmaSemanticConstruct.NextIteration</c>, then every chunk containing a relevant term contributes
        /// the lemmas that pass the <c>pos_type</c> tag filter to the cloud assembled by <c>BuildCloud</c>.
        /// </summary>
        /// <param name="chunkTable">Source of the chunks.</param>
        /// <param name="termTable">Forwarded to <c>BuildCloud</c>.</param>
        /// <param name="output">Cloud to populate; a new one is created when <c>null</c>.</param>
        /// <param name="logger">Passed to the iteration step, the lexic unit lookup, the construct log and <c>BuildCloud</c>.</param>
        /// <param name="subjects">Subjects used to create the constructs and passed to each iteration.</param>
        /// <param name="resolver">Resolves lexic units for terms that were not processed yet.</param>
        /// <returns>The cloud returned by <c>BuildCloud</c>.</returns>
        protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            List<webLemmaTerm> allChunks = chunkTable.GetList();

            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }

            // <----------------- PRIMARY
            // Two separate constructs are created so the loop condition holds on entry.
            lemmaSemanticConstruct c  = new lemmaSemanticConstruct(subjects);
            lemmaSemanticConstruct cl = new lemmaSemanticConstruct(subjects);

            // Iterate until NextIteration hands back the construct it was given, the iteration limit is
            // exceeded, or the construct reports that optimization is done.
            while (c != cl)
            {
                c  = cl;
                cl = lemmaSemanticConstruct.NextIteration(cl, resolver, allChunks, settings, subjects, logger);

                if (cl.createdInIteration > settings.primaryTermOptimizationIterationLimit || cl.OptimizationDone)
                {
                    break;
                }
            }

            // Whatever ended the loop, the latest construct is the one to use.
            c = cl;

            c.CollectRelevantTerms(settings.doReserveTermsForClass);

            if (!c.isCaseCloud)
            {
                c.LogConstruct(logger);
            }

            // <--------------------------------- COLLECTING NODES AND LEMMA GROUPS
            // Tags a term must carry to be kept (presumably noun and adjective; confirm against pos_type).
            pos_type[] acceptedTags = { pos_type.N, pos_type.A };

            foreach (webLemmaTerm chunk in allChunks)
            {
                // Split the chunk once and reuse the result for both the relevance test and the term loop,
                // instead of re-splitting it for every relevant term.
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);

                if (!c.RelevantTerms.Any(y => lemmas.Contains(y)))
                {
                    continue;
                }

                List<String> l_out = new List<String>();
                foreach (String lm in lemmas)
                {
                    if (c.NotProcessed(lm))
                    {
                        // First encounter: keep the term as a reserve term when its tags match, otherwise trash it.
                        var lu = resolver.GetLexicUnit(lm, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                            if (tg.ContainsAny(acceptedTags))
                            {
                                c.ReserveTerms.AddUnique(lm);
                                l_out.Add(lm);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                        }
                    }
                    else if (!c.TrashBin.Contains(lm))
                    {
                        // Already classified and not trashed.
                        l_out.Add(lm);
                    }
                }

                // A chunk needs at least two surviving terms to produce a link.
                if (l_out.Count > 1)
                {
                    l_out.Sort((x, y) => String.CompareOrdinal(x, y));

                    c.lemmasList.Add(l_out);

                    c.weightDict.Add(l_out, chunk);

                    c.nodeNames.AddRange(l_out, true);
                }
            }

            return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
        }
예제 #9
0
        /// <summary>
        /// Creates one <c>webLemmaTerm</c> per index form of the counter, filling in its term, document and
        /// document set frequencies, and hands the resulting list to <c>recompute</c> together with the table.
        /// </summary>
        /// <param name="counter">Counter providing the index forms and their containers.</param>
        /// <param name="logger">Receives progress messages. A <c>null</c> logger is tolerated by this method;
        /// it is still forwarded to <c>recompute</c>, whose handling of <c>null</c> is not visible here.</param>
        /// <param name="table">Table forwarded to <c>recompute</c> and returned.</param>
        /// <param name="forSingleWebSite">Forwarded to <c>recompute</c>.</param>
        /// <returns>The <paramref name="table"/> instance that was passed in.</returns>
        public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
        {
            List<String> tfdfList = counter.GetIndexForms();

            tfdfList.Sort(String.CompareOrdinal);

            Int32 processed       = 0;
            Int32 sinceLastReport = 0;

            List<webLemmaTerm> lemmas = new List<webLemmaTerm>();
            Int32 total               = tfdfList.Count;
            Int32 cycleLength         = total / 5;

            foreach (String term in tfdfList)
            {
                // Null entries are skipped and do not count towards the progress.
                if (term == null)
                {
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
                    List<imbMCDocumentElement> documents   = new List<imbMCDocumentElement>();

                    Double termFrequency = 0;

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name        = cn.indexForm;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        // Every occurrence counts as 1, whether or not its document could be resolved.
                        termFrequency += 1;
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints          = cn.items.Count();
                    lemma.documentFrequency    = documents.Count;
                    lemma.termFrequency        = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceLastReport++;

                // Report progress each time more than a fifth of the list was handled since the last report.
                if (sinceLastReport > cycleLength)
                {
                    sinceLastReport = 0;

                    // The overload taking IEnumerable<IPipelineTaskSubject> defaults its logger to null and forwards it here.
                    if (logger != null)
                    {
                        logger.AppendLine();
                        logger.log("Chunk TF processing: _" + processed.GetRatio(total).ToString("P2") + "_ ");
                        logger.AppendLine();
                    }
                }
            }

            recompute(table, logger, forSingleWebSite, lemmas);

            return table;
        }
예제 #10
0
        /// <summary>
        /// Counts the chunk-level subjects of the given sequence by their current form and builds the term table from that count.
        /// </summary>
        /// <param name="chunks">Subjects to process; each element is cast to <c>pipelineTaskSubjectContentToken</c>.</param>
        /// <param name="document_level">Not used by this method.</param>
        /// <param name="table">Table forwarded to the counter-based <c>process</c> overload.</param>
        /// <param name="parser">Not used by this method.</param>
        /// <param name="logger">Forwarded to the counter-based <c>process</c> overload.</param>
        /// <param name="forSingleWebSite">Forwarded to the counter-based <c>process</c> overload.</param>
        /// <returns>The result of the counter-based <c>process</c> overload.</returns>
        public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> chunks, cnt_level document_level, webLemmaTermTable table, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            TFDFCounter formCounter = new TFDFCounter();

            // Work on a copy ordered by current form (ordinal comparison); the caller's sequence is not modified.
            var orderedSubjects = chunks.ToList();
            orderedSubjects.Sort((left, right) => String.CompareOrdinal(left.currentForm, right.currentForm));

            foreach (pipelineTaskSubjectContentToken token in orderedSubjects)
            {
                // Only chunk-level tokens are counted.
                if (token.contentLevelType != cnt_level.mcChunk)
                {
                    continue;
                }

                formCounter.Add(token.currentForm, token);
            }

            return process(formCounter, logger, table, forSingleWebSite);
        }
예제 #11
0
        /// <summary>
        /// Processes the specified source.
        /// </summary>
        /// <param name="source">The source.</param>
        /// <param name="document_level">The document level.</param>
        /// <param name="table">The table.</param>
        /// <param name="parser">The parser.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
        /// <param name="counter">The counter.</param>
        /// <returns></returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
        {
            // Build the TF/DF counter from the source only when the caller did not supply one.
            if (counter == null)
            {
                counter = prepareCounter(source);
            }

            // Ordinal sorting fixes the order in which lemmas are created and handed to recompute().
            List <String> tfdfList = counter.GetIndexForms();
            tfdfList.Sort(String.CompareOrdinal);

            Int32 totalTerms  = tfdfList.Count;
            Int32 cycleLength = totalTerms / 5;   // threshold between two progress messages
            Int32 processed   = 0;                // non-null terms handled so far
            Int32 sinceReport = 0;                // terms handled since the last progress message

            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            foreach (String term in tfdfList)
            {
                // Null entries are skipped and do not count towards progress.
                if (term == null)
                {
                    continue;
                }

                TFDFContainer cn = counter.GetContainer(term);

                if (cn != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                    Double termFrequency = 0;

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = cn.indexForm;
                    lemma.name        = cn.indexForm;

                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                    {
                        // Collect the distinct documents (and their parent document sets) in which the term occurs.
                        imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                        if (document != null)
                        {
                            documents.AddUnique(document);

                            imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                            if (docSet != null)
                            {
                                documentSet.AddUnique(docSet);
                            }
                        }

                        // Every item counts as one occurrence, even when its document cannot be resolved.
                        termFrequency += 1;
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.AFreqPoints          = cn.items.Count();
                    lemma.documentFrequency    = documents.Count;
                    lemma.termFrequency        = termFrequency;
                    lemmas.Add(lemma);
                }

                processed++;
                sinceReport++;

                if (sinceReport > cycleLength)
                {
                    sinceReport = 0;

                    // logger is optional (defaults to null); progress reporting is skipped when it is absent.
                    logger?.AppendLine();
                    logger?.log("Token Streams TF processing: _" + processed.GetRatio(totalTerms).ToString("P2") + "_ ");
                    logger?.AppendLine();
                }
            }

            // Weight computation and insertion of the lemmas into the table is delegated to recompute().
            recompute(table, logger, forSingleWebSite, lemmas);

            return(table);
        }
예제 #12
0
        /// <summary>
        /// Fills the rank table with primary and secondary terms derived from chunks that occur in more than one document set.
        /// </summary>
        /// <param name="chunkTable">Table of chunks; only entries whose <c>documentSetFrequency</c> is greater than 1 are used.</param>
        /// <param name="termTable">Table of terms, used to look up the base weight of every ranked lemma by name.</param>
        /// <param name="output">Rank table that receives the primary and secondary terms.</param>
        /// <returns>The <paramref name="output"/> table.</returns>
        public industryLemmaRankTable process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, industryLemmaRankTable output)
        {
            List <webLemmaTerm> chunkList = chunkTable.GetList();

            instanceCountCollection <String>  lemmaOccurrences = new instanceCountCollection <string>();
            aceDictionarySet <String, String> coOccurring      = new aceDictionarySet <string, string>();

            // Pass 1: split every qualifying chunk into lemmas, count the lemmas and remember which lemmas share a chunk.
            foreach (webLemmaTerm chunk in chunkList.Where(x => x.documentSetFrequency > 1))
            {
                var parts = chunk.nominalForm.SplitSmart(textMapBase.SEPARATOR, "", true, true);
                parts = parts.Where(x => x.Length > 2).ToList();   // lemmas of one or two characters are ignored
                lemmaOccurrences.AddInstanceRange(parts);

                foreach (String left in parts)
                {
                    foreach (String right in parts.Where(p => p != left))
                    {
                        coOccurring[left].AddUnique(right);
                    }
                }
            }

            // Pass 2: lemmas counted more than once become primary terms; their chunk companions become secondary terms.
            foreach (var candidate in lemmaOccurrences)
            {
                if (!(lemmaOccurrences[candidate] > 1))
                {
                    continue;
                }

                industryLemmaTerm primary = output.GetOrCreate(candidate);
                primary.termType    = industryLemmaTermType.primary;
                primary.weight      = settings.PrimaryTermFactor * termTable[primary.name].weight;
                primary.nominalForm = candidate;
                output.AddOrUpdate(primary);

                if (!coOccurring.ContainsKey(primary.nominalForm))
                {
                    continue;
                }

                foreach (String companion in coOccurring[primary.nominalForm])
                {
                    industryLemmaTerm secondary = output.GetOrCreate(companion);

                    // Terms that already carry a type (e.g. an earlier primary) are left untouched.
                    if (secondary.termType != industryLemmaTermType.none)
                    {
                        continue;
                    }

                    secondary.termType    = industryLemmaTermType.secondary;
                    secondary.weight      = settings.SecondaryTermFactor * termTable[secondary.name].weight;
                    secondary.nominalForm = companion;
                    output.AddOrUpdate(secondary);
                }
            }

            // NOTE: a third pass that marked lemmas sharing any chunk with a primary term as
            // industryLemmaTermType.reserve (weighted by settings.ReserveTermFactor) was disabled
            // in the original code and is not executed here.

            return(output);
        }
예제 #13
0
        //process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table, ILogBuilder logger = null, Boolean forSingleWebSite = false)*
        /// <summary>
        /// Counts the letter-only tokens found in the source subjects and builds the lemma term table from that count.
        /// </summary>
        /// <param name="source">Subjects whose <c>cnt_level.mcToken</c> level subjects are counted.</param>
        /// <param name="document_level">Not used by this overload.</param>
        /// <param name="table">Optional target table. When it already contains items the build is halted and the table is returned unchanged.</param>
        /// <param name="parser">Resolver passed through to the overload that constructs the table.</param>
        /// <param name="logger">Optional logger; the halt message is skipped when it is null.</param>
        /// <param name="forSingleWebSite">Passed through to the table construction; also reported in the halt message.</param>
        /// <returns>The table returned by the table-constructing overload, or <paramref name="table"/> itself when the build was halted.</returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null,
                                         ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
        {
            // table is optional (defaults to null): only an existing, already populated table halts the build.
            if (table != null && table.Count > 0)
            {
                logger?.log("THIS TABLE " + table.name + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }

            TFDFCounter counter = new TFDFCounter();

            // Materialize the source under the shared lock so later enumeration works on a private snapshot.
            lock (getAllChildrenLock)
            {
                source = source.ToList();
            }

            List <IPipelineTaskSubject> tokens = source.GetSubjectsOfLevel <IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcToken });

            // Ordinal ordering by current form fixes the order in which tokens are fed to the counter.
            tokens.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

            foreach (var tkn in tokens)
            {
                // Only tokens flagged as consisting solely of letters are counted, keyed by their lower-cased form.
                if (tkn.flagBag.ContainsAll(tkn_contains.onlyLetters))
                {
                    counter.Add(tkn.currentForm.ToLower(), tkn);
                }
            }

            // When no table was supplied the name passed on is null; the called overload is expected to create the table.
            return(process(table?.name, parser, counter, logger, table, forSingleWebSite));
        }
예제 #14
0
        /// <summary>
        /// Constructs the webLemmaTable: resolves the counted index forms into lemmas, aggregates their
        /// frequencies and stores the resulting lemmas in the table.
        /// </summary>
        /// <param name="tableName">Name of the table; used to create the table when none is supplied and, when not empty, assigned to <c>table.name</c>.</param>
        /// <param name="parser">Resolver that returns the lemma set for an inflected form.</param>
        /// <param name="counter">Counter holding the index forms and their token containers.</param>
        /// <param name="logger">The logger used for progress and diagnostic messages.</param>
        /// <param name="table">Optional target table. A new one is created when null; a table that already has items halts the build.</param>
        /// <param name="forSingleWebSite">Passed on to <c>recompute</c>; also reported in the halt message.</param>
        /// <returns>The populated table, or the unchanged table when the build was halted.</returns>
        protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
        {
            if (table == null)
            {
                table = new webLemmaTermTable(tableName);
            }

            if (table.Count > 0)
            {
                logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }

            List <String> tfdfList = counter.GetIndexForms();
            Int32         c        = 0;                     // forms consumed since the last progress message
            Int32         li       = 0;                     // loop iterations, checked against the safety limit
            Int32         limit    = tfdfList.Count + 100;

            if (!tableName.isNullOrEmpty())
            {
                table.name = tableName;
            }

            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 startIndex  = tfdfList.Count;
            Int32 cycleLength = startIndex / 5;

            while (tfdfList.Count > 0)
            {
                String term = tfdfList[0];
                Int32  d    = tfdfList.Count;

                if (term == null)
                {
                    // A null entry cannot be resolved. It must be dropped here, otherwise it stays at the head of
                    // the list and the loop re-reads it until the iteration limit aborts the whole build.
                    tfdfList.RemoveAt(0);
                    d = 1;
                }
                else
                {
                    // The list is handed to the resolver; the change in its length tells how many forms the call consumed.
                    lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
                    d = d - tfdfList.Count;
                    if (d == 0)
                    {
                        // Nothing was consumed: register the term as unresolved and remove it manually.
                        table.unresolved.Add(term);
                        tfdfList.Remove(term);
                        d = 1;
                    }
                    else
                    {
                        Boolean ok = true;

                        // POS filtering applies only when a list of allowed lemma types is configured.
                        if (settings.allowedLemmaTypes.Any())
                        {
                            var tps = inflectSet.GetTagsFromGramTags <pos_type>(pos_type.none);

                            if (!tps.ContainsAny(settings.allowedLemmaTypes))
                            {
                                ok = false;
                            }
                            else if (settings.strictPosTypePolicy && tps.Contains(pos_type.V))
                            {
                                // Strict policy additionally rejects any lemma set that carries the verb tag.
                                ok = false;
                            }
                        }

                        if (ok)
                        {
                            List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                            List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();

                            webLemmaTerm lemma = new webLemmaTerm();
                            lemma.nominalForm = inflectSet.lemmaForm;
                            lemma.name        = inflectSet.lemmaForm;

                            Double termFrequency = 0;

                            foreach (lexicInflection inflect in inflectSet.Values)
                            {
                                TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
                                if (cn != null)
                                {
                                    lemma.AFreqPoints += cn.items.Count;
                                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                                    {
                                        imbMCDocument document = cntPair.mcElement.GetParentOfType <imbMCDocument>();
                                        documents.AddUnique(document);

                                        imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                        if (docSet != null)
                                        {
                                            documentSet.AddUnique(docSet);
                                        }
                                        else
                                        {
                                            logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                        }

                                        // An occurrence is weighted by where it was found: link, title or other content.
                                        if (cntPair.flagBag.Contains(cnt_containerType.link))
                                        {
                                            termFrequency += settings.anchorTextFactor;
                                        }
                                        else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                        {
                                            termFrequency += settings.titleTextFactor;
                                        }
                                        else
                                        {
                                            termFrequency += settings.contentTextFactor;
                                        }

                                        cntPair.AddGraph(inflect);
                                    }

                                    lemma.otherForms.AddUnique(cn.indexForm);
                                }
                                else
                                {
                                    lemma.otherForms.AddUnique(inflect.inflectedForm);
                                }
                            }

                            lemma.documentSetFrequency = documentSet.Count;
                            lemma.documentFrequency    = documents.Count;
                            lemma.termFrequency        = termFrequency;
                            lemmas.Add(lemma);
                        }
                    }
                }

                li++;
                c = c + d;

                if (c > cycleLength)
                {
                    c = 0;
                    Int32 consumed = startIndex - tfdfList.Count;
                    logger.AppendLine();
                    logger.log("TF-IDF processed: _" + consumed.GetRatio(startIndex).ToString("P2") + "_");
                    logger.AppendLine();
                }

                // Safety net against a loop that stops consuming the list.
                if (li > limit)
                {
                    logger.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
                    break;
                }
            }

            if (settings.doComputeTFIDF)
            {
                recompute(table, logger, forSingleWebSite, lemmas);
            }
            else
            {
                // Without TF-IDF computation the lemmas are stored with their raw frequencies.
                foreach (var le in lemmas)
                {
                    table.Add(le);
                }
            }

            return(table);
        }
        /// <summary>
        /// Runs the configured weight kernel over the lemmas and then stores every lemma in the table.
        /// </summary>
        /// <param name="table">The table that receives the lemmas.</param>
        /// <param name="logger">The logger; passed to the kernel task and used for retry and summary messages.</param>
        /// <param name="forSingleWebSite">Passed to the kernel task.</param>
        /// <param name="lemmas">The lemmas to weight and store.</param>
        /// <returns>The <paramref name="table"/> instance.</returns>
        /// <exception cref="aceGeneralException">Thrown when the global retry budget shared by all failed <c>table.Add</c> calls drops below zero.</exception>
        public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List <webLemmaTerm> lemmas)
        {
            // The weighting itself is delegated to the kernel selected by weightKernelName. The earlier inline
            // computation (max-normalized TF, log-scaled IDF, optional document-set factor, weight normalization)
            // that used to live here as commented-out code has been superseded by this call.
            kernelComputeWeightTask kernelTask   = new kernelComputeWeightTask(lemmas, logger, forSingleWebSite, settings);
            ITermWeightKernel       weightKernel = kernelManager.GetKernel(weightKernelName);
            weightKernel.compute(kernelTask);

            // Store the results. Each lemma gets retry_limit attempts; all lemmas share one global budget.
            Int32 sharedBudget = retry_global_limit;

            foreach (webLemmaTerm entry in lemmas)
            {
                Int32   attemptsLeft = retry_limit;
                Boolean stored       = false;

                while (!stored && (attemptsLeft > 0))
                {
                    try
                    {
                        table.Add(entry);
                        stored = true;
                    }
                    catch (Exception ex)
                    {
                        attemptsLeft--;
                        sharedBudget--;

                        if (doBeep)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + entry.name + "]  retries left [" + attemptsLeft + "] global[" + sharedBudget + "]");
                            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                        }

                        // Short pause before the next attempt.
                        Thread.Sleep(250);

                        if (sharedBudget < 0)
                        {
                            throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                        }
                    }
                }
            }

            logger.log("WFT [" + table.name + "] recomputed TFmax[" + kernelTask.weightMax + "] : DFmax[" + kernelTask.documentFrequencyMax + "]  TC[" + lemmas.Count + "]");

            return(table);
        }
예제 #16
0
        /// <summary>
        /// Computes a normalized weight for every lemma from its term, document and document-set frequencies,
        /// then stores the lemmas in the table.
        /// </summary>
        /// <param name="table">The table that receives the lemmas.</param>
        /// <param name="logger">The logger used for error, retry and summary messages.</param>
        /// <param name="forSingleWebSite">Selects how the document frequency maximum is adjusted and whether the document-set factor may be skipped.</param>
        /// <param name="lemmas">The lemmas to weight and store; their frequency and weight fields are overwritten.</param>
        /// <returns>The <paramref name="table"/> instance.</returns>
        /// <exception cref="aceGeneralException">Thrown when the global retry budget shared by all failed <c>table.Add</c> calls drops below zero.</exception>
        public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List <webLemmaTerm> lemmas)
        {
            if (lemmas.Count == 0)
            {
                logger.log("ERROR: NO ENTRIES IN TF-TDF TABLE [" + table.name + "] - is for single web site [" + forSingleWebSite.ToString() + "]");
            }

            // Step 1: find the maxima used for normalization.
            Double maxDocSetFreq = 0;
            Double maxDocFreq    = 0;
            Double maxTermFreq   = 0;

            foreach (webLemmaTerm entry in lemmas)
            {
                maxDocSetFreq = Math.Max(maxDocSetFreq, entry.documentSetFrequency);
                maxDocFreq    = Math.Max(maxDocFreq, entry.documentFrequency);
                maxTermFreq   = Math.Max(maxTermFreq, entry.termFrequency);
            }

            // Step 2: adjust the document frequency maximum. Only the single-web-site case with
            // doAdjustIDFForCase switched off skips the multiplication factor.
            if (forSingleWebSite && !settings.doAdjustIDFForCase)
            {
                maxDocFreq = maxDocFreq + settings.documentFrequencyMaxCorrection;
            }
            else
            {
                maxDocFreq = (maxDocFreq * settings.documentFrequencyMaxFactor) + settings.documentFrequencyMaxCorrection;
            }

            // Step 3: compute the non-normalized weight of every lemma.
            Double topWeight = Double.MinValue;

            foreach (webLemmaTerm entry in lemmas)
            {
                entry.termFrequency = entry.termFrequency.GetRatio(maxTermFreq);

                if (!settings.doUseIDF)
                {
                    entry.documentFactor = 1;
                }
                else if (settings.doUseNaturalLog)
                {
                    entry.documentFactor = Math.Log(maxDocFreq.GetRatio(entry.documentFrequency));
                }
                else
                {
                    entry.documentFactor = Math.Log10(maxDocFreq.GetRatio(entry.documentFrequency));
                }

                entry.weight = entry.termFrequency * entry.documentFactor;

                // The document-set factor is skipped only for a single web site whose maximum is exactly 1.
                if (settings.doUseDocumentSet && ((maxDocSetFreq != 1) || !forSingleWebSite))
                {
                    if (entry.documentSetFrequency == 0)
                    {
                        entry.weight = 0;
                    }
                    else
                    {
                        Double setFactor = (1 - Math.Log10(maxDocSetFreq / entry.documentSetFrequency));
                        entry.weight *= setFactor;
                    }
                }

                topWeight = Math.Max(topWeight, entry.weight);
            }

            // Step 4: normalize all weights against the largest one.
            lemmas.ForEach(entry => entry.weight = entry.weight.GetRatio(topWeight));

            // Step 5: store the lemmas. Each one gets retry_limit attempts; all share one global budget.
            Int32 sharedBudget = retry_global_limit;

            foreach (webLemmaTerm entry in lemmas)
            {
                Int32   attemptsLeft = retry_limit;
                Boolean stored       = false;

                while (!stored && (attemptsLeft > 0))
                {
                    try
                    {
                        table.Add(entry);
                        stored = true;
                    }
                    catch (Exception ex)
                    {
                        attemptsLeft--;
                        sharedBudget--;

                        if (doBeep)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + entry.name + "]  retries left [" + attemptsLeft + "] global[" + sharedBudget + "]");
                            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                        }

                        // Short pause before the next attempt.
                        Thread.Sleep(250);

                        if (sharedBudget < 0)
                        {
                            throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                        }
                    }
                }
            }

            logger.log("WFT [" + table.name + "] recomputed TFmax[" + topWeight + "] : DFmax[" + maxDocFreq + "]  TC[" + lemmas.Count + "]");

            return(table);
        }