Пример #1
0
        /// <summary>
        /// It will get all inflections of the same lemma, if <c>allInflections</c> is supplied, it will remove all matched inflectional form from the list.
        /// </summary>
        /// <param name="inflection">The inflection.</param>
        /// <param name="allInflections">All inflections.</param>
        /// <param name="logger">The logger.</param>
        /// <returns></returns>
        public lexicGraphSetWithLemma GetLemmaSetForInflection(String inflection, List <String> allInflections = null, ILogBuilder logger = null)
        {
            if (!isLoaded)
            {
                LoadLexicResource(logger, resourcePath);
            }

            lexicInflection inflect = resolve(inflection, logger);

            if (inflect == null)
            {
                return(new lexicGraphSetWithLemma());
            }

            lexicGraphSetWithLemma inflectSet = registratedLemmaIndex[inflect.lemmaForm];
            List <String>          keys       = inflectSet.Keys.ToList();

            if (allInflections != null)
            {
                foreach (String k in keys)
                {
                    allInflections.RemoveAll(x => x.Equals(k, StringComparison.InvariantCultureIgnoreCase));
                }
            }

            resolveIfRequired(inflectSet.Values);

            return(inflectSet);
        }
Пример #2
0
        /// <summary>
        /// Loads the lexic resource.
        /// </summary>
        /// <param name="output">The output.</param>
        /// <param name="resourceFilePath">The resource file path.</param>
        public void LoadLexicResource(ILogBuilder output, String resourceFilePath)
        {
            List <String> lines = new List <String>();

            // <---------------------------------------------- [
            if (isLoaded)
            {
                return;
            }
            String pt = "";

            if (!localCache.isNullOrEmpty())
            {
                pt = localCache;
                lines.AddRange(File.ReadLines(localCache));
            }

            if (lines.Count < 100)
            {
                pt    = resourceFilePath;
                lines = new List <string>();
                lines.AddRange(File.ReadAllLines(resourceFilePath));
            }

            Int32  i      = 0;
            Int32  iCycle = lines.Count() / 20;
            Int32  l      = lines.Count();
            Int32  c      = 0;
            Double p      = 0;

            output.logStartPhase("Loading", "Loading the lexic resource - with mode: " + mode.ToString());
            output.log("Start of loading lexic resource [" + pt + "]");
            //   Parallel.ForEach(lines, new ParallelOptions { MaxDegreeOfParallelism=1 }, (line) =>

            Parallel.ForEach(lines, new ParallelOptions {
                MaxDegreeOfParallelism = 1
            }, (line) =>
                             //  Parallel.ForEach(lines, (line) =>
            {
                string inflectForm = "";
                string lemma       = "";
                string gramTag     = "";

                SelectFromLine(line, out inflectForm, out lemma, out gramTag);

                lexicInflection inflect = null;

                if (!inflectForm.isNullOrEmpty())
                {
                    if (!ContainsKey(inflectForm))
                    {
                        inflect                       = new lexicInflection(line);
                        inflect.lemmaForm             = lemma;
                        inflect.name                  = inflectForm;
                        inflect.inflectedForm         = inflectForm;
                        inflect.lexicalDefinitionLine = line;

                        if (spellAlternator.IsInitiated)
                        {
                            String altInflectedForm = spellAlternator.ConvertFromAtoB(inflectForm);
                            spellAlternatives.GetOrAdd(altInflectedForm, inflectForm);
                        }

                        Add(inflectForm, inflect);
                    }
                    else
                    {
                        inflect = base[inflectForm];
                    }

                    lexicGrammarCase gramCase = null;

                    if (mode == textResourceIndexResolveMode.resolveOnLoad)
                    {
                        var gramTagColl = grammTagConverter.ConvertFromString(gramTag);

                        gramCase = inflect.AddGrammarCase(gramTagColl);
                        gramCase.lexicalDefinitionLine = gramTag;
                    }
                    else
                    {
                        gramCase = new lexicGrammarCase();
                        gramCase.lexicalDefinitionLine = gramTag;
                        gramCase.name = "gc" + i.ToString();
                        inflect.Add(gramCase);
                    }

                    // <----------------- construction of Lemma centered dictionary

                    lexicGraphSetWithLemma lxSet = null;

                    if (!registratedLemmaIndex.ContainsKey(lemma))
                    {
                        lock (LemmaIndexLock)
                        {
                            if (!registratedLemmaIndex.ContainsKey(lemma))
                            {
                                lxSet           = new lexicGraphSetWithLemma();
                                lxSet.lemmaForm = lemma;
                                registratedLemmaIndex.TryAdd(lemma, lxSet);
                            }
                        }
                    }

                    lxSet = registratedLemmaIndex[lemma];

                    if (!lxSet.ContainsKey(inflectForm))
                    {
                        lock (SetLock)
                        {
                            if (!lxSet.ContainsKey(inflectForm))
                            {
                                lxSet.TryAdd(inflect.name, inflect);
                            }
                        }
                    }

                    Interlocked.Increment(ref c);
                    Interlocked.Increment(ref i);
                    if (c > iCycle)
                    {
                        lock (loadStatusLock)
                        {
                            if (c > iCycle)
                            {
                                c = 0;
                                p = i.GetRatio(l);
                                output.AppendLine("Done: _" + p.ToString("P2") + "_");
                            }
                        }
                    }
                }
            });

            output.logEndPhase();
            output.log("End of loading process");
            isLoaded = true;
        }
Пример #3
0
        /// <summary>
        /// Constructs the webLemmaTable
        /// </summary>
        /// <param name="tableName">Name of the table.</param>
        /// <param name="parser">The parser.</param>
        /// <param name="counter">The counter.</param>
        /// <param name="logger">The logger.</param>
        /// <returns></returns>
        protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
        {
            if (table == null)
            {
                table = new webLemmaTermTable(tableName);
            }

            if (table.Count > 0)
            {
                logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }


            List <String> tfdfList = counter.GetIndexForms();
            Int32         i        = 0;
            Int32         c        = 0;
            Int32         li       = 0;
            Int32         limit    = tfdfList.Count + 100;



            if (!tableName.isNullOrEmpty())
            {
                table.name = tableName;
            }


            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 startIndex  = tfdfList.Count;
            Int32 cycleLength = startIndex / 5;

            while (tfdfList.Any())
            {
                String term = tfdfList.FirstOrDefault();
                Int32  d    = tfdfList.Count;

                if (term != null)
                {
                    lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
                    d = d - tfdfList.Count;
                    if (d == 0)
                    {
                        table.unresolved.Add(term);
                        tfdfList.Remove(term);
                        d = 1;
                    }
                    else
                    {
                        Boolean ok = true;

                        if (settings.allowedLemmaTypes.Any())
                        {
                            var tps = inflectSet.GetTagsFromGramTags <pos_type>(pos_type.none);

                            if (settings.strictPosTypePolicy)
                            {
                                if (!tps.ContainsAny(settings.allowedLemmaTypes))
                                {
                                    ok = false;
                                }
                                else
                                {
                                    if (tps.Contains(pos_type.V))
                                    {
                                        ok = false;
                                    }
                                    //foreach (pos_type t in tps)
                                    //{
                                    //    if (!settings.allowedLemmaTypes.Contains(t))
                                    //    {
                                    //        ok = false;
                                    //        break;
                                    //    }
                                    //}
                                }
                            }
                            else
                            {
                                if (!tps.ContainsAny(settings.allowedLemmaTypes))
                                {
                                    ok = false;
                                }
                                else
                                {
                                }
                            }
                        }
                        else
                        {
                        }


                        if (ok)
                        {
                            List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                            List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();



                            webLemmaTerm lemma = new webLemmaTerm();
                            lemma.nominalForm = inflectSet.lemmaForm;
                            lemma.name        = inflectSet.lemmaForm;


                            Double documentFrequency = 0;
                            Double termFrequency     = 0;

                            foreach (lexicInflection inflect in inflectSet.Values)
                            {
                                TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
                                if (cn != null)
                                {
                                    lemma.AFreqPoints += cn.items.Count;
                                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                                    {
                                        imbMCDocument document = cntPair.mcElement.GetParentOfType <imbMCDocument>();
                                        documents.AddUnique(document);

                                        imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                        if (docSet != null)
                                        {
                                            documentSet.AddUnique(docSet);
                                        }
                                        else
                                        {
                                            logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                        }

                                        if (cntPair.flagBag.Contains(cnt_containerType.link))
                                        {
                                            termFrequency += settings.anchorTextFactor;
                                        }
                                        else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                        {
                                            termFrequency += settings.titleTextFactor;
                                        }
                                        else
                                        {
                                            termFrequency += settings.contentTextFactor;
                                        }

                                        cntPair.AddGraph(inflect);
                                    }

                                    lemma.otherForms.AddUnique(cn.indexForm);
                                }
                                else
                                {
                                    lemma.otherForms.AddUnique(inflect.inflectedForm);
                                }
                            }
                            lemma.documentSetFrequency = documentSet.Count;
                            lemma.documentFrequency    = documents.Count;
                            lemma.termFrequency        = termFrequency;
                            lemmas.Add(lemma);
                            //table.Add(lemma);
                        }
                        else
                        {
                        }
                    }
                }
                li++;
                i = i + d;
                c = c + d;
                d = startIndex - tfdfList.Count;

                if (c > cycleLength)
                {
                    c = 0;
                    logger.AppendLine();
                    logger.log("TF-IDF processed: _" + d.GetRatio(startIndex).ToString("P2") + "_");
                    logger.AppendLine();
                }

                if (li > limit)
                {
                    logger.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
                    break;
                }
            }


            if (settings.doComputeTFIDF)
            {
                recompute(table, logger, forSingleWebSite, lemmas);
            }
            else
            {
                foreach (var le in lemmas)
                {
                    table.Add(le);
                }
            }


            //  table.ReadOnlyMode = true;

            return(table);
        }