/// <summary>
/// Gets the lemma-centered set containing all inflections that share the same lemma as <c>inflection</c>.
/// If <c>allInflections</c> is supplied, every inflected form present in the resulting set is removed
/// from that list in place (case-insensitive, invariant culture).
/// </summary>
/// <param name="inflection">The inflected form to resolve.</param>
/// <param name="allInflections">Optional list of pending inflections; matched forms are removed from it.</param>
/// <param name="logger">The logger.</param>
/// <returns>
/// The lemma set for the resolved inflection, or an empty <see cref="lexicGraphSetWithLemma"/>
/// when the inflection cannot be resolved or its lemma is not registered.
/// </returns>
public lexicGraphSetWithLemma GetLemmaSetForInflection(String inflection, List<String> allInflections = null, ILogBuilder logger = null)
{
    // Lazily load the lexical resource on first use.
    if (!isLoaded)
    {
        LoadLexicResource(logger, resourcePath);
    }

    lexicInflection inflect = resolve(inflection, logger);
    if (inflect == null)
    {
        return new lexicGraphSetWithLemma();
    }

    // FIX: the original used an unguarded indexer (registratedLemmaIndex[inflect.lemmaForm]),
    // which throws KeyNotFoundException if the lemma was never registered. Fall back to an
    // empty set instead, mirroring the unresolved-inflection path above.
    lexicGraphSetWithLemma inflectSet;
    if (!registratedLemmaIndex.TryGetValue(inflect.lemmaForm, out inflectSet))
    {
        return new lexicGraphSetWithLemma();
    }

    if (allInflections != null)
    {
        // Snapshot the keys first: RemoveAll mutates allInflections, and enumerating
        // inflectSet.Keys directly while other threads touch the set would be fragile.
        List<String> keys = inflectSet.Keys.ToList();
        foreach (String k in keys)
        {
            allInflections.RemoveAll(x => x.Equals(k, StringComparison.InvariantCultureIgnoreCase));
        }
    }

    resolveIfRequired(inflectSet.Values);

    return inflectSet;
}
/// <summary>
/// Loads the lexic resource into this index: reads the resource file line by line, registers each
/// inflected form, and builds the lemma-centered dictionary (<c>registratedLemmaIndex</c>).
/// No-op if the resource is already loaded.
/// </summary>
/// <param name="output">The output log builder; progress is reported in ~5% steps.</param>
/// <param name="resourceFilePath">The resource file path (fallback when the local cache is empty).</param>
public void LoadLexicResource(ILogBuilder output, String resourceFilePath)
{
    List<String> lines = new List<String>();

    // Already loaded — nothing to do.
    if (isLoaded) { return; }

    String pt = "";

    // Prefer the local cache when one is configured.
    if (!localCache.isNullOrEmpty())
    {
        pt = localCache;
        lines.AddRange(File.ReadLines(localCache));
    }

    // Fewer than 100 lines is treated as a missing/truncated cache — fall back to the
    // primary resource file. NOTE(review): the 100-line threshold is a heuristic; confirm.
    if (lines.Count < 100)
    {
        pt = resourceFilePath;
        lines = new List<string>();
        lines.AddRange(File.ReadAllLines(resourceFilePath));
    }

    Int32 i = 0;                         // total processed lines (Interlocked)
    Int32 iCycle = lines.Count() / 20;   // report progress roughly every 5%
    Int32 l = lines.Count();             // total line count, for the progress ratio
    Int32 c = 0;                         // lines since last progress report (Interlocked inc, reset under lock)
    Double p = 0;                        // last computed progress ratio

    output.logStartPhase("Loading", "Loading the lexic resource - with mode: " + mode.ToString());
    output.log("Start of loading lexic resource [" + pt + "]");

    // NOTE(review): MaxDegreeOfParallelism = 1 makes this effectively sequential; the locks and
    // Interlocked calls below only matter if that constraint is ever lifted.
    Parallel.ForEach(lines, new ParallelOptions { MaxDegreeOfParallelism = 1 }, (line) =>
    {
        string inflectForm = "";
        string lemma = "";
        string gramTag = "";

        // Split the resource line into its three fields.
        SelectFromLine(line, out inflectForm, out lemma, out gramTag);

        lexicInflection inflect = null;

        if (!inflectForm.isNullOrEmpty())
        {
            if (!ContainsKey(inflectForm))
            {
                // First occurrence of this inflected form — create and register it.
                inflect = new lexicInflection(line);
                inflect.lemmaForm = lemma;
                inflect.name = inflectForm;
                inflect.inflectedForm = inflectForm;
                inflect.lexicalDefinitionLine = line;

                // Register the alternate-spelling form (e.g. alphabet/script variant) when
                // a spell alternator is configured.
                if (spellAlternator.IsInitiated)
                {
                    String altInflectedForm = spellAlternator.ConvertFromAtoB(inflectForm);
                    spellAlternatives.GetOrAdd(altInflectedForm, inflectForm);
                }

                Add(inflectForm, inflect);
            }
            else
            {
                // Same inflected form seen before (different grammar case) — reuse it.
                inflect = base[inflectForm];
            }

            lexicGrammarCase gramCase = null;
            if (mode == textResourceIndexResolveMode.resolveOnLoad)
            {
                // Eager mode: parse the grammar tag now.
                var gramTagColl = grammTagConverter.ConvertFromString(gramTag);
                gramCase = inflect.AddGrammarCase(gramTagColl);
                gramCase.lexicalDefinitionLine = gramTag;
            }
            else
            {
                // Lazy mode: keep the raw tag string; it is resolved on demand later.
                gramCase = new lexicGrammarCase();
                gramCase.lexicalDefinitionLine = gramTag;
                gramCase.name = "gc" + i.ToString();
                inflect.Add(gramCase);
            }

            // --- construction of the lemma-centered dictionary ---
            // Double-checked locking: cheap unlocked probe, then re-check under the lock
            // before inserting a fresh set for this lemma.
            lexicGraphSetWithLemma lxSet = null;
            if (!registratedLemmaIndex.ContainsKey(lemma))
            {
                lock (LemmaIndexLock)
                {
                    if (!registratedLemmaIndex.ContainsKey(lemma))
                    {
                        lxSet = new lexicGraphSetWithLemma();
                        lxSet.lemmaForm = lemma;
                        registratedLemmaIndex.TryAdd(lemma, lxSet);
                    }
                }
            }
            lxSet = registratedLemmaIndex[lemma];

            // Same double-checked pattern for attaching the inflection to its lemma set.
            if (!lxSet.ContainsKey(inflectForm))
            {
                lock (SetLock)
                {
                    if (!lxSet.ContainsKey(inflectForm))
                    {
                        lxSet.TryAdd(inflect.name, inflect);
                    }
                }
            }

            Interlocked.Increment(ref c);
            Interlocked.Increment(ref i);

            // Periodic progress report; the reset of c happens under loadStatusLock so only
            // one worker emits the message per cycle.
            if (c > iCycle)
            {
                lock (loadStatusLock)
                {
                    if (c > iCycle)
                    {
                        c = 0;
                        p = i.GetRatio(l);
                        output.AppendLine("Done: _" + p.ToString("P2") + "_");
                    }
                }
            }
        }
    });

    output.logEndPhase();
    output.log("End of loading process");

    isLoaded = true;
}
/// <summary>
/// Constructs the webLemmaTable: resolves each indexed term from <c>counter</c> to its lemma set,
/// filters by the allowed POS types, accumulates weighted term/document frequencies, and either
/// recomputes TF-IDF or adds the lemmas directly to the table.
/// </summary>
/// <param name="tableName">Name of the table.</param>
/// <param name="parser">The text resource resolver used to map inflections to lemma sets.</param>
/// <param name="counter">The TF/DF counter holding the index forms and their occurrence containers.</param>
/// <param name="logger">The logger.</param>
/// <param name="table">Optional existing table; a new one is created when null. A non-empty table halts the build.</param>
/// <param name="forSingleWebSite">Flag forwarded to <c>recompute</c>; also reported in the halt message.</param>
/// <returns>The populated (or pre-existing, untouched) webLemmaTermTable.</returns>
protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
{
    if (table == null) { table = new webLemmaTermTable(tableName); }

    // Guard against rebuilding an already-populated table.
    if (table.Count > 0)
    {
        logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");

        // One-time audible warning; the Interlocked increment moves DoBeep past 1 so it never fires again.
        if (DoBeep == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
            Interlocked.Increment(ref DoBeep);
        }
        return(table);
    }

    List<String> tfdfList = counter.GetIndexForms();

    Int32 i = 0;                              // total forms consumed (accumulates d)
    Int32 c = 0;                              // forms consumed since last progress report
    Int32 li = 0;                             // loop-iteration counter, for the runaway guard
    Int32 limit = tfdfList.Count + 100;       // safety bound: abort if iterations exceed forms + slack

    if (!tableName.isNullOrEmpty()) { table.name = tableName; }

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

    Int32 startIndex = tfdfList.Count;
    Int32 cycleLength = startIndex / 5;       // progress report roughly every 20%

    // IMPORTANT: progress relies on a side effect — GetLemmaSetForInflection removes every
    // matched inflection from tfdfList. When it removes nothing (d == 0) the term is marked
    // unresolved and removed manually, so the loop always shrinks the list.
    while (tfdfList.Any())
    {
        String term = tfdfList.FirstOrDefault();
        Int32 d = tfdfList.Count;

        if (term != null)
        {
            lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);

            d = d - tfdfList.Count;           // d = number of forms the resolver consumed

            if (d == 0)
            {
                // Nothing matched: record as unresolved and drop the term ourselves.
                table.unresolved.Add(term);
                tfdfList.Remove(term);
                d = 1;
            }
            else
            {
                // POS-type filtering of the resolved lemma set.
                Boolean ok = true;
                if (settings.allowedLemmaTypes.Any())
                {
                    var tps = inflectSet.GetTagsFromGramTags<pos_type>(pos_type.none);

                    if (settings.strictPosTypePolicy)
                    {
                        if (!tps.ContainsAny(settings.allowedLemmaTypes)) { ok = false; }
                        else
                        {
                            // Strict policy additionally rejects any set containing a verb tag.
                            if (tps.Contains(pos_type.V)) { ok = false; }
                        }
                    }
                    else
                    {
                        if (!tps.ContainsAny(settings.allowedLemmaTypes)) { ok = false; } else { /* accepted */ }
                    }
                }
                else { /* no type restriction configured */ }

                if (ok)
                {
                    List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();     // distinct documents containing the lemma
                    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();   // distinct parent document sets

                    webLemmaTerm lemma = new webLemmaTerm();
                    lemma.nominalForm = inflectSet.lemmaForm;
                    lemma.name = inflectSet.lemmaForm;

                    Double documentFrequency = 0;   // NOTE(review): written but never read — documentFrequency below uses documents.Count
                    Double termFrequency = 0;

                    // Accumulate occurrences across every inflected form of the lemma.
                    foreach (lexicInflection inflect in inflectSet.Values)
                    {
                        TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);

                        if (cn != null)
                        {
                            lemma.AFreqPoints += cn.items.Count;
                            foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                            {
                                imbMCDocument document = cntPair.mcElement.GetParentOfType<imbMCDocument>();
                                documents.AddUnique(document);

                                imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                if (docSet != null)
                                {
                                    documentSet.AddUnique(docSet);
                                }
                                else
                                {
                                    // Orphan element — log it for diagnostics instead of counting it.
                                    logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                }

                                // Weight the occurrence by where it appeared (anchor/title/content).
                                if (cntPair.flagBag.Contains(cnt_containerType.link))
                                {
                                    termFrequency += settings.anchorTextFactor;
                                }
                                else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                {
                                    termFrequency += settings.titleTextFactor;
                                }
                                else
                                {
                                    termFrequency += settings.contentTextFactor;
                                }

                                cntPair.AddGraph(inflect);
                            }

                            lemma.otherForms.AddUnique(cn.indexForm);
                        }
                        else
                        {
                            // Form not present in the counter — still record it as a known variant.
                            lemma.otherForms.AddUnique(inflect.inflectedForm);
                        }
                    }

                    lemma.documentSetFrequency = documentSet.Count;
                    lemma.documentFrequency = documents.Count;
                    lemma.termFrequency = termFrequency;
                    lemmas.Add(lemma);
                }
                else { /* lemma rejected by POS filter */ }
            }
        }

        li++;
        i = i + d;
        c = c + d;
        d = startIndex - tfdfList.Count;   // d reused: total consumed so far, for the progress ratio

        if (c > cycleLength)
        {
            c = 0;
            logger.AppendLine();
            logger.log("TF-IDF processed: _" + d.GetRatio(startIndex).ToString("P2") + "_");
            logger.AppendLine();
        }

        // Runaway guard — bail out if the loop somehow fails to shrink the list.
        if (li > limit)
        {
            logger.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
            break;
        }
    }

    if (settings.doComputeTFIDF)
    {
        recompute(table, logger, forSingleWebSite, lemmas);
    }
    else
    {
        foreach (var le in lemmas)
        {
            table.Add(le);
        }
    }

    return(table);
}