/// <summary>
/// Explores an unknown term by consulting, in this order: Hunspell suggestions, the Apertium
/// dictionary, the named-entity database and the Serbian WordNet, followed by a final
/// grammar-discovery pass.
/// </summary>
/// <param name="term">The term to explore. It is trimmed before any lookup.</param>
/// <param name="loger">Log builder used for progress messages; nothing is logged when it is <c>null</c>.</param>
/// <param name="shortExplore">if set to <c>true</c> the method returns right after the first stage that produced a model.</param>
/// <param name="debug">if set to <c>true</c> (and a logger is supplied) extra messages are logged and intermediate models are saved.</param>
/// <param name="exploreModel">Overwritten with the Hunspell result before it is read, so a value supplied by the caller is ignored.</param>
/// <returns>
/// The registry entry or <c>GetModels(term)</c> result for the term, or a list holding one
/// temporary model when the exploration is considered failed.
/// </returns>
public List<termExploreModel> explore(string term, ILogBuilder loger, bool shortExplore = true, bool debug = true, termExploreModel exploreModel = null)
{
    term = term.Trim();

    // The logger is never reassigned below, so both checks can be evaluated once.
    bool canLog = loger != null;
    bool canDebugLog = debug && canLog;

    // ---- Shortcuts: already known, already reported missing, or a plain number
    if (modelRegistry.ContainsKey(term))
    {
        return modelRegistry[term];
    }

    if (missing.Contains(term))
    {
        return GetModels(term);
    }

    if (term.isNumber())
    {
        termExploreModel numeric = makeTempModel(term, pos_type.NUMnumerical);
        numeric.flags = termExploreItemEnumFlag.datapoint;
        if (canLog)
        {
            loger.AppendLine("Term [" + term + "] detected to be number.");
        }
        AddModel(numeric);
        return GetModels(term);
    }

    // ---- Hunspell: builds the working model and the list of forms queried in later stages
    // NOTE(review): the incoming exploreModel argument is discarded here - confirm that is intended.
    exploreModel = termExploreProcedures.exploreWithHunspell(new termExploreItem(term), loger);

    List<string> suggests = new List<string>();
    foreach (termExploreItem instance in exploreModel.instances)
    {
        suggests.Add(instance.inputForm);
    }
    suggests.Add(exploreModel.rootWord);

    // ---- Apertium
    apertiumDictionaryResult apertium = languageManagerApertium.manager.query(suggests, apertiumDictQueryScope.exact, apertiumDictNeedleSide.serbian);

    if (!apertium.Any())
    {
        if (canLog)
        {
            loger.AppendLine("Apertium failed to discover any information on [" + term + "]");
        }
    }
    else
    {
        List<termExploreItem> gramCheck = new List<termExploreItem>();

        if (apertium.termVsGramFlags.ContainsKey(exploreModel.inputForm))
        {
            exploreModel.gramSet.Add(new gramFlags(apertium.termVsGramFlags[exploreModel.inputForm]));
            if (exploreModel.lemma == null)
            {
                exploreModel.lemma = exploreModel.instances[exploreModel.inputForm];
            }
            gramCheck.Add(exploreModel);
            if (canDebugLog)
            {
                loger.AppendLine("Apertium discovered model [" + exploreModel.inputForm + "]");
            }
        }

        foreach (termExploreItem candidate in exploreModel.instances)
        {
            if (!apertium.termVsGramFlags.ContainsKey(candidate.inputForm))
            {
                continue;
            }

            candidate.gramSet.Add(new gramFlags(apertium.termVsGramFlags[candidate.inputForm]));
            // NOTE(review): the model itself is queued here, not the matching instance - confirm intent.
            gramCheck.Add(exploreModel);
            exploreModel.lemmaForm = candidate.inputForm;
            if (exploreModel.lemma == null)
            {
                exploreModel.lemma = candidate;
            }
            if (canDebugLog)
            {
                loger.AppendLine("Apertium discovered model [" + candidate.inputForm + "]");
            }
        }

        exploreModel.translations.AddRange(apertium.GetEnglish());

        // Drop entries whose part-of-speech type has no pattern registered in posConverter.
        gramCheck.RemoveAll(x => posConverter.posTypeVsPattern[x.gramSet.getPosType()].Count() == 0);

        int discovered = 0;
        foreach (termExploreItem queued in gramCheck)
        {
            if (discoverGram(queued, loger, debug))
            {
                discovered++;
            }
        }

        if (canLog)
        {
            loger.AppendLine("Gram [" + term + "] autodiscovered for [" + discovered + "] / [" + gramCheck.Count + "]");
        }

        if (canDebugLog)
        {
            exploreModel.ToString(loger, true, false);
            manager.constructor.saveTermModel(exploreModel, "Apertium_");
        }

        AddModel(exploreModel);
        exploreModel.flags = termExploreItemEnumFlag.aper;

        if (shortExplore)
        {
            return GetModels(term);
        }
    }

    if (canLog)
    {
        loger.consoleAltColorToggle();
    }

    // ---- Named entities
    foreach (string suggestion in suggests)
    {
        languageManagerDBNamedEntities.manager.exploreEntities(suggestion, exploreModel);
    }

    if (exploreModel.flags == termExploreItemEnumFlag.namedEntity)
    {
        AddModel(exploreModel);

        if (canDebugLog)
        {
            exploreModel.ToString(loger, true, false);
            manager.constructor.saveTermModel(exploreModel, "NamedEntity_");
            loger.AppendLine("Named entities discovered model [" + exploreModel.inputForm + "]:" + exploreModel.gramSet.ToString());
        }

        if (shortExplore)
        {
            return GetModels(term);
        }
    }
    else if (canDebugLog)
    {
        loger.AppendLine("Named entities found nothing for [" + exploreModel.inputForm + "]:" + exploreModel.gramSet.ToString());
    }

    if (canLog)
    {
        loger.consoleAltColorToggle();
    }

    // ---- Serbian WordNet
    wordnetSymsetResults wordnet = languageManagerWordnet.manager.query_srb(suggests, loger);
    bool found = false;

    if (wordnet.Any())
    {
        foreach (termExploreItem candidate in exploreModel.instances)
        {
            if (!wordnet.GetByKey(candidate.inputForm).Any())
            {
                continue;
            }

            exploreModel.lemma = candidate;
            exploreModel.lemmaForm = candidate.inputForm;
            exploreModel.translations.AddRange(wordnet.GetValues());
            exploreModel.synonyms.AddRange(wordnet.GetKeys());
            exploreModel.flags = termExploreItemEnumFlag.srWNet;
            found = true;
            candidate.gramSet.Add(new gramFlags(new Enum[] { wordnet.models[candidate.inputForm].gramSet.getPosType() }));
        }

        foreach (termExploreItem candidate in exploreModel.instances)
        {
            discoverGram(candidate, loger, debug);
        }
    }

    if (found)
    {
        if (canLog)
        {
            loger.AppendLine("SerbianWordNet discovered model [" + term + "]:" + exploreModel.gramSet.ToString());
        }

        if (canDebugLog)
        {
            exploreModel.ToString(loger, true, false);
            manager.constructor.saveTermModel(exploreModel, "SrWordNet_");
        }

        AddModel(exploreModel);
        exploreModel.flags = termExploreItemEnumFlag.srWNet;

        if (shortExplore)
        {
            return GetModels(term);
        }
    }
    else if (canDebugLog)
    {
        loger.AppendLine("Serbian wordnet found nothing for [" + term + "]");
    }

    // ---- Last check
    bool failed = discoverGram(exploreModel, loger, debug);
    foreach (termExploreItem candidate in exploreModel.instances)
    {
        discoverGram(candidate, loger, debug);
    }

    // NOTE(review): lastCheck is created empty and never filled, so the loop below never runs,
    // the counter stays 0 and 'failed' is always forced to true - the success path at the end
    // of the method is therefore unreachable as written. Confirm what was meant to be checked.
    int lastDiscovered = 0;
    List<termExploreItem> lastCheck = new List<termExploreItem>();
    foreach (termExploreItem pending in lastCheck)
    {
        if (discoverGram(pending, loger, debug))
        {
            lastDiscovered++;
        }
    }

    if (canDebugLog)
    {
        loger.AppendLine("The last check [" + term + "] autodiscovered for [" + lastDiscovered + "] / [" + lastCheck.Count + "]");
    }

    if (lastDiscovered == 0)
    {
        failed = true;
    }

    if (canLog)
    {
        loger.consoleAltColorToggle();
    }

    // ---- Outcome
    if (failed)
    {
        if (canDebugLog)
        {
            loger.AppendLine("Exploration failed for [" + term + "] -- creating temporary term model");
        }

        List<termExploreModel> output = new List<termExploreModel>();
        output.Add(makeTempModel(term, pos_type.TEMP));
        missing.Add(term);
        return output;
    }

    exploreModel.flags = termExploreItemEnumFlag.termExplorer;
    AddModel(exploreModel);
    return GetModels(term);
}
/// <summary>
/// The stage two exploration: loads the stored term models for a lemma and enriches each one
/// with <c>getSynonymsWithApertium</c> followed by <c>getSynonymsWithWordnetViaApertium</c>.
/// </summary>
/// <param name="lemma">The lemma whose stored models are loaded.</param>
/// <param name="response">Not used by this method; kept for interface compatibility.</param>
/// <param name="savemodel">if set to <c>true</c> the concept graph description is saved and the model is either persisted or, when its exploration failed, added to <c>failedModels</c>.</param>
/// <param name="debug">if set to <c>true</c> the per-model log is written to a <c>*_log.md</c> file in the logs folder.</param>
/// <param name="verbose">if set to <c>true</c> the per-model log builder is attached to the console output while the model is processed.</param>
/// <param name="task">Optional task; when supplied its title is stored as <c>lastModifiedByStage</c>.</param>
/// <returns>The loaded model set; when it is empty the lemma is recorded in <c>missingLemmas</c>.</returns>
public static termExploreModelSet exploreStageTwo(string lemma, ILogBuilder response, bool savemodel, bool debug, bool verbose, lexiconTaskBase task = null)
{
    lexiconConstructor constructor = semanticLexiconManager.manager.constructor;
    termExploreModelSet outset = constructor.loadTermModels(lemma, true);

    if (!Enumerable.Any(outset))
    {
        outset.missingLemmas.Add(lemma);
        return outset;
    }

    foreach (termExploreModel mod in outset)
    {
        builderForLog logout = new builderForLog();
        termExploreModel model;

        if (verbose)
        {
            aceLog.consoleControl.setAsOutput(logout, "stage2");
        }

        try
        {
            model = getSynonymsWithApertium(mod, logout);

            // Captured before the WordNet pass, exactly as the file names below expect.
            string pt = model.lemma.gramSet.getPosType().ToString();

            model = getSynonymsWithWordnetViaApertium(model, logout, true, false);

            if (savemodel)
            {
                model.graph.saveDescription(constructor.projectFolderStructure[lexiconConstructorProjectFolder.logs].path, pt + "_concepts");
            }

            model.PostProcess();

            if (debug)
            {
                model.ToString(logout, true, true);
                string fn = model.lemma.inputForm + "_" + pt + "_log.md";
                logout.ToString(false).saveStringToFile(constructor.projectFolderStructure[lexiconConstructorProjectFolder.logs].pathFor(fn), getWritableFileMode.overwrite);
            }
        }
        finally
        {
            // Detach the per-model log even when a step above throws, so a failure does not
            // leave this builder registered as console output.
            if (verbose)
            {
                aceLog.consoleControl.removeFromOutput(logout);
            }
        }

        if (savemodel)
        {
            if (task != null)
            {
                model.lastModifiedByStage = task.taskTitle;
            }
            else
            {
                model.lastModifiedByStage = "stageTwo-exploreProcedure";
            }

            if (!model.wasExploreFailed)
            {
                constructor.saveTermModel(model);
            }
            else
            {
                outset.failedModels.Add(model);
            }
        }
    }

    return outset;
}
/// <summary>
/// Explores a single word with the procedure selected by <paramref name="mode"/> and returns
/// the post-processed term model.
/// </summary>
/// <param name="word">The word to explore.</param>
/// <param name="response">Log builder for the header, lemma line and final model dump; may be <c>null</c>.</param>
/// <param name="mode">Selects which exploration procedure is run; <c>none</c> runs no procedure.</param>
/// <param name="verbose">if set to <c>true</c> the selected procedure also logs to <paramref name="response"/>; otherwise it receives <c>null</c>.</param>
/// <returns>The explored model, after <c>PostProcess()</c>.</returns>
public static termExploreModel explore(string word, ILogBuilder response, termExploreMode mode, bool verbose = false)
{
    termExploreModel model = new termExploreModel(word);
    termExploreModelSet stored = semanticLexiconManager.manager.constructor.loadTermModels(word, true);

    bool hasLog = response != null;

    if (hasLog)
    {
        response.consoleAltColorToggle();
        response.AppendHorizontalLine();
        response.AppendLine("Exploring term[" + model.inputForm + "] with [" + mode.ToString() + "]");
        response.consoleAltColorToggle();
    }

    if (!Enumerable.Any(stored))
    {
        model.lemmaForm = "";
        if (hasLog)
        {
            response.AppendLine("term[" + word + "]->missingLemma[]");
        }
    }
    else
    {
        // A stored model replaces the freshly created one.
        model = Enumerable.First(stored);
        if (hasLog)
        {
            response.AppendLine("term[" + model.inputForm + "]->lemma[" + model.lemma.inputForm + "]");
        }
    }

    // The procedures only get a logger in verbose mode; the summary at the end always uses 'response'.
    ILogBuilder stepLog = verbose ? response : null;

    switch (mode)
    {
        case termExploreMode.apertium_direct:
            model = getSynonymsWithApertium(model, stepLog);
            break;

        case termExploreMode.apertium_wordnet_eng:
            model = getSynonymsWithWordnetViaApertium(model, stepLog);
            break;

        case termExploreMode.apertium_wordnet_srb:
            model = getSynonymsWithSerbianWordNetAndApertium(model, stepLog);
            break;

        case termExploreMode.corpus:
            model = getSynonymsByCorpus(model, stepLog);
            break;

        case termExploreMode.hunspell_srb:
            model = getSynonymsWithHunspell(model, stepLog);
            break;

        case termExploreMode.wordnet_srb:
            model = getSynonymsWithSerbianWordNet(model, stepLog);
            break;

        case termExploreMode.unitex:
            model = exploreWithUnitex(word, stepLog);
            break;

        case termExploreMode.none:
            // nothing to run
            break;
    }

    model.PostProcess();

    if (hasLog)
    {
        model.ToString(response, verbose, false);
    }

    return model;
}
/// <summary>
/// Builds a term model out from Word input, using the Unitex DELAF operator: resolves the
/// lemma, reads the lemma's grammatical declaration and collects the lemma's instances.
/// </summary>
/// <param name="word">The word.</param>
/// <param name="response">Log builder for progress output; nothing is logged when it is <c>null</c>.</param>
/// <param name="wordIsLemma">if set to <c>true</c> the word is used as the lemma and the lemma search is skipped.</param>
/// <returns>
/// The model already held by the lexicon manager when <c>resolve(word)</c> returns a match;
/// otherwise a new model, with <c>wasExploreFailed</c> set when no lemma definition was found.
/// </returns>
public static termExploreModel exploreWithUnitex(string word, ILogBuilder response, bool wordIsLemma = false)
{
    termExploreModel output = new termExploreModel();
    output.modelSource = termExploreModelSource.fromToken;
    output.inputForm = word;

    string lemma = word;

    var tls = semanticLexiconManager.manager.resolve(word);
    if (tls != null && Enumerable.Any(tls))
    {
        if (response != null)
        {
            response.AppendLine("#1 Lemma already defined in the triplestore [" + word + "] ");
        }
        output = semanticLexiconManager.manager.constructor.getTermModel(Enumerable.First(tls));
        return output;
    }

    // <------------------------------------------------------------ 1. finding the lemma
    if (!wordIsLemma)
    {
        if (response != null)
        {
            response.AppendLine("#1 Finding Lemma for [" + word + "] ");
        }

        string query = string.Format(posConverter.REGEX_UNITEX_InstanceToLemmaFormat, word);
        fileTextSearchResult reslt = languageManagerUnitex.manager.operatorDelaf.Search(query, true, 1, RegexOptions.IgnoreCase);

        if (response != null)
        {
            reslt.ToString(response, true);
        }

        if (reslt.Count() > 0)
        {
            Regex instanceToLemmaReg = new Regex(query);
            var lnp = reslt.First();
            Match mch = instanceToLemmaReg.Match(lnp.Value);

            // The search above ignores case while this regex does not, so a search hit is not
            // guaranteed to match here. On a failed match Groups[1].Value is an empty string;
            // keep the word as the lemma instead of replacing it with "".
            if (mch.Success)
            {
                lemma = mch.Groups[1].Value;
            }
        }
    }
    else
    {
        if (response != null)
        {
            response.AppendLine("#1 The word is trusted to be a lemma [" + word + "] - skipping search");
        }
    }

    // <------------------------------------------------------------ preparing cache
    var cache = languageManagerUnitex.manager.operatorDelaf.Search(lemma, false, 300);
    if (response != null)
    {
        response.AppendLine("Cached definitions [" + cache.Count() + "] ");
    }

    // <------------------------------------------------------------ 2. finding lemma definition
    output.lemmaForm = lemma;
    output.lemma = new termExploreItem(lemma);

    if (response != null)
    {
        response.AppendLine("#2 Finding Lemma definition [" + lemma + "] ");
    }

    string lemmaQuery = string.Format(posConverter.REGEX_UNITEX_DeclarationForLemma, lemma);
    Regex lemmaQueryRegex = new Regex(lemmaQuery);
    fileTextSearchResult lemmaResult = languageManagerUnitex.manager.operatorDelaf.Search(cache, lemmaQuery, true, 5, RegexOptions.IgnoreCase);

    if (response != null)
    {
        lemmaResult.ToString(response, true);
    }

    if (lemmaResult.Count() == 0)
    {
        if (response != null)
        {
            response.consoleAltColorToggle();
            response.AppendLine("Failed to find lemma definition for [" + word + "]. Aborting exploration.");
            response.consoleAltColorToggle();
        }
        output.wasExploreFailed = true;
        return output;
    }

    foreach (var lr_lnp in lemmaResult)
    {
        Match lmch = lemmaQueryRegex.Match(lr_lnp.Value);
        if (lmch.Success)
        {
            output.lemma.gramSet.Add(lmch.Groups[1].Value);
        }
    }

    if (response != null)
    {
        output.lemma.ToString(response);
    }

    // <------------------------------------------------------------ 3. getting all instances for the lemma
    if (response != null)
    {
        response.AppendLine("#3 Extracting all instances for the Lemma [" + lemma + "] ");
    }

    string instanceQuery = string.Format(posConverter.REGEX_UNITEX_LemmaToInstanceFormat, lemma);
    string instanceUnitexQuery = "," + lemma + ".";
    Regex instanceQueryRegex = new Regex(instanceQuery);
    fileTextSearchResult instanceResult = languageManagerUnitex.manager.operatorDelaf.Search(cache, instanceUnitexQuery, false, 100, RegexOptions.IgnoreCase);

    if (response != null)
    {
        instanceResult.ToString(response, true);
    }

    foreach (var lr_lnp in instanceResult)
    {
        Match lmch = instanceQueryRegex.Match(lr_lnp.Value);

        // Same guard as the lemma-definition loop: a line found by the plain-text search that
        // the regex does not match would otherwise be added as an instance with empty values.
        if (lmch.Success)
        {
            output.instances.Add(lmch.Groups[1].Value, lmch.Groups[2].Value);
        }
    }

    // <------------------------------------------------------------ 4. Resulting term model
    if (response != null)
    {
        response.AppendLine("#4 Resulting term model [" + lemma + "] ");
        output.ToString(response);
    }

    return output;
}
/// <summary>
/// Explores a term with Hunspell: asks the Serbian Hunspell engine for suggestions and derives
/// a root word, a lemma form and a list of instances from them.
/// </summary>
/// <param name="item">The item whose <c>inputForm</c> is explored.</param>
/// <param name="log">Log builder that receives the resulting model dump; skipped when <c>null</c>.</param>
/// <returns>
/// A new model. When Hunspell gives no suggestions, lemma form, root word and input form are
/// all set to the item's input form and no instances are added.
/// </returns>
public static termExploreModel exploreWithHunspell(this termExploreItem item, ILogBuilder log)
{
    termExploreModel output = new termExploreModel();

    // Suggestions come back with escaped hyphens ("\-"); normalise them to plain "-".
    List<string> rawSuggestions = imbLanguageFrameworkManager.serbian.basic.hunspellEngine.Suggest(item.inputForm);
    List<string> suggest = new List<string>(rawSuggestions.Count);
    foreach (string raw in rawSuggestions)
    {
        suggest.Add(raw.Replace("\\-", "-"));
    }

    if (Enumerable.Any(suggest))
    {
        int min_l = Enumerable.Min(suggest, x => x.Length);
        List<string> possibleTerm = new List<string>();

        // Difference between the shortest suggestion and the input form.
        int tocut = min_l - item.inputForm.Length;
        string start = item.inputForm;
        if (tocut != 0)
        {
            // NOTE(review): 'substring' is a project extension; its handling of this offset is
            // not visible from here.
            start = start.substring(tocut);
        }

        string rootComposite = "";
        int rootCompositeSplit = 0;

        foreach (string sug in suggest)
        {
            if (sug.Contains(" "))
            {
                continue; // multi-word suggestions are ignored in this pass
            }

            if (sug.Contains("-"))
            {
                // Hyphenated suggestion: the part before the first hyphen is a root candidate;
                // the candidate with the furthest hyphen position wins.
                int rcSplit = sug.IndexOf('-');
                if (rcSplit > rootCompositeSplit)
                {
                    rootCompositeSplit = rcSplit;
                    rootComposite = sug.Substring(0, rootCompositeSplit).Trim('-');
                }
            }
            else if (sug.StartsWith(start))
            {
                possibleTerm.Add(sug);
            }
        }

        if (tocut == 0 && possibleTerm.Count == 0)
        {
            possibleTerm.AddRange(suggest);
        }

        possibleTerm.Add(item.inputForm);

        if (rootCompositeSplit == 0)
        {
            // No hyphenated root found: use the candidate selected by MinItem over length.
            rootComposite = possibleTerm.MinItem(x => x.Length);
        }

        // Second pass: keep only candidates containing the root; the shortest one is the lemma form.
        suggest = possibleTerm.Clone();
        possibleTerm.Clear();

        string lemmaForm = "";
        foreach (string sug in suggest)
        {
            if (sug.Contains(rootComposite, StringComparison.CurrentCultureIgnoreCase))
            {
                possibleTerm.Add(sug);
                if (lemmaForm.isNullOrEmpty())
                {
                    lemmaForm = sug;
                }
                if (sug.Length < lemmaForm.Length)
                {
                    lemmaForm = sug;
                }
            }
        }

        output.lemmaForm = lemmaForm;
        output.rootWord = rootComposite;
        output.inputForm = item.inputForm;

        foreach (string sug in possibleTerm)
        {
            output.instances.Add(sug);
        }
    }
    else
    {
        output.lemmaForm = item.inputForm;
        output.rootWord = item.inputForm;
        output.inputForm = item.inputForm;
    }

    // Callers may pass a null logger; dump the model only when there is somewhere to write it.
    if (log != null)
    {
        output.ToString(log);
    }

    return output;
}