public void Deserialize(BinaryReader binRead)
{
    lsett = new LemmatizerSettings(binRead);

    bool bSerializeExamples = binRead.ReadBoolean();
    elExamples = new ExampleList(binRead, lsett);

    ExampleList elExamplesRear;
    ExampleList elExamplesFront;
    if (bSerializeExamples)
    {
        elExamplesRear = elExamples.GetFrontRearExampleList(false);
        elExamplesFront = elExamples.GetFrontRearExampleList(true);
    }
    else
    {
        elExamplesRear = new ExampleList(binRead, lsett);
        elExamplesFront = new ExampleList(binRead, lsett);
    }

    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
    }
}
public Lemmatizer(LemmatizerSettings lsett)
{
    this.lsett = lsett;
    this.elExamples = new ExampleList(lsett);
    this.ltnRootNode = null;
    this.ltnRootNodeFront = null;
}
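A minimal usage sketch of the training workflow pieced together from the methods in this listing (the constructor, AddExample, BuildModel). The settings values, the example data, and the single-argument Lemmatize overload at the end are assumptions for illustration, not taken from this listing.

// Hypothetical usage sketch; the parameterless LemmatizerSettings constructor
// and the Lemmatize(string) overload are assumed.
var settings = new LemmatizerSettings();
var lemmatizer = new Lemmatizer(settings);

// Train from (word, lemma, weight, MSD) examples.
lemmatizer.AddExample("walking", "walk", 1.0, "Vmpp");
lemmatizer.AddExample("walked", "walk", 1.0, "Vmis");

// Build the rule tree once all examples have been added.
lemmatizer.BuildModel();

// Look up a lemma (assuming a Lemmatize(string) overload exists on this class).
string lemma = lemmatizer.Lemmatize("walks");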
public string Lemmatize(string sWord, bool ignoreCase, string sMsd)
{
    if (sWord.Length >= iSimilarity && dictSubNodes != null)
    {
        // try the correct casing first
        char chChar = sWord.Length > iSimilarity ? sWord[sWord.Length - 1 - iSimilarity] : '\0';
        if (dictSubNodes.ContainsKey(chChar) && dictSubNodes[chChar].ConditionSatisfied(sWord, ignoreCase, sMsd))
        {
            return dictSubNodes[chChar].Lemmatize(sWord, ignoreCase, sMsd);
        }

        // also try the inverted casing
        if (ignoreCase && char.IsLetter(chChar))
        {
            char chCharInvert = char.IsLower(chChar) ? char.ToUpper(chChar) : char.ToLower(chChar);
            if (dictSubNodes.ContainsKey(chCharInvert) && dictSubNodes[chCharInvert].ConditionSatisfied(sWord, ignoreCase, sMsd))
            {
                return dictSubNodes[chCharInvert].Lemmatize(sWord, ignoreCase, sMsd);
            }
        }
    }

    if (lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct && sMsd != null)
    {
        LemmaRule lrBestValid = null;
        LemmaTreeNode ltnValid = this;
        bool useNoMsd = false;
        while (lrBestValid == null && useNoMsd == false)
        {
            if (ltnValid.dictMsdBestRules.ContainsKey(sMsd))
            {
                lrBestValid = ltnValid.dictMsdBestRules[sMsd][0].Rule;
            }
            else
            {
                if (ltnValid.ltnParentNode != null)
                {
                    ltnValid = ltnValid.ltnParentNode;
                }
                else
                {
                    useNoMsd = true;
                }
            }
        }

        if (useNoMsd)
        {
            return ltnValid.lrBestRule.Lemmatize(sWord);
        }
        else
        {
            return lrBestValid.Lemmatize(sWord);
        }
    }
    else
    {
        return lrBestRule.Lemmatize(sWord);
    }
}
public void BuildModel()
{
    if (ltnRootNode != null)
    {
        return;
    }

    //TODO remove: elExamples.FinalizeAdditions();
    ltnRootNode = new LemmaTreeNode(lsett, elExamples);
}
/// <summary>
/// Builds a lemma-tree node for the group of examples in the [iStart, iEnd] range.
/// </summary>
/// <param name="lsett"></param>
/// <param name="elExamples"></param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode"></param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode)
    : this(lsett)
{
    this.ltnParentNode = ltnParentNode;
    dictSubNodes = null;
    this.iStart = iStart;
    this.iEnd = iEnd;
    this.elExamples = elExamples;

    if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
    {
        lrBestRule = elExamples.Rules.DefaultRule;
        aBestRules = new RuleWeighted[1];
        aBestRules[0] = new RuleWeighted(lrBestRule, 0);
        dWeight = 0;
        return;
    }

    int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
    sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
    iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
    bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;

    FindBestRules();
    AddSubAll();

    //TODO check this heuristic; it can be problematic when there are multiple applicable rules
    if (dictSubNodes != null)
    {
        List<KeyValuePair<char, LemmaTreeNode>> lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in dictSubNodes)
        {
            if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
            {
                IEnumerator<LemmaTreeNode> enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
                enumChildChild.MoveNext();
                LemmaTreeNode ltrChildChild = enumChildChild.Current;
                if (kvpChild.Value.lrBestRule == lrBestRule)
                {
                    lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
                }
            }
        }
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in lReplaceNodes)
        {
            dictSubNodes[kvpChild.Key] = kvpChild.Value;
            kvpChild.Value.ltnParentNode = this;
        }
    }
}
public void Load(Latino.BinarySerializer binRead)
{
    lsett = new LemmatizerSettings(binRead);
    elExamples = new ExampleList(binRead, lsett);
    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false), null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
    }
}
public void BuildModel(string msdSpec, MsdSplitTree.BeamSearchParams beamSearchOpt)
{
    if (ltnRootNode != null)
    {
        return;
    }

    // if MSDs are used and the other criteria are fulfilled, apply the MSD split-tree optimization
    if (lsett.bUseMsdSplitTreeOptimization &&
        lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct &&
        !string.IsNullOrEmpty(msdSpec))
    {
        msdSplitTree = new MsdSplitTree(elExamples.ListExamples, new MsdSpec(msdSpec), beamSearchOpt);
        //Console.WriteLine("MsdSplitTree constructed with {0} leaves!", msdSplitTree.subTreeSizeRecurs);

        ExampleList el = elExamples;
        elExamples = new ExampleList(lsett);

        //int s = 0;
        Dictionary<string, double> msds = new Dictionary<string, double>();
        foreach (LemmaExample le in el.ListExamples)
        {
            //Console.WriteLine("{0}: {1}", s++, le.Msd);
            string newMsd = msdSplitTree.TransformMsd(le.Msd);
            elExamples.AddExample(le.Word, le.Lemma, le.Weight, newMsd);
            //Console.WriteLine("\t" + newMsd);

            if (msds.ContainsKey(newMsd))
            {
                msds[newMsd] += le.Weight;
            }
            else
            {
                msds[newMsd] = le.Weight;
            }
        }

        foreach (KeyValuePair<string, double> msd in msds)
        {
            //Console.WriteLine("{0} {1}", msd.Key, msd.Value);
        }

        //TODO problem: if BuildModel is called twice, a problem occurs!
    }

    elExamples.FinalizeAdditions();

    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
        ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
    }
}
public void Deserialize(BinaryReader binRead)
{
    using (binRead)
    {
        // settings
        Lsett = new LemmatizerSettings(binRead);

        // examples
        bool bSerializeExamples = binRead.ReadBoolean();
        ElExamples = new ExampleList(binRead, Lsett);
        ExampleList elExamplesRear;
        ExampleList elExamplesFront;
        if (bSerializeExamples)
        {
            elExamplesRear = ElExamples.GetFrontRearExampleList(false);
            elExamplesFront = ElExamples.GetFrontRearExampleList(true);
        }
        else
        {
            elExamplesRear = new ExampleList(binRead, Lsett);
            elExamplesFront = new ExampleList(binRead, Lsett);
        }

        // root node
        LtnRootNode = new LemmaTreeNode(binRead, Lsett, Lsett.bBuildFrontLemmatizer ? elExamplesRear : ElExamples, null);

        // front root node
        if (Lsett.bBuildFrontLemmatizer)
        {
            LtnRootNodeFront = new LemmaTreeNode(binRead, Lsett, elExamplesFront, null);
        }

        // exceptions - wrapped in try/catch for backward compatibility
        // --> this section is missing in older lemmatizer files
        try
        {
            var nbOfExceptions = binRead.ReadInt32();
            for (var i = 0; i < nbOfExceptions; i++)
            {
                var exception = binRead.ReadString();
                var parts = exception.Split(' ');
                this.AddException(parts[0], parts[1]);
            }
        }
        catch (Exception)
        {
            Trace.WriteLine("Couldn't deserialize exceptions in Lemmatizer file");
        }
    }
}
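A minimal sketch of loading a persisted model through this Deserialize overload. The file name and the parameterless Lemmatizer constructor are assumptions for illustration; note that the method wraps the reader in its own using block, so the reader is disposed inside the call.

using System.IO;

// Hypothetical loading sketch; the file name and the parameterless constructor are assumed.
var lemmatizer = new Lemmatizer();
using (var stream = File.OpenRead("english.lem"))
{
    var reader = new BinaryReader(stream);
    // Deserialize disposes the reader itself via its using block.
    lemmatizer.Deserialize(reader);
}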
private void AddSub(int iStart, int iEnd, char chChar)
{
    LemmaTreeNode ltnSub = new LemmaTreeNode(lsett, elExamples, iStart, iEnd, this);

    //TODO - maybe not really appropriate, because we lose the statistics of multiple possible rules
    if (ltnSub.lrBestRule == lrBestRule && ltnSub.dictSubNodes == null)
    {
        return;
    }

    if (dictSubNodes == null)
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
    }
    dictSubNodes.Add(chChar, ltnSub);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    // is the sub-node dictionary present?
    if (binRead.ReadBoolean())
    {
        // read the whole dictionary (key + value pairs)
        dictSubNodes = new ConcurrentDictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt32();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            var ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.TryAdd(cKey, ltrSub);
        }
    }
    else
    {
        dictSubNodes = null;
    }

    this.ltnParentNode = ltnParentNode;

    // similarity, condition and whole-word flag
    iSimilarity = binRead.ReadInt32();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBoolean();

    // best rule, looked up by its signature
    lrBestRule = elExamples.Rules[binRead.ReadString()];

    // best rules
    int iCountBest = binRead.ReadInt32();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
    {
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
    }

    // weight, start, end
    dWeight = binRead.ReadDouble();
    iStart = binRead.ReadInt32();
    iEnd = binRead.ReadInt32();
    this.elExamples = elExamples;
}
/// <summary>
/// Builds a lemma-tree node for the group of examples in the [iStart, iEnd] range.
/// </summary>
/// <param name="lsett"></param>
/// <param name="elExamples"></param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode"></param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode)
    : this(lsett)
{
    this.ltnParentNode = ltnParentNode;
    this.dictSubNodes = null;
    this.iStart = iStart;
    this.iEnd = iEnd;
    this.elExamples = elExamples;

    if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
    {
        lrBestRule = elExamples.Rules.DefaultRule;
        aBestRules = new RuleWeighted[1];
        aBestRules[0] = new RuleWeighted(lrBestRule, 0);
        dWeight = 0;
        return;
    }

    int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
    this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
    this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
    this.bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;

    FindBestRules();
    AddSubAll();

    //TODO check this heuristic; it can be problematic when there are multiple applicable rules
    if (dictSubNodes != null)
    {
        List<KeyValuePair<char, LemmaTreeNode>> lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in dictSubNodes)
            if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
            {
                IEnumerator<LemmaTreeNode> enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
                enumChildChild.MoveNext();
                LemmaTreeNode ltrChildChild = enumChildChild.Current;
                if (kvpChild.Value.lrBestRule == lrBestRule)
                    lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
            }
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in lReplaceNodes)
        {
            dictSubNodes[kvpChild.Key] = kvpChild.Value;
            kvpChild.Value.ltnParentNode = this;
        }
    }
}
private SerializationModel SerializeModel(LemmaTreeNode ltn, StreamWriter sb, int iLevel)
{
    SerializationModel model = new SerializationModel();
    model.matchWholeWord = ltn.bWholeWord;
    model.suffixCondition = ltn.sCondition;
    model.ruleFrom = ltn.sCondition.Substring(ltn.sCondition.Length - ltn.lrBestRule.iFrom);
    model.ruleTo = ltn.lrBestRule.sTo;
    model.childNodes = new List<SerializationModel>();

    if (ltn.dictSubNodes != null)
    {
        foreach (LemmaTreeNode ltnChild in ltn.dictSubNodes.Values)
        {
            SerializationModel node = SerializeModel(ltnChild, sb, iLevel + 1);
            model.childNodes.Add(node);
        }
    }
    return model;
}
public void BuildModel()
{
    if (ltnRootNode != null)
    {
        return;
    }

    if (!lsett.bBuildFrontLemmatizer)
    {
        //TODO remove: elExamples.FinalizeAdditions();
        elExamples.FinalizeAdditions();
        ltnRootNode = new LemmaTreeNode(lsett, elExamples);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
        ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
    }
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    if (binRead.ReadBoolean())
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt32();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltrSub);
        }
    }
    else
    {
        dictSubNodes = null;
    }

    this.ltnParentNode = ltnParentNode;

    iSimilarity = binRead.ReadInt32();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBoolean();

    lrBestRule = elExamples.Rules[binRead.ReadString()];

    int iCountBest = binRead.ReadInt32();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
    {
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
    }

    dWeight = binRead.ReadDouble();
    iStart = binRead.ReadInt32();
    iEnd = binRead.ReadInt32();
    this.elExamples = elExamples;
}
public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    Deserialize(binRead, lsett, elExamples, ltnParentNode);
}
public void AddExample(string sWord, string sLemma, double dWeight, string sMsd)
{
    elExamples.AddExample(sWord, sLemma, dWeight, sMsd);
    ltnRootNode = null;
}
public void Load(Latino.BinarySerializer binRead)
{
    lsett = new LemmatizerSettings(binRead);
    elExamples = new ExampleList(binRead, lsett);
    ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
}
public void AddMultextFile(StreamReader srIn, string sFormat)
{
    this.elExamples.AddMultextFile(srIn, sFormat);
    ltnRootNode = null;
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    if (binRead.ReadBoolean())
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt32();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltrSub);
        }
    }
    else
    {
        dictSubNodes = null;
    }

    this.ltnParentNode = ltnParentNode;

    iSimilarity = binRead.ReadInt32();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBoolean();

    lrBestRule = elExamples.Rules[binRead.ReadString()];

    int iCountBest = binRead.ReadInt32();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
    {
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
    }

    dWeight = binRead.ReadDouble();

    //deserialize the dictMsdBestRules dictionary
    int dictMsdBestRulesCount = binRead.ReadInt32();
    if (dictMsdBestRulesCount == -1)
    {
        dictMsdBestRules = null;
    }
    else
    {
        dictMsdBestRules = new Dictionary<string, RuleWeighted[]>();
        for (int msdId = 0; msdId < dictMsdBestRulesCount; msdId++)
        {
            string sMsd = binRead.ReadString();
            RuleWeighted[] lRuleWeighted;
            int ruleWeightedCount = binRead.ReadInt32();
            if (ruleWeightedCount == -1)
            {
                lRuleWeighted = null;
            }
            else
            {
                lRuleWeighted = new RuleWeighted[ruleWeightedCount];
                for (int ruleId = 0; ruleId < ruleWeightedCount; ruleId++)
                {
                    string ruleSignature = binRead.ReadString();
                    double ruleWeight = binRead.ReadDouble();
                    LemmaRule rule = elExamples.Rules[ruleSignature];
                    lRuleWeighted[ruleId] = new RuleWeighted(rule, ruleWeight);
                }
            }
            dictMsdBestRules.Add(sMsd, lRuleWeighted);
        }
    }

    //deserialize the dictMsdWeights dictionary
    int dictMsdWeightsCount = binRead.ReadInt32();
    if (dictMsdWeightsCount == -1)
    {
        dictMsdWeights = null;
    }
    else
    {
        dictMsdWeights = new Dictionary<string, double>();
        for (int msdId = 0; msdId < dictMsdWeightsCount; msdId++)
        {
            string sMsd = binRead.ReadString();
            double dMsdWeight = binRead.ReadDouble();
            dictMsdWeights.Add(sMsd, dMsdWeight);
        }
    }

    iStart = binRead.ReadInt32();
    iEnd = binRead.ReadInt32();
    this.elExamples = elExamples;
}
public void BuildModel()
{
    if (ltnRootNode != null) return;

    if (!lsett.bBuildFrontLemmatizer)
    {
        //TODO remove: elExamples.FinalizeAdditions();
        elExamples.FinalizeAdditions();
        ltnRootNode = new LemmaTreeNode(lsett, elExamples);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
        ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
    }
}
public void Load(Latino.BinarySerializer binRead)
{
    lsett = new LemmatizerSettings(binRead);
    elExamples = new ExampleList(binRead, lsett);
    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false), null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
    }
}
public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    Load(binRead, lsett, elExamples, ltnParentNode);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    if (binRead.ReadBool())
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltrSub);
        }
    }
    else
        dictSubNodes = null;

    this.ltnParentNode = ltnParentNode;

    iSimilarity = binRead.ReadInt();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBool();

    lrBestRule = elExamples.Rules[binRead.ReadString()];

    int iCountBest = binRead.ReadInt();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());

    dWeight = binRead.ReadDouble();

    iStart = binRead.ReadInt();
    iEnd = binRead.ReadInt();
    this.elExamples = elExamples;
}
private void AddSub(int iStart, int iEnd, char chChar)
{
    LemmaTreeNode ltnSub = new LemmaTreeNode(lsett, elExamples, iStart, iEnd, this);

    //TODO - maybe not really appropriate, because we lose the statistics of multiple possible rules
    if (ltnSub.lrBestRule == lrBestRule && ltnSub.dictSubNodes == null) return;

    if (dictSubNodes == null) dictSubNodes = new Dictionary<char, LemmaTreeNode>();
    dictSubNodes.Add(chChar, ltnSub);
}