/// <summary>
/// Restores this example's fields from a Latino binary stream.
/// </summary>
/// <param name="binRead">Serializer to read the stored example from.</param>
/// <param name="lsett">Settings adopted when the stream does not carry its own.</param>
/// <param name="lrRule">Rule adopted when the stream does not carry its own.</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
    // Metadata flag: when set, the settings and the rule follow later in the stream.
    bool bReferencesInStream = binRead.ReadBool();

    // Value-type payload: word, lemma, signature, optional MSD, weight.
    sWord = binRead.ReadString();
    sLemma = binRead.ReadString();
    sSignature = binRead.ReadString();
    bool bHasMsd = binRead.ReadBool();
    sMsd = bHasMsd ? binRead.ReadString() : null;
    dWeight = binRead.ReadDouble();

    // Reference-type payload: reuse the caller's objects unless the stream has its own.
    if (!bReferencesInStream) {
        this.lsett = lsett;
        this.lrRule = lrRule;
        return;
    }

    this.lsett = new LemmatizerSettings(binRead);
    this.lrRule = new LemmaRule(binRead, this.lsett);
}
// Constructor(s) & Destructor(s) ---------
/// <summary>
/// Creates an example from a word/lemma pair, registers it with the rule list and
/// derives its signature according to the MSD consideration mode in the settings.
/// </summary>
/// <param name="sWord">Word form of the example.</param>
/// <param name="sLemma">Lemma of the example.</param>
/// <param name="dWeight">Weight of the example.</param>
/// <param name="sMsd">MSD tag; may be null.</param>
/// <param name="rlRules">Rule list that supplies the rule for this example via AddRule.</param>
/// <param name="lsett">Lemmatizer settings; eMsdConsider selects the signature format.</param>
public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett) {
    this.lsett = lsett;
    this.sWord = sWord;
    this.sLemma = sLemma;
    this.sMsd = sMsd;
    this.dWeight = dWeight;

    // The fields above are assigned before the example is handed to the rule list.
    this.lrRule = rlRules.AddRule(this);

    // Only the Distinct mode (and any unrecognised mode) appends the MSD to the signature.
    LemmatizerSettings.MsdConsideration eMode = lsett.eMsdConsider;
    bool bSignatureWithoutMsd =
        eMode == LemmatizerSettings.MsdConsideration.Ignore ||
        eMode == LemmatizerSettings.MsdConsideration.JoinAll ||
        eMode == LemmatizerSettings.MsdConsideration.JoinDistinct ||
        eMode == LemmatizerSettings.MsdConsideration.JoinSameSubstring;

    string sWordToLemma = "[" + sWord + "]==>[" + sLemma + "]";
    if (bSignatureWithoutMsd) {
        sSignature = sWordToLemma;
    } else {
        sSignature = sWordToLemma + "(" + (sMsd ?? "") + ")";
    }

    // Cached strings start out empty.
    this.sWordRearCache = null;
    this.sWordFrontCache = null;
    this.sLemmaFrontCache = null;
}
/// <summary>
/// Builds the tree node that covers the examples in the range iStart..iEnd,
/// selects its best rules, creates its sub-nodes and finally collapses redundant
/// single-child sub-nodes.
/// </summary>
/// <param name="lsett">Lemmatizer settings, forwarded to the basic constructor.</param>
/// <param name="elExamples">Example list the node is built from.</param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode">Parent node, or null when there is none.</param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode)
    : this(lsett) {
    this.ltnParentNode = ltnParentNode;
    this.dictSubNodes = null;
    this.iStart = iStart;
    this.iEnd = iEnd;
    this.elExamples = elExamples;

    // An empty or out-of-bounds range produces a node that only carries the default rule.
    int iExampleCount = elExamples.Count;
    bool bRangeIsValid = iStart < iExampleCount && iEnd < iExampleCount && iStart <= iEnd;
    if (!bRangeIsValid) {
        lrBestRule = elExamples.Rules.DefaultRule;
        aBestRules = new RuleWeighted[] { new RuleWeighted(lrBestRule, 0) };
        dWeight = 0;
        return;
    }

    // The condition is the ending of the first word: one character longer than the
    // parent's similarity, capped at the word's length (empty when there is no parent).
    var sFirstWord = elExamples[iStart].Word;
    int iWantedLength = ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1;
    int iConditionLength = Math.Min(iWantedLength, sFirstWord.Length);
    this.sCondition = sFirstWord.Substring(sFirstWord.Length - iConditionLength);

    this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
    this.bWholeWord = ltnParentNode != null
        && elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;

    FindBestRules();
    AddSubAll();

    //TODO check this heuristics, can be problematic when there are more applicable rules
    if (dictSubNodes == null) {
        return;
    }

    // Find children that have exactly one child of their own and the same best rule as
    // this node; such a child is bypassed and its only child attached here directly.
    var lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
    foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in dictSubNodes) {
        LemmaTreeNode ltnChild = kvpChild.Value;
        if (ltnChild.dictSubNodes == null || ltnChild.dictSubNodes.Count != 1) {
            continue;
        }

        if (ltnChild.lrBestRule == lrBestRule) {
            foreach (LemmaTreeNode ltnGrandChild in ltnChild.dictSubNodes.Values) {
                lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltnGrandChild));
                break; // there is exactly one grandchild
            }
        }
    }

    // The replacements are applied only after the enumeration above has finished.
    foreach (KeyValuePair<char, LemmaTreeNode> kvpReplacement in lReplaceNodes) {
        dictSubNodes[kvpReplacement.Key] = kvpReplacement.Value;
        kvpReplacement.Value.ltnParentNode = this;
    }
}
/// <summary>
/// Reads this node, including its whole subtree, from a binary reader.
/// </summary>
/// <param name="binRead">Reader to read the stored node from.</param>
/// <param name="lsett">Settings assigned to this node and passed on to its sub-nodes.</param>
/// <param name="elExamples">Example list whose Rules collection resolves the stored rule signatures.</param>
/// <param name="ltnParentNode">Parent to attach this node to.</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
    this.lsett = lsett;

    // A leading flag tells whether a sub-node dictionary was stored.
    bool bHasSubNodes = binRead.ReadBoolean();
    if (!bHasSubNodes) {
        dictSubNodes = null;
    } else {
        // Each entry is a key character followed by the serialized child node.
        dictSubNodes = new ConcurrentDictionary<char, LemmaTreeNode>();
        for (int iRemaining = binRead.ReadInt32(); iRemaining > 0; iRemaining--) {
            char cKey = binRead.ReadChar();
            var ltnChild = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.TryAdd(cKey, ltnChild);
        }
    }

    this.ltnParentNode = ltnParentNode;

    // Similarity, condition and whole-word flag.
    iSimilarity = binRead.ReadInt32();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBoolean();

    // The best rule is stored by signature and looked up in the rule list.
    string sBestRuleSignature = binRead.ReadString();
    lrBestRule = elExamples.Rules[sBestRuleSignature];

    // Best rules: a count followed by (signature, weight) pairs.
    int iCountBest = binRead.ReadInt32();
    aBestRules = new RuleWeighted[iCountBest];
    for (int iRule = 0; iRule < aBestRules.Length; iRule++) {
        var lrStored = elExamples.Rules[binRead.ReadString()];
        double dStoredWeight = binRead.ReadDouble();
        aBestRules[iRule] = new RuleWeighted(lrStored, dStoredWeight);
    }

    // Weight and the example range.
    dWeight = binRead.ReadDouble();
    iStart = binRead.ReadInt32();
    iEnd = binRead.ReadInt32();

    this.elExamples = elExamples;
}
/// <summary>
/// Reads this node, including its whole subtree, from a Latino binary stream.
/// </summary>
/// <param name="binRead">Serializer to read the stored node from.</param>
/// <param name="lsett">Settings assigned to this node and passed on to its sub-nodes.</param>
/// <param name="elExamples">Example list whose Rules collection resolves the stored rule signatures.</param>
/// <param name="ltnParentNode">Parent to attach this node to.</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode) {
    this.lsett = lsett;

    // A leading flag tells whether a sub-node dictionary was stored.
    bool bHasSubNodes = binRead.ReadBool();
    if (!bHasSubNodes) {
        dictSubNodes = null;
    } else {
        // Each entry is a key character followed by the serialized child node.
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        for (int iRemaining = binRead.ReadInt(); iRemaining > 0; iRemaining--) {
            char cKey = binRead.ReadChar();
            var ltnChild = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltnChild);
        }
    }

    this.ltnParentNode = ltnParentNode;

    // Similarity, condition and whole-word flag.
    iSimilarity = binRead.ReadInt();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBool();

    // The best rule is stored by signature and looked up in the rule list.
    string sBestRuleSignature = binRead.ReadString();
    lrBestRule = elExamples.Rules[sBestRuleSignature];

    // Best rules: a count followed by (signature, weight) pairs.
    int iCountBest = binRead.ReadInt();
    aBestRules = new RuleWeighted[iCountBest];
    for (int iRule = 0; iRule < aBestRules.Length; iRule++) {
        var lrStored = elExamples.Rules[binRead.ReadString()];
        double dStoredWeight = binRead.ReadDouble();
        aBestRules[iRule] = new RuleWeighted(lrStored, dStoredWeight);
    }

    // Weight and the example range.
    dWeight = binRead.ReadDouble();
    iStart = binRead.ReadInt();
    iEnd = binRead.ReadInt();

    this.elExamples = elExamples;
}
/// <summary>
/// Reads the example list (settings if present, rules, then examples) from a binary reader.
/// </summary>
/// <param name="binRead">Reader to read the stored list from.</param>
/// <param name="lsett">Settings adopted when the stream does not carry its own.</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
    // Metadata flag: when set, the settings are read from the stream.
    bool bSettingsInStream = binRead.ReadBoolean();
    if (bSettingsInStream) {
        this.lsett = new LemmatizerSettings(binRead);
    } else {
        this.lsett = lsett;
    }

    // The rules are read before the examples, which look their rule up by signature.
    rlRules = new RuleList(binRead, this.lsett);

    // The ordered list is optional; the signature dictionary is always rebuilt.
    bool bKeepList = binRead.ReadBoolean();
    if (bKeepList) {
        lstExamples = new List<LemmaExample>();
    } else {
        lstExamples = null;
    }
    dictExamples = new Dictionary<string, LemmaExample>();

    // Each stored item is a rule signature followed by the serialized example.
    for (int iRemaining = binRead.ReadInt32(); iRemaining > 0; iRemaining--) {
        string sRuleSignature = binRead.ReadString();
        var leLoaded = new LemmaExample(binRead, this.lsett, rlRules[sRuleSignature]);

        dictExamples.Add(leLoaded.Signature, leLoaded);
        if (lstExamples != null) {
            lstExamples.Add(leLoaded);
        }
    }
}
// Essential Class Functions (building model) ----
/// <summary>
/// Collects the rules of the examples iStart..iEnd that are applicable to this node's
/// condition, weights each by its share of the node's total example weight, and stores
/// the ranking in aBestRules and the winner in lrBestRule. Also sets dWeight to the
/// summed weight of the examples in the range.
/// </summary>
/// <remarks>
/// When no rule is applicable the condition is lengthened by one character (up to
/// iSimilarity) and the scan is repeated. When it cannot be lengthened any further, a
/// fallback rule with weight 0 is used: the parent's best rule, or the rule list's
/// default rule when this node has no parent.
/// </remarks>
private void FindBestRules() {
    // NOTE: a LINQ GroupBy/OrderBy version of this method was tried and turned out
    // slower than the explicit loops below.

    var dictApplicableRules = new Dictionary<LemmaRule, double>();
    //dictApplicableRules.Add(elExamples.Rules.DefaultRule, 0);

    do {
        // Restart the total on every pass. The scan is repeated after the condition is
        // lengthened, and without the reset every example would be counted once per pass.
        dWeight = 0;

        //calculate dWeight of whole node and calculates qualities for all rules
        for (int iExm = iStart; iExm <= iEnd; iExm++) {
            var leExample = elExamples[iExm];
            LemmaRule lr = leExample.Rule;
            double dExmWeight = leExample.Weight;
            dWeight += dExmWeight;

            if (lr.IsApplicableToGroup(sCondition.Length)) {
                // Single lookup: a missing key leaves dAccumulated at 0.
                double dAccumulated;
                dictApplicableRules.TryGetValue(lr, out dAccumulated);
                dictApplicableRules[lr] = dAccumulated + dExmWeight;
            }
        }

        //if none found then increase condition length or add some default applicable rule
        if (dictApplicableRules.Count == 0) {
            if (this.sCondition.Length < iSimilarity) {
                var sFirstWord = elExamples[iStart].Word;
                this.sCondition = sFirstWord.Substring(sFirstWord.Length - (sCondition.Length + 1));
            } else {
                //TODO check this heuristic; it may be better to add the default rule instead of the parent's rule
                // A node without a parent has no parent rule to inherit, so it uses
                // the default rule instead of dereferencing null.
                LemmaRule lrFallback = ltnParentNode != null
                    ? ltnParentNode.lrBestRule
                    : elExamples.Rules.DefaultRule;
                dictApplicableRules.Add(lrFallback, 0);
            }
        }
    } while (dictApplicableRules.Count == 0);

    //TODO can optimize this step using sorted list (dont add if it's worse than the worst)
    var lSortedRules = new List<RuleWeighted>(dictApplicableRules.Count);
    foreach (KeyValuePair<LemmaRule, double> kvp in dictApplicableRules) {
        // A zero total would turn every share into NaN/Infinity; use 0 instead.
        double dShare = dWeight != 0 ? kvp.Value / dWeight : 0;
        lSortedRules.Add(new RuleWeighted(kvp.Key, dShare));
    }
    // Ordering is whatever RuleWeighted's own comparison defines; the code below
    // treats index 0 as the best rule.
    lSortedRules.Sort();

    //keep just best iMaxRulesPerNode rules (a non-positive setting keeps all of them)
    int iNumRules = lSortedRules.Count;
    if (lsett.iMaxRulesPerNode > 0) {
        iNumRules = Math.Min(lSortedRules.Count, lsett.iMaxRulesPerNode);
    }

    aBestRules = new RuleWeighted[iNumRules];
    for (int iRule = 0; iRule < iNumRules; iRule++) {
        aBestRules[iRule] = lSortedRules[iRule];
    }

    //set best rule
    lrBestRule = aBestRules[0].Rule;

    //TODO must check if this heuristic is OK (to privilege parent rule)
    // Among the rules tied with the top weight, prefer the parent's best rule.
    if (ltnParentNode != null) {
        for (int iRule = 0; iRule < lSortedRules.Count && lSortedRules[iRule].Weight == lSortedRules[0].Weight; iRule++) {
            if (lSortedRules[iRule].Rule == ltnParentNode.lrBestRule) {
                lrBestRule = lSortedRules[iRule].Rule;
                break;
            }
        }
    }
}
/// <summary>
/// Creates an example by reading its state from a Latino binary stream.
/// </summary>
/// <param name="binRead">Serializer to read the stored example from.</param>
/// <param name="lsett">Settings passed through to Load.</param>
/// <param name="lrRule">Rule passed through to Load.</param>
public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
    // All field initialisation is delegated to Load.
    this.Load(binRead, lsett, lrRule);
}
/// <summary>
/// Creates an example by reading its state from a binary reader.
/// </summary>
/// <param name="binRead">Reader to read the stored example from.</param>
/// <param name="lsett">Settings passed through to Deserialize.</param>
/// <param name="lrRule">Rule passed through to Deserialize.</param>
public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
    // All field initialisation is delegated to Deserialize.
    this.Deserialize(binRead, lsett, lrRule);
}
// Constructor(s) & Destructor(s) -------
/// <summary>
/// Pairs a lemmatization rule with a weight.
/// </summary>
/// <param name="lrRule">The rule.</param>
/// <param name="dWeight">The weight associated with the rule.</param>
public RuleWeighted(LemmaRule lrRule, double dWeight) {
    this.dWeight = dWeight;
    this.lrRule = lrRule;
}