/// <summary>
/// Creates an empty example list bound to the given lemmatizer settings,
/// with a fresh rule list and an empty example dictionary.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public ExampleList(LemmatizerSettings lsett) : base()
{
    this.lsett = lsett;
    this.rlRules = new RuleList(lsett);
    this.dictExamples = new Dictionary<string, LemmaExample>();
    this.lstExamples = null;
}
/// <summary>
/// Builds a lemmatization rule that rewrites the ending of a word into the
/// ending of its lemma, based on the common stem shared by the two strings.
/// </summary>
/// <param name="sWord">Inflected word form.</param>
/// <param name="sLemma">Lemma the word maps to.</param>
/// <param name="iId">Numeric identifier of the rule.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett)
{
    this.lsett = lsett;
    this.iId = iId;

    int iStemLen = SameStem(sWord, sLemma);
    sTo = sLemma.Substring(iStemLen);
    iFrom = sWord.Length - iStemLen;

    if (!lsett.bUseFromInRules)
    {
        // Anonymous form: only the length of the stripped suffix is recorded.
        sFrom = null;
        sSignature = "[#" + iFrom + "]==>[" + sTo + "]";
    }
    else
    {
        // Explicit form: the stripped suffix itself is part of the signature.
        sFrom = sWord.Substring(iStemLen);
        sSignature = "[" + sFrom + "]==>[" + sTo + "]";
    }
}
/// <summary>
/// Builds a (sub)tree node over the group of examples [iStart..iEnd] whose
/// words share a common ending, selects the best rules for the group, and
/// recursively creates child nodes for longer endings.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Example list the index range refers into.</param>
/// <param name="iStart">Index of the first word of the current group</param>
/// <param name="iEnd">Index of the last word of the current group</param>
/// <param name="ltnParentNode">Parent node; null for the root node.</param>
private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode)
    : this(lsett)
{
    this.ltnParentNode = ltnParentNode;
    this.dictSubNodes = null;
    this.iStart = iStart;
    this.iEnd = iEnd;
    this.elExamples = elExamples;

    // Degenerate group (empty list or inverted range): fall back to the
    // default rule with zero weight — no condition can be computed.
    if (iStart >= elExamples.Count || iEnd >= elExamples.Count || iStart > iEnd)
    {
        lrBestRule = elExamples.Rules.DefaultRule;
        aBestRules = new RuleWeighted[1];
        aBestRules[0] = new RuleWeighted(lrBestRule, 0);
        dWeight = 0;
        return;
    }

    // This node's condition is one character longer than the parent's shared
    // ending, capped by the length of the group's first word.
    int iConditionLength = Math.Min(ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1, elExamples[iStart].Word.Length);
    this.sCondition = elExamples[iStart].Word.Substring(elExamples[iStart].Word.Length - iConditionLength);
    this.iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);
    this.bWholeWord = ltnParentNode == null ? false : elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;

    FindBestRules();
    AddSubAll();

    //TODO check this heuristics, can be problematic when there are more applicable rules
    // Collapse chains: when a child has exactly one grandchild and the
    // child's best rule equals this node's best rule, the child is replaced
    // by its single grandchild (re-parented to this node).
    if (dictSubNodes != null)
    {
        List<KeyValuePair<char, LemmaTreeNode>> lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in dictSubNodes)
            if (kvpChild.Value.dictSubNodes != null && kvpChild.Value.dictSubNodes.Count == 1)
            {
                IEnumerator<LemmaTreeNode> enumChildChild = kvpChild.Value.dictSubNodes.Values.GetEnumerator();
                enumChildChild.MoveNext();
                LemmaTreeNode ltrChildChild = enumChildChild.Current;
                if (kvpChild.Value.lrBestRule == lrBestRule)
                    lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltrChildChild));
            }
        foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in lReplaceNodes)
        {
            dictSubNodes[kvpChild.Key] = kvpChild.Value;
            kvpChild.Value.ltnParentNode = this;
        }
    }
}
/// <summary>
/// Constructs a lemmatizer with the given settings and immediately trains it
/// from a Multext-formatted input stream.
/// </summary>
/// <param name="srIn">Stream containing the Multext training data.</param>
/// <param name="sFormat">Format descriptor of the Multext file.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett)
    : this(lsett)
{
    AddMultextFile(srIn, sFormat);
}
/// <summary>
/// Restores the lemmatizer state (settings, examples, and lemmatization
/// tree(s)) from a binary stream written by the matching Serialize method.
/// The read order must mirror the write order exactly.
/// </summary>
/// <param name="binRead">Reader positioned at the serialized lemmatizer data.</param>
public void Deserialize(BinaryReader binRead)
{
    lsett = new LemmatizerSettings(binRead);

    bool bSerializeExamples = binRead.ReadBoolean();
    elExamples = new ExampleList(binRead, lsett);

    ExampleList elExamplesRear;
    ExampleList elExamplesFront;
    if (bSerializeExamples)
    {
        // Full examples were stored: rebuild the rear/front variants from them.
        elExamplesRear = elExamples.GetFrontRearExampleList(false);
        elExamplesFront = elExamples.GetFrontRearExampleList(true);
    }
    else
    {
        // Examples were stripped before saving: the rear/front lists were
        // serialized separately and must be read from the stream.
        elExamplesRear = new ExampleList(binRead, lsett);
        elExamplesFront = new ExampleList(binRead, lsett);
    }

    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }
    else
    {
        // Front-lemmatizer mode uses two trees: one over the rear example
        // list and one over the front example list.
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
    }
}
/// <summary>
/// Deserialization constructor: rebuilds the example list from a Latino
/// binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings to attach if none are stored in the stream.</param>
public ExampleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
{
    Load(binRead, lsett);
}
/// <summary>
/// Minimal internal constructor: stores the settings only; all other members
/// keep their default values until filled in by the caller.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
private LemmaTreeNode(LemmatizerSettings lsett)
{
    this.lsett = lsett;
}
/// <summary>
/// Restores this tree node (and, recursively, its sub-nodes) from a binary
/// stream written by the matching Serialize method. Rules are not stored
/// inline; they are looked up by signature in the rule list of
/// <paramref name="elExamples"/>. The read order must mirror the write order.
/// </summary>
/// <param name="binRead">Reader positioned at this node's serialized data.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Example list providing the rule lookup table.</param>
/// <param name="ltnParentNode">Parent node; null for the root node.</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    // Sub-nodes: a leading flag tells whether any were serialized.
    if (binRead.ReadBoolean())
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt32();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltrSub);
        }
    }
    else
    {
        dictSubNodes = null;
    }

    this.ltnParentNode = ltnParentNode;

    iSimilarity = binRead.ReadInt32();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBoolean();

    // Rules are restored by signature lookup, not by value.
    lrBestRule = elExamples.Rules[binRead.ReadString()];

    int iCountBest = binRead.ReadInt32();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
    {
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
    }

    dWeight = binRead.ReadDouble();

    //deserialize dictMsdBestRules dictionary (a count of -1 marks a null dictionary)
    int dictMsdBestRulesCount = binRead.ReadInt32();
    if (dictMsdBestRulesCount == -1)
    {
        dictMsdBestRules = null;
    }
    else
    {
        dictMsdBestRules = new Dictionary<string, RuleWeighted[]>();
        for (int msdId = 0; msdId < dictMsdBestRulesCount; msdId++)
        {
            string sMsd = binRead.ReadString();
            RuleWeighted[] lRuleWeighted;
            int ruleWeightedCount = binRead.ReadInt32();
            if (ruleWeightedCount == -1)
            {
                // A count of -1 marks a null rule array for this MSD tag.
                lRuleWeighted = null;
            }
            else
            {
                lRuleWeighted = new RuleWeighted[ruleWeightedCount];
                for (int ruleId = 0; ruleId < ruleWeightedCount; ruleId++)
                {
                    string ruleSignature = binRead.ReadString();
                    double ruleWeight = binRead.ReadDouble();
                    LemmaRule rule = elExamples.Rules[ruleSignature];
                    lRuleWeighted[ruleId] = new RuleWeighted(rule, ruleWeight);
                }
            }
            dictMsdBestRules.Add(sMsd, lRuleWeighted);
        }
    }

    //deserialize dictMsdWeights dictionary (a count of -1 marks a null dictionary)
    int dictMsdWeightsCount = binRead.ReadInt32();
    if (dictMsdWeightsCount == -1)
    {
        dictMsdWeights = null;
    }
    else
    {
        dictMsdWeights = new Dictionary<string, double>();
        for (int msdId = 0; msdId < dictMsdWeightsCount; msdId++)
        {
            string sMsd = binRead.ReadString();
            double dMsdWeight = binRead.ReadDouble();
            dictMsdWeights.Add(sMsd, dMsdWeight);
        }
    }

    iStart = binRead.ReadInt32();
    iEnd = binRead.ReadInt32();
    this.elExamples = elExamples;
}
/// <summary>
/// Deserialization constructor: rebuilds the rule list from a Latino binary
/// stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings to attach if none are stored in the stream.</param>
public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
{
    Load(binRead, lsett);
}
/// <summary>
/// Deserialization constructor: rebuilds the example from a binary stream,
/// attaching the already-restored rule.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="lrRule">Rule this example maps to (restored separately).</param>
public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
    Deserialize(binRead, lsett, lrRule);
}
/// <summary>
/// Settings-only constructor used internally before the remaining members
/// are populated by the caller.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
private LemmaTreeNode(LemmatizerSettings lsett)
{
    this.lsett = lsett;
}
/// <summary>
/// Deserialization constructor: rebuilds the node (and its subtree) from a
/// binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Example list providing the rule lookup table.</param>
/// <param name="ltnParentNode">Parent node; null for the root node.</param>
public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    Deserialize(binRead, lsett, elExamples, ltnParentNode);
}
/// <summary>
/// Deserialization constructor: rebuilds the example list from a binary
/// stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public ExampleList(BinaryReader binRead, LemmatizerSettings lsett)
{
    Deserialize(binRead, lsett);
}
/// <summary>
/// Creates a prebuilt lemmatizer for the given language using explicit
/// settings.
/// </summary>
/// <param name="lang">Language whose prebuilt model should be used.</param>
/// <param name="lsett">Settings passed through to the base lemmatizer.</param>
public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
    : base(lsett)
{
    this.lang = lang;
}
/// <summary>
/// Deserialization constructor: rebuilds the rule from a Latino binary
/// stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings to attach if none are stored in the stream.</param>
public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
{
    Load(binRead, lsett);
}
/// <summary>
/// Restores the rule from a Latino binary stream written by the matching
/// Save method.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings to attach when the rule was not serialized as a top-level object.</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
{
    // Metadata: was this rule serialized as a top-level object?
    bool bThisTopObject = binRead.ReadBool();

    // Value-type members.
    iId = binRead.ReadInt();
    iFrom = binRead.ReadInt();
    sFrom = binRead.ReadBool() ? binRead.ReadString() : null;
    sTo = binRead.ReadString();
    sSignature = binRead.ReadString();

    // Reference-type members: a top-level rule carries its own settings in
    // the stream; otherwise the caller-supplied settings are attached.
    if (bThisTopObject)
        this.lsett = new LemmatizerSettings(binRead);
    else
        this.lsett = lsett;
}
/// <summary>
/// Deserialization constructor: rebuilds the rule from a binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
    this.Deserialize(binRead, lsett);
}
// Constructor(s) & Destructor(s) ------------

/// <summary>
/// Creates a rule list that initially contains only the default rule.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public RuleList(LemmatizerSettings lsett)
{
    this.lsett = lsett;
    // The empty-to-empty rule changes nothing and acts as the fallback.
    lrDefaultRule = AddRule(new LemmaRule("", "", 0, lsett));
}
/// <summary>
/// Builds a lemmatization tree covering every example in the list.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Examples to build the tree over.</param>
public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
    : this(lsett, elExamples, 0, elExamples.Count - 1, null)
{
}
/// <summary>
/// Deserialization constructor: rebuilds the rule list from a binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public RuleList(BinaryReader binRead, LemmatizerSettings lsett)
{
    this.Deserialize(binRead, lsett);
}
/// <summary>
/// Restores this tree node (and, recursively, its sub-nodes) from a Latino
/// binary stream. Rules are restored by signature lookup in the rule list of
/// <paramref name="elExamples"/>. The read order must mirror the write order.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Example list providing the rule lookup table.</param>
/// <param name="ltnParentNode">Parent node; null for the root node.</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    this.lsett = lsett;

    // Sub-nodes: a leading flag tells whether any were serialized.
    if (binRead.ReadBool())
    {
        dictSubNodes = new Dictionary<char, LemmaTreeNode>();
        int iCount = binRead.ReadInt();
        for (int i = 0; i < iCount; i++)
        {
            char cKey = binRead.ReadChar();
            LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
            dictSubNodes.Add(cKey, ltrSub);
        }
    }
    else
        dictSubNodes = null;

    this.ltnParentNode = ltnParentNode;

    iSimilarity = binRead.ReadInt();
    sCondition = binRead.ReadString();
    bWholeWord = binRead.ReadBool();

    // Rules are restored by signature lookup, not by value.
    lrBestRule = elExamples.Rules[binRead.ReadString()];
    int iCountBest = binRead.ReadInt();
    aBestRules = new RuleWeighted[iCountBest];
    for (int i = 0; i < iCountBest; i++)
        aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
    dWeight = binRead.ReadDouble();

    iStart = binRead.ReadInt();
    iEnd = binRead.ReadInt();
    this.elExamples = elExamples;
}
/// <summary>
/// Creates a training example (word/lemma pair with weight and optional MSD
/// tag), registers its rule with the rule list, and computes its signature.
/// </summary>
/// <param name="sWord">Inflected word form.</param>
/// <param name="sLemma">Lemma the word maps to.</param>
/// <param name="dWeight">Weight (frequency) of this example.</param>
/// <param name="sMsd">Morphosyntactic descriptor tag; may be null.</param>
/// <param name="rlRules">Rule list the example's rule is added to.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett)
{
    this.lsett = lsett;

    this.sWord = sWord;
    this.sLemma = sLemma;
    this.sMsd = sMsd;
    this.dWeight = dWeight;
    this.lrRule = rlRules.AddRule(this);

    // Whether the MSD tag is part of the example's identity (its signature)
    // depends on how MSD tags are being considered.
    switch (lsett.eMsdConsider)
    {
        case LemmatizerSettings.MsdConsideration.Ignore:
        case LemmatizerSettings.MsdConsideration.JoinAll:
        case LemmatizerSettings.MsdConsideration.JoinDistinct:
        case LemmatizerSettings.MsdConsideration.JoinSameSubstring:
            sSignature = "[" + sWord + "]==>[" + sLemma + "]";
            break;

        case LemmatizerSettings.MsdConsideration.Distinct:
        default:
            sSignature = "[" + sWord + "]==>[" + sLemma + "](" + (sMsd ?? "") + ")";
            break;
    }

    this.sWordRearCache = null;
    this.sWordFrontCache = null;
    this.sLemmaFrontCache = null;
}
/// <summary>
/// Deserialization constructor: rebuilds the node (and its subtree) from a
/// Latino binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Example list providing the rule lookup table.</param>
/// <param name="ltnParentNode">Parent node; null for the root node.</param>
public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
{
    Load(binRead, lsett, elExamples, ltnParentNode);
}
/// <summary>
/// Deserialization constructor: rebuilds the example from a Latino binary
/// stream, attaching the already-restored rule.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="lrRule">Rule this example maps to (restored separately).</param>
public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule)
{
    Load(binRead, lsett, lrRule);
}
/// <summary>
/// Creates a lemmatizer with the given settings, then loads and trains on a
/// Multext-formatted stream.
/// </summary>
/// <param name="srIn">Stream containing the Multext training data.</param>
/// <param name="sFormat">Format descriptor of the Multext file.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett)
    : this(lsett)
{
    AddMultextFile(srIn, sFormat);
}
/// <summary>
/// Builds the root of a lemmatization tree spanning the whole example list.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
/// <param name="elExamples">Examples to build the tree over.</param>
public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
    : this(lsett, elExamples, 0, elExamples.Count - 1, null)
{
}
/// <summary>
/// ISerializable constructor: rebuilds the example list from the parallel
/// word/lemma/weight/msd arrays stored in the serialization info, re-adding
/// each example so that rules and caches are reconstructed.
/// </summary>
/// <param name="info">Serialized state.</param>
/// <param name="context">Streaming context (unused).</param>
public ExampleList(SerializationInfo info, StreamingContext context)
{
    lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
    this.dictExamples = new Dictionary<string, LemmaExample>();
    this.lstExamples = null;
    this.rlRules = new RuleList(lsett);

    string[] aWords = (string[])info.GetValue("aWords", typeof(string[]));
    string[] aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
    double[] aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
    string[] aMsds = (string[])info.GetValue("aMsds", typeof(string[]));

    // Re-add every example; AddExample rebuilds rules as it goes.
    for (int i = 0; i < aWords.Length; i++)
    {
        AddExample(aWords[i], aLemmas[i], aWeights[i], aMsds[i]);
    }
}
/// <summary>
/// ISerializable constructor: restores settings and examples, then rebuilds
/// the lemmatization model from them.
/// </summary>
/// <param name="info">Serialized state.</param>
/// <param name="context">Streaming context (unused).</param>
public Lemmatizer(SerializationInfo info, StreamingContext context)
    : this()
{
    lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
    elExamples = (ExampleList)info.GetValue("elExamples", typeof(ExampleList));

    // The tree is not serialized; rebuild it from the restored examples.
    this.BuildModel();
}
/// <summary>
/// Creates a prebuilt lemmatizer for the chosen language, forwarding the
/// settings to the base lemmatizer.
/// </summary>
/// <param name="lang">Language whose prebuilt model should be used.</param>
/// <param name="lsett">Settings passed through to the base lemmatizer.</param>
public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
    : base(lsett)
{
    this.lang = lang;
}
/// <summary>
/// Creates an empty lemmatizer with the given settings; no model exists
/// until examples are added and the tree is built.
/// </summary>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public Lemmatizer(LemmatizerSettings lsett)
{
    this.lsett = lsett;
    this.elExamples = new ExampleList(lsett);
    this.ltnRootNode = null;
    this.ltnRootNodeFront = null;
}
/// <summary>
/// Restores the example list from a Latino binary stream: settings (if
/// stored), rules, then the example dictionary (and, optionally, the ordered
/// example list). The read order must mirror the write order exactly.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings to attach when the list was not serialized as a top-level object.</param>
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
{
    //load metadata
    bool bThisTopObject = binRead.ReadBool();

    //load reference types if needed -------------------------
    // A top-level example list carries its own settings in the stream.
    if (bThisTopObject)
        this.lsett = new LemmatizerSettings(binRead);
    else
        this.lsett = lsett;

    rlRules = new RuleList(binRead, this.lsett);

    bool bCreateLstExamples = binRead.ReadBool();

    lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
    dictExamples = new Dictionary<string, LemmaExample>();

    //load dictionary items
    int iCount = binRead.ReadInt();
    for (int iId = 0; iId < iCount; iId++)
    {
        // Each example references its rule by signature.
        LemmaRule lrRule = rlRules[binRead.ReadString()];
        LemmaExample le = new LemmaExample(binRead, this.lsett, lrRule);

        dictExamples.Add(le.Signature, le);
        if (bCreateLstExamples) lstExamples.Add(le);
    }
}
/// <summary>
/// Restores the lemmatizer from a Latino binary stream: settings, examples,
/// and the lemmatization tree(s).
/// </summary>
/// <param name="binRead">Source stream.</param>
public void Load(Latino.BinarySerializer binRead)
{
    lsett = new LemmatizerSettings(binRead);
    elExamples = new ExampleList(binRead, lsett);

    if (lsett.bBuildFrontLemmatizer)
    {
        // Front-lemmatizer mode uses two trees: one over the rear example
        // list and one over the front example list.
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(false), null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamples.GetFrontRearExampleList(true), null);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }
}
/// <summary>
/// Deserialization constructor: restores the rule from a binary stream.
/// </summary>
/// <param name="binRead">Source stream.</param>
/// <param name="lsett">Settings shared by all lemmatizer components.</param>
public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
{
    this.Deserialize(binRead, lsett);
}