/// <summary>
/// Creates a new example from the given word/lemma pair and merges it into
/// this list via Add. The MSD is dropped when the settings ignore MSDs.
/// </summary>
/// <param name="sWord">Inflected word form.</param>
/// <param name="sLemma">Lemma of the word.</param>
/// <param name="dWeight">Weight (frequency) of the example.</param>
/// <param name="sMsd">Morphosyntactic descriptor, may be null.</param>
/// <returns>The stored example (new, or the existing one it was joined into).</returns>
public LemmaExample AddExample(string sWord, string sLemma, double dWeight, string sMsd) {
    // Keep the MSD only when the settings say MSDs matter.
    string sMsdToUse = lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Ignore
        ? null
        : sMsd;
    LemmaExample leNew = new LemmaExample(sWord, sLemma, dWeight, sMsdToUse, rlRules, lsett);
    return Add(leNew);
}
/// <summary>
/// Collapses duplicate examples: entries sharing the same (word, lemma, msd)
/// triple are merged into a single example whose weight is the sum of the
/// duplicates' weights.
/// </summary>
/// <param name="examples">Example list, possibly containing duplicates.</param>
/// <returns>New list with one example per distinct (word, lemma, msd) triple.</returns>
private static List<LemmaExample> CompactExamples(List<LemmaExample> examples) {
    Dictionary<string, LemmaExample> exampleDict = new Dictionary<string, LemmaExample>();
    Dictionary<string, double> weights = new Dictionary<string, double>();
    foreach (LemmaExample le in examples) {
        // Tab cannot appear inside the fields, so this signature is unambiguous.
        string signature = le.Word + "\t" + le.Lemma + "\t" + le.Msd;
        double weight;
        // Single TryGetValue lookup instead of ContainsKey followed by the indexer.
        if (weights.TryGetValue(signature, out weight)) {
            weights[signature] = weight + le.Weight;
        }
        else {
            exampleDict[signature] = le;
            weights[signature] = le.Weight;
        }
    }
    List<LemmaExample> el = new List<LemmaExample>(exampleDict.Count);
    foreach (KeyValuePair<string, LemmaExample> kvp in exampleDict) {
        LemmaExample le = kvp.Value;
        // Rebuild each representative example with the accumulated weight;
        // rules and settings are intentionally detached (null, null).
        el.Add(new LemmaExample(le.Word, le.Lemma, weights[kvp.Key], le.Msd, null, null));
    }
    return el;
}
/// <summary>
/// Restores this object from a binary stream: an optional MsdSpec, the list
/// of examples (each keyed by its position for later id-based references),
/// then the rest of the tree via the private Deserialize overload.
/// </summary>
/// <param name="binRead">Open binary reader positioned at this object's data.</param>
public void Deserialize(BinaryReader binRead) {
    // Leading boolean flags whether an MsdSpec was serialized.
    msdSpec = binRead.ReadBoolean() ? new MsdSpec(binRead) : null;

    // A negative count encodes a null example list.
    var exampleMapping = new Dictionary<int, LemmaExample>();
    int exampleListCount = binRead.ReadInt32();
    if (exampleListCount < 0) {
        exampleList = null;
    }
    else {
        exampleList = new List<LemmaExample>(exampleListCount);
        for (int leId = 0; leId < exampleListCount; leId++) {
            var le = new LemmaExample(binRead, null, null);
            // Position in the stream doubles as the example's id.
            exampleMapping[leId] = le;
            exampleList.Add(le);
        }
    }
    Deserialize(binRead, exampleMapping, msdSpec);
}
/// <summary>
/// Builds a one-level split tree: partitions the example list into subtrees
/// keyed by the value of the MSD attribute at position attrId, then computes
/// per-child and aggregate ambiguity statistics.
/// </summary>
/// <param name="el">Examples to partition.</param>
/// <param name="attrId">Index of the MSD attribute to split on.</param>
/// <param name="msdSpec">MSD specification used to extract attribute values.</param>
/// <returns>The root of the new split tree.</returns>
private static MsdSplitTree SplitByMsdAttribute(List<LemmaExample> el, int attrId, MsdSpec msdSpec) {
    MsdSplitTree et = new MsdSplitTree(msdSpec);
    et.attrId = attrId;
    et.subTrees = new Dictionary<char, MsdSplitTree>();
    et.exampleList = el;

    // Default '#' subtree, pre-created empty.
    // TODO(review): original carried a "FIX IT" marker here — the '#' key's
    // role relative to GetAttrValue's possible return values should be confirmed.
    MsdSplitTree etSubDef = new MsdSplitTree(msdSpec);
    etSubDef.exampleList = new List<LemmaExample>();
    et.subTrees['#'] = etSubDef;

    for (int i = 0; i < el.Count; i++) {
        LemmaExample e = el[i];
        char cls = msdSpec.GetAttrValue(e.Msd, attrId);
        MsdSplitTree etSub;
        // Single TryGetValue lookup instead of ContainsKey followed by the indexer.
        if (!et.subTrees.TryGetValue(cls, out etSub)) {
            etSub = new MsdSplitTree(msdSpec);
            etSub.exampleList = new List<LemmaExample>();
            et.subTrees[cls] = etSub;
        }
        etSub.exampleList.Add(e);
    }

    // Each leaf child starts with its own ambiguity in all three statistics;
    // the parent aggregates the children's ambiguity.
    double ambigChild = 0;
    foreach (KeyValuePair<char, MsdSplitTree> sub in et.subTrees) {
        MsdSplitTree etSub = sub.Value;
        double ambig = GetListAmbiguities(etSub.exampleList);
        etSub.ambigThis = ambig;
        etSub.ambigChild = ambig;
        etSub.ambigRecurs = ambig;
        etSub.subTreeSizeRecurs = 1;
        ambigChild += ambig;
    }
    et.ambigChild = ambigChild;
    et.ambigRecurs = ambigChild;
    et.subTreeSizeRecurs = et.subTrees.Count;
    return et;
}
/// <summary>
/// Measures the weighted ambiguity of an example list. Weights are first
/// aggregated into a word -> lemma -> msd map; a word's ambiguity is its
/// total weight minus the weight of its single heaviest lemma, summed over
/// all words.
/// </summary>
/// <param name="el">Examples to analyze.</param>
/// <returns>Sum of weighted ambiguities over all distinct words.</returns>
private static double GetListAmbiguities(List<LemmaExample> el) {
    var wordLemmaMsdWeight =
        new Dictionary<string, Dictionary<string, Dictionary<string, double>>>();
    for (int i = 0; i < el.Count; i++) {
        LemmaExample exmp = el[i];

        // Cache each nested dictionary once per example instead of re-indexing
        // the outer maps on every access (was up to three lookups per level).
        Dictionary<string, Dictionary<string, double>> lemmaMsdWeight;
        if (!wordLemmaMsdWeight.TryGetValue(exmp.Word, out lemmaMsdWeight)) {
            lemmaMsdWeight = new Dictionary<string, Dictionary<string, double>>();
            wordLemmaMsdWeight[exmp.Word] = lemmaMsdWeight;
        }
        Dictionary<string, double> msdWeight;
        if (!lemmaMsdWeight.TryGetValue(exmp.Lemma, out msdWeight)) {
            msdWeight = new Dictionary<string, double>();
            lemmaMsdWeight[exmp.Lemma] = msdWeight;
        }
        double weight;
        if (msdWeight.TryGetValue(exmp.Msd, out weight)) {
            msdWeight[exmp.Msd] = weight + exmp.Weight;
        }
        else {
            msdWeight[exmp.Msd] = exmp.Weight;
        }
    }

    double wghtAmbiguities = 0;
    foreach (KeyValuePair<string, Dictionary<string, Dictionary<string, double>>> wordBase
             in wordLemmaMsdWeight) {
        double weightLemmaSum = 0;
        double weightLemmaMax = 0;
        foreach (KeyValuePair<string, Dictionary<string, double>> wordLemmaBase in wordBase.Value) {
            // Total weight of this lemma across all of its MSDs.
            double weightLemma = 0;
            foreach (KeyValuePair<string, double> wordLemmaMsdBase in wordLemmaBase.Value) {
                weightLemma += wordLemmaMsdBase.Value;
            }
            weightLemmaSum += weightLemma;
            if (weightLemma > weightLemmaMax) {
                weightLemmaMax = weightLemma;
            }
        }
        // Everything that is not covered by the dominant lemma is ambiguous.
        wghtAmbiguities += weightLemmaSum - weightLemmaMax;
    }
    return wghtAmbiguities;
}
/// <summary>
/// Lemmatizes a word. With the front lemmatizer enabled, the word is first
/// lemmatized in reversed form by the front tree, then the result is passed
/// (un-reversed) through the regular tree.
/// </summary>
/// <param name="sWord">Word to be lemmatized.</param>
/// <returns>Lemmatized word.</returns>
public string Lemmatize(string sWord) {
    if (lsett.bBuildFrontLemmatizer) {
        // Front tree operates on reversed strings.
        string sLemmaFront = ltrRootNodeFrontSafe.Lemmatize(LemmaExample.StringReverse(sWord));
        return ltrRootNodeSafe.Lemmatize(LemmaExample.StringReverse(sLemmaFront));
    }
    return ltrRootNodeSafe.Lemmatize(sWord);
}
/// <summary>
/// Comparison delegate ordering examples by Word, then Lemma, then Msd
/// (ascending, default String.Compare semantics).
/// </summary>
private static int CompareExamplesWordLemmaMsd(LemmaExample x, LemmaExample y) {
    int cmp = String.Compare(x.Word, y.Word);
    if (cmp != 0) {
        return cmp;
    }
    cmp = String.Compare(x.Lemma, y.Lemma);
    if (cmp != 0) {
        return cmp;
    }
    return String.Compare(x.Msd, y.Msd);
}
/// <summary>
/// Inserts an example, deduplicating by signature: a new signature is stored
/// as-is, an existing one absorbs the newcomer via Join.
/// </summary>
/// <param name="leNew">Example to insert.</param>
/// <returns>The stored example for this signature.</returns>
private LemmaExample Add(LemmaExample leNew) {
    LemmaExample leStored;
    if (dictExamples.TryGetValue(leNew.Signature, out leStored)) {
        // Same signature already present: merge instead of inserting.
        leStored.Join(leNew);
    }
    else {
        leStored = leNew;
        dictExamples.Add(leNew.Signature, leNew);
    }
    // Invalidate the cached flat example list — presumably rebuilt on demand
    // elsewhere; confirm against the lstExamples accessor.
    lstExamples = null;
    return leStored;
}
/// <summary>
/// Extended lemmatization interface with more options.
/// </summary>
/// <param name="sWord">Word to be lemmatized.</param>
/// <param name="ignoreCase">If true, casing is ignored. If false, the lemmatizer matches the longest rule it knows, but requires the rule and the word to have the same casing.</param>
/// <param name="sMsd">Morphosyntactic descriptor of the word to be lemmatized.</param>
/// <returns>Lemmatized word.</returns>
public string Lemmatize(string sWord, bool ignoreCase, string sMsd) {
    string sNewMsd = sMsd;
    // Transform the MSD through the split tree only when the optimization is
    // enabled and MSDs are treated as distinct.
    if (sMsd != null && lsett.bUseMsdSplitTreeOptimization &&
        lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct) {
        sNewMsd = msdSplitTree.TransformMsd(sNewMsd);
    }
    if (!lsett.bBuildFrontLemmatizer) {
        return(ltrRootNodeSafe.Lemmatize(sWord, ignoreCase, sNewMsd));
    }
    else {
        // Front lemmatizer works on the reversed word; reverse its output
        // back before the final (rear) lemmatization pass.
        string sWordFront = LemmaExample.StringReverse(sWord);
        string sLemmaFront = ltrRootNodeFrontSafe.Lemmatize(sWordFront, ignoreCase, sNewMsd);
        string sWordRear = LemmaExample.StringReverse(sLemmaFront);
        return(ltrRootNodeSafe.Lemmatize(sWordRear, ignoreCase, sNewMsd));
    }
}
/// <summary>
/// Lemmatizes a word, first consulting the exception dictionary (keyed by the
/// lower-cased word) before falling back to the rule-based lemmatizer.
/// </summary>
/// <param name="word">Word to be lemmatized.</param>
/// <returns>Lemma from the exception table, or the rule-based lemma.</returns>
public string Lemmatize(string word) {
    var wordLower = word.ToLower();
    // Single TryGetValue lookup instead of ContainsKey followed by the indexer.
    string exceptionLemma;
    if (this.Exceptions.TryGetValue(wordLower, out exceptionLemma)) {
        return exceptionLemma;
    }
    if (!Lsett.bBuildFrontLemmatizer) {
        return LtrRootNodeSafe.Lemmatize(word);
    }
    // Front lemmatizer works on the reversed word; reverse its output back
    // before the final (rear) lemmatization pass.
    string sWordFront = LemmaExample.StringReverse(word);
    string sLemmaFront = LtrRootNodeFrontSafe.Lemmatize(sWordFront);
    string sWordRear = LemmaExample.StringReverse(sLemmaFront);
    return LtrRootNodeSafe.Lemmatize(sWordRear);
}
/// <summary>
/// Comparison delegate ordering examples by Word, then Msd, then Weight
/// descending (heavier first), then Lemma.
/// </summary>
private static int CompareExamplesWordMsdWeightLemma(LemmaExample x, LemmaExample y) {
    int cmp = String.Compare(x.Word, y.Word);
    if (cmp != 0) {
        return cmp;
    }
    cmp = String.Compare(x.Msd, y.Msd);
    if (cmp != 0) {
        return cmp;
    }
    // Descending by weight: heavier examples sort first.
    if (x.Weight > y.Weight) {
        return -1;
    }
    if (x.Weight < y.Weight) {
        return 1;
    }
    return String.Compare(x.Lemma, y.Lemma);
}
/// <summary>
/// Restores this object from a binary stream. A top-level object carries its
/// own serialized settings; a nested one reuses the caller-provided settings.
/// Rules are restored first because each example references a rule by key.
/// </summary>
/// <param name="binRead">Open binary reader positioned at this object's data.</param>
/// <param name="lsett">Settings to use when this is not a top-level object.</param>
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
    bool bThisTopObject = binRead.ReadBoolean();
    this.lsett = bThisTopObject ? new LemmatizerSettings(binRead) : lsett;

    // Rules before examples: examples look up their rule in rlRules.
    rlRules = new RuleList(binRead, this.lsett);

    bool bCreateLstExamples = binRead.ReadBoolean();
    lstExamples = bCreateLstExamples ? new List<LemmaExample>() : null;
    dictExamples = new Dictionary<string, LemmaExample>();

    int iCount = binRead.ReadInt32();
    for (int iId = 0; iId < iCount; iId++) {
        // Each example is preceded by the key of the rule it belongs to.
        LemmaRule lrRule = rlRules[binRead.ReadString()];
        var le = new LemmaExample(binRead, this.lsett, lrRule);
        dictExamples.Add(le.Signature, le);
        if (bCreateLstExamples) {
            lstExamples.Add(le);
        }
    }
}
/// <summary>
/// Derives a rule from the example's word/lemma pair (using the current rule
/// count as the rule id) and adds it via the LemmaRule-based overload.
/// </summary>
/// <param name="le">Example to derive the rule from.</param>
/// <returns>The stored rule.</returns>
public LemmaRule AddRule(LemmaExample le) {
    var lrNew = new LemmaRule(le.Word, le.Lemma, this.Count, lsett);
    return AddRule(lrNew);
}
/// <summary>
/// Restores this subtree from a binary stream. Examples are not deserialized
/// here; each serialized example id is resolved through exampleMapping back to
/// an already-loaded LemmaExample instance, so shared examples stay shared.
/// Throughout, a negative serialized count encodes a null collection.
/// </summary>
/// <param name="binRead">Open binary reader positioned at this subtree's data.</param>
/// <param name="exampleMapping">Maps serialized example ids to loaded examples.</param>
/// <param name="msdSpec">MSD specification shared by the whole tree.</param>
private void Deserialize(BinaryReader binRead, Dictionary <int, LemmaExample> exampleMapping, MsdSpec msdSpec) {
    this.msdSpec = msdSpec;
    attrId = binRead.ReadInt32();

    // Example list: ids only, resolved via the mapping.
    int exampleListCount = binRead.ReadInt32();
    if (exampleListCount < 0) {
        exampleList = null;
    }
    else {
        exampleList = new List <LemmaExample>(exampleListCount);
        for (int i = 0; i < exampleListCount; i++) {
            int leId = binRead.ReadInt32();
            LemmaExample le = exampleMapping[leId];
            exampleList.Add(le);
        }
    }

    // Ambiguity statistics for this node.
    ambigThis = binRead.ReadDouble();
    ambigChild = binRead.ReadDouble();
    ambigRecurs = binRead.ReadDouble();
    subTreeSizeRecurs = binRead.ReadInt32();

    // Child subtrees, keyed by a char (the MSD attribute value), each
    // deserialized recursively.
    int subTreesCount = binRead.ReadInt32();
    if (subTreesCount < 0) {
        subTrees = null;
    }
    else {
        subTrees = new Dictionary <char, MsdSplitTree>();
        for (int i = 0; i < subTreesCount; i++) {
            char key = binRead.ReadChar();
            MsdSplitTree mst = new MsdSplitTree(binRead, exampleMapping, msdSpec);
            subTrees.Add(key, mst);
        }
    }

    // Beam siblings: a per-entry boolean marks a self-reference (this very
    // tree) — presumably how the serializer breaks the reference cycle;
    // confirm against the matching Serialize method.
    int beamSiblingsCount = binRead.ReadInt32();
    if (beamSiblingsCount < 0) {
        beamSiblings = null;
    }
    else {
        beamSiblings = new List <MsdSplitTree>(beamSiblingsCount);
        for (int i = 0; i < beamSiblingsCount; i++) {
            bool bThisTree = binRead.ReadBoolean();
            if (bThisTree) {
                beamSiblings.Add(this);
            }
            else {
                MsdSplitTree mst = new MsdSplitTree(binRead, exampleMapping, msdSpec);
                beamSiblings.Add(mst);
            }
        }
    }
}