/// <summary>
/// Tries to split further every sub-tree of <paramref name="bestTree"/> that still reports
/// child ambiguity, and rebuilds the aggregate counters of <paramref name="bestTree"/>
/// (ambigRecurs, ambigChild, subTreeSizeRecurs) from the resulting sub-trees.
/// </summary>
/// <param name="bestTree">Tree whose sub-trees are processed; its three aggregate counters are reset to zero and re-accumulated.</param>
/// <param name="msdSpec">Passed through unchanged to RecursiveSplitBeam.</param>
/// <param name="level">Level of <paramref name="bestTree"/>; sub-trees are split at <c>level + 1</c>.</param>
/// <param name="beamParams">Passed through unchanged to RecursiveSplitBeam.</param>
/// <returns>
/// A new dictionary with the same keys as <c>bestTree.subTrees</c>; each value is the tree returned
/// by RecursiveSplitBeam, or the original sub-tree when no split was attempted or it returned null.
/// The caller is responsible for storing the dictionary; bestTree.subTrees itself is not replaced here.
/// </returns>
private static Dictionary<char, MsdSplitTree> RecursiveSplit(MsdSplitTree bestTree, MsdSpec msdSpec, int level, BeamSearchParams beamParams)
{
    // The aggregates are rebuilt from scratch while walking the sub-trees.
    bestTree.ambigRecurs = 0;
    bestTree.ambigChild = 0;
    bestTree.subTreeSizeRecurs = 0;

    Dictionary<char, MsdSplitTree> result = new Dictionary<char, MsdSplitTree>();

    foreach (KeyValuePair<char, MsdSplitTree> entry in bestTree.subTrees)
    {
        MsdSplitTree current = entry.Value;

        // Only sub-trees that still carry ambiguity are worth another split attempt.
        MsdSplitTree refined = null;
        if (current.ambigChild > 0)
        {
            refined = RecursiveSplitBeam(current.exampleList, current.ambigChild, msdSpec, level + 1, beamParams);
        }

        // A null result means no split was accepted: keep the sub-tree as it was.
        MsdSplitTree chosen = refined ?? current;
        result.Add(entry.Key, chosen);

        bestTree.ambigRecurs += chosen.ambigRecurs;
        bestTree.ambigChild += chosen.ambigThis;
        bestTree.subTreeSizeRecurs += chosen.subTreeSizeRecurs;
    }

    return result;
}
/// <summary>
/// Creates a lemmatizer that uses the given settings. A new ExampleList is constructed from
/// those settings; the tree roots and the MSD split tree start out as null.
/// </summary>
/// <param name="lsett">Settings stored on the instance and handed to the ExampleList constructor.</param>
public Lemmatizer(LemmatizerSettings lsett)
{
    // Nothing has been built yet.
    ltnRootNode = null;
    ltnRootNodeFront = null;
    msdSplitTree = null;

    this.lsett = lsett;
    elExamples = new ExampleList(lsett);
}
/// <summary>
/// Builds a split tree from the given examples: the examples go through PrepareExampleList,
/// the result is handed to Split, and the fields of the tree Split returns are copied into
/// this instance.
/// </summary>
/// <param name="examples">Examples passed to PrepareExampleList.</param>
/// <param name="msdSpec">MSD specification passed to Split.</param>
/// <param name="beamParams">Beam search parameters; when null, a default-constructed BeamSearchParams is used.</param>
public MsdSplitTree(List<LemmaExample> examples, MsdSpec msdSpec, BeamSearchParams beamParams)
{
    BeamSearchParams effectiveParams = beamParams ?? new BeamSearchParams();
    CopyVariablesToThis(Split(PrepareExampleList(examples), msdSpec, effectiveParams));
}
/// <summary>
/// Copies every field of <paramref name="et"/> into this instance. References are copied as-is,
/// so lists and dictionaries end up shared between the two trees.
/// </summary>
/// <param name="et">Tree whose field values are taken over.</param>
private void CopyVariablesToThis(MsdSplitTree et)
{
    // Identity of the node
    msdSpec = et.msdSpec;
    attrId = et.attrId;

    // Ambiguity figures and size
    ambigThis = et.ambigThis;
    ambigChild = et.ambigChild;
    ambigRecurs = et.ambigRecurs;
    subTreeSizeRecurs = et.subTreeSizeRecurs;

    // Referenced collections (shared, not cloned)
    exampleList = et.exampleList;
    subTrees = et.subTrees;
    beamSiblings = et.beamSiblings;
}
/// <summary>
/// Recomputes the ambiguity of each direct sub-tree of <paramref name="et"/> from its example
/// list (via GetListAmbiguities), stores it in that sub-tree's ambigChild, and returns the sum.
/// </summary>
/// <param name="et">Tree whose direct sub-trees are measured; <c>et.subTrees</c> must not be null.</param>
/// <returns>Sum of the ambiguities of all direct sub-trees.</returns>
private static double GetChildsAmbiguities(MsdSplitTree et)
{
    double total = 0;

    foreach (KeyValuePair<char, MsdSplitTree> entry in et.subTrees)
    {
        MsdSplitTree child = entry.Value;
        double childAmbiguity = GetListAmbiguities(child.exampleList);

        // Side effect: the freshly computed value is cached on the child.
        child.ambigChild = childAmbiguity;
        total += childAmbiguity;
    }

    return total;
}
/// <summary>
/// Comparison callback that orders trees by ascending subTreeSizeRecurs.
/// </summary>
/// <returns>1 when <paramref name="x"/> is larger, -1 when it is smaller, 0 when both are equal.</returns>
private static int CompareTreesRecurSizeAsc(MsdSplitTree x, MsdSplitTree y)
{
    return x.subTreeSizeRecurs > y.subTreeSizeRecurs
        ? 1
        : (x.subTreeSizeRecurs < y.subTreeSizeRecurs ? -1 : 0);
}
/// <summary>
/// Comparison callback that orders trees by ascending ambigChild.
/// </summary>
/// <returns>
/// 1 when <paramref name="x"/> has the larger ambigChild, -1 when it has the smaller one,
/// 0 otherwise (neither comparison holds).
/// </returns>
private static int CompareTreesAbmibuitiesAsc(MsdSplitTree x, MsdSplitTree y)
{
    // Explicit > and < tests are kept (rather than CompareTo) so the "neither holds" case stays 0.
    return x.ambigChild > y.ambigChild
        ? 1
        : (x.ambigChild < y.ambigChild ? -1 : 0);
}
/// <summary>
/// Builds a one-level split of <paramref name="el"/>: examples are grouped into sub-trees keyed
/// by the value that <c>msdSpec.GetAttrValue</c> returns for attribute <paramref name="attrId"/>
/// of each example's Msd.
/// </summary>
/// <param name="el">Examples to distribute; also stored as the example list of the returned tree.</param>
/// <param name="attrId">Attribute used for the split; stored in the returned tree's attrId.</param>
/// <param name="msdSpec">Specification used to read attribute values and to construct the trees.</param>
/// <returns>
/// The new tree. Each sub-tree has ambigThis, ambigChild and ambigRecurs set to the ambiguity of
/// its own example list and subTreeSizeRecurs set to 1. The returned tree has ambigChild and
/// ambigRecurs set to the sum over its sub-trees and subTreeSizeRecurs set to the number of sub-trees.
/// </returns>
private static MsdSplitTree SplitByMsdAttribute(List<LemmaExample> el, int attrId, MsdSpec msdSpec)
{
    MsdSplitTree root = new MsdSplitTree(msdSpec);
    root.attrId = attrId;
    root.exampleList = el;
    root.subTrees = new Dictionary<char, MsdSplitTree>();

    // todo FIX IT (kept from the original): a '#' sub-tree is always created up front,
    // even when no example ends up in it, and it is counted in subTreeSizeRecurs below.
    MsdSplitTree defaultBucket = new MsdSplitTree(msdSpec);
    defaultBucket.exampleList = new List<LemmaExample>();
    root.subTrees['#'] = defaultBucket;

    // Distribute the examples over the buckets, creating buckets on first use.
    foreach (LemmaExample example in el)
    {
        char cls = msdSpec.GetAttrValue(example.Msd, attrId);

        MsdSplitTree bucket;
        if (!root.subTrees.TryGetValue(cls, out bucket))
        {
            bucket = new MsdSplitTree(msdSpec);
            bucket.exampleList = new List<LemmaExample>();
            root.subTrees[cls] = bucket;
        }

        bucket.exampleList.Add(example);
    }

    // Measure every bucket and accumulate the total for the parent.
    double totalAmbiguity = 0;
    foreach (MsdSplitTree leaf in root.subTrees.Values)
    {
        double leafAmbiguity = GetListAmbiguities(leaf.exampleList);
        leaf.ambigThis = leafAmbiguity;
        leaf.ambigChild = leafAmbiguity;
        leaf.ambigRecurs = leafAmbiguity;
        leaf.subTreeSizeRecurs = 1;
        totalAmbiguity += leafAmbiguity;
    }

    root.ambigChild = totalAmbiguity;
    root.ambigRecurs = totalAmbiguity;
    root.subTreeSizeRecurs = root.subTrees.Count;

    return root;
}
/// <summary>
/// Builds the lemmatization tree(s) from the collected examples. Does nothing when a rear
/// tree (ltnRootNode) already exists.
/// </summary>
/// <param name="msdSpec">
/// Text of the MSD specification. The MSD split tree is only built when this is non-empty,
/// <c>lsett.bUseMsdSplitTreeOptimization</c> is set and <c>lsett.eMsdConsider</c> is
/// <c>MsdConsideration.Distinct</c>.
/// </param>
/// <param name="beamSearchOpt">Beam search parameters forwarded to the MsdSplitTree constructor.</param>
public void BuildModel(string msdSpec, MsdSplitTree.BeamSearchParams beamSearchOpt)
{
    if (ltnRootNode != null)
    {
        return;
    }

    // If MSDs are used and the other criteria are fulfilled, apply the MSD split tree optimization.
    bool useMsdSplitTree = lsett.bUseMsdSplitTreeOptimization
        && lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct
        && !string.IsNullOrEmpty(msdSpec);

    if (useMsdSplitTree)
    {
        msdSplitTree = new MsdSplitTree(elExamples.ListExamples, new MsdSpec(msdSpec), beamSearchOpt);

        // Replace the example list by a fresh one in which every example carries the MSD
        // produced by the split tree's TransformMsd.
        ExampleList originalExamples = elExamples;
        elExamples = new ExampleList(lsett);

        foreach (LemmaExample le in originalExamples.ListExamples)
        {
            string newMsd = msdSplitTree.TransformMsd(le.Msd);
            elExamples.AddExample(le.Word, le.Lemma, le.Weight, newMsd);
        }

        // TODO (from the original author): a problem occurs if BuildModel is called twice.
    }

    elExamples.FinalizeAdditions();

    if (!lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
        ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
    }
}
/// <summary>
/// Sums an ambiguity figure over the tree rooted at <paramref name="et"/> by recursing into
/// every sub-tree and adding up the values returned for them.
/// </summary>
/// <param name="et">Root of the (sub-)tree to measure.</param>
/// <returns>The accumulated value of all sub-trees.</returns>
// NOTE(review): the leaf branch (et.subTrees == null) calls GetChildsAmbiguities(et), which
// enumerates et.subTrees.Values. With subTrees being null that call throws
// NullReferenceException, so this method fails as soon as it reaches a node without sub-trees.
// Presumably the leaf case was meant to measure the node's own example list
// (e.g. GetListAmbiguities(et.exampleList)) - TODO confirm the intent and fix.
private static double GetRecursiveAmbiguities(MsdSplitTree et) { double weight = 0; if (et.subTrees == null) { return(GetChildsAmbiguities(et)); } foreach (MsdSplitTree etSub in et.subTrees.Values) { weight += GetRecursiveAmbiguities(etSub); } return(weight); }
/// <summary>
/// Writes an indented, one-line-per-node dump of the tree to the console and recurses into
/// the sub-trees. Nodes deeper than <paramref name="maxLevel"/> are not printed.
/// </summary>
/// <param name="et">Node to print.</param>
/// <param name="msdSpec">Specification whose attrIdToNameMap supplies the name of the node's split attribute.</param>
/// <param name="level">Depth of <paramref name="et"/>; the line is indented by two spaces per level.</param>
/// <param name="maxLevel">Deepest level that is still printed.</param>
/// <param name="attrSet">Description of the attribute/value choices leading to this node, joined with '&amp;'.</param>
private static void OutputTree(MsdSplitTree et, MsdSpec msdSpec, int level, int maxLevel, string attrSet)
{
    if (level > maxLevel)
    {
        return;
    }

    int attrId = et.attrId;
    string attrName = msdSpec.attrIdToNameMap[attrId];

    // Split description: attribute used and the number of examples per class.
    StringBuilder sbSubGroups = new StringBuilder();
    if (et.subTrees != null)
    {
        sbSubGroups.AppendFormat(",SplitBy={0}({1}) To={2} classes:", attrName, attrId, et.subTrees.Count);
        foreach (KeyValuePair<char, MsdSplitTree> sub in et.subTrees)
        {
            sbSubGroups.AppendFormat("|{0}:{1}", sub.Key, sub.Value.exampleList.Count);
        }
    }

    // Beam siblings go into their own builder. It is printed directly after sbSubGroups,
    // so the emitted line is the same as when both parts shared one builder.
    StringBuilder sbBeam = new StringBuilder();
    if (et.beamSiblings != null)
    {
        sbBeam.Append(",BeamSibling=");
        foreach (MsdSplitTree beamSibl in et.beamSiblings)
        {
            sbBeam.AppendFormat("|{0}", beamSibl.subTreeSizeRecurs);
        }
    }

    Console.Write(new string(' ', level * 2));
    Console.WriteLine("Examples={0},AttrSet=({1}),SubTree={2},Ambig:(T={3}/S={4}/R={5}){6}{7}",
        et.exampleList.Count, attrSet, et.subTreeSizeRecurs,
        et.ambigThis, et.ambigChild, et.ambigRecurs,
        sbSubGroups, sbBeam);

    if (et.subTrees != null)
    {
        foreach (KeyValuePair<char, MsdSplitTree> sub in et.subTrees)
        {
            string childAttrSet = attrSet + (attrSet.Length > 0 ? "&" : "") + attrName + "='" + sub.Key + "'";
            OutputTree(sub.Value, msdSpec, level + 1, maxLevel, childAttrSet);
        }
    }
}
/// <summary>
/// Restores the lemmatizer from a binary stream. Reads, in this order: the settings, a flag
/// telling whether the front/rear example lists are derived or stored, the main example list,
/// (only if stored) the rear and front example lists, the tree node(s), and finally a flag
/// followed by the optional MSD split tree.
/// </summary>
/// <param name="binRead">Reader positioned at the start of the serialized lemmatizer.</param>
public void Deserialize(BinaryReader binRead)
{
    lsett = new LemmatizerSettings(binRead);

    bool examplesSerialized = binRead.ReadBoolean();
    elExamples = new ExampleList(binRead, lsett);

    // When the flag is false, the rear and front lists follow in the stream; otherwise they
    // are derived from the main list. They are obtained even if no front lemmatizer is
    // built, which keeps the stream position correct.
    ExampleList rearExamples;
    ExampleList frontExamples;
    if (!examplesSerialized)
    {
        rearExamples = new ExampleList(binRead, lsett);
        frontExamples = new ExampleList(binRead, lsett);
    }
    else
    {
        rearExamples = elExamples.GetFrontRearExampleList(false);
        frontExamples = elExamples.GetFrontRearExampleList(true);
    }

    if (lsett.bBuildFrontLemmatizer)
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, rearExamples, null);
        ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, frontExamples, null);
    }
    else
    {
        ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
    }

    // A leading boolean tells whether a split tree was stored.
    msdSplitTree = binRead.ReadBoolean() ? new MsdSplitTree(binRead) : null;
}
/// <summary>
/// Tries the first <c>beamSize</c> candidate splits returned by ProduceOrderedSplits for
/// <paramref name="el"/>, recursively refines each candidate whose child ambiguity is lower
/// than <paramref name="weightInitial"/>, and returns the refined candidate with the smallest
/// subTreeSizeRecurs.
/// </summary>
/// <param name="el">Examples to split.</param>
/// <param name="weightInitial">Ambiguity before splitting; a candidate is accepted only if its ambigChild is lower. Stored as ambigThis of accepted candidates.</param>
/// <param name="msdSpec">Passed through to ProduceOrderedSplits and RecursiveSplit.</param>
/// <param name="level">Current level; selects the beam width from <c>beamParams.beamsPerLevel</c>.</param>
/// <param name="beamParams">Beam configuration. The beam width is 1 when beamsPerLevel is null or has no entry for this level.</param>
/// <returns>
/// The best accepted candidate, or null when no candidate was accepted (including when there
/// are no candidate splits at all). When more than one candidate was accepted, the returned
/// tree's beamSiblings holds all accepted candidates sorted by ascending subTreeSizeRecurs.
/// </returns>
private static MsdSplitTree RecursiveSplitBeam(List<LemmaExample> el, double weightInitial, MsdSpec msdSpec, int level, BeamSearchParams beamParams)
{
    List<MsdSplitTree> splits = ProduceOrderedSplits(el, weightInitial, msdSpec);

    int beamSize = 1;
    if (beamParams.beamsPerLevel != null && beamParams.beamsPerLevel.ContainsKey(level))
    {
        beamSize = beamParams.beamsPerLevel[level];
    }

    // Never look past the available candidates. This also covers the default width of 1
    // when there are no candidates, which previously indexed into an empty list.
    beamSize = Math.Min(beamSize, splits.Count);

    List<MsdSplitTree> beamSplits = new List<MsdSplitTree>();
    for (int beam = 0; beam < beamSize; beam++)
    {
        MsdSplitTree candidate = splits[beam];

        // Only splits that actually reduce the ambiguity are refined and kept.
        if (candidate.ambigChild < weightInitial)
        {
            candidate.subTrees = RecursiveSplit(candidate, msdSpec, level, beamParams);
            candidate.ambigThis = weightInitial;
            beamSplits.Add(candidate);
        }
    }

    if (beamSplits.Count == 0)
    {
        return null;
    }

    if (beamSplits.Count == 1)
    {
        return beamSplits[0];
    }

    beamSplits.Sort(CompareTreesRecurSizeAsc);
    MsdSplitTree best = beamSplits[0];
    best.beamSiblings = beamSplits;
    return best;
}
/// <summary>
/// Restores this node, and recursively its sub-trees and beam siblings, from a binary stream.
/// Read order: attrId, example count followed by example ids, ambigThis, ambigChild,
/// ambigRecurs, subTreeSizeRecurs, sub-tree count followed by (key, sub-tree) pairs, beam
/// sibling count followed by (flag, optional tree) entries. A negative count means the
/// corresponding collection is null.
/// </summary>
/// <param name="binRead">Reader positioned at the start of the serialized node.</param>
/// <param name="exampleMapping">Maps the example ids stored in the stream to LemmaExample instances.</param>
/// <param name="msdSpec">Specification stored on this node and passed on to nested trees.</param>
private void Deserialize(BinaryReader binRead, Dictionary<int, LemmaExample> exampleMapping, MsdSpec msdSpec)
{
    this.msdSpec = msdSpec;
    attrId = binRead.ReadInt32();

    // Examples are stored as ids and resolved through the mapping.
    int storedExamples = binRead.ReadInt32();
    if (storedExamples >= 0)
    {
        exampleList = new List<LemmaExample>(storedExamples);
        for (int n = 0; n < storedExamples; n++)
        {
            int exampleId = binRead.ReadInt32();
            exampleList.Add(exampleMapping[exampleId]);
        }
    }
    else
    {
        exampleList = null;
    }

    ambigThis = binRead.ReadDouble();
    ambigChild = binRead.ReadDouble();
    ambigRecurs = binRead.ReadDouble();
    subTreeSizeRecurs = binRead.ReadInt32();

    // Sub-trees: key character first, then the nested tree.
    int storedSubTrees = binRead.ReadInt32();
    if (storedSubTrees >= 0)
    {
        subTrees = new Dictionary<char, MsdSplitTree>();
        for (int n = 0; n < storedSubTrees; n++)
        {
            char key = binRead.ReadChar();
            MsdSplitTree child = new MsdSplitTree(binRead, exampleMapping, msdSpec);
            subTrees.Add(key, child);
        }
    }
    else
    {
        subTrees = null;
    }

    // Beam siblings: a true flag stands for this very node, otherwise a full tree follows.
    int storedSiblings = binRead.ReadInt32();
    if (storedSiblings >= 0)
    {
        beamSiblings = new List<MsdSplitTree>(storedSiblings);
        for (int n = 0; n < storedSiblings; n++)
        {
            bool isThisTree = binRead.ReadBoolean();
            MsdSplitTree sibling = isThisTree
                ? this
                : new MsdSplitTree(binRead, exampleMapping, msdSpec);
            beamSiblings.Add(sibling);
        }
    }
    else
    {
        beamSiblings = null;
    }
}