예제 #1
0
        /// <summary>
        /// Tries to split each still-ambiguous sub-tree of <paramref name="bestTree"/> one level deeper
        /// and rebuilds the aggregate statistics stored on <paramref name="bestTree"/>.
        /// </summary>
        /// <param name="bestTree">
        /// Tree whose <c>subTrees</c> are examined. Its <c>ambigRecurs</c>, <c>ambigChild</c> and
        /// <c>subTreeSizeRecurs</c> are overwritten; its <c>subTrees</c> map is only read here.
        /// </param>
        /// <param name="msdSpec">Passed through to <c>RecursiveSplitBeam</c>.</param>
        /// <param name="level">Level of <paramref name="bestTree"/>; children are split at <c>level + 1</c>.</param>
        /// <param name="beamParams">Passed through to <c>RecursiveSplitBeam</c>.</param>
        /// <returns>
        /// A new map with the same keys as <c>bestTree.subTrees</c>, where each value is either the
        /// re-split sub-tree or, when no split was attempted or none was returned, the original one.
        /// </returns>
        private static Dictionary<char, MsdSplitTree> RecursiveSplit(MsdSplitTree bestTree, MsdSpec msdSpec, int level, BeamSearchParams beamParams)
        {
            Dictionary<char, MsdSplitTree> result = new Dictionary<char, MsdSplitTree>();

            // Start the aggregates from zero; they are summed up again from the chosen children below.
            bestTree.ambigRecurs       = 0;
            bestTree.ambigChild        = 0;
            bestTree.subTreeSizeRecurs = 0;

            foreach (KeyValuePair<char, MsdSplitTree> entry in bestTree.subTrees)
            {
                MsdSplitTree current = entry.Value;

                // Only a child that still carries ambiguity is split further.
                MsdSplitTree deeper = current.ambigChild > 0
                    ? RecursiveSplitBeam(current.exampleList, current.ambigChild, msdSpec, level + 1, beamParams)
                    : null;

                // A null result means "nothing better found": keep the child unchanged.
                MsdSplitTree chosen = deeper == null ? current : deeper;
                result.Add(entry.Key, chosen);

                // The parent's child-level ambiguity is the sum of each child's own (ambigThis) value.
                bestTree.ambigRecurs       += chosen.ambigRecurs;
                bestTree.ambigChild        += chosen.ambigThis;
                bestTree.subTreeSizeRecurs += chosen.subTreeSizeRecurs;
            }

            return result;
        }
예제 #2
0
 /// <summary>
 /// Creates a lemmatizer with an empty example list and no model: both tree roots and the
 /// MSD split tree start out as <c>null</c>.
 /// </summary>
 /// <param name="lsett">Settings stored on the instance and handed to the new <c>ExampleList</c>.</param>
 public Lemmatizer(LemmatizerSettings lsett)
 {
     // Nothing has been built yet.
     msdSplitTree     = null;
     ltnRootNodeFront = null;
     ltnRootNode      = null;

     this.lsett = lsett;
     elExamples = new ExampleList(lsett);
 }
예제 #3
0
        /// <summary>
        /// Builds a split tree from <paramref name="examples"/> by running <c>Split</c> on the prepared
        /// example list and copying the resulting tree's state into this instance.
        /// </summary>
        /// <param name="examples">Examples handed to <c>PrepareExampleList</c>.</param>
        /// <param name="msdSpec">MSD specification forwarded to <c>Split</c>.</param>
        /// <param name="beamParams">Beam search options; a default-constructed instance is used when null.</param>
        public MsdSplitTree(List<LemmaExample> examples, MsdSpec msdSpec, BeamSearchParams beamParams)
        {
            BeamSearchParams effectiveParams = beamParams ?? new BeamSearchParams();

            CopyVariablesToThis(Split(PrepareExampleList(examples), msdSpec, effectiveParams));
        }
예제 #4
0
 /// <summary>
 /// Makes this instance a shallow copy of <paramref name="et"/>: every member is assigned directly,
 /// so lists and sub-tree maps are shared with <paramref name="et"/>, not cloned.
 /// </summary>
 /// <param name="et">Tree whose state is taken over.</param>
 private void CopyVariablesToThis(MsdSplitTree et)
 {
     // Identity of the split.
     msdSpec = et.msdSpec;
     attrId  = et.attrId;

     // Structure (references are shared with the source tree).
     exampleList  = et.exampleList;
     subTrees     = et.subTrees;
     beamSiblings = et.beamSiblings;

     // Statistics.
     ambigThis         = et.ambigThis;
     ambigChild        = et.ambigChild;
     ambigRecurs       = et.ambigRecurs;
     subTreeSizeRecurs = et.subTreeSizeRecurs;
 }
예제 #5
0
        /// <summary>
        /// Recomputes the ambiguity of each direct sub-tree from its example list, stores it in that
        /// sub-tree's <c>ambigChild</c>, and returns the sum over all direct sub-trees.
        /// </summary>
        /// <param name="et">Tree whose <c>subTrees</c> are measured; <c>subTrees</c> must not be null.</param>
        /// <returns>Sum of <c>GetListAmbiguities</c> over the direct sub-trees' example lists.</returns>
        private static double GetChildsAmbiguities(MsdSplitTree et)
        {
            double total = 0;

            foreach (KeyValuePair<char, MsdSplitTree> child in et.subTrees)
            {
                double childAmbiguity = GetListAmbiguities(child.Value.exampleList);

                child.Value.ambigChild = childAmbiguity;
                total += childAmbiguity;
            }

            return total;
        }
예제 #6
0
 /// <summary>
 /// Comparison for ascending order by <c>subTreeSizeRecurs</c>.
 /// </summary>
 /// <returns>1 when <paramref name="x"/> is larger, -1 when it is smaller, 0 when both are equal.</returns>
 private static int CompareTreesRecurSizeAsc(MsdSplitTree x, MsdSplitTree y)
 {
     return x.subTreeSizeRecurs > y.subTreeSizeRecurs
         ? 1
         : (x.subTreeSizeRecurs < y.subTreeSizeRecurs ? -1 : 0);
 }
예제 #7
0
 /// <summary>
 /// Comparison for ascending order by <c>ambigChild</c>.
 /// </summary>
 /// <returns>
 /// 1 when <paramref name="x"/> has the larger value, -1 when it has the smaller one, 0 otherwise
 /// (which includes the case where neither relational test holds).
 /// </returns>
 private static int CompareTreesAbmibuitiesAsc(MsdSplitTree x, MsdSplitTree y)
 {
     // Plain relational operators are used on purpose; they decide the "otherwise" case above.
     return x.ambigChild > y.ambigChild
         ? 1
         : (x.ambigChild < y.ambigChild ? -1 : 0);
 }
예제 #8
0
        /// <summary>
        /// Partitions <paramref name="el"/> by the value of one MSD attribute and returns a tree whose
        /// direct sub-trees are the resulting groups, with ambiguity statistics filled in.
        /// </summary>
        /// <param name="el">Examples to partition; also stored as the returned tree's <c>exampleList</c>.</param>
        /// <param name="attrId">Attribute id passed to <c>msdSpec.GetAttrValue</c> and stored on the tree.</param>
        /// <param name="msdSpec">Specification used to read the attribute value of each example's MSD.</param>
        /// <returns>
        /// A tree with one sub-tree per distinct attribute value plus the always-present <c>'#'</c> group.
        /// Each sub-tree gets <c>ambigThis</c>, <c>ambigChild</c> and <c>ambigRecurs</c> set to its list
        /// ambiguity and <c>subTreeSizeRecurs</c> set to 1; the returned tree gets the summed ambiguity
        /// and the number of sub-trees.
        /// </returns>
        private static MsdSplitTree SplitByMsdAttribute(List<LemmaExample> el, int attrId, MsdSpec msdSpec)
        {
            MsdSplitTree root = new MsdSplitTree(msdSpec);
            Dictionary<char, MsdSplitTree> buckets = new Dictionary<char, MsdSplitTree>();

            root.attrId      = attrId;
            root.subTrees    = buckets;
            root.exampleList = el;

            // todo FIX IT (carried over from the original): a '#' bucket is created up front, so it
            // exists - and is counted in subTreeSizeRecurs - even when no example maps to '#'.
            MsdSplitTree defaultBucket = new MsdSplitTree(msdSpec);
            defaultBucket.exampleList = new List<LemmaExample>();
            buckets['#'] = defaultBucket;

            // Distribute the examples over the buckets, creating a bucket on first use of a value.
            foreach (LemmaExample example in el)
            {
                char attrValue = msdSpec.GetAttrValue(example.Msd, attrId);

                MsdSplitTree bucket;
                if (!buckets.TryGetValue(attrValue, out bucket))
                {
                    bucket = new MsdSplitTree(msdSpec);
                    bucket.exampleList = new List<LemmaExample>();
                    buckets[attrValue] = bucket;
                }
                bucket.exampleList.Add(example);
            }

            // Every bucket starts as a leaf: all three ambiguity figures equal its list ambiguity.
            double summedAmbiguity = 0;
            foreach (MsdSplitTree leaf in buckets.Values)
            {
                double leafAmbiguity = GetListAmbiguities(leaf.exampleList);

                leaf.ambigThis         = leafAmbiguity;
                leaf.ambigChild        = leafAmbiguity;
                leaf.ambigRecurs       = leafAmbiguity;
                leaf.subTreeSizeRecurs = 1;

                summedAmbiguity += leafAmbiguity;
            }

            root.ambigChild        = summedAmbiguity;
            root.ambigRecurs       = summedAmbiguity;
            root.subTreeSizeRecurs = buckets.Count;

            return root;
        }
예제 #9
0
        /// <summary>
        /// Builds the lemmatization tree(s) from the examples collected so far. Does nothing when
        /// <c>ltnRootNode</c> is already set.
        /// </summary>
        /// <param name="msdSpec">
        /// Textual MSD specification. When it is non-empty and the settings enable the MSD split tree
        /// optimization with <c>MsdConsideration.Distinct</c>, an <c>MsdSplitTree</c> is built and
        /// every example is re-added with its MSD replaced by <c>msdSplitTree.TransformMsd</c>.
        /// </param>
        /// <param name="beamSearchOpt">Beam search options forwarded to the <c>MsdSplitTree</c> constructor.</param>
        public void BuildModel(string msdSpec, MsdSplitTree.BeamSearchParams beamSearchOpt)
        {
            // A model already exists; building again is not supported.
            if (ltnRootNode != null)
            {
                return;
            }

            // If MSDs are used and the other criteria are fulfilled, use the MSD split tree optimization.
            bool useMsdSplitTree = lsett.bUseMsdSplitTreeOptimization
                                   && lsett.eMsdConsider == LemmatizerSettings.MsdConsideration.Distinct
                                   && !string.IsNullOrEmpty(msdSpec);

            if (useMsdSplitTree)
            {
                msdSplitTree = new MsdSplitTree(elExamples.ListExamples, new MsdSpec(msdSpec), beamSearchOpt);

                // Replace the example list with a copy whose MSDs are mapped through the split tree.
                ExampleList originalExamples = elExamples;
                elExamples = new ExampleList(lsett);
                foreach (LemmaExample le in originalExamples.ListExamples)
                {
                    string newMsd = msdSplitTree.TransformMsd(le.Msd);
                    elExamples.AddExample(le.Word, le.Lemma, le.Weight, newMsd);
                }
                // TODO (from the original): a problem occurs if this transformation runs twice;
                // the ltnRootNode guard above only protects against that once a build has completed.
            }
            elExamples.FinalizeAdditions();

            if (lsett.bBuildFrontLemmatizer)
            {
                ltnRootNode      = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(false));
                ltnRootNodeFront = new LemmaTreeNode(lsett, elExamples.GetFrontRearExampleList(true));
            }
            else
            {
                ltnRootNode = new LemmaTreeNode(lsett, elExamples);
            }
        }
예제 #10
0
        /// <summary>
        /// Sums the list ambiguity of every leaf below (or at) <paramref name="et"/>.
        /// </summary>
        /// <param name="et">Root of the (sub-)tree to measure.</param>
        /// <returns>
        /// For a leaf (<c>subTrees == null</c>) the ambiguity of its own example list; otherwise the
        /// sum of this method applied to each direct sub-tree.
        /// </returns>
        private static double GetRecursiveAmbiguities(MsdSplitTree et)
        {
            // Leaf: there are no children to descend into. The previous code delegated to
            // GetChildsAmbiguities here, which enumerates et.subTrees and therefore always threw a
            // NullReferenceException. NOTE(review): measuring the leaf's own list is the assumed
            // intent - confirm against the callers.
            if (et.subTrees == null)
            {
                return GetListAmbiguities(et.exampleList);
            }

            double weight = 0;

            foreach (MsdSplitTree etSub in et.subTrees.Values)
            {
                weight += GetRecursiveAmbiguities(etSub);
            }

            return weight;
        }
예제 #11
0
        /// <summary>
        /// Writes an indented, one-line-per-node dump of the tree to the console, descending until
        /// <paramref name="maxLevel"/> is exceeded.
        /// </summary>
        /// <param name="et">Node to print.</param>
        /// <param name="msdSpec">Provides <c>attrIdToNameMap</c> for turning the node's attribute id into a name.</param>
        /// <param name="level">Depth of <paramref name="et"/>; the line is indented by two spaces per level.</param>
        /// <param name="maxLevel">Nodes with <paramref name="level"/> greater than this are not printed.</param>
        /// <param name="attrSet">Description of the attribute tests on the path to this node; extended for each child.</param>
        private static void OutputTree(MsdSplitTree et, MsdSpec msdSpec, int level, int maxLevel, string attrSet)
        {
            if (level > maxLevel)
            {
                return;
            }

            int    attrId      = et.attrId;
            string attrName    = msdSpec.attrIdToNameMap[attrId];
            bool   hasSubTrees = et.subTrees != null;

            // ",SplitBy=..." part: which attribute this node splits on and how large each class is.
            StringBuilder sbSubGroups = new StringBuilder();
            if (hasSubTrees)
            {
                sbSubGroups.AppendFormat(",SplitBy={0}({1}) To={2} classes:",
                                         attrName, attrId, et.subTrees.Count.ToString());
                foreach (KeyValuePair<char, MsdSplitTree> child in et.subTrees)
                {
                    sbSubGroups.AppendFormat("|{0}:{1}", child.Key, child.Value.exampleList.Count);
                }
            }

            // ",BeamSibling=..." part: recursive sizes of the alternative trees kept by the beam search.
            // It is printed directly after the sub-group part, so the resulting line is unchanged.
            StringBuilder sbBeam = new StringBuilder();
            if (et.beamSiblings != null)
            {
                sbBeam.Append(",BeamSibling=");
                foreach (MsdSplitTree sibling in et.beamSiblings)
                {
                    sbBeam.AppendFormat("|{0}", sibling.subTreeSizeRecurs);
                }
            }

            Console.Write(new string(' ', level * 2));
            Console.WriteLine("Examples={0},AttrSet=({1}),SubTree={2},Ambig:(T={3}/S={4}/R={5}){6}{7}",
                              et.exampleList.Count, attrSet, et.subTreeSizeRecurs,
                              et.ambigThis, et.ambigChild, et.ambigRecurs, sbSubGroups, sbBeam);

            if (!hasSubTrees)
            {
                return;
            }

            foreach (KeyValuePair<char, MsdSplitTree> child in et.subTrees)
            {
                string separator    = attrSet.Length > 0 ? "&" : "";
                string childAttrSet = attrSet + separator + attrName + "='" + child.Key + "'";

                OutputTree(child.Value, msdSpec, level + 1, maxLevel, childAttrSet);
            }
        }
예제 #12
0
        /// <summary>
        /// Restores the lemmatizer state from <paramref name="binRead"/>. The reads happen in a fixed
        /// order: settings, the "examples serialized" flag, the example list, optionally the rear and
        /// front example lists, the tree node(s), and finally an optional MSD split tree.
        /// </summary>
        /// <param name="binRead">Reader positioned at the start of the serialized lemmatizer.</param>
        public void Deserialize(BinaryReader binRead)
        {
            lsett = new LemmatizerSettings(binRead);

            bool bSerializeExamples = binRead.ReadBoolean();

            elExamples = new ExampleList(binRead, lsett);

            // The rear/front lists are always obtained, whether or not a front lemmatizer is built:
            // derived from elExamples when the flag is set, read from the stream otherwise.
            ExampleList elExamplesRear;
            ExampleList elExamplesFront;

            if (!bSerializeExamples)
            {
                elExamplesRear  = new ExampleList(binRead, lsett);
                elExamplesFront = new ExampleList(binRead, lsett);
            }
            else
            {
                elExamplesRear  = elExamples.GetFrontRearExampleList(false);
                elExamplesFront = elExamples.GetFrontRearExampleList(true);
            }

            if (lsett.bBuildFrontLemmatizer)
            {
                ltnRootNode      = new LemmaTreeNode(binRead, lsett, elExamplesRear, null);
                ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elExamplesFront, null);
            }
            else
            {
                ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
            }

            // A boolean marker says whether a split tree follows in the stream.
            bool bMsdSplitTreePresent = binRead.ReadBoolean();

            msdSplitTree = bMsdSplitTreePresent ? new MsdSplitTree(binRead) : null;
        }
예제 #13
0
        /// <summary>
        /// Tries the first few candidate splits of <paramref name="el"/> (the beam), recursively
        /// refines each one that reduces ambiguity, and returns the refined tree with the smallest
        /// recursive size.
        /// </summary>
        /// <param name="el">Examples to split.</param>
        /// <param name="weightInitial">
        /// Ambiguity before splitting; only candidates whose <c>ambigChild</c> is strictly lower are kept,
        /// and it is stored as <c>ambigThis</c> of every kept candidate.
        /// </param>
        /// <param name="msdSpec">MSD specification forwarded to the split helpers.</param>
        /// <param name="level">Current depth; selects the beam width from <c>beamParams.beamsPerLevel</c>.</param>
        /// <param name="beamParams">Beam search options. The beam width is 1 when no width is configured for <paramref name="level"/>.</param>
        /// <returns>
        /// <c>null</c> when no candidate improves on <paramref name="weightInitial"/> (including when
        /// there are no candidates at all); otherwise the best candidate. When more than one candidate
        /// was kept, the returned tree's <c>beamSiblings</c> holds all kept candidates, itself included,
        /// sorted ascending by recursive size.
        /// </returns>
        private static MsdSplitTree RecursiveSplitBeam(List<LemmaExample> el, double weightInitial, MsdSpec msdSpec, int level, BeamSearchParams beamParams)
        {
            // Candidates are taken from the front of this list - presumably best-first; see ProduceOrderedSplits.
            List<MsdSplitTree> splits = ProduceOrderedSplits(el, weightInitial, msdSpec);

            int beamSize = 1;
            if (beamParams.beamsPerLevel != null && beamParams.beamsPerLevel.ContainsKey(level))
            {
                beamSize = beamParams.beamsPerLevel[level];
            }

            // Never index past the available candidates. Previously only a configured width was
            // clamped, so the default width of 1 threw on an empty candidate list.
            beamSize = Math.Min(beamSize, splits.Count);

            List<MsdSplitTree> beamSplits = new List<MsdSplitTree>();

            for (int beam = 0; beam < beamSize; beam++)
            {
                MsdSplitTree candidate = splits[beam];

                // Skip splits that do not reduce the ambiguity at all.
                if (candidate.ambigChild < weightInitial)
                {
                    candidate.subTrees  = RecursiveSplit(candidate, msdSpec, level, beamParams);
                    candidate.ambigThis = weightInitial;
                    beamSplits.Add(candidate);
                }
            }

            if (beamSplits.Count == 0)
            {
                return null;
            }
            if (beamSplits.Count == 1)
            {
                return beamSplits[0];
            }

            // Several survivors: prefer the smallest tree and remember all of them on the winner.
            beamSplits.Sort(CompareTreesRecurSizeAsc);
            MsdSplitTree best = beamSplits[0];

            best.beamSiblings = beamSplits;
            return best;
        }
예제 #14
0
        /// <summary>
        /// Reads this node, its sub-trees and its beam siblings from <paramref name="binRead"/>.
        /// For each of the three collections a negative count in the stream stands for <c>null</c>.
        /// </summary>
        /// <param name="binRead">Reader positioned at the start of a serialized node.</param>
        /// <param name="exampleMapping">Lookup from the example ids stored in the stream to example objects.</param>
        /// <param name="msdSpec">Specification stored on this node and passed on to every nested node.</param>
        private void Deserialize(BinaryReader binRead, Dictionary<int, LemmaExample> exampleMapping, MsdSpec msdSpec)
        {
            this.msdSpec = msdSpec;

            attrId = binRead.ReadInt32();

            // Examples are stored as ids and resolved through exampleMapping.
            int exampleCount = binRead.ReadInt32();
            if (exampleCount >= 0)
            {
                exampleList = new List<LemmaExample>(exampleCount);
                for (int read = 0; read < exampleCount; read++)
                {
                    int exampleId = binRead.ReadInt32();
                    exampleList.Add(exampleMapping[exampleId]);
                }
            }
            else
            {
                exampleList = null;
            }

            ambigThis         = binRead.ReadDouble();
            ambigChild        = binRead.ReadDouble();
            ambigRecurs       = binRead.ReadDouble();
            subTreeSizeRecurs = binRead.ReadInt32();

            // Sub-trees: a key character followed by the nested node, repeated.
            int subTreeCount = binRead.ReadInt32();
            if (subTreeCount >= 0)
            {
                subTrees = new Dictionary<char, MsdSplitTree>();
                for (int read = 0; read < subTreeCount; read++)
                {
                    char         key   = binRead.ReadChar();
                    MsdSplitTree child = new MsdSplitTree(binRead, exampleMapping, msdSpec);
                    subTrees.Add(key, child);
                }
            }
            else
            {
                subTrees = null;
            }

            // Beam siblings: each entry starts with a flag; true means "this very node" and nothing
            // more is read for it, false means a full nested node follows.
            int siblingCount = binRead.ReadInt32();
            if (siblingCount >= 0)
            {
                beamSiblings = new List<MsdSplitTree>(siblingCount);
                for (int read = 0; read < siblingCount; read++)
                {
                    bool isThisTree = binRead.ReadBoolean();
                    beamSiblings.Add(isThisTree
                        ? this
                        : new MsdSplitTree(binRead, exampleMapping, msdSpec));
                }
            }
            else
            {
                beamSiblings = null;
            }
        }