示例#1
0
        /// <summary>
        /// Creates an example list bound to the given settings. The example
        /// dictionary starts empty, the example list field starts as null, and a
        /// new <see cref="RuleList"/> is created from the same settings.
        /// </summary>
        /// <param name="lsett">Settings stored on this instance and passed to the new rule list.</param>
        public ExampleList(LemmatizerSettings lsett)
            : base()
        {
            this.lsett = lsett;

            dictExamples = new Dictionary<string, LemmaExample>();
            lstExamples = null;
            rlRules = new RuleList(lsett);
        }
示例#2
0
        /// <summary>
        /// Builds a rule from a word/lemma pair. The split offset is the value
        /// returned by <c>SameStem(sWord, sLemma)</c>: <c>sTo</c> is the part of
        /// the lemma from that offset on, and <c>iFrom</c> is the number of word
        /// characters from that offset to the end of the word.
        /// </summary>
        /// <param name="sWord">Word form the rule is derived from.</param>
        /// <param name="sLemma">Lemma the rule is derived from.</param>
        /// <param name="iId">Identifier stored on the rule.</param>
        /// <param name="lsett">Settings; <c>bUseFromInRules</c> selects the signature format.</param>
        public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett)
        {
            this.iId = iId;
            this.lsett = lsett;

            int iSplitOffset = SameStem(sWord, sLemma);
            sTo = sLemma.Substring(iSplitOffset);
            iFrom = sWord.Length - iSplitOffset;

            if (!lsett.bUseFromInRules)
            {
                // Signature identifies the source side only by its length.
                sFrom = null;
                sSignature = "[#" + iFrom + "]==>[" + sTo + "]";
                return;
            }

            // Signature spells out the replaced word ending.
            sFrom = sWord.Substring(iSplitOffset);
            sSignature = "[" + sFrom + "]==>[" + sTo + "]";
        }
        /// <summary>
        /// Builds a tree node for the examples in the index range
        /// <paramref name="iStart"/>..<paramref name="iEnd"/>. When the range is
        /// not valid for the list, the node only receives the rule list's default
        /// rule with weight 0. Otherwise the node's condition, similarity and
        /// whole-word flag are derived, <c>FindBestRules</c> and <c>AddSubAll</c>
        /// are called, and selected child nodes are replaced by their only child.
        /// </summary>
        /// <param name="lsett">Settings forwarded to the settings-only constructor.</param>
        /// <param name="elExamples">Example list that the index range refers to.</param>
        /// <param name="iStart">Index of the first word of the current group</param>
        /// <param name="iEnd">Index of the last word of the current group</param>
        /// <param name="ltnParentNode">Parent node; when null the condition length starts from 0 and the whole-word flag is false.</param>
        private LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples, int iStart, int iEnd, LemmaTreeNode ltnParentNode)
            : this(lsett)
        {
            this.ltnParentNode = ltnParentNode;
            dictSubNodes = null;

            this.iStart = iStart;
            this.iEnd = iEnd;
            this.elExamples = elExamples;

            bool bRangeValid = iStart < elExamples.Count && iEnd < elExamples.Count && iStart <= iEnd;
            if (!bRangeValid)
            {
                // Nothing to learn from: fall back to the default rule with zero weight.
                lrBestRule = elExamples.Rules.DefaultRule;
                aBestRules = new RuleWeighted[1];
                aBestRules[0] = new RuleWeighted(lrBestRule, 0);
                dWeight = 0;
                return;
            }

            // The condition is the ending of the first word, one character longer
            // than the parent's similarity but never longer than the word itself.
            int iWantedLength = ltnParentNode == null ? 0 : ltnParentNode.iSimilarity + 1;
            int iConditionLength = Math.Min(iWantedLength, elExamples[iStart].Word.Length);
            int iConditionStart = elExamples[iStart].Word.Length - iConditionLength;
            sCondition = elExamples[iStart].Word.Substring(iConditionStart);

            iSimilarity = elExamples[iStart].Similarity(elExamples[iEnd]);

            if (ltnParentNode == null)
            {
                bWholeWord = false;
            }
            else
            {
                bWholeWord = elExamples[iEnd].Word.Length == ltnParentNode.iSimilarity;
            }

            FindBestRules();
            AddSubAll();

            if (dictSubNodes == null)
            {
                return;
            }

            //TODO check this heuristics, can be problematic when there are more applicable rules
            // Collect replacements first; the dictionary is not modified while it is enumerated.
            var lReplaceNodes = new List<KeyValuePair<char, LemmaTreeNode>>();
            foreach (KeyValuePair<char, LemmaTreeNode> kvpChild in dictSubNodes)
            {
                LemmaTreeNode ltnChild = kvpChild.Value;
                if (ltnChild.dictSubNodes == null || ltnChild.dictSubNodes.Count != 1)
                {
                    continue;
                }

                if (ltnChild.lrBestRule == lrBestRule)
                {
                    // Exactly one entry exists, so this loop body runs once.
                    foreach (LemmaTreeNode ltnGrandChild in ltnChild.dictSubNodes.Values)
                    {
                        lReplaceNodes.Add(new KeyValuePair<char, LemmaTreeNode>(kvpChild.Key, ltnGrandChild));
                    }
                }
            }

            foreach (KeyValuePair<char, LemmaTreeNode> kvpReplacement in lReplaceNodes)
            {
                dictSubNodes[kvpReplacement.Key] = kvpReplacement.Value;
                kvpReplacement.Value.ltnParentNode = this;
            }
        }
示例#4
0
 /// <summary>
 /// Initializes the lemmatizer through the settings-only constructor and
 /// then calls <c>AddMultextFile</c> with the given reader and format.
 /// </summary>
 /// <param name="srIn">Reader passed unchanged to <c>AddMultextFile</c>.</param>
 /// <param name="sFormat">Format string passed unchanged to <c>AddMultextFile</c>.</param>
 /// <param name="lsett">Settings passed to the chained constructor.</param>
 public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett)
     : this(lsett) {
     AddMultextFile(srIn, sFormat);
 }
示例#5
0
        /// <summary>
        /// Restores the settings, the example list and the tree root node(s) of
        /// this lemmatizer from a binary reader. Values are consumed in a fixed
        /// order: settings, a boolean flag, the main example list, optionally the
        /// rear and front example lists, then the tree(s).
        /// </summary>
        /// <param name="binRead">Reader to consume the serialized data from.</param>
        public void Deserialize(BinaryReader binRead)
        {
            lsett = new LemmatizerSettings(binRead);

            bool bExamplesSerialized = binRead.ReadBoolean();
            elExamples = new ExampleList(binRead, lsett);

            // When the flag is set, the rear/front lists come from the main list;
            // otherwise they are read from the stream. Rear always comes before front.
            ExampleList elRear = bExamplesSerialized
                ? elExamples.GetFrontRearExampleList(false)
                : new ExampleList(binRead, lsett);
            ExampleList elFront = bExamplesSerialized
                ? elExamples.GetFrontRearExampleList(true)
                : new ExampleList(binRead, lsett);

            if (lsett.bBuildFrontLemmatizer)
            {
                ltnRootNode = new LemmaTreeNode(binRead, lsett, elRear, null);
                ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elFront, null);
                return;
            }

            ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
        }
示例#6
0
 /// <summary>
 /// Creates an example list by delegating to <c>Load</c> with the given
 /// serializer and settings.
 /// </summary>
 /// <param name="binRead">Serializer passed unchanged to <c>Load</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Load</c>.</param>
 public ExampleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
 {
     Load(binRead, lsett);
 }
示例#7
0
 /// <summary>
 /// Creates a node that only has its settings assigned; no other field is
 /// set by this constructor.
 /// </summary>
 /// <param name="lsett">Settings stored on the node.</param>
 private LemmaTreeNode(LemmatizerSettings lsett) {
     this.lsett = lsett;
 }
示例#8
0
        /// <summary>
        /// Restores this node, and recursively its sub-nodes, from a binary reader.
        /// </summary>
        /// <remarks>
        /// Values are consumed from <paramref name="binRead"/> in a fixed order:
        /// sub-node presence flag (followed by count and key/sub-node pairs when set),
        /// similarity, condition, whole-word flag, best-rule signature, weighted best
        /// rules, node weight, per-MSD best rules, per-MSD weights, start index and
        /// end index. Rules are looked up by signature in <c>elExamples.Rules</c>.
        /// </remarks>
        /// <param name="binRead">Reader to consume the serialized node from.</param>
        /// <param name="lsett">Settings stored on this node and passed to every sub-node.</param>
        /// <param name="elExamples">Example list stored on this node; its rule list resolves rule signatures.</param>
        /// <param name="ltnParentNode">Node stored as this node's parent.</param>
        public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
        {
            this.lsett = lsett;

            // A leading boolean tells whether a sub-node dictionary follows.
            if (binRead.ReadBoolean())
            {
                dictSubNodes = new Dictionary <char, LemmaTreeNode>();
                int iCount = binRead.ReadInt32();
                for (int i = 0; i < iCount; i++)
                {
                    // Each entry is a key character followed by the sub-node itself,
                    // which is read recursively with this node as its parent.
                    char          cKey   = binRead.ReadChar();
                    LemmaTreeNode ltrSub = new LemmaTreeNode(binRead, this.lsett, elExamples, this);
                    dictSubNodes.Add(cKey, ltrSub);
                }
            }
            else
            {
                dictSubNodes = null;
            }

            this.ltnParentNode = ltnParentNode;

            iSimilarity = binRead.ReadInt32();
            sCondition  = binRead.ReadString();
            bWholeWord  = binRead.ReadBoolean();

            // The best rule is stored as a signature and resolved through the rule list.
            lrBestRule = elExamples.Rules[binRead.ReadString()];

            int iCountBest = binRead.ReadInt32();

            aBestRules = new RuleWeighted[iCountBest];
            for (int i = 0; i < iCountBest; i++)
            {
                // Stored as (rule signature, weight); the signature is read before the weight.
                aBestRules[i] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
            }

            dWeight = binRead.ReadDouble();

            // Deserialize the dictMsdBestRules dictionary; a count of -1 marks a null dictionary.
            int dictMsdBestRulesCount = binRead.ReadInt32();

            if (dictMsdBestRulesCount == -1)
            {
                dictMsdBestRules = null;
            }
            else
            {
                dictMsdBestRules = new Dictionary <string, RuleWeighted[]>();
                for (int msdId = 0; msdId < dictMsdBestRulesCount; msdId++)
                {
                    string         sMsd = binRead.ReadString();
                    RuleWeighted[] lRuleWeighted;
                    // A count of -1 marks a null rule array for this MSD key.
                    int            ruleWeightedCount = binRead.ReadInt32();
                    if (ruleWeightedCount == -1)
                    {
                        lRuleWeighted = null;
                    }
                    else
                    {
                        lRuleWeighted = new RuleWeighted[ruleWeightedCount];
                        for (int ruleId = 0; ruleId < ruleWeightedCount; ruleId++)
                        {
                            string    ruleSignature = binRead.ReadString();
                            double    ruleWeight    = binRead.ReadDouble();
                            LemmaRule rule          = elExamples.Rules[ruleSignature];
                            lRuleWeighted[ruleId] = new RuleWeighted(rule, ruleWeight);
                        }
                    }
                    dictMsdBestRules.Add(sMsd, lRuleWeighted);
                }
            }

            // Deserialize the dictMsdWeights dictionary; a count of -1 marks a null dictionary.
            int dictMsdWeightsCount = binRead.ReadInt32();

            if (dictMsdWeightsCount == -1)
            {
                dictMsdWeights = null;
            }
            else
            {
                dictMsdWeights = new Dictionary <string, double>();
                for (int msdId = 0; msdId < dictMsdWeightsCount; msdId++)
                {
                    string sMsd       = binRead.ReadString();
                    double dMsdWeight = binRead.ReadDouble();
                    dictMsdWeights.Add(sMsd, dMsdWeight);
                }
            }

            // The example index range is stored last.
            iStart          = binRead.ReadInt32();
            iEnd            = binRead.ReadInt32();
            this.elExamples = elExamples;
        }
示例#9
0
 /// <summary>
 /// Creates a rule list by delegating to <c>Load</c> with the given
 /// serializer and settings.
 /// </summary>
 /// <param name="binRead">Serializer passed unchanged to <c>Load</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Load</c>.</param>
 public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
     Load(binRead, lsett);
 }
 /// <summary>
 /// Creates an example by delegating to <c>Deserialize</c> with the given
 /// reader, settings and rule.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lrRule">Rule passed unchanged to <c>Deserialize</c>.</param>
 public LemmaExample(BinaryReader binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
     Deserialize(binRead, lsett, lrRule);
 }
示例#11
0
 /// <summary>
 /// Creates a node that only has its settings assigned; no other field is
 /// set by this constructor.
 /// </summary>
 /// <param name="lsett">Settings stored on the node.</param>
 private LemmaTreeNode(LemmatizerSettings lsett)
 {
     this.lsett = lsett;
 }
示例#12
0
 /// <summary>
 /// Creates a node by delegating to <c>Deserialize</c> with the given
 /// reader, settings, example list and parent node.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="elExamples">Example list passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="ltnParentNode">Parent node passed unchanged to <c>Deserialize</c>.</param>
 public LemmaTreeNode(BinaryReader binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
 {
     Deserialize(binRead, lsett, elExamples, ltnParentNode);
 }
示例#13
0
 /// <summary>
 /// Creates an example list by delegating to <c>Deserialize</c> with the
 /// given reader and settings.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 public ExampleList(BinaryReader binRead, LemmatizerSettings lsett)
 {
     Deserialize(binRead, lsett);
 }
 /// <summary>
 /// Passes the settings to the base constructor and stores the given
 /// language value on this instance.
 /// </summary>
 /// <param name="lang">Language value stored in the <c>lang</c> field.</param>
 /// <param name="lsett">Settings forwarded to the base constructor.</param>
 public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
     : base(lsett)
 {
     this.lang = lang;
 }
示例#15
0
 /// <summary>
 /// Creates a rule by delegating to <c>Load</c> with the given serializer
 /// and settings.
 /// </summary>
 /// <param name="binRead">Serializer passed unchanged to <c>Load</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Load</c>.</param>
 public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
 {
     Load(binRead, lsett);
 }
示例#16
0
        /// <summary>
        /// Restores this rule from a Latino binary serializer. Values are
        /// consumed in a fixed order: top-object flag, id, from-length, a flag
        /// telling whether a from-string follows (and the string when it does),
        /// to-string, signature, and finally the settings when the top-object
        /// flag was set.
        /// </summary>
        /// <param name="binRead">Serializer to consume the rule from.</param>
        /// <param name="lsett">Settings used when the stream does not carry its own.</param>
        public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
        {
            // Metadata: set when this rule carries its own settings in the stream.
            bool bIsTopObject = binRead.ReadBool();

            // Value fields.
            iId = binRead.ReadInt();
            iFrom = binRead.ReadInt();

            bool bHasFromString = binRead.ReadBool();
            sFrom = bHasFromString ? binRead.ReadString() : null;

            sTo = binRead.ReadString();
            sSignature = binRead.ReadString();

            // Settings come from the stream only for a top object.
            this.lsett = bIsTopObject ? new LemmatizerSettings(binRead) : lsett;
        }
示例#17
0
 /// <summary>
 /// Creates a rule by delegating to <c>Deserialize</c> with the given
 /// reader and settings.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett)
 {
     Deserialize(binRead, lsett);
 }
示例#18
0
        // Constructor(s) & Destructor(s) ------------

        /// <summary>
        /// Creates a rule list bound to the given settings and registers, via
        /// <c>AddRule</c>, a rule built from an empty word and an empty lemma with
        /// id 0; the value returned by <c>AddRule</c> becomes the default rule.
        /// </summary>
        /// <param name="lsett">Settings stored on the list and passed to the default rule.</param>
        public RuleList(LemmatizerSettings lsett) {
            this.lsett = lsett;

            LemmaRule lrEmptyRule = new LemmaRule("", "", 0, lsett);
            lrDefaultRule = AddRule(lrEmptyRule);
        }
示例#19
0
 /// <summary>
 /// Creates a node over the whole example list by chaining to the range
 /// constructor with start index 0, end index <c>elExamples.Count - 1</c>
 /// and a null parent.
 /// </summary>
 /// <param name="lsett">Settings forwarded to the chained constructor.</param>
 /// <param name="elExamples">Example list forwarded to the chained constructor.</param>
 public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
     : this(lsett, elExamples, 0, elExamples.Count - 1, null)
 {
 }
示例#20
0
 /// <summary>
 /// Creates a rule list by delegating to <c>Deserialize</c> with the given
 /// reader and settings.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 public RuleList(BinaryReader binRead, LemmatizerSettings lsett) {
     Deserialize(binRead, lsett);
 }
示例#21
0
        /// <summary>
        /// Restores this node, and recursively its sub-nodes, from a Latino
        /// binary serializer. Values are consumed in a fixed order: sub-node
        /// presence flag (followed by count and key/sub-node pairs when set),
        /// similarity, condition, whole-word flag, best-rule signature, weighted
        /// best rules, node weight, start index and end index.
        /// </summary>
        /// <param name="binRead">Serializer to consume the node from.</param>
        /// <param name="lsett">Settings stored on this node and passed to every sub-node.</param>
        /// <param name="elExamples">Example list stored on this node; its rule list resolves rule signatures.</param>
        /// <param name="ltnParentNode">Node stored as this node's parent.</param>
        public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
        {
            this.lsett = lsett;

            bool bHasSubNodes = binRead.ReadBool();
            if (!bHasSubNodes)
            {
                dictSubNodes = null;
            }
            else
            {
                dictSubNodes = new Dictionary<char, LemmaTreeNode>();
                int iSubNodeCount = binRead.ReadInt();
                for (int iSub = 0; iSub < iSubNodeCount; iSub++)
                {
                    // Key character first, then the sub-node, read recursively.
                    char cKey = binRead.ReadChar();
                    dictSubNodes.Add(cKey, new LemmaTreeNode(binRead, this.lsett, elExamples, this));
                }
            }

            this.ltnParentNode = ltnParentNode;

            iSimilarity = binRead.ReadInt();
            sCondition = binRead.ReadString();
            bWholeWord = binRead.ReadBool();

            // Rules are stored as signatures and resolved through the rule list.
            lrBestRule = elExamples.Rules[binRead.ReadString()];

            int iBestRuleCount = binRead.ReadInt();
            aBestRules = new RuleWeighted[iBestRuleCount];
            for (int iBest = 0; iBest < iBestRuleCount; iBest++)
            {
                // Signature is read before the weight.
                aBestRules[iBest] = new RuleWeighted(elExamples.Rules[binRead.ReadString()], binRead.ReadDouble());
            }

            dWeight = binRead.ReadDouble();

            iStart = binRead.ReadInt();
            iEnd = binRead.ReadInt();
            this.elExamples = elExamples;
        }
        /// <summary>
        /// Creates an example from a word, its lemma, a weight and an MSD tag,
        /// registers it with the given rule list, and builds its signature. The
        /// signature includes the MSD tag unless the MSD consideration setting is
        /// Ignore, JoinAll, JoinDistinct or JoinSameSubstring.
        /// </summary>
        /// <param name="sWord">Word form stored on the example.</param>
        /// <param name="sLemma">Lemma stored on the example.</param>
        /// <param name="dWeight">Weight stored on the example.</param>
        /// <param name="sMsd">MSD tag stored on the example; null is written as an empty string in the signature.</param>
        /// <param name="rlRules">Rule list whose <c>AddRule</c> supplies this example's rule.</param>
        /// <param name="lsett">Settings; <c>eMsdConsider</c> selects the signature format.</param>
        public LemmaExample(string sWord, string sLemma, double dWeight, string sMsd, RuleList rlRules, LemmatizerSettings lsett)
        {
            this.lsett = lsett;

            this.sWord = sWord;
            this.sLemma = sLemma;
            this.sMsd = sMsd;
            this.dWeight = dWeight;

            // Called only after the fields above are assigned, and before the signature is set.
            this.lrRule = rlRules.AddRule(this);

            var eConsider = lsett.eMsdConsider;
            bool bSignatureWithoutMsd =
                eConsider == LemmatizerSettings.MsdConsideration.Ignore ||
                eConsider == LemmatizerSettings.MsdConsideration.JoinAll ||
                eConsider == LemmatizerSettings.MsdConsideration.JoinDistinct ||
                eConsider == LemmatizerSettings.MsdConsideration.JoinSameSubstring;

            if (bSignatureWithoutMsd)
            {
                sSignature = "[" + sWord + "]==>[" + sLemma + "]";
            }
            else
            {
                // Distinct and any other value: the MSD tag is part of the signature.
                sSignature = "[" + sWord + "]==>[" + sLemma + "](" + (sMsd ?? "") + ")";
            }

            // Cached derived strings start out unset.
            sWordRearCache = null;
            sWordFrontCache = null;
            sLemmaFrontCache = null;
        }
示例#23
0
 /// <summary>
 /// Creates a node by delegating to <c>Load</c> with the given serializer,
 /// settings, example list and parent node.
 /// </summary>
 /// <param name="binRead">Serializer passed unchanged to <c>Load</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Load</c>.</param>
 /// <param name="elExamples">Example list passed unchanged to <c>Load</c>.</param>
 /// <param name="ltnParentNode">Parent node passed unchanged to <c>Load</c>.</param>
 public LemmaTreeNode(Latino.BinarySerializer binRead, LemmatizerSettings lsett, ExampleList elExamples, LemmaTreeNode ltnParentNode)
 {
     Load(binRead, lsett, elExamples, ltnParentNode);
 }
 /// <summary>
 /// Creates an example by delegating to <c>Load</c> with the given
 /// serializer, settings and rule.
 /// </summary>
 /// <param name="binRead">Serializer passed unchanged to <c>Load</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Load</c>.</param>
 /// <param name="lrRule">Rule passed unchanged to <c>Load</c>.</param>
 public LemmaExample(Latino.BinarySerializer binRead, LemmatizerSettings lsett, LemmaRule lrRule) {
     Load(binRead, lsett, lrRule);
 }
示例#25
0
 /// <summary>
 /// Initializes the lemmatizer through the settings-only constructor and
 /// then calls <c>AddMultextFile</c> with the given reader and format.
 /// </summary>
 /// <param name="srIn">Reader passed unchanged to <c>AddMultextFile</c>.</param>
 /// <param name="sFormat">Format string passed unchanged to <c>AddMultextFile</c>.</param>
 /// <param name="lsett">Settings passed to the chained constructor.</param>
 public Lemmatizer(StreamReader srIn, string sFormat, LemmatizerSettings lsett)
     : this(lsett)
 {
     AddMultextFile(srIn, sFormat);
 }
示例#26
0
 /// <summary>
 /// Creates a node over the whole example list by chaining to the range
 /// constructor with start index 0, end index <c>elExamples.Count - 1</c>
 /// and a null parent.
 /// </summary>
 /// <param name="lsett">Settings forwarded to the chained constructor.</param>
 /// <param name="elExamples">Example list forwarded to the chained constructor.</param>
 public LemmaTreeNode(LemmatizerSettings lsett, ExampleList elExamples)
     : this(lsett, elExamples, 0, elExamples.Count - 1, null) {
 }
示例#27
0
        /// <summary>
        /// Serialization constructor. Reads the settings and four parallel
        /// arrays (words, lemmas, weights, MSD tags) from
        /// <paramref name="info"/> and re-adds every entry through
        /// <c>AddExample</c> into freshly created, empty containers.
        /// </summary>
        /// <param name="info">Source of the values stored under "lsett", "aWords", "aLemmas", "aWeights" and "aMsds".</param>
        /// <param name="context">Not used by this constructor.</param>
        public ExampleList(SerializationInfo info, StreamingContext context)
        {
            lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));

            dictExamples = new Dictionary<string, LemmaExample>();
            lstExamples = null;
            rlRules = new RuleList(lsett);

            var aWords = (string[])info.GetValue("aWords", typeof(string[]));
            var aLemmas = (string[])info.GetValue("aLemmas", typeof(string[]));
            var aWeights = (double[])info.GetValue("aWeights", typeof(double[]));
            var aMsds = (string[])info.GetValue("aMsds", typeof(string[]));

            // The word array drives the loop; the other arrays are indexed in step with it.
            int iExampleCount = aWords.Length;
            for (int iIndex = 0; iIndex < iExampleCount; iIndex++)
            {
                AddExample(aWords[iIndex], aLemmas[iIndex], aWeights[iIndex], aMsds[iIndex]);
            }
        }
示例#28
0
 /// <summary>
 /// Serialization constructor. Chains to the parameterless constructor,
 /// reads the settings and the example list from <paramref name="info"/>,
 /// and then calls <c>BuildModel</c>.
 /// </summary>
 /// <param name="info">Source of the values stored under "lsett" and "elExamples".</param>
 /// <param name="context">Not used by this constructor.</param>
 public Lemmatizer(SerializationInfo info, StreamingContext context)
     : this()
 {
     this.lsett = (LemmatizerSettings)info.GetValue("lsett", typeof(LemmatizerSettings));
     this.elExamples = (ExampleList)info.GetValue("elExamples", typeof(ExampleList));

     BuildModel();
 }
示例#29
0
 /// <summary>
 /// Passes the settings to the base constructor and stores the given
 /// language value on this instance.
 /// </summary>
 /// <param name="lang">Language value stored in the <c>lang</c> field.</param>
 /// <param name="lsett">Settings forwarded to the base constructor.</param>
 public LemmatizerPrebuilt(LanguagePrebuilt lang, LemmatizerSettings lsett)
     : base(lsett) {
     this.lang = lang;
 }
示例#30
0
 /// <summary>
 /// Creates a lemmatizer with the given settings, a new example list
 /// built from those settings, and both tree root nodes set to null.
 /// </summary>
 /// <param name="lsett">Settings stored on this instance and passed to the new example list.</param>
 public Lemmatizer(LemmatizerSettings lsett)
 {
     this.lsett = lsett;
     elExamples = new ExampleList(lsett);

     // No tree exists until one is built or loaded.
     ltnRootNode = null;
     ltnRootNodeFront = null;
 }
示例#31
0
        /// <summary>
        /// Restores this example list from a Latino binary serializer. Values
        /// are consumed in a fixed order: top-object flag, the settings when that
        /// flag is set, the rule list, a flag telling whether the example list
        /// should be created, the example count, and then for each example its
        /// rule signature followed by the example itself.
        /// </summary>
        /// <param name="binRead">Serializer to consume the list from.</param>
        /// <param name="lsett">Settings used when the stream does not carry its own.</param>
        public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett)
        {
            // Metadata: set when this list carries its own settings in the stream.
            bool bIsTopObject = binRead.ReadBool();
            this.lsett = bIsTopObject ? new LemmatizerSettings(binRead) : lsett;

            rlRules = new RuleList(binRead, this.lsett);

            bool bCreateLstExamples = binRead.ReadBool();
            if (bCreateLstExamples)
            {
                lstExamples = new List<LemmaExample>();
            }
            else
            {
                lstExamples = null;
            }

            dictExamples = new Dictionary<string, LemmaExample>();

            // Each entry is the signature of the example's rule followed by the example.
            int iExampleCount = binRead.ReadInt();
            for (int iExample = 0; iExample < iExampleCount; iExample++)
            {
                string sRuleSignature = binRead.ReadString();
                LemmaRule lrExampleRule = rlRules[sRuleSignature];
                LemmaExample leLoaded = new LemmaExample(binRead, this.lsett, lrExampleRule);

                dictExamples.Add(leLoaded.Signature, leLoaded);
                if (bCreateLstExamples)
                {
                    lstExamples.Add(leLoaded);
                }
            }
        }
示例#32
0
 /// <summary>
 /// Restores the settings, the example list and the tree root node(s) of
 /// this lemmatizer from a Latino binary serializer. When the settings
 /// request a front lemmatizer, two trees are read: first one over the
 /// list returned by <c>GetFrontRearExampleList(false)</c>, then one over
 /// the list returned by <c>GetFrontRearExampleList(true)</c>.
 /// </summary>
 /// <param name="binRead">Serializer to consume the data from.</param>
 public void Load(Latino.BinarySerializer binRead)
 {
     lsett = new LemmatizerSettings(binRead);
     elExamples = new ExampleList(binRead, lsett);

     if (lsett.bBuildFrontLemmatizer)
     {
         var elRear = elExamples.GetFrontRearExampleList(false);
         ltnRootNode = new LemmaTreeNode(binRead, lsett, elRear, null);

         var elFront = elExamples.GetFrontRearExampleList(true);
         ltnRootNodeFront = new LemmaTreeNode(binRead, lsett, elFront, null);
         return;
     }

     ltnRootNode = new LemmaTreeNode(binRead, lsett, elExamples, null);
 }
示例#33
0
 /// <summary>
 /// Creates a rule by delegating to <c>Deserialize</c> with the given
 /// reader and settings.
 /// </summary>
 /// <param name="binRead">Reader passed unchanged to <c>Deserialize</c>.</param>
 /// <param name="lsett">Settings passed unchanged to <c>Deserialize</c>.</param>
 public LemmaRule(System.IO.BinaryReader binRead, LemmatizerSettings lsett) {
     Deserialize(binRead, lsett);
 }