Example #1
0
        public void LargeEnglishWordList()
        {
            // Build the training vocabulary from the "LEX" file in the "Morphology" directory.
            // Each line contributes the text that follows its first six characters, and only
            // when that text consists entirely of letters.
            var vocabulary = new List<string>();
            string lexiconPath = Path.Combine("Morphology", "LEX");

            using (var reader = new StreamReader(lexiconPath))
            {
                while (true)
                {
                    string entry = reader.ReadLine();
                    if (entry == null)
                    {
                        // End of file.
                        break;
                    }

                    // NOTE(review): throws for lines shorter than six characters; the fixture
                    // is presumably fixed-width -- confirm against the LEX file.
                    string candidate = entry.Substring(6);
                    if (!candidate.All(ch => Char.IsLetter(ch)))
                    {
                        continue;
                    }

                    vocabulary.Add(candidate);
                }
            }

            var stemmer = new PoorMansStemmer<string, char>(w => w)
            {
                NormalizeScores = true,
                Threshold = 0.03,
                MaxAffixLength = 5
            };
            stemmer.Train(vocabulary);

            // Pairs are checked in a fixed order; each assertion states whether the trained
            // stemmer should report a shared stem.
            Assert.That(stemmer.HaveSameStem("spied", "spy"), Is.True);
            Assert.That(stemmer.HaveSameStem("station", "sting"), Is.False);
            Assert.That(stemmer.HaveSameStem("called", "calls"), Is.True);
            Assert.That(stemmer.HaveSameStem("jungle", "excited"), Is.False);
            Assert.That(stemmer.HaveSameStem("jammed", "jams"), Is.True);
            Assert.That(stemmer.HaveSameStem("fix", "fixes"), Is.True);
            Assert.That(stemmer.HaveSameStem("sting", "stern"), Is.False);
            Assert.That(stemmer.HaveSameStem("jogged", "jogging"), Is.True);
            Assert.That(stemmer.HaveSameStem("unbelieveable", "believe"), Is.True);
        }
Example #2
0
        /// <summary>
        /// Searches the entries whose lexeme form is an <c>IMoStemAllomorph</c> for a form that
        /// the stemmer reports as sharing a stem with <paramref name="wordForm"/>. The form with
        /// the highest stemmer score wins; when scores are equal, the form with the longer
        /// common substring wins.
        /// </summary>
        /// <remarks>
        /// The stemmer is created and trained the first time it is needed (while
        /// <c>m_stemmer</c> is null), using the forms of the same set of entries.
        /// </remarks>
        /// <param name="wordForm">The word form to look up.</param>
        /// <returns>The selected entry, or <c>null</c> when no candidate was selected.</returns>
        private ILexEntry GetMatchingEntryFromStemmer(string wordForm)
        {
            var entryRepo = m_cache.ServiceLocator.GetInstance<ILexEntryRepository>();

            if (m_stemmer == null)
            {
                // The field is assigned before the training forms are gathered, and trained afterwards.
                m_stemmer = new PoorMansStemmer<string, char>(s => s)
                {
                    NormalizeScores = true,
                    WeightScores = false,
                    Threshold = 0.12
                };

                var trainingForms = new HashSet<string>();
                foreach (ILexEntry trainingEntry in entryRepo.AllInstances())
                {
                    if (trainingEntry.LexemeFormOA is IMoStemAllomorph)
                    {
                        trainingForms.UnionWith(GetForms(trainingEntry));
                    }
                }

                m_stemmer.Train(trainingForms);
            }

            ILexEntry winner = null;
            double winningScore = 0;
            int winningOverlap = 0;

            foreach (ILexEntry candidate in entryRepo.AllInstances())
            {
                bool isStemEntry = candidate.LexemeFormOA is IMoStemAllomorph;
                if (!isStemEntry)
                {
                    continue;
                }

                foreach (string candidateForm in GetForms(candidate))
                {
                    double score;
                    if (!m_stemmer.HaveSameStem(wordForm, candidateForm, out score))
                    {
                        continue;
                    }

                    if (score > winningScore)
                    {
                        // Strictly better score: take it unconditionally.
                        winner = candidate;
                        winningScore = score;
                        winningOverlap = LongestCommonSubstringLength(wordForm, candidateForm);
                        continue;
                    }

                    // NOTE(review): double.Epsilon is the smallest positive double, so this is
                    // effectively an exact-equality test rather than a tolerance.
                    if (Math.Abs(score - winningScore) < double.Epsilon)
                    {
                        // Same score: prefer the form that overlaps the word form the most.
                        int overlap = LongestCommonSubstringLength(wordForm, candidateForm);
                        if (overlap > winningOverlap)
                        {
                            winner = candidate;
                            winningScore = score;
                            winningOverlap = overlap;
                        }
                    }
                }
            }

            return winner;
        }
Example #3
0
        public void HaveSameStem()
        {
            // Training corpus: the same thirty verbs in three shapes (-s/-es forms, -ed forms,
            // and bare forms), listed in a fixed order.
            // NOTE(review): "explains" / "explained" / "explain" each occur twice; the
            // duplicates are kept because they are part of the training input.
            string[] trainingWords =
            {
                // -s / -es forms
                "calls", "fixes", "coughs", "begs", "explains",
                "jams", "kisses", "learns", "whips", "visits",
                "rushes", "traces", "attends", "detects", "extends",
                "explains", "forces", "frames", "cycles", "notices",
                "turns", "uses", "excites", "damages", "boils",
                "avoids", "allows", "jokes", "murders", "sucks",

                // -ed forms
                "called", "fixed", "coughed", "begged", "explained",
                "jammed", "kissed", "learned", "whipped", "visited",
                "rushed", "traced", "attended", "detected", "extended",
                "explained", "forced", "framed", "cycled", "noticed",
                "turned", "used", "excited", "damaged", "boiled",
                "avoided", "allowed", "joked", "murdered", "sucked",

                // bare forms
                "call", "fix", "cough", "beg", "explain",
                "jam", "kiss", "learn", "whip", "visit",
                "rush", "trace", "attend", "detect", "extend",
                "explain", "force", "frame", "cycle", "notice",
                "turn", "use", "excite", "damage", "boil",
                "avoid", "allow", "joke", "murder", "suck"
            };

            var stemmer = new PoorMansStemmer<string, char>(s => s)
            {
                NormalizeScores = true,
                Threshold = 0.05
            };
            stemmer.Train(trainingWords);

            // None of the probe words below appear in the training corpus.
            Assert.That(stemmer.HaveSameStem("locked", "locks"), Is.True);
            Assert.That(stemmer.HaveSameStem("flock", "locked"), Is.False);
            Assert.That(stemmer.HaveSameStem("locked", "locker"), Is.False);
            Assert.That(stemmer.HaveSameStem("lock", "locked"), Is.True);
            Assert.That(stemmer.HaveSameStem("misses", "missed"), Is.True);
            Assert.That(stemmer.HaveSameStem("extend", "intend"), Is.False);
        }
Example #4
0
        public void HaveSameStem()
        {
            // The corpus holds thirty verbs three times over: with an -s/-es ending, with an
            // -ed ending, and bare. Order and duplicates ("explains", "explained", "explain"
            // each appear twice) are kept exactly as the training input.
            string[] corpus = new string[]
            {
                // verbs ending in -s / -es
                "calls", "fixes", "coughs", "begs", "explains", "jams",
                "kisses", "learns", "whips", "visits", "rushes", "traces",
                "attends", "detects", "extends", "explains", "forces", "frames",
                "cycles", "notices", "turns", "uses", "excites", "damages",
                "boils", "avoids", "allows", "jokes", "murders", "sucks",

                // verbs ending in -ed
                "called", "fixed", "coughed", "begged", "explained", "jammed",
                "kissed", "learned", "whipped", "visited", "rushed", "traced",
                "attended", "detected", "extended", "explained", "forced", "framed",
                "cycled", "noticed", "turned", "used", "excited", "damaged",
                "boiled", "avoided", "allowed", "joked", "murdered", "sucked",

                // bare verbs
                "call", "fix", "cough", "beg", "explain", "jam",
                "kiss", "learn", "whip", "visit", "rush", "trace",
                "attend", "detect", "extend", "explain", "force", "frame",
                "cycle", "notice", "turn", "use", "excite", "damage",
                "boil", "avoid", "allow", "joke", "murder", "suck"
            };

            var stemmer = new PoorMansStemmer<string, char>(s => s)
            {
                NormalizeScores = true,
                Threshold = 0.05
            };

            stemmer.Train(corpus);

            // Expected verdicts for word pairs, checked in a fixed order.
            Assert.That(stemmer.HaveSameStem("locked", "locks"), Is.True);
            Assert.That(stemmer.HaveSameStem("flock", "locked"), Is.False);
            Assert.That(stemmer.HaveSameStem("locked", "locker"), Is.False);
            Assert.That(stemmer.HaveSameStem("lock", "locked"), Is.True);
            Assert.That(stemmer.HaveSameStem("misses", "missed"), Is.True);
            Assert.That(stemmer.HaveSameStem("extend", "intend"), Is.False);
        }
Example #5
0
        public void LargeEnglishWordList()
        {
            // Training words come from the "LEX" file under "Morphology": for every line, the
            // part after the first six characters is kept if it contains only letters.
            string lexPath = Path.Combine("Morphology", "LEX");
            var trainingWords = new List<string>();

            using (var lexReader = new StreamReader(lexPath))
            {
                for (string row = lexReader.ReadLine(); row != null; row = lexReader.ReadLine())
                {
                    // NOTE(review): assumes every line has at least six characters -- confirm
                    // against the LEX file; Substring throws otherwise.
                    string token = row.Substring(6);
                    bool lettersOnly = token.All(Char.IsLetter);
                    if (lettersOnly)
                    {
                        trainingWords.Add(token);
                    }
                }
            }

            var stemmer = new PoorMansStemmer<string, char>(w => w)
            {
                NormalizeScores = true,
                Threshold = 0.03,
                MaxAffixLength = 5
            };

            stemmer.Train(trainingWords);

            // Word pairs and the verdict expected from the trained stemmer, in a fixed order.
            Assert.That(stemmer.HaveSameStem("spied", "spy"), Is.True);
            Assert.That(stemmer.HaveSameStem("station", "sting"), Is.False);
            Assert.That(stemmer.HaveSameStem("called", "calls"), Is.True);
            Assert.That(stemmer.HaveSameStem("jungle", "excited"), Is.False);
            Assert.That(stemmer.HaveSameStem("jammed", "jams"), Is.True);
            Assert.That(stemmer.HaveSameStem("fix", "fixes"), Is.True);
            Assert.That(stemmer.HaveSameStem("sting", "stern"), Is.False);
            Assert.That(stemmer.HaveSameStem("jogged", "jogging"), Is.True);
            Assert.That(stemmer.HaveSameStem("unbelieveable", "believe"), Is.True);
        }
Example #6
0
        /// <summary>
        /// Handles a property-change notification by discarding cached state: the entry index
        /// (<c>m_entryIndex</c>), the stemmer (<c>m_stemmer</c>) and the parser (<c>m_parser</c>)
        /// are reset to null depending on the class of the changed object and the changed field.
        /// </summary>
        /// <remarks>
        /// The entry index is only reset while <c>UpdatingEntries</c> is false. The parser is
        /// disposed before its field is cleared.
        /// </remarks>
        /// <param name="hvo">Id used to fetch the changed object from the service locator.</param>
        /// <param name="tag">Field id of the changed property.</param>
        /// <param name="ivMin">Not used by this handler.</param>
        /// <param name="cvIns">Not used by this handler.</param>
        /// <param name="cvDel">Not used by this handler.</param>
        void IVwNotifyChange.PropChanged(int hvo, int tag, int ivMin, int cvIns, int cvDel)
        {
            ICmObject changed = m_cache.ServiceLocator.GetObject(hvo);

            switch (changed.ClassID)
            {
                case LexDbTags.kClassId:
                {
                    // The set of lexical entries changed.
                    if (tag == m_cache.ServiceLocator.GetInstance<Virtuals>().LexDbEntries)
                    {
                        if (!UpdatingEntries)
                        {
                            m_entryIndex = null;
                        }

                        m_stemmer = null;
                    }
                    break;
                }

                case MoStemAllomorphTags.kClassId:
                {
                    // A stem allomorph's form changed: the stemmer is always dropped; the index
                    // only when the allomorph is owned as a lexeme form and entries are not
                    // currently being updated.
                    if (tag == MoFormTags.kflidForm)
                    {
                        bool resetIndex = !UpdatingEntries
                            && changed.OwningFlid == LexEntryTags.kflidLexemeForm;
                        if (resetIndex)
                        {
                            m_entryIndex = null;
                        }

                        m_stemmer = null;
                    }
                    break;
                }

                case MoAffixAllomorphTags.kClassId:
                {
                    // An affix allomorph only affects the entry index, never the stemmer.
                    if (!UpdatingEntries)
                    {
                        if (changed.OwningFlid == LexEntryTags.kflidLexemeForm)
                        {
                            if (tag == MoFormTags.kflidForm)
                            {
                                m_entryIndex = null;
                            }
                        }
                    }
                    break;
                }

                case LexEntryTags.kClassId:
                {
                    // The cast happens before the field id is inspected.
                    var lexEntry = (ILexEntry)changed;
                    bool lexemeFormChanged = tag == LexEntryTags.kflidLexemeForm;
                    bool alternateFormsChanged = tag == LexEntryTags.kflidAlternateForms;

                    if (lexemeFormChanged && !UpdatingEntries)
                    {
                        m_entryIndex = null;
                    }

                    // Either field invalidates the stemmer, but only for entries whose lexeme
                    // form is a stem allomorph.
                    if ((lexemeFormChanged || alternateFormsChanged)
                        && (lexEntry.LexemeFormOA is IMoStemAllomorph))
                    {
                        m_stemmer = null;
                    }
                    break;
                }

                case MoMorphDataTags.kClassId:
                {
                    // Parser parameters changed: release the existing parser, if any.
                    if (tag == MoMorphDataTags.kflidParserParameters && m_parser != null)
                    {
                        m_parser.Dispose();
                        m_parser = null;
                    }
                    break;
                }
            }
        }