예제 #1
0
 /// <summary>
 /// Builds a stemmer from the questions and answers files, then checks the number
 /// of word infos in its vocabulary and the stem returned for a mixed-case word.
 /// </summary>
 public void TestDictionaryLoading()
 {
     MyStemmer dictionaryStemmer = new MyStemmer(Program.QuestionsFileName, Program.AnswersFileName);

     var vocabularyEntries = dictionaryStemmer.GetVocabulary().GetWordInfos();
     Assert.AreEqual(661255, vocabularyEntries.Count);

     // The input deliberately mixes upper- and lower-case letters.
     var actualStem = dictionaryStemmer.Stem("СилЬнЫх");
     Assert.AreEqual("сильный", actualStem);
 }
예제 #2
0
        /// <summary>
        /// Writes the stem of every sample word to standard output.
        /// </summary>
        /// <remarks>
        /// The method always finishes with <c>Assert.Fail()</c> and checks no stem value.
        /// NOTE(review): presumably the forced failure exists so the test runner displays
        /// the printed stems - confirm before relying on this test.
        /// </remarks>
        public void SomeWordsTest()
        {
            var sampleWords = new[] {"проверка", "прибыла","прибытие", "воплощать"};
            var sampleStemmer = new MyStemmer("TestSample.txt", sampleWords);

            for (var position = 0; position < sampleWords.Length; position++)
            {
                var stemmed = sampleStemmer.Stem(sampleWords[position]);
                Console.Out.WriteLine(stemmed);
            }

            Assert.Fail();
        }
예제 #3
0
        /// <summary>
        /// Fills the set of not-medicament words, creates the medicaments stemmer
        /// and loads the inverted index.
        /// </summary>
        /// <param name="medicamentsFileName">File handed to <c>LoadIndexFromFile</c> to obtain the inverted index.</param>
        public Medicaments(string medicamentsFileName)
        {
            var exclusionStemmer = new MyStemmer(NotMedicamentsFileName);
            string[] rawExclusions = File.ReadAllLines(NotMedicamentsFileName);

            // Stemmed forms go in first, then the raw lines, so a word is present in both forms.
            var exclusions = new HashSet<string>(rawExclusions.Select(word => exclusionStemmer.Stem(word)));
            exclusions.UnionWith(rawExclusions);
            NotMedicaments = exclusions;

            // The index is loaded last, after NotMedicaments and stemmer have been assigned.
            stemmer = new MyStemmer(Program.MedicamentsFileName);
            invertedIndex = LoadIndexFromFile(medicamentsFileName);
        }
예제 #4
0
        /// <summary>
        /// Returns a copy of <paramref name="index"/> without the entries that fail any filter.
        /// </summary>
        /// <remarks>
        /// An entry is kept only when its set holds at most 85 values, its key is longer than
        /// two characters, the key is not in <c>NotMedicaments</c>, and the key is not the stem
        /// of a word read from "1grams-3.txt".
        /// </remarks>
        /// <param name="index">Dictionary from key to a set of integers.</param>
        /// <returns>A new dictionary containing the surviving entries.</returns>
        private Dictionary<string, HashSet<int>> Filter(Dictionary<string, HashSet<int>> index)
        {
            var gramsPath = Program.FilesDirectory + "1grams-3.txt";
            var gramStemmer = new MyStemmer(gramsPath);

            // Every line is split on tabs into a number (column 0) and a word (column 1).
            // Reading stops at the first line whose number is 30 or less.
            var commonStems = new HashSet<string>();
            foreach (var line in File.ReadAllLines(gramsPath))
            {
                var columns = line.Split('\t');
                if (int.Parse(columns[0]) <= 30)
                {
                    break;
                }

                commonStems.Add(gramStemmer.Stem(columns[1]));
            }

            return index
                .Where(entry => entry.Value.Count <= 85
                                && entry.Key.Length > 2
                                && !NotMedicaments.Contains(entry.Key)
                                && !commonStems.Contains(entry.Key))
                .ToDictionary(entry => entry.Key, entry => entry.Value);
        }
예제 #5
0
        /// <summary>
        /// Builds one <c>InvertedIndexUnit</c> per trade name whose words are all found
        /// in a fuzzy index built over the answer texts.
        /// </summary>
        /// <remarks>
        /// The fuzzy index is built from the words of the stemmed trade names, so the lookup
        /// also uses the stemmed form of each name. The unit itself still carries the
        /// original, unstemmed name taken from <c>idToTradeName</c>.
        /// </remarks>
        /// <returns>
        /// A list of (trade name id, unit) tuples; names with no matching ids are omitted.
        /// </returns>
        public IEnumerable<Tuple<int, InvertedIndexUnit>> GetMedicamentsFuzzyIndex()
        {
            var medNames = idToTradeName.Select(med => Tuple.Create(med.Key, med.Value.Name)).ToList();
            var stemmer = new MyStemmer("Mkb10TradeNames.stemmed.txt", medNames.Select(it => it.Item2));

            // Each name is rewritten as its stemmed words joined by single spaces.
            var stemmedTnNames =
                medNames.Select(it => Tuple.Create(it.Item1, String.Join(" ", it.Item2.SplitIntoWords().Select(stemmer.Stem)))).ToList();

            var tnWords = stemmedTnNames.Select(it => it.Item2).SelectMany(name => name.SplitIntoWords());
            var answerTexts = medicalQuestions.SelectMany(q => q.GetAnswers()).Select(a => Tuple.Create(a.QuestionId, a.Text));
            var wordsIndex = new FuzzyIndex(answerTexts, tnWords);

            // Query with the stemmed names: the index was given stemmed words, so raw
            // names would miss every word whose stem differs from its surface form.
            return (from it in stemmedTnNames
                    let medId = it.Item1
                    let stemmedName = it.Item2
                    let matchedIds = GetUnitsIntersection(wordsIndex.GetIndex(), stemmedName.SplitIntoWords().ToArray()).ToList()
                    where matchedIds.Any()
                    select Tuple.Create(medId, new InvertedIndexUnit(idToTradeName[medId].Name, matchedIds))).ToList();
        }
예제 #6
0
        /// <summary>
        /// Builds one <c>InvertedIndexUnit</c> per disease for which at least one of its
        /// names (synonyms plus the primary name) is found in a fuzzy index built over the
        /// whole text of the medical questions.
        /// </summary>
        /// <remarks>
        /// The synonym collections stored in <c>idToDesease</c> are not modified; the primary
        /// name is appended to a private copy.
        /// </remarks>
        /// <returns>
        /// A list of (disease id, unit) tuples; diseases with no matching ids are omitted.
        /// </returns>
        public IEnumerable<Tuple<int, InvertedIndexUnit>> GetDeseasesFuzzyIndex()
        {
            // Copy the synonyms before appending the primary name. Adding to the stored
            // collection would change the model on every call.
            var deseaseNames = idToDesease
                .Select(des => Tuple.Create(des.Key, des.Value.Synonyms.Concat(new[] { des.Value.Name }).ToList()))
                .ToList();

            var stemmer = new MyStemmer("Mkb10Deseases.stemmed.txt", deseaseNames.SelectMany(it => it.Item2));

            // The stemmed names are read twice below (word list and lookup loop), so they
            // are materialized here instead of being re-stemmed on each enumeration.
            var stemmedDeseaseNames = deseaseNames
                .Select(it => Tuple.Create(
                    it.Item1,
                    it.Item2.Select(name => String.Join(" ", name.SplitIntoWords().Select(stemmer.Stem))).ToList()))
                .ToList();

            var desWords = stemmedDeseaseNames.SelectMany(it => it.Item2).SelectMany(name => name.SplitIntoWords());
            var questionTexts = medicalQuestions.Select(q => Tuple.Create(q.Id, q.WholeText));
            var wordsIndex = new FuzzyIndex(questionTexts, desWords);

            var deseaseIndex = new List<Tuple<int, InvertedIndexUnit>>();

            foreach (var it in stemmedDeseaseNames)
            {
                var desId = it.Item1;
                var names = it.Item2;

                // Ids found for each name are appended one after another, without de-duplication.
                var deseaseIds = new List<long>();
                foreach (var name in names)
                {
                    deseaseIds.AddRange(GetUnitsIntersection(wordsIndex.GetIndex(), name.SplitIntoWords().ToArray()));
                }

                if (deseaseIds.Any())
                {
                    deseaseIndex.Add(Tuple.Create(desId, new InvertedIndexUnit(idToDesease[desId].Name, deseaseIds)));
                }
            }

            return deseaseIndex;
        }