/// <summary>
/// Creates a stemmer from <c>Program.QuestionsFileName</c> and <c>Program.AnswersFileName</c>,
/// then checks the number of word infos in its vocabulary and the stem of a mixed-case sample word.
/// </summary>
public void TestDictionaryLoading()
{
    var dictionaryStemmer = new MyStemmer(Program.QuestionsFileName, Program.AnswersFileName);

    // The vocabulary is expected to contain exactly this many word infos.
    var wordInfos = dictionaryStemmer.GetVocabulary().GetWordInfos();
    Assert.AreEqual(661255, wordInfos.Count);

    // Mixed-case input is expected to produce the lower-case stem.
    var stemmedWord = dictionaryStemmer.Stem("СилЬнЫх");
    Assert.AreEqual("сильный", stemmedWord);
}
/// <summary>
/// Writes the stem of a few sample words to standard output.
/// </summary>
/// <remarks>
/// The method ends with an unconditional <c>Assert.Fail()</c>, so it never passes.
/// NOTE(review): presumably this is done to make the test runner show the console output - confirm.
/// </remarks>
public void SomeWordsTest()
{
    string[] sampleWords = { "проверка", "прибыла", "прибытие", "воплощать" };
    var sampleStemmer = new MyStemmer("TestSample.txt", sampleWords);

    for (var i = 0; i < sampleWords.Length; i++)
    {
        var stem = sampleStemmer.Stem(sampleWords[i]);
        Console.Out.WriteLine(stem);
    }

    Assert.Fail();
}
/// <summary>
/// Initializes the set of not-medicament words, the stemmer and the inverted index.
/// </summary>
/// <param name="medicamentsFileName">File name passed to <c>LoadIndexFromFile</c> to load the inverted index.</param>
public Medicaments(string medicamentsFileName)
{
    var exclusionStemmer = new MyStemmer(NotMedicamentsFileName);
    var exclusionLines = File.ReadAllLines(NotMedicamentsFileName);

    // The set contains the stemmed form of every line followed by the raw lines themselves.
    var exclusions = new HashSet<string>(exclusionLines.Select(exclusionStemmer.Stem));
    exclusions.UnionWith(exclusionLines);
    NotMedicaments = exclusions;

    // NOTE(review): the stemmer is created from Program.MedicamentsFileName rather than
    // from the medicamentsFileName argument - confirm this is intended.
    stemmer = new MyStemmer(Program.MedicamentsFileName);

    // Loaded last, after NotMedicaments and stemmer have been assigned.
    invertedIndex = LoadIndexFromFile(medicamentsFileName);
}
/// <summary>
/// Builds a new dictionary from <paramref name="index"/> keeping only the entries whose
/// value has at most 85 ids, whose key is longer than 2 characters, and whose key is
/// neither in <c>NotMedicaments</c> nor among the stemmed common words.
/// </summary>
/// <param name="index">Index mapping a key to a set of ids.</param>
/// <returns>A new dictionary with the entries that passed every check.</returns>
private Dictionary<string, HashSet<int>> Filter(Dictionary<string, HashSet<int>> index)
{
    var commonWordsPath = Program.FilesDirectory + "1grams-3.txt";
    var commonWordsStemmer = new MyStemmer(commonWordsPath);

    // Every line is split on tabs: the first field is parsed as an integer
    // (presumably a frequency - confirm) and the second field is the word.
    // Reading stops at the first line whose number is not greater than 30.
    var commonWords = new HashSet<string>();
    foreach (var line in File.ReadAllLines(commonWordsPath))
    {
        var fields = line.Split(new[] { '\t' });
        if (int.Parse(fields[0]) <= 30)
        {
            break;
        }

        commonWords.Add(commonWordsStemmer.Stem(fields[1]));
    }

    return index
        .Where(entry => entry.Value.Count <= 85
                        && entry.Key.Length > 2
                        && !NotMedicaments.Contains(entry.Key)
                        && !commonWords.Contains(entry.Key))
        .ToDictionary(entry => entry.Key, entry => entry.Value);
}
/// <summary>
/// Builds an index of trade names over the answer texts of the medical questions.
/// </summary>
/// <remarks>
/// The <c>FuzzyIndex</c> is constructed from the stemmed trade-name words, so each
/// trade name is looked up by its stemmed words as well. Previously the unstemmed
/// names were used for the lookup, which did not match the words the index was built from.
/// </remarks>
/// <returns>
/// One tuple per trade name for which <c>GetUnitsIntersection</c> returned at least one id:
/// the trade-name id and an <c>InvertedIndexUnit</c> created from the original
/// (unstemmed) trade name and the ids found.
/// </returns>
public IEnumerable<Tuple<int, InvertedIndexUnit>> GetMedicamentsFuzzyIndex()
{
    var medNames = idToTradeName.Select(med => Tuple.Create(med.Key, med.Value.Name)).ToList();
    var stemmer = new MyStemmer("Mkb10TradeNames.stemmed.txt", medNames.Select(it => it.Item2));

    // Every trade name is replaced by its stemmed words joined with single spaces.
    var stemmedTnNames = medNames
        .Select(it => Tuple.Create(
            it.Item1,
            String.Join(" ", it.Item2.SplitIntoWords().Select(stemmer.Stem))))
        .ToList();
    var tnWords = stemmedTnNames.Select(it => it.Item2).SelectMany(name => name.SplitIntoWords());

    var answerTexts = medicalQuestions
        .SelectMany(q => q.GetAnswers())
        .Select(a => Tuple.Create(a.QuestionId, a.Text));
    var wordsIndex = new FuzzyIndex(answerTexts, tnWords);

    var medicamentsIndex = new List<Tuple<int, InvertedIndexUnit>>();
    foreach (var it in stemmedTnNames)
    {
        var medId = it.Item1;

        // Look up the stemmed words: these are the words the index was built from.
        var stemmedWords = it.Item2.SplitIntoWords().ToArray();
        var medIds = GetUnitsIntersection(wordsIndex.GetIndex(), stemmedWords).ToList();
        if (medIds.Any())
        {
            // The unit keeps the original trade name, not the stemmed one.
            medicamentsIndex.Add(Tuple.Create(medId, new InvertedIndexUnit(idToTradeName[medId].Name, medIds)));
        }
    }

    return medicamentsIndex;
}
/// <summary>
/// Builds an index of diseases over the whole texts of the medical questions.
/// </summary>
/// <remarks>
/// Every disease is represented by its synonyms plus its primary name. The name is
/// appended to a copy of the synonyms, so the objects stored in <c>idToDesease</c>
/// are not modified by this call.
/// </remarks>
/// <returns>
/// One tuple per disease for which <c>GetUnitsIntersection</c> returned at least one id
/// for any of its names: the disease id and an <c>InvertedIndexUnit</c> created from the
/// primary disease name and the ids collected over all of its names.
/// </returns>
public IEnumerable<Tuple<int, InvertedIndexUnit>> GetDeseasesFuzzyIndex()
{
    // Concat + ToList creates a fresh list; the original Synonyms collection stays untouched.
    var deseaseNames = idToDesease
        .Select(des => Tuple.Create(
            des.Key,
            des.Value.Synonyms.Concat(new[] { des.Value.Name }).ToList()))
        .ToList();

    var stemmer = new MyStemmer("Mkb10Deseases.stemmed.txt", deseaseNames.SelectMany(it => it.Item2));

    // The inner ToList materializes the stemmed names once; they are enumerated both
    // when collecting the index words and in the lookup loop below.
    var stemmedDeseaseNames = deseaseNames
        .Select(it => Tuple.Create(
            it.Item1,
            it.Item2
                .Select(name => String.Join(" ", name.SplitIntoWords().Select(stemmer.Stem)))
                .ToList()))
        .ToList();
    var desWords = stemmedDeseaseNames.SelectMany(it => it.Item2).SelectMany(name => name.SplitIntoWords());

    var questionTexts = medicalQuestions.Select(q => Tuple.Create(q.Id, q.WholeText));
    var wordsIndex = new FuzzyIndex(questionTexts, desWords);

    var deseaseIndex = new List<Tuple<int, InvertedIndexUnit>>();
    foreach (var it in stemmedDeseaseNames)
    {
        var desId = it.Item1;

        // Ids found for every stemmed name of the disease are concatenated.
        var deseaseIds = new List<long>();
        foreach (var name in it.Item2)
        {
            deseaseIds.AddRange(GetUnitsIntersection(wordsIndex.GetIndex(), name.SplitIntoWords().ToArray()));
        }

        if (deseaseIds.Any())
        {
            deseaseIndex.Add(Tuple.Create(desId, new InvertedIndexUnit(idToDesease[desId].Name, deseaseIds)));
        }
    }

    return deseaseIndex;
}