// Boolean search over the inverted index: splits the query into words, drops
// stop-words (read from StopWordsPath), routes words prefixed with '-' into an
// exclude set and everything else into an include set, loads the inverted
// index ("lemma idx idx ..." per line) from Task3Path\invertedIndex.txt,
// lemmatizes and lower-cases both word sets, and keeps only lemmas present in
// the index. Returns an empty list when no usable lemma remains; otherwise
// seeds the result set with the postings of the first include lemma.
// NOTE(review): this method appears truncated in this view — the intersection
// with the remaining include lemmas, the exclusion step, and the final return
// are not visible here; confirm against the full file before editing.
// NOTE(review): the double semicolon after UnionWith(...) and the
// null-forgiving `firstIncludeLemma !` (FirstOrDefault cannot return null
// here because includeLemmas.Any() was just checked) look like candidates for
// cleanup once the full method is in view.
private static List <string> BooleanSearch(string query) { var stopwords = File.ReadAllLines(StopWordsPath); var words = query.Split(" ", StringSplitOptions.RemoveEmptyEntries).Where(x => x != "" && !stopwords.Contains(x)).ToList(); var excludeWords = new HashSet <string>(); var includeWords = new HashSet <string>(); foreach (var word in words) { if (word.StartsWith('-')) { excludeWords.Add(word.Substring(1)); } else { includeWords.Add(word); } } var lemmasWithIndexes = File.ReadAllLines($"{Task3Path}\\invertedIndex.txt"); var lemmaToIndexesDict = new Dictionary <string, HashSet <int> >(); foreach (var lemmaWithIndexes in lemmasWithIndexes) { var parsedString = lemmaWithIndexes.Split(' ', StringSplitOptions.RemoveEmptyEntries); lemmaToIndexesDict.Add(parsedString[0], new HashSet <int>(parsedString.Skip(1).Select(int.Parse))); } var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath()); var excludeLemmas = lemmatizer.LemmatizeWordsList(excludeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList(); var includeLemmas = lemmatizer.LemmatizeWordsList(includeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList(); if (!includeLemmas.Any() && !excludeLemmas.Any()) { return(new List <string>()); } var documentIndexesSet = new HashSet <int>(); if (includeLemmas.Any()) { var firstIncludeLemma = includeLemmas.FirstOrDefault(); documentIndexesSet.UnionWith(lemmaToIndexesDict[firstIncludeLemma !]);;
/// <summary>
/// For every downloaded page under Task1Path, lemmatizes its words and writes
/// the lemmas — sorted alphabetically — to a matching file under
/// Task5Path\sortedLemmatizedPages. One input file in, one output file out,
/// for PagesCount pages; pages are numbered starting from 1 on disk.
/// </summary>
private static void LemmatizeAllPagesAndWrite()
{
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

    for (var pageIndex = 0; pageIndex < PagesCount; pageIndex++)
    {
        var pageText = File.ReadAllText($"{Task1Path}\\pages\\page_{pageIndex + 1}.txt");

        // Tokenize, lower-case and lemmatize, then sort so downstream
        // consumers can rely on alphabetical order.
        var lemmas = lemmatizer.LemmatizeWordsList(ParseTextToWordsInLower(pageText));
        lemmas.Sort();

        File.WriteAllLines($"{Task5Path}\\sortedLemmatizedPages\\page_{pageIndex + 1}.txt", lemmas);
    }
}
/// <summary>
/// Computes tf-idf for every lemma in the inverted index and appends one
/// "lemma tf=... idf=... tf-idf=..." line per (lemma, page) pair to that
/// page's file Task4Path\page_{n}_tfidf.txt.
/// tf = occurrences of the lemma in the page / total lemmas in the page;
/// idf = log10(PagesCount / number of pages containing the lemma).
/// Fixes vs. the original: the texts list is presized with PagesCount instead
/// of a hard-coded 100; each output file is opened once instead of once per
/// (lemma, page) pair; the manual occurrence-counting loop is replaced with
/// LINQ Count. Output line order per file is unchanged (inverted-index
/// iteration order), and append semantics are preserved.
/// </summary>
private static void CreateTfIdf()
{
    var lemmaToIndexesDict = GetLemmasWithIndexes();
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

    // Lemmatized text of each page, indexed by zero-based page index.
    var texts = new List<List<string>>(PagesCount);
    for (var i = 0; i < PagesCount; i++)
    {
        var fileText = File.ReadAllText($"{Task1Path}\\pages\\page_{i + 1}.txt");
        texts.Add(lemmatizer.LemmatizeWordsList(ParseTextToWordsInLower(fileText)));
    }

    // Buffer output lines per page so each tf-idf file is opened exactly once.
    // Iterating lemmas in dictionary order keeps each file's line order
    // identical to the original implementation.
    var linesPerPage = new Dictionary<int, List<string>>();
    foreach (var (lemma, indexes) in lemmaToIndexesDict)
    {
        // indexes.Count is the document frequency of this lemma.
        var idf = Math.Log10((double)PagesCount / indexes.Count);
        foreach (var index in indexes)
        {
            var text = texts[index];
            var tf = (double)text.Count(word => word == lemma) / text.Count;

            if (!linesPerPage.TryGetValue(index, out var lines))
            {
                lines = new List<string>();
                linesPerPage[index] = lines;
            }

            // NOTE(review): formatting uses the current culture, as the
            // original did; consider CultureInfo.InvariantCulture if these
            // files are machine-read.
            lines.Add($"{lemma} tf={tf:0.#######} idf={idf:0.#######} tf-idf={tf * idf:0.#######}");
        }
    }

    foreach (var (index, lines) in linesPerPage)
    {
        // AppendAllLines matches the original File.AppendText behavior
        // (creates the file if missing, appends otherwise).
        File.AppendAllLines($"{Task4Path}\\page_{index + 1}_tfidf.txt", lines);
    }
}