public static string Search(string query)
{
    var documentsSimilarity = new Dictionary<int, double>();
    var lemmas = Lemmatizer.Lemmatize(Tokenizer.Tokenize(query))
        .Split('\n')
        .ToList();
    var dimensions = lemmas.Distinct().ToList();
    var frequencies = new Dictionary<string, double>();

    // Read precomputed IDF values: one "lemma: value" entry per line,
    // normalizing decimal commas before parsing with the invariant culture.
    var text = File.ReadAllLines(IDF_FILE_NAME).ToList();
    var idfs = new Dictionary<string, double>();
    foreach (var idf in text)
    {
        var lemma = idf.Substring(0, idf.LastIndexOf(": "));
        var freqString = idf
            .Replace(lemma + ": ", "")
            .Replace(",", ".");
        var freq = double.Parse(freqString, CultureInfo.InvariantCulture);
        idfs[lemma] = freq;
    }

    // Build the query's TF-IDF vector; lemmas missing from the IDF file fall back to an IDF of 1.
    dimensions.ForEach(dim =>
    {
        var lemmaTf = (double)lemmas.Count(lemma => lemma == dim) / lemmas.Count;
        frequencies[dim] = lemmaTf * (idfs.ContainsKey(dim) ? idfs[dim] : 1);
    });

    // Read precomputed per-document TF-IDF values: one "lemma docId: value" entry per line.
    text = File.ReadAllLines(TF_IDF_FILE_NAME).ToList();
    var tfIdfs = new Dictionary<(string, int), double>();
    foreach (var tfIdf in text)
    {
        var keys = tfIdf.Substring(0, tfIdf.LastIndexOf(": "));
        var freqString = tfIdf
            .Replace(keys + ": ", "")
            .Replace(",", ".");
        var freq = double.Parse(freqString, CultureInfo.InvariantCulture);
        var lemma = keys.Substring(0, keys.LastIndexOf(" "));
        var docId = int.Parse(keys.Replace(lemma + " ", ""));
        tfIdfs[(lemma, docId)] = freq;
    }

    // The original listing breaks off here. A plausible completion (an assumption,
    // consistent with the documentsSimilarity dictionary declared above): score each
    // document by the dot product of the query vector and its TF-IDF weights, then
    // return the ranking as "docId: score" lines in descending order.
    foreach (var ((lemma, docId), weight) in tfIdfs)
    {
        if (!frequencies.ContainsKey(lemma))
        {
            continue;
        }
        documentsSimilarity.TryGetValue(docId, out var sum);
        documentsSimilarity[docId] = sum + frequencies[lemma] * weight;
    }
    return string.Join("\n", documentsSimilarity
        .OrderByDescending(pair => pair.Value)
        .Select(pair => $"{pair.Key}: {pair.Value}"));
}
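// Hedged illustration of the file formats Search() expects, inferred from the parsing
// code above (the file names and values here are made up): IDF_FILE_NAME holds
// "lemma: value" lines and TF_IDF_FILE_NAME holds "lemma docId: value" lines, where
// values may use decimal commas, which Search() normalizes to dots.
private static void WriteSampleSearchFiles()
{
    // One IDF entry per lemma.
    File.WriteAllLines("idf_sample.txt", new[] { "cat: 0,4771", "dog: 0,3010" });
    // One TF-IDF entry per (lemma, document) pair.
    File.WriteAllLines("tfidf_sample.txt", new[] { "cat 1: 0,0215", "dog 2: 0,0113" });
}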
private static void LemmatizeAllPagesAndWrite()
{
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());
    for (int i = 0; i < PagesCount; i++)
    {
        var text = File.ReadAllText($"{Task1Path}\\pages\\page_{i + 1}.txt");
        var words = ParseTextToWordsInLower(text);
        var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);
        lemmatizedWords.Sort();
        File.WriteAllLines($"{Task5Path}\\sortedLemmatizedPages\\page_{i + 1}.txt", lemmatizedWords);
    }
}
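// ParseTextToWordsInLower is referenced throughout this listing but never shown.
// A minimal sketch of what it likely does (an assumption, requiring
// System.Text.RegularExpressions): split the page text on non-letter characters
// and lower-case every token before lemmatization.
private static List<string> ParseTextToWordsInLower(string text)
{
    return Regex.Split(text, @"[^\p{L}]+")           // split on anything that is not a letter
        .Where(token => token.Length > 0)            // drop empty fragments between separators
        .Select(token => token.ToLowerInvariant())   // normalize case before lemmatization
        .ToList();
}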
private static List<string> BooleanSearch(string query)
{
    var stopwords = File.ReadAllLines(StopWordsPath);
    var words = query
        .Split(" ", StringSplitOptions.RemoveEmptyEntries)
        .Where(x => !stopwords.Contains(x))
        .ToList();

    // Terms prefixed with '-' must be absent from matching documents.
    var excludeWords = new HashSet<string>();
    var includeWords = new HashSet<string>();
    foreach (var word in words)
    {
        if (word.StartsWith('-'))
        {
            excludeWords.Add(word.Substring(1));
        }
        else
        {
            includeWords.Add(word);
        }
    }

    // Load the inverted index: each line is "lemma docId1 docId2 ...".
    var lemmasWithIndexes = File.ReadAllLines($"{Task3Path}\\invertedIndex.txt");
    var lemmaToIndexesDict = new Dictionary<string, HashSet<int>>();
    foreach (var lemmaWithIndexes in lemmasWithIndexes)
    {
        var parsedString = lemmaWithIndexes.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        lemmaToIndexesDict.Add(parsedString[0], new HashSet<int>(parsedString.Skip(1).Select(int.Parse)));
    }

    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());
    var excludeLemmas = lemmatizer.LemmatizeWordsList(excludeWords)
        .Select(x => x.ToLowerInvariant())
        .Where(x => lemmaToIndexesDict.ContainsKey(x))
        .ToList();
    var includeLemmas = lemmatizer.LemmatizeWordsList(includeWords)
        .Select(x => x.ToLowerInvariant())
        .Where(x => lemmaToIndexesDict.ContainsKey(x))
        .ToList();
    if (!includeLemmas.Any() && !excludeLemmas.Any())
    {
        return new List<string>();
    }

    var documentIndexesSet = new HashSet<int>();
    if (includeLemmas.Any())
    {
        // Seed with the first include lemma's postings. The original listing breaks
        // off here; a plausible completion (an assumption): intersect with the
        // postings of every remaining include lemma.
        var firstIncludeLemma = includeLemmas.First();
        documentIndexesSet.UnionWith(lemmaToIndexesDict[firstIncludeLemma]);
        foreach (var includeLemma in includeLemmas.Skip(1))
        {
            documentIndexesSet.IntersectWith(lemmaToIndexesDict[includeLemma]);
        }
    }
    else
    {
        // No include terms: start from all documents, then apply exclusions.
        documentIndexesSet.UnionWith(Enumerable.Range(0, PagesCount));
    }
    foreach (var excludeLemma in excludeLemmas)
    {
        documentIndexesSet.ExceptWith(lemmaToIndexesDict[excludeLemma]);
    }

    // Map the surviving document indexes back to their links.
    var links = File.ReadAllLines($"{Task1Path}\\index.txt");
    return documentIndexesSet.Select(index => links[index]).ToList();
}
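// Usage sketch for the boolean query syntax handled above: plain terms must all be
// present, and a leading '-' excludes a term. The query string here is hypothetical.
private static void BooleanSearchDemo()
{
    // Pages containing lemmas of "cats" and "food" but not "dogs".
    var links = BooleanSearch("cats food -dogs");
    foreach (var link in links)
    {
        Console.WriteLine(link);
    }
}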
private static void CreateTfIdf()
{
    var lemmaToIndexesDict = GetLemmasWithIndexes();
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

    // Lemmatize every page once up front so term frequencies can be counted below.
    var texts = new List<List<string>>(PagesCount);
    for (int i = 0; i < PagesCount; i++)
    {
        var fileText = File.ReadAllText($"{Task1Path}\\pages\\page_{i + 1}.txt");
        var words = ParseTextToWordsInLower(fileText);
        var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);
        texts.Add(lemmatizedWords);
    }

    foreach (var (lemma, indexes) in lemmaToIndexesDict)
    {
        // idf = log10(N / df), where df is the number of documents containing the lemma.
        var idf = Math.Log10((double)PagesCount / indexes.Count);
        foreach (var index in indexes)
        {
            using (var writer = File.AppendText($"{Task4Path}\\page_{index + 1}_tfidf.txt"))
            {
                var text = texts[index];
                // tf = occurrences of the lemma / document length in lemmas.
                var wordEntriesCount = text.Count(word => word == lemma);
                var tf = (double)wordEntriesCount / text.Count;
                writer.WriteLine($"{lemma} tf={tf:0.#######} idf={idf:0.#######} tf-idf={tf * idf:0.#######}");
            }
        }
    }
}
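// A worked check of the formulas used in CreateTfIdf, with made-up numbers: for a
// lemma appearing in 10 of 100 pages, idf = log10(100 / 10) = 1; if it occurs 3
// times in a 300-lemma page, tf = 3 / 300 = 0.01, so tf-idf = 0.01.
private static void TfIdfExample()
{
    var idf = Math.Log10(100.0 / 10);                     // 1
    var tf = 3.0 / 300;                                   // 0.01
    Console.WriteLine($"tf-idf = {tf * idf:0.#######}");  // 0.01
}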
private static IEnumerable<string> VectorSearch(string query)
{
    // Count raw term frequencies in the query.
    var queryWordToFrequencyDict = new Dictionary<string, int>();
    var queryWords = query
        .Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Select(x => x.ToLowerInvariant())
        .ToList();
    var queryWordsSet = queryWords.ToImmutableSortedSet();
    foreach (var queryWord in queryWordsSet)
    {
        queryWordToFrequencyDict.Add(
            queryWord,
            queryWords.Count(x => string.Equals(queryWord, x, StringComparison.InvariantCultureIgnoreCase)));
    }

    // Build the query vector: augmented term frequency (frequency / max frequency)
    // times the idf derived from the inverted index.
    var queryVector = new Dictionary<string, double>();
    var lemmaToIndexesDict = GetLemmasWithIndexes();
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());
    var maxFrequency = queryWordToFrequencyDict.Max(x => x.Value);
    double queryLength = 0;
    foreach (var (queryWord, frequency) in queryWordToFrequencyDict)
    {
        var lemmatizedQueryWord = GetLemma(lemmatizer.Lemmatize(queryWord));
        if (lemmatizedQueryWord != null)
        {
            if (lemmaToIndexesDict.ContainsKey(lemmatizedQueryWord))
            {
                var vectorElement = ((double)frequency / maxFrequency)
                    * Math.Log10((double)PagesCount / lemmaToIndexesDict[lemmatizedQueryWord].Count);
                queryVector.Add(lemmatizedQueryWord, vectorElement);
                queryLength += vectorElement * vectorElement;
            }
            else
            {
                Console.WriteLine($"The word {queryWord} is not in the index");
            }
        }
    }
    queryLength = Math.Sqrt(queryLength);

    // Load every page's precomputed tf-idf values and accumulate document vector lengths.
    var documentsLength = new List<double>();
    var pageLemmasToTfIdfDictList = new List<Dictionary<string, double>>();
    for (int i = 0; i < PagesCount; i++)
    {
        var text = File.ReadAllLines($"{Task4Path}\\page_{i + 1}_tfidf.txt");
        var squaredDocumentLength = 0.0;
        pageLemmasToTfIdfDictList.Add(new Dictionary<string, double>());
        foreach (var lemmaWithTfIdfString in text)
        {
            // Each line looks like "lemma tf=... idf=... tf-idf=..."; "tf-idf=" is 7 characters.
            var wordWithTfIdf = lemmaWithTfIdfString.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            var word = wordWithTfIdf[0];
            var tfIdfString = wordWithTfIdf[3];
            var tfIdf = double.Parse(tfIdfString.Substring(7));
            squaredDocumentLength += tfIdf * tfIdf;
            pageLemmasToTfIdfDictList[i].Add(word, tfIdf);
        }
        documentsLength.Add(Math.Sqrt(squaredDocumentLength));
    }

    // A zero-length query vector cannot be ranked; fall back to the full link index.
    if (queryLength == 0)
    {
        return File.ReadAllLines($"{Task1Path}\\index.txt");
    }

    // Cosine similarity: dot product over the product of the vector lengths.
    var documentCosineSimilarities = new List<(int pageIndex, double cosineSimilarity)>();
    for (int i = 0; i < PagesCount; i++)
    {
        double cosineSimilarity = 0;
        foreach (var (lemmatizedQueryWord, vectorElementValue) in queryVector)
        {
            if (pageLemmasToTfIdfDictList[i].ContainsKey(lemmatizedQueryWord))
            {
                cosineSimilarity += vectorElementValue * pageLemmasToTfIdfDictList[i][lemmatizedQueryWord];
            }
        }
        documentCosineSimilarities.Add((i, cosineSimilarity / (queryLength * documentsLength[i])));
    }

    // Return links ordered by descending similarity.
    var links = File.ReadAllLines($"{Task1Path}\\index.txt");
    return documentCosineSimilarities
        .OrderByDescending(x => x.cosineSimilarity)
        .Select(x => links[x.pageIndex])
        .ToList();
}
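// The ranking in VectorSearch is plain cosine similarity between sparse vectors.
// A self-contained restatement of that computation (illustration only, assuming
// both vectors are dictionaries keyed by lemma):
private static double CosineSimilarity(Dictionary<string, double> a, Dictionary<string, double> b)
{
    double dot = 0;
    foreach (var (lemma, weight) in a)
    {
        if (b.TryGetValue(lemma, out var other))
        {
            dot += weight * other;  // only lemmas shared by both vectors contribute
        }
    }
    var lengthA = Math.Sqrt(a.Values.Sum(v => v * v));
    var lengthB = Math.Sqrt(b.Values.Sum(v => v * v));
    return dot / (lengthA * lengthB);
}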