public static string Search(string query) { var documentsSimilarity = new Dictionary <int, double>(); var lemmas = Lemmatizer.Lemmatize(Tokenizer.Tokenize(query)) .Split('\n').ToList(); var dimensions = lemmas.Distinct().ToList(); var frequencies = new Dictionary <string, double>(); var text = File.ReadAllLines(IDF_FILE_NAME).ToList(); var idfs = new Dictionary <string, double>(); foreach (var idf in text) { var lemma = idf.Substring(0, idf.LastIndexOf(": ")); var freqString = idf .Replace(lemma + ": ", "") .Replace(",", "."); var freq = double.Parse(freqString, CultureInfo.InvariantCulture); idfs[lemma] = freq; } dimensions.ForEach(dim => { var lemmaTf = (double)lemmas.Count(lemma => lemma == dim) / lemmas.Count; frequencies[dim] = lemmaTf * (idfs.ContainsKey(dim) ? idfs[dim] : 1); }); text = File.ReadAllLines(TF_IDF_FILE_NAME).ToList(); var tfIdfs = new Dictionary <(string, int), double>(); foreach (var tfIdf in text) { var keys = tfIdf.Substring(0, tfIdf.LastIndexOf(": ")); var freqString = tfIdf .Replace(keys + ": ", "") .Replace(",", "."); var freq = double.Parse(freqString, CultureInfo.InvariantCulture); var lemma = keys.Substring(0, keys.LastIndexOf(" ")); var docId = int.Parse(keys.Replace(lemma + " ", "")); tfIdfs[(lemma, docId)] = freq;
/// <summary>
/// Ranks the indexed pages against a free-text query using the cosine similarity
/// between the query's TF-IDF vector and each page's stored TF-IDF vector.
/// </summary>
/// <param name="query">
/// Space-separated query words. Words are lower-cased before lemmatization;
/// <c>null</c> is treated as an empty query.
/// </param>
/// <returns>
/// The lines of <c>index.txt</c> (one per page, presumably the page links) ordered by
/// descending similarity. When the query has no usable term (it is empty, or every
/// term is missing from the index or has zero weight) the lines are returned in
/// their original file order.
/// </returns>
private static IEnumerable<string> VectorSearch(string query)
{
    var indexPath = Path.Combine(Task1Path, "index.txt");

    var queryWords = (query ?? string.Empty)
        .Split(' ', StringSplitOptions.RemoveEmptyEntries)
        .Select(x => x.ToLowerInvariant())
        .ToList();

    // Nothing to rank by: Max() over an empty sequence would throw, so fall back
    // to the unranked index just like the zero-length-query case below.
    if (queryWords.Count == 0)
    {
        return File.ReadAllLines(indexPath);
    }

    var queryWordToFrequencyDict = new Dictionary<string, int>();
    foreach (var queryWord in queryWords.ToImmutableSortedSet())
    {
        queryWordToFrequencyDict.Add(
            queryWord,
            queryWords.Count(x => string.Equals(queryWord, x, StringComparison.InvariantCultureIgnoreCase)));
    }

    var lemmaToIndexesDict = GetLemmasWithIndexes();
    var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

    // Several surface forms may share one lemma, so frequencies are merged per lemma
    // instead of adding the same key to the query vector twice.
    var lemmaFrequencies = new Dictionary<string, int>();
    var maxFrequency = 0;
    foreach (var (queryWord, frequency) in queryWordToFrequencyDict)
    {
        // Words without a usable lemma still take part in the normalisation maximum.
        maxFrequency = Math.Max(maxFrequency, frequency);

        var lemmatizedQueryWord = GetLemma(lemmatizer.Lemmatize(queryWord));
        if (lemmatizedQueryWord == null)
        {
            continue;
        }

        if (!lemmaToIndexesDict.ContainsKey(lemmatizedQueryWord))
        {
            Console.WriteLine($"Слово {queryWord} не входит в индекс");
            continue;
        }

        lemmaFrequencies.TryGetValue(lemmatizedQueryWord, out var alreadyCounted);
        var mergedFrequency = alreadyCounted + frequency;
        lemmaFrequencies[lemmatizedQueryWord] = mergedFrequency;
        maxFrequency = Math.Max(maxFrequency, mergedFrequency);
    }

    // Query vector: (frequency / max frequency) * log10(pages / pages containing the lemma).
    var queryVector = new Dictionary<string, double>();
    double queryLength = 0;
    foreach (var (lemma, frequency) in lemmaFrequencies)
    {
        var vectorElement = ((double)frequency / maxFrequency)
            * Math.Log10((double)PagesCount / lemmaToIndexesDict[lemma].Count);
        queryVector.Add(lemma, vectorElement);
        queryLength += vectorElement * vectorElement;
    }

    queryLength = Math.Sqrt(queryLength);

    // A zero-length query vector cannot be compared with anything; decide this before
    // reading any per-page file.
    if (queryLength == 0)
    {
        return File.ReadAllLines(indexPath);
    }

    var documentCosineSimilarities = new List<(int pageIndex, double cosineSimilarity)>();
    for (int i = 0; i < PagesCount; i++)
    {
        var pageLemmaToTfIdf = new Dictionary<string, double>();
        var squaredDocumentLength = 0.0;

        foreach (var lemmaWithTfIdfString in File.ReadAllLines(Path.Combine(Task4Path, $"page_{i + 1}_tfidf.txt")))
        {
            var wordWithTfIdf = lemmaWithTfIdfString.Split(' ', StringSplitOptions.RemoveEmptyEntries);
            if (wordWithTfIdf.Length == 0)
            {
                // Blank line: carries no term.
                continue;
            }

            var word = wordWithTfIdf[0];

            // The fourth field holds the TF-IDF value after a fixed 7-character prefix
            // (presumably a label such as "tf-idf=" - confirm against the code that writes these files).
            // The decimal separator is normalised so parsing does not depend on the current culture.
            var tfIdfString = wordWithTfIdf[3].Substring(7).Replace(",", ".");
            var tfIdf = double.Parse(tfIdfString, System.Globalization.CultureInfo.InvariantCulture);

            squaredDocumentLength += tfIdf * tfIdf;
            pageLemmaToTfIdf.Add(word, tfIdf);
        }

        var documentLength = Math.Sqrt(squaredDocumentLength);

        double dotProduct = 0;
        foreach (var (lemmatizedQueryWord, vectorElementValue) in queryVector)
        {
            if (pageLemmaToTfIdf.TryGetValue(lemmatizedQueryWord, out var pageTfIdf))
            {
                dotProduct += vectorElementValue * pageTfIdf;
            }
        }

        // A page with a zero-length vector would yield 0/0 = NaN; score it as 0 instead.
        var cosineSimilarity = documentLength == 0
            ? 0
            : dotProduct / (queryLength * documentLength);
        documentCosineSimilarities.Add((i, cosineSimilarity));
    }

    var links = File.ReadAllLines(indexPath);

    // OrderByDescending is stable, so equally scored pages keep their index order.
    return documentCosineSimilarities
        .OrderByDescending(x => x.cosineSimilarity)
        .Select(x => links[x.pageIndex])
        .ToList();
}