Example #1
        public static string Search(string query)
        {
            var documentsSimilarity = new Dictionary<int, double>();

            // Tokenize and lemmatize the query; the lemmatizer returns one lemma per line.
            var lemmas = Lemmatizer.Lemmatize(Tokenizer.Tokenize(query))
                         .Split('\n').ToList();

            var dimensions = lemmas.Distinct().ToList();

            var frequencies = new Dictionary<string, double>();

            var text = File.ReadAllLines(IDF_FILE_NAME).ToList();

            var idfs = new Dictionary<string, double>();

            // Parse the precomputed IDF file: each line is "lemma: value" (the value may use a comma as the decimal separator).
            foreach (var idf in text)
            {
                var lemma = idf.Substring(0, idf.LastIndexOf(": "));

                var freqString = idf
                                 .Replace(lemma + ": ", "")
                                 .Replace(",", ".");

                var freq = double.Parse(freqString, CultureInfo.InvariantCulture);

                idfs[lemma] = freq;
            }

            // Weight each distinct query lemma by tf * idf (lemmas missing from the IDF file fall back to idf = 1).
            dimensions.ForEach(dim =>
            {
                var lemmaTf = (double)lemmas.Count(lemma => lemma == dim) / lemmas.Count;

                frequencies[dim] = lemmaTf * (idfs.ContainsKey(dim) ? idfs[dim] : 1);
            });

            text = File.ReadAllLines(TF_IDF_FILE_NAME).ToList();

            var tfIdfs = new Dictionary<(string, int), double>();

            // Parse the per-document tf-idf file: each line is "lemma docId: value".
            foreach (var tfIdf in text)
            {
                var keys = tfIdf.Substring(0, tfIdf.LastIndexOf(": "));

                var freqString = tfIdf
                                 .Replace(keys + ": ", "")
                                 .Replace(",", ".");

                var freq  = double.Parse(freqString, CultureInfo.InvariantCulture);
                var lemma = keys.Substring(0, keys.LastIndexOf(" "));
                var docId = int.Parse(keys.Replace(lemma + " ", ""));

                tfIdfs[(lemma, docId)] = freq;
Example #2
        private static void LemmatizeAllPagesAndWrite()
        {
            var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

            // Lemmatize each downloaded page and write its sorted lemma list to the task 5 folder.
            for (int i = 0; i < PagesCount; i++)
            {
                var text            = File.ReadAllText($"{Task1Path}\\pages\\page_{i + 1}.txt");
                var words           = ParseTextToWordsInLower(text);
                var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);
                lemmatizedWords.Sort();
                File.WriteAllLines($"{Task5Path}\\sortedLemmatizedPages\\page_{i + 1}.txt", lemmatizedWords);
            }
        }
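
ParseTextToWordsInLower and LemmatizeWordsList are helpers that this example does not show. A minimal, hypothetical sketch of what ParseTextToWordsInLower might look like, assuming a "word" is simply a maximal run of letters and the result is lower-cased (it needs System.Text.RegularExpressions and System.Linq):

        // Hypothetical sketch only; the real helper is not included in the example above.
        // Assumes words are maximal runs of letters, returned in lower case.
        private static List<string> ParseTextToWordsInLower(string text)
        {
            return Regex.Matches(text, @"\p{L}+")
                   .Cast<Match>()
                   .Select(m => m.Value.ToLowerInvariant())
                   .ToList();
        }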
Example #3
        private static List<string> BooleanSearch(string query)
        {
            var stopwords    = File.ReadAllLines(StopWordsPath);
            var words        = query.Split(" ", StringSplitOptions.RemoveEmptyEntries).Where(x => !stopwords.Contains(x)).ToList();
            var excludeWords = new HashSet<string>();
            var includeWords = new HashSet<string>();

            // Words prefixed with '-' are treated as exclusions; the remaining words as required terms.
            foreach (var word in words)
            {
                if (word.StartsWith('-'))
                {
                    excludeWords.Add(word.Substring(1));
                }
                else
                {
                    includeWords.Add(word);
                }
            }

            var lemmasWithIndexes  = File.ReadAllLines($"{Task3Path}\\invertedIndex.txt");
            var lemmaToIndexesDict = new Dictionary<string, HashSet<int>>();

            // Each line of invertedIndex.txt is a lemma followed by the ids of the documents that contain it.
            foreach (var lemmaWithIndexes in lemmasWithIndexes)
            {
                var parsedString = lemmaWithIndexes.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                lemmaToIndexesDict.Add(parsedString[0], new HashSet<int>(parsedString.Skip(1).Select(int.Parse)));
            }

            // Lemmatize the include/exclude words and keep only lemmas that actually occur in the index.
            var lemmatizer    = new Lemmatizer(new StemDownloader().GetLocalPath());
            var excludeLemmas = lemmatizer.LemmatizeWordsList(excludeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList();
            var includeLemmas = lemmatizer.LemmatizeWordsList(includeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList();

            if (!includeLemmas.Any() && !excludeLemmas.Any())
            {
                return new List<string>();
            }

            var documentIndexesSet = new HashSet <int>();

            if (includeLemmas.Any())
            {
                // Seed the result set with the documents that contain the first required lemma.
                var firstIncludeLemma = includeLemmas.FirstOrDefault();
                documentIndexesSet.UnionWith(lemmaToIndexesDict[firstIncludeLemma!]);
Example #4
        private static void CreateTfIdf()
        {
            var lemmaToIndexesDict = GetLemmasWithIndexes();

            var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());
            var texts      = new List<List<string>>(100);

            // Re-lemmatize every page so per-document term frequencies can be computed.
            for (int i = 0; i < PagesCount; i++)
            {
                var fileText        = File.ReadAllText($"{Task1Path}\\pages\\page_{i + 1}.txt");
                var words           = ParseTextToWordsInLower(fileText);
                var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);
                texts.Add(lemmatizedWords);
            }

            // For every lemma: idf = log10(PagesCount / document frequency), then append
            // "lemma tf idf tf-idf" to the tf-idf file of each page that contains it.
            foreach (var (lemma, indexes) in lemmaToIndexesDict)
            {
                var idf = Math.Log10(Convert.ToDouble(PagesCount) / indexes.Count);
                foreach (var index in indexes)
                {
                    using (var writer = File.AppendText($"{Task4Path}\\page_{index + 1}_tfidf.txt"))
                    {
                        var text             = texts[index];
                        var wordEntriesCount = 0;
                        foreach (var word in text)
                        {
                            if (word == lemma)
                            {
                                wordEntriesCount++;
                            }
                        }
                        var tf = Convert.ToDouble(wordEntriesCount) / text.Count;
                        writer.WriteLine($"{lemma} tf={tf:0.#######} idf={idf:0.#######} tf-idf={tf * idf:0.#######}");
                    }
                }
            }
        }
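
GetLemmasWithIndexes is not shown in this example. Judging from the inverted-index parsing in Example #3, it presumably reads {Task3Path}\invertedIndex.txt into a lemma-to-document-ids dictionary; a sketch under that assumption:

        // Hypothetical sketch, assuming the same invertedIndex.txt format as in Example #3:
        // each line is a lemma followed by the ids of the documents that contain it.
        private static Dictionary<string, HashSet<int>> GetLemmasWithIndexes()
        {
            var result = new Dictionary<string, HashSet<int>>();
            foreach (var line in File.ReadAllLines($"{Task3Path}\\invertedIndex.txt"))
            {
                var parts = line.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                result[parts[0]] = new HashSet<int>(parts.Skip(1).Select(int.Parse));
            }
            return result;
        }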
Example #5
        private static IEnumerable<string> VectorSearch(string query)
        {
            var queryWordToFrequencyDict = new Dictionary<string, int>();
            var queryWords    = query.Split(' ', StringSplitOptions.RemoveEmptyEntries).Select(x => x.ToLowerInvariant()).ToList();
            var queryWordsSet = queryWords.ToImmutableSortedSet();

            foreach (var queryWord in queryWordsSet)
            {
                queryWordToFrequencyDict.Add(queryWord, queryWords.Count(x => string.Equals(queryWord, x, StringComparison.InvariantCultureIgnoreCase)));
            }

            var    queryVector        = new Dictionary<string, double>();
            var    lemmaToIndexesDict = GetLemmasWithIndexes();
            var    lemmatizer         = new Lemmatizer(new StemDownloader().GetLocalPath());
            var    maxFrequency       = queryWordToFrequencyDict.Max(x => x.Value);
            double queryLength        = 0;

            // Build the query vector: weight = (word frequency / max frequency) * log10(PagesCount / document frequency).
            foreach (var (queryWord, frequency) in queryWordToFrequencyDict)
            {
                var lemmatizedQueryWord = GetLemma(lemmatizer.Lemmatize(queryWord));
                if (lemmatizedQueryWord != null)
                {
                    if (lemmaToIndexesDict.ContainsKey(lemmatizedQueryWord))
                    {
                        var vectorElement = ((double)frequency / maxFrequency) * Math.Log10((double)PagesCount / lemmaToIndexesDict[lemmatizedQueryWord].Count);
                        queryVector.Add(lemmatizedQueryWord, vectorElement);
                        queryLength += vectorElement * vectorElement;
                    }
                    else
                    {
                        Console.WriteLine($"Слово {queryWord} не входит в индекс");
                    }
                }
            }

            queryLength = Math.Sqrt(queryLength);

            var documentsLength           = new List<double>();
            var pageLemmasToTfIdfDictList = new List<Dictionary<string, double>>();

            // Read each page's tf-idf file, collecting per-lemma weights and the document vector length.
            for (int i = 0; i < PagesCount; i++)
            {
                var text = File.ReadAllLines($"{Task4Path}\\page_{i + 1}_tfidf.txt");
                var squaredDocumentLength = 0.0;
                pageLemmasToTfIdfDictList.Add(new Dictionary<string, double>());
                foreach (var lemmaWithTfIdfString in text)
                {
                    var wordWithTfIdf = lemmaWithTfIdfString.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                    var word          = wordWithTfIdf[0];
                    var tfIdfString   = wordWithTfIdf[3];                  // "tf-idf=<value>"
                    var tfIdf         = double.Parse(tfIdfString.Substring(7));
                    squaredDocumentLength += tfIdf * tfIdf;
                    pageLemmasToTfIdfDictList[i].Add(word, tfIdf);
                }
                documentsLength.Add(Math.Sqrt(squaredDocumentLength));
            }

            // A zero-length query vector means no usable query lemmas; fall back to returning all links unranked.
            if (queryLength == 0)
            {
                return File.ReadAllLines($"{Task1Path}\\index.txt");
            }

            var documentCosineSimilarities = new List <(int pageIndex, double cosineSimilarity)>();

            // Cosine similarity: dot(query, document) / (|query| * |document|).
            for (int i = 0; i < PagesCount; i++)
            {
                double cosineSimilarity = 0;
                foreach (var (lemmatizedQueryWord, vectorElementValue) in queryVector)
                {
                    if (pageLemmasToTfIdfDictList[i].ContainsKey(lemmatizedQueryWord))
                    {
                        cosineSimilarity += vectorElementValue * pageLemmasToTfIdfDictList[i][lemmatizedQueryWord];
                    }
                }
                documentCosineSimilarities.Add((i, cosineSimilarity / (queryLength * documentsLength[i])));
            }

            documentCosineSimilarities = documentCosineSimilarities.OrderByDescending(x => x.cosineSimilarity).ToList();
            var resultLinks = new List <string>();
            var links       = File.ReadAllLines($"{Task1Path}\\index.txt");

            foreach (var documentCosineSimilarity in documentCosineSimilarities)
            {
                resultLinks.Add(links[documentCosineSimilarity.pageIndex]);
            }

            return resultLinks;
        }
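
A minimal usage sketch (not part of the original code), assuming a console entry point that feeds queries to VectorSearch and prints the ranked links:

        // Hypothetical entry point: reads queries from the console and prints the links
        // in the order ranked by VectorSearch. Assumes the index and tf-idf files already exist.
        public static void Main()
        {
            Console.Write("query> ");
            string query;
            while (!string.IsNullOrWhiteSpace(query = Console.ReadLine()))
            {
                foreach (var link in VectorSearch(query))
                {
                    Console.WriteLine(link);
                }
                Console.Write("query> ");
            }
        }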