コード例 #1
0
        private static List <string> BooleanSearch(string query)
        {
            var stopwords    = File.ReadAllLines(StopWordsPath);
            var words        = query.Split(" ", StringSplitOptions.RemoveEmptyEntries).Where(x => x != "" && !stopwords.Contains(x)).ToList();
            var excludeWords = new HashSet <string>();
            var includeWords = new HashSet <string>();

            foreach (var word in words)
            {
                if (word.StartsWith('-'))
                {
                    excludeWords.Add(word.Substring(1));
                }
                else
                {
                    includeWords.Add(word);
                }
            }


            var lemmasWithIndexes  = File.ReadAllLines($"{Task3Path}\\invertedIndex.txt");
            var lemmaToIndexesDict = new Dictionary <string, HashSet <int> >();

            foreach (var lemmaWithIndexes in lemmasWithIndexes)
            {
                var parsedString = lemmaWithIndexes.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                lemmaToIndexesDict.Add(parsedString[0], new HashSet <int>(parsedString.Skip(1).Select(int.Parse)));
            }

            var lemmatizer    = new Lemmatizer(new StemDownloader().GetLocalPath());
            var excludeLemmas = lemmatizer.LemmatizeWordsList(excludeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList();
            var includeLemmas = lemmatizer.LemmatizeWordsList(includeWords).Select(x => x.ToLowerInvariant()).Where(x => lemmaToIndexesDict.ContainsKey(x)).ToList();

            if (!includeLemmas.Any() && !excludeLemmas.Any())
            {
                return(new List <string>());
            }

            var documentIndexesSet = new HashSet <int>();

            if (includeLemmas.Any())
            {
                var firstIncludeLemma = includeLemmas.FirstOrDefault();
                documentIndexesSet.UnionWith(lemmaToIndexesDict[firstIncludeLemma !]);;
コード例 #2
0
        /// <summary>
        /// Reads every page file <c>page_1.txt</c> .. <c>page_{PagesCount}.txt</c> from the
        /// <c>pages</c> directory under <c>Task1Path</c>, lemmatizes its words, sorts the result
        /// and writes it, one entry per line, to a file of the same name in the
        /// <c>sortedLemmatizedPages</c> directory under <c>Task5Path</c>.
        /// </summary>
        /// <remarks>
        /// Paths are assembled with <c>Path.Combine</c> so the method does not depend on the
        /// Windows-only <c>\</c> separator. The output directory is created when it is missing.
        /// An existing output file for a page is overwritten.
        /// </remarks>
        private static void LemmatizeAllPagesAndWrite()
        {
            var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

            var inputDirectory  = Path.Combine(Task1Path, "pages");
            var outputDirectory = Path.Combine(Task5Path, "sortedLemmatizedPages");

            // No-op when the directory already exists; prevents a DirectoryNotFoundException
            // from File.WriteAllLines on a first run.
            Directory.CreateDirectory(outputDirectory);

            for (int i = 0; i < PagesCount; i++)
            {
                // Page files are numbered from 1, the loop counter from 0.
                var pageFileName = $"page_{i + 1}.txt";

                var text            = File.ReadAllText(Path.Combine(inputDirectory, pageFileName));
                var words           = ParseTextToWordsInLower(text);
                var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);

                // In-place sort using the collection's default ordering.
                lemmatizedWords.Sort();

                File.WriteAllLines(Path.Combine(outputDirectory, pageFileName), lemmatizedWords);
            }
        }
コード例 #3
0
        /// <summary>
        /// Computes TF, IDF and TF-IDF for every lemma returned by <c>GetLemmasWithIndexes</c> and
        /// appends one line per (lemma, page index) pair to <c>page_{index + 1}_tfidf.txt</c>
        /// under <c>Task4Path</c>.
        /// </summary>
        /// <remarks>
        /// <para>
        /// TF is the number of occurrences of the lemma in the lemmatized page divided by the total
        /// number of lemmatized words of that page. IDF is <c>log10(PagesCount / n)</c>, where
        /// <c>n</c> is the number of page indexes listed for the lemma. Page indexes are zero-based
        /// (they index the in-memory page list and are shifted by one for the file name).
        /// </para>
        /// <para>
        /// Occurrences are counted once per page up front instead of rescanning the page for every
        /// lemma, and the lines for a page are appended in a single call instead of reopening the
        /// file for every line. The content and order of the lines in each file are the same as
        /// with line-by-line appending.
        /// </para>
        /// <para>
        /// NOTE(review): output is appended, so running the method twice duplicates every line.
        /// Numbers are formatted with the current culture; confirm that downstream readers expect that.
        /// </para>
        /// </remarks>
        private static void CreateTfIdf()
        {
            var lemmaToIndexesDict = GetLemmasWithIndexes();

            var lemmatizer = new Lemmatizer(new StemDownloader().GetLocalPath());

            // Per page (position i == page index i): total lemmatized word count and the
            // number of occurrences of each distinct lemma.
            var pageLengths     = new List<int>();
            var pageLemmaCounts = new List<Dictionary<string, int>>();

            for (int i = 0; i < PagesCount; i++)
            {
                var fileText        = File.ReadAllText(Path.Combine(Task1Path, "pages", $"page_{i + 1}.txt"));
                var words           = ParseTextToWordsInLower(fileText);
                var lemmatizedWords = lemmatizer.LemmatizeWordsList(words);

                var counts = new Dictionary<string, int>();
                foreach (var word in lemmatizedWords)
                {
                    // A null entry can never equal a lemma key; it only contributes to the page length.
                    if (word is null)
                    {
                        continue;
                    }

                    counts.TryGetValue(word, out var seen);
                    counts[word] = seen + 1;
                }

                pageLengths.Add(lemmatizedWords.Count);
                pageLemmaCounts.Add(counts);
            }

            // Output lines buffered per page index, in lemma iteration order.
            var linesByPage = new Dictionary<int, List<string>>();

            foreach (var (lemma, indexes) in lemmaToIndexesDict)
            {
                var idf = Math.Log10(Convert.ToDouble(PagesCount) / indexes.Count);

                foreach (var index in indexes)
                {
                    // Stays 0 when the lemma does not occur in the page.
                    pageLemmaCounts[index].TryGetValue(lemma, out var wordEntriesCount);
                    var tf = Convert.ToDouble(wordEntriesCount) / pageLengths[index];

                    if (!linesByPage.TryGetValue(index, out var lines))
                    {
                        lines = new List<string>();
                        linesByPage.Add(index, lines);
                    }

                    lines.Add($"{lemma} tf={tf:0.#######} idf={idf:0.#######} tf-idf={tf * idf:0.#######}");
                }
            }

            foreach (var (index, lines) in linesByPage)
            {
                File.AppendAllLines(Path.Combine(Task4Path, $"page_{index + 1}_tfidf.txt"), lines);
            }
        }