Exemplo n.º 1
0
        /// <summary>
        /// Normalizes a term (trim + lower-case) and reduces it to its stem with the
        /// stemmer that matches the configured <c>_language</c>.
        /// </summary>
        /// <param name="term">Raw term; may be null, empty or whitespace only.</param>
        /// <returns>
        /// The stem of the normalized term; the normalized term itself when
        /// <c>_language</c> is not one of the languages handled below; <c>null</c> when
        /// <paramref name="term"/> is null or contains nothing but whitespace.
        /// </returns>
        private string StemTerm(string term)
        {
            // A null term used to blow up on Trim(); treat it the same way as an empty term.
            if (term == null)
            {
                return(null);
            }

            string massagedTerm = term.Trim().ToLower();

            if (String.IsNullOrEmpty(massagedTerm))
            {
                return(null);
            }

            switch (_language)
            {
            case ConfigHandler.Language.English:
                EnglishWord englishWord = new EnglishWord(massagedTerm);
                return(englishWord.Stem);

            case ConfigHandler.Language.German:
                GermanStemmer germanStemmer = new GermanStemmer();
                return(germanStemmer.Stem(massagedTerm));

            case ConfigHandler.Language.Russian:
                RussianStemmer russianStemmer = new RussianStemmer();
                return(russianStemmer.Stem(massagedTerm));

            case ConfigHandler.Language.French:
                FrenchStemmer frenchStemmer = new FrenchStemmer();
                return(frenchStemmer.Stem(massagedTerm));

            default:
                // No stemmer for this language here: hand back the normalized term as is.
                return(massagedTerm);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Feeds a fixed sentence through <c>RussianStemmer.GetTheBase</c> word by word,
        /// counts how often each returned base occurs and checks the expected counts.
        /// </summary>
        public void GetTheBaseTest()
        {
            string str = "вышка вышку вышке вышки тест тесты тесту test";
            string[] words = str.Split(' ');
            Console.WriteLine(str);

            Dictionary<string, int> popularWords = new Dictionary<string, int>();
            for (int index = 0; index < words.Length; index++)
            {
                string key = RussianStemmer.GetTheBase(words[index]);

                // A missing key leaves the counter at 0, so one code path covers
                // both the first and every later occurrence of a base.
                int seenSoFar;
                popularWords.TryGetValue(key, out seenSoFar);
                popularWords[key] = seenSoFar + 1;
            }

            Assert.AreEqual(3, popularWords.Count);
            Assert.AreEqual(4, popularWords["вышк"]);
            Assert.AreEqual(3, popularWords["тест"]);
            Assert.AreEqual(1, popularWords["test"]);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds a <see cref="Comment"/> from the HTML node of a single comment.
        /// </summary>
        /// <param name="commentNode">Node of the comment; may be null.</param>
        /// <param name="postId">Identifier of the post the comment belongs to; stored as is.</param>
        /// <returns>
        /// The parsed comment, or <c>null</c> when <paramref name="commentNode"/> is null
        /// or carries no <c>id</c> attribute.
        /// </returns>
        /// <remarks>
        /// The node is expected to contain the parent-id span, the info block (user name,
        /// score, time) and the message block; a missing element still surfaces as an
        /// exception, exactly as before.
        /// </remarks>
        public static Comment Parse(HtmlNode commentNode, string postId)
        {
            if (commentNode == null)
            {
                return(null);
            }

            string commentId = commentNode.GetAttributeValue("id", null);
            if (commentId == null)
            {
                return(null);
            }

            // The id attribute has the shape "<prefix>_<id>"; keep the part after the underscore.
            commentId = commentId.Split('_')[1];
            string parentId = commentNode.SelectSingleNode("./span[@class='parent_id']").GetAttributeValue("data-parent_id", null);
            string user     = commentNode.SelectSingleNode("./div[@class='info  ']/a[@class='username']").InnerText;
            string score    = commentNode.SelectSingleNode("./div[@class='info  ']/div[@class='voting   ']/*/span[@class='score']").InnerText;

            string         text       = commentNode.SelectSingleNode("./div[contains(@class,'message html_format ')]").InnerText.Trim();
            Regex          wordsRegex = new Regex(@"\w+");
            RussianStemmer stemmer    = new RussianStemmer();

            // One Stem entry per distinct stem, with the number of words that produced it.
            var stems = wordsRegex
                        .Matches(text)
                        .Cast <Match>()
                        .Select(x => stemmer.Stem(x.Value))
                        .GroupBy(x => x)
                        .Select(x => new Stem()
            {
                id = Guid.NewGuid(), Word = x.Key, Frequency = x.Count()
            });

            string timeStr = commentNode.SelectSingleNode("./div[@class='info  ']/time").InnerText;

            // The timestamp is Russian text ("d MMMM yyyy в HH:mm"), so the month name has to be
            // read with the Russian culture. Using the current culture only worked on machines
            // whose regional settings happened to be Russian.
            CultureInfo provider = CultureInfo.GetCultureInfo("ru-RU");
            string[]    formats  = new string[] { "d MMMM yyyy в HH:mm", "d MMMM yyyy в HH:mm (комментарий был изменён)" };
            DateTime    time     = DateTime.ParseExact(timeStr, formats, provider, DateTimeStyles.AllowWhiteSpaces);

            if (string.IsNullOrEmpty(score))
            {
                score = "0";
            }
            // Normalize "+12" and the typographic en dash so that int.Parse accepts the value.
            score = score.Trim('+');
            score = score.Replace("–", "-");

            return(new Comment()
            {
                id = commentId,
                ParentId = (parentId == null || parentId == "0") ? null : parentId,
                PostId = postId,
                // Invariant culture: the sign was normalized to ASCII '-' above, independent of regional settings.
                Score = int.Parse(score, CultureInfo.InvariantCulture),
                Time = time,
                User = user,
                Text = text,
                Stems = new List <Stem>(stems)
            });
        }
Exemplo n.º 4
0
 /// <summary>
 /// Stems every entry of <c>words</c> with the Unicode Russian charset and compares
 /// the result with the entry of <c>stems</c> at the same index.
 /// </summary>
 public virtual void TestStem()
 {
     int index = 0;
     while (index < words.Count)
     {
         System.String word   = (System.String)words[index];
         System.String actual = RussianStemmer.Stem(word, RussianCharsets.UnicodeRussian);
         Assert.AreEqual(stems[index], actual, "unicode");
         index++;
     }
 }
Exemplo n.º 5
0
 /// <summary>
 /// Runs <c>RussianStemmer.StemWord</c> over every entry of <c>words</c> and compares
 /// the result with the entry of <c>stems</c> at the same index.
 /// </summary>
 public void TestStem()
 {
     int index = 0;
     while (index < words.Count)
     {
         String actual = RussianStemmer.StemWord(words[index]);
         Assert.AreEqual(stems[index], actual, "unicode");
         index++;
     }
 }
Exemplo n.º 6
0
        /// <summary>
        /// Handles the Search button: stems the query, looks the stems up in the loaded
        /// inverted index and shows the URLs of the matching pages in the grid.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            SearchBegin(this, new EventArgs());

            string QueryText = fmInvIndex.ClearText(tbQuery.Text);

            // Stem every query word, drop stems of two characters or fewer, sort the rest.
            RussianStemmer RStemmer   = new RussianStemmer();
            List <string>  queryWords = QueryText.Split(' ')
                                        .Select(t => RStemmer.Stem(t))
                                        .Where(t => t.Length > 2)
                                        .OrderBy(t => t)
                                        .ToList();

            // string.Join gives "" for an empty list. The seedless Aggregate used before threw
            // InvalidOperationException in that case, so the "nothing found" branch below was unreachable.
            lblQueryForIIView.Text = string.Join(" ", queryWords.ToArray());

            pnResponses.Controls.Clear();

            // NOTE(review): the early returns below leave without raising SearchEnd — confirm
            // whether subscribers expect SearchEnd after every SearchBegin.
            if (queryWords.Count == 0)
            {
                MessageBox.Show("По данному запросу ничего не найдено");
                return;
            }

            // The file extension decides which index format is searched.
            List <int> resultIDs;
            if (openFileDialog.SafeFileName.Split('.').Last() == "ii")
            {
                resultIDs = SearchInSimpleInvertedIndex(queryWords);
            }
            else
            {
                resultIDs = SearchInCompressedInvertedIndex(queryWords);
            }

            if (resultIDs == null || resultIDs.Count == 0)
            {
                MessageBox.Show("По данному запросу ничего не найдено");
                return;
            }

            // Only integers are concatenated into the statement, so there is no injection surface here.
            string forQuery = "(" + string.Join(",", resultIDs.Select(n => n.ToString()).ToArray()) + ")";

            string sql = "SELECT [u].[Text] [URL] FROM [dbo].[Urls] [u] inner join [dbo].[Pages] [p] on [u].[UrlId] = [p].[MainUrl_UrlId] WHERE [p].[Id] in " +
                         forQuery;

            DataTable dt;
            Connection.Open();
            try
            {
                dt = Connection.ExecuteQuery(sql);
            }
            finally
            {
                // Close the connection even when the query throws.
                Connection.Close();
            }

            dgv.DataSource = new DataView(dt);
            pnResponses.Controls.Add(dgv);

            SearchEnd(this, new EventArgs());
        }
Exemplo n.º 7
0
        /// <summary>
        /// Stems the text with both the Russian and the English stemmer and returns
        /// whichever result is shorter, i.e. the one that removed more characters.
        /// The Russian result wins when both have the same length.
        /// </summary>
        /// <param name="text">Word to stem.</param>
        /// <returns>The shorter of the two stems.</returns>
        string StemByRuEn(string text)
        {
            var russianStem = new RussianStemmer().Stem(text);
            var englishStem = new EnglishStemmer().Stem(text);

            return(russianStem.Length > englishStem.Length ? englishStem : russianStem);
        }
        /// <summary>
        /// Splits a title on spaces, commas and full stops, stems every piece with the
        /// Russian stemmer and joins the stems back together.
        /// </summary>
        /// <param name="title">Title to process.</param>
        /// <returns>The stems in their original order, each followed by a single space.</returns>
        private string TitleForPorter(string title)
        {
            RussianStemmer porter = new RussianStemmer();
            string[]       pieces = title.Split(' ', ',', '.');

            string stemmedTitle = null;
            for (int index = 0; index < pieces.Length; index++)
            {
                // Empty pieces (from adjacent separators) are stemmed as well, as before.
                stemmedTitle = stemmedTitle + porter.Stem(pieces[index]) + " ";
            }

            return(stemmedTitle);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Stores the channel, reads its attributes and fills <c>_tags</c> with the
        /// stemmed form of every ';'-separated entry of the channel's tag string.
        /// </summary>
        /// <param name="channel">Channel this processor works on.</param>
        protected BaseChannelProcessor(ChannelContract channel)
        {
            Channel    = channel;
            Attributes = new AttributesReader(Channel.Attributes).Read();

            StemmerBase tagStemmer = new RussianStemmer();
            string[]    rawTags    = Channel.Tags.Split(';');

            for (int index = 0; index < rawTags.Length; index++)
            {
                string trimmedTag = rawTags[index].Trim();
                _tags.Add(tagStemmer.Stem(trimmedTag));
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Breaks a text into words on punctuation, whitespace and digits, stems every
        /// word with the Russian stemmer and passes the result through
        /// <c>TextChange.JoinNo</c> and <c>TextChange.RemovePrepositions</c>.
        /// </summary>
        /// <param name="text">Text to split.</param>
        /// <returns>The array returned by <c>TextChange.RemovePrepositions</c>.</returns>
        public static string[] SplitToWords(string text)
        {
            RussianStemmer porter = new RussianStemmer();

            // Same separator set as before, spelled as one string: punctuation, whitespace and all digits.
            char[]   wordBreaks = " ,.!\"?+-:;\n\r\t)(/0123456789".ToCharArray();
            string[] tokens     = text.Split(wordBreaks, StringSplitOptions.RemoveEmptyEntries);

            int index = 0;
            while (index < tokens.Length)
            {
                tokens[index] = porter.Stem(tokens[index]);
                index++;
            }

            string[] joined = TextChange.JoinNo(tokens);
            return(TextChange.RemovePrepositions(joined));
        }
Exemplo n.º 11
0
        /// <summary>
        /// Loads and preprocesses the corpus, then adds the stemmed terms of each document
        /// to <c>InvertedIndex</c> together with their 1-based position in the document,
        /// reporting progress through <c>TextProgress</c>.
        /// </summary>
        /// <remarks>
        /// Raises <c>InvertedIndexBuildBegin</c> before and <c>InvertedIndexHasBeenBuilt</c>
        /// after the work. Stems shorter than two characters are not indexed, and indexing
        /// stops once more than 1000 documents have been processed.
        /// </remarks>
        private void GetDataAndBuildInvertedIndex()
        {
            InvertedIndexBuildBegin(this, new EventArgs());

            TextProgress.Clear();
            TextProgress.AddInformation("Загрузка данных из БД...\r\n");
            LoadDataFromDB();
            TextProgress.AddInformation("Загрузка данных завершена.\r\n");

            TextProgress.AddInformation("Предобработка данных...\r\n");
            PreprocessData();
            TextProgress.AddInformation("Предобработка данных завершена\r\n");

            TextProgress.AddInformation("Построение индекса...\r\n");

            RussianStemmer porter = new RussianStemmer();

            int total     = DocumentCorpus.Count;
            int processed = 0;

            TextProgress.AddInformation(String.Format("Прогресс: {0}/{1}\r\n", processed, total));
            foreach (IDocument doc in DocumentCorpus)
            {
                processed++;
                TextProgress.UpdateLastInfo(String.Format("Прогресс: {0}/{1}\r\n", processed, total));

                // Positions are counted over all space-separated pieces, including the ones skipped below.
                int position = 0;
                foreach (string rawTerm in doc.Content.Split(' '))
                {
                    position++;

                    string term = porter.Stem(rawTerm);
                    if (term.Length >= 2)
                    {
                        InvertedIndex.Add(term, doc.Id, position);
                    }
                }

                if (processed > 1000)
                {
                    break;
                }
            }

            TextProgress.AddInformation("Индекс построен.\r\n");

            InvertedIndexHasBeenBuilt(this, new EventArgs());
        }
Exemplo n.º 12
0
        /// <summary>
        /// Splits an annotation on spaces, commas and full stops, stems every non-empty
        /// piece with the Russian stemmer and joins the stems back together.
        /// </summary>
        /// <param name="annotation">Annotation text to process.</param>
        /// <returns>
        /// The stems in their original order, each followed by a single space, or
        /// <c>null</c> when the annotation contains no non-empty piece.
        /// </returns>
        private string AnnotationForPorter(string annotation)
        {
            RussianStemmer porter = new RussianStemmer();
            string[]       pieces = annotation.Split(' ', ',', '.');

            string stemmedAnnotation = null;
            for (int index = 0; index < pieces.Length; index++)
            {
                string piece = pieces[index];
                if (piece == "")
                {
                    continue;
                }

                stemmedAnnotation = stemmedAnnotation + porter.Stem(piece) + " ";
            }

            return(stemmedAnnotation);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Counts how often every stemmed word occurs in a text after stop words are removed.
        /// </summary>
        /// <param name="text">Document text; split on the <c>delimeters</c> field.</param>
        /// <param name="stopWText">Stop words separated by whitespace, commas or semicolons.</param>
        /// <param name="res">Receives the stem-to-occurrence-count mapping.</param>
        /// <returns>The number of words left after stop-word removal (repetitions included).</returns>
        int wordCount(string text, string stopWText, out Dictionary <string, int> res)
        {
            res = new Dictionary <string, int>();

            // Split the stop-word list on its separator characters.
            string[] stopWords = stopWText.Split(new string[] { "\r", "\n", " ", ",", "\t", ";" },
                                                 StringSplitOptions.RemoveEmptyEntries);

            // A set of lower-cased stop words lets all of them be removed in one pass over the
            // document instead of one full pass per stop word.
            HashSet <string> stopSet = new HashSet <string>();
            foreach (string sw in stopWords)
            {
                stopSet.Add(sw.ToLower());
            }

            List <string> words = new List <string>(text.Split(delimeters, StringSplitOptions.RemoveEmptyEntries));
            words.RemoveAll(w => stopSet.Contains(w.ToLower()));

            // Pick the stemmer from the document language (detected once): Russian, otherwise English.
            IStemmer stm;
            if (isRussian(text))
            {
                stm = new RussianStemmer();
            }
            else
            {
                stm = new EnglishStemmer();
            }

            // Normalize every remaining word.
            for (int i = 0; i < words.Count; i++)
            {
                words[i] = stm.Stem(words[i].ToLower());
            }

            // Count occurrences in a single pass; keys are inserted in order of first appearance.
            foreach (string word in words)
            {
                int count;
                res.TryGetValue(word, out count);
                res[word] = count + 1;
            }

            return(words.Count);
        }
Exemplo n.º 14
0
    /// <summary>
    /// Turns free-text input into a space-separated list of Russian stems and hands
    /// it to <c>_search</c>.
    /// </summary>
    /// <param name="input">User input; hyphens are treated as spaces and the text is lower-cased.</param>
    /// <param name="fieldName">Passed through to <c>_search</c> unchanged.</param>
    /// <returns>Whatever <c>_search</c> returns for the stemmed query.</returns>
    public static IEnumerable <DataSample> Search(string input, string fieldName = "")
    {
        var rs = new RussianStemmer();

        char[]   separators = " .\t,_\\/|\"'{}[]<>:;!&?".ToCharArray();
        string   normalized = input.Trim().Replace("-", " ").ToLower();
        string[] terms      = normalized.Split(separators, StringSplitOptions.RemoveEmptyEntries);

        string query = "";
        foreach (string term in terms)
        {
            // Terms shorter than three characters are left out of the query.
            if (term.Length < 3)
            {
                continue;
            }

            query = query + rs.Stem(term) + " ";
        }

        return(_search(query, fieldName));
    }
        // Converts the input with the stemmer selected by _stem ("porter" or "mystem");
        // returns null for any other value of _stem.
        public string Convert(string input)
        {
            switch (_stem)
            {
            case "porter":
                return(new RussianStemmer().Stem(input));

            case "mystem":
                return(new MyStem().Stemer(input));

            default:
                return(null);
            }
        }
        /// <summary>
        /// Cleans a text using the stop-word dictionary and stems what is left.
        /// </summary>
        /// <param name="text">Raw text that has to be canonized.</param>
        /// <returns>
        /// The lower-cased, stemmed tokens of the text in their original order. Tokens that
        /// <c>Verifications.GetFontType</c> classifies as <c>Other</c> or <c>Numbers</c> are dropped.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <c>Verifications.GetFontType</c> returned a value not handled here.
        /// </exception>
        private static List <string> TextPurify(string text)
        {
            // Split the whole text into separate words.
            var rawTokens = text.Split(Separators);

            // Built once per call; previously a new Regex was constructed for every token.
            var stopSymbols = new Regex("[0-9/|_!@#$%^&*()_+=?:;.,{}№><«»'\"`~" + @"\\[\]– -]*");

            var stemmedLowerCaseTokens = new List <string>();
            var cyrillicStemmer        = new RussianStemmer();
            var latinStemmer           = new EnglishStemmer();

            foreach (var rawToken in rawTokens)
            {
                // Strip digits from the word.
                var canonedToken = new string(rawToken.Where(n => !char.IsDigit(n)).ToArray());

                // Skip stop words and tokens that are empty after the digits were removed.
                if (StopWords.Contains(canonedToken.ToLower()) || string.IsNullOrWhiteSpace(canonedToken))
                {
                    continue;
                }

                // Remove the remaining stop symbols and lower-case the token.
                var purifiedLowerCaseToken = stopSymbols.Replace(canonedToken, "").ToLower();

                switch (Verifications.GetFontType(purifiedLowerCaseToken))
                {
                case FontType.Cyrillic:
                    stemmedLowerCaseTokens.Add(cyrillicStemmer.Stem(purifiedLowerCaseToken));
                    break;

                case FontType.Latin:
                    stemmedLowerCaseTokens.Add(latinStemmer.Stem(purifiedLowerCaseToken));
                    break;

                case FontType.Other:
                case FontType.Numbers:
                    break;

                default:
                    throw new ArgumentOutOfRangeException();
                }
            }

            return(stemmedLowerCaseTokens);
        }
Exemplo n.º 17
0
        /*
         * лемматизация?
         * [10:43]
         * https://stackoverflow.com/questions/1371994/importing-external-module-in-ironpython
         * Importing external module in IronPython
         * I'm currently working on an application written in C#, which I'm embedding IronPython in. I generally have no problems about it, but there's one thing that I don't know how to deal with. I want to
         * [10:44]
         * Это для того, чтобы импортить питоновский NLTK
         * [10:47]
         * Хотя наверное в шарпе есть что-то для лемматизации, no idea
         *
         * стеммер портера
         *
         * Стемминг
         */
        /// <summary>
        /// Splits a text into words and stems each of them: words made only of Latin
        /// letters and digits go through the English stemmer, everything else through
        /// the Russian one.
        /// </summary>
        /// <param name="content">Text to break into words.</param>
        /// <returns>The stemmed words in their original order.</returns>
        public static List <string> BreakWords(string content)
        {
            // Same delimiter set as before (the repeated '<' and '>' are listed once).
            char[]   delimiterChars = " ,.:\t;\n\r?\\/-<>)([]'\"|*&+\u00AB\u00BB".ToCharArray();
            string[] pieces         = content.Split(delimiterChars);

            var rusStemmer = new RussianStemmer();
            var engStemmer = new EnglishStemmer();
            var result     = new List <string>();

            for (int index = 0; index < pieces.Length; index++)
            {
                string word = pieces[index].Trim();
                if (string.IsNullOrWhiteSpace(word))
                {
                    continue;
                }

                bool latinOrDigits = Regex.IsMatch(word, "^[a-zA-Z0-9]*$");
                result.Add(latinOrDigits ? engStemmer.Stem(word) : rusStemmer.Stem(word));
            }

            return(result);
        }
        /// <summary>
        /// Lazily yields the canonical form of every word of a text: the text is lower-cased
        /// and split on whitespace, punctuation and a few extra symbols; empty pieces and
        /// entries of <c>StopWordsFilter</c> are skipped; numbers are returned unchanged,
        /// Russian words go through the Russian stemmer and all others through the English one.
        /// </summary>
        /// <param name="text">Text to canonize.</param>
        /// <returns>The canonized words in their original order.</returns>
        public static IEnumerable <string> GetCanonizedTextWords(string text)
        {
            var rusStemmer = new RussianStemmer();
            var enStemmer  = new EnglishStemmer();

            string[] pieces = Regex.Split(text.ToLower(), @"[\s\p{P}№^\|<>`~$]");

            for (int index = 0; index < pieces.Length; index++)
            {
                string word = pieces[index];
                if (word == string.Empty || StopWordsFilter.Contains(word))
                {
                    continue;
                }

                if (IsNumbers(word))
                {
                    yield return(word);
                    continue;
                }

                if (IsRussian(word))
                {
                    yield return(rusStemmer.Stem(word));
                    continue;
                }

                yield return(enStemmer.Stem(word));
            }
        }
Exemplo n.º 19
0
        /// <summary>
        /// Builds a <see cref="Post"/> (including its comments and word stems) from the
        /// HTML node of a post page.
        /// </summary>
        /// <param name="postContentHtmlnode">Node to read the post from; may be null.</param>
        /// <returns>The parsed post, or <c>null</c> when <paramref name="postContentHtmlnode"/> is null.</returns>
        /// <remarks>
        /// Apart from hubs, tags, the original-author link and comments, every element
        /// queried below is dereferenced without a null check, so a page that lacks one of
        /// them still surfaces as an exception, exactly as before.
        /// </remarks>
        public static Post Parse(HtmlNode postContentHtmlnode)
        {
            if (postContentHtmlnode == null)
            {
                return(null);
            }

            string published = postContentHtmlnode.SelectSingleNode("//div[@class='published']").InnerText;

            // The date is Russian text ("d MMMM yyyy в HH:mm"), so the month name has to be read
            // with the Russian culture. Using the current culture only worked on machines whose
            // regional settings happened to be Russian.
            CultureInfo provider      = CultureInfo.GetCultureInfo("ru-RU");
            string      format        = "d MMMM yyyy в HH:mm";
            DateTime    publishedDate = DateTime.ParseExact(published.Trim(), format, provider, DateTimeStyles.AllowWhiteSpaces);

            string        title    = postContentHtmlnode.SelectSingleNode("//span[@class='post_title']").InnerText;
            var           hubnodes = postContentHtmlnode.SelectNodes("//a[@class='hub ']");
            List <string> hubs     = new List <string>();
            if (hubnodes != null)
            {
                hubs = hubnodes.Select(x => x.InnerText).ToList();
            }

            // Look the content node up once and read both representations from it.
            HtmlNode contentNode = postContentHtmlnode.SelectSingleNode("//div[@class='content html_format']");
            string   postHtml    = contentNode.InnerHtml;
            string   postText    = contentNode.InnerText;

            // One Stem entry per distinct stem, with the number of words that produced it.
            Regex          wordsRegex = new Regex(@"\w+");
            RussianStemmer stemmer    = new RussianStemmer();
            var            stems      = wordsRegex
                                        .Matches(postText)
                                        .Cast <Match>()
                                        .Select(x => stemmer.Stem(x.Value))
                                        .GroupBy(x => x)
                                        .Select(x => new Stem()
            {
                id = Guid.NewGuid(), Word = x.Key, Frequency = x.Count()
            });

            // NOTE(review): "//a[@rel='tag']" looks like it searches the whole document rather than
            // only the tag list; ".//a[@rel='tag']" may have been intended — confirm before changing.
            var           tagNodes = postContentHtmlnode.SelectSingleNode("//ul[@class='tags']").SelectNodes("//a[@rel='tag']");
            List <string> tags     = new List <string>();
            if (tagNodes != null)
            {
                tags = tagNodes.Select(x => x.InnerText).ToList();
            }

            // Counters that are rendered empty are read as zero.
            string pageViews = postContentHtmlnode.SelectSingleNode("//div[@class='pageviews']").InnerText;
            if (string.IsNullOrEmpty(pageViews))
            {
                pageViews = "0";
            }
            string favCount = postContentHtmlnode.SelectSingleNode("//div[@class='favs_count']").InnerText;
            if (string.IsNullOrEmpty(favCount))
            {
                favCount = "0";
            }
            string author        = postContentHtmlnode.SelectSingleNode("//div[@class='author']").SelectSingleNode(".//a").InnerText;
            string authorRating  = postContentHtmlnode.SelectSingleNode("//span[@class='rating']").InnerText;
            string commentsCount = postContentHtmlnode.SelectSingleNode("//span[@id='comments_count']").InnerText;
            if (string.IsNullOrEmpty(commentsCount))
            {
                commentsCount = "0";
            }

            // The id attribute has the shape "<prefix>_<id>"; keep the part after the underscore.
            string postId = postContentHtmlnode.SelectSingleNode("//div[@class='post shortcuts_item'] | //div[@class='post translation shortcuts_item']").GetAttributeValue("id", null);
            postId = postId.Split('_')[1];

            // A post counts as a translation when it carries an original-author link.
            string originalAuthor     = null;
            string originalSource     = null;
            bool   translation        = false;
            var    originalAuthorNode = postContentHtmlnode.SelectSingleNode("//div[@class='original-author']/a");
            if (originalAuthorNode != null)
            {
                translation    = true;
                originalAuthor = originalAuthorNode.InnerText;
                originalSource = originalAuthorNode.GetAttributeValue("href", null);
            }

            List <Comment> comments     = new List <Comment>();
            var            commentNodes = postContentHtmlnode.SelectNodes("//div[@class='comment_item']");
            if (commentNodes != null)
            {
                foreach (var commentNode in commentNodes)
                {
                    Comment comment = Comment.Parse(commentNode, postId);
                    if (comment != null)
                    {
                        comments.Add(comment);
                    }
                }
            }

            return(new Post
            {
                Author = author,
                AuthorRating = authorRating,
                CommentsCount = int.Parse(commentsCount),
                Comments = comments,
                FavoritesCount = int.Parse(favCount),
                Hubs = hubs,
                id = postId,
                PageViews = int.Parse(pageViews),
                PostHtml = postHtml,
                PostText = postText,
                Published = publishedDate,
                Tags = tags,
                Title = title,
                OriginalAuthor = originalAuthor,
                OriginalSource = originalSource,
                Translation = translation,
                Stems = new List <Stem>(stems)
            });
        }
Exemplo n.º 20
0
        /// <summary>
        /// Checks that the Russian stemmer reduces "абиссинию" to "абиссин".
        /// </summary>
        public void Russian_BaseTest()
        {
            var actual = new RussianStemmer().Stem("абиссинию");

            Assert.AreEqual("абиссин", actual);
        }
Exemplo n.º 21
0
        /// <summary>
        /// Downloads a news page, extracts its article text, stem frequencies, theme, title
        /// and links, and saves everything as one document through <c>repository</c>.
        /// </summary>
        /// <param name="urlPage">Address of the page; a null value is reported and ignored.</param>
        /// <remarks>
        /// Failures to load the page or to cut the article text out of it are reported
        /// through <c>mess</c> and end the method without saving anything.
        /// </remarks>
        public void ParsePage(string urlPage)
        {
            if (urlPage == null)
            {
                mess.WriteMessage(String.Format("Передана пустая ссылка"));
                return;
            }
            Dictionary <string, object> allInformation = new Dictionary <string, object>();

            allInformation.Add("url", urlPage);
            // Marks the start of the article; the length limits keep it from matching other comments.
            Regex beginText  = new Regex("<!--.*testcom.{0,5}news.{0,50}-->");
            Regex beginText2 = new Regex("<!-- СТАТЬЯ -->");
            // Marks the end of the article.
            Regex endText = new Regex("(Ссылки по теме)|(Сайты по теме)|(<!-- social -->)");

            DateTime dateArticle = ConvertSubUriInDateTime(urlPage);

            allInformation.Add("date", dateArticle);

            HtmlDocument newsPage = GetPage(urlPage);

            if (newsPage == null)
            {
                mess.WriteMessage(String.Format("Не удалось загрузить страницу: {0}", urlPage));
                return;
            }

            allInformation.Add("articleHtml", newsPage.DocumentNode.InnerHtml);

            var text        = newsPage.DocumentNode.SelectSingleNode("//td[@class='statya']");
            var title       = newsPage.DocumentNode.SelectSingleNode("/html/head/title");
            var description = newsPage.DocumentNode.SelectSingleNode("/html/head/meta[1]");

            // A page without the first meta tag gets an empty description instead of a null dereference.
            string descrTest = description == null ? "" : description.GetAttributeValue("content", "");

            allInformation.Add("description", descrTest);

            // Without the article cell there is nothing to extract.
            if (text == null)
            {
                mess.WriteMessage(String.Format("Не удалось выделить основной текст: {0}", urlPage));
                return;
            }

            string badText    = text.InnerText;
            var    matchBegin = beginText.Match(badText);

            if (!matchBegin.Success)
            {
                matchBegin = beginText2.Match(badText);
            }
            var matchEnd = endText.Match(badText);

            // A failed match reports Index 0 and Length 0, so a missing marker degrades to
            // "from the start" / "empty text". That best-effort behavior is kept.
            int bodyStart  = matchBegin.Index + matchBegin.Length;
            int bodyLength = matchEnd.Index - bodyStart;

            if (!matchBegin.Success || !matchEnd.Success || bodyLength < 0)
            {
                mess.WriteMessage(String.Format("Не удалось выделить основной текст: {0}", urlPage));
                if (bodyLength < 0)
                {
                    // The end marker lies before the start marker (or is missing): Substring
                    // would throw on the negative length, so stop here instead.
                    return;
                }
            }
            StringBuilder articleText = new StringBuilder(badText.Substring(bodyStart, bodyLength));

            // After this replacement only the clean article text is left.
            articleText.Replace("\n", "");

            allInformation.Add("articleText", articleText.ToString());

            // Stem extraction: count how often every stem of a Cyrillic word occurs.
            RussianStemmer           rusStemmer = new RussianStemmer();
            Regex                    separators = new Regex("[А-Яа-я]+");
            var                      matches    = separators.Matches(articleText.ToString());
            Dictionary <string, int> words      = new Dictionary <string, int>();

            foreach (Match item in matches)
            {
                string tmp = rusStemmer.Stem(item.Value);
                int    seenSoFar;
                words.TryGetValue(tmp, out seenSoFar);
                words[tmp] = seenSoFar + 1;
            }

            // Theme extraction: the text between the first and the second colon of the page title.
            // NOTE(review): a missing title node or a title with fewer than two colons still throws
            // here — confirm what the theme should be in that case.
            Regex  themeRegEx   = new Regex(":");
            var    themeMatches = themeRegEx.Matches(title.InnerText);
            string theme        = title.InnerText.Substring(themeMatches[0].Index + 1, themeMatches[1].Index - themeMatches[0].Index - 1).Trim();

            allInformation.Add("theme", theme);

            // Headline extraction.
            var    h2Title   = newsPage.DocumentNode.SelectSingleNode("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/h2");
            string titleText = h2Title.InnerText;

            allInformation.Add("title", titleText);

            // Related links ("links on the theme").
            var           linksOnTheme = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/p[@class='links']/a");
            List <string> links        = new List <string>();

            if (linksOnTheme != null)
            {
                foreach (var item in linksOnTheme)
                {
                    string tmp = item.GetAttributeValue("href", "");
                    if (tmp != String.Empty)
                    {
                        links.Add(tmp);
                    }
                }
            }

            allInformation.Add("wordBase", words);

            allInformation.Add("themeLinks", links);

            // Links inside the article text.
            var           linksInArticle     = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/a");
            List <string> listLinksInArticle = new List <string>();

            if (linksInArticle != null)
            {
                foreach (var item in linksInArticle)
                {
                    string tmp = item.GetAttributeValue("href", "");

                    // Skip anchors without href before the site prefix is added. The emptiness check
                    // used to run after prefixing, where it was always true, so such anchors were
                    // stored as the bare site root.
                    if (tmp == String.Empty)
                    {
                        continue;
                    }
                    if (!tmp.StartsWith("http://lenta.ru"))
                    {
                        tmp = String.Format("http://lenta.ru{0}", tmp);
                    }
                    listLinksInArticle.Add(tmp);
                }
            }

            allInformation.Add("articleLinks", listLinksInArticle);

            // Save to Mongo.
            repository.SaveDocument(allInformation);

            mess.WriteMessage(String.Format("Сохранили: {0}", urlPage));
        }
Exemplo n.º 22
0
        /// <summary>
        /// Processes the text of a card: replaces punctuation with spaces, splits the text into
        /// words, drops stop words, stems each remaining word (Russian stemmer first, then the
        /// English stemmer on its result) and stores the resulting term frequencies in the database.
        /// </summary>
        /// <param name="s">
        /// Raw card text. A literal backslash followed by <c>n</c> is treated as a single
        /// separator. <c>null</c> or empty text produces no terms.
        /// </param>
        /// <param name="cardId">
        /// Identifier of the card. It is stored on every term and is also the prefix of each
        /// term's Id (<c>cardId</c> + index of the term in first-appearance order).
        /// </param>
        void process(string s, string cardId)
        {
            if (String.IsNullOrEmpty(s))
            {
                return;
            }

            char[] badChars = new char[] { '\\', '*', '>', '\'', '"', ':', ',', '.', '@', '!', '?', '-', '+', '&' };

            // Replace bad characters with spaces first and only THEN split into words, so that
            // punctuation glued to a word also acts as a word boundary.
            System.Text.StringBuilder cleaned = new System.Text.StringBuilder(s.Length);
            for (int i = 0; i < s.Length; i++)
            {
                char current = s[i];
                if (Array.IndexOf(badChars, current) < 0)
                {
                    cleaned.Append(current);
                    continue;
                }

                // A literal backslash + 'n' pair collapses into one space. The bounds check
                // matters: a backslash may be the very last character of the text.
                if (current == '\\' && i + 1 < s.Length && s[i + 1] == 'n')
                {
                    i++;
                }
                cleaned.Append(' ');
            }

            string[] tokens = cleaned.ToString().Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Stemming
            IStemmer stemmerRU = new RussianStemmer();
            IStemmer stemmerEN = new EnglishStemmer();

            // termOrder keeps the terms in first-appearance order because the position of a
            // term is part of its database Id; counts holds the frequency of each term.
            List<string> termOrder = new List<string>();
            Dictionary<string, int> counts = new Dictionary<string, int>();

            foreach (string token in tokens)
            {
                // Stop words are matched on the lower-cased, not yet stemmed word.
                if (stopwords.Contains(token.ToLower()))
                {
                    continue;
                }

                string stemmed = stemmerEN.Stem(stemmerRU.Stem(token));
                if (String.IsNullOrEmpty(stemmed) || stemmed == " ")
                {
                    continue;
                }

                int seen;
                if (counts.TryGetValue(stemmed, out seen))
                {
                    counts[stemmed] = seen + 1;
                }
                else
                {
                    counts.Add(stemmed, 1);
                    termOrder.Add(stemmed);
                }
            }

            if (termOrder.Count == 0)
            {
                return;
            }

            // Write to the database: queue every term, then persist them with a single save.
            using (TrelloDbContext context = new TrelloDbContext())
            {
                for (int n = 0; n < termOrder.Count; n++)
                {
                    string name = termOrder[n];
                    CardTerm term = new CardTerm {
                        Id = cardId + n.ToString(), Name = name, Frequency = counts[name], CardId = cardId
                    };
                    context.CardTerms.Add(term);
                }
                context.SaveChanges();
            }
        }
Exemplo n.º 23
0
        /// <summary>
        /// Creates the Snowball stemmer registered under the given language key.
        /// </summary>
        /// <remarks>
        /// Every call constructs a new stemmer, so no instance returned from here is shared
        /// between callers (the original author labelled this variant "thread-safe",
        /// presumably for that reason).
        /// </remarks>
        /// <param name="language">
        /// Language key. It is compared exactly as written, so matching is case-sensitive.
        /// </param>
        /// <returns>
        /// A new stemmer for the key, or <c>null</c> when the key is not one of the known keys.
        /// </returns>
        public static SnowballProgram GetSnowball(string language)
        {
            switch (language)
            {
            case "DA":        // Danish
                return new DanishStemmer();

            case "NL":        // Dutch
                return new DutchStemmer();

            case "EN":        // English
                return new EnglishStemmer();

            case "FI":        // Finnish
                return new FinnishStemmer();

            case "FR":        // French
                return new FrenchStemmer();

            case "DE2":       // German, variant 2
                return new German2Stemmer();

            case "DE":        // German
                return new GermanStemmer();

            case "HU":
                return new HungarianStemmer();

            case "IT":
                return new ItalianStemmer();

            case "文斯语":     // Lovins stemmer; the key is a Chinese-language label
                return new LovinsStemmer();

            case "NO":
                return new NorwegianStemmer();

            case "波特语":     // Porter stemmer (for English); the key is a Chinese-language label
                return new PorterStemmer();

            case "PT":        // Portuguese
                return new PortugueseStemmer();

            case "RO":
                return new RomanianStemmer();

            case "RU":        // Russian
                return new RussianStemmer();

            case "ES":        // Spanish
                return new SpanishStemmer();

            case "SV":
                return new SwedishStemmer();

            case "TR":        // Turkish
                return new TurkishStemmer();

            default:          // unknown key (including null)
                return null;
            }
        }
Exemplo n.º 24
0
 /// <summary>
 /// Copies the state of another Russian stemmer into this instance.
 /// </summary>
 /// <param name="other">
 /// Stemmer whose <c>I_pV</c> and <c>I_p2</c> values are copied; it is then handed to the
 /// base-class <c>copy_from</c>.
 /// </param>
 private void copy_from(RussianStemmer other)
 {
     this.I_pV = other.I_pV;
     this.I_p2 = other.I_p2;

     // Let the base class copy whatever state it owns.
     base.copy_from(other);
 }
Exemplo n.º 25
0
 /// <summary>
 /// Copies the state of another Russian stemmer into this instance.
 /// </summary>
 /// <param name="other">
 /// Stemmer whose <c>I_p2</c> and <c>I_pV</c> values are copied; it is then handed to the
 /// base-class <c>copy_from</c>.
 /// </param>
 private void copy_from(RussianStemmer other) {
     I_p2 = other.I_p2;
     I_pV = other.I_pV;

     // The call must be qualified with "base": an unqualified copy_from(other) binds to this
     // very overload and recurses until the stack overflows.
     // NOTE(review): assumes the base class declares an accessible copy_from - confirm.
     base.copy_from(other);
 }
Exemplo n.º 26
0
 /// <summary>
 /// Copies the state of another Russian stemmer into this instance.
 /// </summary>
 /// <param name="other">
 /// Stemmer whose <c>I_pV</c> and <c>I_p2</c> values are copied; it is then handed to the
 /// base-class <c>copy_from</c>.
 /// </param>
 protected internal virtual void copy_from(RussianStemmer other)
 {
     this.I_pV = other.I_pV;
     this.I_p2 = other.I_p2;

     // Let the base class copy whatever state it owns.
     base.copy_from(other);
 }
Exemplo n.º 27
0
        /// <summary>
        /// Console entry point. Reads the index through <c>IndexBuilder</c>, derives an IDF value
        /// and tf-idf weights for every index entry, then repeatedly reads a query from the
        /// console and prints the ids and scores of the best-matching documents.
        /// The loop ends when the console input is exhausted.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Number of documents the scoring below is sized for (ids 1..docCount).
            const Int32 docCount = 100;

            //Console.WriteLine("Input link: ");
            //String s = Console.ReadLine();
            //Console.WriteLine("Input Directory to save: ");
            //String d = Console.ReadLine();

            String startupPath = System.IO.Directory.GetCurrentDirectory();

            //startupPath = Path.Combine(startupPath, d);

            dsFiles = Path.Combine(startupPath, "Data");//Path to source files.
            //String dict = Path.Combine(startupPath,"LDict.txt");//Dictionary of lemmas

            /* STEP 1. PARSE HTML
             * ParserWorker<String[]> parser = new ParserWorker<String[]>(new NekdoParser());
             *
             * parser.SetSettings(new NekdoSettings(1,100));
             * parser.OnNewData += NewData;
             * parser.OnComplete += Complete;
             * dir = new DirectoryInfo(startupPath);
             * try{
             *  dir.Create();
             * }
             * catch(IOException){
             *  Console.WriteLine("This directory has already exist. Continue work with this directory");
             * }
             * parser.Start();
             * while(parser.IsActive()){//awaiting parser...
             *
             * }
             *
             * CreateIndexF(parser.GetUrls());
             */

            //STEP 2 STEMMING

            /*
             * TrainDataParser TDP = new TrainDataParser();
             *
             * Lemmatization(TDP);
             *
             * Console.WriteLine("");
             */

            //STEP 3 CREATING INDEX.
            String indexFileP = Path.Combine(startupPath, "Indexer", "inventIndex.txt");

            Console.WriteLine("===STEP 3 ===");

            IndexBuilder builder = new IndexBuilder();

            Console.WriteLine("Source: {0} ", builder.Source);
            Console.WriteLine("Dest: {0}", indexFileP);

            LinkedDictionary <String, IndexEntry> indexer = builder.ReadData();//INDEX

            // Weights for vector retrieval (STEP 5): one weight per entry of Probs.
            foreach (KeyValuePair <String, IndexEntry> p in indexer)
            {
                // Plain ratio docCount / (number of ids of the term), rounded to 5 digits.
                Double idf = Math.Round((Double)docCount / p.Value.Ids.Count, 5);
                p.Value.IDF = idf;//Math.Log(100.0/p.Value.Ids.Count, 10.0);

                foreach (Double prob in p.Value.Probs)
                {
                    p.Value.Weights.Add(prob * idf); //tf(t,d)*idf(t,D) = tf-idf(t,d,D)
                }
                //String data = p.Key +" : "+ p.Value;
                //__CreateIFile(indexFileP, data);//read Data from indexer to file.
            }

            Console.WriteLine("Done.");

            // stem and bp are only referenced by the commented-out STEP 4 below.
            IStemmer         stem   = new RussianStemmer();                                    //STEMMER
            BoolSyntaxParser bp     = new BoolSyntaxParser();                                  //PARSER OF BOOLEAN EXPRESSIONS
            ILemmatizer      lemmer = new LemmatizerPrebuiltCompact(LanguagePrebuilt.Russian); //LEMMATIZER.

            //STEP 4. BOOLEAN SEARCH BY(indexer)

            /*
             * while(true){
             *  Console.WriteLine("Input search str...");
             *  String ui = Console.ReadLine();
             *
             *  String[] u = ui.ToLower().Replace('ё','е').Split(new Char[]{' ' , ',', '.', ';', '-', ':','?','!','\"'},StringSplitOptions.RemoveEmptyEntries);
             *  LinkedStack<String> ui_w =  bp.GetInput(u);//GET EXPRESSION IN POLISH NOTATION
             *
             *  String[] ui_wa = ui_w.ToArray();//SAVE IT INTO ARRAY
             *  foreach(String it2 in ui_wa){
             *      Console.WriteLine(it2);
             *  }
             *  SimpleTextCrawler.Structures.LinkedList<Int32> idsOf = __GetIds(lemmer, indexer, ui_wa);
             *  __FindLinks(idsOf);
             *
             * }*/

            //STEP 5 Vector SEARCH BY(indexer).

            ArrayHeap <HeapEntry> PQ = new ArrayHeap <HeapEntry>(x => x.Relevance);//HEAP SORT.

            Console.WriteLine("VECTOR SEARCH...\n");
            while (true)
            {
                PQ.Clear();
                Console.WriteLine("Input search str...");
                String ui = Console.ReadLine();
                if (ui == null)
                {
                    // Console.ReadLine returns null once the input stream is exhausted
                    // (closed or redirected stdin): stop instead of failing on ui.ToLower().
                    break;
                }

                Double[] score = new Double[docCount + 1];  // slot 0 is unused, ids are 1-based
                //Double[] lengths = new Double[101];//ST_C
                Double[] lengths = builder.GetLens();//ST_UC
                Double   q_w     = 0.0;
                String[] u       = ui.ToLower().Replace('ё', 'е').Split(new Char[] { ' ', ',', '.', ';', '-', ':', '?', '!', '\"' }, StringSplitOptions.RemoveEmptyEntries);
                foreach (String t in u)
                {
                    IndexEntry te;
                    if (indexer.TryGetValue(lemmer.Lemmatize(t), out te))
                    {
                        q_w += te.IDF * te.IDF;
                        // NOTE(review): indexing starts at 1 - presumably the Weights collection
                        // is 1-based; confirm, otherwise the first weight is never used.
                        Int32 i = 1;
                        foreach (Int32 id in te.Ids)
                        {
                            score[id] += te.Weights[i];
                            //lengths[id] += te.Probs[i]*te.Probs[i];//ST_C
                            i++;
                        }
                    }
                }
                q_w = Math.Sqrt(q_w);
                if (q_w == 0.0)
                {
                    // None of the query terms is present in the index.
                    Console.WriteLine("NOT FOUND");
                }
                else
                {
                    for (Int32 k = 1; k <= docCount; k++)
                    {
                        // lengths is 0-based while document ids are 1-based.
                        if (lengths[k - 1] == 0) //ST_C
                        {
                            continue;            //ST_C
                        }
                        //lengths[k] = lengths[k] > 0 ? Math.Sqrt(lengths[k]) : 1;//ST_C
                        //score[k] = score[k]/(lengths[k]*q_w);//ST_C
                        score[k] = score[k] / (lengths[k - 1] * q_w);// 0 /1 => 0.
                        if (score[k] == 0.0)
                        {
                            continue;
                        }
                        // The heap is keyed by the inverse score, so DeleteMin yields the
                        // highest-scoring document first.
                        PQ.Add(new HeapEntry()
                        {
                            Relevance = 1d / score[k], Id = k
                        });                                                      //ASC ORDER
                    }
                    SimpleTextCrawler.Structures.LinkedList <Int32> docIds = new SimpleTextCrawler.Structures.LinkedList <Int32>();
                    Int32 remaining = 5;  // print at most this many results
                    while (!PQ.IsEmpty() && remaining > 0)
                    {
                        HeapEntry et = PQ.DeleteMin();
                        Console.WriteLine("{0} : {1} ", et.Id, 1d / et.Relevance);
                        docIds.Add(et.Id);
                        remaining--;
                    }
                    Console.WriteLine("");
                    __FindLinksV(docIds);
                }
            }
        }