Ejemplo n.º 1
0
        /// <summary>
        /// Reports whether the page's title heading contains the weekly-newspaper marker text.
        /// </summary>
        /// <param name="htmlText">HTML of the page to inspect.</param>
        /// <returns>
        /// <c>true</c> when the first child of the first matching <c>h1</c> (cast to a text node)
        /// matches the marker pattern; <c>false</c> when it does not match or when any exception
        /// is thrown while parsing or inspecting the page.
        /// </returns>
        public static bool isWeeklyNewspaper(string htmlText)
        {
            try
            {
                var document = new HtmlParser().Parse(htmlText);

                // The filter requires an h1 that has a "class" attribute and some attribute value equal to "title".
                var headingFinder = new FindTagsVisitor(h => h.Name == "h1" && h.Attributes.ContainsKey("class") && h.Attributes.ContainsValue("title"));
                document.AcceptVisitor(headingFinder);

                var headingText = (Majestic13.HtmlNode.Text)headingFinder.Result[0].Children[0];

                // The pattern is the literal weekly-edition prefix, quotes included.
                bool isWeekly = Regex.IsMatch(headingText.Value, @"(Еженедельник \""Аргументы и Факты\"" №)");
                if (!isWeekly)
                {
                    return false;
                }

                LogServices.WriteProgressLog("it is weekly ");
                return true;
            }
            catch (Exception)
            {
                // Any failure (no heading found, unexpected node type, ...) is logged and reported as "not weekly".
                LogServices.WriteProgressLog("исключение");
                return false;
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Collects the <c>href</c> of the first anchor found inside every <c>h2</c> heading selected
 /// by the "data_title mbottom10" filter.
 /// </summary>
 /// <param name="htmlNewspaperText">HTML of the newspaper page.</param>
 /// <returns>
 /// The extracted link targets, in the order the headings were found, or <c>null</c> when an
 /// exception is thrown while parsing or walking the page.
 /// </returns>
 public static List <string> extractLinks(string htmlNewspaperText)
 {
     try
     {
         var links = new List<string>();
         var node  = new HtmlParser().Parse(htmlNewspaperText);

         // The filter requires an h2 that has a "class" attribute and some attribute value equal to "data_title mbottom10".
         var areaFinder = new FindTagsVisitor(tag => tag.Name == "h2" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("data_title mbottom10"));
         node.AcceptVisitor(areaFinder);

         foreach (Majestic13.HtmlNode.Tag heading in areaFinder.Result)
         {
             var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href"));
             heading.AcceptVisitor(linkFinder);

             // A heading without an anchor is skipped instead of throwing, so one odd
             // heading no longer discards every link collected so far.
             if (linkFinder.Result.Count == 0)
             {
                 continue;
             }

             // Read the href by key: taking the first attribute value would return e.g. the
             // class or title of the anchor whenever href is not its first attribute.
             var href = linkFinder.Result[0].Attributes["href"];
             if (href != null)
             {
                 links.Add(href.ToString());
             }
         }

         LogServices.WriteProgressLog("extracted links");
         return links;
     }
     catch (Exception)
     {
         // Kept for backward compatibility: callers receive null on failure.
         return null;
     }
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Extracts the edition name from an article page, trying the "newspaper_new" block first
 /// and falling back to the "artic_num_box" spans.
 /// </summary>
 /// <param name="htmlText">HTML of the article page.</param>
 /// <returns>
 /// The text of the first child of the fourth child of the selected container, or a fixed
 /// error message when any exception is thrown.
 /// </returns>
 public static string getEditionName(string htmlText)
 {
     try
     {
         var document = new HtmlParser().Parse(htmlText);

         var blockFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                               d.Attributes.ContainsValue("newspaper_new"));
         document.AcceptVisitor(blockFinder);

         if (blockFinder.Result.Count > 0)
         {
             // Primary layout: the name is the text inside the fourth child of the first matching div.
             var linkTag = (Majestic13.HtmlNode.Tag)blockFinder.Result[0].Children[3];
             return ((Majestic13.HtmlNode.Text)linkTag.Children[0]).Value;
         }

         // Fallback layout: use the second matching span and, again, its fourth child.
         var spanFinder = new FindTagsVisitor(s => s.Name == "span" && s.Attributes.ContainsKey("class") &&
                                              s.Attributes.ContainsValue("artic_num_box"));
         document.AcceptVisitor(spanFinder);

         var anchorTag = (Majestic13.HtmlNode.Tag)spanFinder.Result[1].Children[3];
         return ((Majestic13.HtmlNode.Text)anchorTag.Children[0]).Value;
     }
     catch (Exception)
     {
         return "Название не может быть загружено из-за исключения";
     }
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Reads the publication timestamp of a Q&amp;A page from its first <c>time</c> tag.
        /// </summary>
        /// <param name="htmlText">HTML of the Q&amp;A page.</param>
        /// <returns>The timestamp parsed from text in the exact form "HH:mm dd/MM/yyyy".</returns>
        /// <exception cref="FormatException">The tag text is not in the expected format.</exception>
        /// <remarks>
        /// Nothing is caught here: a page without a <c>time</c> tag, or one whose first child is
        /// not a text node, makes the indexing/cast fail and the exception reaches the caller.
        /// </remarks>
        public static DateTime getQADate(string htmlText)
        {
            var node   = new HtmlParser().Parse(htmlText);
            var finder = new FindTagsVisitor(tag => tag.Name == "time");

            node.AcceptVisitor(finder);
            var dateTag = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0];

            // Parse with the invariant culture. With a null provider the current culture is used,
            // and '/' and ':' in a custom format then mean that culture's own separators, so the
            // same page text would fail to parse on machines configured with e.g. '.' as date separator.
            return DateTime.ParseExact(dateTag.Value, "HH:mm dd/MM/yyyy", System.Globalization.CultureInfo.InvariantCulture);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Returns the title of a Q&amp;A page, taken from the first <c>h1</c> selected by the "title" filter.
        /// </summary>
        /// <param name="htmlText">HTML of the Q&amp;A page.</param>
        /// <returns>The value of the heading's first child, cast to a text node.</returns>
        /// <remarks>No exception is caught: a missing heading or a non-text first child propagates to the caller.</remarks>
        public static string getQATitle(string htmlText)
        {
            var document = new HtmlParser().Parse(htmlText);

            // The filter requires an h1 that has a "class" attribute and some attribute value equal to "title".
            var headingFinder = new FindTagsVisitor(h => h.Name == "h1" && h.Attributes.ContainsKey("class") &&
                                                    h.Attributes.ContainsValue("title"));
            document.AcceptVisitor(headingFinder);

            return ((Majestic13.HtmlNode.Text)headingFinder.Result[0].Children[0]).Value;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Returns the edition name shown in the "autors_box" block of a Q&amp;A page.
        /// </summary>
        /// <param name="htmlText">HTML of the Q&amp;A page.</param>
        /// <returns>
        /// The text of the first child of the fourth child of the box's second-to-last child.
        /// </returns>
        /// <remarks>No exception is caught: any missing node or unexpected node type propagates to the caller.</remarks>
        public static string getQAEditionName(string htmlText)
        {
            var document  = new HtmlParser().Parse(htmlText);
            var boxFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                                d.Attributes.ContainsValue("autors_box"));
            document.AcceptVisitor(boxFinder);

            var box = boxFinder.Result[0];

            // The edition wrapper is addressed positionally: second-to-last child of the box.
            var wrapper = (Majestic13.HtmlNode.Tag)box.Children[box.Children.Count - 2];
            var link    = (Majestic13.HtmlNode.Tag)wrapper.Children[3];

            return ((Majestic13.HtmlNode.Text)link.Children[0]).Value;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Extracts the body text of a Q&amp;A page from its first "vo_o_text" block.
        /// </summary>
        /// <param name="htmlText">HTML of the Q&amp;A page.</param>
        /// <returns>
        /// Whatever <c>RequestServices.ContinueUntilOnlyTextLeft</c> produces for that block when
        /// started with an empty string.
        /// </returns>
        /// <remarks>No exception is caught: a page without the block propagates the failure to the caller.</remarks>
        public static string getQAContent(string htmlText)
        {
            var document    = new HtmlParser().Parse(htmlText);
            var blockFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                                  d.Attributes.ContainsValue("vo_o_text"));
            document.AcceptVisitor(blockFinder);

            var text = RequestServices.ContinueUntilOnlyTextLeft(blockFinder.Result[0], "");

            LogServices.WriteProgressLog("Fetched content of article ");
            return text;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Extracts the author name(s) of an article, trying the "material_topline_info" block
        /// first and falling back to the "autors_box" block.
        /// </summary>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// In the primary layout: every linked author name, each preceded by a single space, or
        /// the "unknown author" placeholder when the block contains no links. In the fallback
        /// layout: the text of the box's third child. A fixed error message when any exception
        /// is thrown.
        /// </returns>
        public static string getArticleAuthor(string htmlText)
        {
            try
            {
                var document      = new HtmlParser().Parse(htmlText);
                var toplineFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                                        d.Attributes.ContainsValue("material_topline_info"));
                document.AcceptVisitor(toplineFinder);

                if (toplineFinder.Result.Count == 0)
                {
                    // Fallback layout: the author is the text inside the third child of the first "autors_box".
                    var boxFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                                        d.Attributes.ContainsValue("autors_box"));
                    document.AcceptVisitor(boxFinder);

                    var nameTag = (Majestic13.HtmlNode.Tag)boxFinder.Result[0].Children[2];
                    return ((Majestic13.HtmlNode.Text)nameTag.Children[0]).Value;
                }

                // Primary layout: author links live under the fourth child of the topline block.
                var authorSpan = (Majestic13.HtmlNode.Tag)toplineFinder.Result[0].Children[3];
                var linkFinder = new FindTagsVisitor(a => a.Name == "a" && a.Attributes.ContainsKey("href"));
                authorSpan.AcceptVisitor(linkFinder);

                if (linkFinder.Result.Count == 0)
                {
                    return "Автор не известен";
                }

                // Each name is prefixed with a space, so the result starts with a space.
                var names = "";
                foreach (var link in linkFinder.Result)
                {
                    names += " " + ((Majestic13.HtmlNode.Text)link.Children[0]).Value;
                }

                return names;
            }
            catch (Exception)
            {
                return "Автор не может быть загружен из-за исключения";
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Returns the author of a Q&amp;A page from its first "autors_box" block.
        /// </summary>
        /// <param name="htmlText">HTML of the Q&amp;A page.</param>
        /// <returns>
        /// The text of the box's third child when the box's second child has an attribute value
        /// of "icon autors_icon"; otherwise the "unknown author" placeholder.
        /// </returns>
        /// <remarks>No exception is caught: a missing box or unexpected node type propagates to the caller.</remarks>
        public static string getQAAuthor(string htmlText)
        {
            var document  = new HtmlParser().Parse(htmlText);
            var boxFinder = new FindTagsVisitor(d => d.Name == "div" && d.Attributes.ContainsKey("class") &&
                                                d.Attributes.ContainsValue("autors_box"));
            document.AcceptVisitor(boxFinder);

            var box    = boxFinder.Result[0];
            var marker = (Majestic13.HtmlNode.Tag)box.Children[1];

            // Without the author icon marker the box is not read any further.
            if (!marker.Attributes.ContainsValue("icon autors_icon"))
            {
                return "Автор не известен";
            }

            var nameTag = (Majestic13.HtmlNode.Tag)box.Children[2];
            return ((Majestic13.HtmlNode.Text)nameTag.Children[0]).Value;
        }
Ejemplo n.º 10
0
 /// <summary>
 /// Extracts the body text of an article from the first <c>article</c> tag of the page.
 /// </summary>
 /// <param name="htmlText">HTML of the article page.</param>
 /// <returns>
 /// Whatever <c>ContinueUntilOnlyTextLeft</c> produces for that tag when started with an empty
 /// string, or a fixed error message when any exception is thrown.
 /// </returns>
 public static string getArticleContent(string htmlText)
 {
     try
     {
         var node          = new HtmlParser().Parse(htmlText);
         var contentFinder = new FindTagsVisitor(tag => tag.Name == "article");
         node.AcceptVisitor(contentFinder);

         // Take the found tag directly; the previous placeholder Tag instance was
         // allocated only to be overwritten on the next line.
         var resultTag = (Majestic13.HtmlNode.Tag)contentFinder.Result[0];
         var content   = ContinueUntilOnlyTextLeft(resultTag, "");

         LogServices.WriteProgressLog("Fetched content of article ");
         return content;
     }
     catch (Exception)
     {
         return "Содержимое не может быть загружено из-за исключения";
     }
 }
Ejemplo n.º 11
0
 /// <summary>
 /// Returns the title of an article, taken from the first <c>h1</c> selected by the
 /// "material_title increase_text" filter.
 /// </summary>
 /// <param name="htmlText">HTML of the article page.</param>
 /// <returns>
 /// The heading's first child as text; the "untitled" placeholder when no heading matches or
 /// the heading has no children; a fixed error message when any exception is thrown.
 /// </returns>
 public static string getArticleTitle(string htmlText)
 {
     try
     {
         var document      = new HtmlParser().Parse(htmlText);
         var headingFinder = new FindTagsVisitor(h => h.Name == "h1" && h.Attributes.ContainsKey("class") &&
                                                 h.Attributes.ContainsValue("material_title increase_text"));
         document.AcceptVisitor(headingFinder);

         // Nothing to read: either no heading matched or the heading is empty.
         if (headingFinder.Result.Count == 0 || headingFinder.Result[0].Children.Count == 0)
         {
             return "Без заголовка";
         }

         return ((Majestic13.HtmlNode.Text)headingFinder.Result[0].Children[0]).Value;
     }
     catch (Exception)
     {
         return "Название не может быть загружено из-за исключения";
     }
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Reads the publication timestamp of an article, from the first <c>time</c> tag when present
 /// and otherwise from the "material_topline_info" block.
 /// </summary>
 /// <param name="htmlText">HTML of the article page.</param>
 /// <returns>
 /// The current local time when the date text contains "сегодня"; otherwise the timestamp
 /// parsed from text in the exact form "HH:mm dd/MM/yyyy". The current local time is also
 /// returned when any exception is thrown (missing nodes, unexpected node types, bad format).
 /// </returns>
 public static DateTime getArticleDate(string htmlText)
 {
     try
     {
         var node            = new HtmlParser().Parse(htmlText);
         var dateFieldFinder = new FindTagsVisitor(tag => tag.Name == "time");
         node.AcceptVisitor(dateFieldFinder);

         string date;
         if (dateFieldFinder.Result.Count > 0)
         {
             var timeTag = (Majestic13.HtmlNode.Tag)dateFieldFinder.Result[0];
             date = ((Majestic13.HtmlNode.Text)timeTag.Children[0]).Value;
         }
         else
         {
             // Fallback layout: the date is the second child of the second child of the topline block.
             var dateFieldFinder1 = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") &&
                                                        tag.Attributes.ContainsValue("material_topline_info"));
             node.AcceptVisitor(dateFieldFinder1);
             var spanTag = (Majestic13.HtmlNode.Tag)dateFieldFinder1.Result[0].Children[1];
             date = ((Majestic13.HtmlNode.Text)spanTag.Children[1]).Value;
         }

         if (date.Contains("сегодня"))
         {
             return DateTime.Now;
         }

         // Parse with the invariant culture. With a null provider the current culture is used,
         // and '/' and ':' in a custom format then mean that culture's own separators; on such
         // machines every date failed to parse and was silently replaced by "now" in the catch below.
         return DateTime.ParseExact(date, "HH:mm dd/MM/yyyy", System.Globalization.CultureInfo.InvariantCulture);
     }
     catch (Exception)
     {
         // Best-effort contract kept from the original: any failure yields the current time.
         return DateTime.Now;
     }
 }