Beispiel #1
0
        /// <summary>
        /// Decides whether a downloaded page is an issue of the weekly newspaper by testing the
        /// text of the first h1 heading whose class attribute is "title" against the weekly
        /// title marker.
        /// </summary>
        /// <param name="htmlText">Raw HTML of the page to inspect.</param>
        /// <returns>
        /// <c>true</c> when the heading text contains the weekly marker; <c>false</c> when the
        /// input is empty, the heading is missing, its first child is not a text node, the text
        /// does not match, or parsing throws.
        /// </returns>
        public static bool isWeeklyNewspaper(string htmlText)
        {
            // Pattern kept exactly as before; the static Regex.IsMatch reuses the framework's
            // regex cache instead of building a new Regex object on every call.
            const string weeklyTitlePattern = @"(Еженедельник \""Аргументы и Факты\"" №)";

            if (string.IsNullOrEmpty(htmlText))
            {
                return false;
            }

            try
            {
                var parser = new HtmlParser();
                var node   = parser.Parse(htmlText);

                // Compare the class attribute itself. ContainsValue("title") also accepted a
                // heading where some other attribute (for example id) had the value "title".
                var finder = new FindTagsVisitor(tag => tag.Name == "h1" &&
                                                 tag.Attributes.ContainsKey("class") &&
                                                 "title".Equals(tag.Attributes["class"]));
                node.AcceptVisitor(finder);

                // A page without the heading is an expected case, not an exceptional one.
                var heading = finder.Result.FirstOrDefault();
                if (heading == null || heading.Children == null)
                {
                    return false;
                }

                // Only a plain text first child carries a title we can test.
                var titleNode = heading.Children.FirstOrDefault() as Majestic13.HtmlNode.Text;
                if (titleNode == null || titleNode.Value == null)
                {
                    return false;
                }

                if (Regex.IsMatch(titleNode.Value, weeklyTitlePattern))
                {
                    LogServices.WriteProgressLog("it is weekly ");
                    return true;
                }

                return false;
            }
            catch (Exception ex)
            {
                LogServices.WriteProgressLog("исключение");
                // Keep the cause; the progress line alone says nothing about what failed.
                LogServices.WriteErrorLog("isWeeklyNewspaper failed: " + ex);
                return false;
            }
        }
Beispiel #2
0
 /// <summary>
 /// Collects article links from a newspaper page: for every h2 heading whose class attribute
 /// is "data_title mbottom10", takes the href of the first anchor the visitor finds for it.
 /// </summary>
 /// <param name="htmlNewspaperText">Raw HTML of the newspaper page.</param>
 /// <returns>
 /// The href values in the order the headings are reported (possibly empty), or
 /// <c>null</c> when parsing or traversal throws. Callers must check for null.
 /// </returns>
 public static List <string> extractLinks(string htmlNewspaperText)
 {
     try
     {
         var links  = new List <string>();
         var parser = new HtmlParser();
         var node   = parser.Parse(htmlNewspaperText);

         // Compare the class attribute itself; ContainsValue matched any attribute's value.
         var areaFinder = new FindTagsVisitor(tag => tag.Name == "h2" &&
                                              tag.Attributes.ContainsKey("class") &&
                                              "data_title mbottom10".Equals(tag.Attributes["class"]));
         node.AcceptVisitor(areaFinder);

         foreach (Majestic13.HtmlNode.Tag heading in areaFinder.Result)
         {
             var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href"));
             heading.AcceptVisitor(linkFinder);

             // A heading without an anchor is skipped instead of throwing and
             // discarding every link collected so far.
             var anchor = linkFinder.Result.FirstOrDefault();
             if (anchor == null)
             {
                 continue;
             }

             // Read href by key: the first attribute value is not necessarily the href
             // (e.g. <a class="x" href="...">).
             var href = anchor.Attributes["href"];
             if (href != null)
             {
                 links.Add(href.ToString());
             }
         }

         LogServices.WriteProgressLog("extracted links");
         return links;
     }
     catch (Exception ex)
     {
         // The null result is kept for existing callers; the cause is now recorded.
         LogServices.WriteErrorLog("extractLinks failed: " + ex);
         return null;
     }
 }
Beispiel #3
0
        /// <summary>
        /// Extracts the text content of the first div whose class attribute is "vo_o_text".
        /// </summary>
        /// <param name="htmlText">Raw HTML of the page.</param>
        /// <returns>
        /// The value returned by RequestServices.ContinueUntilOnlyTextLeft for the matching div,
        /// starting from an empty string.
        /// </returns>
        /// <remarks>
        /// Nothing is caught here. When no div matches, indexing the empty result is expected to
        /// throw, and that exception reaches the caller as it did before. This is left unchanged
        /// on purpose because callers may rely on it to detect pages without this section.
        /// </remarks>
        public static string getQAContent(string htmlText)
        {
            var parser = new HtmlParser();
            var node   = parser.Parse(htmlText);

            // Compare the class attribute itself. ContainsValue("vo_o_text") also matched a div
            // that merely had a class attribute plus some other attribute with that value.
            var finder = new FindTagsVisitor(tag => tag.Name == "div" &&
                                             tag.Attributes.ContainsKey("class") &&
                                             "vo_o_text".Equals(tag.Attributes["class"]));

            node.AcceptVisitor(finder);
            var contextTag = finder.Result[0];

            string content = RequestServices.ContinueUntilOnlyTextLeft(contextTag, "");
            LogServices.WriteProgressLog("Fetched content of article ");
            return content;
        }
Beispiel #4
0
        /// <summary>
        /// Walks the newspaper issue numbers from 24355 up to and including
        /// <paramref name="Max"/>. For every issue page that downloads and is recognised as a
        /// weekly newspaper, its articles are gathered and each article with a non-null Url is
        /// added to the result and inserted through articleService.
        /// </summary>
        /// <param name="Max">Last issue number to request (inclusive).</param>
        /// <returns>
        /// Every article with a non-null Url that was found, whether or not its database insert
        /// succeeded.
        /// </returns>
        public static List <Article> GetNewspapers(int Max)
        {
            const string baseUrl = "http://www.aif.ru/gazeta/number/";
            var collected = new List <Article>();

            for (int issue = 24355; issue <= Max; issue++)
            {
                var issueUrl = baseUrl + issue.ToString();
                LogServices.WriteProgressLog("Formed new url " + issueUrl);

                var pageHtml = GetWebText(issueUrl);
                if (pageHtml == null)
                {
                    // Download failed; move on to the next issue.
                    continue;
                }

                if (!isWeeklyNewspaper(pageHtml))
                {
                    continue;
                }

                var fetched = GetArticles(pageHtml);
                if (fetched == null)
                {
                    continue;
                }

                LogServices.WriteProgressLog("Received list of urls ");
                foreach (Article candidate in fetched)
                {
                    if (candidate.Url == null)
                    {
                        continue;
                    }

                    LogServices.WriteProgressLog("Preparing to add ");
                    collected.Add(candidate);

                    // Persist straight away; a failed insert is logged and does not stop the run.
                    LogServices.WriteProgressLog("Trying to insert article to database");
                    try
                    {
                        articleService.Insert(candidate);
                        LogServices.WriteProgressLog("Successfully inserted to database");
                    }
                    catch (Exception)
                    {
                        LogServices.WriteProgressLog("Could not add this article to database" + candidate.Url);
                    }
                }
            }

            return collected;
        }
Beispiel #5
0
 /// <summary>
 /// Extracts the text content of the first article element of a page.
 /// </summary>
 /// <param name="htmlText">Raw HTML of the article page.</param>
 /// <returns>
 /// The value returned by ContinueUntilOnlyTextLeft for the article element, starting from an
 /// empty string; or a fixed "content could not be loaded" message when the page has no
 /// article element or processing throws.
 /// </returns>
 public static string getArticleContent(string htmlText)
 {
     // Same fallback text callers received before on any failure.
     const string loadFailedMessage = "Содержимое не может быть загружено из-за исключения";

     try
     {
         var parser        = new HtmlParser();
         var node          = parser.Parse(htmlText);
         var contentFinder = new FindTagsVisitor(tag => tag.Name == "article");
         node.AcceptVisitor(contentFinder);

         // Check for a missing element directly instead of relying on an indexer exception;
         // the throw-away Tag allocation and the redundant cast are gone.
         var articleTag = contentFinder.Result.FirstOrDefault();
         if (articleTag == null)
         {
             LogServices.WriteErrorLog("getArticleContent: no article element found");
             return loadFailedMessage;
         }

         string content = ContinueUntilOnlyTextLeft(articleTag, "");
         LogServices.WriteProgressLog("Fetched content of article ");
         return content;
     }
     catch (Exception ex)
     {
         // Record the cause before falling back to the placeholder text.
         LogServices.WriteErrorLog("getArticleContent failed: " + ex);
         return loadFailedMessage;
     }
 }
Beispiel #6
0
 /// <summary>
 /// Click handler for the download button: runs RequestServices.GetNewspapers with the
 /// argument 30500 and reports completion or failure in a message box.
 /// </summary>
 /// <param name="sender">Control that raised the event (unused).</param>
 /// <param name="e">Event data (unused).</param>
 private void btnDownload_Click(object sender, EventArgs e)
 {
     try
     {
         LogServices.WriteProgressLog("Download button clicked");
         RequestServices.GetNewspapers(30500);
         LogServices.WriteProgressLog("All articles fetched");
         MessageBox.Show("Finished downloading");
     }
     catch (Exception ex)
     {
         // Build the text once so the log entry and the dialog show the same timestamp.
         var message = "Some error occurred at " + DateTime.Now.ToString();

         // The log also gets the exception itself; the user only sees the short text.
         LogServices.WriteErrorLog(message + ": " + ex);
         MessageBox.Show(message);
     }
 }
Beispiel #7
0
 /// <summary>
 /// Builds an article for every link found on a downloaded newspaper page. The page is the
 /// one that lists the links to the individual articles.
 /// </summary>
 /// <param name="htmlText">Whole HTML of the newspaper page.</param>
 /// <returns>
 /// The created articles (possibly empty), or <c>null</c> when the links could not be
 /// extracted or creating an article throws. Callers must check for null.
 /// </returns>
 public static List <Article> GetArticles(string htmlText)
 {
     try
     {
         var linksList = extractLinks(htmlText);
         if (linksList == null)
         {
             // Previously detected only through a NullReferenceException on ToString().
             LogServices.WriteProgressLog("No links could be extracted from the page");
             return null;
         }

         // Log the number of links; List.ToString() only printed the type name.
         LogServices.WriteProgressLog("Formed list of links: " + linksList.Count);

         var articles = new List <Article>(linksList.Count);
         foreach (string link in linksList)
         {
             var article = CreateArticle(link);
             if (article == null)
             {
                 // Skip one failed article instead of discarding the whole list.
                 LogServices.WriteProgressLog("Could not create article for " + link);
                 continue;
             }

             LogServices.WriteProgressLog("Created new article");
             articles.Add(article);
             LogServices.WriteProgressLog("Added new article" + article.ToString());
         }

         return articles;
     }
     catch (Exception ex)
     {
         // The null result is kept for existing callers; the cause is now recorded.
         LogServices.WriteErrorLog("GetArticles failed: " + ex);
         return null;
     }
 }
Beispiel #8
0
        /// <summary>
        /// Downloads the body of <paramref name="url"/> with an HTTP GET and returns it as text.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>
        /// The response body read with StreamReader's default encoding, or <c>null</c> when the
        /// request fails with a <see cref="WebException"/> (HTTP error status, connection
        /// failure, ...). Other exception types are not caught and propagate to the caller.
        /// </returns>
        public static string GetWebText(string url)
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);

                // Dispose response, stream and reader deterministically; previously only the
                // response was closed and the reader was left to the garbage collector.
                using (var response = (HttpWebResponse)request.GetResponse())
                using (var body = response.GetResponseStream())
                using (var reader = new StreamReader(body))
                {
                    string htmlText = reader.ReadToEnd();
                    LogServices.WriteProgressLog("htmlText fetched " + url);
                    return htmlText;
                }
            }
            catch (WebException e)
            {
                // The exception may carry a response (always for protocol errors); it must be
                // released whichever branch is taken.
                using (WebResponse attached = e.Response)
                {
                    var errorResponse = attached as HttpWebResponse;
                    if (e.Status == WebExceptionStatus.ProtocolError && errorResponse != null)
                    {
                        LogServices.WriteProgressLog("Page not found. Errorcode:" + ((int)errorResponse.StatusCode).ToString());
                    }
                    else
                    {
                        LogServices.WriteProgressLog("Error:" + (e.Status).ToString());
                    }
                }

                return null;
            }
        }