/// <summary>
/// Decides whether a downloaded page is an issue of the weekly newspaper.
/// The page qualifies when the text of its first <c>h1</c> element with
/// <c>class="title"</c> contains <c>Еженедельник "Аргументы и Факты" №</c>.
/// </summary>
/// <param name="htmlText">Raw HTML of the page to inspect.</param>
/// <returns>
/// <c>true</c> when the title matches; <c>false</c> when it does not, when no
/// suitable heading exists, or when parsing fails for any reason.
/// </returns>
public static bool isWeeklyNewspaper(string htmlText)
{
    // The page is only interesting when its title carries the weekly-edition marker.
    const string WeeklyTitlePattern = @"(Еженедельник \""Аргументы и Факты\"" №)";

    try
    {
        var parser = new HtmlParser();
        var root = parser.Parse(htmlText);

        // Compare the class attribute itself: ContainsValue("title") would also
        // accept headings such as <h1 id="title"> that merely carry the value
        // on some other attribute.
        var finder = new FindTagsVisitor(tag =>
            tag.Name == "h1"
            && tag.Attributes.ContainsKey("class")
            && "title".Equals(tag.Attributes["class"]));
        root.AcceptVisitor(finder);

        // A page without the heading is simply "not weekly", not an error.
        var heading = finder.Result.FirstOrDefault();
        if (heading == null)
        {
            return false;
        }

        // Only a leading text node can carry the title.
        var firstChild = heading.Children.FirstOrDefault();
        if (!(firstChild is Majestic13.HtmlNode.Text))
        {
            return false;
        }

        var title = ((Majestic13.HtmlNode.Text)firstChild).Value;
        if (title == null || !Regex.IsMatch(title, WeeklyTitlePattern))
        {
            return false;
        }

        LogServices.WriteProgressLog("it is weekly ");
        return true;
    }
    catch (Exception)
    {
        // Safety net for parser failures: treat the page as not weekly.
        LogServices.WriteProgressLog("исключение");
        return false;
    }
}
/// <summary>
/// Collects article links from a newspaper page. For every <c>h2</c> element
/// with <c>class="data_title mbottom10"</c>, the <c>href</c> of the first
/// <c>a</c> element found inside it is added to the result.
/// </summary>
/// <param name="htmlNewspaperText">Raw HTML of the newspaper page.</param>
/// <returns>
/// The extracted links in document order (possibly empty), or <c>null</c>
/// when the page could not be processed.
/// </returns>
public static List<string> extractLinks(string htmlNewspaperText)
{
    try
    {
        var links = new List<string>();
        var parser = new HtmlParser();
        var root = parser.Parse(htmlNewspaperText);

        // Compare the class attribute itself; ContainsValue would match the
        // text on any attribute of the heading.
        var headingFinder = new FindTagsVisitor(tag =>
            tag.Name == "h2"
            && tag.Attributes.ContainsKey("class")
            && "data_title mbottom10".Equals(tag.Attributes["class"]));
        root.AcceptVisitor(headingFinder);

        foreach (Majestic13.HtmlNode.Tag heading in headingFinder.Result)
        {
            var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href"));
            heading.AcceptVisitor(linkFinder);

            // A heading without a link is skipped instead of discarding every
            // link already collected from the page.
            var anchor = linkFinder.Result.FirstOrDefault();
            if (anchor == null)
            {
                continue;
            }

            // Read href explicitly: the first attribute of the anchor is not
            // guaranteed to be the href (e.g. <a class="x" href="...">).
            var href = anchor.Attributes["href"];
            if (href == null)
            {
                continue;
            }

            links.Add(href.ToString());
        }

        LogServices.WriteProgressLog("extracted links");
        return links;
    }
    catch (Exception ex)
    {
        // Callers treat null as "nothing usable on this page"; record why.
        LogServices.WriteErrorLog("Could not extract links: " + ex.Message);
        return null;
    }
}
/// <summary>
/// Extracts the text of the first <c>div</c> element with
/// <c>class="vo_o_text"</c> from the given page, using
/// <c>RequestServices.ContinueUntilOnlyTextLeft</c> to gather its content.
/// </summary>
/// <param name="htmlText">Raw HTML of the page.</param>
/// <returns>The text produced by <c>ContinueUntilOnlyTextLeft</c> for that element.</returns>
/// <remarks>
/// There is no exception handling here: if no matching element exists, the
/// access to the first result throws and the exception reaches the caller.
/// </remarks>
public static string getQAContent(string htmlText)
{
    var parser = new HtmlParser();
    var root = parser.Parse(htmlText);

    // Compare the class attribute itself; ContainsValue("vo_o_text") would
    // also accept a div that carries the value on some other attribute.
    var finder = new FindTagsVisitor(tag =>
        tag.Name == "div"
        && tag.Attributes.ContainsKey("class")
        && "vo_o_text".Equals(tag.Attributes["class"]));
    root.AcceptVisitor(finder);

    var answerTag = finder.Result[0];

    // The helper accumulates text onto the string it is given; start from empty.
    var content = RequestServices.ContinueUntilOnlyTextLeft(answerTag, "");
    LogServices.WriteProgressLog("Fetched content of article ");
    return content;
}
/// <summary>
/// Walks the newspaper pages numbered 24355 through <paramref name="Max"/>
/// (inclusive). For each page that downloads and is recognised as a weekly
/// newspaper, its articles are fetched; every article with a non-null
/// <c>Url</c> is added to the result and inserted through <c>articleService</c>.
/// </summary>
/// <param name="Max">Last page number to visit (inclusive).</param>
/// <returns>
/// All articles with a non-null <c>Url</c>, including those whose database
/// insert failed (the failure is only logged).
/// </returns>
public static List<Article> GetNewspapers(int Max)
{
    const int FirstNumber = 24355;
    const string BaseUrl = "http://www.aif.ru/gazeta/number/";

    var collected = new List<Article>();

    for (int number = FirstNumber; number <= Max; number++)
    {
        string pageUrl = BaseUrl + number.ToString();
        LogServices.WriteProgressLog("Formed new url " + pageUrl);

        // Skip pages that failed to download or are not weekly issues.
        string pageHtml = GetWebText(pageUrl);
        if (pageHtml == null || !isWeeklyNewspaper(pageHtml))
        {
            continue;
        }

        List<Article> pageArticles = GetArticles(pageHtml);
        if (pageArticles == null)
        {
            continue;
        }

        LogServices.WriteProgressLog("Received list of urls ");

        foreach (Article candidate in pageArticles)
        {
            if (candidate.Url == null)
            {
                continue;
            }

            LogServices.WriteProgressLog("Preparing to add ");
            collected.Add(candidate);

            // Persist the article; a failed insert must not stop the crawl.
            LogServices.WriteProgressLog("Trying to insert article to database");
            try
            {
                articleService.Insert(candidate);
                LogServices.WriteProgressLog("Successfully inserted to database");
            }
            catch (Exception)
            {
                LogServices.WriteProgressLog("Could not add this article to database" + candidate.Url);
            }
        }
    }

    return collected;
}
/// <summary>
/// Extracts the text of the first <c>article</c> element of the page, using
/// <c>ContinueUntilOnlyTextLeft</c> to gather its content.
/// </summary>
/// <param name="htmlText">Raw HTML of the article page.</param>
/// <returns>
/// The extracted text, or the fixed placeholder message
/// <c>Содержимое не может быть загружено из-за исключения</c> when anything
/// goes wrong (including a page without an <c>article</c> element).
/// </returns>
public static string getArticleContent(string htmlText)
{
    try
    {
        var parser = new HtmlParser();
        var root = parser.Parse(htmlText);

        var articleFinder = new FindTagsVisitor(tag => tag.Name == "article");
        root.AcceptVisitor(articleFinder);

        // Take the first match directly; no placeholder Tag needs to be
        // allocated beforehand.
        var articleTag = (Majestic13.HtmlNode.Tag)articleFinder.Result[0];

        // The helper accumulates text onto the string it is given; start from empty.
        var content = ContinueUntilOnlyTextLeft(articleTag, "");
        LogServices.WriteProgressLog("Fetched content of article ");
        return content;
    }
    catch (Exception ex)
    {
        // Keep the placeholder contract for callers, but record the cause
        // instead of dropping it silently.
        LogServices.WriteErrorLog("Could not extract article content: " + ex.Message);
        return "Содержимое не может быть загружено из-за исключения";
    }
}
/// <summary>
/// Click handler of the download button: runs
/// <c>RequestServices.GetNewspapers(30500)</c> and reports the outcome in a
/// message box. The call is synchronous, so the handler does not return
/// until the download has finished or failed.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnDownload_Click(object sender, EventArgs e)
{
    try
    {
        LogServices.WriteProgressLog("Download button clicked");
        RequestServices.GetNewspapers(30500);
        LogServices.WriteProgressLog("All articles fetched");
        MessageBox.Show("Finished downloading");
    }
    catch (Exception ex)
    {
        // Build the message once so the log entry and the dialog show the
        // same timestamp, and keep the exception details in the log.
        string message = "Some error occurred at " + DateTime.Now.ToString();
        LogServices.WriteErrorLog(message + ": " + ex.GetType().Name + ": " + ex.Message);
        MessageBox.Show(message);
    }
}
/// <summary>
/// Builds the articles of one newspaper page. The links are extracted from
/// the page HTML with <c>extractLinks</c>, and <c>CreateArticle</c> is called
/// once per link.
/// </summary>
/// <param name="htmlText">The whole HTML of the newspaper page that lists the articles.</param>
/// <returns>
/// The created articles in link order, or <c>null</c> when the links could
/// not be extracted or an article could not be created.
/// </returns>
public static List<Article> GetArticles(string htmlText)
{
    try
    {
        var linksList = extractLinks(htmlText);

        // extractLinks signals failure with null; handle it explicitly
        // instead of relying on a NullReferenceException being caught below.
        if (linksList == null)
        {
            LogServices.WriteProgressLog("No links could be extracted");
            return null;
        }

        // Log the number of links; List<T>.ToString() only prints the type name.
        LogServices.WriteProgressLog("Formed list of links: " + linksList.Count.ToString());

        var articles = new List<Article>(linksList.Count);
        foreach (string link in linksList)
        {
            var article = CreateArticle(link);
            if (article == null)
            {
                // One unusable link should not discard the rest of the page.
                LogServices.WriteProgressLog("Could not create article for " + link);
                continue;
            }

            LogServices.WriteProgressLog("Created new article");
            articles.Add(article);
            LogServices.WriteProgressLog("Added new article" + article.ToString());
        }

        return articles;
    }
    catch (Exception ex)
    {
        // Callers treat null as "skip this page"; record why it happened.
        LogServices.WriteErrorLog("Could not build articles: " + ex.Message);
        return null;
    }
}
/// <summary>
/// Downloads the body of <paramref name="url"/> with an HTTP request and
/// returns it as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The response body, or <c>null</c> when the request fails with a
/// <see cref="WebException"/> or the body cannot be read completely.
/// </returns>
public static string GetWebText(string url)
{
    HttpWebResponse response = null;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        response = (HttpWebResponse)request.GetResponse();

        string htmlText;
        // Dispose the reader (and with it the response stream) deterministically.
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            htmlText = reader.ReadToEnd();
        }

        LogServices.WriteProgressLog("htmlText fetched " + url);
        return htmlText;
    }
    catch (WebException e)
    {
        if (e.Status == WebExceptionStatus.ProtocolError)
        {
            // The server answered with an error status; keep the response so
            // the finally block closes it.
            response = (HttpWebResponse)e.Response;
            LogServices.WriteProgressLog("Page not found. Errorcode:" + ((int)response.StatusCode).ToString());
            return null;
        }

        LogServices.WriteProgressLog("Error:" + (e.Status).ToString());
        return null;
    }
    catch (IOException e)
    {
        // A connection dropped while reading the body is reported like any
        // other failed download, so one bad page does not abort the caller.
        LogServices.WriteProgressLog("Error:" + e.Message);
        return null;
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }
}