/// <summary>
/// Grabs the article list, interprets every article into a <c>PostMessage</c>,
/// serializes it to JSON (dates formatted as <c>yyyy-MM-dd</c>) and posts it,
/// echoing each API response to the console.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
// NOTE(review): this is `async void`, so callers cannot await it and an exception
// thrown outside the per-article try block (e.g. by Grab.GrabArticles) cannot be
// observed by them. The signature is kept as-is for compatibility; consider Task.
static async void MainAsync(string[] args)
{
    List<HtmlNode> articles = Grab.GrabArticles();
    int articleNo = 0;

    foreach (HtmlNode a in articles)
    {
        articleNo++;
        Console.Write($"Beginning scrapping article {articleNo}. ");

        try
        {
            PostMessage post;
            Interpreter.Interpret(a, out post);

            var json = JsonConvert.SerializeObject(
                post,
                new IsoDateTimeConverter() { DateTimeFormat = "yyyy-MM-dd" });

            Console.WriteLine($"Post article {articleNo} API result: ");
            Console.ForegroundColor = ConsoleColor.Red;
            try
            {
                Console.WriteLine(await PostArticle.Post(json));
            }
            finally
            {
                // Restore the colour even when the post fails; otherwise every
                // later line would keep being printed in red.
                Console.ForegroundColor = ConsoleColor.White;
            }

            Console.WriteLine("-------------------------------------------------------");
        }
        catch (Exception ex)
        {
            // One failing article must not stop the remaining ones.
            logger.Error(ex);
        }
    }

    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine($"{articleNo} articles scrapped. Press any key to exit.");
    Console.ReadLine();
}
// Verifies the date that Interpreter.Interpret extracts from the first article
// returned by Grab.GrabArticles().
public void Date()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual(DateTime.Parse("2017-01-19T11:22:06+00:00"), PostMsg.date);
    #endregion
}
// Verifies the institution that Interpreter.Interpret assigns to the first article
// returned by Grab.GrabArticles().
public void Institution()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("dialog", PostMsg.institution);
    #endregion
}
// Verifies the type that Interpreter.Interpret assigns to the first article
// returned by Grab.GrabArticles().
public void Type()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("OTHER", PostMsg.type);
    #endregion
}
// Verifies the title that Interpreter.Interpret extracts from the first article
// returned by Grab.GrabArticles().
public void Title()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("Consultare publică CONECT Catalogul Organizațiilor Neguvernamentale pentru Evidență, Consultare și Transparență", PostMsg.title);
    #endregion
}
// Verifies the identifier that Interpreter.Interpret builds for the first article
// returned by Grab.GrabArticles().
public void Identifier()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("dialog_category-proiecte-de-acte-normative_post-6300", PostMsg.identifier);
    #endregion
}
// Verifies the URL of the first document that Interpreter.Interpret attaches to
// the first article returned by Grab.GrabArticles().
public void Documents()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("http://dialogsocial.gov.ro/2017/01/consultare-publica-conect-catalogul-organizatiilor-neguvernamentale-pentru-evidenta-consultare-si-transparenta/", PostMsg.documents[0].url);
    #endregion
}
// Verifies the contact e-mail that Interpreter.Interpret extracts for the first
// article returned by Grab.GrabArticles().
public void ContactEmail()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("*****@*****.**", PostMsg.contact.email);
    #endregion
}
// Verifies the feedback period (in days) that Interpreter.Interpret computes for
// the first article returned by Grab.GrabArticles().
public void FeedbackDays()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual((uint)14, PostMsg.feedback_days);
    #endregion
}
// Verifies the description that Interpreter.Interpret extracts from the first
// article returned by Grab.GrabArticles().
public void Description()
{
    #region ARRANGE
    List<HtmlNode> articles = Grab.GrabArticles();
    PostMessage PostMsg;
    #endregion

    #region ACT
    Interpreter.Interpret(articles[0], out PostMsg);
    #endregion

    #region ASSERT
    // Uncomment to dump the extracted description while debugging:
    //Console.WriteLine(PostMsg.description);

    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual("Ministerul Consultării Publice și Dialogului Social (MCPDS) lansează, astăzi, 19 ianuarie a.c. în consultare publică instrumentul CONECT Catalogul organizațiilor neguvernamentale pentru evidență, consultare și transparență. „Ne dorim să oferim cât mai mult sprijin comunităților de expertiză și să le aducem mai aproape de decizia publică. Acest lucru...", PostMsg.description);
    #endregion
}
// Feeds the outer HTML of the first grabbed article to the string-based Interpret
// overload and asserts on the resulting PostMessage.
public void FeedbackDays()
{
    #region ARRANGE
    PostMessage interpreted;
    List<HtmlNode> grabbed = Grab.GrabArticles();
    HtmlNode firstArticle = grabbed[0];
    #endregion

    #region ACT
    Interpret(firstArticle.OuterHtml, out interpreted);
    #endregion

    #region ASSERT
    // NOTE(review): this compares the whole PostMessage with an empty string and
    // checks nothing about feedback days; it looks like a placeholder whose failure
    // output was used to inspect the object. Confirm the intended expectation.
    Assert.AreEqual(interpreted, "");
    #endregion
}
/// <summary>
/// Builds a <c>PostMessage</c> from an article listing node: documents, contact,
/// date, feedback days, description, identifier, institution, title and type.
/// </summary>
/// <remarks>
/// Each extraction step runs in its own try/catch that logs the exception and moves
/// on, so a failing step leaves the corresponding field at whatever value a new
/// <c>PostMessage</c> starts with instead of aborting the whole interpretation.
/// </remarks>
/// <param name="input">HTML node of the article as it appears in the listing.</param>
/// <param name="postMsg">Receives the newly created, populated message.</param>
public static void Interpret(HtmlNode input, out PostMessage postMsg)
{
    postMsg = new PostMessage();
    HtmlNode FullArticle = null;
    List<Document> docs = null;

    #region LIST OF DOCUMENTS && CONTACT
    try
    {
        MatchCollection links = GetMatches(input.OuterHtml, "<a[^>]* href=\"([^ \"]*)\"");

        // The last link found in the listing node is used as the article's own URL.
        Match link = links[links.Count - 1];
        docs = new List<Document>
        {
            new Document
            {
                url = CustomTrimText(link.ToString()).Replace("\"", ""),
                type = "article"
            }
        };

        // First doc is the full article; fetch it for the steps below.
        FullArticle = Grab.GrabArticles(docs[0].url)[0];
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }

    #region CONTACT
    try
    {
        Match fullArt = GetMatches(FullArticle.InnerText, EndOfArticleMarker)[0];
        if (fullArt.Success)
        {
            postMsg.contact = new Contact
            {
                email = AppendEmails(fullArt.Value),
                tel = GetPhoneNumber(fullArt.Value),
                addr = ""
            };
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    #region LIST OF PDFS, LINKS, YOUTUBE VIDEOS ETC
    try
    {
        Match searchForLinksToDocs = GetMatches(FullArticle.InnerHtml, EndOfArticleMarker)[0];
        if (searchForLinksToDocs.Success)
        {
            MatchCollection docsMatches = GetMatches(searchForLinksToDocs.Value, "(?<=<a href=\")(.*?)(?=\")");
            if (docsMatches.Count > 0)
            {
                List<Document> docsToAdd = ExtractDocs(docsMatches);
                docs.AddRange(docsToAdd);
            }
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    //< !--Facebook Comments Plugin for WordPress: http://peadig.com/wordpress-plugins/facebook-comments/ -->
    postMsg.documents = docs;
    #endregion

    #region DATE && FEEDBACK_DAYS
    try
    {
        Match dateMatch = Regex.Match(input.OuterHtml, "(?<=datetime=\").*(?=\")");
        if (dateMatch.Success)
        {
            postMsg.date = DateTime.Parse(CustomTrimText(dateMatch.Value));

            #region FEEDBACK DAYS
            Match untilDateMatch = GetUntilDateMatch(FullArticle);
            if (untilDateMatch.Success)
            {
                // Only the text before the first '.' is handed to the date parser.
                string untilDateToParse = untilDateMatch.Value.Substring(0, untilDateMatch.Value.IndexOf('.'));

                // NOTE(review): this switches the culture of the current thread for
                // everything that runs afterwards, not only for the parse below. It is
                // kept because other code may rely on it; passing the culture straight
                // to DateTime.Parse would avoid the side effect.
                Thread.CurrentThread.CurrentCulture = new CultureInfo("ro-RO");
                DateTime ParsedDate = DateTime.Parse(untilDateToParse);

                // Only month and day are taken from the parsed deadline; the year comes
                // from the publication date.
                DateTime postDay = postMsg.date.Date;
                DateTime ProcessedDate = new DateTime(postDay.Year, ParsedDate.Month, ParsedDate.Day);

                // A deadline that falls on an earlier calendar day than the publication
                // day belongs to the following year. DateTime is immutable, so the
                // result of AddYears has to be assigned. The comparison uses whole days
                // so that a deadline on the publication day itself is not pushed a year.
                if (ProcessedDate < postDay)
                {
                    ProcessedDate = ProcessedDate.AddYears(1);
                }

                if (ProcessedDate > postDay)
                {
                    postMsg.feedback_days = (uint)Math.Abs((ProcessedDate - postDay).Days);
                }
            }
            #endregion
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    #region DESCRIPTION
    try
    {
        Match descriptionMatch = GetMatches(input.OuterHtml, "<p>(.*)</p>")[0];
        if (descriptionMatch.Success)
        {
            postMsg.description = CustomTrimText(descriptionMatch.Value);
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    #region IDENTIFIER
    try
    {
        Match id = GetMatches(input.OuterHtml, "post-[\\d]+")[0];
        if (id.Success)
        {
            postMsg.identifier = $"dialog_category-proiecte-de-acte-normative_{CustomTrimText(id.Value)}";
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    #region INSTITUTION
    postMsg.institution = "dialog";
    #endregion

    #region TITLE
    try
    {
        Match titleMatch = Regex.Match(input.OuterHtml, "(?<=title=\").*(?=\" rel)");
        if (titleMatch.Success)
        {
            postMsg.title = CustomTrimText(titleMatch.Value);
        }
    }
    catch (Exception ex)
    {
        logger.Error(ex);
    }
    #endregion

    #region TYPE
    // Default value, in case title contains no matches
    postMsg.type = "OTHER";

    // The title may still be unset when the extraction above failed; fall back to an
    // empty string so this unguarded section cannot fail on a missing title.
    string txtForType = RemoveDiacritics(postMsg.title ?? string.Empty).ToLower();

    // Check if the title contains keywords, and adjust type to the first match.
    foreach (var e in filtruActNormativ)
    {
        if (txtForType.Contains(e.Key))
        {
            postMsg.type = e.Value;
            break;
        }
    }
    #endregion
}
// Builds a PostMessage from the raw HTML of an article listing entry: date,
// description, documents, contact e-mail, identifier, institution and title.
// There is no error handling here: any failing lookup throws to the caller.
private void Interpret(string input, out PostMessage postMsg)
{
    postMsg = new PostMessage();

    #region DATE
    Match dateHit = Regex.Match(input, "(?<=datetime=\").*(?=\")");
    postMsg.date = DateTime.Parse(CustomTrimText(dateHit.Value));
    #endregion

    #region DESCRIPTION
    MatchCollection paragraphHits = GetMatches(input, "<p>(.*)</p>");
    postMsg.description = CustomTrimText(paragraphHits[0].Value);
    #endregion

    #region LIST OF DOCUMENTS && CONTACT
    MatchCollection anchorHits = GetMatches(input, "<a[^>]* href=\"([^ \"]*)\"");
    Match lastAnchor = anchorHits[anchorHits.Count - 1];

    // The first document is always the full article itself.
    var collected = new List<Document>();
    collected.Add(new Document
    {
        url = CustomTrimText(lastAnchor.ToString()).Replace("\"", ""),
        type = "article"
    });
    HtmlNode articlePage = Grab.GrabArticles(collected[0].url)[0];

    #region CONTACT
    string closingText = GetMatches(articlePage.InnerText, EndOfArticleMarker)[0].ToString();
    string mailList = AppendEmails(closingText);
    postMsg.contact = new Contact { email = mailList.ToString() };
    #endregion

    #region LIST OF PDFS, LINKS, YOUTUBE VIDEOS ETC
    string closingHtml = GetMatches(articlePage.InnerHtml, EndOfArticleMarker)[0].ToString();
    collected.AddRange(ExtractDocs(GetMatches(closingHtml, "(?<=<a href=\")(.*?)(?=\")")));
    #endregion

    //< !--Facebook Comments Plugin for WordPress: http://peadig.com/wordpress-plugins/facebook-comments/ -->
    postMsg.documents = collected;
    #endregion

    #region IDENTIFIER
    string postId = GetMatches(input, "post-[\\d]+")[0].ToString();
    postMsg.identifier = $"czl_dialog_category-proiecte-de-acte-normative_{CustomTrimText(postId)}";
    #endregion

    #region INSTITUTION
    postMsg.institution = "dialog";
    #endregion

    #region TITLE
    Match titleHit = Regex.Match(input, "(?<=title=\").*(?=\" rel)");
    postMsg.title = CustomTrimText(titleHit.Value);
    #endregion
}