コード例 #1
0
        /// <summary>
        /// Grabs the article list, interprets every article into a <c>PostMessage</c>, serializes it to
        /// JSON (dates formatted as <c>yyyy-MM-dd</c>) and prints the result of posting it.
        /// A failure on one article is logged and does not stop the remaining articles.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        /// <remarks>
        /// NOTE(review): the method is <c>async void</c>, so the caller cannot await it or observe its
        /// exceptions. The signature is kept as-is because callers are not visible here; consider
        /// returning <c>Task</c> once the call site can be updated.
        /// </remarks>
        static async void MainAsync(string[] args)
        {
            List<HtmlNode> articles  = Grab.GrabArticles();
            int            articleNo = 0;

            foreach (HtmlNode article in articles)
            {
                articleNo++;
                Console.Write($"Beginning scrapping article {articleNo}. ");
                try
                {
                    PostMessage post;
                    Interpreter.Interpret(article, out post);

                    var json = JsonConvert.SerializeObject(post, new IsoDateTimeConverter
                    {
                        DateTimeFormat = "yyyy-MM-dd"
                    });

                    Console.WriteLine($"Post article {articleNo} API result: ");
                    Console.ForegroundColor = ConsoleColor.Red;
                    try
                    {
                        Console.WriteLine(await PostArticle.Post(json));
                    }
                    finally
                    {
                        // Restore the colour even when the post fails; otherwise every following
                        // line would keep being printed in red.
                        Console.ForegroundColor = ConsoleColor.White;
                    }
                    Console.WriteLine("-------------------------------------------------------");
                }
                catch (Exception ex)
                {
                    // Best effort: log the failure and carry on with the next article.
                    logger.Error(ex);
                }
            }

            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine($"{articleNo} articles scrapped. Press any key to exit.");
            Console.ReadLine();
        }
コード例 #2
0
        // Checks that Interpreter.Interpret fills PostMessage.date for the first article returned by
        // Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Date()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT
            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual(DateTime.Parse("2017-01-19T11:22:06+00:00"), postMsg.date);
            #endregion
        }
コード例 #3
0
        // Checks that Interpreter.Interpret sets PostMessage.institution to "dialog" for the first
        // article returned by Grab.GrabArticles().
        public void Institution()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("dialog", postMsg.institution);

            #endregion
        }
コード例 #4
0
        // Checks that Interpreter.Interpret classifies the first article returned by
        // Grab.GrabArticles() as type "OTHER".
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Type()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("OTHER", postMsg.type);

            #endregion
        }
コード例 #5
0
        // Checks the title that Interpreter.Interpret extracts from the first article returned by
        // Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Title()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("Consultare publică CONECT Catalogul Organizațiilor Neguvernamentale pentru Evidență, Consultare și Transparență", postMsg.title);

            #endregion
        }
コード例 #6
0
        // Checks the identifier that Interpreter.Interpret builds for the first article returned by
        // Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Identifier()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("dialog_category-proiecte-de-acte-normative_post-6300", postMsg.identifier);

            #endregion
        }
コード例 #7
0
        // Checks that the first document Interpreter.Interpret attaches to the first article returned
        // by Grab.GrabArticles() carries the expected article URL.
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Documents()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("http://dialogsocial.gov.ro/2017/01/consultare-publica-conect-catalogul-organizatiilor-neguvernamentale-pentru-evidenta-consultare-si-transparenta/", postMsg.documents[0].url);

            #endregion
        }
コード例 #8
0
        // Checks the contact e-mail that Interpreter.Interpret extracts for the first article returned
        // by Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void ContactEmail()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("*****@*****.**", postMsg.contact.email);

            #endregion
        }
コード例 #9
0
        // Checks that Interpreter.Interpret computes 14 feedback days for the first article returned
        // by Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void FeedbackDays()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual((uint)14, postMsg.feedback_days);

            #endregion
        }
コード例 #10
0
        // Checks the description that Interpreter.Interpret extracts for the first article returned by
        // Grab.GrabArticles().
        // NOTE(review): the expected value is hard-coded, so this presumably relies on the first
        // grabbed article never changing - confirm.
        public void Description()
        {
            #region ARRANGE

            List<HtmlNode> articles = Grab.GrabArticles();
            PostMessage    postMsg;

            #endregion

            #region ACT
            Interpreter.Interpret(articles[0], out postMsg);
            #endregion

            #region ASSERT
            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual("Ministerul Consultării Publice și Dialogului Social (MCPDS) lansează, astăzi, 19 ianuarie a.c. în consultare publică instrumentul CONECT Catalogul organizațiilor neguvernamentale pentru evidență, consultare și transparență. „Ne dorim să oferim cât mai mult sprijin comunităților de expertiză și să le aducem mai aproape de decizia publică. Acest lucru...", postMsg.description);

            #endregion
        }
コード例 #11
0
        // Runs the string-based Interpret overload on the outer HTML of the first grabbed article
        // and asserts on the resulting PostMessage.
        // NOTE(review): the assertion compares the whole PostMessage with an empty string, which
        // looks like a placeholder rather than a real expectation - confirm what this test is
        // meant to verify.
        public void FeedbackDays()
        {
            // Arrange
            List<HtmlNode> grabbed = Grab.GrabArticles();
            PostMessage    interpreted;

            // Act
            Interpret(grabbed[0].OuterHtml, out interpreted);

            // Assert
            Assert.AreEqual(interpreted, "");
        }
コード例 #12
0
        /// <summary>
        /// Builds a <c>PostMessage</c> from an article node: documents, contact, date, feedback days,
        /// description, identifier, institution, title and type. Each part is extracted on a
        /// best-effort basis; a failure in one part is logged and the remaining parts are still
        /// attempted.
        /// </summary>
        /// <param name="input">Article node whose <c>OuterHtml</c> is scanned with regular expressions.</param>
        /// <param name="postMsg">Receives the newly created message; parts that could not be extracted keep their defaults.</param>
        public static void Interpret(HtmlNode input, out PostMessage postMsg)
        {
            postMsg = new PostMessage();
            HtmlNode       fullArticle = null;
            List<Document> docs        = null;

            #region LIST OF DOCUMENTS && CONTACT
            try
            {
                // The last anchor found in the node is used as the link to the full article.
                MatchCollection links = GetMatches(input.OuterHtml, "<a[^>]* href=\"([^ \"]*)\"");
                Match           link  = links[links.Count - 1];
                docs = new List<Document> {
                    new Document {
                        url = CustomTrimText(link.ToString()).Replace("\"", ""), type = "article"
                    }
                };

                // First doc is the full article; it is downloaded for the contact, attachment and
                // feedback-deadline lookups below.
                fullArticle = Grab.GrabArticles(docs[0].url)[0];
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }

            #region CONTACT
            try
            {
                Match articleTail = GetMatches(fullArticle.InnerText, EndOfArticleMarker)[0];
                if (articleTail.Success)
                {
                    postMsg.contact = new Contact {
                        email = AppendEmails(articleTail.Value), tel = GetPhoneNumber(articleTail.Value), addr = ""
                    };
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            #region LIST OF PDFS, LINKS, YOUTUBE VIDEOS ETC
            try
            {
                Match searchForLinksToDocs = GetMatches(fullArticle.InnerHtml, EndOfArticleMarker)[0];
                if (searchForLinksToDocs.Success)
                {
                    MatchCollection docsMatches = GetMatches(searchForLinksToDocs.Value, "(?<=<a href=\")(.*?)(?=\")");
                    if (docsMatches.Count > 0)
                    {
                        docs.AddRange(ExtractDocs(docsMatches));
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            //< !--Facebook Comments Plugin for WordPress: http://peadig.com/wordpress-plugins/facebook-comments/ -->
            postMsg.documents = docs;

            #endregion

            #region DATE && FEEDBACK_DAYS
            try
            {
                Match dateMatch = Regex.Match(input.OuterHtml, "(?<=datetime=\").*(?=\")");
                if (dateMatch.Success)
                {
                    postMsg.date = DateTime.Parse(CustomTrimText(dateMatch.Value));

                    #region FEEDBACK DAYS
                    Match untilDateMatch = GetUntilDateMatch(fullArticle);
                    if (untilDateMatch.Success)
                    {
                        string untilDateToParse = untilDateMatch.Value.Substring(0, untilDateMatch.Value.IndexOf('.'));

                        // NOTE(review): this changes the culture of the current thread for everything
                        // that runs afterwards. Kept because callers may rely on it; passing the
                        // culture to DateTime.Parse directly would avoid the side effect.
                        Thread.CurrentThread.CurrentCulture = new CultureInfo("ro-RO");
                        DateTime parsedDate = DateTime.Parse(untilDateToParse);

                        // Only month and day are taken from the parsed deadline; the year is the
                        // post's year. Compare on whole days so the time of day of the post cannot
                        // make a same-day deadline look like it lies in the past.
                        DateTime postedOn = postMsg.date.Date;
                        DateTime deadline = new DateTime(postedOn.Year, parsedDate.Month, parsedDate.Day);
                        if (deadline < postedOn)
                        {
                            // The deadline falls in the following year. DateTime is immutable, so
                            // the result of AddYears has to be assigned back.
                            deadline = deadline.AddYears(1);
                        }
                        if (deadline > postedOn)
                        {
                            postMsg.feedback_days = (uint)(deadline - postedOn).Days;
                        }
                    }
                    #endregion
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            #region DESCRIPTION
            try
            {
                Match descriptionMatch = GetMatches(input.OuterHtml, "<p>(.*)</p>")[0];
                if (descriptionMatch.Success)
                {
                    postMsg.description = CustomTrimText(descriptionMatch.Value);
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            #region IDENTIFIER
            try
            {
                Match id = GetMatches(input.OuterHtml, "post-[\\d]+")[0];
                if (id.Success)
                {
                    postMsg.identifier = $"dialog_category-proiecte-de-acte-normative_{CustomTrimText(id.Value)}";
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            #region INSTITUTION
            postMsg.institution = "dialog";
            #endregion

            #region TITLE
            try
            {
                Match titleMatch = Regex.Match(input.OuterHtml, "(?<=title=\").*(?=\" rel)");
                if (titleMatch.Success)
                {
                    postMsg.title = CustomTrimText(titleMatch.Value);
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
            #endregion

            #region TYPE
            // Default value, used when the title is missing or contains none of the keywords.
            postMsg.type = "OTHER";

            // The title stays null when the TITLE step found nothing; skip the keyword scan then
            // instead of letting a null dereference escape this otherwise best-effort method.
            if (postMsg.title != null)
            {
                // Lower-case once; the type of the first keyword contained in the title wins.
                string txtForType = RemoveDiacritics(postMsg.title).ToLower();
                foreach (var e in filtruActNormativ)
                {
                    if (txtForType.Contains(e.Key))
                    {
                        postMsg.type = e.Value;
                        break;
                    }
                }
            }
            #endregion
        }
コード例 #13
0
        /// <summary>
        /// Builds a <c>PostMessage</c> from raw article HTML: date, description, documents, contact
        /// e-mail, identifier, institution and title. Nothing is caught here, so the first step
        /// that fails propagates its exception to the caller.
        /// </summary>
        /// <param name="input">Article HTML scanned with regular expressions.</param>
        /// <param name="postMsg">Receives the newly created, populated message.</param>
        private void Interpret(string input, out PostMessage postMsg)
        {
            postMsg = new PostMessage();

            #region DATE
            Match dateAttribute = Regex.Match(input, "(?<=datetime=\").*(?=\")");
            postMsg.date = DateTime.Parse(CustomTrimText(dateAttribute.Value));
            #endregion

            #region DESCRIPTION
            MatchCollection paragraphs = GetMatches(input, "<p>(.*)</p>");
            postMsg.description = CustomTrimText(paragraphs[0].Value);
            #endregion

            #region LIST OF DOCUMENTS && CONTACT

            // The last anchor in the input is used as the link to the full article.
            MatchCollection anchors    = GetMatches(input, "<a[^>]* href=\"([^ \"]*)\"");
            Match           lastAnchor = anchors[anchors.Count - 1];
            var             collected  = new List<Document>();
            collected.Add(new Document {
                url = CustomTrimText(lastAnchor.Value).Replace("\"", ""), type = "article"
            });

            // The first document is the full article; download it for the lookups below.
            HtmlNode articlePage = Grab.GrabArticles(collected[0].url)[0];

            #region CONTACT
            string tailText  = GetMatches(articlePage.InnerText, EndOfArticleMarker)[0].Value;
            string addresses = AppendEmails(tailText);
            postMsg.contact = new Contact {
                email = addresses.ToString()
            };
            #endregion

            #region LIST OF PDFS, LINKS, YOUTUBE VIDEOS ETC
            string          tailHtml = GetMatches(articlePage.InnerHtml, EndOfArticleMarker)[0].Value;
            MatchCollection hrefs    = GetMatches(tailHtml, "(?<=<a href=\")(.*?)(?=\")");
            collected.AddRange(ExtractDocs(hrefs));
            #endregion

            // NOTE(review): EndOfArticleMarker presumably matches up to the Facebook comments
            // plugin marker in the page - confirm against its definition.
            postMsg.documents = collected;

            #endregion

            #region IDENTIFIER
            string postId = GetMatches(input, "post-[\\d]+")[0].Value;
            postMsg.identifier = $"czl_dialog_category-proiecte-de-acte-normative_{CustomTrimText(postId)}";
            #endregion

            #region INSTITUTION
            postMsg.institution = "dialog";
            #endregion

            #region TITLE
            Match titleAttribute = Regex.Match(input, "(?<=title=\").*(?=\" rel)");
            postMsg.title = CustomTrimText(titleAttribute.Value);
            #endregion
        }