示例#1
0
        /// <summary>
        /// Runs the support.office.com scraping config against the saved HTML page and checks the
        /// extracted title, abstract, versions, sections, and the lists of the second section.
        /// </summary>
        public void OfficeSupportExtractionTest()
        {
            var configPath = Path.Combine("TestData", "support.office.com.json");
            var config     = StructuredDataConfig.ParseJsonFile(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "support.office.com.html")));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            // Serialize and re-parse so the assertions operate on the JSON output.
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            Assert.AreEqual("Export data to Excel", parsedJson["title"].Value, "The extracted title is incorrect");
            Assert.AreEqual("You can copy data from a Microsoft Office Access 2007 database into a worksheet by exporting a database object to a Microsoft Office Excel 2007 workbook. You do this by using the Export Wizard in Office Access 2007.", parsedJson["abstract"].Value, "The extracted abstract is incorrect");
            Assert.AreEqual(9, parsedJson["versions"].Count, "The extracted versions field is incorrect");

            Assert.AreEqual(5, parsedJson["sections"].Count, "The extracted json should have 5 sections");

            var secondSection = parsedJson["sections"][1];

            Assert.AreEqual("Exporting data to Excel: the basics", secondSection["title"].Value, "The title of the second section is incorrect");
            Assert.AreEqual(9, secondSection["text"]["paragraphs"].Count, "The paragraphs count of the second section is incorrect");
            // The failure message used to say "paragraphs count" here, which pointed at the wrong field.
            Assert.AreEqual(2, secondSection["text"]["unorderedLists"].Count, "The unordered lists count of the second section is incorrect");

            var secondList = secondSection["text"]["unorderedLists"][1];

            Assert.AreEqual("If this is the first time you are exporting data to Excel", secondList["title"].Value, "The title of the second list in the second section is incorrect");
            Assert.AreEqual(4, secondList["items"].Count, "The second list in the second section should have 4 items");
        }
        /// <summary>
        /// Applies ExtractTextTransformation, HtmlDecodeTransformation and
        /// RemoveExtraWhitespaceTransformation to a small inline page and checks the resulting text.
        /// </summary>
        public void RemoveExtraWhitespaceTransformationTest()
        {
            const string expectedText = "A link with adjacent text. \"the final frontier\"";
            const string page         = "<html><body><div id='content'><a href=''>A link</a>with     adjacent text. &quot;the final frontier&quot;</div></body></html>";

            // Single rule: select the content div, then run the three transformations in order.
            const string rulesJson = @"
            {
                'text': {
                    '_xpath': '//div[@id=\'content\']',
                    '_transformations': [
                        'ExtractTextTransformation',
                        'HtmlDecodeTransformation',
                        'RemoveExtraWhitespaceTransformation'
                    ]
                }
            }
            ";

            var rules     = StructuredDataConfig.ParseJsonString(rulesJson);
            var scraper   = new StructuredDataExtractor(rules);
            var extracted = scraper.Extract(page);

            // Serialize and re-parse so the assertion reads the value from the JSON output.
            var     serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
            dynamic doc        = JsonConvert.DeserializeObject(serialized);

            Assert.AreEqual(expectedText, doc["text"].Value);
        }
示例#3
0
        /// <summary>
        /// Loads the listing page, extracts the href of links whose text contains "File", and
        /// downloads each extracted URL into an "img" folder under the current directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var configJson = @"
            {
                '':'//a[contains(text(), \'File\')]/@href'
            }
            ";

            var config = StructuredDataConfig.ParseJsonString(configJson);

            var     html    = @"http://rule34.paheal.net/post/list";
            HtmlWeb web     = new HtmlWeb();
            var     htmlDoc = web.Load(html);
            var     body    = htmlDoc.Text;
            var     path    = Directory.GetCurrentDirectory() + "\\img\\";

            // CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
            Directory.CreateDirectory(path);

            var openScraping    = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(body);

            // Backslash, double quote, opening brace and space are stripped from both ends of each entry.
            char[] charstotrim = { '\x5C', '\x22', '\x7B', '\x20' };

            // NOTE(review): the URLs are recovered by splitting the indented JSON text on commas and
            // cutting fixed-width prefixes/suffixes. This depends on the exact serializer layout and on
            // URLs containing no commas - consider walking the extracted JSON tokens instead.
            var output = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented).Split(',');

            output[0] = output[0].Remove(0, 10);
            output[output.Length - 1] = output[output.Length - 1].Remove(output[output.Length - 1].Length - 8, 8);

            // The client is disposed once all downloads are done (it was previously never disposed).
            using (WebClient wc = new WebClient())
            {
                for (int i = 0; i < output.Length; i++)
                {
                    output[i] = output[i].Remove(0, 7);
                    output[i] = output[i].Trim(charstotrim);

                    // Files are named by index; the extension is "webm" when the URL mentions it,
                    // otherwise the last three characters of the URL.
                    string fileName = output[i].Contains("webm")
                        ? path + i + ".webm"
                        : path + i + "." + output[i].Remove(0, output[i].Length - 3);

                    Console.WriteLine(fileName);

                    wc.DownloadFile(output[i], fileName);
                }
            }

            Console.WriteLine("----------------------------");
            for (int i = 0; i < output.Length; i++)
            {
                Console.WriteLine(output[i]);
            }
            Console.ReadKey();
        }
示例#4
0
        /// <summary>
        /// Opens the pool page in Chrome, extracts worker counts and hash rates with the f2pool
        /// scraping config, and passes them to <c>UpdateSummary</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while loading the config or navigating propagate to the caller.
        /// Exceptions raised during extraction are swallowed and logged through
        /// <c>UpdateErrorLog</c> only while <c>Commonflag.flag1</c> is set. The browser is shut down
        /// in every case.
        /// </remarks>
        /// <param name="poolid">Pool identifier used to resolve the URL and forwarded to <c>UpdateSummary</c>.</param>
        public static void WorkerSummary1(int poolid)
        {
            var driver = new ChromeDriver();

            try
            {
                var    jsonConfig = File.ReadAllText(@"Json\\f2pool.json");
                var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
                string url        = GetUrl(poolid);

                driver.Navigate().GoToUrl(url);

                // Fixed delay before the page source is read (presumably to let the page finish rendering).
                Thread.Sleep(8000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // data[0] is not used; data[1] and data[2] are read as the active and inactive counts.
                    var active   = (int)json[1];
                    var inactive = (int)json[2];

                    // A dead count is only read when "data" has exactly four entries.
                    var dead = json.Count() == 4 ? (int)json[3] : 0;

                    var currentHashText = (string)jObject["currenthash"];
                    var dailyHashText   = (string)jObject["dailyhash"];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only while the flag is set, then clear it so later failures are not logged again.
                    if (Commonflag.flag1 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag1 = false;
                        UpdateErrorLog("f2pool", error);
                    }
                }
            }
            finally
            {
                // Also reached when config loading or navigation throws, so the browser is not left running.
                driver.Close();
                driver.Quit();
            }
        }
示例#5
0
        /// <summary>
        /// Extracts the "celebrities" entries from the downloaded page, adds a "birth" value to each
        /// one via <c>ScrapeCeleb</c>, downloads each entry's image next to <c>imgPath</c>, and writes
        /// the serialized result to <c>DBfile</c>.
        /// </summary>
        public void Scrape()
        {
            var rules     = StructuredDataConfig.ParseJsonString(Top100Config);
            var page      = DownloadPage(saveTo: IMDBhtml);
            var extracted = new StructuredDataExtractor(rules).Extract(page);

            using (var downloader = new WebClient())
            {
                foreach (var entry in extracted["celebrities"])
                {
                    entry["birth"] = ScrapeCeleb(entry);

                    // The local file keeps the name of the last path segment of the image URL.
                    var imageUrl  = entry["image"].ToString();
                    var imageName = Path.GetFileName(new Uri(imageUrl).LocalPath);

                    downloader.DownloadFile(imageUrl, imgPath + imageName);
                }
            }

            var settings = new JsonSerializerSettings
            {
                StringEscapeHandling = StringEscapeHandling.Default
            };

            File.WriteAllText(DBfile, JsonConvert.SerializeObject(extracted, settings));
        }
示例#6
0
        /// <summary>
        /// Runs the quora.com scraping config against the saved "withwiki" page and checks the
        /// question, the answers, the best answer, and the textAboveLength of every extracted list.
        /// </summary>
        public void QuoraWithWikiExtractionTest()
        {
            var rules     = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "quora.com.json"));
            var scraper   = new StructuredDataExtractor(rules);
            var page      = File.ReadAllText(Path.Combine("TestData", "quora.com.withwiki.html"));
            var extracted = scraper.Extract(page);

            // Serialize and re-parse so the assertions operate on the JSON output.
            var     serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
            dynamic doc        = JsonConvert.DeserializeObject(serialized);

            // --- Question ---
            Assert.AreNotEqual(null, doc["question"], "Extractor should find a question in the HTML file");

            var question = doc["question"];

            Assert.AreEqual("What can I learn/know right now in 10 minutes that will be useful for the rest of my life?", question["title"].Value, "The extracted title is incorrect");

            // --- Answers ---
            Assert.AreNotEqual(null, doc["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(5, doc["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

            // --- Best answer ---
            Assert.AreNotEqual(null, doc["bestAnswer"], "Extractor should find the best answer in the HTML file");

            var bestAnswer = doc["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The content string should not be null in the extracted answer");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
            Assert.AreEqual(9, bestAnswer["lists"].Count, "The lists array should have 9 items");
            Assert.AreEqual(25, bestAnswer["lists"][1]["items"].Count, "Second item in the lists array should have 25 items");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in doc["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                        var length = ((JValue)entry["textAboveLength"]).ToObject <int>();
                        Assert.IsTrue(length > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
                    }
                }
            }

            // The same check for the lists of the best answer.
            var bestLists = bestAnswer["lists"];

            if (bestLists != null)
            {
                foreach (var entry in bestLists)
                {
                    Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                    var length = ((JValue)entry["textAboveLength"]).ToObject <int>();
                    Assert.IsTrue(length > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
                }
            }
        }
示例#7
0
        /// <summary>
        /// Runs the scraping config created for <paramref name="resourceName"/> over the given HTML.
        /// </summary>
        /// <param name="html">HTML text to extract from.</param>
        /// <param name="resourceName">Name passed to <c>ScrapingExtensions.CreateConfig</c> to obtain the config.</param>
        /// <returns>The container returned by the extractor.</returns>
        internal static Newtonsoft.Json.Linq.JContainer ExtractFromHtml(this string html, string resourceName)
        {
            var rules = ScrapingExtensions.CreateConfig(resourceName);

            return new StructuredDataExtractor(rules).Extract(html);
        }
示例#8
0
        /// <summary>
        /// Extracts structured data from <paramref name="content"/> using the config stored in
        /// <c>_jsonConfig</c> and sends its string form through <c>_sender</c>.
        /// </summary>
        /// <param name="content">Text handed to the extractor.</param>
        /// <returns>A task that completes when the send has completed.</returns>
        public async Task Transform(string content)
        {
            var rules     = StructuredDataConfig.ParseJsonString(_jsonConfig);
            var extracted = new StructuredDataExtractor(rules).Extract(content);
            var payload   = extracted.ToString();

            await _sender.Send(payload);
        }
        /// <summary>
        /// Handles two kinds of request. When <c>json.isID</c> is set, the URL is requested and a
        /// caller-supplied regex is matched against the client's <c>ResponseUri</c> to pull out an id.
        /// Otherwise the page is downloaded, every anchor href is rewritten to an absolute URL, and
        /// the caller-supplied scraping config is run over the result.
        /// </summary>
        /// <param name="json">Request body; its <c>url</c>, <c>isID</c>, <c>data</c> and <c>javascript</c> members are read.</param>
        /// <returns>A JSON result wrapping the indented JSON string produced by either branch.</returns>
        public IActionResult JsonResult([FromBody] ObjectJson json)
        {
            // NOTE(review): the URL comes straight from the request body and is fetched server-side;
            // confirm that callers are trusted or restrict the allowed hosts.
            var url = WebUtility.UrlDecode(json.url);

            // NOTE(review): the client is never disposed; if MyWebClient derives from WebClient,
            // consider wrapping it in a using statement.
            MyWebClient client = new MyWebClient()
            {
                Encoding = Encoding.UTF8
            };

            client.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";
            if (json.isID)
            {
                // The response body is discarded; the request is made only to populate ResponseUri.
                client.DownloadData(url);
                var    mainUrl = client.ResponseUri.ToString();
                var    regex   = JsonConvert.DeserializeObject <JsonIDInput>(json.data);
                string item    = "";
                try
                {
                    // The pattern is caller-supplied, so bound the match time; a timeout is handled
                    // like any other failure below and leaves the id empty.
                    var regexMatch = Regex.Match(mainUrl, regex._xpath, RegexOptions.None, TimeSpan.FromSeconds(2));
                    item = regexMatch.Groups[regex.group_number].Value;
                }
                catch (Exception)
                {
                    // Best effort: an invalid pattern, a timeout or missing input yields an empty id.
                }
                return(Json(JsonConvert.SerializeObject(new JsonIDresult {
                    url = mainUrl, id = item
                }, Formatting.Indented)));
            }
            else
            {
                var baseUri  = new Uri(url);
                var isScript = json.javascript;
                var config   = StructuredDataConfig.ParseJsonString(json.data);

                // When javascript is requested, the page is fetched through the configured endpoint instead.
                var          html = client.DownloadString(!isScript ? url : this.configuration.GetAppSetting("UrlSeleniumGetHtmlExcuteJavascript") + "?url=" + WebUtility.UrlEncode(url));
                HtmlDocument docc = new HtmlDocument();
                docc.LoadHtml(html);

                HtmlNodeCollection nodes = docc.DocumentNode.SelectNodes("//a");
                if (nodes != null)
                {
                    foreach (HtmlNode node in nodes)
                    {
                        var href = node.Attributes["href"];
                        if ((href == null) || (href.Value == ""))
                        {
                            continue;
                        }

                        try
                        {
                            href.Value = new Uri(baseUri, href.Value.Trim()).AbsoluteUri;
                        }
                        catch (Exception)
                        {
                            // Best effort: an href that cannot be resolved against the base URL is left unchanged.
                        }
                    }
                }

                html = docc.DocumentNode.InnerHtml;
                var openScraping    = new StructuredDataExtractor(config);
                var scrapingResults = openScraping.Extract(html);
                var result          = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented);
                return(Json(result));
            }
        }
示例#10
0
        /// <summary>
        /// Opens the pool page in Chrome, extracts worker counts and hash rates with the spiderpool
        /// scraping config, and passes them to <c>UpdateSummary</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while loading the config or navigating propagate to the caller.
        /// Exceptions raised during extraction are swallowed and logged through
        /// <c>UpdateErrorLog</c> only while <c>Commonflag.flag7</c> is set. The browser is shut down
        /// in every case.
        /// </remarks>
        /// <param name="poolid">Pool identifier used to resolve the URL and forwarded to <c>UpdateSummary</c>.</param>
        public static void WorkerSummary7(int poolid)
        {
            var driver = new ChromeDriver();

            try
            {
                var    jsonConfig = File.ReadAllText(@"Json\\spiderpool.json");
                var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
                string url        = GetUrl(poolid);

                driver.Navigate().GoToUrl(url);

                // Fixed delay before the page source is read (presumably to let the page finish rendering).
                Thread.Sleep(1000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // data[8] and data[10] are read as the current and daily hash-rate texts.
                    var currentHashText = (string)json[8];
                    var dailyHashText   = (string)json[10];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // data[6] is parsed as "<active>/<total>"; inactive is the difference.
                    var workersText = (string)json[6];
                    var active      = Int32.Parse(workersText.Substring(0, workersText.IndexOf('/')));
                    var total       = Int32.Parse(workersText.Substring(workersText.LastIndexOf('/') + 1));
                    int inactive    = total - active;
                    int dead        = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only while the flag is set, then clear it so later failures are not logged again.
                    if (Commonflag.flag7 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag7 = false;

                        // Previously logged under "viabtc", which did not match the spiderpool config used here.
                        UpdateErrorLog("spiderpool", error);
                    }
                }
            }
            finally
            {
                // Also reached when config loading or navigation throws, so the browser is not left running.
                driver.Close();
                driver.Quit();
            }
        }
示例#11
0
        /// <summary>
        /// Opens the pool page in Chrome, extracts worker counts and hash rates with the viabtc
        /// scraping config, and passes them to <c>UpdateSummary</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while loading the config or navigating propagate to the caller.
        /// Exceptions raised during extraction are swallowed and logged through
        /// <c>UpdateErrorLog</c> only while <c>Commonflag.flag6</c> is set. The browser is shut down
        /// in every case.
        /// </remarks>
        /// <param name="poolid">Pool identifier used to resolve the URL and forwarded to <c>UpdateSummary</c>.</param>
        public static void WorkerSummary6(int poolid)
        {
            var driver = new ChromeDriver();

            try
            {
                var    jsonConfig = File.ReadAllText(@"Json\\viabtc.json");
                var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
                string url        = GetUrl(poolid);

                driver.Navigate().GoToUrl(url);

                // The page source is read right after navigation, with no explicit wait.
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // data[0] and data[2] are read as the current and daily hash-rate texts.
                    var currentHashText = (string)json[0];
                    var dailyHashText   = (string)json[2];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // data[3] and data[4] are parsed as the active and inactive counts.
                    var active   = Int32.Parse((string)json[3]);
                    var inactive = Int32.Parse((string)json[4]);
                    int dead     = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only while the flag is set, then clear it so later failures are not logged again.
                    if (Commonflag.flag6 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag6 = false;
                        UpdateErrorLog("viabtc", error);
                    }
                }
            }
            finally
            {
                // Also reached when config loading or navigation throws, so the browser is not left running.
                driver.Close();
                driver.Quit();
            }
        }
示例#12
0
        /// <summary>
        /// Extracts structured data from <paramref name="content"/> using the config produced by
        /// <c>GenerateJson</c> and sends its string form through <c>_sender</c>.
        /// </summary>
        /// <param name="content">Text handed to the extractor.</param>
        /// <returns>A task that completes when the send has completed.</returns>
        public async Task Transform(string content)
        {
            // Transform into JSON. The object template should live in the shared project because
            // the Loader needs the same object to deserialize.
            var rules     = StructuredDataConfig.ParseJsonString(GenerateJson());
            var extracted = new StructuredDataExtractor(rules).Extract(content);
            var payload   = extracted.ToString();

            await _sender.Send(payload);
        }
示例#13
0
        /// <summary>
        /// Opens the pool page in Chrome, extracts worker counts and hash rates with the antpool
        /// scraping config, and passes them to <c>UpdateSummary</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while loading the config or navigating propagate to the caller.
        /// Exceptions raised during extraction are swallowed and logged through
        /// <c>UpdateErrorLog</c> only while <c>Commonflag.flag5</c> is set. The browser is shut down
        /// in every case.
        /// </remarks>
        /// <param name="poolid">Pool identifier used to resolve the URL and forwarded to <c>UpdateSummary</c>.</param>
        public static void WorkerSummary5(int poolid)
        {
            var driver = new ChromeDriver();

            try
            {
                var    jsonConfig = File.ReadAllText(@"Json\\antpool.json");
                var    config     = StructuredDataConfig.ParseJsonString(jsonConfig);
                string url        = GetUrl(poolid);

                driver.Navigate().GoToUrl(url);

                // Fixed delay before the page source is read (presumably to let the page finish rendering).
                Thread.Sleep(1000);
                var source = driver.PageSource;

                try
                {
                    var openScraping    = new StructuredDataExtractor(config);
                    var scrapingResults = openScraping.Extract(source);

                    JObject jObject = JObject.Parse(scrapingResults.ToString());
                    JToken  json    = jObject["data"];

                    // data[0] holds the worker text; data[1] and data[3] the current and daily hash-rate texts.
                    var workersText     = (string)json[0];
                    var currentHashText = (string)json[1];
                    var dailyHashText   = (string)json[3];

                    var currentcalculation = GetFloat(currentHashText);
                    var dailycalculation   = GetFloat(dailyHashText);
                    var unit               = GetString(currentHashText);

                    // Split on runs of non-digits: the first number is taken as active, the second as total.
                    var numbers  = Regex.Split(workersText.Trim(), @"\D+");
                    var active   = Int32.Parse(numbers[0]);
                    var total    = Int32.Parse(numbers[1]);
                    var inactive = total - active;
                    int dead     = 0;

                    UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
                }
                catch (Exception ex)
                {
                    // Log only while the flag is set, then clear it so later failures are not logged again.
                    if (Commonflag.flag5 == true)
                    {
                        string error = "Poolid=" + poolid + "  " + ex.ToString();
                        Commonflag.flag5 = false;
                        UpdateErrorLog("antpool", error);
                    }
                }
            }
            finally
            {
                // Also reached when config loading or navigation throws, so the browser is not left running.
                driver.Close();
                driver.Quit();
            }
        }
示例#14
0
        /// <summary>
        /// Runs the answers.microsoft.com scraping config against the saved page and checks the
        /// question, its hints, the answers, and the textAboveLength of every extracted list.
        /// </summary>
        public void MicrosoftAnswersExtractionTest()
        {
            var rules     = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "answers.microsoft.com.json"));
            var scraper   = new StructuredDataExtractor(rules);
            var page      = File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html"));
            var extracted = scraper.Extract(page);

            // Serialize and re-parse so the assertions operate on the JSON output.
            var     serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
            dynamic doc        = JsonConvert.DeserializeObject(serialized);

            // --- Question ---
            Assert.AreNotEqual(null, doc["question"], "Extractor should find a question in the HTML file");

            var question = doc["question"];

            Assert.AreEqual("8acb1ac5-0acd-4c68-9eeb-e4afff5b39d8", question["id"].Value, "The extracted id is incorrect");
            Assert.AreEqual("I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreEqual(1642653, question["views"].Value, "The extracted views snippet is incorrect");

            // --- Question hints ---
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
            Assert.AreEqual("PC", question["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

            // --- Answers ---
            Assert.AreNotEqual(null, doc["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(2, doc["answers"].Count, "Extractor should find two answers in the thread summary of the HTML file");

            var secondAnswer = doc["answers"][1];

            Assert.AreEqual("Most Helpful Reply", secondAnswer["type"].Value, "The extracted type of the answer is incorrect");
            Assert.AreNotEqual(null, secondAnswer["content"], "The content array in the extracted answer should not be null");
            Assert.IsTrue(secondAnswer["content"].Count > 0, "The content array in the extracted answer should have one or more items");
            Assert.AreEqual(4, secondAnswer["lists"].Count, "The lists array should have 4 items");
            Assert.IsTrue(secondAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in doc["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                        var length = ((JValue)entry["textAboveLength"]).ToObject <int>();
                        Assert.IsTrue(length > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
                    }
                }
            }
        }
示例#15
0
        /// <summary>
        /// Downloads the page of a single entry and extracts its birth value using <c>CelebConfig</c>.
        /// </summary>
        /// <param name="jToken">
        /// Entry whose "page" value is a path on www.imdb.com. The value is overwritten in place with
        /// the full URL.
        /// </param>
        /// <returns>The "birth" token found under "celebrities" in the extraction result.</returns>
        private JToken ScrapeCeleb(JToken jToken)
        {
            var rules = StructuredDataConfig.ParseJsonString(CelebConfig);

            // Turn the stored path into a full URL and keep it on the entry.
            var pageUrl = "https://www.imdb.com" + jToken["page"];
            jToken["page"] = pageUrl;

            var page      = DownloadPage(pageUrl);
            var extracted = new StructuredDataExtractor(rules).Extract(page);

            return extracted["celebrities"]["birth"];
        }
示例#16
0
        /// <summary>
        /// Runs the quora.com scraping config against the saved page and checks the question, its
        /// hints, the answers with their view counts, and the textAboveLength of every extracted list.
        /// </summary>
        public void QuoraExtractionTest()
        {
            var rules     = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "quora.com.json"));
            var scraper   = new StructuredDataExtractor(rules);
            var page      = File.ReadAllText(Path.Combine("TestData", "quora.com.html"));
            var extracted = scraper.Extract(page);

            // Serialize and re-parse so the assertions operate on the JSON output.
            var     serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
            dynamic doc        = JsonConvert.DeserializeObject(serialized);

            // --- Question ---
            Assert.AreNotEqual(null, doc["question"], "Extractor should find a question in the HTML file");

            var question = doc["question"];

            Assert.AreEqual("What are some tips for creating a successful Kickstarter project?", question["title"].Value, "The extracted title is incorrect");

            // --- Question hints ---
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(5, question["hints"].Count, "The extracted question should have 5 hints");
            Assert.AreEqual("Kickstarter", question["hints"][3].ToString(), "The 4th hint of the extracted question should be Kickstarter");

            // --- Answers ---
            Assert.AreNotEqual(null, doc["answers"], "Extractor should find answers in the HTML file");
            Assert.AreEqual(5, doc["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

            var firstAnswer = doc["answers"][0];

            Assert.AreNotEqual(null, firstAnswer["content"], "The content string should not be null in the extracted answer");
            Assert.IsTrue(firstAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
            Assert.AreEqual(6800, firstAnswer["views"].Value, "The extracted views count is incorrect");
            Assert.AreEqual(1, firstAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.IsTrue(firstAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

            var secondAnswer = doc["answers"][1];

            Assert.AreEqual(2, secondAnswer["views"].Value, "The extracted views count is incorrect");

            // Every list of every answer must carry a positive integer textAboveLength.
            foreach (var answer in doc["answers"])
            {
                var answerLists = answer["lists"];

                if (answerLists != null)
                {
                    foreach (var entry in answerLists)
                    {
                        Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                        var length = ((JValue)entry["textAboveLength"]).ToObject <int>();
                        Assert.IsTrue(length > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
                    }
                }
            }
        }
示例#17
0
        /// <summary>
        /// Runs the stackexchange.com scraping config against the second saved Stack Overflow page and
        /// checks the question, its hints, the best answer, and the textAboveLength of its lists.
        /// </summary>
        public void StackExchangeEx2ExtractionTest()
        {
            var configPath = Path.Combine("TestData", "stackexchange.com.json");
            var config     = StructuredDataConfig.ParseJsonFile(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "stackoverflow.com.example2.html")));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            // Serialize and re-parse so the assertions operate on the JSON output.
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Question
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var question = parsedJson["question"];

            Assert.AreEqual("How to configure Visual Studio 2008 to use IIS Express?", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
            Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
            Assert.AreEqual(9, question["votes"].Value, "The votes extracted from the question should have a value of 9");

            // Question context
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(2, question["hints"].Count, "The extracted question should have 2 hints");
            // The failure message used to name "passwords", which did not match the asserted value.
            Assert.AreEqual("iis-express", question["hints"][1].ToString(), "The 2nd hint of the extracted question should be iis-express");

            // Best Answer
            var bestAnswer = parsedJson["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
            Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
            Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
            Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
            Assert.AreEqual(17, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 17");
            Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.AreEqual(7, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 7 items");

            // Check that textAboveLength exists in each list and is a positive integer
            var lists = bestAnswer["lists"];

            if (lists != null)
            {
                foreach (var list in lists)
                {
                    Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                    var textAboveLength = ((JValue)list["textAboveLength"]).ToObject <int>();
                    Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
                }
            }
        }
        /// <summary>
        /// Runs the "stackexchange.com.json" rules against "stackoverflow.com.example1.html" and
        /// verifies the extracted question (title, content, votes, hints) and best answer
        /// (content, votes, lists and each list's textAboveLength).
        /// </summary>
        public void StackExchangeEx1ExtractionTest()
        {
            var configPath = "stackexchange.com.json";
            var config     = StructuredDataConfig.Parse(configPath);
            var extractor  = new StructuredDataExtractor(config);
            var result     = extractor.Extract(File.ReadAllText("stackoverflow.com.example1.html"));
            var json       = JsonConvert.SerializeObject(result, Formatting.Indented);

            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Question
            Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");

            var question = parsedJson["question"];

            Assert.AreEqual("Is there a way to crack the password on an Excel VBA Project?", question["title"].Value, "The extracted title is incorrect");
            Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
            Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
            Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
            Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
            Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
            Assert.AreEqual(196, question["votes"].Value, "The votes extracted from the question should have a value of 196");

            // Question context
            Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
            Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
            Assert.AreEqual("passwords", question["hints"][3].ToString(), "The 4th hint of the extracted question should be passwords");

            // Best Answer
            // Assert presence first so a missing answer is reported as a test failure rather than
            // as a runtime binder error on the first member access below.
            Assert.AreNotEqual(null, parsedJson["bestAnswer"], "Extractor should find a best answer in the HTML file");

            var bestAnswer = parsedJson["bestAnswer"];

            Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
            Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
            Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
            Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
            Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
            Assert.AreEqual(153, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 153");
            Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
            Assert.AreEqual(8, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 8 items");

            // Check that textAboveLength exists in each list and is a positive integer
            var lists = bestAnswer["lists"];

            if (lists != null)
            {
                foreach (var list in lists)
                {
                    Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                    var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                    Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
                }
            }
        }
        /// <summary>
        /// Runs "parse_date_rules.json" against "article_with_date.html" and checks the three
        /// date fields produced by the rules.
        /// </summary>
        public void ParseDateTest()
        {
            var     configPath = Path.Combine("TestData", "parse_date_rules.json");
            var     config     = StructuredDataConfig.ParseJsonFile(configPath);
            var     extractor  = new StructuredDataExtractor(config);
            var     result     = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
            var     json       = JsonConvert.SerializeObject(result, Formatting.Indented);
            dynamic parsedJson = JsonConvert.DeserializeObject(json);

            // Expected values are built from explicit components instead of DateTime.Parse(string),
            // so they do not depend on the culture/calendar of the machine running the test.
            Assert.AreEqual(new DateTime(2018, 11, 24), parsedJson["parsedDateNoFormat"].Value, "parsedDateNoFormat is incorrect");
            Assert.AreEqual(new DateTime(2011, 12, 30), parsedJson["parsedDateWithFormat"].Value, "parsedDateWithFormat is incorrect");
            Assert.AreEqual(new DateTime(2008, 6, 12), parsedJson["parsedDateNoFormatWithProviderStyle"].Value, "parsedDateNoFormatWithProviderStyle is incorrect");
        }
        /// <summary>
        /// Extracts "article_with_comments_div.html" using the rules in
        /// "article_with_comments_div.json" and checks the resulting title and body text.
        /// </summary>
        public void RemoveXPathsExtractionTest()
        {
            // Build the extractor from the rules file in TestData
            var rulesFile = Path.Combine("TestData", "article_with_comments_div.json");
            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonFile(rulesFile));

            // Run it over the HTML fixture
            var htmlFile  = Path.Combine("TestData", "article_with_comments_div.html");
            var extracted = extractor.Extract(File.ReadAllText(htmlFile));

            // Round-trip through JSON text so the assertions work on plain JSON tokens
            dynamic roundTripped = JsonConvert.DeserializeObject(
                JsonConvert.SerializeObject(extracted, Formatting.Indented));

            Assert.AreEqual("Article  title", roundTripped["title"].Value, "The extracted title is incorrect");
            Assert.AreEqual("Para1 content Para2 content", roundTripped["body"].Value, "The extracted body is incorrect");
        }
        /// <summary>
        /// Extracts "article_with_date.html" with "regex_rules.json" and requires the output to be
        /// deep-equal to the JSON stored in "regex_expected_result.json".
        /// </summary>
        public void RegexTest()
        {
            // Actual: extraction result, serialized and re-parsed as a JObject
            var rulesFile = Path.Combine("TestData", "regex_rules.json");
            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonFile(rulesFile));
            var extracted = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
            var actual    = JObject.Parse(JsonConvert.SerializeObject(extracted, Formatting.Indented));

            // Expected: the reference document checked in next to the fixtures
            var expectedFile = Path.Combine("TestData", "regex_expected_result.json");
            var expected     = JObject.Parse(File.ReadAllText(expectedFile));

            Assert.IsTrue(JToken.DeepEquals(actual, expected));
        }
示例#22
0
        /// <summary>
        /// One-time setup: loads the 3-class CRF classifier model, compiles the LOCATION-tag regex,
        /// builds the artist-page scraper and opens/loads the three hosted feature tables.
        /// </summary>
        private async Task Init()
        {
            // Classifier models live in the "classifiers" folder under the NER install root
            const string nerRoot = @"C:\stanford-ner-2018-10-16";
            string modelFolder   = nerRoot + @"\classifiers";
            string modelFile     = modelFolder + @"\english.all.3class.distsim.crf.ser.gz";

            // 3-class classifier model
            _classifier = CRFClassifier.getClassifierNoExceptions(modelFile);

            // Matches <LOCATION ...>text</LOCATION> (any letter case) and captures the inner text
            _locationRx = new Regex(
                @"<LOCATION\b[^>]*>(.*?)</LOCATION>",
                RegexOptions.IgnoreCase | RegexOptions.Compiled);

            // XPath rules for the artist name, biography and listener city/count elements
            var artistPageRules = @"
            {
                'artist': '//h1[contains(@class, \'view-header\')]',
                'about': '//div[contains(@class, \'bio-primary\')]',
                'more': '//div[contains(@class, \'bio-secondary\')]',
                'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
                'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
            }";

            ConfigSection artistConfig = StructuredDataConfig.ParseJsonString(artistPageRules);
            _artistScraping = new StructuredDataExtractor(artistConfig);

            // Resolve the portal items that back the editable feature layers
            ArcGISPortal arcGisPortal = await ArcGISPortal.CreateAsync();
            PortalItem hometownItem    = await PortalItem.CreateAsync(arcGisPortal, _hometownLayerId);
            PortalItem otherPointsItem = await PortalItem.CreateAsync(arcGisPortal, _otherPointsLayerId);
            PortalItem listenerItem    = await PortalItem.CreateAsync(arcGisPortal, _listenerLayerId);

            // Layer 0 of each item is the table we edit
            _hometownTable    = new ServiceFeatureTable(hometownItem, 0);
            _otherPointsTable = new ServiceFeatureTable(otherPointsItem, 0);
            _listenerTable    = new ServiceFeatureTable(listenerItem, 0);

            // Load the tables one after another
            await _hometownTable.LoadAsync();
            await _otherPointsTable.LoadAsync();
            await _listenerTable.LoadAsync();
        }
示例#23
0
        /// <summary>
        /// Downloads the page at <c>UrlConstants.BaseUrl + _relativeUrl</c>, extracts it with the
        /// rules in <c>ConfigurationJson</c> and returns the result deserialized as <typeparamref name="T"/>.
        /// </summary>
        /// <returns>The scraped data converted to <typeparamref name="T"/>.</returns>
        public T Run()
        {
            using (var webClient = new WebClient())
            {
                var pageUrl  = UrlConstants.BaseUrl + _relativeUrl;
                var pageHtml = webClient.DownloadString(pageUrl);

                var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(ConfigurationJson));
                var extracted = extractor.Extract(pageHtml);

                // Convert the extracted token tree to T by round-tripping through JSON text
                var typedResult = JsonConvert.DeserializeObject<T>(
                    JsonConvert.SerializeObject(extracted, Formatting.Indented));

                // Pause before handing the result back (see WaitRandom)
                WaitRandom();

                return typedResult;
            }
        }
示例#24
0
        /// <summary>
        /// Console entry point: downloads one match page, extracts it with the rules in
        /// "match-result.config.json", prints the result as indented JSON and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            // Extraction rules are read from a JSON file in the working directory
            var config = StructuredDataConfig.ParseJsonString(File.ReadAllText(@"match-result.config.json"));

            // Fetch the page, decoding the response as UTF-8
            var html = string.Empty;
            using (var webClient = new WebClient { Encoding = Encoding.UTF8 })
            {
                html = webClient.DownloadString("http://virtualsoccer.ru/viewmatch.php?day=12968&match_id=213340");
            }

            var extractor = new StructuredDataExtractor(config);
            var extracted = extractor.Extract(html);
            var output    = JsonConvert.SerializeObject(extracted, Formatting.Indented);

            Console.WriteLine(output);
            Console.ReadKey();
        }
示例#25
0
        /// <summary>
        /// Waits for the page to settle, scrapes its content with <c>this.config</c> and verifies
        /// that usable property records were returned.
        /// </summary>
        /// <remarks>
        /// A result counts as usable when there is at least one record and the first or the last
        /// record has a non-empty <c>Owner</c>. Exceptions raised while extracting or converting
        /// the page propagate unchanged, with their original stack trace.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// No records were extracted, or both the first and last record have an empty owner.
        /// </exception>
        private async Task ProcessAsync()
        {
            await this.Page.WaitForTimeoutAsync(this.DefaultTimeout);

            string pageHtml = await this.Page.GetContentAsync();

            var openScraping    = new StructuredDataExtractor(this.config);
            var scrapingResults = openScraping.Extract(html: pageHtml);
            var records         = scrapingResults.ToObject<Model.RootObject>();

            if (records.PropertyRecords.Count > 0)
            {
                var first = records.PropertyRecords.First().Owner;
                var last  = records.PropertyRecords.Last().Owner;

                // Both ends of the list lacking an owner is treated as a suspicious (empty) page.
                if (!string.IsNullOrEmpty(first) || !string.IsNullOrEmpty(last))
                {
                    return;
                }
            }

            // The original signalled this case by nulling the HTML and then calling
            // HTML.Equals(null), which raised NullReferenceException instead of this error.
            throw new InvalidOperationException("NavigateCollect: No record data was returned");
        }
        /// <summary>
        /// Downloads <paramref name="url"/> and, if the page embeds an application/ld+json script
        /// describing a Recipe, converts it into a <see cref="MyRecipe"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// The built recipe, or <c>null</c> when the page has no JSON-LD script, the script is not
        /// a Recipe, or the JSON could not be deserialized/built.
        /// </returns>
        public static MyRecipe ExtractRecipe(string url)
        {
            string urlResponse;

            // 1. Get Response from url
            using (WebClient w = new WebClient())
            {
                urlResponse = w.DownloadString(url);
            }

            // 2. Check and scrape if any structured JSON is present (application/ld+json)
            var configJson      = @"{                
                'data': '//script[contains(@type, \'application\/ld+json\')]'
            }";
            var config          = StructuredDataConfig.ParseJsonString(configJson);
            var openScraping    = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(urlResponse);

            if (scrapingResults == null || scrapingResults["data"] == null)
            {
                return null;
            }

            var content = scrapingResults["data"].ToString();

            // Whitespace around the colon is insignificant in JSON, so accept both
            // "@type":"Recipe" and pretty-printed forms such as "@type": "Recipe".
            if (!System.Text.RegularExpressions.Regex.IsMatch(content, "\"@type\"\\s*:\\s*\"Recipe\""))
            {
                return null;
            }

            try
            {
                var serializerSettings = new JsonSerializerSettings()
                {
                    DateParseHandling = DateParseHandling.DateTimeOffset
                };
                Recipe        rec     = JsonConvert.DeserializeObject<Recipe>(content, serializerSettings);
                RecipeBuilder builder = new RecipeBuilder();
                return builder.Build(rec);
            }
            catch (Exception e)
            {
                // Extraction stays best-effort (callers still get null), but the failure is
                // recorded instead of being silently discarded.
                System.Diagnostics.Trace.TraceWarning("ExtractRecipe: could not build a recipe from {0}: {1}", url, e);
                return null;
            }
        }
        /// <summary>
        /// Applies CastToIntegerTransformation to the <c>content</c> attribute of a meta tag and
        /// expects the extracted "width" value to equal 1200.
        /// </summary>
        public void CastToIntegerTest()
        {
            // Rule: read @content of the "width" meta tag and run the integer cast over it
            var rulesJson = @"
            {
                'width': {
                    '_xpath': '/meta[@property=\'width\']/@content',
                    '_transformation': 'CastToIntegerTransformation'
                }
            }
            ";
            var markup = "<meta property=\"width\" content=\"1200\">";

            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(rulesJson));
            var extracted = extractor.Extract(markup);

            // Round-trip through JSON text before inspecting the value
            dynamic roundTripped = JsonConvert.DeserializeObject(
                JsonConvert.SerializeObject(extracted, Formatting.Indented));

            Assert.AreEqual(1200, roundTripped["width"].Value);
        }
        /// <summary>
        /// Applies UrlEncodeTransformation to an anchor's <c>href</c> of "hello world" and expects
        /// the extracted "text" value to be "hello+world".
        /// </summary>
        public void UrlEncodeTest()
        {
            // Rule: take the href of the link inside #content and URL-encode it
            var rulesJson = @"
            {
                'text': {
                    '_xpath': '//div[@id=\'content\']/a/@href',
                    '_transformation': 'UrlEncodeTransformation'
                }
            }
            ";
            var markup = "<html><body><div id='content'><a href='hello world'></a></div></body></html>";

            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(rulesJson));
            var extracted = extractor.Extract(markup);

            // Round-trip through JSON text before inspecting the value
            dynamic roundTripped = JsonConvert.DeserializeObject(
                JsonConvert.SerializeObject(extracted, Formatting.Indented));

            Assert.AreEqual("hello+world", roundTripped["text"].Value);
        }
        /// <summary>
        /// Applies ExtractTextTransformation to a div containing a link followed directly by text
        /// and expects the extracted "text" value to be "A link with adjacent text.".
        /// </summary>
        public void ExtractTextTest()
        {
            // Rule: select the #content div and run the text extraction over it
            var rulesJson = @"
            {
                'text': {
                    '_xpath': '//div[@id=\'content\']',
                    '_transformation': 'ExtractTextTransformation'
                }
            }
            ";
            var markup = "<html><body><div id='content'><a href=''>A link</a>with adjacent text.</div></body></html>";

            var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(rulesJson));
            var extracted = extractor.Extract(markup);

            // Round-trip through JSON text before inspecting the value
            dynamic roundTripped = JsonConvert.DeserializeObject(
                JsonConvert.SerializeObject(extracted, Formatting.Indented));

            Assert.AreEqual("A link with adjacent text.", roundTripped["text"].Value);
        }
示例#30
0
        /// <summary>
        /// Scrapes the price and the last-traded date/time out of a quote page and returns them
        /// as a <see cref="StockPrice"/>.
        /// </summary>
        /// <param name="httpResposeMessage">HTML of the page to scrape.</param>
        /// <returns>
        /// A <see cref="StockPrice"/> whose <c>Price</c> is the parsed price and whose
        /// <c>ValueOn</c> combines the scraped date with the scraped time of day. A missing date
        /// falls back to today; a missing time falls back to the current India Standard Time.
        /// </returns>
        /// <remarks>
        /// Exceptions from scraping or parsing propagate unchanged with their original stack trace.
        /// </remarks>
        internal static StockPrice FilerTheStockpriceFromRediff(string httpResposeMessage)
        {
            TimeZoneInfo INDIAN_ZONE = TimeZoneInfo.FindSystemTimeZoneById("India Standard Time");

            // Defaults used when the page omits the time and/or the date.
            // NOTE(review): dt defaults to the machine-local date while tm is India time — confirm
            // this mix is intended when the host is not in the India time zone.
            DateTime tm = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE);
            DateTime dt = DateTime.Today;

            StockPrice sp         = new StockPrice();
            var        configJson = @"
                        {
                            'price':'//span[2]',
                            'LastTradedDate':'//span[6]',
                            'LastTradedTime':'//span[7]'
                        }";

            var config = StructuredDataConfig.ParseJsonString(configJson);

            var openScraping    = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(httpResposeMessage);

            System.Diagnostics.Debug.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));

            var thePrice = scrapingResults["price"];
            var theDate  = scrapingResults["LastTradedDate"];
            var theTime  = scrapingResults["LastTradedTime"];

            // The scraped price is machine data, so parse it independently of the host culture.
            sp.Price = double.Parse(thePrice.ToString().Trim(), CultureInfo.InvariantCulture);

            // Treat a field that is absent from the result the same as an empty one.
            string dateText = theDate == null ? string.Empty : theDate.ToString();
            string timeText = theTime == null ? string.Empty : theTime.ToString();

            //** precaution in case missing date & time**//
            if (!string.IsNullOrEmpty(dateText))
            {
                if (dateText.IsDateType())
                {
                    dt = DateTime.ParseExact(dateText.Trim(), "dd MMM", CultureInfo.InvariantCulture);
                }

                if (theTime != null && timeText.IsDateType())
                {
                    tm = (DateTime)Convert.ChangeType(theTime, typeof(DateTime));
                }
            }
            else
            {
                // No separate date field: the time field may hold "date, time" or just a time.
                var DataNTime = timeText.Split(',', StringSplitOptions.RemoveEmptyEntries);
                if (DataNTime.Length == 2) //has date and time
                {
                    if (DataNTime[0].IsDateType())
                    {
                        dt = DateTime.ParseExact(DataNTime[0], "dd MMM", CultureInfo.InvariantCulture);
                    }
                    if (DataNTime[1].IsDateType())
                    {
                        tm = (DateTime)Convert.ChangeType(DataNTime[1], typeof(DateTime));
                    }
                }
                else if (DataNTime.Length > 0)   //has time only
                {
                    if (DataNTime[0].IsDateType())
                    {
                        tm = (DateTime)Convert.ChangeType(DataNTime[0], typeof(DateTime));
                    }
                }
                // An empty split result means neither value is present; keep the defaults.
            }

            //** adjust the date if the year is missing in the downloaded time stamp**//
            // "dd MMM" carries no year, so a timestamp that lands in the future relative to the
            // current India time is moved back one year.
            var currectedDate = dt.Date.Add(tm.TimeOfDay);
            currectedDate = currectedDate > TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE) ?
                            currectedDate.AddYears(-1) : currectedDate;
            sp.ValueOn = currectedDate;

            return sp;
        }