public void OfficeSupportExtractionTest()
{
    var configPath = Path.Combine("TestData", "support.office.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "support.office.com.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    Assert.AreEqual("Export data to Excel", parsedJson["title"].Value, "The extracted title is incorrect");
    Assert.AreEqual("You can copy data from a Microsoft Office Access 2007 database into a worksheet by exporting a database object to a Microsoft Office Excel 2007 workbook. You do this by using the Export Wizard in Office Access 2007.", parsedJson["abstract"].Value, "The extracted abstract is incorrect");
    Assert.AreEqual(9, parsedJson["versions"].Count, "The extracted versions field is incorrect");
    Assert.AreEqual(5, parsedJson["sections"].Count, "The extracted json should have 5 sections");

    var secondSection = parsedJson["sections"][1];
    Assert.AreEqual("Exporting data to Excel: the basics", secondSection["title"].Value, "The title of the second section is incorrect");
    Assert.AreEqual(9, secondSection["text"]["paragraphs"].Count, "The paragraphs count of the second section is incorrect");
    Assert.AreEqual(2, secondSection["text"]["unorderedLists"].Count, "The unordered lists count of the second section is incorrect");

    var secondList = secondSection["text"]["unorderedLists"][1];
    Assert.AreEqual("If this is the first time you are exporting data to Excel", secondList["title"].Value, "The title of the second list in the second section is incorrect");
    Assert.AreEqual(4, secondList["items"].Count, "The second list in the second section should have 4 items");
}
public void RemoveExtraWhitespaceTransformationTest()
{
    var html = "<html><body><div id='content'><a href=''>A link</a>with adjacent text. &quot;the final frontier&quot;</div></body></html>";
    var configJson = @"
    {
        'text': {
            '_xpath': '//div[@id=\'content\']',
            '_transformations': [
                'ExtractTextTransformation',
                'HtmlDecodeTransformation',
                'RemoveExtraWhitespaceTransformation'
            ]
        }
    }
    ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual("A link with adjacent text. \"the final frontier\"", parsedJson["text"].Value);
}
static void Main(string[] args)
{
    var configJson = @"
    {
        '':'//a[contains(text(), \'File\')]/@href'
    }
    ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var url = @"http://rule34.paheal.net/post/list";

    HtmlWeb web = new HtmlWeb();
    WebClient wc = new WebClient();
    var htmlDoc = web.Load(url);
    var body = htmlDoc.Text;

    var path = Directory.GetCurrentDirectory() + "\\img\\";

    void CreateFolder()
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }
    }

    CreateFolder();

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(body);

    // Strip the JSON envelope and quoting from the serialized results,
    // leaving one bare URL per array entry.
    char[] charsToTrim = { '\x5C', '\x22', '\x7B', '\x20' };
    var output = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented).Split(',');
    output[0] = output[0].Remove(0, 10);
    output[output.Length - 1] = output[output.Length - 1].Remove(output[output.Length - 1].Length - 8, 8);

    string fileName;

    for (int i = 0; i < output.Length; i++)
    {
        output[i] = output[i].Remove(0, 7);
        output[i] = output[i].Trim(charsToTrim);

        if (output[i].Contains("webm"))
        {
            fileName = path + i + ".webm";
            Console.WriteLine(fileName);
        }
        else
        {
            // Use the last three characters of the URL as the file extension.
            fileName = path + i + "." + output[i].Remove(0, output[i].Length - 3);
            Console.WriteLine(fileName);
        }

        wc.DownloadFile(output[i], fileName);
    }

    Console.WriteLine("----------------------------");

    for (int i = 0; i < output.Length; i++)
    {
        Console.WriteLine(output[i]);
    }

    Console.ReadKey();
}
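The string slicing above is fragile because it depends on the exact output of the serializer. A sketch of a sturdier alternative using Json.NET's token model, as the other snippets on this page do; this is illustrative, not the original author's code, and assumes the empty-string config key yields an array of hrefs (which the Remove(0, 10) offset above implies):

// Alternative sketch (hypothetical): read the extracted hrefs from the
// JSON token model instead of slicing the serialized string.
// Requires: using Newtonsoft.Json.Linq;
var links = (JArray)scrapingResults[""];
foreach (var link in links)
{
    Console.WriteLine((string)link);
}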
public static void WorkerSummary1(int poolid)
{
    var driver = new ChromeDriver();
    var jsonConfig = File.ReadAllText(@"Json\f2pool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);
    driver.Navigate().GoToUrl(url);
    Thread.Sleep(8000);
    var source = driver.PageSource;

    try
    {
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        //var all = (int)json[0];
        var active = (int)json[1];
        var inactive = (int)json[2];
        var dead = 0;
        if (json.Count() == 4)
        {
            dead = (int)json[3];
        }

        JToken json2 = jObject["currenthash"];
        JToken json3 = jObject["dailyhash"];
        var temp1 = (string)json2;
        var temp2 = (string)json3;
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log the first failure only; Commonflag.flag1 suppresses repeats.
        if (Commonflag.flag1)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag1 = false;
            UpdateErrorLog("f2pool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
public void Scrape()
{
    var config = StructuredDataConfig.ParseJsonString(Top100Config);
    var html = DownloadPage(saveTo: IMDBhtml);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(html);

    using (WebClient client = new WebClient())
    {
        foreach (var celeb in scrapingResults["celebrities"])
        {
            celeb["birth"] = ScrapeCeleb(celeb);
            var imageUrl = celeb["image"].ToString();
            Uri uri = new Uri(imageUrl);
            string fileName = Path.GetFileName(uri.LocalPath);
            client.DownloadFile(imageUrl, imgPath + fileName);
        }
    }

    JsonSerializerSettings jss = new JsonSerializerSettings { StringEscapeHandling = StringEscapeHandling.Default };
    string textresult = JsonConvert.SerializeObject(scrapingResults, jss);
    File.WriteAllText(DBfile, textresult);
}
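Top100Config itself is not shown in this snippet. A minimal sketch of what it could look like, assuming OpenScraping's _xpath parent/child convention used elsewhere on this page; the XPaths are illustrative, only the key names (celebrities, page, image) are taken from how Scrape() and ScrapeCeleb() read the results:

// Hypothetical sketch of Top100Config (not the original source).
private const string Top100Config = @"
{
    'celebrities': {
        '_xpath': '//div[contains(@class, \'lister-item\')]',
        'page': './/h3/a/@href',
        'image': './/img/@src'
    }
}";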
public void QuoraWithWikiExtractionTest()
{
    var configPath = Path.Combine("TestData", "quora.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "quora.com.withwiki.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("What can I learn/know right now in 10 minutes that will be useful for the rest of my life?", question["title"].Value, "The extracted title is incorrect");

    // Answers
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

    // Best answer
    Assert.AreNotEqual(null, parsedJson["bestAnswer"], "Extractor should find the best answer in the HTML file");
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The content string should not be null in the extracted answer");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
    Assert.AreEqual(9, bestAnswer["lists"].Count, "The lists array should have 9 items");
    Assert.AreEqual(25, bestAnswer["lists"][1]["items"].Count, "Second item in the lists array should have 25 items");

    // Check that textAboveLength exists in each list
    foreach (var answer in parsedJson["answers"])
    {
        var lists = answer["lists"];
        if (lists != null)
        {
            foreach (var list in lists)
            {
                Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
            }
        }
    }

    var bestAnswerLists = bestAnswer["lists"];
    if (bestAnswerLists != null)
    {
        foreach (var list in bestAnswerLists)
        {
            Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
        }
    }
}
internal static Newtonsoft.Json.Linq.JContainer ExtractFromHtml(this string html, string resourceName)
{
    var config = ScrapingExtensions.CreateConfig(resourceName);
    var scraper = new StructuredDataExtractor(config);
    var result = scraper.Extract(html);
    return result;
}
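A quick usage sketch for this extension, assuming CreateConfig resolves the named JSON config; the file and resource names below are illustrative, not from the original source:

// Hypothetical call site for the ExtractFromHtml extension.
var html = File.ReadAllText("page.html");
var tokens = html.ExtractFromHtml("article.config.json");
Console.WriteLine(tokens.ToString());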
public async Task Transform(string content)
{
    var config = StructuredDataConfig.ParseJsonString(_jsonConfig);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(content);
    await _sender.Send(scrapingResults.ToString());
}
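_jsonConfig is supplied elsewhere in the class; for reference, a minimal example of the OpenScraping config it might hold, mirroring the title/body config that appears commented out in the Rediff snippet later on this page (the XPaths are illustrative):

// Illustrative example of what _jsonConfig might contain (not the author's actual config).
private readonly string _jsonConfig = @"
{
    'title': '//h1',
    'body': '//div[contains(@class, \'article\')]'
}";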
public IActionResult JsonResult([FromBody] ObjectJson json)
{
    var url = WebUtility.UrlDecode(json.url);
    MyWebClient client = new MyWebClient() { Encoding = Encoding.UTF8 };
    client.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";

    if (json.isID)
    {
        client.DownloadData(url);
        var mainUrl = client.ResponseUri.ToString();
        var regex = JsonConvert.DeserializeObject<JsonIDInput>(json.data);
        string item = "";
        try
        {
            var regexMatch = Regex.Match(mainUrl, regex._xpath);
            item = regexMatch.Groups[regex.group_number].Value;
        }
        catch (Exception)
        {
        }

        return Json(JsonConvert.SerializeObject(new JsonIDresult { url = mainUrl, id = item }, Formatting.Indented));
    }
    else
    {
        var baseUri = new Uri(url);
        var isScript = json.javascript;
        var config = StructuredDataConfig.ParseJsonString(json.data);
        var html = client.DownloadString(!isScript ? url : this.configuration.GetAppSetting("UrlSeleniumGetHtmlExcuteJavascript") + "?url=" + WebUtility.UrlEncode(url));

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Rewrite every relative href to an absolute URI based on the page URL.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a");
        if (nodes != null)
        {
            foreach (HtmlNode node in nodes)
            {
                if ((node.Attributes["href"] != null) && (node.Attributes["href"].Value != ""))
                {
                    try
                    {
                        var href = node.Attributes["href"].Value.Trim();
                        node.Attributes["href"].Value = new Uri(baseUri, href).AbsoluteUri;
                    }
                    catch (Exception)
                    {
                    }
                }
            }
        }

        html = doc.DocumentNode.InnerHtml;
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(html);
        var result = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented);
        return Json(result);
    }
}
public static void WorkerSummary7(int poolid)
{
    var driver = new ChromeDriver();
    var jsonConfig = File.ReadAllText(@"Json\spiderpool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);
    //var url = "https://www.spiderpool.com/coin/show/btc/yibobtc01/detail.html";
    driver.Navigate().GoToUrl(url);
    Thread.Sleep(1000);
    var source = driver.PageSource;

    try
    {
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp1 = (string)json[8];
        var temp2 = (string)json[10];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // Worker counts arrive as "active/total".
        var temp = (string)json[6];
        var active = Int32.Parse(temp.Substring(0, temp.IndexOf('/')));
        var total = Int32.Parse(temp.Substring(temp.LastIndexOf('/') + 1));
        int inactive = total - active;
        int dead = 0;

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log the first failure only; Commonflag.flag7 suppresses repeats.
        if (Commonflag.flag7)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag7 = false;
            UpdateErrorLog("spiderpool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
public static void WorkerSummary6(int poolid)
{
    var driver = new ChromeDriver();
    var jsonConfig = File.ReadAllText(@"Json\viabtc.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);
    //var url = "https://pool.viabtc.com/observer/dashboard?access_key=cb735a866859b626a748c0fb4a479394";
    driver.Navigate().GoToUrl(url);
    //Thread.Sleep(1000);
    var source = driver.PageSource;

    try
    {
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp1 = (string)json[0];
        var temp2 = (string)json[2];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);
        var active = Int32.Parse((string)json[3]);
        var inactive = Int32.Parse((string)json[4]);
        int dead = 0;

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log the first failure only; Commonflag.flag6 suppresses repeats.
        if (Commonflag.flag6)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag6 = false;
            UpdateErrorLog("viabtc", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
public async Task Transform(string content)
{
    // Transform into JSON; the object template lives in Shared because the
    // Loader needs the same object to deserialize.
    var configJson = GenerateJson();
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(content);
    await _sender.Send(scrapingResults.ToString());
}
public static void WorkerSummary5(int poolid)
{
    var driver = new ChromeDriver();
    var jsonConfig = File.ReadAllText(@"Json\antpool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);
    driver.Navigate().GoToUrl(url);
    Thread.Sleep(1000);
    var source = driver.PageSource;

    try
    {
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp = (string)json[0];
        var temp1 = (string)json[1];
        var temp2 = (string)json[3];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // Worker counts arrive as "active/total"; split on non-digit runs.
        var numbers = Regex.Split(temp.Trim(), @"\D+");
        var active = Int32.Parse(numbers[0]);
        var total = Int32.Parse(numbers[1]);
        var inactive = total - active;
        int dead = 0;

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log the first failure only; Commonflag.flag5 suppresses repeats.
        if (Commonflag.flag5)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag5 = false;
            UpdateErrorLog("antpool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
public void MicrosoftAnswersExtractionTest()
{
    var configPath = Path.Combine("TestData", "answers.microsoft.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("8acb1ac5-0acd-4c68-9eeb-e4afff5b39d8", question["id"].Value, "The extracted id is incorrect");
    Assert.AreEqual("I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreEqual(1642653, question["views"].Value, "The extracted views snippet is incorrect");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
    Assert.AreEqual("PC", question["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

    // Answers
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(2, parsedJson["answers"].Count, "Extractor should find two answers in the thread summary of the HTML file");

    var secondAnswer = parsedJson["answers"][1];
    Assert.AreEqual("Most Helpful Reply", secondAnswer["type"].Value, "The extracted type of the answer is incorrect");
    Assert.AreNotEqual(null, secondAnswer["content"], "The content array in the extracted answer should not be null");
    Assert.IsTrue(secondAnswer["content"].Count > 0, "The content array in the extracted answer should have one or more items");
    Assert.AreEqual(4, secondAnswer["lists"].Count, "The lists array should have 4 items");
    Assert.IsTrue(secondAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

    // Check that textAboveLength exists in each list
    foreach (var answer in parsedJson["answers"])
    {
        var lists = answer["lists"];
        if (lists != null)
        {
            foreach (var list in lists)
            {
                Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
            }
        }
    }
}
private JToken ScrapeCeleb(JToken jToken)
{
    var config = StructuredDataConfig.ParseJsonString(CelebConfig);
    jToken["page"] = "https://www.imdb.com" + jToken["page"];
    var html = DownloadPage(jToken["page"].ToString());
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(html);
    return scrapingResults["celebrities"]["birth"];
}
public void QuoraExtractionTest()
{
    var configPath = Path.Combine("TestData", "quora.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "quora.com.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("What are some tips for creating a successful Kickstarter project?", question["title"].Value, "The extracted title is incorrect");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(5, question["hints"].Count, "The extracted question should have 5 hints");
    Assert.AreEqual("Kickstarter", question["hints"][3].ToString(), "The 4th hint of the extracted question should be Kickstarter");

    // Answers
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

    var firstAnswer = parsedJson["answers"][0];
    Assert.AreNotEqual(null, firstAnswer["content"], "The content string should not be null in the extracted answer");
    Assert.IsTrue(firstAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
    Assert.AreEqual(6800, firstAnswer["views"].Value, "The extracted views count is incorrect");
    Assert.AreEqual(1, firstAnswer["lists"].Count, "The lists array should have 1 item");
    Assert.IsTrue(firstAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

    var secondAnswer = parsedJson["answers"][1];
    Assert.AreEqual(2, secondAnswer["views"].Value, "The extracted views count is incorrect");

    // Check that textAboveLength exists in each list
    foreach (var answer in parsedJson["answers"])
    {
        var lists = answer["lists"];
        if (lists != null)
        {
            foreach (var list in lists)
            {
                Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
            }
        }
    }
}
public void StackExchangeEx2ExtractionTest()
{
    var configPath = Path.Combine("TestData", "stackexchange.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "stackoverflow.com.example2.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("How to configure Visual Studio 2008 to use IIS Express?", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
    Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
    Assert.AreEqual(9, question["votes"].Value, "The votes extracted from the question should have a value of 9");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(2, question["hints"].Count, "The extracted question should have 2 hints");
    Assert.AreEqual("iis-express", question["hints"][1].ToString(), "The 2nd hint of the extracted question should be iis-express");

    // Best answer
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
    Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
    Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
    Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
    Assert.AreEqual(17, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 17");
    Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
    Assert.AreEqual(7, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 7 items");

    // Check that textAboveLength exists in each list
    var lists = bestAnswer["lists"];
    if (lists != null)
    {
        foreach (var list in lists)
        {
            Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
        }
    }
}
public void StackExchangeEx1ExtractionTest()
{
    var configPath = Path.Combine("TestData", "stackexchange.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "stackoverflow.com.example1.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("Is there a way to crack the password on an Excel VBA Project?", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
    Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
    Assert.AreEqual(196, question["votes"].Value, "The votes extracted from the question should have a value of 196");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
    Assert.AreEqual("passwords", question["hints"][3].ToString(), "The 4th hint of the extracted question should be passwords");

    // Best answer
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
    Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
    Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
    Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
    Assert.AreEqual(153, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 153");
    Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
    Assert.AreEqual(8, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 8 items");

    // Check that textAboveLength exists in each list
    var lists = bestAnswer["lists"];
    if (lists != null)
    {
        foreach (var list in lists)
        {
            Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
        }
    }
}
public void ParseDateTest()
{
    var configPath = Path.Combine("TestData", "parse_date_rules.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual(DateTime.Parse("2018-11-24T00:00:00"), parsedJson["parsedDateNoFormat"].Value);
    Assert.AreEqual(DateTime.Parse("2011-12-30T00:00:00"), parsedJson["parsedDateWithFormat"].Value);
    Assert.AreEqual(DateTime.Parse("2008-06-12T00:00:00"), parsedJson["parsedDateNoFormatWithProviderStyle"].Value);
}
public void RemoveXPathsExtractionTest()
{
    var configPath = Path.Combine("TestData", "article_with_comments_div.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_comments_div.html")));
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual("Article title", parsedJson["title"].Value, "The extracted title is incorrect");
    Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
}
public void RegexTest()
{
    var configPath = Path.Combine("TestData", "regex_rules.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "article_with_date.html")));
    var actualJson = JsonConvert.SerializeObject(result, Formatting.Indented);
    var parsedActualJson = JObject.Parse(actualJson);

    var expectedJsonPath = Path.Combine("TestData", "regex_expected_result.json");
    var expectedJson = File.ReadAllText(expectedJsonPath);
    var parsedExpectedJson = JObject.Parse(expectedJson);

    Assert.IsTrue(JToken.DeepEquals(parsedActualJson, parsedExpectedJson));
}
private async Task Init()
{
    // Path to the folder with classifier models
    var jarRoot = @"C:\stanford-ner-2018-10-16";
    var classifiersDirectory = jarRoot + @"\classifiers";

    // Load the 3-class classifier model
    _classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + @"\english.all.3class.distsim.crf.ser.gz");

    // Define a regular expression for finding the location element
    _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Define configurations for parsing artist and listener info
    var configArtistInfoJson = @"
    {
        'artist': '//h1[contains(@class, \'view-header\')]',
        'about': '//div[contains(@class, \'bio-primary\')]',
        'more': '//div[contains(@class, \'bio-secondary\')]',
        'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]',
        'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]'
    }";
    ConfigSection configArtist = StructuredDataConfig.ParseJsonString(configArtistInfoJson);
    _artistScraping = new StructuredDataExtractor(configArtist);

    // Get the hosted feature layers for editing
    ArcGISPortal portal = await ArcGISPortal.CreateAsync();
    PortalItem hometownLayerItem = await PortalItem.CreateAsync(portal, _hometownLayerId);
    PortalItem otherPointsLayerItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);
    PortalItem listenerLayerItem = await PortalItem.CreateAsync(portal, _listenerLayerId);
    _hometownTable = new ServiceFeatureTable(hometownLayerItem, 0);
    _otherPointsTable = new ServiceFeatureTable(otherPointsLayerItem, 0);
    _listenerTable = new ServiceFeatureTable(listenerLayerItem, 0);
    await _hometownTable.LoadAsync();
    await _otherPointsTable.LoadAsync();
    await _listenerTable.LoadAsync();
}
public T Run()
{
    using (var client = new WebClient())
    {
        var html = client.DownloadString(UrlConstants.BaseUrl + _relativeUrl);
        var configuration = StructuredDataConfig.ParseJsonString(ConfigurationJson);
        var openScraping = new StructuredDataExtractor(configuration);
        var scrapingResults = openScraping.Extract(html);
        var serializedObject = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented);
        var deserializedObject = JsonConvert.DeserializeObject<T>(serializedObject);
        WaitRandom();
        return deserializedObject;
    }
}
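Because Run() round-trips the extraction result through JsonConvert, the config keys must line up with the target type's properties. A hypothetical pairing, with all names below illustrative rather than from the original source:

// Hypothetical model + ConfigurationJson pairing for the generic Run<T>() above.
public class ArticleModel
{
    public string Title { get; set; }
    public string Body { get; set; }
}

// Config keys match the model's property names (Json.NET binds them
// case-insensitively), so DeserializeObject<ArticleModel> can populate them.
private const string ConfigurationJson = @"
{
    'title': '//h1',
    'body': '//div[contains(@class, \'article\')]'
}";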
static void Main(string[] args)
{
    var jsonConfig = File.ReadAllText(@"match-result.config.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);

    var html = string.Empty;
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        html = client.DownloadString("http://virtualsoccer.ru/viewmatch.php?day=12968&match_id=213340");
    }

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(html);
    Console.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));
    Console.ReadKey();
}
private async Task ProcessAsync()
{
    await this.Page.WaitForTimeoutAsync(this.DefaultTimeout);
    string HTML = await this.Page.GetContentAsync();

    try
    {
        var openScraping = new StructuredDataExtractor(this.config);
        var scrapingResults = openScraping.Extract(html: HTML);
        var records = scrapingResults.ToObject<Model.RootObject>();

        if (records.PropertyRecords.Count > 0)
        {
            var first = records.PropertyRecords.First().Owner;
            var last = records.PropertyRecords.Last().Owner;
            if (string.IsNullOrEmpty(first) && string.IsNullOrEmpty(last))
            {
                // Both boundary records have empty owners: suspicious, treat as no data.
                HTML = null;
            }
            else
            {
                return;
            }
        }
        else
        {
            HTML = null;
        }
    }
    catch (Exception)
    {
        // Rethrow with the original stack trace ("throw ex" would reset it).
        throw;
    }

    if (HTML == null)
    {
        throw new Exception("NavigateCollect: No record data was returned");
    }
}
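Model.RootObject is not shown in this snippet; inferring from the members the code touches, a minimal sketch could look like this (hypothetical, only the members used above):

// Inferred sketch of Model.RootObject (hypothetical; not the original source).
using System.Collections.Generic;

namespace Model
{
    public class PropertyRecord
    {
        public string Owner { get; set; }
    }

    public class RootObject
    {
        public List<PropertyRecord> PropertyRecords { get; set; }
    }
}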
public static MyRecipe ExtractRecipe(string url)
{
    string urlResponse;
    MyRecipe myRecipe = null;

    // 1. Get the response from the url
    using (WebClient w = new WebClient())
    {
        urlResponse = w.DownloadString(url);
    }

    // 2. Check for and scrape any structured JSON present (application/ld+json)
    var configJson = @"{
        'data': '//script[contains(@type, \'application\/ld+json\')]'
    }";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(urlResponse);

    if (scrapingResults != null && scrapingResults["data"] != null)
    {
        var content = scrapingResults["data"].ToString();
        if (content.Contains("\"@type\":\"Recipe\""))
        {
            try
            {
                var serializerSettings = new JsonSerializerSettings()
                {
                    DateParseHandling = DateParseHandling.DateTimeOffset
                };
                Recipe rec = JsonConvert.DeserializeObject<Recipe>(content, serializerSettings);
                RecipeBuilder builder = new RecipeBuilder();
                myRecipe = builder.Build(rec);
            }
            catch (Exception)
            {
                // Malformed JSON-LD: fall through and return null.
            }
        }
    }

    return myRecipe;
}
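For context, the extractor above targets schema.org JSON-LD blocks embedded in recipe pages. A trimmed, illustrative sample of such markup (field values invented for the example), expressed as a C# literal:

// Illustrative sample of the JSON-LD block the extractor looks for (not from the source).
var sampleLdJson = @"
<script type=""application/ld+json"">
{
  ""@context"": ""https://schema.org"",
  ""@type"": ""Recipe"",
  ""name"": ""Simple Pancakes"",
  ""recipeIngredient"": [""flour"", ""milk"", ""eggs""]
}
</script>";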
public void CastToIntegerTest()
{
    var html = "<meta property=\"width\" content=\"1200\">";
    var configJson = @"
    {
        'width': {
            '_xpath': '/meta[@property=\'width\']/@content',
            '_transformation': 'CastToIntegerTransformation'
        }
    }
    ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual(1200, parsedJson["width"].Value);
}
public void UrlEncodeTest()
{
    var html = "<html><body><div id='content'><a href='hello world'></a></div></body></html>";
    var configJson = @"
    {
        'text': {
            '_xpath': '//div[@id=\'content\']/a/@href',
            '_transformation': 'UrlEncodeTransformation'
        }
    }
    ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual("hello+world", parsedJson["text"].Value);
}
public void ExtractTextTest()
{
    var html = "<html><body><div id='content'><a href=''>A link</a>with adjacent text.</div></body></html>";
    var configJson = @"
    {
        'text': {
            '_xpath': '//div[@id=\'content\']',
            '_transformation': 'ExtractTextTransformation'
        }
    }
    ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);
    Assert.AreEqual("A link with adjacent text.", parsedJson["text"].Value);
}
internal static StockPrice FilerTheStockpriceFromRediff(string httpResponseMessage)
{
    TimeZoneInfo INDIAN_ZONE = TimeZoneInfo.FindSystemTimeZoneById("India Standard Time");
    DateTime tm = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE);
    DateTime dt = DateTime.Today;
    StockPrice sp = new StockPrice();

    var configJson = @"
    {
        'price':'//span[2]',
        'LastTradedDate':'//span[6]',
        'LastTradedTime':'//span[7]'
    }";

    // Alternative config kept for reference:
    // var configJson = @"
    // {
    //     'title1': '//h1',
    //     'title': '//script',
    //     'price':'//span[2]',
    //     'LastTradedDate':'//span[6]',
    //     'LastTradedTime':'//span[7]',
    //     'body': '//div[contains(@class, \'article\')]'
    // }
    // ";
    // var html = "<html><body><h1>Article title</h1><div class='article'>Article contents</div></body></html>";
    // html = httpResponseMessage;

    try
    {
        var config = StructuredDataConfig.ParseJsonString(configJson);
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(httpResponseMessage);
        System.Diagnostics.Debug.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));

        var thePrice = scrapingResults["price"];
        var theDate = scrapingResults["LastTradedDate"];
        var theTime = scrapingResults["LastTradedTime"];
        sp.Price = double.Parse(thePrice.ToString().Trim());

        //DateTime dt = DateTime.ParseExact(theDate.ToString().Trim(), "dd MMM", CultureInfo.InvariantCulture);
        //DateTime tm = DateTime.ParseExact(theTime.ToString().Trim(), "HH:mm:ss", CultureInfo.InvariantCulture);

        // Precaution in case the date or time is missing.
        if (!string.IsNullOrEmpty(theDate.ToString()))
        {
            if (theDate.ToString().IsDateType())
            {
                dt = DateTime.ParseExact(theDate.ToString().Trim(), "dd MMM", CultureInfo.InvariantCulture);
            }
            if (theTime.ToString().IsDateType())
            {
                tm = (DateTime)Convert.ChangeType(theTime, typeof(DateTime));
            }
        }
        else
        {
            var dateAndTime = theTime.ToString().Split(',', StringSplitOptions.RemoveEmptyEntries);
            if (dateAndTime.Length == 2) // has date and time
            {
                if (dateAndTime[0].IsDateType())
                {
                    dt = DateTime.ParseExact(dateAndTime[0], "dd MMM", CultureInfo.InvariantCulture);
                }
                if (dateAndTime[1].IsDateType())
                {
                    tm = (DateTime)Convert.ChangeType(dateAndTime[1], typeof(DateTime));
                }
            }
            else // has time only
            {
                if (dateAndTime[0].IsDateType())
                {
                    tm = (DateTime)Convert.ChangeType(dateAndTime[0], typeof(DateTime));
                }
            }
        }

        // Adjust the date if it is missing from the downloaded timestamp:
        // a timestamp in the future means the "dd MMM" date belongs to last year.
        var correctedDate = dt.Date.Add(tm.TimeOfDay);
        correctedDate = correctedDate > TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE)
            ? correctedDate.AddYears(-1)
            : correctedDate;
        sp.ValueOn = correctedDate;
    }
    catch (Exception)
    {
        // Rethrow with the original stack trace ("throw ex" would reset it).
        throw;
    }

    return sp;
}