/// <summary>
/// Loads the f2pool worker page for a pool in Chrome, extracts the worker counts and
/// hash-rate strings with the rules in <c>Json\\f2pool.json</c>, and hands them to
/// <c>UpdateSummary</c>.
/// </summary>
/// <param name="poolid">Pool identifier; passed to <c>GetUrl</c> to resolve the page and forwarded to <c>UpdateSummary</c>.</param>
/// <remarks>
/// Extraction failures are swallowed; they are written to the error log only while
/// <c>Commonflag.flag1</c> is set, and the flag is cleared once an error has been logged.
/// Failures while loading the config or navigating still propagate to the caller.
/// </remarks>
public static void WorkerSummary1(int poolid)
{
    // Resolve everything that can fail cheaply BEFORE launching a browser, so a missing
    // config file or URL lookup failure cannot leave an orphaned Chrome process behind.
    var jsonConfig = File.ReadAllText(@"Json\\f2pool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);

    var driver = new ChromeDriver();
    try
    {
        driver.Navigate().GoToUrl(url);

        // Fixed 8 s pause before reading the page source.
        // NOTE(review): presumably gives client-side rendering time to finish - confirm.
        Thread.Sleep(8000);
        var source = driver.PageSource;

        try
        {
            var openScraping = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(source);
            JObject jObject = JObject.Parse(scrapingResults.ToString());

            // "data" is indexed positionally; element 0 is not used here.
            JToken json = jObject["data"];
            var active = (int)json[1];
            var inactive = (int)json[2];

            // A fourth element is only read when "data" has exactly four entries.
            var dead = json.Count() == 4 ? (int)json[3] : 0;

            var currentHash = (string)jObject["currenthash"];
            var dailyHash = (string)jObject["dailyhash"];
            var currentcalculation = GetFloat(currentHash);
            var dailycalculation = GetFloat(dailyHash);
            var unit = GetString(currentHash);

            UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
        }
        catch (Exception ex)
        {
            // Log only while the flag is set, then clear it so repeated failures are not logged again.
            if (Commonflag.flag1 == true)
            {
                string error = "Poolid=" + poolid + " " + ex.ToString();
                Commonflag.flag1 = false;
                UpdateErrorLog("f2pool", error);
            }
        }
    }
    finally
    {
        // Quit() alone ends the session; a preceding Close() that throws would skip it.
        driver.Quit();
    }
}
/// <summary>
/// Extracts the <c>href</c> of every "File" link on the listing page, downloads each target
/// into an <c>img</c> folder under the current directory, then prints the extracted links.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var configJson = @" { '':'//a[contains(text(), \'File\')]/@href' } ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var html = @"http://rule34.paheal.net/post/list";
    HtmlWeb web = new HtmlWeb();
    var htmlDoc = web.Load(html);
    var body = htmlDoc.Text;

    // Directory.CreateDirectory does nothing when the folder already exists,
    // so no separate existence check is required.
    var path = Directory.GetCurrentDirectory() + "\\img\\";
    Directory.CreateDirectory(path);

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(body);

    // The links are recovered by slicing the indented JSON text rather than by walking the
    // result object: fixed-width prefixes/suffixes are removed and the remainder trimmed of
    // backslash, double quote, '{' and space.
    // NOTE(review): the offsets (10, 8, 7) depend on the exact serializer layout - confirm
    // before changing the config key or the formatting option.
    char[] charstotrim = { '\x5C', '\x22', '\x7B', '\x20' };
    var output = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented).Split(',');
    output[0] = output[0].Remove(0, 10);
    int lastIndex = output.Length - 1;
    output[lastIndex] = output[lastIndex].Remove(output[lastIndex].Length - 8, 8);

    // WebClient is IDisposable; dispose it once all downloads have finished.
    using (WebClient wc = new WebClient())
    {
        for (int i = 0; i < output.Length; i++)
        {
            output[i] = output[i].Remove(0, 7);
            output[i] = output[i].Trim(charstotrim);

            // Links containing "webm" get a .webm name; everything else reuses the
            // last three characters of the link as the extension.
            string fileName = output[i].Contains("webm")
                ? path + i + ".webm"
                : path + i + "." + output[i].Remove(0, output[i].Length - 3);

            Console.WriteLine(fileName);
            wc.DownloadFile(output[i], fileName);
        }
    }

    Console.WriteLine("----------------------------");
    foreach (var link in output)
    {
        Console.WriteLine(link);
    }

    Console.ReadKey();
}
/// <summary>
/// Runs ExtractText, HtmlDecode and RemoveExtraWhitespace over a small HTML fragment and
/// checks the resulting "text" value.
/// </summary>
public void RemoveExtraWhitespaceTransformationTest()
{
    // The quotation marks are written as &quot; entities: raw double quotes would terminate
    // the C# string literal, and the entities are what HtmlDecodeTransformation is expected
    // to turn into the literal quotes asserted below.
    var html = "<html><body><div id='content'><a href=''>A link</a>with adjacent text. &quot;the final frontier&quot;</div></body></html>";

    var configJson = @"
    {
        'text': {
            '_xpath': '//div[@id=\'content\']',
            '_transformations': [
                'ExtractTextTransformation',
                'HtmlDecodeTransformation',
                'RemoveExtraWhitespaceTransformation'
            ]
        }
    }
    ";

    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);

    // Round-trip through JSON text so the assertion reads the value through dynamic access.
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    Assert.AreEqual("A link with adjacent text. \"the final frontier\"", parsedJson["text"].Value);
}
/// <summary>
/// Extracts the celebrity list from the downloaded page, enriches each entry with its
/// "birth" value, downloads each entry's image, and writes the whole result to
/// <c>DBfile</c> as JSON.
/// </summary>
public void Scrape()
{
    var listConfig = StructuredDataConfig.ParseJsonString(Top100Config);
    var listHtml = DownloadPage(saveTo: IMDBhtml);
    var listExtractor = new StructuredDataExtractor(listConfig);
    var extracted = listExtractor.Extract(listHtml);

    using (WebClient downloader = new WebClient())
    {
        foreach (var celeb in extracted["celebrities"])
        {
            // The detail scrape runs first; its result is stored on the entry itself.
            celeb["birth"] = ScrapeCeleb(celeb);

            // The image is saved under imgPath using the file name taken from the URL path.
            string imageUrl = celeb["image"].ToString();
            string imageFileName = Path.GetFileName(new Uri(imageUrl).LocalPath);
            downloader.DownloadFile(imageUrl, imgPath + imageFileName);
        }
    }

    var serializerSettings = new JsonSerializerSettings
    {
        StringEscapeHandling = StringEscapeHandling.Default
    };
    File.WriteAllText(DBfile, JsonConvert.SerializeObject(extracted, serializerSettings));
}
/// <summary>
/// Builds one extractor per config file found in a folder, keeping only configs that
/// declare at least one URL pattern.
/// </summary>
/// <param name="configRootFolder">Folder searched for config files.</param>
/// <param name="configFilesPattern">Search pattern passed to <c>Directory.GetFiles</c>.</param>
public MultiExtractor(string configRootFolder, string configFilesPattern)
{
    string[] configPaths = Directory.GetFiles(configRootFolder, configFilesPattern);

    // Running total of URL patterns across all accepted configs; drives the regex cache size below.
    int totalUrlPatterns = 0;

    bool anyFiles = configPaths != null && configPaths.Length > 0;
    if (anyFiles)
    {
        foreach (var configPath in configPaths)
        {
            var parsed = StructuredDataConfig.ParseJsonFile(configPath);

            // Configs without URL patterns are ignored entirely.
            if (parsed.UrlPatterns == null || parsed.UrlPatterns.Count <= 0)
            {
                continue;
            }

            totalUrlPatterns += parsed.UrlPatterns.Count;
            this.configsToExtractors.Add(
                new Tuple<ConfigSection, StructuredDataExtractor>(parsed, new StructuredDataExtractor(parsed)));
        }
    }

    // Regex.CacheSize defaults to 15; only ever raise it, never shrink it.
    if (totalUrlPatterns > 15)
    {
        Regex.CacheSize = totalUrlPatterns;
    }
}
/// <summary>
/// Extracts the saved support.office.com page with its config and checks the title,
/// abstract, version count, section count, and the contents of the second section.
/// </summary>
public void OfficeSupportExtractionTest()
{
    var configPath = Path.Combine("TestData", "support.office.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "support.office.com.html")));

    // Round-trip through JSON text so the assertions can use dynamic access.
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    Assert.AreEqual("Export data to Excel", parsedJson["title"].Value, "The extracted title is incorrect");
    Assert.AreEqual("You can copy data from a Microsoft Office Access 2007 database into a worksheet by exporting a database object to a Microsoft Office Excel 2007 workbook. You do this by using the Export Wizard in Office Access 2007.", parsedJson["abstract"].Value, "The extracted abstract is incorrect");
    Assert.AreEqual(9, parsedJson["versions"].Count, "The extracted versions field is incorrect");
    Assert.AreEqual(5, parsedJson["sections"].Count, "The extracted json should have 5 sections");

    var secondSection = parsedJson["sections"][1];
    Assert.AreEqual("Exporting data to Excel: the basics", secondSection["title"].Value, "The title of the second section is incorrect");
    Assert.AreEqual(9, secondSection["text"]["paragraphs"].Count, "The paragraphs count of the second section is incorrect");

    // This message previously repeated the paragraphs wording, which misreported which check failed.
    Assert.AreEqual(2, secondSection["text"]["unorderedLists"].Count, "The unordered lists count of the second section is incorrect");

    var secondList = secondSection["text"]["unorderedLists"][1];
    Assert.AreEqual("If this is the first time you are exporting data to Excel", secondList["title"].Value, "The title of the second list in the second section is incorrect");
    Assert.AreEqual(4, secondList["items"].Count, "The second list in the second section should have 4 items");
}
/// <summary>
/// Extracts the saved Quora page that includes a wiki answer and checks the question,
/// the answers, the best answer, and the <c>textAboveLength</c> value of every list.
/// </summary>
public void QuoraWithWikiExtractionTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "quora.com.json"));
    var sut = new StructuredDataExtractor(rules);
    var pageHtml = File.ReadAllText(Path.Combine("TestData", "quora.com.withwiki.html"));
    var extracted = sut.Extract(pageHtml);

    // Serialize and re-parse so every check below goes through dynamic access.
    dynamic parsedJson = JsonConvert.DeserializeObject(
        JsonConvert.SerializeObject(extracted, Formatting.Indented));

    // --- Question ---
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual(
        "What can I learn/know right now in 10 minutes that will be useful for the rest of my life?",
        question["title"].Value,
        "The extracted title is incorrect");

    // --- Answers ---
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

    // --- Best answer ---
    Assert.AreNotEqual(null, parsedJson["bestAnswer"], "Extractor should find the best answer in the HTML file");
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The content string should not be null in the extracted answer");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
    Assert.AreEqual(9, bestAnswer["lists"].Count, "The lists array should have 9 items");
    Assert.AreEqual(25, bestAnswer["lists"][1]["items"].Count, "Second item in the lists array should have 25 items");

    // --- textAboveLength must be a positive integer on every list of every answer ---
    foreach (var answer in parsedJson["answers"])
    {
        var answerLists = answer["lists"];
        if (answerLists != null)
        {
            foreach (var entry in answerLists)
            {
                Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                int length = ((JValue)entry["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(
                    length > 0,
                    string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
            }
        }
    }

    // --- ...and on every list of the best answer ---
    var bestAnswerLists = bestAnswer["lists"];
    if (bestAnswerLists != null)
    {
        foreach (var entry in bestAnswerLists)
        {
            Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            int length = ((JValue)entry["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(
                length > 0,
                string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
        }
    }
}
/// <summary>
/// Creates a data extractor from the JSON config registered under the given name.
/// </summary>
/// <param name="name">Name passed to <c>GetByName</c> to obtain the JSON config text.</param>
/// <returns>The extractor produced by <c>getDataExtractor</c> for the parsed config.</returns>
public IDataExtractor<TRawData> CreateFromName(string name)
{
    ConfigSection parsedConfig = StructuredDataConfig.ParseJsonString(GetByName(name));
    return getDataExtractor(parsedConfig);
}
/// <summary>
/// Runs the configured extraction rules over <paramref name="content"/> and sends the
/// result, as text, through <c>_sender</c>.
/// </summary>
/// <param name="content">Markup to extract from.</param>
public async Task Transform(string content)
{
    // The rules are re-parsed from _jsonConfig on every call.
    var rules = StructuredDataConfig.ParseJsonString(_jsonConfig);
    var extractor = new StructuredDataExtractor(rules);

    var extracted = extractor.Extract(content);
    await _sender.Send(extracted.ToString());
}
/// <summary>
/// Loads the spiderpool worker page for a pool in Chrome, extracts the hash-rate strings and
/// the "active/total" worker counter with the rules in <c>Json\\spiderpool.json</c>, and
/// hands them to <c>UpdateSummary</c>.
/// </summary>
/// <param name="poolid">Pool identifier; passed to <c>GetUrl</c> to resolve the page and forwarded to <c>UpdateSummary</c>.</param>
/// <remarks>
/// Extraction failures are swallowed; they are written to the error log only while
/// <c>Commonflag.flag7</c> is set, and the flag is cleared once an error has been logged.
/// Failures while loading the config or navigating still propagate to the caller.
/// </remarks>
public static void WorkerSummary7(int poolid)
{
    // Resolve everything that can fail cheaply BEFORE launching a browser, so a missing
    // config file or URL lookup failure cannot leave an orphaned Chrome process behind.
    var jsonConfig = File.ReadAllText(@"Json\\spiderpool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);

    var driver = new ChromeDriver();
    try
    {
        driver.Navigate().GoToUrl(url);

        // Fixed 1 s pause before reading the page source.
        // NOTE(review): presumably gives client-side rendering time to finish - confirm.
        Thread.Sleep(1000);
        var source = driver.PageSource;

        try
        {
            var openScraping = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(source);
            JObject jObject = JObject.Parse(scrapingResults.ToString());

            // "data" is indexed positionally: 8 and 10 feed the hash-rate values, 6 the worker counter.
            JToken json = jObject["data"];
            var temp1 = (string)json[8];
            var temp2 = (string)json[10];
            var currentcalculation = GetFloat(temp1);
            var dailycalculation = GetFloat(temp2);
            var unit = GetString(temp1);

            // The counter is split on '/': text before the first slash is the active count,
            // text after the last slash is the total.
            var workerCounter = (string)json[6];
            var active = Int32.Parse(workerCounter.Substring(0, workerCounter.IndexOf('/')));
            var total = Int32.Parse(workerCounter.Substring(workerCounter.LastIndexOf('/') + 1));
            int inactive = total - active;
            int dead = 0;

            UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
        }
        catch (Exception ex)
        {
            // Log only while the flag is set, then clear it so repeated failures are not logged again.
            if (Commonflag.flag7 == true)
            {
                string error = "Poolid=" + poolid + " " + ex.ToString();
                Commonflag.flag7 = false;

                // Logged under this scraper's own name; it was previously attributed to "viabtc".
                UpdateErrorLog("spiderpool", error);
            }
        }
    }
    finally
    {
        // Quit() alone ends the session; a preceding Close() that throws would skip it.
        driver.Quit();
    }
}
/// <summary>
/// Handles two request shapes. When <c>json.isID</c> is set, follows the URL and extracts an
/// id from the final response URL with a regex. Otherwise downloads the page, rewrites every
/// anchor <c>href</c> to an absolute URL, and runs the supplied extraction rules over it.
/// </summary>
/// <param name="json">Request body carrying the URL, the mode flags and the rule/regex JSON in <c>data</c>.</param>
/// <returns>A JSON result wrapping the indented JSON text of the outcome.</returns>
public IActionResult JsonResult([FromBody] ObjectJson json)
{
    string targetUrl = WebUtility.UrlDecode(json.url);

    var client = new MyWebClient { Encoding = Encoding.UTF8 };
    client.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";

    if (json.isID)
    {
        // The body is discarded; the request is made only to learn the final response URL.
        client.DownloadData(targetUrl);
        string resolvedUrl = client.ResponseUri.ToString();

        var idRule = JsonConvert.DeserializeObject<JsonIDInput>(json.data);
        string extractedId = "";
        try
        {
            extractedId = Regex.Match(resolvedUrl, idRule._xpath).Groups[idRule.group_number].Value;
        }
        catch (Exception)
        {
            // Any failure here leaves the id empty.
        }

        var idPayload = new JsonIDresult { url = resolvedUrl, id = extractedId };
        return Json(JsonConvert.SerializeObject(idPayload, Formatting.Indented));
    }

    var baseUri = new Uri(targetUrl);
    var runScripts = json.javascript;
    var rules = StructuredDataConfig.ParseJsonString(json.data);

    // With the javascript flag set, the page is fetched through the URL configured
    // under this app setting instead of directly.
    string fetchUrl;
    if (runScripts)
    {
        fetchUrl = this.configuration.GetAppSetting("UrlSeleniumGetHtmlExcuteJavascript") + "?url=" + WebUtility.UrlEncode(targetUrl);
    }
    else
    {
        fetchUrl = targetUrl;
    }

    var pageHtml = client.DownloadString(fetchUrl);

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(pageHtml);

    HtmlNodeCollection anchors = document.DocumentNode.SelectNodes("//a");
    if (anchors != null)
    {
        foreach (HtmlNode anchor in anchors)
        {
            var href = anchor.Attributes["href"];
            if (href == null || href.Value == "")
            {
                continue;
            }

            try
            {
                // Resolve relative links against the requested URL.
                href.Value = new Uri(baseUri, href.Value.Trim()).AbsoluteUri;
            }
            catch (Exception)
            {
                // Links that cannot be resolved are left untouched.
            }
        }
    }

    pageHtml = document.DocumentNode.InnerHtml;

    var extracted = new StructuredDataExtractor(rules).Extract(pageHtml);
    return Json(JsonConvert.SerializeObject(extracted, Formatting.Indented));
}
/// <summary>
/// Loads the viabtc observer page for a pool in Chrome, extracts the hash-rate strings and
/// worker counts with the rules in <c>Json\\viabtc.json</c>, and hands them to
/// <c>UpdateSummary</c>.
/// </summary>
/// <param name="poolid">Pool identifier; passed to <c>GetUrl</c> to resolve the page and forwarded to <c>UpdateSummary</c>.</param>
/// <remarks>
/// Extraction failures are swallowed; they are written to the error log only while
/// <c>Commonflag.flag6</c> is set, and the flag is cleared once an error has been logged.
/// Failures while loading the config or navigating still propagate to the caller.
/// </remarks>
public static void WorkerSummary6(int poolid)
{
    // Resolve everything that can fail cheaply BEFORE launching a browser, so a missing
    // config file or URL lookup failure cannot leave an orphaned Chrome process behind.
    var jsonConfig = File.ReadAllText(@"Json\\viabtc.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);

    var driver = new ChromeDriver();
    try
    {
        // The page source is read immediately after navigation, with no extra wait.
        driver.Navigate().GoToUrl(url);
        var source = driver.PageSource;

        try
        {
            var openScraping = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(source);
            JObject jObject = JObject.Parse(scrapingResults.ToString());

            // "data" is indexed positionally: 0 and 2 feed the hash-rate values, 3 and 4 the worker counts.
            JToken json = jObject["data"];
            var temp1 = (string)json[0];
            var temp2 = (string)json[2];
            var currentcalculation = GetFloat(temp1);
            var dailycalculation = GetFloat(temp2);
            var unit = GetString(temp1);
            var active = Int32.Parse((string)json[3]);
            var inactive = Int32.Parse((string)json[4]);
            int dead = 0;

            UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
        }
        catch (Exception ex)
        {
            // Log only while the flag is set, then clear it so repeated failures are not logged again.
            if (Commonflag.flag6 == true)
            {
                string error = "Poolid=" + poolid + " " + ex.ToString();
                Commonflag.flag6 = false;
                UpdateErrorLog("viabtc", error);
            }
        }
    }
    finally
    {
        // Quit() alone ends the session; a preceding Close() that throws would skip it.
        driver.Quit();
    }
}
/// <summary>
/// Extracts structured data from <paramref name="content"/> using the rules returned by
/// <c>GenerateJson</c> and sends the result, as text, through <c>_sender</c>.
/// </summary>
/// <param name="content">Markup to extract from.</param>
public async Task Transform(string content)
{
    // Transform into JSON. The object template lives in the shared project because the
    // Loader needs the same object to deserialize the message.
    var rules = StructuredDataConfig.ParseJsonString(GenerateJson());
    var extractor = new StructuredDataExtractor(rules);

    var extracted = extractor.Extract(content);
    await _sender.Send(extracted.ToString());
}
/// <summary>
/// Creates an extractor from JSON config text.
/// </summary>
/// <param name="configString">JSON text of the extraction rules.</param>
/// <exception cref="ArgumentNullException"><paramref name="configString"/> is null or empty.</exception>
public StructuredDataExtractor(string configString)
{
    bool hasConfigText = !string.IsNullOrEmpty(configString);
    if (!hasConfigText)
    {
        throw new ArgumentNullException(nameof(configString));
    }

    config = StructuredDataConfig.ParseJsonString(configString);

    // Transformations are loaded only after the config has been parsed.
    LoadTransformations();
}
/// <summary>
/// Loads the antpool worker page for a pool in Chrome, extracts the hash-rate strings and
/// the worker counter with the rules in <c>Json\\antpool.json</c>, and hands them to
/// <c>UpdateSummary</c>.
/// </summary>
/// <param name="poolid">Pool identifier; passed to <c>GetUrl</c> to resolve the page and forwarded to <c>UpdateSummary</c>.</param>
/// <remarks>
/// Extraction failures are swallowed; they are written to the error log only while
/// <c>Commonflag.flag5</c> is set, and the flag is cleared once an error has been logged.
/// Failures while loading the config or navigating still propagate to the caller.
/// </remarks>
public static void WorkerSummary5(int poolid)
{
    // Resolve everything that can fail cheaply BEFORE launching a browser, so a missing
    // config file or URL lookup failure cannot leave an orphaned Chrome process behind.
    var jsonConfig = File.ReadAllText(@"Json\\antpool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);

    var driver = new ChromeDriver();
    try
    {
        driver.Navigate().GoToUrl(url);

        // Fixed 1 s pause before reading the page source.
        // NOTE(review): presumably gives client-side rendering time to finish - confirm.
        Thread.Sleep(1000);
        var source = driver.PageSource;

        try
        {
            var openScraping = new StructuredDataExtractor(config);
            var scrapingResults = openScraping.Extract(source);
            JObject jObject = JObject.Parse(scrapingResults.ToString());

            // "data" is indexed positionally: 0 is the worker counter, 1 and 3 feed the hash-rate values.
            JToken json = jObject["data"];
            var workerCounter = (string)json[0];
            var temp1 = (string)json[1];
            var temp2 = (string)json[3];
            var currentcalculation = GetFloat(temp1);
            var dailycalculation = GetFloat(temp2);
            var unit = GetString(temp1);

            // Split the counter on runs of non-digits: the first number is the active count,
            // the second the total.
            var numbers = Regex.Split(workerCounter.Trim(), @"\D+");
            var active = Int32.Parse(numbers[0]);
            var total = Int32.Parse(numbers[1]);
            var inactive = total - active;
            int dead = 0;

            UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
        }
        catch (Exception ex)
        {
            // Log only while the flag is set, then clear it so repeated failures are not logged again.
            if (Commonflag.flag5 == true)
            {
                string error = "Poolid=" + poolid + " " + ex.ToString();
                Commonflag.flag5 = false;
                UpdateErrorLog("antpool", error);
            }
        }
    }
    finally
    {
        // Quit() alone ends the session; a preceding Close() that throws would skip it.
        driver.Quit();
    }
}
/// <summary>
/// Extracts the saved answers.microsoft.com thread with its config and checks the question,
/// its hints, the answers, and the <c>textAboveLength</c> value of every answer list.
/// </summary>
public void MicrosoftAnswersExtractionTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "answers.microsoft.com.json"));
    var sut = new StructuredDataExtractor(rules);
    var pageHtml = File.ReadAllText(Path.Combine("TestData", "answers.microsoft.com.html"));
    var extracted = sut.Extract(pageHtml);

    // Serialize and re-parse so every check below goes through dynamic access.
    dynamic parsedJson = JsonConvert.DeserializeObject(
        JsonConvert.SerializeObject(extracted, Formatting.Indented));

    // --- Question ---
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("8acb1ac5-0acd-4c68-9eeb-e4afff5b39d8", question["id"].Value, "The extracted id is incorrect");
    Assert.AreEqual(
        "I want to reserve my free copy of Windows 10, but I don’t see the icon on the taskbar",
        question["title"].Value,
        "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreEqual(1642653, question["views"].Value, "The extracted views snippet is incorrect");

    // --- Question context ---
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
    Assert.AreEqual("PC", question["hints"][3].ToString(), "The 4th hint of the extracted question should be PC");

    // --- Answers ---
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(2, parsedJson["answers"].Count, "Extractor should find two answers in the thread summary of the HTML file");

    var secondAnswer = parsedJson["answers"][1];
    Assert.AreEqual("Most Helpful Reply", secondAnswer["type"].Value, "The extracted type of the answer is incorrect");
    Assert.AreNotEqual(null, secondAnswer["content"], "The content array in the extracted answer should not be null");
    Assert.IsTrue(secondAnswer["content"].Count > 0, "The content array in the extracted answer should have one or more items");
    Assert.AreEqual(4, secondAnswer["lists"].Count, "The lists array should have 4 items");
    Assert.IsTrue(secondAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

    // --- textAboveLength must be a positive integer on every list of every answer ---
    foreach (var answer in parsedJson["answers"])
    {
        var answerLists = answer["lists"];
        if (answerLists != null)
        {
            foreach (var entry in answerLists)
            {
                Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                int length = ((JValue)entry["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(
                    length > 0,
                    string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
            }
        }
    }
}
/// <summary>
/// Extracts the saved Quora thread with its config and checks the question, its hints,
/// the first two answers, and the <c>textAboveLength</c> value of every answer list.
/// </summary>
public void QuoraExtractionTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "quora.com.json"));
    var sut = new StructuredDataExtractor(rules);
    var pageHtml = File.ReadAllText(Path.Combine("TestData", "quora.com.html"));
    var extracted = sut.Extract(pageHtml);

    // Serialize and re-parse so every check below goes through dynamic access.
    dynamic parsedJson = JsonConvert.DeserializeObject(
        JsonConvert.SerializeObject(extracted, Formatting.Indented));

    // --- Question ---
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual(
        "What are some tips for creating a successful Kickstarter project?",
        question["title"].Value,
        "The extracted title is incorrect");

    // --- Question context ---
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(5, question["hints"].Count, "The extracted question should have 5 hints");
    Assert.AreEqual("Kickstarter", question["hints"][3].ToString(), "The 4th hint of the extracted question should be Kickstarter");

    // --- Answers ---
    Assert.AreNotEqual(null, parsedJson["answers"], "Extractor should find answers in the HTML file");
    Assert.AreEqual(5, parsedJson["answers"].Count, "Extractor should find 5 answers in the thread summary of the HTML file");

    var firstAnswer = parsedJson["answers"][0];
    Assert.AreNotEqual(null, firstAnswer["content"], "The content string should not be null in the extracted answer");
    Assert.IsTrue(firstAnswer["content"].Value.Length > 0, "The content string should not be empty in the extracted answer");
    Assert.AreEqual(6800, firstAnswer["views"].Value, "The extracted views count is incorrect");
    Assert.AreEqual(1, firstAnswer["lists"].Count, "The lists array should have 1 item");
    Assert.IsTrue(firstAnswer["lists"][0]["items"].Count > 0, "First item in the lists array should have at least one item");

    var secondAnswer = parsedJson["answers"][1];
    Assert.AreEqual(2, secondAnswer["views"].Value, "The extracted views count is incorrect");

    // --- textAboveLength must be a positive integer on every list of every answer ---
    foreach (var answer in parsedJson["answers"])
    {
        var answerLists = answer["lists"];
        if (answerLists != null)
        {
            foreach (var entry in answerLists)
            {
                Assert.AreEqual(JTokenType.Integer, entry["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
                int length = ((JValue)entry["textAboveLength"]).ToObject<int>();
                Assert.IsTrue(
                    length > 0,
                    string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", length));
            }
        }
    }
}
/// <summary>
/// Downloads a celebrity's detail page and returns the extracted "birth" value.
/// </summary>
/// <param name="jToken">
/// Entry whose "page" value is a site-relative path. It is rewritten in place to the
/// absolute URL before the page is downloaded.
/// </param>
/// <returns>The <c>celebrities.birth</c> token from the detail-page extraction.</returns>
private JToken ScrapeCeleb(JToken jToken)
{
    var detailConfig = StructuredDataConfig.ParseJsonString(CelebConfig);

    // Side effect visible to the caller: "page" becomes an absolute URL.
    jToken["page"] = "https://www.imdb.com" + jToken["page"];
    string detailUrl = jToken["page"].ToString();

    var detailHtml = DownloadPage(detailUrl);
    var detailExtractor = new StructuredDataExtractor(detailConfig);
    var detail = detailExtractor.Extract(detailHtml);

    return detail["celebrities"]["birth"];
}
/// <summary>
/// Extracts the second saved Stack Overflow example with the stackexchange config and checks
/// the question, its hints, the best answer, and the <c>textAboveLength</c> value of every
/// best-answer list.
/// </summary>
public void StackExchangeEx2ExtractionTest()
{
    var configPath = Path.Combine("TestData", "stackexchange.com.json");
    var config = StructuredDataConfig.ParseJsonFile(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText(Path.Combine("TestData", "stackoverflow.com.example2.html")));

    // Round-trip through JSON text so the assertions can use dynamic access.
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("How to configure Visual Studio 2008 to use IIS Express?", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
    Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
    Assert.AreEqual(9, question["votes"].Value, "The votes extracted from the question should have a value of 9");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(2, question["hints"].Count, "The extracted question should have 2 hints");

    // The message now names the value actually expected; it previously said "passwords".
    Assert.AreEqual("iis-express", question["hints"][1].ToString(), "The 2nd hint of the extracted question should be iis-express");

    // Best Answer
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
    Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
    Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
    Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
    Assert.AreEqual(17, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 17");
    Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");
    Assert.AreEqual(7, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 7 items");

    // Check that textAboveLength exists in each list and is a positive integer
    var lists = bestAnswer["lists"];
    if (lists != null)
    {
        foreach (var list in lists)
        {
            Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
        }
    }
}
/// <summary>
/// Extracts the first saved Stack Overflow example with the stackexchange config and checks
/// the question, its hints, the best answer, and the <c>textAboveLength</c> value of every
/// best-answer list.
/// </summary>
public void StackExchangeEx1ExtractionTest()
{
    // Both files are referenced by bare name, i.e. relative to the current directory.
    var configPath = "stackexchange.com.json";
    var config = StructuredDataConfig.Parse(configPath);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(File.ReadAllText("stackoverflow.com.example1.html"));

    // Round-trip through JSON text so the assertions can use dynamic access.
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    // Question
    Assert.AreNotEqual(null, parsedJson["question"], "Extractor should find a question in the HTML file");
    var question = parsedJson["question"];
    Assert.AreEqual("Is there a way to crack the password on an Excel VBA Project?", question["title"].Value, "The extracted title is incorrect");
    Assert.AreNotEqual(null, question["content"], "The extracted question should have a content");
    Assert.AreEqual(JTokenType.String, question["content"].Type, "The extracted question content should be a string");
    Assert.IsTrue(question["content"].Value.Length > 0, "The extracted question content should have a length > 0");
    Assert.AreNotEqual(null, question["votes"], "The extracted question should have a votes field");
    Assert.AreEqual(JTokenType.Integer, question["votes"].Type, "The votes extracted from the question should be of type int");
    Assert.AreEqual(196, question["votes"].Value, "The votes extracted from the question should have a value of 196");

    // Question context
    Assert.AreNotEqual(null, question["hints"], "The extracted question should have hints");
    Assert.AreEqual(4, question["hints"].Count, "The extracted question should have 4 hints");
    Assert.AreEqual("passwords", question["hints"][3].ToString(), "The 4th hint of the extracted question should be passwords");

    // Best Answer
    var bestAnswer = parsedJson["bestAnswer"];
    Assert.AreNotEqual(null, bestAnswer["content"], "The extracted answer should have a content");
    Assert.AreEqual(JTokenType.String, bestAnswer["content"].Type, "The extracted answer content should be a string");
    Assert.IsTrue(bestAnswer["content"].Value.Length > 0, "The extracted answer content should have a length > 0");
    Assert.AreNotEqual(null, bestAnswer["votes"], "The extracted answer should have a votes field");
    Assert.AreEqual(JTokenType.Integer, bestAnswer["votes"].Type, "The votes extracted from the answer should be of type int");
    Assert.AreEqual(153, bestAnswer["votes"].Value, "The votes extracted from the answer should have a value of 153");
    Assert.AreEqual(1, bestAnswer["lists"].Count, "The lists array should have 1 item");

    // The message now matches the asserted count; it previously said "4 items".
    Assert.AreEqual(8, bestAnswer["lists"][0]["items"].Count, "The first item in the lists array should have 8 items");

    // Check that textAboveLength exists in each list and is a positive integer
    var lists = bestAnswer["lists"];
    if (lists != null)
    {
        foreach (var list in lists)
        {
            Assert.AreEqual(JTokenType.Integer, list["textAboveLength"].Type, "The extracted textAboveLength should be an integer");
            var textAboveLength = ((JValue)list["textAboveLength"]).ToObject<int>();
            Assert.IsTrue(textAboveLength > 0, string.Format(CultureInfo.InvariantCulture, "textAboveLength was not greater than 0. The extracted value is: {0}", textAboveLength));
        }
    }
}
/// <summary>
/// Extracts the saved article with the date-parsing rules and checks the three parsed
/// date fields against their expected values.
/// </summary>
public void ParseDateTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "parse_date_rules.json"));
    var sut = new StructuredDataExtractor(rules);
    var articleHtml = File.ReadAllText(Path.Combine("TestData", "article_with_date.html"));
    var extracted = sut.Extract(articleHtml);

    // Serialize and re-parse so the checks below go through dynamic access.
    dynamic parsedJson = JsonConvert.DeserializeObject(
        JsonConvert.SerializeObject(extracted, Formatting.Indented));

    Assert.AreEqual(
        DateTime.Parse("2018-11-24T00:00:00"),
        parsedJson["parsedDateNoFormat"].Value);
    Assert.AreEqual(
        DateTime.Parse("2011-12-30T00:00:00"),
        parsedJson["parsedDateWithFormat"].Value);
    Assert.AreEqual(
        DateTime.Parse("2008-06-12T00:00:00"),
        parsedJson["parsedDateNoFormatWithProviderStyle"].Value);
}
/// <summary>
/// Creates an extractor from a JSON config file.
/// </summary>
/// <param name="configFile">File whose full text is parsed as the extraction rules.</param>
/// <exception cref="ArgumentNullException"><paramref name="configFile"/> is null.</exception>
public StructuredDataExtractor(FileInfo configFile)
{
    if (configFile == null)
    {
        throw new ArgumentNullException(nameof(configFile));
    }

    // Read the whole file and parse it in one step.
    config = StructuredDataConfig.ParseJsonString(File.ReadAllText(configFile.FullName));

    // Transformations are loaded only after the config has been parsed.
    LoadTransformations();
}
/// <summary>
/// Extracts the saved article that contains a comments div and checks the resulting
/// title and body text.
/// </summary>
public void RemoveXPathsExtractionTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "article_with_comments_div.json"));
    var sut = new StructuredDataExtractor(rules);
    var articleHtml = File.ReadAllText(Path.Combine("TestData", "article_with_comments_div.html"));
    var extracted = sut.Extract(articleHtml);

    // Serialize and re-parse so the checks below go through dynamic access.
    dynamic parsedJson = JsonConvert.DeserializeObject(
        JsonConvert.SerializeObject(extracted, Formatting.Indented));

    Assert.AreEqual("Article title", parsedJson["title"].Value, "The extracted title is incorrect");
    Assert.AreEqual("Para1 content Para2 content", parsedJson["body"].Value, "The extracted body is incorrect");
}
/// <summary>
/// Loads an embedded JSON config from the assembly that contains <c>WikiSearcher</c>.
/// </summary>
/// <param name="resourceName">
/// Exact manifest resource name, or a fragment of one. When no resource has exactly this
/// name, the first resource whose name contains it is used.
/// </param>
/// <returns>The config parsed from the resource text.</returns>
/// <exception cref="ArgumentNullException">No resource matches <paramref name="resourceName"/>.</exception>
private static ConfigSection CreateConfig(string resourceName)
{
    var assembly = typeof(WikiSearcher).Assembly;
    string[] names = assembly.GetManifestResourceNames();

    // Fall back to a substring match only when there is no exact match.
    if (!names.Any(n => n.Equals(resourceName)))
    {
        resourceName = names.FirstOrDefault(n => n.Contains(resourceName));
    }

    // Dispose the stream and reader once the text has been read; they were previously left open.
    using (var stream = assembly.GetManifestResourceStream(resourceName))
    using (var reader = new System.IO.StreamReader(stream))
    {
        return StructuredDataConfig.ParseJsonString(reader.ReadToEnd());
    }
}
/// <summary>
/// Extracts the saved article with the regex rules and checks that the result is deeply
/// equal to the expected JSON stored alongside the test data.
/// </summary>
public void RegexTest()
{
    var rules = StructuredDataConfig.ParseJsonFile(Path.Combine("TestData", "regex_rules.json"));
    var sut = new StructuredDataExtractor(rules);
    var articleHtml = File.ReadAllText(Path.Combine("TestData", "article_with_date.html"));
    var extracted = sut.Extract(articleHtml);

    // Actual: the extraction result, round-tripped through indented JSON text.
    var actual = JObject.Parse(JsonConvert.SerializeObject(extracted, Formatting.Indented));

    // Expected: the stored reference document.
    var expectedText = File.ReadAllText(Path.Combine("TestData", "regex_expected_result.json"));
    var expected = JObject.Parse(expectedText);

    Assert.IsTrue(JToken.DeepEquals(actual, expected));
}
/// <summary>
/// Prepares everything the instance needs before use: the NER classifier, the location
/// regex, the artist-page extractor, and the three hosted feature tables (loaded in order).
/// </summary>
private async Task Init()
{
    // --- Named-entity classifier -------------------------------------------------------
    // Root folder of the classifier distribution and its "classifiers" subfolder.
    var nerRoot = @"C:\stanford-ner-2018-10-16";
    var classifiersFolder = nerRoot + @"\classifiers";

    // Load the 3-class model.
    var modelPath = classifiersFolder + @"\english.all.3class.distsim.crf.ser.gz";
    _classifier = CRFClassifier.getClassifierNoExceptions(modelPath);

    // --- Location tag matcher ----------------------------------------------------------
    // Captures the text between <LOCATION ...> and </LOCATION>, ignoring case.
    _locationRx = new Regex(
        @"<LOCATION\b[^>]*>(.*?)</LOCATION>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // --- Artist / listener page extractor ----------------------------------------------
    var artistRulesJson = @" { 'artist': '//h1[contains(@class, \'view-header\')]', 'about': '//div[contains(@class, \'bio-primary\')]', 'more': '//div[contains(@class, \'bio-secondary\')]', 'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]', 'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]' }";
    ConfigSection artistRules = StructuredDataConfig.ParseJsonString(artistRulesJson);
    _artistScraping = new StructuredDataExtractor(artistRules);

    // --- Hosted feature layers for editing ---------------------------------------------
    ArcGISPortal portal = await ArcGISPortal.CreateAsync();

    PortalItem hometownItem = await PortalItem.CreateAsync(portal, _hometownLayerId);
    PortalItem otherPointsItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);
    PortalItem listenerItem = await PortalItem.CreateAsync(portal, _listenerLayerId);

    // Each table is built from layer index 0 of its portal item.
    _hometownTable = new ServiceFeatureTable(hometownItem, 0);
    _otherPointsTable = new ServiceFeatureTable(otherPointsItem, 0);
    _listenerTable = new ServiceFeatureTable(listenerItem, 0);

    await _hometownTable.LoadAsync();
    await _otherPointsTable.LoadAsync();
    await _listenerTable.LoadAsync();
}
/// <summary>
/// Demo: deserializes a hard-coded schema.org Recipe JSON-LD document into a
/// <c>Recipe</c>, writes it to the console and waits for a key press.
/// </summary>
/// <remarks>
/// Nothing is downloaded or scraped here; the JSON-LD payload is assembled inline.
/// </remarks>
static void ExtractJsonLdRecipe()
{
    // Assemble the sample JSON-LD document piece by piece.
    StringBuilder sb = new StringBuilder("{");
    sb.Append("\"@context\":\"http://schema.org/\",");
    sb.Append("\"@type\":\"Recipe\",");
    sb.Append("\"name\":\"Keto Coffee & Chocolate Tart\",\"author\":{ \"@type\":\"Person\",\"name\":\"Sahil Makhija\"},\"datePublished\":\"2009-11-05T00:00:00+00:00\",");
    sb.Append("\"description\":\"A delicious layered low carb dessert with the flavours of chocolate and coffee\",\"recipeYield\":\"3 servings\",\"aggregateRating\":{ \"@type\":\"AggregateRating\",\"ratingValue\":\"5\",\"ratingCount\":\"6\"},");
    sb.Append("\"prepTime\":\"PT10M\",\"cookTime\":\"PT20M\",");
    sb.Append("\"recipeIngredient\":[\"45 grams Almond Flour ( I use this one )\",\"30 grams Salted Butter\",\"1 Tbsp Unsweetened Coco Powder ( I recommend this one )\",\"150 grams Mascarpone cheese\",\"1 Tsp Vanilla Extract\",\"2 Tbsp Water\",\"1 Tsp Instant espresso powder\",\"100 ml Heavy Cream\",\"30 grams Dark Chocolate (85% or Higher) (I use Lindt 85%)\",\"Stevia to taste\"],");
    sb.Append("\"recipeInstructions\":[");
    sb.Append("\"Microwave the butter for 30 seconds till melted\",\"Add in your stevia/sweetner to taste, vanilla essence and the coco powder and mix well together\",\"Add in the almond flour and combine till well incorporated\",");
    sb.Append("\"Divide the mixture in 3 tart tins or ramekins and shape the base\",\"Bake at 175 C/ 350 F for 10 minutes and then allow them to cool\",\"Heat 2 tablespoons of water and mix 1 tsp of instant espresso powder into that\",");
    // This literal must stay on one physical line: a regular string literal cannot contain a line break.
    sb.Append("\"Whip the mascarpone cheese, stevia, vanilla extract and coffee mixture together till nice and fluffy\",\"Pour the mascarpone mixture over the base and chill in the fridge for 15 minutes\",");
    sb.Append("\"Meanwhile warm up the cream for 30 seconds in the microwave and add the chocolate and sweetner to that and mix till fully melted and you have a creamy ganache\",\"Pour the ganache over the mascarpone mousse in the tart molds and chill in the fridge for an hour\",\"Finish with some sea salt on top of each tart.\"],");
    sb.Append("\"recipeCategory\":\"Dessert\",\"recipeCuisine\":\"General\",\"suitableForDiet\": \"http://schema.org/LowFatDiet\"}");
    var json = sb.ToString();

    // Parse date strings as DateTimeOffset and register the date and ISO 8601 duration converters.
    var serializerSettings = new JsonSerializerSettings()
    {
        DateParseHandling = DateParseHandling.DateTimeOffset
    };
    serializerSettings.Converters.Add(new IsoDateTimeConverter());
    serializerSettings.Converters.Add(new TimeSpanToISO8601DurationValuesConverter());

    Recipe rec = JsonConvert.DeserializeObject<Recipe>(json, serializerSettings);

    Console.WriteLine("Extracting LD+JSON Recipe.....");
    Console.Write(rec);
    Console.ReadKey();
}
/// <summary>
/// Downloads the page at <c>UrlConstants.BaseUrl + _relativeUrl</c>, extracts structured
/// data from it using <c>ConfigurationJson</c>, and maps the result onto
/// <typeparamref name="T"/> via a JSON round-trip.
/// </summary>
/// <returns>The extracted data deserialized as <typeparamref name="T"/>.</returns>
public T Run()
{
    using (var webClient = new WebClient())
    {
        // Fetch first; the rules are parsed only once the download has succeeded.
        string pageHtml = webClient.DownloadString(UrlConstants.BaseUrl + _relativeUrl);

        var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(ConfigurationJson));
        var extracted = extractor.Extract(pageHtml);

        // Serialize the scraping result and read it back as the caller's type.
        string extractedJson = JsonConvert.SerializeObject(extracted, Formatting.Indented);
        T mapped = JsonConvert.DeserializeObject<T>(extractedJson);

        // WaitRandom runs after the work is done, just before the result is handed back.
        WaitRandom();
        return mapped;
    }
}
/// <summary>
/// Console entry point: loads the scraping rules from <c>match-result.config.json</c>,
/// downloads a fixed match page as UTF-8, prints the extracted data as indented JSON
/// and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var rules = StructuredDataConfig.ParseJsonString(File.ReadAllText(@"match-result.config.json"));

    string pageHtml;
    using (var webClient = new WebClient())
    {
        // Decode the response as UTF-8.
        webClient.Encoding = Encoding.UTF8;
        pageHtml = webClient.DownloadString("http://virtualsoccer.ru/viewmatch.php?day=12968&match_id=213340");
    }

    var extractor = new StructuredDataExtractor(rules);
    var matchData = extractor.Extract(pageHtml);

    Console.WriteLine(JsonConvert.SerializeObject(matchData, Formatting.Indented));
    Console.ReadKey();
}
/// <summary>
/// Downloads <paramref name="url"/> and, when the page embeds a JSON-LD
/// (<c>application/ld+json</c>) script whose text contains <c>"@type":"Recipe"</c>,
/// deserializes it and converts it with <c>RecipeBuilder</c>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The built recipe, or <c>null</c> when the page has no matching JSON-LD data or the
/// data could not be deserialized/converted.
/// </returns>
/// <remarks>
/// Download errors propagate to the caller. Deserialization/build errors are treated
/// as "no recipe", but are written to the trace listeners instead of being discarded.
/// </remarks>
public static MyRecipe ExtractRecipe(string url)
{
    // 1. Get the response from the url.
    string urlResponse;
    using (WebClient w = new WebClient())
    {
        urlResponse = w.DownloadString(url);
    }

    // 2. Scrape any structured JSON present (application/ld+json).
    var configJson = @"{ 'data': '//script[contains(@type, \'application\/ld+json\')]' }";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(urlResponse);
    if (scrapingResults == null)
    {
        return null;
    }

    var data = scrapingResults["data"];
    if (data == null)
    {
        return null;
    }

    // Only attempt deserialization when the payload declares itself a Recipe.
    var content = data.ToString();
    if (!content.Contains("\"@type\":\"Recipe\""))
    {
        return null;
    }

    try
    {
        var serializerSettings = new JsonSerializerSettings()
        {
            DateParseHandling = DateParseHandling.DateTimeOffset
        };
        Recipe rec = JsonConvert.DeserializeObject<Recipe>(content, serializerSettings);
        return new RecipeBuilder().Build(rec);
    }
    catch (Exception e)
    {
        // Best-effort: a malformed recipe still yields null, but the cause is recorded
        // so failures can be diagnosed.
        System.Diagnostics.Trace.TraceWarning(
            "ExtractRecipe: could not build a recipe from '{0}': {1}", url, e);
        return null;
    }
}