// Scrapes every "File" link from rule34.paheal.net's post list page and downloads
// each linked file into an "img" folder beneath the current working directory.
static void Main(string[] args)
{
    // XPath config: collect the href attribute of every <a> whose text contains "File".
    var configJson = @" { '':'//a[contains(text(), \'File\')]/@href' } ";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var html = @"http://rule34.paheal.net/post/list";

    HtmlWeb web = new HtmlWeb();
    var htmlDoc = web.Load(html);
    var body = htmlDoc.Text;

    var path = Directory.GetCurrentDirectory() + "\\img\\";

    // Make sure the download folder exists before writing any files.
    // (Renamed from the original "CreataFolder" typo; the function is local, so
    // no external caller is affected.)
    void CreateFolder()
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }
    }

    CreateFolder();

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(body);

    // Characters stripped from each serialized entry: backslash, double quote, '{', space.
    char[] charstotrim = { '\x5C', '\x22', '\x7B', '\x20' };

    // NOTE(review): the URLs are recovered by slicing the serialized JSON text at
    // fixed offsets; this is fragile but preserved as-is because the downstream
    // trimming depends on these exact offsets.
    var output = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented).Split(',');
    output[0] = output[0].Remove(0, 10);
    output[output.Length - 1] =
        output[output.Length - 1].Remove(output[output.Length - 1].Length - 8, 8);

    string fileName;

    // WebClient is IDisposable: dispose it once all downloads finish (fixes a
    // resource leak — the previous version never disposed the client).
    using (WebClient wc = new WebClient())
    {
        for (int i = 0; i < output.Length; i++)
        {
            output[i] = output[i].Remove(0, 7);
            output[i] = output[i].Trim(charstotrim);

            if (output[i].Contains("webm"))
            {
                fileName = path + i + ".webm";
            }
            else
            {
                // Use the last three characters of the URL as the file extension.
                fileName = path + i + "." + output[i].Remove(0, output[i].Length - 3);
            }

            Console.WriteLine(fileName);
            wc.DownloadFile(output[i], fileName);
        }
    }

    Console.WriteLine("----------------------------");
    for (int i = 0; i < output.Length; i++)
    {
        Console.WriteLine(output[i]);
    }

    Console.ReadKey();
}
// Scrapes the f2pool dashboard for the given pool and stores a hashrate/worker summary.
// On failure, logs the first error only (Commonflag.flag1 suppresses repeats).
public static void WorkerSummary1(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Everything after driver creation runs inside the try so the finally
        // block always shuts the browser down (previously a failure in config
        // loading or navigation leaked the ChromeDriver process).
        var jsonConfig = File.ReadAllText(@"Json\f2pool.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        Thread.Sleep(8000); // crude wait for the JS-rendered page to finish loading
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());

        JToken json = jObject["data"];
        // json[0] (the "all workers" count) is intentionally unused.
        var active = (int)json[1];
        var inactive = (int)json[2];
        var dead = 0;
        if (json.Count() == 4)
        {
            dead = (int)json[3];
        }

        JToken json2 = jObject["currenthash"];
        JToken json3 = jObject["dailyhash"];
        var temp1 = (string)json2;
        var temp2 = (string)json3;
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag1 suppresses repeated error reports.
        if (Commonflag.flag1)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag1 = false;
            UpdateErrorLog("f2pool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
// Verifies that ExtractText + HtmlDecode + RemoveExtraWhitespace together join
// adjacent nodes with a single space and decode HTML entities.
public void RemoveExtraWhitespaceTransformationTest()
{
    // The quotes around "the final frontier" must be HTML-encoded (&quot;):
    // a raw '"' here terminated the C# string literal and did not compile, and
    // the HtmlDecodeTransformation under test is what turns &quot; into '"'.
    var html = "<html><body><div id='content'><a href=''>A link</a>with adjacent text. &quot;the final frontier&quot;</div></body></html>";
    var configJson = @" { 'text': { '_xpath': '//div[@id=\'content\']', '_transformations': [ 'ExtractTextTransformation', 'HtmlDecodeTransformation', 'RemoveExtraWhitespaceTransformation' ] } } ";

    var config = StructuredDataConfig.ParseJsonString(configJson);
    var extractor = new StructuredDataExtractor(config);
    var result = extractor.Extract(html);
    var json = JsonConvert.SerializeObject(result, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(json);

    Assert.AreEqual("A link with adjacent text. \"the final frontier\"", parsedJson["text"].Value);
}
// Scrapes the IMDB top-100 page, enriches each celebrity with birth details,
// downloads their images, and persists the full result set as JSON.
public void Scrape()
{
    var config = StructuredDataConfig.ParseJsonString(Top100Config);
    var html = DownloadPage(saveTo: IMDBhtml);

    var extractor = new StructuredDataExtractor(config);
    var results = extractor.Extract(html);

    using (WebClient downloader = new WebClient())
    {
        foreach (var celeb in results["celebrities"])
        {
            // Fetch per-celebrity details and attach them to the record.
            celeb["birth"] = ScrapeCeleb(celeb);

            var imageUrl = celeb["image"].ToString();
            Uri imageUri = new Uri(imageUrl);
            string localName = Path.GetFileName(imageUri.LocalPath);
            downloader.DownloadFile(imageUrl, imgPath + localName);
        }
    }

    var settings = new JsonSerializerSettings
    {
        StringEscapeHandling = StringEscapeHandling.Default
    };
    string serialized = JsonConvert.SerializeObject(results, settings);
    File.WriteAllText(DBfile, serialized);
}
// Builds a data extractor from the JSON scraping configuration registered under the given name.
public IDataExtractor <TRawData> CreateFromName(string name)
{
    var parsedConfig = StructuredDataConfig.ParseJsonString(GetByName(name));
    return getDataExtractor(parsedConfig);
}
// Extracts structured data from the raw content using the configured JSON
// scraping rules and forwards the result to the downstream sender.
public async Task Transform(string content)
{
    var parsedConfig = StructuredDataConfig.ParseJsonString(_jsonConfig);
    var extractor = new StructuredDataExtractor(parsedConfig);
    var extracted = extractor.Extract(content);
    await _sender.Send(extracted.ToString());
}
// Resolves either (a) an ID extracted from the final redirected URL via regex
// (json.isID == true), or (b) structured data scraped from the page body.
public IActionResult JsonResult([FromBody] ObjectJson json)
{
    var url = WebUtility.UrlDecode(json.url);

    // MyWebClient derives from WebClient (IDisposable); dispose it in both
    // branches — the previous version never disposed the client.
    using (MyWebClient client = new MyWebClient() { Encoding = Encoding.UTF8 })
    {
        client.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";

        if (json.isID)
        {
            // Follow redirects so ResponseUri holds the final URL, then pull the
            // requested capture group out of it.
            client.DownloadData(url);
            var mainUrl = client.ResponseUri.ToString();
            var regex = JsonConvert.DeserializeObject <JsonIDInput>(json.data);
            string item = "";
            try
            {
                var regexMatch = Regex.Match(mainUrl, regex._xpath);
                item = regexMatch.Groups[regex.group_number].Value;
            }
            catch (Exception)
            {
                // Best effort: an invalid pattern or group index leaves item empty.
            }
            return(Json(JsonConvert.SerializeObject(new JsonIDresult { url = mainUrl, id = item }, Formatting.Indented)));
        }

        var baseUri = new Uri(url);
        var isScript = json.javascript;
        var config = StructuredDataConfig.ParseJsonString(json.data);

        // When JavaScript rendering is required, fetch through the Selenium proxy service.
        var html = client.DownloadString(!isScript
            ? url
            : this.configuration.GetAppSetting("UrlSeleniumGetHtmlExcuteJavascript") + "?url=" + WebUtility.UrlEncode(url));

        HtmlDocument docc = new HtmlDocument();
        docc.LoadHtml(html);

        // Rewrite every non-empty href to an absolute URL against the page's base URI.
        HtmlNodeCollection nodes = docc.DocumentNode.SelectNodes("//a");
        if (nodes != null)
        {
            foreach (HtmlNode node in nodes)
            {
                if ((node.Attributes["href"] != null) && (node.Attributes["href"].Value != ""))
                {
                    try
                    {
                        var urltmp = node.Attributes["href"].Value.Trim();
                        node.Attributes["href"].Value = new Uri(baseUri, urltmp).AbsoluteUri;
                    }
                    catch (Exception)
                    {
                        // Best effort: malformed hrefs are left untouched.
                    }
                }
            }
        }

        html = docc.DocumentNode.InnerHtml;
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(html);
        var result = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented);
        return(Json(result));
    }
}
// Scrapes the spiderpool worker-detail page for the given pool and stores a summary.
public static void WorkerSummary7(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Setup and navigation run inside the try so the finally block always
        // shuts the browser down (previously a navigation failure leaked the driver).
        var jsonConfig = File.ReadAllText(@"Json\spiderpool.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        Thread.Sleep(1000); // crude wait for the JS-rendered page to finish loading
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp1 = (string)json[8];
        var temp2 = (string)json[10];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // json[6] holds "active/total"; split it around the slash.
        var temp = (string)json[6];
        var active = Int32.Parse(temp.Substring(0, temp.IndexOf('/')));
        var total = Int32.Parse(temp.Substring(temp.LastIndexOf('/') + 1));
        int inactive = total - active;
        int dead = 0; // no dead-worker figure scraped for this pool

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag7 suppresses repeated error reports.
        if (Commonflag.flag7)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag7 = false;
            // Fixed copy-paste bug: this method scrapes spiderpool, not viabtc,
            // so errors are now filed under the correct pool name.
            UpdateErrorLog("spiderpool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
// Scrapes the viabtc observer dashboard for the given pool and stores a summary.
public static void WorkerSummary6(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Setup and navigation run inside the try so the finally block always
        // shuts the browser down (previously a navigation failure leaked the driver).
        var jsonConfig = File.ReadAllText(@"Json\viabtc.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp1 = (string)json[0];
        var temp2 = (string)json[2];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        var active = Int32.Parse((string)json[3]);
        var inactive = Int32.Parse((string)json[4]);
        // Removed the unused 'total' local; viabtc exposes no dead-worker figure.
        int dead = 0;

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag6 suppresses repeated error reports.
        if (Commonflag.flag6)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag6 = false;
            UpdateErrorLog("viabtc", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
// Builds the scraping configuration on the fly, extracts structured data from
// the raw content, and forwards the JSON result to the downstream sender.
public async Task Transform(string content)
{
    // The object template lives in Shared because the Loader must deserialize
    // the very same shape on its side.
    var generatedConfig = GenerateJson();
    var parsedConfig = StructuredDataConfig.ParseJsonString(generatedConfig);

    var extractor = new StructuredDataExtractor(parsedConfig);
    var extracted = extractor.Extract(content);

    await _sender.Send(extracted.ToString());
}
// Builds an extractor from a raw JSON configuration string.
// A null or empty configuration is rejected up front.
public StructuredDataExtractor(string configString)
{
    if (configString is null || configString.Length == 0)
    {
        throw new ArgumentNullException(nameof(configString));
    }

    config = StructuredDataConfig.ParseJsonString(configString);
    LoadTransformations();
}
// Scrapes the antpool dashboard for the given pool and stores a summary.
public static void WorkerSummary5(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Setup and navigation run inside the try so the finally block always
        // shuts the browser down (previously a navigation failure leaked the driver).
        var jsonConfig = File.ReadAllText(@"Json\antpool.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        Thread.Sleep(1000); // crude wait for the JS-rendered page to finish loading
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp = (string)json[0];  // worker-count text, split below
        var temp1 = (string)json[1]; // current hashrate text
        var temp2 = (string)json[3]; // daily hashrate text
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // Split on runs of non-digits; the first two numbers embedded in the
        // text are taken as active and total worker counts.
        var numbers = Regex.Split(temp.Trim(), @"\D+");
        var active = Int32.Parse(numbers[0]);
        var total = Int32.Parse(numbers[1]);
        var inactive = total - active;
        int dead = 0; // no dead-worker figure scraped for this pool

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag5 suppresses repeated error reports.
        if (Commonflag.flag5)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag5 = false;
            UpdateErrorLog("antpool", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
// Loads a celebrity's detail page and returns the scraped birth information.
private JToken ScrapeCeleb(JToken jToken)
{
    var celebConfig = StructuredDataConfig.ParseJsonString(CelebConfig);

    // The list page stores a relative path; make it absolute before downloading.
    jToken["page"] = "https://www.imdb.com" + jToken["page"];
    var pageHtml = DownloadPage(jToken["page"].ToString());

    var extractor = new StructuredDataExtractor(celebConfig);
    var extracted = extractor.Extract(pageHtml);
    return(extracted["celebrities"]["birth"]);
}
// Builds an extractor from a JSON configuration file on disk.
public StructuredDataExtractor(FileInfo configFile)
{
    if (configFile is null)
    {
        throw new ArgumentNullException(nameof(configFile));
    }

    var rawConfig = File.ReadAllText(configFile.FullName);
    config = StructuredDataConfig.ParseJsonString(rawConfig);
    LoadTransformations();
}
// Loads a scraping configuration from an embedded resource. The name may be a
// full manifest resource name or any unique substring of one.
private static ConfigSection CreateConfig(string resourceName)
{
    var assembly = typeof(WikiSearcher).Assembly;
    string[] names = assembly.GetManifestResourceNames();

    // Fall back to substring matching when no exact name is found.
    // (Ordinal comparison matches string.Equals' default; stated explicitly.)
    if (!names.Any(n => n.Equals(resourceName, StringComparison.Ordinal)))
    {
        resourceName = names.FirstOrDefault(n => n.Contains(resourceName));
    }

    // Previously a missing resource fell through to GetManifestResourceStream(null),
    // which threw an opaque ArgumentNullException; fail with a clear message instead.
    if (resourceName == null)
    {
        throw new ArgumentException("No embedded resource matches the requested name.", nameof(resourceName));
    }

    // Dispose the manifest stream and reader (fixes a resource leak — the
    // previous version never disposed either).
    using (var stream = assembly.GetManifestResourceStream(resourceName))
    using (var reader = new System.IO.StreamReader(stream))
    {
        return(StructuredDataConfig.ParseJsonString(reader.ReadToEnd()));
    }
}
// One-time setup: loads the Stanford NER classifier, compiles the location
// regex, builds the artist-page scraping rules, and opens the three hosted
// feature tables used for editing.
private async Task Init()
{
    // Stanford NER model files live under this root.
    var nerRoot = @"C:\stanford-ner-2018-10-16";
    var modelsFolder = nerRoot + @"\classifiers";

    // Load the 3-class classifier model.
    _classifier = CRFClassifier.getClassifierNoExceptions(modelsFolder + @"\english.all.3class.distsim.crf.ser.gz");

    // Matches <LOCATION ...>...</LOCATION> spans emitted by the classifier.
    _locationRx = new Regex(@"<LOCATION\b[^>]*>(.*?)</LOCATION>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // XPath rules for scraping artist and listener details from the artist page.
    var configArtistInfoJson = @" { 'artist': '//h1[contains(@class, \'view-header\')]', 'about': '//div[contains(@class, \'bio-primary\')]', 'more': '//div[contains(@class, \'bio-secondary\')]', 'listeners-city': '//span[contains(@class, \'horizontal-list__item__title\')]', 'listeners': '//span[contains(@class, \'horizontal-list__item__subtitle\')]' }";
    ConfigSection artistConfig = StructuredDataConfig.ParseJsonString(configArtistInfoJson);
    _artistScraping = new StructuredDataExtractor(artistConfig);

    // Resolve the hosted feature layers used for editing.
    ArcGISPortal portal = await ArcGISPortal.CreateAsync();
    PortalItem hometownItem = await PortalItem.CreateAsync(portal, _hometownLayerId);
    PortalItem otherPointsItem = await PortalItem.CreateAsync(portal, _otherPointsLayerId);
    PortalItem listenerItem = await PortalItem.CreateAsync(portal, _listenerLayerId);

    _hometownTable = new ServiceFeatureTable(hometownItem, 0);
    _otherPointsTable = new ServiceFeatureTable(otherPointsItem, 0);
    _listenerTable = new ServiceFeatureTable(listenerItem, 0);

    // Load sequentially, matching the original ordering.
    await _hometownTable.LoadAsync();
    await _otherPointsTable.LoadAsync();
    await _listenerTable.LoadAsync();
}
// Demo: assembles a hard-coded schema.org/Recipe LD+JSON document by hand,
// deserializes it into a Recipe using ISO-8601 date/duration converters,
// and prints the result to the console.
static void ExtractJsonLdRecipe()
{
    //// 1.
    //// URL: http://en.wikipedia.org/wiki/Main_Page
    //WebClient w = new WebClient();
    //string s = w.DownloadString("https://headbangerskitchen.com/recipe/low-carb-dessert/");

    // Scraping config for the page title and the ld+json <script> element.
    // NOTE(review): 'config' is parsed but never used below — presumably a
    // leftover from the commented-out download path above; confirm before removal.
    var configJson = @" { 'title': '//h1', 'body': '//script[contains(@type, \'application\/ld+json\')]' }";
    var config = StructuredDataConfig.ParseJsonString(configJson);

    // Build the LD+JSON Recipe payload piece by piece.
    StringBuilder sb = new StringBuilder("{");
    sb.Append("\"@context\":\"http://schema.org/\",");
    sb.Append("\"@type\":\"Recipe\",");
    // Name, author, publication date.
    sb.Append("\"name\":\"Keto Coffee & Chocolate Tart\",\"author\":{ \"@type\":\"Person\",\"name\":\"Sahil Makhija\"},\"datePublished\":\"2009-11-05T00:00:00+00:00\",");
    // Description, yield, and aggregate rating.
    sb.Append("\"description\":\"A delicious layered low carb dessert with the flavours of chocolate and coffee\",\"recipeYield\":\"3 servings\",\"aggregateRating\":{ \"@type\":\"AggregateRating\",\"ratingValue\":\"5\",\"ratingCount\":\"6\"},");
    // Durations in ISO-8601 form, consumed by TimeSpanToISO8601DurationValuesConverter.
    sb.Append("\"prepTime\":\"PT10M\",\"cookTime\":\"PT20M\",");
    sb.Append("\"recipeIngredient\":[\"45 grams Almond Flour ( I use this one )\",\"30 grams Salted Butter\",\"1 Tbsp Unsweetened Coco Powder ( I recommend this one )\",\"150 grams Mascarpone cheese\",\"1 Tsp Vanilla Extract\",\"2 Tbsp Water\",\"1 Tsp Instant espresso powder\",\"100 ml Heavy Cream\",\"30 grams Dark Chocolate (85% or Higher) (I use Lindt 85%)\",\"Stevia to taste\"],");
    sb.Append("\"recipeInstructions\":[");
    sb.Append("\"Microwave the butter for 30 seconds till melted\",\"Add in your stevia/sweetner to taste, vanilla essence and the coco powder and mix well together\",\"Add in the almond flour and combine till well incorporated\",");
    sb.Append("\"Divide the mixture in 3 tart tins or ramekins and shape the base\",\"Bake at 175 C/ 350 F for 10 minutes and then allow them to cool\",\"Heat 2 tablespoons of water and mix 1 tsp of instant espresso powder into that\",");
    sb.Append("\"Whip the mascarpone cheese, stevia, vanilla extract and coffee mixture together till nice and fluffy\",\"Pour the mascarpone mixture over the base and chill in the fridge for 15 minutes\",");
    sb.Append("\"Meanwhile warm up the cream for 30 seconds in the microwave and add the chocolate and sweetner to that and mix till fully melted and you have a creamy ganache\",\"Pour the ganache over the mascarpone mousse in the tart molds and chill in the fridge for an hour\",\"Finish with some sea salt on top of each tart.\"],");
    sb.Append("\"recipeCategory\":\"Dessert\",\"recipeCuisine\":\"General\",\"suitableForDiet\": \"http://schema.org/LowFatDiet\"}");
    var json = sb.ToString();

    // Parse dates as DateTimeOffset and register the ISO date/duration converters.
    var serializerSettings = new JsonSerializerSettings() { DateParseHandling = DateParseHandling.DateTimeOffset };
    serializerSettings.Converters.Add(new IsoDateTimeConverter());
    serializerSettings.Converters.Add(new TimeSpanToISO8601DurationValuesConverter());
    Recipe rec = JsonConvert.DeserializeObject <Recipe>(json, serializerSettings);

    Console.WriteLine("Extracting LD+JSON Recipe.....");
    Console.Write(rec);
    Console.ReadKey();
}
// Downloads the page at the configured relative URL, scrapes it according to
// ConfigurationJson, and materializes the result as a T. Waits a random
// interval before returning to throttle request rate.
public T Run()
{
    using (var client = new WebClient())
    {
        var html = client.DownloadString(UrlConstants.BaseUrl + _relativeUrl);

        var parsedConfig = StructuredDataConfig.ParseJsonString(ConfigurationJson);
        var extractor = new StructuredDataExtractor(parsedConfig);
        var extracted = extractor.Extract(html);

        // Round-trip through JSON to convert the scraping result into a T.
        var serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
        var typedResult = JsonConvert.DeserializeObject <T>(serialized);

        WaitRandom();
        return(typedResult);
    }
}
// Downloads a virtualsoccer.ru match page, scrapes it with the rules from
// match-result.config.json, and prints the structured result.
static void Main(string[] args)
{
    var config = StructuredDataConfig.ParseJsonString(File.ReadAllText(@"match-result.config.json"));

    string html;
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        html = client.DownloadString("http://virtualsoccer.ru/viewmatch.php?day=12968&match_id=213340");
    }

    var extractor = new StructuredDataExtractor(config);
    var extracted = extractor.Extract(html);

    Console.WriteLine(JsonConvert.SerializeObject(extracted, Formatting.Indented));
    Console.ReadKey();
}
// Downloads the page at 'url' and, when it embeds a schema.org Recipe as
// application/ld+json, parses it into a MyRecipe. Returns null when no recipe
// is found or parsing fails (best-effort by design).
public static MyRecipe ExtractRecipe(string url)
{
    string urlResponse;
    MyRecipe myRecipe = null;

    // 1. Get Response from url
    using (WebClient w = new WebClient())
    {
        urlResponse = w.DownloadString(url);
    }

    // 2. Check and scrape if any structured JSON is present (application/ld+json)
    var configJson = @"{ 'data': '//script[contains(@type, \'application\/ld+json\')]' }";
    var config = StructuredDataConfig.ParseJsonString(configJson);
    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(urlResponse);

    if (scrapingResults != null && scrapingResults["data"] != null)
    {
        var content = scrapingResults["data"].ToString();
        if (content.Contains("\"@type\":\"Recipe\""))
        {
            try
            {
                var serializerSettings = new JsonSerializerSettings()
                {
                    DateParseHandling = DateParseHandling.DateTimeOffset
                };
                Recipe rec = JsonConvert.DeserializeObject <Recipe>(content, serializerSettings);
                RecipeBuilder builder = new RecipeBuilder();
                myRecipe = builder.Build(rec);
            }
            catch (Exception)
            {
                // Malformed ld+json: deliberately swallow and fall through to a
                // null result (previously the exception was caught into an
                // unused variable with no explanation).
            }
        }
    }

    return(myRecipe);
}
// ExtractTextTransformation should join a link's text with the adjacent bare
// text node, separated by a single space.
public void ExtractTextTest()
{
    // Arrange: a link immediately followed by a bare text node.
    var html = "<html><body><div id='content'><a href=''>A link</a>with adjacent text.</div></body></html>";
    var configJson = @" { 'text': { '_xpath': '//div[@id=\'content\']', '_transformation': 'ExtractTextTransformation' } } ";

    // Act: extract and round-trip the result through JSON.
    var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(configJson));
    var extracted = extractor.Extract(html);
    var serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(serialized);

    // Assert: a single space separates the adjacent fragments.
    Assert.AreEqual("A link with adjacent text.", parsedJson["text"].Value);
}
// UrlEncodeTransformation should URL-encode the extracted attribute value.
public void UrlEncodeTest()
{
    // Arrange: an href containing a space that must be encoded as '+'.
    var html = "<html><body><div id='content'><a href='hello world'></a></div></body></html>";
    var configJson = @" { 'text': { '_xpath': '//div[@id=\'content\']/a/@href', '_transformation': 'UrlEncodeTransformation' } } ";

    // Act: extract and round-trip the result through JSON.
    var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(configJson));
    var extracted = extractor.Extract(html);
    var serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(serialized);

    // Assert: the space is URL-encoded.
    Assert.AreEqual("hello+world", parsedJson["text"].Value);
}
// CastToIntegerTransformation should produce a numeric (not string) value.
public void CastToIntegerTest()
{
    // Arrange: a meta tag whose content attribute holds a numeric string.
    var html = "<meta property=\"width\" content=\"1200\">";
    var configJson = @" { 'width': { '_xpath': '/meta[@property=\'width\']/@content', '_transformation': 'CastToIntegerTransformation' } } ";

    // Act: extract and round-trip the result through JSON.
    var extractor = new StructuredDataExtractor(StructuredDataConfig.ParseJsonString(configJson));
    var extracted = extractor.Extract(html);
    var serialized = JsonConvert.SerializeObject(extracted, Formatting.Indented);
    dynamic parsedJson = JsonConvert.DeserializeObject(serialized);

    // Assert: the value arrives as the integer 1200, not the string "1200".
    Assert.AreEqual(1200, parsedJson["width"].Value);
}
// Parses the current price and the "as of"/"at close" timestamp out of a Yahoo
// Finance page, normalizing the timestamp to IST and to the last business day.
// Throws (rethrows) on any scraping/parsing failure.
internal static StockPrice FilerTheStockpriceFromYahoo(string httpResposeMessage)
{
    StockPrice sp = new StockPrice();
    TimeZoneInfo INDIAN_ZONE = TimeZoneInfo.FindSystemTimeZoneById("India Standard Time");
    DateTime tm = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE);
    DateTime dt = DateTime.Today;

    // Deep positional XPaths into Yahoo's markup; brittle by nature.
    var configJson = @" { 'price':'//div[1]/div/div/div[1]/div/div[2]/div/div/div[4]/div/div/div/div[3]/div/div/span[1]', 'Closing':'//div[1]/div/div/div[1]/div/div[2]/div/div/div[4]/div/div/div/div[3]/div/div/div/span', 'DK':'//span[7]' }";

    try
    {
        var config = StructuredDataConfig.ParseJsonString(configJson);
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(httpResposeMessage);
        System.Diagnostics.Debug.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));

        // "Closing" looks like e.g. "At close: May 24 3:29PM IST" or
        // "As of May 27 9:30AM IST. Market open." — locate the IST token and
        // read the time/date tokens immediately before it.
        var thePrice = scrapingResults["price"];
        var splits = scrapingResults["Closing"].ToString().Trim().Split(' ', StringSplitOptions.RemoveEmptyEntries);
        var index = Array.FindIndex(splits, x => x.ToUpper().TrimEnd(new char[] { '.', ',' }) == "IST");
        string sDate = string.Empty, sTime = string.Empty;
        if (index > -1)
        {
            if (index - 1 > -1)
            {
                sTime = $"{splits[index - 1]}";
            }
            if (index - 3 > -1)
            {
                // Day then month, e.g. "24 May".
                sDate = $"{splits[index - 2]} {splits[index - 3]}";
            }
        }

        //** precaution in case missing date & time**//
        if (sDate.IsDateType())
        {
            dt = (DateTime)Convert.ChangeType(sDate, typeof(DateTime));
        }
        if (sTime.IsDateType())
        {
            tm = (DateTime)Convert.ChangeType(sTime, typeof(DateTime));
        }

        //** adjust the date if date is missing in the downloaded time stamp**//
        var currectedDate = dt.Date.Add(tm.TimeOfDay);
        currectedDate = currectedDate > TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE)
            ? currectedDate.AddDays(-1)
            : currectedDate;

        //** Convert the downloaded time into last business day**//
        if (currectedDate.IsWeekend())
        {
            currectedDate = currectedDate.PreviousWorkDay();
        }

        // Invariant culture: the scraped price uses '.' as the decimal separator
        // regardless of the host locale (previous culture-sensitive parse could
        // misread it on non-'.'-decimal systems).
        sp.Price = double.Parse(thePrice.ToString().Trim(), System.Globalization.CultureInfo.InvariantCulture);
        sp.ValueOn = currectedDate;
    }
    catch (Exception)
    {
        // Rethrow preserving the original stack trace — the old 'throw ex;' reset it (CA2200).
        throw;
    }

    return(sp);
}
// Parses the last-traded price and timestamp from a Rediff money page,
// normalizing to IST; rolls the date back a year when the scraped "dd MMM"
// date would land in the future. Throws (rethrows) on any failure.
internal static StockPrice FilerTheStockpriceFromRediff(string httpResposeMessage)
{
    TimeZoneInfo INDIAN_ZONE = TimeZoneInfo.FindSystemTimeZoneById("India Standard Time");
    DateTime tm = TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE);
    DateTime dt = DateTime.Today;
    StockPrice sp = new StockPrice();

    // Positional XPaths into Rediff's markup; brittle by nature.
    var configJson = @" { 'price':'//span[2]', 'LastTradedDate':'//span[6]', 'LastTradedTime':'//span[7]' }";

    try
    {
        var config = StructuredDataConfig.ParseJsonString(configJson);
        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(httpResposeMessage);
        System.Diagnostics.Debug.WriteLine(JsonConvert.SerializeObject(scrapingResults, Formatting.Indented));

        var thePrice = scrapingResults["price"];
        var theDate = scrapingResults["LastTradedDate"];
        var theTime = scrapingResults["LastTradedTime"];

        // Invariant culture: the scraped price uses '.' as the decimal separator
        // regardless of the host locale.
        sp.Price = double.Parse(thePrice.ToString().Trim(), CultureInfo.InvariantCulture);

        //** precaution in case missing date & time**//
        if (!string.IsNullOrEmpty(theDate.ToString()))
        {
            if (theDate.ToString().IsDateType())
            {
                dt = DateTime.ParseExact(theDate.ToString().Trim(), "dd MMM", CultureInfo.InvariantCulture);
            }
            if (theTime.ToString().IsDateType())
            {
                tm = (DateTime)Convert.ChangeType(theTime, typeof(DateTime));
            }
        }
        else
        {
            // The date field was empty: the time field may carry "date, time" or time alone.
            var DataNTime = theTime.ToString().Split(',', StringSplitOptions.RemoveEmptyEntries);
            if (DataNTime.Length == 2) //has date and time
            {
                if (DataNTime[0].IsDateType())
                {
                    dt = DateTime.ParseExact(DataNTime[0], "dd MMM", CultureInfo.InvariantCulture);
                }
                if (DataNTime[1].ToString().IsDateType())
                {
                    tm = (DateTime)Convert.ChangeType(DataNTime[1], typeof(DateTime));
                }
            }
            else //has time only
            {
                if (DataNTime[0].ToString().IsDateType())
                {
                    tm = (DateTime)Convert.ChangeType(DataNTime[0], typeof(DateTime));
                }
            }
        }

        //** adjust the date if date is missing in the downloaded time stamp**//
        var currectedDate = dt.Date.Add(tm.TimeOfDay);
        currectedDate = currectedDate > TimeZoneInfo.ConvertTimeFromUtc(DateTime.UtcNow, INDIAN_ZONE)
            ? currectedDate.AddYears(-1)
            : currectedDate;
        sp.ValueOn = currectedDate;
    }
    catch (Exception)
    {
        // Rethrow preserving the original stack trace — the old 'throw ex;' reset it (CA2200).
        throw;
    }

    return(sp);
}
// Scrapes the f2pool worker list for the given pool and inserts/updates a
// Worker row per miner. A worker is inactive when its current hashrate is zero.
public static void Worker1(int poolid)
{
    var jsonConfig = File.ReadAllText(@"Json\f2pool.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);

    string html;
    string url = GetUrl(poolid);
    using (WebClient client = new WebClient())
    {
        client.Encoding = Encoding.UTF8;
        // An explicit (even empty) User-Agent header is sent with the request.
        client.Headers.Add(HttpRequestHeader.UserAgent, "");
        html = client.DownloadString(url);
    }

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(html);
    var jsonres = JsonConvert.DeserializeObject(scrapingResults.ToString());

    // Builds a Worker row from one scraped item (deduplicates the three copies
    // of this assignment block in the previous version).
    Worker BuildWorker(dynamic item, bool isactive)
    {
        Worker worker = new Worker();
        worker.poolid = poolid;
        worker.workername = item.workername;
        worker.currenthashrate = item.currenthash;
        worker.dailyhashrate = item.dailyhash;
        worker.rejected = item.rejected;
        worker.updateat = DateTime.Now;
        worker.isactive = isactive;
        return worker;
    }

    foreach (var item in (dynamic)jsonres)
    {
        // The DB existence check is currently stubbed out (count == 0), so
        // every worker takes the insert path; the update path is kept for when
        // the commented-out query is restored.
        var count = 0;
        bool isZeroHash = item.currenthash == "0" || item.currenthash == "0.00";

        if (count == 0)
        {
            if (item.currenthash == null)
            {
                continue; // no hashrate data at all — skip this entry
            }
            Add(BuildWorker(item, !isZeroHash));
        }
        else
        {
            Update(BuildWorker(item, !isZeroHash));
        }
    }
}
// Scrapes the poolin miner dashboard for the given pool and stores a summary.
public static void WorkerSummary2(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Setup and navigation run inside the try so the finally block always
        // shuts the browser down (previously a navigation failure leaked the driver).
        var jsonConfig = File.ReadAllText(@"Json\poolin.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        Thread.Sleep(15000); // poolin renders slowly; crude wait for the JS page
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());
        JToken json = jObject["data"];

        var temp1 = (string)json[0];
        var temp2 = (string)json[1];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // "-" means the dashboard shows no figure; treat it as zero.
        int dead = 0; // no data for dead
        int active = (string)json[2] == "-" ? 0 : (int)json[2];
        int inactive = (string)json[3] == "-" ? 0 : (int)json[3];

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag2 suppresses repeated error reports.
        if (Commonflag.flag2)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag2 = false;
            UpdateErrorLog("poolin", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
// Scrapes the huobi pool dashboard for the given pool and stores a summary.
public static void WorkerSummary4(int poolid)
{
    var driver = new ChromeDriver();
    try
    {
        // Setup and navigation run inside the try so the finally block always
        // shuts the browser down (previously a navigation failure leaked the driver).
        var jsonConfig = File.ReadAllText(@"Json\huobi.json");
        var config = StructuredDataConfig.ParseJsonString(jsonConfig);
        string url = GetUrl(poolid);
        driver.Navigate().GoToUrl(url);
        Thread.Sleep(5000); // crude wait for the JS-rendered page to finish loading
        var source = driver.PageSource;

        var openScraping = new StructuredDataExtractor(config);
        var scrapingResults = openScraping.Extract(source);
        JObject jObject = JObject.Parse(scrapingResults.ToString());

        JToken json = jObject["calculation"];
        var temp1 = (string)json[0];
        var temp2 = (string)json[2];
        var currentcalculation = GetFloat(temp1);
        var dailycalculation = GetFloat(temp2);
        var unit = GetString(temp1);

        // "status" is text embedding the worker counts; split on runs of
        // non-digits: indices 1..3 are taken as active/inactive/dead.
        // (Replaces a commented-out hand-rolled digit scanner kept in the
        // previous version.)
        JToken json2 = jObject["status"];
        var temp = (string)json2;
        var numbers = Regex.Split(temp.Trim(), @"\D+");
        int active = Int32.Parse(numbers[1]);
        int inactive = Int32.Parse(numbers[2]);
        int dead = Int32.Parse(numbers[3]);

        UpdateSummary(currentcalculation, dailycalculation, unit, active, inactive, dead, poolid);
    }
    catch (Exception ex)
    {
        // Log only the first failure; flag4 suppresses repeated error reports.
        if (Commonflag.flag4)
        {
            string error = "Poolid=" + poolid + " " + ex.ToString();
            Commonflag.flag4 = false;
            UpdateErrorLog("huobi", error);
        }
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }
}
/// <summary>
/// Scrapes the poolin worker-list page and inserts one Worker row per miner.
/// A miner whose current hashrate is "0"/"0.00" is stored as inactive; rows
/// with no hashrate at all are skipped.
/// </summary>
/// <param name="poolid">Database id of the pool whose workers are scraped.</param>
public static void Worker2(int poolid)
{
    // Verbatim strings do not process escapes: a single '\' is the correct path.
    var jsonConfig = File.ReadAllText(@"Json\poolin.json");
    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
    string url = GetUrl(poolid);

    // Guard the browser with try/finally so it is released even if
    // navigation or page-source retrieval throws.
    string source;
    var driver = new ChromeDriver();
    try
    {
        driver.Navigate().GoToUrl(url);
        source = driver.PageSource;
    }
    finally
    {
        driver.Close();
        driver.Quit();
    }

    var openScraping = new StructuredDataExtractor(config);
    var scrapingResults = openScraping.Extract(source);
    var jsonres = JsonConvert.DeserializeObject(scrapingResults.ToString());
    foreach (var item in (dynamic)jsonres)
    {
        // Skip rows the scraper could not parse at all.
        if (item.currenthash == null)
        {
            continue;
        }
        // A hashrate of "0"/"0.00" means the miner is currently inactive.
        bool isactive = !(item.currenthash == "0" || item.currenthash == "0.00");

        //string checkquery = "SELECT COUNT(*) FROM worker where poolid = '" + poolid + "' and workername = '" + item.workername + "' ";
        //var count = db.Query<int>(checkquery).FirstOrDefault();
        var count = 0; // duplicate check disabled: every scraped row is inserted as new

        Worker worker = new Worker();
        worker.poolid = poolid;
        worker.workername = item.workername;
        worker.currenthashrate = item.currenthash;
        worker.dailyhashrate = item.dailyhash;
        worker.rejected = item.rejected;
        worker.updateat = DateTime.Now;
        worker.isactive = isactive;
        if (count == 0)
        {
            Add(worker);
        }
        else
        {
            // Unreachable while count is hard-coded to 0; kept so re-enabling
            // the duplicate check restores update-in-place behaviour.
            Update(worker);
        }
    }
}
/// <summary>
/// Downloads every selected start_url, extracts its category links using the
/// matching domain configuration (Type == 2) and queues them in url_crawl_list.
/// </summary>
/// <param name="result1">Selected start_url ids plus crawl interval/module.</param>
/// <returns>A ResultReturn describing how many start_urls yielded links (code 1) or why none did (code -1).</returns>
public ResultReturn pushDataToCategory([FromForm] ResultList result1)
{
    try
    {
        // Load the start_url rows the caller selected.
        var urls = entities.start_url.Where(m => result1.result.Contains(m.ID)).ToList();
        // ToList() never returns null, so the original "urls != null" check made
        // the no-records branch unreachable; test for an empty result instead.
        if (urls.Count > 0)
        {
            var countUrl = 0;
            // WebClient is IDisposable — release the connection when done.
            using (MyWebClient client = new MyWebClient() { Encoding = Encoding.UTF8 })
            {
                client.Headers[HttpRequestHeader.UserAgent] = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36";
                foreach (var url1 in urls)
                {
                    try
                    {
                        // Only absolute http/https urls are crawlable.
                        var urltmp = url1.url;
                        bool isUrl = Uri.TryCreate(urltmp, UriKind.Absolute, out Uri baseUri)
                            && (baseUri.Scheme == Uri.UriSchemeHttp || baseUri.Scheme == Uri.UriSchemeHttps);
                        if (isUrl)
                        {
                            var myDomain = baseUri.Authority;
                            // Strip a leading "www." from the domain.
                            myDomain = myDomain.StartsWith("www.") ? myDomain.Substring(4) : myDomain;
                            // Fetch the scraping configurations for this domain (Type 2 = category).
                            var domains = entities.Domain.Where(domain => (myDomain == domain.Domain1 || myDomain.EndsWith("." + domain.Domain1)) && domain.Type == 2).ToList();
                            if (domains.Count > 0)
                            {
                                // Prefer an exact domain match over a parent-domain match.
                                Domain domain = null;
                                foreach (Domain domaintmp in domains)
                                {
                                    if (domaintmp.Domain1 == myDomain)
                                    {
                                        domain = domaintmp;
                                        break;
                                    }
                                }
                                if (domain == null)
                                {
                                    domain = domains.FirstOrDefault();
                                }
                                var data = client.DownloadData(urltmp);
                                var contentType = client.ResponseHeaders["Content-Type"];
                                // Only process textual (html) responses; the header
                                // can be absent, so guard against null before StartsWith.
                                if (contentType != null && contentType.StartsWith(@"text/"))
                                {
                                    var jsonConfig = domain.Content;
                                    var config = StructuredDataConfig.ParseJsonString(jsonConfig);
                                    HtmlDocument docc = new HtmlDocument();
                                    var html = Encoding.UTF8.GetString(data);
                                    docc.LoadHtml(html);
                                    var url = "";
                                    // Rewrite relative links in the page to absolute ones.
                                    HtmlNodeCollection nodes = docc.DocumentNode.SelectNodes("//a");
                                    if (nodes != null)
                                    {
                                        foreach (HtmlNode node in nodes)
                                        {
                                            if ((node.Attributes["href"] != null) && (node.Attributes["href"].Value != ""))
                                            {
                                                try
                                                {
                                                    url = node.Attributes["href"].Value.Trim();
                                                    node.Attributes["href"].Value = new Uri(baseUri, url).AbsoluteUri;
                                                }
                                                catch { } // best effort: skip malformed hrefs
                                            }
                                        }
                                    }
                                    html = docc.DocumentNode.InnerHtml;
                                    // Extract the domain's category links via its configuration.
                                    var openScraping = new StructuredDataExtractor(config);
                                    var scrapingResults = openScraping.Extract(html);
                                    var result = JsonConvert.SerializeObject(scrapingResults, Formatting.Indented);
                                    // Collect the extracted links, if any.
                                    if (scrapingResults.Count > 0)
                                    {
                                        var o = JObject.Parse(result);
                                        JToken token = o["link"];
                                        if (token != null)
                                        {
                                            List<url_crawl_list> listCrawl = new List<url_crawl_list>();
                                            int count = 0;
                                            var lsturl = token is JArray
                                                ? ((JArray)token).Select(m => m?.ToString()?.Trim()).Where(m => !string.IsNullOrWhiteSpace(m)).Distinct().ToList()
                                                : new List<string>() { token.ToString() };
                                            foreach (var valuetmp in lsturl)
                                            {
                                                // Keep only absolute links that stay on this domain.
                                                if (valuetmp.Contains(myDomain) && valuetmp.StartsWith("http"))
                                                {
                                                    listCrawl.Add(new url_crawl_list
                                                    {
                                                        url = valuetmp,
                                                        status = 1,
                                                        domain = myDomain,
                                                        interval = result1.interval,
                                                        module = result1.module,
                                                        schedule_time = DateTime.Now
                                                    });
                                                    count++;
                                                }
                                            }
                                            if (count > 0)
                                            {
                                                countUrl++;
                                                // Queue the extracted category urls via the stored procedure.
                                                entities.Database.ExecuteSqlCommand(new RawSqlString("usp_url_crawl_list_addList")
                                                    , UrlCrawlListParameters("@urlCrawlList", listCrawl));
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort per url: one failing start_url must not stop the rest.
                    }
                }
            }
            if (countUrl > 0)
            {
                return (new ResultReturn("Thêm thành công " + countUrl + " url !", 1));
            }
            return (new ResultReturn("Không thêm được url nào, vui lòng xem lại cấu hình!", -1));
        }
        else
        {
            return (new ResultReturn("Không tìm thấy bản ghi nào!", -1));
        }
    }
    catch
    {
        return (new ResultReturn("Thêm thất bại!", -1));
    }
}