/// <summary>
/// Wraps the arguments in a <c>ScrapeReq</c> and either enqueues it for scraping or,
/// when the url does not pass <c>CheckIfValidUrl</c>, raises <c>ScraperFailed</c>.
/// </summary>
/// <param name="id">Identifier copied into the request.</param>
/// <param name="inputUrl">Url to scrape; validated before the request is queued.</param>
/// <param name="goDirect">Copied into the request's <c>GoDirect</c> property.</param>
public void Go(string id, string inputUrl, bool goDirect = false)
{
    var request = new ScrapeReq
    {
        Id = id,
        InputUrl = inputUrl,
        GoDirect = goDirect
    };

    // Happy path first: a valid url goes straight onto the queue.
    if (CheckIfValidUrl(inputUrl))
    {
        _queue.Enqueue(request);
        return;
    }

    var failure = new ScraperFailedEventArgs { ScrapeReq = request, Message = "Invalid url" };
    ScraperFailed?.Invoke(this, failure);
}
/// <summary>
/// Navigates to the request's url, reads the title, clicks the play button and then searches
/// every iframe for a jwplayer "vplayer" setup script from which the download link is taken.
/// </summary>
/// <param name="scrapeReq">Request carrying the page url and the id copied into the result.</param>
/// <returns>
/// A <c>ScrapeDesc</c> with the cleaned name and download url, or <c>null</c> when no valid
/// link could be extracted (the failure is reported through <c>SendError</c>).
/// </returns>
/// <exception cref="InvalidOperationException">The web driver is null.</exception>
public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
{
    const string SetupCall = "jwplayer(\"vplayer\").setup(";

    if (_driver == null)
    {
        SendProgress("WEB Driver is null!");
        throw new InvalidOperationException("WEB Driver is null!");
    }

    ScrapeDesc scrapeDesc = null;
    SendProgress("Scraping url: " + scrapeReq.InputUrl);
    string linkLocation = string.Empty;
    string name = string.Empty;

    // A stopwatch is monotonic; subtracting DateTime.Now values can go negative when the
    // wall clock is adjusted, which made the old "new DateTime(ticks)" throw.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    try
    {
        SendProgress("Navigating...");
        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);
        string tmpUrl = string.Empty;

        try
        {
            name = _driver.FindElement(By.XPath("//div[@class='body']/div/h1/span/a")).Text;
            SendProgress("Found Name: " + name);
            SendProgress("Clicking...");
            _driver.FindElement(By.XPath("//div[@class='video_play_button']/a")).Click();

            int frameCount = _driver.FindElements(By.TagName("iframe")).Count;
            for (int i = 0; i < frameCount; i++)
            {
                _driver.SwitchTo().Frame(i);
                try
                {
                    // innerHTML may be null, so filter on the string rather than
                    // dereferencing the element returned by FirstOrDefault.
                    string script = _driver.FindElements(By.TagName("script"))
                        .Select(s => s.GetAttribute("innerHTML"))
                        .FirstOrDefault(html => html != null && html.Contains(SetupCall + "{"));
                    if (script == null)
                    {
                        continue;
                    }

                    // Cut out exactly the object literal passed to setup(...): from just after
                    // the call up to the first "});" that follows it.
                    int start = script.IndexOf(SetupCall, StringComparison.Ordinal) + SetupCall.Length;
                    int end = script.IndexOf("});", start, StringComparison.Ordinal);
                    if (end < 0)
                    {
                        SendProgress("no vplayer script in frame " + i + ": setup call is not terminated");
                        continue;
                    }

                    var json = JObject.Parse(script.Substring(start, end - start) + "}");
                    tmpUrl = json["modes"][1]["config"]["file"].ToString();
                    SendProgress("flash video found in frame: " + i);
                    break;
                }
                catch (Exception ex)
                {
                    // Best effort per frame: report why this frame was skipped and try the next one.
                    SendProgress("no vplayer script in frame " + i + ": " + ex.Message);
                }
                finally
                {
                    // Always return to the top-level document, also on break/continue.
                    _driver.SwitchTo().DefaultContent();
                }
            }
        }
        catch (Exception ex)
        {
            SendError("Error1", ex.ToString(), _driver.PageSource ?? "null");
        }

        SendProgress("UrlDecode...");
        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);
        SendProgress("Check if link is valid: " + tmpUrl);
        if (CheckIfValidUrl(tmpUrl))
        {
            linkLocation = tmpUrl;
            SendProgress("link OK, removing invalid characters: " + name);
            name = CleanFileName(name);
            SendProgress("done: " + name);
            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
        }
        else
        {
            // Routed through the catch below so the failure is reported with the page source.
            throw new Exception("Invalid link");
        }
    }
    catch (Exception ex)
    {
        SendError("Error3", ex.ToString(), _driver?.PageSource);
    }

    // Parentheses are required: '+' binds tighter than '==', so without them the
    // comparison was against the concatenated string and never true.
    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));
    return scrapeDesc;
}
/// <summary>
/// Navigates to the request's url, builds the name from the title and season links, and then
/// searches every iframe for a video source: first an html5 <c>video</c> element, then a
/// jwplayer "flvplayer" script using <c>jwConfig</c> (way 2), then a plain jwplayer
/// "flvplayer" setup script (way 3).
/// </summary>
/// <param name="scrapeReq">Request carrying the page url and the id copied into the result.</param>
/// <returns>
/// A <c>ScrapeDesc</c> with the cleaned name and download url, or <c>null</c> when no valid
/// link could be extracted (the failure is reported through <c>SendError</c>).
/// </returns>
/// <exception cref="InvalidOperationException">The web driver is null.</exception>
public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
{
    if (_driver == null)
    {
        SendProgress("WEB Driver is null!");
        throw new InvalidOperationException("WEB Driver is null!");
    }

    ScrapeDesc scrapeDesc = null;
    SendProgress("Scraping url: " + scrapeReq.InputUrl);
    string linkLocation = string.Empty;
    string name = string.Empty;

    // A stopwatch is monotonic; subtracting DateTime.Now values can go negative when the
    // wall clock is adjusted, which made the old "new DateTime(ticks)" throw.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    try
    {
        SendProgress("Navigating...");
        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);
        string tmpUrl = string.Empty;

        try
        {
            name += _driver.FindElement(By.XPath("//span[@itemprop='name']")).Text;
            foreach (var data in _driver.FindElements(By.XPath("//span[@class='list-top']/a")))
            {
                name += "_" + data.Text;
            }
            SendProgress("Found Name: " + name);

            int frameCount = _driver.FindElements(By.TagName("iframe")).Count;
            SendProgress("Found " + frameCount + " frames");

            for (int i = 0; i < frameCount; i++)
            {
                _driver.SwitchTo().Frame(i);
                SendProgress("frame: " + i);

                // One outer try/finally so the driver returns to the top-level document on
                // every exit path, including the html5 'break' which previously skipped it.
                try
                {
                    try
                    {
                        var video = _driver.FindElement(By.XPath("//video[@id='container_html5_api']/source"));
                        tmpUrl = video.GetAttribute("src");
                        SendProgress("html5 video found in frame: " + i);
                        break;
                    }
                    catch (Exception)
                    {
                        SendProgress("no html5 video");
                    }

                    try
                    {
                        // Read every script body once; innerHTML may be null, so drop those.
                        var scripts = _driver.FindElements(By.TagName("script"))
                            .Select(s => s.GetAttribute("innerHTML"))
                            .Where(html => html != null)
                            .ToList();

                        var configScript = scripts.FirstOrDefault(
                            html => html.Contains("jwplayer('flvplayer').setup(jwConfig({"));
                        if (configScript != null)
                        {
                            SendProgress("Way 2");
                            // Strip the call wrapper and the two non-JSON entries so the rest parses as JSON.
                            var jsonText = configScript
                                .Replace("jwplayer('flvplayer').setup(jwConfig(", string.Empty)
                                .Replace("));", string.Empty)
                                .Replace("\"width\" : $(window).width(),", string.Empty)
                                .Replace("\"height\" : $(window).height(),", string.Empty);
                            JObject jobject = JObject.Parse(jsonText);

                            // NOTE(review): labels are compared as raw tokens; if they are strings such as
                            // "720p"/"1080p" this is not a numeric quality ordering — confirm.
                            tmpUrl = jobject["playlist"][0]["sources"]
                                .OrderByDescending(s => s["label"])
                                .First()["file"]
                                .ToString();
                            SendProgress("flash video found in frame: " + i);
                            break;
                        }

                        SendProgress("WAY 3");
                        var setupScript = scripts.FirstOrDefault(
                            html => html.Contains("jwplayer(\"flvplayer\").setup({"));
                        if (setupScript != null)
                        {
                            SendProgress("Way 3");
                            var fileLine = setupScript.Split('\n').FirstOrDefault(line => line.Contains("file:"));
                            if (fileLine != null)
                            {
                                // Trim: the line is normally indented, which left leading
                                // whitespace in front of the url.
                                tmpUrl = fileLine
                                    .Replace("file:", string.Empty)
                                    .Replace("\"", string.Empty)
                                    .Replace(",", string.Empty)
                                    .Replace("\r", string.Empty)
                                    .Trim();
                                SendProgress("flash video found in frame: " + i);
                                break;
                            }

                            // Setup script without a "file:" entry: nothing found here, keep looking.
                            SendProgress("no flvplayer");
                        }
                        else
                        {
                            // NOTE(review): looks like a leftover debug dump into the working directory — confirm it is still wanted.
                            System.IO.File.WriteAllText("jack" + i + ".txt", _driver.PageSource);
                        }
                    }
                    catch (Exception)
                    {
                        SendProgress("no flvplayer");
                    }
                }
                finally
                {
                    _driver.SwitchTo().DefaultContent();
                }
            }
        }
        catch (Exception ex)
        {
            SendError("Error1", ex.ToString(), _driver.PageSource ?? "null");
        }

        SendProgress("UrlDecode...");
        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);
        SendProgress("Check if link is valid: " + tmpUrl);
        if (CheckIfValidUrl(tmpUrl))
        {
            linkLocation = tmpUrl;
            SendProgress("link OK, removing invalid characters: " + name);
            name = CleanFileName(name);
            SendProgress("done: " + name);
            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
        }
        else
        {
            // Routed through the catch below so the failure is reported with the page source.
            throw new Exception("Invalid link");
        }
    }
    catch (Exception ex)
    {
        SendError("Error3", ex.ToString(), _driver?.PageSource);
    }

    // Parentheses are required: '+' binds tighter than '==', so without them the
    // comparison was against the concatenated string and never true.
    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));
    return scrapeDesc;
}
/// <summary>
/// Navigates to the request's url, clicks through the player wrapper and the primary button,
/// follows the embedded iframe's <c>src</c> and finally reads the download link from the
/// <c>playerVidzer</c> anchor.
/// </summary>
/// <param name="scrapeReq">Request carrying the page url and the id copied into the result.</param>
/// <returns>
/// A <c>ScrapeDesc</c> with the cleaned name and download url, or <c>null</c> when no valid
/// link could be extracted (the failure is reported through <c>SendError</c>).
/// </returns>
/// <exception cref="InvalidOperationException">The web driver is null.</exception>
public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
{
    if (_driver == null)
    {
        SendProgress("WEB Driver is null!");
        throw new InvalidOperationException("WEB Driver is null!");
    }

    ScrapeDesc scrapeDesc = null;
    SendProgress("Scraping url: " + scrapeReq.InputUrl);
    string linkLocation = string.Empty;
    string name = string.Empty;

    // A stopwatch is monotonic; subtracting DateTime.Now values can go negative when the
    // wall clock is adjusted, which made the old "new DateTime(ticks)" throw.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    try
    {
        SendProgress("Navigating...");
        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);

        // One wait object is enough; every Until call below uses the same 10 second timeout.
        var wait = new WebDriverWait(_driver, TimeSpan.FromSeconds(10));

        try
        {
            name = _driver.FindElement(By.XPath("//span[@itemprop='name']")).Text;
            SendProgress("Found name: " + name);
            IWebElement playerLink = _driver.FindElement(By.XPath("//div[@class='player-wrapper']/a"));
            SendProgress("Click 1");
            playerLink.Click();
            SendProgress("OK");
        }
        catch (Exception ex)
        {
            // Not fatal: the following steps are still attempted.
            SendError("Error1", ex.ToString(), _driver?.PageSource);
        }

        var button = wait.Until(d => d.FindElement(By.XPath("//button[@class='btn btn-primary']")));
        try
        {
            SendProgress("Click 2");
            button.Click();
            SendProgress("OK");
        }
        catch (Exception ex)
        {
            SendError("Warning", ex.ToString(), _driver?.PageSource);
        }

        try
        {
            var wrapper = wait.Until(d => d.FindElement(By.ClassName("player-wrapper")));
            var iframe = wait.Until(d => wrapper.FindElement(By.TagName("iframe")));
            var link = iframe.GetAttribute("src");
            SendProgress("checking if link is valid: " + link);
            if (CheckIfValidUrl(link))
            {
                SendProgress("Link is OK. Navigating...");
                _driver.Navigate().GoToUrl(link);
            }
            else
            {
                SendProgress("Invalid link...");
                throw new Exception("Invalid link");
            }
        }
        catch (Exception ex)
        {
            // Not fatal: the playerVidzer lookup below is still attempted on the current page.
            SendError("Error2", ex.ToString(), _driver?.PageSource);
        }

        var tmpUrl = wait.Until(d => d.FindElement(By.XPath("//div[@id='playerVidzer']/a")).GetAttribute("href"));

        // The href may carry a prefix in front of the real url; keep only the part starting at
        // the last "http". Ordinal comparison: this is a protocol token, not linguistic text.
        int index = tmpUrl.LastIndexOf("http", StringComparison.Ordinal);
        SendProgress("Got link: " + tmpUrl);
        if (index > 0)
        {
            SendProgress("Link corrupted, fixing...");
            tmpUrl = tmpUrl.Remove(0, index);
        }
        else
        {
            SendProgress("Link not corrupted - OK");
        }

        SendProgress("UrlDecode...");
        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);
        SendProgress("Check if link is valid: " + tmpUrl);
        if (CheckIfValidUrl(tmpUrl))
        {
            linkLocation = tmpUrl;
            SendProgress("link OK, removing invalid characters: " + name);
            name = CleanFileName(name);
            SendProgress("done: " + name);
            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
        }
        else
        {
            // Routed through the catch below so the failure is reported with the page source.
            throw new Exception("Invalid link");
        }
    }
    catch (Exception ex)
    {
        SendError("Error3", ex.ToString(), _driver?.PageSource);
    }

    // Parentheses are required: '+' binds tighter than '==', so without them the
    // comparison was against the concatenated string and never true.
    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));
    return scrapeDesc;
}
/// <summary>
/// Processes a single scrape request. Abstract: every concrete scraper supplies its own implementation.
/// </summary>
/// <param name="scrapeReq">The scrape request to process.</param>
/// <returns>
/// The <c>ScrapeDesc</c> produced for the request.
/// NOTE(review): the overrides visible next to this declaration return null when no valid
/// link is found — confirm that callers handle a null result.
/// </returns>
public abstract ScrapeDesc Scrape(ScrapeReq scrapeReq);