Exemple #1
0
        /// <summary>
        /// Builds a scrape request from the arguments and either queues it or reports it as failed.
        /// </summary>
        /// <param name="id">Identifier stored on the request.</param>
        /// <param name="inputUrl">URL to scrape; checked with <c>CheckIfValidUrl</c> before queueing.</param>
        /// <param name="goDirect">Flag copied onto the request unchanged.</param>
        public void Go(string id, string inputUrl, bool goDirect = false)
        {
            var request = new ScrapeReq { Id = id, InputUrl = inputUrl, GoDirect = goDirect };

            // Happy path first: a valid URL is queued and nothing else happens.
            if (CheckIfValidUrl(inputUrl))
            {
                _queue.Enqueue(request);
                return;
            }

            // Invalid URL: notify subscribers (if any) instead of queueing.
            ScraperFailed?.Invoke(
                this,
                new ScraperFailedEventArgs { ScrapeReq = request, Message = "Invalid url" });
        }
	/// <summary>
	/// Navigates to the requested page, clicks the video play button and searches every iframe
	/// for the jwplayer("vplayer") setup script in order to extract the video file URL.
	/// </summary>
	/// <param name="scrapeReq">Request whose <c>InputUrl</c> is opened and whose <c>Id</c> is copied to the result.</param>
	/// <returns>
	/// A <c>ScrapeDesc</c> holding the cleaned name and the decoded download URL, or <c>null</c>
	/// when no valid link could be extracted (the failure is reported through <c>SendError</c>).
	/// </returns>
	/// <exception cref="InvalidOperationException">The web driver has not been created.</exception>
	public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
	{
	    if (_driver == null)
	    {
	        SendProgress("WEB Driver is null!");

	        throw new InvalidOperationException("WEB Driver is null!");
	    }

	    // The call that precedes the JSON-like player configuration inside the script text.
	    const string setupCall = "jwplayer(\"vplayer\").setup(";

	    ScrapeDesc scrapeDesc = null;
	    SendProgress("Scraping url: " + scrapeReq.InputUrl);

	    string linkLocation = string.Empty;
	    string name = string.Empty;
	    // Stopwatch is monotonic, so the elapsed time cannot go negative on clock/DST changes.
	    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

	    try
	    {
	        SendProgress("Navigating...");
	        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);

	        string tmpUrl = string.Empty;

	        try
	        {
	            name = _driver.FindElement(By.XPath("//div[@class='body']/div/h1/span/a")).Text;

	            SendProgress("Found Name: " + name);
	            SendProgress("Clicking...");

	            _driver.FindElement(By.XPath("//div[@class='video_play_button']/a")).Click();

	            int frameCount = _driver.FindElements(By.TagName("iframe")).Count;

	            for (int i = 0; i < frameCount; i++)
	            {
	                _driver.SwitchTo().Frame(i);

	                try
	                {
	                    var scriptElement = _driver.FindElements(By.TagName("script"))
	                        .FirstOrDefault(s => (s.GetAttribute("innerHTML") ?? string.Empty).Contains(setupCall + "{"));

	                    if (scriptElement == null)
	                    {
	                        // This frame simply does not host the player; try the next one.
	                        continue;
	                    }

	                    string script = scriptElement.GetAttribute("innerHTML");

	                    // Cut out the object literal passed to setup(...): from the opening '{'
	                    // up to (not including) the closing "});", then re-append the final '}'.
	                    int start = script.IndexOf(setupCall, StringComparison.Ordinal) + setupCall.Length;
	                    int end = script.IndexOf("});", start, StringComparison.Ordinal);
	                    if (end < 0)
	                    {
	                        SendProgress("vplayer setup block is not terminated in frame: " + i);
	                        continue;
	                    }

	                    var json = JObject.Parse(script.Substring(start, end - start) + "}");

	                    // SelectToken yields null instead of throwing when any path segment is missing.
	                    var file = json.SelectToken("modes[1].config.file");
	                    if (file == null)
	                    {
	                        SendProgress("vplayer config has no file entry in frame: " + i);
	                        continue;
	                    }

	                    tmpUrl = file.ToString();

	                    SendProgress("flash video found in frame: " + i);
	                    break;
	                }
	                catch (Exception ex)
	                {
	                    // Best effort per frame: report the problem and keep looking in the other frames.
	                    SendProgress("vplayer script not usable in frame " + i + ": " + ex.Message);
	                }
	                finally
	                {
	                    // Runs on break/continue as well, so the next Frame(i) is relative to the top document.
	                    _driver.SwitchTo().DefaultContent();
	                }
	            }
	        }
	        catch (Exception ex)
	        {
	            SendError("Error1", ex.ToString(), _driver.PageSource ?? "null");
	        }

	        SendProgress("UrlDecode...");

	        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);

	        SendProgress("Check if link is valid: " + tmpUrl);
	        if (CheckIfValidUrl(tmpUrl))
	        {
	            linkLocation = tmpUrl;
	            SendProgress("link OK, removing invalid characters: " + name);
	            name = CleanFileName(name);
	            SendProgress("done: " + name);

	            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
	        }
	        else
	        {
	            // Caught right below and reported as "Error3" together with the page source.
	            throw new Exception("Invalid link");
	        }
	    }
	    catch (Exception ex)
	    {
	        SendError("Error3", ex.ToString(), _driver?.PageSource);
	    }

	    // Parentheses are required: '+' binds tighter than '==', which previously made the
	    // "link not found" branch unreachable and dropped the "link :" prefix.
	    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
	    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));

	    return scrapeDesc;
	}
	/// <summary>
	/// Navigates to the requested page, builds a name from the title and season links, and searches
	/// every iframe for a video URL. Three strategies are tried per frame, in order: an HTML5
	/// <c>video/source</c> element, a jwplayer('flvplayer') jwConfig script ("Way 2"), and a
	/// jwplayer("flvplayer") setup script scanned line by line for <c>file:</c> ("Way 3").
	/// </summary>
	/// <param name="scrapeReq">Request whose <c>InputUrl</c> is opened and whose <c>Id</c> is copied to the result.</param>
	/// <returns>
	/// A <c>ScrapeDesc</c> holding the cleaned name and the decoded download URL, or <c>null</c>
	/// when no valid link could be extracted (the failure is reported through <c>SendError</c>).
	/// </returns>
	/// <exception cref="InvalidOperationException">The web driver has not been created.</exception>
	public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
	{
	    if (_driver == null)
	    {
	        SendProgress("WEB Driver is null!");

	        throw new InvalidOperationException("WEB Driver is null!");
	    }

	    ScrapeDesc scrapeDesc = null;
	    SendProgress("Scraping url: " + scrapeReq.InputUrl);

	    string linkLocation = string.Empty;
	    string name = string.Empty;
	    // Stopwatch is monotonic, so the elapsed time cannot go negative on clock/DST changes.
	    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

	    try
	    {
	        SendProgress("Navigating...");
	        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);

	        string tmpUrl = string.Empty;

	        try
	        {
	            name += _driver.FindElement(By.XPath("//span[@itemprop='name']")).Text;
	            foreach (var data in _driver.FindElements(By.XPath("//span[@class='list-top']/a")))
	            {
	                name += "_" + data.Text;
	            }

	            SendProgress("Found Name: " + name);

	            var frames = _driver.FindElements(By.TagName("iframe"));
	            SendProgress("Found " + frames.Count + " frames");

	            for (int i = 0; i < frames.Count; i++)
	            {
	                _driver.SwitchTo().Frame(i);
	                SendProgress("frame: " + i);

	                // One try/finally around all strategies: every exit path (including the html5
	                // 'break') switches the driver back to the top-level document.
	                try
	                {
	                    try
	                    {
	                        var video = _driver.FindElement(By.XPath("//video[@id='container_html5_api']/source"));
	                        tmpUrl = video.GetAttribute("src");
	                        SendProgress("html5 video found in frame: " + i);
	                        break;
	                    }
	                    catch (Exception)
	                    {
	                        SendProgress("no html5 video");
	                    }

	                    try
	                    {
	                        var scripts = _driver.FindElements(By.TagName("script"));
	                        var tmp = scripts.FirstOrDefault(
	                            s => (s.GetAttribute("innerHTML") ?? string.Empty).Contains("jwplayer('flvplayer').setup(jwConfig({"));

	                        if (tmp != null)
	                        {
	                            var script = tmp.GetAttribute("innerHTML");

	                            SendProgress("Way 2");

	                            // Strip the JavaScript wrapper and the two non-JSON expressions so the
	                            // remainder can be parsed as a JSON object.
	                            script = script.Replace("jwplayer('flvplayer').setup(jwConfig(", string.Empty);
	                            script = script.Replace("));", string.Empty);
	                            script = script.Replace("\"width\" : $(window).width(),", string.Empty);
	                            script = script.Replace("\"height\" : $(window).height(),", string.Empty);

	                            JObject jobject = JObject.Parse(script);
	                            var best = jobject["playlist"][0]["sources"].OrderByDescending(s => s["label"]).FirstOrDefault();
	                            var file = best?["file"]?.ToString();

	                            if (string.IsNullOrEmpty(file))
	                            {
	                                // Empty source list or missing "file": same outcome as any other parse failure.
	                                SendProgress("no flvplayer");
	                                continue;
	                            }

	                            tmpUrl = file;

	                            SendProgress("flash video found in frame: " + i);
	                            break;
	                        }

	                        SendProgress("WAY 3");
	                        tmp = scripts.FirstOrDefault(
	                            s => (s.GetAttribute("innerHTML") ?? string.Empty).Contains("jwplayer(\"flvplayer\").setup({"));

	                        if (tmp != null)
	                        {
	                            var script = tmp.GetAttribute("innerHTML");

	                            SendProgress("Way 3");

	                            foreach (var line in script.Split('\n'))
	                            {
	                                if (line.Contains("file:"))
	                                {
	                                    // Trim removes the indentation left in front of the value.
	                                    tmpUrl = line.Replace("file:", string.Empty)
	                                        .Replace("\"", string.Empty)
	                                        .Replace(",", string.Empty)
	                                        .Replace("\r", string.Empty)
	                                        .Trim();

	                                    break;
	                                }
	                            }

	                            SendProgress("flash video found in frame: " + i);
	                            break;
	                        }

	                        // NOTE(review): debug dump of the frame source into the working directory;
	                        // kept as-is, consider removing or routing through SendError.
	                        System.IO.File.WriteAllText("jack" + i + ".txt", _driver.PageSource);
	                    }
	                    catch (Exception)
	                    {
	                        SendProgress("no flvplayer");
	                    }
	                }
	                finally
	                {
	                    _driver.SwitchTo().DefaultContent();
	                }
	            }
	        }
	        catch (Exception ex)
	        {
	            SendError("Error1", ex.ToString(), _driver.PageSource ?? "null");
	        }

	        SendProgress("UrlDecode...");

	        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);

	        SendProgress("Check if link is valid: " + tmpUrl);
	        if (CheckIfValidUrl(tmpUrl))
	        {
	            linkLocation = tmpUrl;
	            SendProgress("link OK, removing invalid characters: " + name);
	            name = CleanFileName(name);
	            SendProgress("done: " + name);

	            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
	        }
	        else
	        {
	            // Caught right below and reported as "Error3" together with the page source.
	            throw new Exception("Invalid link");
	        }
	    }
	    catch (Exception ex)
	    {
	        SendError("Error3", ex.ToString(), _driver?.PageSource);
	    }

	    // Parentheses are required: '+' binds tighter than '==', which previously made the
	    // "link not found" branch unreachable and dropped the "link :" prefix.
	    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
	    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));

	    return scrapeDesc;
	}
	/// <summary>
	/// Navigates to the requested page, clicks through the player wrapper and the primary button,
	/// follows the embedded iframe's <c>src</c>, and reads the download link from the
	/// <c>playerVidzer</c> anchor.
	/// </summary>
	/// <param name="scrapeReq">Request whose <c>InputUrl</c> is opened and whose <c>Id</c> is copied to the result.</param>
	/// <returns>
	/// A <c>ScrapeDesc</c> holding the cleaned name and the decoded download URL, or <c>null</c>
	/// when no valid link could be extracted (the failure is reported through <c>SendError</c>).
	/// </returns>
	/// <exception cref="InvalidOperationException">The web driver has not been created.</exception>
	public override ScrapeDesc Scrape(ScrapeReq scrapeReq)
	{
	    if (_driver == null)
	    {
	        SendProgress("WEB Driver is null!");

	        throw new InvalidOperationException("WEB Driver is null!");
	    }

	    ScrapeDesc scrapeDesc = null;
	    SendProgress("Scraping url: " + scrapeReq.InputUrl);

	    string linkLocation = string.Empty;
	    string name = string.Empty;
	    // Stopwatch is monotonic, so the elapsed time cannot go negative on clock/DST changes.
	    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

	    try
	    {
	        SendProgress("Navigating...");
	        _driver.Navigate().GoToUrl(scrapeReq.InputUrl);

	        // Step 1: read the title and click the player wrapper. Failures here are reported
	        // but do not stop the attempt.
	        try
	        {
	            name = _driver.FindElement(By.XPath("//span[@itemprop='name']")).Text;
	            SendProgress("Found name: " + name);
	            IWebElement query = _driver.FindElement(By.XPath("//div[@class='player-wrapper']/a"));
	            SendProgress("Click 1");
	            query.Click();
	            SendProgress("OK");
	        }
	        catch (Exception ex)
	        {
	            SendError("Error1", ex.ToString(), _driver?.PageSource);
	        }

	        // A single wait is reused; each Until call gets its own 10 second budget.
	        var wait = new WebDriverWait(_driver, TimeSpan.FromSeconds(10));

	        // Step 2: the primary button must appear (a timeout here aborts via "Error3");
	        // a failing click is only a warning.
	        var button = wait.Until(d => d.FindElement(By.XPath("//button[@class='btn btn-primary']")));

	        try
	        {
	            SendProgress("Click 2");
	            button.Click();
	            SendProgress("OK");
	        }
	        catch (Exception ex)
	        {
	            SendError("Warning", ex.ToString(), _driver?.PageSource);
	        }

	        // Step 3: follow the iframe embedded in the player wrapper.
	        try
	        {
	            var playerWrapper = wait.Until(d => d.FindElement(By.ClassName("player-wrapper")));
	            var iframe = wait.Until(d => playerWrapper.FindElement(By.TagName("iframe")));
	            var link = iframe.GetAttribute("src");

	            SendProgress("checking if link is valid: " + link);

	            if (CheckIfValidUrl(link))
	            {
	                SendProgress("Link is OK. Navigating...");
	                _driver.Navigate().GoToUrl(link);
	            }
	            else
	            {
	                SendProgress("Invalid link...");
	                // Caught right below and reported as "Error2".
	                throw new Exception("Invalid link");
	            }
	        }
	        catch (Exception ex)
	        {
	            SendError("Error2", ex.ToString(), _driver?.PageSource);
	        }

	        // Step 4: read the final link. Until keeps polling while the href is null.
	        var tmpUrl = wait.Until(d => d.FindElement(By.XPath("//div[@id='playerVidzer']/a")).GetAttribute("href"));

	        // A URL scheme is not linguistic text, so search for it ordinally.
	        int index = tmpUrl.LastIndexOf("http", StringComparison.Ordinal);
	        SendProgress("Got link: " + tmpUrl);

	        if (index > 0)
	        {
	            // Something precedes the last "http": keep only the trailing URL.
	            SendProgress("Link corrupted, fixing...");

	            tmpUrl = tmpUrl.Remove(0, index);
	        }
	        else
	        {
	            SendProgress("Link not corrupted - OK");
	        }

	        SendProgress("UrlDecode...");

	        tmpUrl = System.Net.WebUtility.UrlDecode(tmpUrl);

	        SendProgress("Check if link is valid: " + tmpUrl);
	        if (CheckIfValidUrl(tmpUrl))
	        {
	            linkLocation = tmpUrl;
	            SendProgress("link OK, removing invalid characters: " + name);
	            name = CleanFileName(name);
	            SendProgress("done: " + name);

	            scrapeDesc = new ScrapeDesc { Name = name, DownloadUrl = linkLocation, Id = scrapeReq.Id };
	        }
	        else
	        {
	            // Caught right below and reported as "Error3" together with the page source.
	            throw new Exception("Invalid link");
	        }
	    }
	    catch (Exception ex)
	    {
	        SendError("Error3", ex.ToString(), _driver?.PageSource);
	    }

	    // Parentheses are required: '+' binds tighter than '==', which previously made the
	    // "link not found" branch unreachable and dropped the "link :" prefix.
	    SendProgress("\nlink : \n" + (linkLocation == string.Empty ? "link not found" : linkLocation));
	    SendProgress("total time = " + stopwatch.Elapsed.ToString(@"hh\:mm\:ss"));

	    return scrapeDesc;
	}
Exemple #5
0
	/// <summary>
	/// Processes the given scrape request and produces a description of what was scraped.
	/// Concrete scrapers supply the site-specific implementation.
	/// </summary>
	/// <param name="scrapeReq">The request to process.</param>
	/// <returns>
	/// The scrape result. NOTE(review): implementations presumably return <c>null</c> when no
	/// valid link is found — confirm against the callers.
	/// </returns>
	public abstract ScrapeDesc Scrape(ScrapeReq scrapeReq);