/// <summary>
/// Walks the per-year index found on <c>RootUrl</c>, collects the post links of every
/// requested year and turns each post into a <c>ScrapedMovie</c> (language code "ta").
/// </summary>
/// <param name="skipUrls">
/// Link URLs that must not be reported again. Every sanitized link that reaches
/// validation is appended to this list.
/// </param>
/// <param name="years">
/// Years to scrape; <c>null</c> or an empty list means every year listed on the root page.
/// </param>
/// <returns>
/// The shared <c>allMovies</c> list. Movies without any usable link are still part of it;
/// only movies with at least one link are announced through <c>OnMovieFound</c>.
/// </returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    try
    {
        var rootDom = GotoUrl(RootUrl);
        if (years == null)
        {
            years = new List<int>();
        }

        // One anchor per year; they are processed from the last anchor to the first.
        var yearAnchors = SelectItems(rootDom, "#HTML5.widget div.widget-content a");
        for (var i = yearAnchors.Count - 1; i >= 0; i--)
        {
            try
            {
                var yearAnchor = yearAnchors[i];
                int year = Convert.ToInt32(yearAnchor.InnerText.Trim());
                if (years.Count > 0 && !years.Contains(year))
                {
                    continue;
                }

                var urls = new Stack<string>();
                var seenUrls = new HashSet<string>();      // O(1) de-duplication of post links
                var visitedPages = new HashSet<string>();  // guards against a pager that loops back

                try
                {
                    var pageUrl = ReadAttribute(yearAnchor, "href");
                    while (visitedPages.Add(pageUrl))
                    {
                        var pageDom = GotoUrl(pageUrl);

                        // The first four anchors of the post area are skipped, as before.
                        var postAnchors = SelectItems(pageDom, "div.blog-posts a").Skip(4).ToList();
                        if (postAnchors.Count == 0)
                        {
                            break;
                        }

                        foreach (var postAnchor in postAnchors)
                        {
                            var href = ReadAttribute(postAnchor, "href");
                            if (seenUrls.Add(href))
                            {
                                urls.Push(href);
                            }
                        }

                        pageUrl = ReadAttribute(SelectItem(pageDom, "#blog-pager-older-link a"), "href");
                    }
                }
                catch (Exception ex)
                {
                    // Paging stopped early (e.g. no "older" link or a failed request).
                    // Keep whatever was collected so far instead of dropping the whole year.
                    System.Diagnostics.Debug.WriteLine("Paging stopped for year " + year + ": " + ex.Message);
                }

                while (urls.Count > 0)
                {
                    try
                    {
                        var postUrl = urls.Pop();
                        var postDom = GotoUrl(postUrl);
                        var title = SelectItem(postDom, ".post-title.entry-title a").InnerText;

                        ScrapedMovie movie;
                        try
                        {
                            // "Name - Description" titles are split on '-'; otherwise the whole title is the name.
                            var titleParts = title.Split('-');
                            movie = new ScrapedMovie(this)
                            {
                                PageUrl = postUrl,
                                Description = titleParts.Length > 1 ? titleParts[1] : String.Empty,
                                Name = FixTitle(titleParts[0]),
                                LangCode = "ta",
                                ReleasedDate = new DateTime(year, 1, 1),
                                ImageUrl = ReadAttribute(SelectItem(postDom, "div.post-body.entry-content img"), "src")
                            };
                        }
                        catch (Exception ex)
                        {
                            // Without a fully built movie there is nothing to report for this post.
                            System.Diagnostics.Debug.WriteLine("Skipping " + postUrl + ": " + ex.Message);
                            continue;
                        }

                        OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));
                        allMovies.Add(movie);

                        foreach (var item in SelectItems(postDom, ".fullpost a"))
                        {
                            try
                            {
                                var linkUrl = ReadAttribute(item, "href");
                                if (linkUrl.Contains("links2sites"))
                                {
                                    // Intermediate page: the real player is an <embed> or, failing that, an <iframe>.
                                    var redirectDom = GotoUrl(linkUrl);
                                    try
                                    {
                                        linkUrl = ReadAttribute(SelectItem(redirectDom, ".post-body.entry-content embed"), "src");
                                    }
                                    catch
                                    {
                                        try
                                        {
                                            linkUrl = ReadAttribute(SelectItem(redirectDom, ".post-body.entry-content iframe"), "src");
                                        }
                                        catch
                                        {
                                            // Neither player element could be read; keep the current link URL.
                                        }
                                    }
                                }

                                if (IgnoreLink(linkUrl))
                                {
                                    continue;
                                }

                                var host = GetScrapper(linkUrl);
                                if (host == null)
                                {
                                    OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                    continue;
                                }

                                if (skipUrls.Contains(linkUrl))
                                {
                                    continue;
                                }

                                linkUrl = host.SanitizeUrl(linkUrl);
                                if (skipUrls.Contains(linkUrl))
                                {
                                    continue;
                                }

                                skipUrls.Add(linkUrl);

                                // A validation call that throws is treated like a successful validation.
                                var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                try
                                {
                                    result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                }
                                catch
                                {
                                }

                                if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                {
                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, "Watch Full Movie"));
                                }
                            }
                            catch
                            {
                                // Best effort: one broken link must not abort the remaining links.
                            }
                        }

                        if (movie.Links.Count > 0)
                        {
                            var args = new MovieFoundEventArgs(movie);
                            OnMovieFound(args);
                        }

                        if (this.stop)
                        {
                            return allMovies;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a failing post is skipped, the next one is tried.
                        System.Diagnostics.Debug.WriteLine("Post failed: " + ex.Message);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing year (e.g. non-numeric anchor text) is skipped.
                System.Diagnostics.Debug.WriteLine("Year failed: " + ex.Message);
            }
        }
    }
    catch (Exception ex)
    {
        // The root page could not be processed; return whatever has been collected.
        System.Diagnostics.Debug.WriteLine("ScrapeMovies failed: " + ex.Message);
    }

    return allMovies;
}
/// <summary>
/// Scrapes every listing page of each entry in <c>RootLinks</c> (URL mapped to year) and
/// turns each post into a <c>ScrapedMovie</c> (language code "te") with its playable links.
/// </summary>
/// <param name="skipUrls">Link URLs, raw or sanitized, that must not be reported again.</param>
/// <param name="years">
/// Years to scrape; <c>null</c> or an empty list means every entry of <c>RootLinks</c>.
/// </param>
/// <returns>
/// The shared <c>allMovies</c> list. A movie is added as soon as it is created, so entries
/// whose page could not be fully read are part of it too.
/// </returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    if (years == null)
    {
        years = new List<int>();
    }

    foreach (var entry in RootLinks)
    {
        try
        {
            int year = entry.Value;
            if (years.Count > 0 && !years.Contains(year))
            {
                continue;
            }

            var entryDom = GotoUrl(entry.Key);

            // Listing pages are visited from the highest page number down to 2, then the entry page.
            var pageUrls = new List<string>();
            try
            {
                // The "last" pager link is expected to end in ".../<count>/".
                var lastPage = SelectItem(entryDom, "a.last").Attributes["href"];
                int ix1 = lastPage.LastIndexOf('/');
                int ix2 = ix1 > 0 ? lastPage.LastIndexOf('/', ix1 - 1) : -1;
                var count = Convert.ToInt32(lastPage.Substring(ix2 + 1, lastPage.Length - ix2 - 2));
                var pagePrefix = lastPage.Substring(0, ix2 + 1);
                for (var i = count; i >= 2; i--)
                {
                    pageUrls.Add(pagePrefix + i.ToString() + "/");
                }
            }
            catch (Exception ex)
            {
                // No usable pager: scrape the entry page alone rather than skipping the year.
                System.Diagnostics.Debug.WriteLine("No pager for " + entry.Key + ": " + ex.Message);
                pageUrls.Clear();
            }

            pageUrls.Add(entry.Key);

            foreach (var pageUrl in pageUrls)
            {
                try
                {
                    var listDom = GotoUrl(pageUrl);
                    var postBoxes = SelectItems(listDom, "h2.title a");
                    for (var j = postBoxes.Count - 1; j >= 0; j--)
                    {
                        try
                        {
                            var movie = new ScrapedMovie(this);
                            allMovies.Add(movie);
                            movie.PageUrl = ReadAttribute(postBoxes[j], "href");
                            OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));
                            movie.LangCode = "te";
                            movie.ReleasedDate = new DateTime(year, 1, 1);

                            var movieDom = GotoUrl(movie.PageUrl);
                            movie.Name = FixTitle(SelectItem(movieDom, "h2.title").InnerText);
                            movie.ImageUrl = SelectItem(movieDom, "img.wp-post-image").Attributes["src"];
                            try
                            {
                                movie.Description = SelectItems(movieDom, "div.entry p span")[1].InnerText;
                            }
                            catch
                            {
                                // The description is optional.
                            }

                            // Candidate links: URL -> label text.
                            var links = new Dictionary<string, string>();
                            foreach (var anchor in SelectItems(movieDom, "a"))
                            {
                                try
                                {
                                    var url = anchor.Attributes["href"];
                                    if (GetScrapper(url) != null && !links.ContainsKey(url))
                                    {
                                        links.Add(url, anchor.InnerText.Replace(" ", ""));
                                    }

                                    if (url.Contains("http://www.power4link.us"))
                                    {
                                        // Redirect page: the player is the iframe inside it. It is loaded into
                                        // its own variable so the movie page stays available below.
                                        var redirectDom = GotoUrl(url);
                                        var frameSrc = SelectItem(redirectDom, "div.entry-content iframe").Attributes["src"];
                                        if (!links.ContainsKey(frameSrc))
                                        {
                                            links.Add(frameSrc, "Watch Online");
                                        }
                                    }
                                }
                                catch
                                {
                                    // Best effort: anchors without a usable href are ignored.
                                }
                            }

                            foreach (var iframe in SelectItems(movieDom, "div.entry iframe"))
                            {
                                // A missing or already known src must not abort the whole movie.
                                var src = iframe.Attributes["src"];
                                if (src != null && !links.ContainsKey(src))
                                {
                                    links.Add(src, "Watch Online");
                                }
                            }

                            foreach (var l in links)
                            {
                                try
                                {
                                    var linkUrl = l.Key;
                                    if (IgnoreLink(linkUrl))
                                    {
                                        continue;
                                    }

                                    var host = GetScrapper(linkUrl);
                                    if (host == null)
                                    {
                                        OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                        continue;
                                    }

                                    if (skipUrls.Contains(linkUrl))
                                    {
                                        continue;
                                    }

                                    linkUrl = host.SanitizeUrl(linkUrl);
                                    if (skipUrls.Contains(linkUrl))
                                    {
                                        continue;
                                    }

                                    // A validation call that throws is treated like a successful validation.
                                    var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                    try
                                    {
                                        result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                    }
                                    catch
                                    {
                                    }

                                    if (result == MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                    {
                                        continue;
                                    }

                                    // Labels mentioning "part" become "Watch part..."; everything else "Watch Online".
                                    var name = "Watch Online";
                                    var partIndex = l.Value.IndexOf("part", StringComparison.OrdinalIgnoreCase);
                                    if (partIndex >= 0)
                                    {
                                        name = "Watch " + l.Value.Substring(partIndex);
                                    }

                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, name));
                                }
                                catch
                                {
                                    // Best effort: one broken link must not abort the remaining links.
                                }
                            }

                            if (movie.Links.Count > 0)
                            {
                                var args = new MovieFoundEventArgs(movie);
                                OnMovieFound(args);
                            }

                            if (this.stop)
                            {
                                return allMovies;
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: a failing post is skipped, the next one is tried.
                            System.Diagnostics.Debug.WriteLine("Post failed: " + ex.Message);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failing listing page is skipped.
                    System.Diagnostics.Debug.WriteLine("Page " + pageUrl + " failed: " + ex.Message);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failing root entry is skipped.
            System.Diagnostics.Debug.WriteLine("Entry " + entry.Key + " failed: " + ex.Message);
        }
    }

    return allMovies;
}
/// <summary>
/// Scrapes all listing pages of one year and reports each movie found (language code "hi").
/// Listing pages are visited from the highest page number down to 2, then
/// <paramref name="startUrl"/> itself.
/// </summary>
/// <param name="dom">Already loaded listing page; only its ".wp-pagenavi" pager is read.</param>
/// <param name="year">Year assigned to every movie (released date is 1 January of that year).</param>
/// <param name="startUrl">URL of the first listing page; it is scraped last.</param>
/// <param name="skipUrls">Link URLs, raw or sanitized, that must not be reported again.</param>
/// <remarks>
/// Reading the pager is not guarded: any exception raised there propagates to the caller.
/// Failures further down are reported through <c>OnScraperNotFound</c> with the text "Exception".
/// </remarks>
private void ScrapThread(CQ dom, int year, string startUrl, List<string> skipUrls)
{
    // The "last" pager link is expected to end in "/<page number>".
    var last = SelectItem(dom, ".wp-pagenavi > .last");
    var lasthRef = ReadAttribute(last, "href");
    var pos = lasthRef.LastIndexOf('/');
    var lastIndex = Int32.Parse(lasthRef.Substring(pos + 1));
    var urlTemplate = lasthRef.Substring(0, pos);

    var urls = new List<string>();
    for (var page = lastIndex; page > 1; page--)
    {
        urls.Add(String.Format("{0}/{1}", urlTemplate, page));
    }

    urls.Add(startUrl);

    foreach (var url in urls)
    {
        try
        {
            var listDom = GotoUrl(url);
            var elems = SelectItems(listDom, "a.clip-link");
            for (var i = elems.Count - 1; i >= 0; i--)
            {
                try
                {
                    var movie = new ScrapedMovie(this);
                    movie.PageUrl = ReadAttribute(elems[i], "href");
                    OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + year.ToString()));

                    // Pages whose URL marks them as "-in-hindi" / "-hindi." are skipped.
                    // Ordinal matching keeps the check independent of the current culture.
                    if (movie.PageUrl.IndexOf("-in-hindi", StringComparison.OrdinalIgnoreCase) >= 0
                        || movie.PageUrl.IndexOf("-hindi.", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        continue;
                    }

                    var movieDom = GotoUrl(movie.PageUrl);
                    movie.ReleasedDate = new DateTime(year, 1, 1);
                    movie.LangCode = "hi";
                    movie.Language = "Hindi";
                    movie.Description = String.Empty;
                    movie.Name = ReadText(SelectItem(movieDom, ".entry-title")).Replace("\n", "").Replace("\t", "");

                    try
                    {
                        // The description is every paragraph up to the first one containing "<strong>".
                        foreach (var p in SelectItems(movieDom, ".entry-content p"))
                        {
                            if (p.InnerHTML.Contains("<strong>"))
                            {
                                break;
                            }

                            movie.Description += ReadText(p) + Environment.NewLine;
                        }

                        if (String.IsNullOrWhiteSpace(movie.Description))
                        {
                            movie.Description = String.Empty;
                        }
                    }
                    catch
                    {
                        // The description is optional; keep whatever was gathered.
                    }

                    var imgElems = SelectItems(movieDom, "#thumb img");
                    if (imgElems.Count == 0)
                    {
                        Debug.WriteLine("No Image: " + movie.PageUrl);
                    }
                    else
                    {
                        movie.ImageUrl = ReadAttribute(imgElems[0], "src");
                    }

                    allMovies.Add(movie);

                    foreach (var l in SelectItems(movieDom, ".entry-content p a.external"))
                    {
                        var pageUrl = ReadAttribute(l, "href");

                        // filmshowonline.net wrapper pages and veoh download links are not resolved.
                        if (pageUrl.Contains("filmshowonline.net") || pageUrl.Contains("www.veoh.com/download"))
                        {
                            continue;
                        }

                        var linkUrl = pageUrl;
                        if (IgnoreLink(linkUrl))
                        {
                            continue;
                        }

                        try
                        {
                            var host = GetScrapper(linkUrl);
                            if (host == null)
                            {
                                OnScraperNotFound(new ScraperNotFound(linkUrl, pageUrl));
                                continue;
                            }

                            if (skipUrls.Contains(linkUrl))
                            {
                                continue;
                            }

                            linkUrl = host.SanitizeUrl(linkUrl);
                            if (skipUrls.Contains(linkUrl))
                            {
                                continue;
                            }

                            // The same link is never attached twice to one movie (case-insensitive).
                            if (movie.Links.Any(x => String.Equals(x.DownloadUrl, linkUrl, StringComparison.OrdinalIgnoreCase)))
                            {
                                continue;
                            }

                            // A validation call that throws is treated like a successful validation.
                            var result = MovieTube.Client.Scraper.ScraperResult.Success;
                            try
                            {
                                result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                            }
                            catch
                            {
                            }

                            if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                            {
                                movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, l.InnerText));
                            }
                        }
                        catch (Exception ex)
                        {
                            OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
                        }
                    }

                    if (movie.Links.Count > 0)
                    {
                        var args = new MovieFoundEventArgs(movie);
                        OnMovieFound(args);
                    }

                    if (this.stop)
                    {
                        return;
                    }
                }
                catch (Exception ex)
                {
                    OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
                }
            }
        }
        catch (Exception ex)
        {
            OnScraperNotFound(new ScraperNotFound("Exception", ex.Message));
        }
    }
}
/// <summary>
/// Scrapes every listing page of each entry in <c>RootLinks</c> (URL mapped to year) and
/// turns each ".boxentry" post into a <c>ScrapedMovie</c> (language code "ta").
/// Posts whose title contains "dubbed" are skipped and never added to the result.
/// </summary>
/// <param name="skipUrls">Link URLs, raw or sanitized, that must not be reported again.</param>
/// <param name="years">
/// Years to scrape; <c>null</c> or an empty list means every entry of <c>RootLinks</c>.
/// </param>
/// <returns>
/// The shared <c>allMovies</c> list. Movies without any usable link are still part of it;
/// only movies with at least one link are announced through <c>OnMovieFound</c>.
/// </returns>
public override List<ScrapedMovie> ScrapeMovies(List<string> skipUrls, List<int> years = null)
{
    if (years == null)
    {
        years = new List<int>();
    }

    foreach (var entry in RootLinks)
    {
        try
        {
            int year = entry.Value;
            if (years.Count > 0 && !years.Contains(year))
            {
                continue;
            }

            var entryDom = GotoUrl(entry.Key);

            // Pager anchors other than the current page carry class "page larger" / "page smaller".
            var pagerAnchors = SelectItems(entryDom, ".wp-pagenavi a")
                .Where(x => x.Attributes["class"] == "page larger" || x.Attributes["class"] == "page smaller")
                .ToList();

            // Pages are visited from the last pager anchor to the first, then the entry page.
            var pageUrls = new List<string>();
            var baseUri = new Uri(entry.Key);
            for (var i = pagerAnchors.Count - 1; i >= 0; i--)
            {
                pageUrls.Add(new Uri(baseUri, ReadAttribute(pagerAnchors[i], "href")).AbsoluteUri);
            }

            pageUrls.Add(entry.Key);

            foreach (var pageUrl in pageUrls)
            {
                try
                {
                    var listDom = GotoUrl(pageUrl);
                    var postBoxes = SelectItems(listDom, ".boxentry a");
                    for (var j = postBoxes.Count - 1; j >= 0; j--)
                    {
                        try
                        {
                            var pb = postBoxes[j];
                            var moviePageUrl = ReadAttribute(pb, "href");
                            OnNotify(new NotificationEventArgs("Processing " + moviePageUrl + ". Year: " + year.ToString()));

                            // Filter before the movie is created so skipped posts never reach allMovies.
                            var title = ReadAttribute(pb, "title");
                            if (title.IndexOf("dubbed", StringComparison.OrdinalIgnoreCase) >= 0)
                            {
                                continue;
                            }

                            var movie = new ScrapedMovie(this);
                            allMovies.Add(movie);
                            movie.PageUrl = moviePageUrl;
                            movie.LangCode = "ta";
                            movie.ReleasedDate = new DateTime(year, 1, 1);
                            movie.Name = FixTitle(title);
                            movie.ImageUrl = pb.FirstElementChild.Attributes["src"];

                            // No description is extracted for this site.
                            var movieDom = GotoUrl(movie.PageUrl);

                            // Players are <embed> elements first, then <iframe> elements, of ".videosection".
                            var links = new List<string>();
                            foreach (var emb in SelectItems(movieDom, ".videosection embed"))
                            {
                                links.Add(emb.Attributes["src"]);
                            }

                            foreach (var iframe in SelectItems(movieDom, ".videosection iframe"))
                            {
                                links.Add(iframe.Attributes["src"]);
                            }

                            foreach (var l in links)
                            {
                                try
                                {
                                    var linkUrl = l;
                                    if (IgnoreLink(linkUrl))
                                    {
                                        continue;
                                    }

                                    var host = GetScrapper(linkUrl);
                                    if (host == null)
                                    {
                                        OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                        continue;
                                    }

                                    if (skipUrls.Contains(linkUrl))
                                    {
                                        continue;
                                    }

                                    linkUrl = host.SanitizeUrl(linkUrl);
                                    if (skipUrls.Contains(linkUrl))
                                    {
                                        continue;
                                    }

                                    // A validation call that throws is treated like a successful validation.
                                    var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                    try
                                    {
                                        result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                    }
                                    catch
                                    {
                                    }

                                    if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                    {
                                        movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, "Watch Full Movie"));
                                    }
                                }
                                catch
                                {
                                    // Best effort: one broken link must not abort the remaining links.
                                }
                            }

                            if (movie.Links.Count > 0)
                            {
                                var args = new MovieFoundEventArgs(movie);
                                OnMovieFound(args);
                            }

                            if (this.stop)
                            {
                                return allMovies;
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: a failing post is skipped, the next one is tried.
                            System.Diagnostics.Debug.WriteLine("Post failed: " + ex.Message);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failing listing page is skipped.
                    System.Diagnostics.Debug.WriteLine("Page " + pageUrl + " failed: " + ex.Message);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failing root entry is skipped.
            System.Diagnostics.Debug.WriteLine("Entry " + entry.Key + " failed: " + ex.Message);
        }
    }

    return allMovies;
}
/// <summary>
/// Scrapes the ".movie" tiles of every listing page in <paramref name="links"/> and turns
/// each tile into a <c>ScrapedMovie</c> tagged with <paramref name="langCode"/>.
/// </summary>
/// <param name="links">Listing page URLs. A page that cannot be loaded throws to the caller.</param>
/// <param name="langCode">Language code assigned to every movie.</param>
/// <param name="skipUrls">Link URLs, raw or sanitized, that must not be reported again.</param>
/// <param name="years">
/// Release years to keep; <c>null</c> or an empty list keeps every movie. The year is read
/// from the third child element of the tile's anchor.
/// </param>
/// <returns>
/// The shared <c>allMovies</c> list. Movies without any usable link are still part of it;
/// only movies with at least one link are announced through <c>OnMovieFound</c>.
/// </returns>
private List<ScrapedMovie> DoScrapeMovies(List<string> links, string langCode, List<string> skipUrls, List<int> years = null)
{
    const string siteRoot = "http://apnaview.com";

    if (years == null)
    {
        years = new List<int>();
    }

    foreach (var entry in links)
    {
        var listDom = GotoUrl(entry);
        foreach (var m in SelectItems(listDom, ".movie"))
        {
            try
            {
                // Tile layout as read here: anchor -> [0] image, [1] name, [2] year.
                var tileAnchor = m.FirstElementChild;
                var children = tileAnchor.ChildElements.ToList();

                // Apply the year filter before the movie is created so filtered tiles
                // never reach allMovies.
                int year = Convert.ToInt32(children[2].InnerText);
                if (years.Count > 0 && !years.Contains(year))
                {
                    continue;
                }

                var movie = new ScrapedMovie(this);
                allMovies.Add(movie);
                movie.PageUrl = siteRoot + tileAnchor.Attributes["href"];
                movie.ReleasedDate = new DateTime(year, 1, 1);
                OnNotify(new NotificationEventArgs("Processing " + movie.PageUrl + ". Year: " + movie.ReleasedDate.Year.ToString()));
                movie.LangCode = langCode;
                movie.Name = children[1].InnerText;

                // Only image paths containing "/img" are used as poster.
                var imageSrc = children[0].Attributes["src"];
                if (imageSrc.Contains("/img"))
                {
                    movie.ImageUrl = siteRoot + imageSrc;
                }

                var movieDom = GotoUrl(movie.PageUrl);
                foreach (var vid in SelectItems(movieDom, ".table.table-bordered tbody tr"))
                {
                    try
                    {
                        // The second cell of each row holds the link anchors.
                        var vidLinks = vid.ChildElements.ToList()[1].ChildElements.ToList();
                        foreach (var vl in vidLinks)
                        {
                            var linkUrl = vl.Attributes["href"];
                            if (GetScrapper(linkUrl) == null)
                            {
                                // Not a known host: open the page and look for the player,
                                // an <iframe> first, then an <embed>.
                                linkUrl = String.Empty;
                                var playerDom = GotoUrl(vl.Attributes["href"]);
                                try
                                {
                                    linkUrl = SelectItem(playerDom, ".videoplayer iframe").Attributes["src"];
                                }
                                catch
                                {
                                }

                                try
                                {
                                    if (String.IsNullOrWhiteSpace(linkUrl))
                                    {
                                        linkUrl = SelectItem(playerDom, ".videoplayer embed").Attributes["src"];
                                    }
                                }
                                catch
                                {
                                }
                            }

                            if (IgnoreLink(linkUrl))
                            {
                                continue;
                            }

                            try
                            {
                                var host = GetScrapper(linkUrl);
                                if (host == null)
                                {
                                    OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                                    continue;
                                }

                                if (skipUrls.Contains(linkUrl))
                                {
                                    continue;
                                }

                                linkUrl = host.SanitizeUrl(linkUrl);
                                if (skipUrls.Contains(linkUrl))
                                {
                                    continue;
                                }

                                // A validation call that throws is treated like a successful validation.
                                var result = MovieTube.Client.Scraper.ScraperResult.Success;
                                try
                                {
                                    result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
                                }
                                catch
                                {
                                }

                                if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
                                {
                                    movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, vl.InnerText.Trim()));
                                }
                            }
                            catch
                            {
                                // Best effort: one broken link must not abort the remaining links.
                            }
                        }
                    }
                    catch
                    {
                        // Best effort: a row that cannot be read is skipped.
                    }
                }

                if (movie.Links.Count > 0)
                {
                    var args = new MovieFoundEventArgs(movie);
                    OnMovieFound(args);
                }

                if (this.stop)
                {
                    return allMovies;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failing tile is skipped, the next one is tried.
                System.Diagnostics.Debug.WriteLine("Movie tile failed: " + ex.Message);
            }
        }
    }

    return allMovies;
}
/// <summary>
/// Scrapes every movie linked from the ".itemList" of <paramref name="dom"/>, from the last
/// anchor to the first, and reports each movie found (language code "ml").
/// Pages already present in <c>allMovies</c>, pages that fail to load and pages without
/// any link element are skipped.
/// </summary>
/// <param name="year">Year assigned to every movie (released date is 1 January of that year).</param>
/// <param name="dom">Loaded listing page.</param>
/// <param name="skipUrls">Link URLs, raw or sanitized, that must not be reported again.</param>
/// <returns>
/// <c>false</c> when a stop was requested after processing a movie; otherwise <c>true</c>.
/// </returns>
private bool GetMovies(int year, CQ dom, List<string> skipUrls)
{
    var items = SelectItems(dom, ".itemList a");
    Debug.Assert(items.Count > 0, "No movies");

    for (var i = items.Count - 1; i >= 0; i--)
    {
        var movieUrl = RootUrl + ReadAttribute(items[i], "href");
        OnNotify(new NotificationEventArgs("Processing " + movieUrl + ". Year: " + year.ToString()));
        if (allMovies.Any(x => x.PageUrl == movieUrl))
        {
            continue;
        }

        CQ movieDom;
        try
        {
            movieDom = GotoUrl(movieUrl);
        }
        catch
        {
            // The page could not be loaded; try the next movie.
            continue;
        }

        // Link elements are looked up in three places, in order of preference.
        var links = SelectItems(movieDom, ".itemIntroText table a");
        if (links.Count == 0)
        {
            links = SelectItems(movieDom, "div.itemFullText a");
        }

        if (links.Count == 0)
        {
            links = SelectItems(movieDom, ".avPlayerBlock iframe");
        }

        if (links.Count == 0)
        {
            continue;
        }

        var movie = new ScrapedMovie(this);
        movie.ReleasedDate = new DateTime(year, 1, 1);
        movie.LangCode = "ml";
        movie.Language = "Malayalam";
        movie.Name = ReadText(SelectItems(movieDom, ".itemTitle")[0]).Replace("\n", "").Replace("\t", "");

        try
        {
            // Description: first intro paragraph, else the intro block, else all ".typeTextfield" spans.
            try
            {
                movie.Description = ReadText(SelectItems(movieDom, ".itemIntroText p")[0]);
            }
            catch
            {
                try
                {
                    movie.Description = ReadText(SelectItems(movieDom, ".itemIntroText")[0]);
                }
                catch
                {
                    var spans = SelectItems(movieDom, ".typeTextfield span");
                    if (spans.Count > 0)
                    {
                        movie.Description = String.Empty;
                        foreach (var span in spans)
                        {
                            movie.Description += span.InnerText;
                        }
                    }
                }
            }

            if (movie.Description != null)
            {
                movie.Description = movie.Description.Replace("\n", "").Replace("\t", "");
            }
        }
        catch
        {
            // The description is optional.
        }

        try
        {
            // Poster: the first image found by these selectors, tried in order.
            var img = SelectItems(movieDom, ".itemIntroText p img").FirstOrDefault();
            if (img == null)
            {
                img = SelectItems(movieDom, ".itemImage a img").FirstOrDefault();
            }

            if (img == null)
            {
                img = SelectItems(movieDom, ".itemIntroText img").FirstOrDefault();
            }

            if (img == null)
            {
                img = SelectItems(movieDom, ".itemIntroText span img").FirstOrDefault();
            }

            if (img != null)
            {
                movie.ImageUrl = RootUrl + ReadAttribute(img, "src");
            }
        }
        catch
        {
            // The poster is optional.
        }

        movie.PageUrl = movieUrl;
        allMovies.Add(movie);

        foreach (var link in links)
        {
            // Anchors carry the URL in "href", iframes in "src". The "src" fallback is used
            // both when reading "href" throws and when it yields nothing.
            string linkUrl = null;
            try
            {
                linkUrl = ReadAttribute(link, "href");
            }
            catch
            {
            }

            if (String.IsNullOrEmpty(linkUrl))
            {
                try
                {
                    linkUrl = ReadAttribute(link, "src");
                }
                catch
                {
                }
            }

            if (String.IsNullOrWhiteSpace(linkUrl))
            {
                // Nothing to resolve for this element.
                continue;
            }

            if (IgnoreLink(linkUrl))
            {
                continue;
            }

            var host = GetScrapper(linkUrl);
            if (host == null)
            {
                OnScraperNotFound(new ScraperNotFound(linkUrl, movie.PageUrl));
                continue;
            }

            if (skipUrls.Contains(linkUrl))
            {
                continue;
            }

            try
            {
                linkUrl = host.SanitizeUrl(linkUrl);
            }
            catch
            {
                // A link that cannot be sanitized is dropped.
                continue;
            }

            if (skipUrls.Contains(linkUrl))
            {
                continue;
            }

            // A validation call that throws is treated like a successful validation.
            var result = MovieTube.Client.Scraper.ScraperResult.Success;
            try
            {
                result = MovieTube.Client.Scraper.VideoScraperBase.ValidateUrl(linkUrl);
            }
            catch
            {
            }

            if (result != MovieTube.Client.Scraper.ScraperResult.VideoDoesNotExist)
            {
                movie.Links.Add(new ScrapedMovieLink(linkUrl, host.ID, link.InnerText));
            }
        }

        if (movie.Links.Count > 0)
        {
            var args = new MovieFoundEventArgs(movie);
            OnMovieFound(args);
        }

        if (this.stop)
        {
            return false;
        }
    }

    return true;
}