/// <summary>
/// Builds the download tasks for a gallery: downloads the gallery page, gathers the image-page links
/// from every gallery page, then schedules one string download per image page and turns each response
/// into a file download task.
/// </summary>
/// <param name="url">Gallery address to download and parse.</param>
/// <param name="option">Extractor options; <c>RecommendOption(url)</c> is used when null.</param>
/// <returns>
/// <c>(null, null)</c> when <c>option.ExtractInformation</c> is set; otherwise the resolved tasks and an
/// <see cref="ExtractedInfo"/> of type <c>WorksComic</c>. The list is empty when no image page was found.
/// </returns>
/// <remarks>
/// Blocks the calling thread until the completion callback of every scheduled task has run.
/// </remarks>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    var html = NetTools.DownloadString(url);
    var data = ParseArticleData(html);
    var pages = GetPagesUri(html);
    var image_urls = new List<string>();

    if (option == null)
    {
        option = RecommendOption(url);
    }

    option.SimpleInfoCallback?.Invoke($"{data.Title}");

    if (option.ExtractInformation)
    {
        return (null, null /*data*/);
    }

    //
    //  Extract Image Url-Url
    //
    image_urls.AddRange(GetImagesUri(html));

    // The first gallery page is the one already downloaded above, so start at index 1.
    for (int i = 1; i < pages.Length; i++)
    {
        // PageReadCallback is available on the option interface; no down-cast is required.
        option.PageReadCallback?.Invoke(pages[i]);

        var page = NetTools.DownloadString(pages[i]);
        image_urls.AddRange(GetImagesUri(page));
    }

    // Nothing to resolve: return early, otherwise the wait below would never be signalled
    // because no completion callback would ever run.
    if (image_urls.Count == 0)
    {
        return (new List<NetTask>(), new ExtractedInfo { Type = ExtractedInfo.ExtractedType.WorksComic });
    }

    //
    //  Extract Image Url
    //
    var result = new NetTask[image_urls.Count];
    var count = image_urls.Count;
    var wait = new ManualResetEvent(false);

    var artist = "N/A";
    var group = "N/A";
    var series = "N/A";

    if (data.artist != null && data.artist.Length > 0)
    {
        artist = data.artist[0];
    }
    if (data.group != null && data.group.Length > 0)
    {
        group = data.group[0];
    }
    if (data.parody != null && data.parody.Length > 0)
    {
        series = data.parody[0];
    }

    // Fall back to the group name when no artist is listed.
    if (artist == "N/A" && group != "N/A")
    {
        artist = group;
    }

    for (int i = 0; i < image_urls.Count; i++)
    {
        var task = NetTask.MakeDefault(image_urls[i]);
        var j = i; // per-iteration copy so the callback writes to its own slot

        task.Priority = new NetPriority { Type = NetPriorityType.Trivial, TaskPriority = i };
        task.DownloadString = true;
        task.CompleteCallbackString = (string str) =>
        {
            try
            {
                var durl = GetImagesAddress(str);
                var tt = NetTask.MakeDefault(durl);
                tt.SaveFile = true;
                tt.Filename = durl.Split('/').Last();
                tt.Format = new ExtractorFileNameFormat
                {
                    Title = data.Title,
                    FilenameWithoutExtension = Path.GetFileNameWithoutExtension(tt.Filename),
                    Extension = Path.GetExtension(tt.Filename).Replace(".", ""),
                    OriginalTitle = data.SubTitle,
                    Artist = artist,
                    Group = group,
                    Series = series
                };
                result[j] = tt;
            }
            finally
            {
                // Always count the callback, even when parsing failed, so the waiter is released.
                if (Interlocked.Decrement(ref count) == 0)
                {
                    wait.Set();
                }
            }
        };

        AppProvider.Scheduler.Add(task);
    }

    wait.WaitOne();

    // Slots whose callback failed stay null; drop them before they are dereferenced.
    var result_list = result.Where(task => task != null).ToList();

    if (result_list.Count > 0)
    {
        option.ThumbnailCallback?.Invoke(result_list[0]);
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    result_list.ForEach(task => task.Format.Extractor = extractorName);

    return (result_list, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.WorksComic });
}
/// <summary>
/// Walks the Gelbooru post index for the tags captured from <paramref name="url"/>, one page of up to
/// 100 posts at a time, and creates a file download task for every post's <c>file_url</c>.
/// </summary>
/// <param name="url">Search URL; the first capture group of <c>ValidUrl</c> supplies the tags.</param>
/// <param name="option">
/// Extractor options; a <c>GelbooruExtractorOption</c> of type <c>Images</c> is created when null.
/// <c>StartPage</c>/<c>EndPage</c> bound the page range when present.
/// </param>
/// <returns>The collected tasks and an <see cref="ExtractedInfo"/> of type <c>Search</c>.</returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    var groups = ValidUrl.Match(url).Groups;

    if (option == null)
    {
        option = new GelbooruExtractorOption { Type = ExtractorType.Images };
    }

    var tags = groups[1].Value;
    var tasks = new List<NetTask>();
    var gelbooruOption = option as GelbooruExtractorOption;

    var currentPage = 0;
    if (gelbooruOption.StartPage != null)
    {
        currentPage = gelbooruOption.StartPage[0].ToInt();
    }

    var lastPage = int.MaxValue;
    if (gelbooruOption.EndPage != null)
    {
        lastPage = gelbooruOption.EndPage[0].ToInt();
    }

    var decodedTags = HttpUtility.UrlDecode(tags);
    option.SimpleInfoCallback?.Invoke($"{decodedTags}");

    var thumbnailSent = false;

    // The first page is always requested; afterwards paging continues until a page is empty
    // or the configured end page has been passed.
    do
    {
        var pageUrl = $"https://gelbooru.com/index.php?page=dapi&s=post&q=index&limit=100&tags={tags}&pid={currentPage}";
        option.PageReadCallback?.Invoke(pageUrl);

        var document = new HtmlDocument();
        document.LoadHtml(NetTools.DownloadString(pageUrl));

        var posts = document.DocumentNode.SelectNodes("/posts[1]/post");
        if (posts == null || posts.Count == 0)
        {
            break;
        }

        foreach (var post in posts)
        {
            var fileUrl = post.GetAttributeValue("file_url", "");
            var fileName = fileUrl.Split('/').Last();

            var download = NetTask.MakeDefault(fileUrl);
            download.SaveFile = true;
            download.Filename = fileName;
            download.Format = new ExtractorFileNameFormat
            {
                Search = decodedTags,
                FilenameWithoutExtension = Path.GetFileNameWithoutExtension(fileName),
                Extension = Path.GetExtension(fileName).Replace(".", "")
            };
            tasks.Add(download);
        }

        // The first collected task is reported as the thumbnail exactly once.
        if (!thumbnailSent)
        {
            option.ThumbnailCallback?.Invoke(tasks[0]);
            thumbnailSent = true;
        }

        option.PostStatus?.Invoke(posts.Count);

        currentPage += 1;
    }
    while (currentPage <= lastPage);

    var extractorName = GetType().Name.Replace("Extractor", "");
    foreach (var download in tasks)
    {
        download.Format.Extractor = extractorName;
    }

    return (tasks, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.Search });
}
}
/// <summary>
/// Collects the display URLs of a user's posts, starting with the posts embedded in the profile page
/// and then following the paging cursor, and creates one file download task per URL.
/// </summary>
/// <param name="url">Profile page URL.</param>
/// <param name="option">
/// Extractor options; <c>RecommendOption(url)</c> is used when null. <c>LimitPosts</c>, when present,
/// stops further paging once the running counter (advanced by 50 per fetched page) reaches it.
/// </param>
/// <returns>The collected tasks and an <see cref="ExtractedInfo"/> of type <c>UserArtist</c>.</returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    option = option ?? RecommendOption(url);

    var instaOption = option as InstagramExtractorOption;

    var limit = int.MaxValue;
    if (instaOption.LimitPosts != null)
    {
        limit = instaOption.LimitPosts[0].ToInt();
    }

    var profileHtml = NetTools.DownloadString(url);
    var user = InstaApi.get_user(option as InstagramExtractorOption, profileHtml);

    var displayUrls = new List<string>();
    displayUrls.AddRange(user.FirstPost.DisplayUrls);
    option.PostStatus?.Invoke(user.FirstPost.PostCount);
    option.SimpleInfoCallback?.Invoke($"{user.FullName} ({user.UserName})");

    // Follow the cursor while more pages exist and the limit has not been reached.
    var cursor = user.FirstPost;
    for (var requested = 0; cursor.HasNext && requested < limit; requested += 50)
    {
        var next = InstaApi.query_next(option as InstagramExtractorOption, InstaApi.posts_query_hash(), user.UserId, "50", cursor.EndCursor);
        displayUrls.AddRange(next.DisplayUrls);
        option.PostStatus?.Invoke(next.PostCount);
        cursor = next;
    }

    var tasks = new List<NetTask>();
    foreach (var displayUrl in displayUrls)
    {
        // The file name is the last path segment with the query string removed.
        var withoutQuery = displayUrl.Split('?')[0];
        var fileName = withoutQuery.Split('/').Last();

        var download = NetTask.MakeDefault(displayUrl);
        download.SaveFile = true;
        download.Filename = fileName;
        download.Format = new ExtractorFileNameFormat
        {
            FilenameWithoutExtension = Path.GetFileNameWithoutExtension(fileName),
            Extension = Path.GetExtension(fileName).Replace(".", ""),
            User = user.FullName,
            Account = user.UserName
        };
        tasks.Add(download);
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    foreach (var download in tasks)
    {
        download.Format.Extractor = extractorName;
    }

    return (tasks, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.UserArtist });
}
/// <summary>
/// Extracts image download tasks either from a single episode page
/// (<c>ExtractorType.EpisodeImages</c>) or from every episode linked on a work page
/// (<c>ExtractorType.Works</c>).
/// </summary>
/// <param name="url">Episode or work page URL.</param>
/// <param name="option">Extractor options; <c>RecommendOption(url)</c> is used when null.</param>
/// <returns>
/// For <c>EpisodeImages</c>: the tasks and a null info object. For <c>Works</c>: the tasks and an
/// <see cref="ExtractedInfo"/> of type <c>WorksComic</c>. For any other type: <c>(null, null)</c>.
/// </returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    if (option == null)
    {
        option = RecommendOption(url);
    }

    var html = NetTools.DownloadString(url);
    var match = ValidUrl.Match(url).Groups;

    var document = new HtmlDocument();
    document.LoadHtml(html);
    var node = document.DocumentNode;

    var extractorName = GetType().Name.Replace("Extractor", "");

    // Appends one task per image of an episode page to target. Files are numbered 001, 002, ...
    // and every alternative address after the first is kept as a fallback URL.
    // workTitle is only written to the format when it is not null.
    void AppendEpisodeTasks(List<NetTask> target, string episodeHtml, string workTitle, string episodeTitle)
    {
        var images = get_board_images(episodeHtml);
        int count = 1;

        foreach (var img in images)
        {
            var number = count.ToString("000");
            var task = NetTask.MakeDefault(img[0]);
            task.SaveFile = true;
            task.Filename = number + Path.GetExtension(img[0].Split('/').Last());
            task.Format = new ExtractorFileNameFormat
            {
                Episode = episodeTitle,
                FilenameWithoutExtension = number,
                Extension = Path.GetExtension(task.Filename).Replace(".", "")
            };
            if (workTitle != null)
            {
                task.Format.Title = workTitle;
            }
            task.FailUrls = img.Skip(1).ToList();
            target.Add(task);
            count++;
        }
    }

    if (option.Type == ExtractorType.EpisodeImages)
    {
        var title = node.SelectSingleNode("/html[1]/head[1]/title[1]").InnerText;
        var result = new List<NetTask>();

        AppendEpisodeTasks(result, html, null, title);

        result.ForEach(task => task.Format.Extractor = extractorName);
        return (result, null);
    }
    else if (option.Type == ExtractorType.Works)
    {
        var title = node.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]").InnerText;
        var sub_urls = new List<string>();
        var sub_titles = new List<string>();

        option.SimpleInfoCallback?.Invoke($"{title}");

        // The cover address is embedded in the inline style attribute of the cover element.
        option.ThumbnailCallback?.Invoke(NetTask.MakeDefault(
            Regex.Match(node.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]").GetAttributeValue("style", ""), @"(https?://.*?)\)").Groups[1].Value));

        // SelectNodes returns null instead of an empty collection when nothing matches.
        var items = node.SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div");
        if (items != null)
        {
            foreach (var item in items)
            {
                sub_urls.Add(match["host"] + item.SelectSingleNode("./a[1]").GetAttributeValue("href", ""));
                sub_titles.Add(item.SelectSingleNode("./a[1]/div[1]").MyText());
            }
        }

        option.ProgressMax?.Invoke(sub_urls.Count);

        var htmls = NetTools.DownloadStrings(sub_urls, "PHPSESSID=" + Externals.ManamoaPHPSESSID, () =>
        {
            option.PostStatus?.Invoke(1);
        });

        var result = new List<NetTask>();

        for (int i = 0; i < sub_urls.Count; i++)
        {
            try
            {
                AppendEpisodeTasks(result, htmls[i], title, sub_titles[i]);
            }
            catch (Exception)
            {
                // Best effort: an episode that cannot be parsed is skipped so that the remaining
                // episodes of the work are still returned.
            }
        }

        result.ForEach(task => task.Format.Extractor = extractorName);
        return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.WorksComic });
    }

    return (null, null);
}
/// <summary>
/// Authenticates against Pixiv and, for member URLs, creates one download task per illustration or
/// ugoira work of the user identified by the <c>id</c> capture group.
/// </summary>
/// <param name="url">Pixiv URL; also used as the referer of every created task.</param>
/// <param name="option">
/// Extractor options; a <c>PixivExtractorOption</c> of type <c>Works</c> is created when null.
/// </param>
/// <returns>
/// The tasks and an <see cref="ExtractedInfo"/> of type <c>UserArtist</c> for member URLs;
/// <c>(null, null)</c> when <c>ExtractInformation</c> is set or the URL is not a member URL.
/// </returns>
/// <exception cref="ExtractorException">Thrown when authentication fails.</exception>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    var authenticated = PixivAPI.Auth(Settings.Instance.Model.PixivSettings.Id, Settings.Instance.Model.PixivSettings.Password);
    if (!authenticated)
    {
        throw new ExtractorException("Authentication error! Check setting.json/PixivSetting.");
    }

    var groups = ValidUrl.Match(url).Groups;

    option = option ?? new PixivExtractorOption { Type = ExtractorType.Works };

    if (option.ExtractInformation)
    {
        // The user record is requested but not yet surfaced to the caller.
        var info = PixivAPI.GetUsersAsync(groups["id"].Value.ToInt()).Result;
        return (null, null /*user*/);
    }

    if (!groups[2].Value.StartsWith("member"))
    {
        return (null, null);
    }

    var user = PixivAPI.GetUsersAsync(groups["id"].Value.ToInt()).Result;
    var works = PixivAPI.GetUsersWorksAsync(groups["id"].Value.ToInt(), 1, 10000000).Result;
    var profile = user[0];

    option.SimpleInfoCallback?.Invoke($"{profile.Name} ({profile.Account})");
    option.ThumbnailCallback?.Invoke(NetTask.MakeDefault(profile.ProfileImageUrls.Px170x170));

    // Builds the file name format shared by both work kinds from the remote file name.
    ExtractorFileNameFormat FormatFor(string remoteName)
    {
        return new ExtractorFileNameFormat
        {
            Artist = profile.Name,
            Account = profile.Account,
            Id = profile.Id.Value.ToString(),
            FilenameWithoutExtension = Path.GetFileNameWithoutExtension(remoteName),
            Extension = Path.GetExtension(remoteName).Replace(".", "")
        };
    }

    var tasks = new List<NetTask>();

    foreach (var work in works)
    {
        if (work.PageCount > 1)
        {
            // No special handling exists for works with more than one page.
        }

        var isIllustration = work.Type == null || work.Type == "illustration";

        if (isIllustration)
        {
            var imageUrl = work.ImageUrls.Large;
            var remoteName = imageUrl.Split('/').Last();

            var download = NetTask.MakeDefault(imageUrl);
            download.Filename = remoteName;
            download.SaveFile = true;
            download.Referer = url;
            download.Format = FormatFor(remoteName);
            tasks.Add(download);
        }
        else if (work.Type == "ugoira")
        {
            var ugoira = PixivAPI.GetUgoiraAsync(work.Id.ToString()).Result;
            var zipUrl = ugoira.ZipUrls.Medium;
            var remoteName = zipUrl.Split('/').Last();

            var download = NetTask.MakeDefault(zipUrl);
            download.Filename = remoteName;
            download.SaveFile = true;
            download.Referer = url;

            // The downloaded archive is handed to the ugoira post-processor together with its frames.
            download.PostProcess = new PostprocessorTask
            {
                Postprocessor = new UgoiraPostprocessor { Frames = ugoira.Frames }
            };

            download.Format = FormatFor(remoteName);
            tasks.Add(download);
        }
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    foreach (var download in tasks)
    {
        download.Format.Extractor = extractorName;
    }

    return (tasks, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.UserArtist });
}
/// <summary>
/// Builds the download tasks for a gallery using the first configured cookie: downloads the gallery
/// page, gathers the image-page links from every gallery page, then downloads each image page in turn
/// and creates a file download task for the image address found in it.
/// </summary>
/// <param name="url">Gallery address to download and parse.</param>
/// <param name="option">Extractor options; <c>RecommendOption(url)</c> is used when null.</param>
/// <returns>
/// <c>(null, null)</c> when <c>option.ExtractInformation</c> is set; otherwise the tasks and an
/// <see cref="ExtractedInfo"/> of type <c>WorksComic</c>.
/// </returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    // Resolve the default option before anything dereferences it; the parameter defaults to null.
    if (option == null)
    {
        option = RecommendOption(url);
    }

    var html = NetTools.DownloadString(NetTask.MakeDefault(url, cookie: cookies[0]));
    var data = EHentaiExtractor.ParseArticleData(html, @"https://exhentai.org/.*?(?=\))");
    var pages = EHentaiExtractor.GetPagesUri(html);
    var image_urls = new List<string>();

    option.SimpleInfoCallback?.Invoke($"{data.Title}");

    if (option.ExtractInformation)
    {
        return (null, null /*data*/);
    }

    //
    //  Extract Image Url-Url
    //
    image_urls.AddRange(EHentaiExtractor.GetImagesUri(html));

    // The first gallery page is the one already downloaded above, so start at index 1.
    for (int i = 1; i < pages.Length; i++)
    {
        // PageReadCallback is available on the option interface; no down-cast is required.
        option.PageReadCallback?.Invoke(pages[i]);

        var page = NetTools.DownloadString(NetTask.MakeDefault(pages[i], cookie: cookies[0]));
        image_urls.AddRange(EHentaiExtractor.GetImagesUri(page));
    }

    //
    //  Extract Image Url
    //
    var result = new NetTask[image_urls.Count];

    var artist = "N/A";
    var group = "N/A";
    var series = "N/A";

    if (data.artist != null && data.artist.Length > 0)
    {
        artist = data.artist[0];
    }
    if (data.group != null && data.group.Length > 0)
    {
        group = data.group[0];
    }
    if (data.parody != null && data.parody.Length > 0)
    {
        series = data.parody[0];
    }

    // Fall back to the group name when no artist is listed.
    if (artist == "N/A" && group != "N/A")
    {
        artist = group;
    }

    for (int i = 0; i < image_urls.Count; i++)
    {
        var html2 = NetTools.DownloadString(NetTask.MakeDefault(image_urls[i], cookies[0]));
        var durl = EHentaiExtractor.GetImagesAddress(html2);

        var task = NetTask.MakeDefault(durl, cookies[0]);
        task.SaveFile = true;
        task.Filename = durl.Split('/').Last();
        task.Format = new ExtractorFileNameFormat
        {
            Title = data.Title,
            FilenameWithoutExtension = Path.GetFileNameWithoutExtension(task.Filename),
            Extension = Path.GetExtension(task.Filename).Replace(".", ""),
            OriginalTitle = data.SubTitle,
            Artist = artist,
            Group = group,
            Series = series
        };

        result[i] = task;

        // The first image doubles as the thumbnail.
        if (i == 0)
        {
            option.ThumbnailCallback?.Invoke(task);
        }
    }

    var result_list = result.ToList();
    var extractorName = GetType().Name.Replace("Extractor", "");
    result_list.ForEach(task => task.Format.Extractor = extractorName);

    return (result_list, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.WorksComic });
}
/// <summary>
/// Walks the Danbooru post listing for the search captured from <paramref name="url"/>, downloads
/// every listed post page and creates a file download task for the image or video address found in it.
/// </summary>
/// <param name="url">Search URL; the <c>search</c> capture group of <c>ValidUrl</c> supplies the tags.</param>
/// <param name="option">
/// Extractor options; a <c>DanbooruExtractorOption</c> of type <c>Images</c> is created when null.
/// <c>StartPage</c>/<c>EndPage</c> bound the page range and <c>ExcludeVideo</c> skips video posts.
/// </param>
/// <returns>The collected tasks and an <see cref="ExtractedInfo"/> of type <c>UserArtist</c>.</returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    const string SiteRoot = "https://danbooru.donmai.us";
    const string PostSection = "/html[1]/body[1]/div[1]/div[2]/div[1]/section[1]";

    var match = ValidUrl.Match(url).Groups;

    if (option == null)
    {
        option = new DanbooruExtractorOption { Type = ExtractorType.Images };
    }

    var tags = match["search"].Value;
    var result = new List<NetTask>();
    var page = 1;

    option.SimpleInfoCallback?.Invoke($"{tags}");

    var danbooruOption = option as DanbooruExtractorOption;

    if (danbooruOption.StartPage != null)
    {
        page = danbooruOption.StartPage[0].ToInt();
    }

    var end_page = int.MaxValue;
    if (danbooruOption.EndPage != null)
    {
        end_page = danbooruOption.EndPage[0].ToInt();
    }

    var already_thumbnail = false;

    while (true)
    {
        var durl = $"https://danbooru.donmai.us/posts?tags={tags}&page=" + page.ToString();

        option.PageReadCallback?.Invoke(durl);

        var html = NetTools.DownloadString(durl);
        var node = html.ToHtmlNode().SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[1]/section[1]/div[3]/div[1]/article");

        // No article on the page means the listing is exhausted.
        if (node == null)
        {
            break;
        }

        var ds = new List<string>();
        foreach (var sub in node)
        {
            var anchor = sub.SelectSingleNode("./a");
            if (anchor == null)
            {
                continue; // article without a post link: nothing to follow
            }
            ds.Add(SiteRoot + anchor.GetAttributeValue("href", ""));
        }

        var htmls = NetTools.DownloadStrings(ds);

        for (int i = 0; i < htmls.Count; i++)
        {
            var snode = htmls[i].ToHtmlNode();

            // The location of the image link depends on how many notice banners the page shows.
            var firstResizeLink = snode.SelectSingleNode(PostSection + "/div[1]/span[1]/a[1]");
            var secondResizeLink = snode.SelectSingleNode(PostSection + "/div[2]/span[1]/a[1]");
            var image = snode.SelectSingleNode(PostSection + "/section[1]/img[1]");
            var video = snode.SelectSingleNode(PostSection + "/section[1]/p[1]/a[1]");

            string img_url;

            // Just one banner
            if (firstResizeLink?.GetAttributeValue("id", "") == "image-resize-link")
            {
                img_url = firstResizeLink.GetAttributeValue("href", "");
            }
            // Two banner
            else if (secondResizeLink?.GetAttributeValue("id", "") == "image-resize-link")
            {
                img_url = secondResizeLink.GetAttributeValue("href", "");
            }
            // Three or none banner
            else if (image != null)
            {
                img_url = image.GetAttributeValue("src", "");
            }
            // Video URL
            else if (video != null)
            {
                if (danbooruOption.ExcludeVideo)
                {
                    continue;
                }
                img_url = video.GetAttributeValue("href", "");
            }
            else
            {
                // Unknown layout: report it and skip the post instead of queueing an empty URL.
                Log.Logs.Instance.PushError("[DanbooruExtractor] Cannot find html format! " + ds[i]);
                continue;
            }

            var task = NetTask.MakeDefault(img_url);
            task.SaveFile = true;
            task.Filename = img_url.Split('/').Last();
            task.Format = new ExtractorFileNameFormat
            {
                Search = tags,
                FilenameWithoutExtension = Path.GetFileNameWithoutExtension(task.Filename),
                Extension = Path.GetExtension(task.Filename).Replace(".", "")
            };

            result.Add(task);
        }

        // Report the thumbnail once, as soon as at least one task exists.
        if (!already_thumbnail && result.Count > 0)
        {
            option.ThumbnailCallback?.Invoke(result[0]);
            already_thumbnail = true;
        }

        page += 1;

        if (page > end_page)
        {
            break;
        }
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    result.ForEach(task => task.Format.Extractor = extractorName);

    // NOTE(review): a tag search is reported as UserArtist here; confirm whether Search was intended.
    return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.UserArtist });
}
}
/// <summary>
/// For <c>shopbrand</c> and <c>bestseller</c> listing URLs, visits every listing page, follows each
/// product link and creates a file download task for every product image that is not on the
/// filter list of known site graphics.
/// </summary>
/// <param name="url">Listing URL; the <c>menu</c> capture group of <c>ValidUrl</c> selects the mode.</param>
/// <param name="option">Extractor options; <c>RecommendOption(url)</c> is used when null.</param>
/// <returns>
/// The collected tasks (empty for any other menu) and an <see cref="ExtractedInfo"/> of type
/// <c>Search</c>.
/// </returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    const string SiteRoot = "http://www.hn-hn.co.kr";
    const string ListingRoot = "/html[1]/body[1]/div[3]/div[3]/div[1]/div[2]/div[3]/div[1]/div[5]";
    const string ProductRoot = "/html[1]/body[1]/div[3]/div[3]/div[1]/div[2]/div[1]/div[2]";

    var match = ValidUrl.Match(url).Groups;

    if (option == null)
    {
        option = RecommendOption(url);
    }

    // Pages are fetched with code page 51949.
    var encoding = Encoding.GetEncoding(51949);

    var mtask = NetTask.MakeDefault(url);
    mtask.Encoding = encoding;
    var html = NetTools.DownloadString(mtask);
    var node = html.ToHtmlNode();

    var result = new List<NetTask>();
    var menu = match["menu"].Value;

    if (menu == "shopbrand" || menu == "bestseller")
    {
        // Site chrome that appears among the product images and must not be downloaded.
        var filtering_filename = new string[]
        {
            "HN_Copyright2.jpg",
            "next_product.gif",
            "prev_product.gif",
            "btn_h8_spin_dw.gif",
            "btn_h8_spin_up.gif",
            "Review.jpg",
            "shoppingguide2.jpg",
            "sizetip-2.jpg"
        };

        var gallery = node.SelectSingleNode("/html[1]/head[1]/title[1]").InnerText.Trim();

        option.SimpleInfoCallback?.Invoke(gallery);

        // The "last" pager link carries the final page number after its '='; without it there is one page.
        var last_page_node = node.SelectSingleNode(ListingRoot + "/ol[1]/li[@class='last']/a");
        var last_page = 1;
        if (last_page_node != null)
        {
            last_page = last_page_node.GetAttributeValue("href", "").Split('=').Last().ToInt();
        }

        var page_urls = Enumerable.Range(1, last_page).Select(page => $"{url}&page={page}").ToList();
        var htmls = NetTools.DownloadStrings(page_urls);
        var sub_urls = new List<string>();

        foreach (var shtml in htmls)
        {
            // SelectNodes returns null when a listing page contains no product link.
            var links = shtml.ToHtmlNode().SelectNodes(ListingRoot + "/table[1]/tbody[1]//a");
            if (links == null)
            {
                continue;
            }
            sub_urls.AddRange(links.Select(x => SiteRoot + x.GetAttributeValue("href", "")));
        }

        option.ProgressMax?.Invoke(sub_urls.Count);

        var sub_htmls = new List<string>();
        foreach (var surl in sub_urls)
        {
            var task = NetTask.MakeDefault(surl);
            task.Encoding = encoding;
            sub_htmls.Add(NetTools.DownloadString(task));
            option.PostStatus?.Invoke(1);
        }

        foreach (var shtml in sub_htmls)
        {
            var snode = shtml.ToHtmlNode();
            var title = snode.SelectSingleNode(ProductRoot + "/div[1]/form[1]/div[1]/div[1]/h3[1]").InnerText.Trim();

            var imageNodes = snode.SelectNodes(ProductRoot + "//img");
            if (imageNodes == null)
            {
                continue; // product page without images
            }

            foreach (var imageNode in imageNodes)
            {
                // Absolute sources are used as they are; relative ones are rooted at the site and
                // stripped of their query string.
                var src = imageNode.GetAttributeValue("src", "");
                var img = src.StartsWith("http") ? src : SiteRoot + src.Split('?')[0];

                var fileName = img.Split('/').Last();
                if (filtering_filename.Contains(fileName))
                {
                    continue;
                }

                var task = NetTask.MakeDefault(img);
                task.SaveFile = true;
                task.Filename = fileName;
                task.Format = new ExtractorFileNameFormat
                {
                    Gallery = gallery,
                    Title = title,
                    FilenameWithoutExtension = Path.GetFileNameWithoutExtension(fileName),
                    Extension = Path.GetExtension(fileName).Replace(".", "")
                };

                result.Add(task);
            }
        }

        // Only report a thumbnail when something was actually found.
        if (result.Count > 0)
        {
            option.ThumbnailCallback?.Invoke(result[0]);
        }
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    result.ForEach(task => task.Format.Extractor = extractorName);

    return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.Search });
}
/// <summary>
/// For <c>ExtractorType.Images</c>, downloads the gallery block, the gallery page and the gallery image
/// list, then creates one file download task per listed image and fills in the gallery metadata.
/// </summary>
/// <param name="url">Gallery URL; the <c>id</c> capture group of <c>ValidUrl</c> identifies the gallery.</param>
/// <param name="option">Extractor options; <c>RecommendOption(url)</c> is used when null.</param>
/// <returns>
/// The tasks and an <see cref="ExtractedInfo"/> of type <c>WorksComic</c> carrying the gallery
/// information; <c>(null, null)</c> when the option type is not <c>Images</c>, when any of the three
/// downloads is empty, or when the image list contains no JSON array.
/// </returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    var match = ValidUrl.Match(url).Groups;

    if (option == null)
    {
        option = RecommendOption(url);
    }

    if (option.Type != ExtractorType.Images)
    {
        return (null, null);
    }

    // First entry of a metadata list, or "N/A" when the list is missing or empty.
    string FirstOrNotAvailable(IEnumerable<string> values)
    {
        return values?.FirstOrDefault() ?? "N/A";
    }

    var sinfo = new ExtractedInfo.WorksComic();
    var id = match["id"].Value;
    var block_url = $"https://ltn.hitomi.la/galleryblock/{id}.html";
    var imgs_url = $"https://ltn.hitomi.la/galleries/{id}.js";

    option.PageReadCallback?.Invoke(block_url);
    option.PageReadCallback?.Invoke(url);
    option.PageReadCallback?.Invoke(imgs_url);

    var urls = new List<string> { block_url, url, imgs_url };
    var strings = NetTools.DownloadStrings(urls);

    if (string.IsNullOrEmpty(strings[0]) || string.IsNullOrEmpty(strings[1]) || string.IsNullOrEmpty(strings[2]))
    {
        return (null, null);
    }

    var data1 = ParseGalleryBlock(strings[0]);
    var data2 = ParseGallery(strings[1]);
    var imgs = strings[2];

    // The script wraps a JSON array; without one there is nothing to parse.
    var arrayStart = imgs.IndexOf('[');
    if (arrayStart < 0)
    {
        return (null, null);
    }

    option.SimpleInfoCallback?.Invoke($"[{data1.Magic}] {data1.Title}");

    // download.js
    // The image host letter is derived from the last character of the gallery id.
    var number_of_frontends = 3;
    var subdomain = Convert.ToChar(97 + (Convert.ToInt32(id.Last()) % number_of_frontends));
    if (id.Last() == '0')
    {
        subdomain = 'a';
    }

    var arr = JArray.Parse(imgs.Substring(arrayStart));
    var img_urls = new List<string>();
    foreach (var obj in arr)
    {
        if (obj.Value<int>("haswebp") == 0)
        {
            img_urls.Add($"https://{subdomain}a.hitomi.la/galleries/{id}/{obj.Value<string>("name")}");
        }
        else
        {
            img_urls.Add($"https://{subdomain}a.hitomi.la/webp/{id}/{obj.Value<string>("name")}.webp");
        }
    }

    // The metadata is the same for every image, so resolve it once.
    var artist = FirstOrNotAvailable(data1.artist);
    var series = FirstOrNotAvailable(data1.parody);
    var group = FirstOrNotAvailable(data2.group);
    var character = FirstOrNotAvailable(data2.character);

    // Fall back to the group name when no artist is listed.
    if (artist == "N/A" && group != "N/A")
    {
        artist = group;
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    var result = new List<NetTask>();

    foreach (var img in img_urls)
    {
        var fileName = img.Split('/').Last();

        var task = NetTask.MakeDefault(img);
        task.SaveFile = true;
        task.Filename = fileName;
        task.Format = new ExtractorFileNameFormat
        {
            Title = data1.Title,
            Id = data1.Magic,
            Language = data1.Language,
            UploadDate = data1.Posted,
            FilenameWithoutExtension = Path.GetFileNameWithoutExtension(fileName),
            Extension = Path.GetExtension(fileName).Replace(".", ""),
            Artist = artist,
            Series = series,
            Group = group,
            Character = character,
            Extractor = extractorName
        };

        result.Add(task);
    }

    // An empty gallery has no thumbnail; FirstOrDefault leaves it null instead of throwing.
    var thumbnail = result.FirstOrDefault();
    if (thumbnail != null)
    {
        option.ThumbnailCallback?.Invoke(thumbnail);
    }

    sinfo.Thumbnail = thumbnail;
    sinfo.URL = url;
    sinfo.Title = data1.Title;
    sinfo.Author = data1.artist?.ToArray();
    sinfo.AuthorGroup = data2.group?.ToArray();
    sinfo.ShortInfo = $"[{data1.Magic}] {data1.Title}";
    sinfo.Tags = data1.Tags?.ToArray();
    sinfo.Characters = data2.character?.ToArray();
    sinfo.Language = data1.Language;
    sinfo.Parodies = data1.parody?.ToArray();

    return (result, new ExtractedInfo { Info = sinfo, Type = ExtractedInfo.ExtractedType.WorksComic });
}
/// <summary>
/// Collects the media of a Twitter account: reads the account's media page, keeps requesting further
/// timeline chunks until the post limit is reached or no more items are reported, and creates one
/// file download task per image URL and per video segment.
/// </summary>
/// <param name="url">Account URL; the <c>id</c> capture group of <c>ValidUrl</c> is the account name.</param>
/// <param name="option">
/// Extractor options; <c>RecommendOption(url)</c> is used when null. <c>LimitPosts</c>, when present,
/// caps the number of tweets that are read.
/// </param>
/// <returns>The collected tasks and an <see cref="ExtractedInfo"/> of type <c>UserArtist</c>.</returns>
/// <exception cref="ExtractorException">Thrown for hashtag URLs, which are not supported.</exception>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    if (option == null)
    {
        option = RecommendOption(url);
    }

    var match = ValidUrl.Match(url).Groups;
    var twitterOption = option as TwitterExtractorOption;

    var limit = int.MaxValue;
    if (twitterOption.LimitPosts != null)
    {
        limit = twitterOption.LimitPosts[0].ToInt();
    }

    if (match["id"].Value == "hashtag")
    {
        // The experimental hashtag scraper that used to sit here was compiled out
        // (#if DEBUG && false) and no longer matched this method's return type.
        throw new ExtractorException("'hashtag' is not support yet!");
    }

    var name = match["id"].Value;
    var html = NetTools.DownloadString($"https://twitter.com/{name}/media");
    var min_position = Regex.Match(html, @"data-min-position=""(.*?)""").Groups[1].Value;
    var node = html.ToHtmlNode();
    var tweets = node.SelectNodes("./html[1]/body[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/div[2]/div[1]/div[2]/ol[1]/li[@data-item-type='tweet']");
    var urls = new List<string>();
    var user = node.SelectSingleNode("/html[1]/body[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/h1[1]/a[1]").InnerText;
    var videos = new List<(string, List<string>)>();

    // SelectNodes returns null when the first page holds no tweet at all.
    var post_count = tweets?.Count ?? 0;
    var last_url_count = 0;

    option.SimpleInfoCallback?.Invoke($"{user} ({name})");

    if (tweets != null)
    {
        foreach (var tweet in tweets)
        {
            urls.AddRange(parse_tweet_hashtag(twitterOption, tweet, videos));
        }
    }

    while (post_count < limit)
    {
        // Parse the response once and reuse it for every field read below.
        var next = JToken.Parse(profile_query(twitterOption, name, min_position));
        var html2 = next["items_html"].ToString();
        var tweets2 = html2.ToHtmlNode().SelectNodes("./li[@data-item-type='tweet']");

        if (tweets2 == null)
        {
            break;
        }

        foreach (var tweet in tweets2)
        {
            urls.AddRange(parse_tweet_hashtag(twitterOption, tweet, videos));
        }

        // Report only the URLs added by this chunk.
        option.PostStatus?.Invoke(urls.Count - last_url_count);
        last_url_count = urls.Count;

        post_count += tweets2.Count;
        min_position = next["min_position"].ToString();

        if (!(bool)next["has_more_items"])
        {
            break;
        }

        // Pause between timeline requests.
        Thread.Sleep(3000);
    }

    var result = new List<NetTask>();

    foreach (var surl in urls)
    {
        var fn = surl.Split('/').Last();

        var task = NetTask.MakeDefault(surl);
        task.SaveFile = true;
        task.Filename = fn;
        task.Format = new ExtractorFileNameFormat
        {
            FilenameWithoutExtension = Path.GetFileNameWithoutExtension(fn),
            Extension = Path.GetExtension(fn).Replace(".", ""),
            Account = name,
            User = user,
        };
        result.Add(task);
    }

    foreach (var video in videos)
    {
        // Segments of one video are numbered 000, 001, ... below a folder named after Item1.
        var count = 0;
        foreach (var ts in video.Item2)
        {
            var fn = ts.Split('/').Last();

            var task = NetTask.MakeDefault(ts);
            task.SaveFile = true;
            task.Filename = fn;
            task.Format = new ExtractorFileNameFormat
            {
                FilenameWithoutExtension = video.Item1 + "/" + count++.ToString("000"),
                Extension = Path.GetExtension(fn).Replace(".", ""),
                Account = name,
                User = user,
            };
            result.Add(task);
        }
    }

    var extractorName = GetType().Name.Replace("Extractor", "");
    result.ForEach(task => task.Format.Extractor = extractorName);

    return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.UserArtist });
}
/// <summary>
/// Handles DCInside gallery URLs. For an article view it creates one download task per attached image
/// or reads the article's comments page by page; for an article list it parses the gallery.
/// </summary>
/// <param name="url">
/// Gallery URL. <c>&amp;exception_mode=recommend</c> is appended when <c>OnlyRecommend</c> is set.
/// </param>
/// <param name="option">
/// Extractor options; a <c>DCInsideExtractorOption</c> of type <c>Images</c> is created when null.
/// </param>
/// <returns>
/// The image tasks with a null info object for an article view in image mode; <c>(null, null)</c> for
/// the information and comment modes; an empty list with a null info object when the download
/// returned null; otherwise the (possibly empty) list and an <see cref="ExtractedInfo"/> of type
/// <c>Community</c>.
/// </returns>
/// <exception cref="ExtractorException">Thrown when the URL is not a <c>gall</c> URL.</exception>
/// <remarks>
/// Exceptions raised while handling a <c>gall</c> URL are logged and not rethrown.
/// </remarks>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    if (option == null)
    {
        option = new DCInsideExtractorOption { Type = ExtractorType.Images };
    }

    if ((option as DCInsideExtractorOption).OnlyRecommend)
    {
        url += "&exception_mode=recommend";
    }

    var match = ValidUrl.Match(url).Groups;
    var result = new List<NetTask>();
    var html = NetTools.DownloadString(url);

    if (html == null)
    {
        return (result, null);
    }

    if (match[1].Value != "gall")
    {
        // Not support mobile page.
        throw new ExtractorException("[DCInside Extractor] Not support mobile page yet.");
    }

    try
    {
        //
        //  Parse article
        //
        if (match[3].Value == "view")
        {
            var article = ParseBoardView(html, match[2].Value != "");

            if (option.Type == ExtractorType.Images && option.ExtractInformation == false)
            {
                if (article.ImagesLink == null || article.ImagesLink.Count == 0)
                {
                    throw new Exception("Nothing to download!");
                }

                option.SimpleInfoCallback?.Invoke($"{article.Title}");

                for (int i = 0; i < article.ImagesLink.Count; i++)
                {
                    var task = NetTask.MakeDefault(article.ImagesLink[i]);
                    task.Filename = article.FilesName[i];
                    task.SaveFile = true;
                    task.Referer = url;
                    task.Format = new ExtractorFileNameFormat
                    {
                        Id = article.Id,
                        Gallery = article.GalleryName,
                        Title = article.Title,
                        FilenameWithoutExtension = (i + 1).ToString("000"),
                        Extension = Path.GetExtension(article.FilesName[i]).Replace(".", ""),
                    };
                    result.Add(task);
                }

                result.ForEach(task => task.Format.Extractor = GetType().Name.Replace("Extractor", ""));
                return (result, null /*article*/);
            }
            else if (option.Type == ExtractorType.ArticleInformation || option.ExtractInformation == true)
            {
                return (null, null /*article*/);
            }
            else if (option.Type == ExtractorType.Comments)
            {
                var cc = new List<DCComment>();
                var comments = GetComments(article, "1");
                cc.Add(comments);

                //
                //  To avoid server blocks
                //
                Thread.Sleep(2000);

                int tcount = comments.total_cnt;
                int count = 100;

                // Request page 2, 3, ... until the running count reaches the reported total or a
                // page comes back empty. The page index advances on every request and the count
                // grows only by the number of comments actually received.
                for (int page = 2; count < tcount; page++)
                {
                    comments = GetComments(article, page.ToString());
                    if (comments.comment_cnt == 0)
                    {
                        break;
                    }
                    count += comments.comment_cnt;
                    cc.Add(comments);
                    Thread.Sleep(2000);
                }

                return (null, null /*GetComments(article, "0")*/);
            }
            else
            {
                throw new Exception("You cannot do that with this URL. " + url);
            }
        }
        //
        //  Parse Articles List
        //
        else if (match[3].Value == "lists")
        {
            DCGallery gallery;

            // An empty second capture group selects the regular gallery parser.
            if (match[2].Value == "")
            {
                gallery = ParseGallery(html);
            }
            else
            {
                gallery = ParseMinorGallery(html);
            }

            if (option.Type == ExtractorType.GalleryInformation || option.ExtractInformation == true)
            {
                return (null, null /*gallery*/);
            }
            else
            {
                throw new Exception("You cannot do that with this URL." + url);
            }
        }
    }
    catch (Exception e)
    {
        // Failures, including the ones thrown above, are logged and the method falls through
        // to the common return below.
        Log.Logs.Instance.PushError("[DCInsideExtractor] Extract error - " + option.Type.ToString() + " - " + e.Message + "\r\n" + e.StackTrace);
    }

    result.ForEach(task => task.Format.Extractor = GetType().Name.Replace("Extractor", ""));
    return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.Community });
}
/// <summary>
/// Runs one polling cycle: probes the next five Inha University article ids after the most
/// recently stored one, then scans every configured department notice board, storing and
/// announcing each item that is not yet known. Finally logs and increments the cycle counter.
/// </summary>
public static async Task LoopInternal()
{
    // Inha Univ Article
    {
        var latestLink = ExtractManager.InhaUnivArticles.Last().Link;
        var firstCandidate = Convert.ToInt32(latestLink.Split('/')[6]) + 1;
        var candidateIds = Enumerable.Range(firstCandidate, 5).ToList();
        var candidateUrls = candidateIds.Select(x => $"https://www.inha.ac.kr/bbs/kr/8/{x}/artclView.do").ToList();
        var pages = NetTools.DownloadStrings(candidateUrls);

        for (int idx = 0; idx < pages.Count; idx++)
        {
            try
            {
                var article = InhaUnivExtractor.Parse(pages[idx]);
                article.Link = $"https://www.inha.ac.kr/bbs/kr/8/{candidateIds[idx]}/artclView.do";

                ExtractManager.InhaUnivArticles.Add(article);
                ExtractManager.InhaUnivDB.Add(article);

                Log.Logs.Instance.Push($"[Loop] New item is added. - IUA - {article.Title}");
                await BotManager.Instance.Notice(article.ToString(), "MSG-MAIN");
            }
            catch
            {
                // Any failure for a candidate id is ignored and the next id is tried.
            }
        }
    }

    // Department Notices
    {
        // Lazy downloading
        foreach (var department in DepartmentList.Lists)
        {
            try
            {
                if (department.Item3 == "")
                {
                    continue;
                }

                var request = NetTask.MakeDefault(department.Item3);
                if (department.Item2 == "s5")
                {
                    // Style "s5" pages are downloaded using code page 51949.
                    request.Encoding = Encoding.GetEncoding(51949);
                }

                var html = NetTools.DownloadString(request);

                // Stays null when the style tag is not one of s1..s6.
                List<DepartmentDBModel> notices = null;
                switch (department.Item2)
                {
                    case "s1":
                        notices = DepartmentExtractor.ExtractStyle1(html, department.Item1);
                        break;
                    case "s2":
                        notices = DepartmentExtractor.ExtractStyle2(html, department.Item1);
                        break;
                    case "s3":
                        notices = DepartmentExtractor.ExtractStyle3(html, department.Item1);
                        break;
                    case "s4":
                        notices = DepartmentExtractor.ExtractStyle4(html, department.Item1);
                        break;
                    case "s5":
                        notices = DepartmentExtractor.ExtractStyle5(html, department.Item1);
                        break;
                    case "s6":
                        notices = DepartmentExtractor.ExtractStyle6(html, department.Item1);
                        break;
                }

                // Numbers of the notices already stored for this department.
                var knownNumbers = new HashSet<int>();
                var storedForDepartment = ExtractManager.DepartmentArticles.Where(x => x.Department == department.Item1).ToList();
                foreach (var stored in storedForDepartment)
                {
                    knownNumbers.Add(Convert.ToInt32(stored.Number));
                }

                // Walk backwards to the last entry that is already known (-1 if none is).
                int lastKnown = notices.Count - 1;
                while (lastKnown >= 0 && !knownNumbers.Contains(Convert.ToInt32(notices[lastKnown].Number)))
                {
                    lastKnown--;
                }

                // Everything after that entry is new.
                for (int i = lastKnown + 1; i < notices.Count; i++)
                {
                    var notice = notices[i];

                    ExtractManager.DepartmentArticles.Add(notice);
                    ExtractManager.DepartmentDB.Add(notice);

                    Log.Logs.Instance.Push($"[Loop] New item is added. - DN - {notice.Title}");
                    await BotManager.Instance.Notice(notice.ToString(), "MSG-" + department.Item1);
                }
            }
            catch (Exception e)
            {
                Log.Logs.Instance.PushError("[Loop] '" + department.Item1 + "' " + e.Message + "\r\n" + e.StackTrace);
            }
        }
    }

    Log.Logs.Instance.Push("[Loop] Cycle " + Count.ToString());
    Count++;
}
/// <summary>
/// Extracts one download task per image for every episode listed on a comic title page.
/// </summary>
/// <param name="url">Title page URL; its <c>host</c> group from <c>ValidUrl</c> prefixes the thumbnail path.</param>
/// <param name="option">Extraction options and callbacks. When null, <c>RecommendOption(url)</c> is used.</param>
/// <returns>
/// The image tasks with a WorksComic info object, or an empty list and null when the title
/// page could not be downloaded.
/// </returns>
public override (List<NetTask>, ExtractedInfo) Extract(string url, IExtractorOption option = null)
{
    if (option == null)
    {
        option = RecommendOption(url);
    }

    var result = new List<NetTask>();
    var html = NetTools.DownloadString(url);

    // A failed download yields an empty result instead of a NullReferenceException.
    if (html == null)
    {
        return (result, null);
    }

    var match = ValidUrl.Match(url).Groups;
    var node = html.ToHtmlNode();

    var title = node.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/h1[1]").InnerText.Trim();

    // The genre/artist lookups were removed: their values were never used, and a missing
    // node made the whole extraction fail.

    var sub_urls = new List<string>();
    var sub_titles = new List<string>();

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var episodes = node.SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[2]/div[1]/div[1]/div");
    if (episodes != null)
    {
        foreach (var episode in episodes)
        {
            var tag_a = episode.SelectSingleNode("./div[2]/h2[1]/a[1]");
            if (tag_a == null)
            {
                // Skip rows without an episode link; url and title are always added together
                // so the two lists stay index-aligned.
                continue;
            }

            sub_urls.Add(tag_a.GetAttributeValue("href", ""));
            sub_titles.Add(tag_a.InnerText.Trim());
        }
    }

    option.SimpleInfoCallback?.Invoke(title);

    var thumbnail = node.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/ul[1]/li[1]/div[1]/div[1]/a[1]/img[1]");
    if (thumbnail != null)
    {
        option.ThumbnailCallback?.Invoke(NetTask.MakeDefault(match["host"].Value + thumbnail.GetAttributeValue("src", "")));
    }

    option.ProgressMax?.Invoke(sub_urls.Count);

    var sub_htmls = NetTools.DownloadStrings(sub_urls, "", () =>
    {
        option.PostStatus?.Invoke(1);
    });

    for (int i = 0; i < sub_urls.Count; i++)
    {
        // An episode whose page failed to download is skipped so the remaining episodes
        // are still extracted.
        if (sub_htmls[i] == null)
        {
            continue;
        }

        var snode = sub_htmls[i].ToHtmlNode();
        var images = snode.SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[2]/div[1]/div[2]/ul[1]//li/div[1]/img[1]");
        if (images == null)
        {
            // No images matched on this episode page.
            continue;
        }

        int count = 1;
        foreach (var img in images)
        {
            // Prefer data-src and fall back to src when it is missing or blank.
            var img_src = img.GetAttributeValue("data-src", "");
            if (string.IsNullOrWhiteSpace(img_src))
            {
                img_src = img.GetAttributeValue("src", "");
            }

            var task = NetTask.MakeDefault(HttpUtility.HtmlDecode(img_src));
            task.SaveFile = true;
            task.Filename = count.ToString("000") + ".jpg";
            task.Format = new ExtractorFileNameFormat
            {
                Title = title,
                Episode = sub_titles[i],
                FilenameWithoutExtension = count.ToString("000"),
                Extension = Path.GetExtension(task.Filename).Replace(".", ""),
            };

            result.Add(task);
            count++;
        }
    }

    result.ForEach(task => task.Format.Extractor = GetType().Name.Replace("Extractor", ""));
    return (result, new ExtractedInfo { Type = ExtractedInfo.ExtractedType.WorksComic });
}
/// <summary>
/// Crawls the list pages of the "monmusu" gallery from the page number in <c>textBox3</c> to
/// the page number in <c>textBox4</c> (inclusive), collects the parsed articles, and on
/// completion appends them to <c>DCGalleryAnalyzer.Instance.Articles</c> and saves.
/// Progress and failures are reported through <c>status</c> and <c>append</c>.
/// </summary>
private void start()
{
    // Upper bound for the cookie handshake so a request that never delivers the expected
    // header cannot loop forever without any delay.
    const int MaxCookieAttempts = 5;

    int ps;
    if (!int.TryParse(textBox3.Text, out ps))
    {
        append("숫자만 입력해주세요.");
        return;
    }

    int pe;
    if (!int.TryParse(textBox4.Text, out pe))
    {
        append("숫자만 입력해주세요.");
        return;
    }

    var id = "monmusu";
    var starts = ps;

    status("진행중...[0/" + (pe - ps + 1).ToString("#,#") + "]");

    bool real_cookie_receive = false;
    int cookieAttempts = 0;
    var articles = new List<DCInsidePageArticle>();

    try
    {
        for (; ps <= pe; ps++)
        {
            // Only the minor-gallery URL form is used.
            var url = $"https://gall.dcinside.com/mgallery/board/lists/?id={id}&page={ps}";

            Logger.Instance.Push("Downloading String... = " + url);

            var task = NetTask.MakeDefault(url);
            task.Cookie = COOKIES;

            // Snapshot taken before the request: the header callback may flip the flag
            // while the download is running.
            bool hadCookieBeforeRequest = real_cookie_receive;

            if (!hadCookieBeforeRequest)
            {
                task.HeaderReceive = (ef) =>
                {
                    append("헤더받음: " + ef);
                    var xx = ef.Split('\n').First(x => x.Contains("Set-Cookie")).Replace("Set-Cookie: ", "").Trim();
                    COOKIES = "PHPSESSID=" + SESS;
                    COOKIES += "; PHPSESSKEY=" + xx.Split(new[] { "PHPSESSKEY=" }, StringSplitOptions.None)[1].Split(';')[0].Trim();
                    COOKIES += $"; block_alert_{id}=1";
                    real_cookie_receive = true;
                    append("쿠키 변경됨: " + COOKIES);
                };
            }

            var html = NetTools.DownloadString(task);

            if (!hadCookieBeforeRequest)
            {
                if (!real_cookie_receive && ++cookieAttempts >= MaxCookieAttempts)
                {
                    append("실패: 세션 쿠키를 받지 못했습니다.");
                    break;
                }

                // This request was sent without the session cookie: request the same page
                // again (ps-- cancels the loop's ps++).
                append("PS1");
                ps--;
                continue;
            }

            Logger.Instance.Push("Downloaded String = " + url);

            if (string.IsNullOrEmpty(html))
            {
                // Report the failed page and move on to the next one.
                append("실패: " + url);
                Logger.Instance.Push("Fail: " + url);
            }
            else
            {
                // Short page carrying the "access restricted for violating operating rules" notice.
                if (html.Length < 1000 && html.Contains("해당 마이너 갤러리는 운영원칙 위반으로 접근이 제한되었습니다."))
                {
                    append("실패: 접근 거부, 접근 가능한 아이디로 재시도하시기 바랍니다.");
                    break;
                }

                // Restriction notice with an explicit reason: stop immediately without saving.
                if (html.Contains("해당 마이너 갤러리는 운영원칙 위반(사유: 누드패치, 성행위 패치, 음란성 게시물 공지 등록 및 정리 안됨) 으로 접근이 제한되었습니다."))
                {
                    return;
                }

                var gall = DCInsideUtils.ParseMinorGallery(html);

                // Fall back to the regular-gallery parser when the minor parser finds nothing.
                if (gall.articles == null || gall.articles.Length == 0)
                {
                    gall = DCInsideUtils.ParseGallery(html);
                }

                // A null result after the fallback is treated like an empty page, so the
                // articles collected so far are still saved.
                if (gall.articles == null || gall.articles.Length == 0)
                {
                    break;
                }

                articles.AddRange(gall.articles);

                Logger.Instance.Push("Parse: " + url);
            }

            // Remaining-time estimate at 720 ms per remaining page.
            var remaining = TimeSpan.FromMilliseconds(720 * (pe - ps));
            var eta = "";
            if (remaining.Days > 0)
            {
                eta += remaining.Days.ToString() + "일 ";
            }
            if (remaining.Days > 0 || remaining.Hours > 0)
            {
                eta += remaining.Hours.ToString() + "시간 ";
            }
            if (remaining.Days > 0 || remaining.Hours > 0 || remaining.Minutes > 0)
            {
                eta += remaining.Minutes.ToString() + "분 ";
            }
            if (remaining.Days > 0 || remaining.Hours > 0 || remaining.Minutes > 0 || remaining.Seconds > 0)
            {
                eta += remaining.Seconds.ToString() + "초 남음";
            }

            status("진행중...[" + (ps - starts + 1).ToString() + "/" + (pe - starts + 1).ToString("#,#") + "] 남은시간: " + eta);

            Logger.Instance.Push("next: " + url + $" || {ps}/{pe}/{starts}");

            // Pause between page requests.
            Thread.Sleep(700);
        }

        DCGalleryAnalyzer.Instance.Articles.AddRange(articles);
        DCGalleryAnalyzer.Instance.Save();

        status("완료");
    }
    catch (Exception e)
    {
        append("실패: " + e.Message + "\r\n" + e.StackTrace);
    }
}