/// <summary>
/// Identifies the base URL for package source files.
/// </summary>
/// <param name="purl">Package URL whose name is looked up in the pool listing.</param>
/// <param name="pool">Pool segment of the archive URL (e.g. a distribution name).</param>
/// <returns>The archive base URL, or null when none could be determined.</returns>
private async Task<string?> GetArchiveBaseUrlForProject(PackageURL purl, string pool)
{
    try
    {
        HttpClient client = CreateHttpClient();
        string? pageHtml = await GetHttpStringCache(client, $"{ENV_UBUNTU_ENDPOINT}/{pool}/{purl.Name}", neverThrow: true);
        if (pageHtml is null)
        {
            return null;
        }

        AngleSharp.Html.Dom.IHtmlDocument parsedPage = await new HtmlParser().ParseDocumentAsync(pageHtml);
        foreach (AngleSharp.Dom.IElement link in parsedPage.QuerySelectorAll("a"))
        {
            string? target = link.GetAttribute("href");
            if (target == null || !target.EndsWith(".dsc"))
            {
                continue;
            }
            // The directory portion of the first .dsc link is the archive base.
            Match dscMatch = Regex.Match(target, "(.+)/[^/]+\\.dsc");
            if (dscMatch.Success)
            {
                return dscMatch.Groups[1].Value.Trim();
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Debug(ex, "Error fetching Ubuntu archive base URL for {0}: {1}", purl.ToString(), ex.Message);
    }
    return null;
}
/// <summary>
/// Identifies the available pools for a given Ubuntu project. For example, 'xenial'.
/// </summary>
/// <param name="purl">Package URL to look up (only name is used).</param>
/// <returns>List of pool names (empty on failure).</returns>
private async Task<IEnumerable<string>> GetPoolsForProject(PackageURL purl)
{
    HashSet<string> pools = new();
    try
    {
        HttpClient httpClient = CreateHttpClient();
        // BUG FIX: the query previously read "suite=all§ion=all" — an
        // HTML-entity-mangled "&sect;" — which dropped the section filter.
        // Restored the intended "&section=all" parameter.
        string? searchResults = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/search?keywords={purl.Name}&searchon=names&exact=1&suite=all&section=all", neverThrow: true);
        // BUG FIX: GetHttpStringCache(neverThrow: true) can return null; previously
        // the null flowed into ParseDocumentAsync and relied on the catch-all below.
        if (searchResults == null)
        {
            return pools;
        }
        AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(searchResults);
        foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a.resultlink"))
        {
            string? href = anchor.GetAttribute("href");
            if (href != null)
            {
                // The first path segment of a result link is the pool name.
                Match match = Regex.Match(href, "^/([^/]+)/.+");
                if (match.Success)
                {
                    string pool = match.Groups[1].Value.Trim();
                    Logger.Debug("Identified pool: {0}", pool);
                    pools.Add(pool);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Debug(ex, "Error fetching Ubuntu pools for {0}: {1}", purl.ToString(), ex.Message);
    }
    return pools;
}
/// <summary>
/// Maps a release to Newznab categories based on its tab name, heading
/// description, and the page's type info.
/// </summary>
/// <param name="dom">Parsed release page.</param>
/// <param name="t">Tab element whose text is the release name.</param>
/// <param name="tr">Tab content element containing the release heading.</param>
/// <returns>The mapped Newznab categories.</returns>
private ICollection<IndexerCategory> MapCategories(AngleSharp.Html.Dom.IHtmlDocument dom, AngleSharp.Dom.IElement t, AngleSharp.Dom.IElement tr)
{
    var releaseName = t.TextContent;
    var releaseDescription = tr.QuerySelector("h3.tracker_info_bold").TextContent;
    var typeInfo = dom.QuerySelector("div.releases-date:contains('Тип:')").TextContent;

    string trackerCategory;
    // OVA is checked first because OVA releases otherwise look like regular
    // anime with "OVA" in the release name or description.
    if (CategorieOVARegex.IsMatch(releaseName) || CategorieOVARegex.IsMatch(releaseDescription))
    {
        trackerCategory = "OVA/ONA/Special";
    }
    // Movies next, since some dorama movie releases must go to the movies category.
    else if (CategorieMovieRegex.IsMatch(releaseName) || CategorieMovieRegex.IsMatch(releaseDescription))
    {
        trackerCategory = "Movies";
    }
    // Doramas are flagged in the type info, which may list several types at once.
    else if (CategorieDoramaRegex.IsMatch(releaseName) || CategorieDoramaRegex.IsMatch(typeInfo))
    {
        trackerCategory = "Dorama";
    }
    else
    {
        trackerCategory = "TV Anime";
    }
    return _categories.MapTrackerCatDescToNewznab(trackerCategory);
}
/// <summary> /// Download one CPAN package and extract it to the target directory. /// </summary> /// <param name="purl">Package URL of the package to download.</param> /// <returns>n/a</returns> public override async Task <IEnumerable <string> > DownloadVersionAsync(PackageURL purl, bool doExtract, bool cached = false) { Logger.Trace("DownloadVersion {0}", purl?.ToString()); string? packageName = purl?.Name; string? packageVersion = purl?.Version; List <string> downloadedPaths = new(); if (string.IsNullOrWhiteSpace(packageName) || string.IsNullOrWhiteSpace(packageVersion)) { Logger.Debug("Unable to download [{0} {1}]. Both must be defined.", packageName, packageVersion); return(downloadedPaths); } // Locate the URL HttpClient httpClient = CreateHttpClient(); string? packageVersionUrl = null; string? html = await GetHttpStringCache(httpClient, $"{ENV_CPAN_ENDPOINT}/release/{packageName}"); HtmlParser parser = new(); AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(html); foreach (AngleSharp.Dom.IElement option in document.QuerySelectorAll("div.release select.extend option")) { if (!option.HasAttribute("value")) { continue; } string?value = option.GetAttribute("value"); string version = value.Split('-').Last(); if (version.StartsWith("v", StringComparison.InvariantCultureIgnoreCase)) { version = version[1..];
/// <summary>
/// Downloads one listing page into the work folder (unless a cached copy may
/// be reused), then recursively follows the "next page" link until the last
/// page is reached.
/// </summary>
/// <param name="url">URL of the page to download.</param>
/// <param name="page">1-based index of the page being fetched.</param>
/// <returns>The index of the last page found.</returns>
static int DownloadAndFindNext(string url, int page)
{
    Console.WriteLine("Downloading page " + page);
    const string NextPageSelector = "div.row > div > div.card > div.card-body > div.paginator-top > ul > li.page-item > a[rel=\"next\"]";

    // Fetch the page source unless it is already cached and we are not overwriting.
    string path = Path.GetFullPath(@"work\page" + page + ".html");
    if (!File.Exists(path) || OverwritePages)
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(url, path);
        }
    }

    string source = File.ReadAllText(path);
    AngleSharp.Html.Dom.IHtmlDocument document = Parser.ParseDocument(source);
    IHtmlCollection<IElement> pages = document.QuerySelectorAll(NextPageSelector);

    // No "next" link means this was the final page.
    if (pages.Length == 0)
    {
        return page;
    }

    var nextLink = (AngleSharp.Html.Dom.IHtmlAnchorElement)pages[0];
    return DownloadAndFindNext(nextLink.Href, page + 1);
}
/// <summary>
/// Fetches all tasks listed on a category page.
/// </summary>
/// <param name="link">Page link, relative to the site domain.</param>
/// <returns>List of all tasks, or null when the page could not be processed.</returns>
public List<Objects.Task> GetTasksFromPage(string link)
{
    Log.ProcessMessage("Пытаемся получить список заданий со страницы " + link);
    try
    {
        string pageSource = http.GetAsync(Domain + link).Result.Content.ReadAsStringAsync().Result;
        var parser = new HtmlParser();
        AngleSharp.Html.Dom.IHtmlDocument page = parser.ParseDocument(pageSource);

        var foundTasks = new List<Objects.Task>();
        foreach (var anchor in page.QuerySelectorAll("div.row.set_href .title a"))
        {
            // Each title link is resolved into a full task; unparsable ones are skipped.
            var parsedTask = GetTaskFromLink(anchor.GetAttribute("href"));
            if (parsedTask != null)
            {
                foundTasks.Add(parsedTask);
            }
        }
        Log.GoodMessage("Получили список заданий со страницы " + link);
        return foundTasks;
    }
    catch
    {
        Log.ExMessage("Не удалось получить список заданий со страницы " + link);
        return null;
    }
}
/// <summary>
/// Extracts the last page number from the nested pager cells of the crawled
/// document and stores it in LatestPage (null when no pager text is present).
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var pagerTexts = document
        .QuerySelectorAll("tbody tr td table tbody tr td span")
        .Select(x => x.TextContent.Trim())
        .ToArray();
    var lastEntry = pagerTexts.LastOrDefault();
    LatestPage = string.IsNullOrEmpty(lastEntry) ? (int?)null : lastEntry.ToInt();
}
/// <summary>
/// Refreshes the global list of job categories from the /jobs/ page.
/// </summary>
/// <returns>True when the update succeeded, false otherwise.</returns>
public bool UpdateWorkCategory()
{
    Log.ProcessMessage("Пытаемся обновить список категорий");
    try
    {
        string pageSource = http.GetAsync(Domain + "/jobs/").Result.Content.ReadAsStringAsync().Result;
        var parser = new HtmlParser();
        AngleSharp.Html.Dom.IHtmlDocument page = parser.ParseDocument(pageSource);

        foreach (var categoryLink in page.QuerySelectorAll(".collapse li a[data-category_id]"))
        {
            Objects.Category.Categories.Add(new Objects.Category
            {
                Name = categoryLink.TextContent,
                Href = categoryLink.GetAttribute("href")
            });
        }
        Log.GoodMessage("Обновили список категорий");
        return true;
    }
    catch
    {
        Log.ExMessage("Не удалось обновить список категорий");
        return false;
    }
}
/// <summary>
/// Fetches the data of a single task page.
/// </summary>
/// <param name="link">Task link, relative to the site domain.</param>
/// <returns>The parsed task, or null for vacancy links or on any failure.</returns>
public Objects.Task GetTaskFromLink(string link)
{
    Log.ProcessMessage("Пытаемся получить задание " + link);
    try
    {
        // Vacancy pages use a different layout and are not tasks.
        if (link.Contains("vacancies"))
        {
            return null;
        }
        string get = http.GetAsync(Domain + link).Result.Content.ReadAsStringAsync().Result;
        HtmlParser Parser = new HtmlParser();
        AngleSharp.Html.Dom.IHtmlDocument html = Parser.ParseDocument(get);
        // Query the optional price element once instead of twice.
        var priceElement = html.QuerySelector(".page_header_content .title.amount");
        var task = new Objects.Task
        {
            Title = html.QuerySelector(".col h1").TextContent,
            Discription = html.QuerySelector(".text_field p").TextContent,
            Price = priceElement != null ? priceElement.TextContent : "Бюджет не определен",
            // BUG FIX: data-timestamp is machine-generated, so parse it with the
            // invariant culture; the current culture may expect ',' as the
            // decimal separator and misparse or throw.
            TimeStamp = double.Parse(
                html.QuerySelector(".cols_table.no_hover .text-muted span[data-timestamp]").GetAttribute("data-timestamp"),
                System.Globalization.CultureInfo.InvariantCulture),
            Applications = html.QuerySelector(".block-content .title").TextContent,
            Link = Domain + link
        };
        Log.GoodMessage("Получили задание " + link);
        return task;
    }
    catch
    {
        Log.ExMessage("Не удалось получить задание " + link);
        return null;
    }
}
/// <summary>
/// Parses the show-page date text (site-local, UTC+3, possibly using the
/// "Вчера"/"Сегодня" placeholders) and converts it to UTC. Falls back to
/// DateTime.UtcNow when the text cannot be parsed.
/// </summary>
private DateTime GetDateFromShowPage(AngleSharp.Html.Dom.IHtmlDocument content)
{
    const string dateFormat = "d-MM-yyyy";
    const string dateTimeFormat = dateFormat + ", HH:mm";
    // Would be better to use AssumeLocal and provide "ru-RU" culture,
    // but doesn't work cross-platform
    const DateTimeStyles style = DateTimeStyles.AssumeUniversal;
    // Russian Standard Time is +03:00 with no DST; a fixed offset avoids the
    // cross-platform problems of TimeZoneInfo.FindSystemTimeZoneById
    // ("Russian Standard Time").
    const int russianStandardTimeDiff = 3;

    var culture = CultureInfo.InvariantCulture;
    var siteLocalNow = DateTime.UtcNow.AddHours(russianStandardTimeDiff);

    // Substitute "yesterday"/"today" placeholders with concrete dates.
    var dateText = GetDateFromDocument(content)
        .Replace("Вчера", siteLocalNow.AddDays(-1).ToString(dateFormat))
        .Replace("Сегодня", siteLocalNow.ToString(dateFormat));

    if (!DateTime.TryParseExact(dateText, dateTimeFormat, culture, style, out var parsed))
    {
        Logger.Warn($"[AniDub] Date time couldn't be parsed on. Date text: {dateText}");
        return DateTime.UtcNow;
    }

    // Undo the site's fixed +3h offset to land on true UTC.
    return parsed.ToUniversalTime().AddHours(-russianStandardTimeDiff);
}
/// <summary>
/// Parses a table-layout board page: collects the header cells, flattens every
/// "view list_tr_humordata" row's cells into one array (preferring anchor text
/// per cell), then emits one CrawlingData per row — id, title, author, date in
/// "yy/MM/dd HH:mm", view count, recommend count, and a composed href. Rows are
/// processed with Parallel.For; the cloned culture widens TwoDigitYearMax so
/// two-digit years resolve into the future window.
/// NOTE(review): assumes every data row has exactly thContent.Length cells and
/// that tdHref aligns 1:1 with rows — confirm against the crawled markup.
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) { var thContent = document.QuerySelectorAll("thead tr th") .Select(x => x.TextContent.Trim()) .ToArray(); var tdContent = document.QuerySelectorAll("tbody tr") .Where(x => x.ClassName == "view list_tr_humordata") .Select(x => x.QuerySelectorAll("td")) .SelectMany(x => x.Select(y => { return(y.QuerySelector("a") != null ? y.QuerySelector("a").TextContent.Trim() : y.TextContent.Trim()); })) .ToArray(); var tdHref = document.QuerySelectorAll("tbody tr td") .Where(x => x.ClassName == ("subject")) .Select(x => x.QuerySelector("a").GetAttribute("href")) .ToArray(); if (!thContent.Any() || !tdContent.Any()) { return; } var cultureInfo = (CultureInfo)Thread.CurrentThread.CurrentCulture.Clone(); var calendar = cultureInfo.Calendar; calendar.TwoDigitYearMax = DateTime.Now.Year + 30; cultureInfo.DateTimeFormat.Calendar = calendar; Parallel.For(0, tdContent.Length / thContent.Length, n => { var cursor = n * thContent.Length; var id = tdContent[cursor + 0].ToInt(); var title = tdContent[cursor + 2]; var author = tdContent[cursor + 3]; var date = DateTime.ParseExact(tdContent[cursor + 4], "yy/MM/dd HH:mm", cultureInfo); var count = tdContent[cursor + 5].ToInt(); var recommend = tdContent[cursor + 6].ToInt(); var href = UrlCompositeHref(tdHref[n]); _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, RowId = id, Title = title, Author = author, Recommend = recommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Parses a board page whose table mixes header and data cells under tbody:
/// classed cells (excluding "g6") are flattened, the first 7 entries are
/// treated as the header row, and each subsequent 7-cell slice becomes one
/// CrawlingData — title (text after the first newline), author, date, view
/// count, and recommend minus not-recommend. Rows run under Parallel.For.
/// NOTE(review): DateTime.Parse uses the current culture — confirm the site's
/// date format matches the host culture.
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) { var tdContent = document.QuerySelectorAll("tbody tr td") .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName != "g6") .Select(x => { return(x.QuerySelector("a") != null ? x.QuerySelector("a").TextContent.Trim() : x.TextContent.Trim()); }) .ToArray(); var tdHref = document.QuerySelectorAll("tbody tr td a") .Where(x => !string.IsNullOrEmpty(x.ClassName)) .Select(x => x.GetAttribute("href")) .ToArray(); if (!tdContent.Any()) { return; } const int thLength = 7; var thContent = tdContent.Take(thLength); tdContent = tdContent.Skip(thLength).ToArray(); Parallel.For(0, tdContent.Length / thLength, n => { var cursor = n * thLength; var originTitle = tdContent[cursor + 1]; var title = originTitle.Substring("\n"); var author = tdContent[cursor + 2]; var date = DateTime.Parse(tdContent[cursor + 3]); var count = tdContent[cursor + 4].ToInt(); var recommend = tdContent[cursor + 5].ToInt(); var notRecommend = tdContent[cursor + 6].ToInt(); var href = UrlCompositeHref(tdHref[n]); _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Title = title, Author = author, Recommend = recommend - notRecommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Re-serializes an HTML fragment with pretty-printed (indented) markup.
/// </summary>
/// <param name="newContent">HTML to prettify.</param>
/// <returns>The indented HTML string.</returns>
public static string PrettifyHtml(string newContent)
{
    AngleSharp.Html.Parser.HtmlParser parser = new AngleSharp.Html.Parser.HtmlParser();
    AngleSharp.Html.Dom.IHtmlDocument document = parser.ParseDocument(newContent);
    // BUG FIX: dispose the StringWriter instead of leaking it.
    using (StringWriter sw = new StringWriter())
    {
        document.ToHtml(sw, new PrettyMarkupFormatter());
        return sw.ToString();
    }
}
/// <summary>
/// Walks the given child nodes and, for every non-blank plain-text node whose
/// content contains mapped keywords (per WordSplitor.SplitWords), replaces that
/// node with a mixed sequence of text nodes and generated anchor elements
/// (href from the keyword map, marked with data-autolink="bot").
/// </summary>
/// <param name="document">Document used as the node factory for the replacements.</param>
/// <param name="childs">Child nodes to scan; only NodeType.Text entries are processed.</param>
/// <param name="dicKeywordMap">Keyword-to-link map handed to WordSplitor.</param>
// NOTE(review): if childs is a live NodeList, ReplaceWith inserting several
// nodes shifts subsequent indices while i keeps advancing, so later siblings
// could be revisited or skipped — confirm whether a snapshot copy should be
// iterated instead.
private static void ProcessChildNode(AngleSharp.Html.Dom.IHtmlDocument document, INodeList childs, Dictionary <string, string> dicKeywordMap) { for (var i = 0; i < childs.Length; i++) { var childNode = childs[i]; // only process plain text nodes
 if (childNode.NodeType != NodeType.Text) { continue; } // skip nodes with no readable text
 if (string.IsNullOrWhiteSpace(childNode.TextContent)) { continue; } // split the text around mapped keywords
 var splitedResult = WordSplitor.SplitWords(childNode.TextContent, dicKeywordMap); // nothing to link — leave the node untouched
 if (!splitedResult.Where(obj => obj.NodeType == TextNodeType.Link).Any()) { continue; } List <INode> nodeList = new List <INode>(); foreach (var item in splitedResult) { if (item is TextNode) { nodeList.Add(document.CreateTextNode(item.Context)); } else { var linkItem = item as LinkNode; var link = document.CreateElement("a"); link.SetAttribute("href", linkItem.LinkUrl.ToString()); link.SetAttribute("data-autolink", "bot"); link.TextContent = linkItem.Context; nodeList.Add(link); } } childNode.ReplaceWith(nodeList.ToArray()); } }
/// <summary>
/// Reads the raw date text from the second item of the story-info block.
/// Returns an empty string when the last child of that item is not a text node.
/// </summary>
private static string GetDateFromDocument(AngleSharp.Html.Dom.IHtmlDocument content)
{
    const string DateSelector = ".story_inf > li:nth-child(2)";
    var lastNode = content.QuerySelector(DateSelector).LastChild;
    return lastNode?.NodeName == "#text"
        ? lastNode.NodeValue.Trim()
        : string.Empty;
}
/// <summary>
/// Builds the release title from the page's news title, appending a quality
/// suffix in brackets when one can be determined from the tab's parent.
/// </summary>
private static string GetTitle(AngleSharp.Html.Dom.IHtmlDocument content, AngleSharp.Dom.IElement tabNode)
{
    var baseTitle = content.QuerySelector("#news-title").TextContent.Trim();
    var quality = GetQuality(tabNode.ParentElement);
    return string.IsNullOrWhiteSpace(quality)
        ? baseTitle
        : $"{baseTitle} [{quality}]";
}
/// <summary>
/// Dispatches the crawl to the table parser when the page has a table header,
/// otherwise to the list-layout parser.
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var headers = document.QuerySelectorAll("thead tr th")
        .Select(x => x.TextContent.Trim())
        .ToArray();
    if (headers.Length > 0)
    {
        OnPageCrawlTable(document, headers);
    }
    else
    {
        OnPageCrawlList(document);
    }
}
/// <summary>
/// Extracts proxy addresses (ip:port) from a proxy-list page found via a
/// search engine.
/// </summary>
/// <param name="_url">URL of the page to scrape.</param>
/// <returns>Distinct proxy addresses in "ip:port" form (possibly empty).</returns>
public static string[] GetProxys(string _url)
{
    var found = new List<string>();
    var parser = new AngleSharp.Html.Parser.HtmlParser();
    string code = GetHtml(_url);
    if (code == "")
    {
        return found.ToArray();
    }

    AngleSharp.Html.Dom.IHtmlDocument doc = parser.ParseDocument(code);
    foreach (var row in doc.GetElementsByTagName("TR"))
    {
        try
        {
            // Concatenate all cell texts of the table row.
            string cellText = "";
            foreach (var cell in row.GetElementsByTagName("TD"))
            {
                cellText += " " + cell.TextContent + " ";
            }
            // Keep only digits and dots, then collapse runs of spaces.
            string normalized = Regex.Replace(cellText, @"[^0-9\.]", " ");
            while (normalized.Contains("  "))
            {
                normalized = normalized.Replace("  ", " ");
            }
            // Match "ip port" sequences and rewrite them as "ip:port".
            foreach (Match m in Regex.Matches(normalized, @"(?<prx>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} \d{1,5} )+"))
            {
                string candidate = m.Groups["prx"].Value.Replace(" ", ":");
                if (candidate.EndsWith(":"))
                {
                    candidate = candidate.Substring(0, candidate.Length - 1);
                }
                candidate = candidate.Trim();
                if (!found.Contains(candidate))
                {
                    found.Add(candidate);
                }
            }
        }
        catch
        {
            // Best-effort: a malformed row must not abort the whole scrape.
        }
    }
    return found.ToArray();
}
/// <summary>
/// Parses a span/anchor-based board listing: for each row, gathers
/// (className, trimmed text) tuples from its non-category spans plus all anchor
/// hrefs, then emits a CrawlingData per row — the comment count ("cmtnum") is
/// stripped from the title, and category / author / date are split out of the
/// "|"-separated "info" span. Rows are processed with Parallel.ForEach.
/// NOTE(review): DateTime.Parse uses the current culture — confirm the site's
/// date format matches the host culture.
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) { var tdContents = document.QuerySelectorAll("tbody tr") .Select(x => { var stringTuples = x.QuerySelectorAll("span") .Where(x => x.ClassName != "category") .Select(y => new Tuple <string, string>(y.ClassName, y.TextContent.Trim())).ToList(); var hrefs = x.QuerySelectorAll("a") .Select(x => x.GetAttribute("href")) .ToList(); return(new Tuple <List <Tuple <string, string> >, List <string> >(stringTuples, hrefs)); }) .ToArray(); Parallel.ForEach(tdContents, row => { var stringTuples = row.Item1; var hrefs = row.Item2; var cmtnum = stringTuples.FindValue("cmtnum"); var originTitle = stringTuples.FindValue("title"); var infos = stringTuples.FindValue("info").Split("|"); var title = string.IsNullOrEmpty(cmtnum) ? originTitle : originTitle.Substring(cmtnum); var category = infos[0].Substring("\n"); var author = infos[1]; var date = DateTime.Parse(infos[2]); var recommend = string.IsNullOrEmpty(cmtnum) ? 0 : cmtnum.ToIntRegx(); var href = hrefs[0]; _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Title = title, Category = category, Author = author, Recommend = recommend, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Returns the list of proxy-list sites found on a search-engine results page
/// (Yandex etc.), excluding links back to well-known portals.
/// </summary>
/// <param name="_url">Search results URL.</param>
/// <returns>Distinct external http links (empty when the page could not be fetched).</returns>
public static string[] GetYandexHrefs(string _url)
{
    var browser = new AngleSharp.Html.Parser.HtmlParser();
    string code = GetHtml(_url);
    if (code == "")
    {
        return new string[0];
    }

    AngleSharp.Html.Dom.IHtmlDocument doc = browser.ParseDocument(code);
    return (from m in doc.Links select m.GetAttribute("HREF"))
        // BUG FIX: GetAttribute can return null; filter nulls before
        // dereferencing with StartsWith to avoid a NullReferenceException.
        .Where(m => m != null && m.StartsWith("http"))
        .Distinct()
        .Where(m => !m.Contains("yandex")
                 && !m.Contains("google")
                 && !m.Contains("mail")
                 && !m.Contains("rambler")
                 && !m.Contains("youtube"))
        .ToArray();
}
/// <inheritdoc />
public override async Task<IEnumerable<string>> EnumerateVersionsAsync(PackageURL purl, bool useCache = true, bool includePrerelease = true)
{
    Logger.Trace("EnumerateVersions {0}", purl?.ToString());
    if (purl == null || purl.Name is null)
    {
        return Array.Empty<string>();
    }
    try
    {
        string packageName = purl.Name;
        HttpClient httpClient = CreateHttpClient();
        System.Net.Http.HttpResponseMessage? html = await httpClient.GetAsync($"{ENV_HACKAGE_ENDPOINT}/package/{packageName}");
        html.EnsureSuccessStatusCode();
        HtmlParser parser = new();
        AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(await html.Content.ReadAsStringAsync());

        List<string> versionList = new();
        // Versions live in the table row whose header cell starts with "Versions".
        foreach (AngleSharp.Dom.IElement header in document.QuerySelectorAll("th"))
        {
            if (!header.TextContent.StartsWith("Versions"))
            {
                continue;
            }
            AngleSharp.Dom.IElement cell = header.NextElementSibling;
            foreach (AngleSharp.Dom.IElement version in cell.QuerySelectorAll("a,strong"))
            {
                string versionString = version.TextContent.ToLower().Trim();
                Logger.Debug("Identified {0} version {1}.", packageName, versionString);
                versionList.Add(versionString);
            }
            break;
        }
        return SortVersions(versionList.Distinct());
    }
    catch (Exception ex)
    {
        Logger.Debug("Unable to enumerate versions: {0}", ex.Message);
        throw;
    }
}
/// <summary>
/// Builds a release title from the Russian/English/original names, the tab
/// (season) name, an optional episode range parsed from the heading, and the
/// resolution, e.g. "Ru / En - Season 1 E1-12 [1080p]".
/// </summary>
private string composeTitle(AngleSharp.Html.Dom.IHtmlDocument dom, AngleSharp.Dom.IElement t, AngleSharp.Dom.IElement tr)
{
    var name_ru = dom.QuerySelector("div.media__post__header > h1").TextContent.Trim();
    var name_en = dom.QuerySelector("div.media__panel > div:nth-of-type(1) > div.col-l:nth-of-type(1) > div > span").TextContent.Trim();
    var name_orig = dom.QuerySelector("div.media__panel > div:nth-of-type(1) > div.col-l:nth-of-type(2) > div > span").TextContent.Trim();

    // Only append the original name when it differs from the English one.
    var title = name_ru + " / " + name_en;
    if (name_en != name_orig)
    {
        title += " / " + name_orig;
    }

    var tabName = t.TextContent.Replace("Сезон", "Season");
    if (tabName.Contains("Серии"))
    {
        tabName = "";
    }

    var heading = tr.QuerySelector("h3.tracker_info_bold").TextContent;
    // Parse the episode (range) out of the heading, when present.
    var match = EpisodesInfoQueryRegex.Match(heading);
    heading = tabName;
    if (match.Success)
    {
        heading += string.IsNullOrEmpty(match.Groups[2].Value)
            ? " E" + match.Groups[1].Value
            : string.Format(" E{0}-{1}", match.Groups[1].Value, match.Groups[2].Value);
    }

    return title + " - " + heading + " [" + getResolution(tr) + "p]";
}
/// <summary>
/// Emits one CrawlingData per table row: all body cells are flattened into one
/// array and sliced by the header length (category, title, author, date, view
/// count, recommend count); hrefs come from the "title"-classed cells. Rows
/// are processed with Parallel.For.
/// NOTE(review): assumes each row has exactly thContent.Length cells and that
/// tdHref aligns 1:1 with rows; DateTime.Parse uses the current culture —
/// confirm both against the crawled markup.
/// </summary>
private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string[] thContent) { var tdContent = document.QuerySelectorAll("tbody tr td").Select(x => x.TextContent.Trim()).ToArray(); var tdHref = document.QuerySelectorAll("tbody tr td").Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("title")).Select(x => x.QuerySelector("a").GetAttribute("href")).ToArray(); if (!thContent.Any() || !tdContent.Any()) { return; } Parallel.For(0, tdContent.Length / thContent.Length, n => { var cursor = n * thContent.Length; var category = tdContent[cursor + 0]; var title = tdContent[cursor + 1]; var author = tdContent[cursor + 2]; var date = DateTime.Parse(tdContent[cursor + 3]); var count = tdContent[cursor + 4].ToInt(); var recommend = tdContent[cursor + 5].ToInt(); var href = UrlCompositeHref(tdHref[n]); _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title.Substring("\t"), Author = author, Recommend = recommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Parses values out of the input according to the configured ParseType:
///  - LR: substring(s) between left/right delimiters, optionally recursive,
///    either via manual IndexOf walking or an auto-built regex pattern;
///  - CSS: attribute / innerHTML / outerHTML of elements matched by a CSS
///    selector (one element by cssElementIndex, or all when recursive);
///  - JSON: values by field, via JToken path selection (JTokenParsing, array
///    or object root) or a flat key/value walk (parseJSON);
///  - REGEX: each match formatted through the "[n]" group placeholders of
///    regexOutput;
///  - XPATH: not implemented.
/// All placeholders in settings are resolved with ReplaceValues against data.
/// NOTE(review): parse failures are swallowed by empty catch blocks and simply
/// yield fewer results — confirm this best-effort behavior is intended.
/// </summary>
/// <param name="data">Bot state used to resolve value placeholders.</param>
/// <returns>The list of parsed values (possibly empty).</returns>
private List <string> Parse(BotData data) { var original = ReplaceValues(parseTarget, data); var partial = original; var list = new List <string>(); // Parse the value switch (Type) { case ParseType.LR: var ls = ReplaceValues(leftString, data); var rs = ReplaceValues(rightString, data); var pFrom = 0; var pTo = 0; // No L and R = return full input if (ls == "" && rs == "") { list.Add(original); break; } // L or R not present and not empty else if (((!partial.Contains(ls) && ls != "") || (!partial.Contains(rs) && rs != ""))) { break; } // Instead of the mess below, we could simply use Extreme.NET's Substring extensions // return original.Substrings(ls, rs); // Recursive // return original.Substring(ls, rs); // Not recursive if (recursive) { if (useRegexLR) { try { var pattern = BuildLRPattern(ls, rs); MatchCollection mc = Regex.Matches(partial, pattern); foreach (Match m in mc) { list.Add(m.Value); } } catch { } } else { try { while ((partial.Contains(ls) || ls == "") && (partial.Contains(rs) || rs == "")) { // Search for left delimiter and Calculate offset pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length; // Move right of offset partial = partial.Substring(pFrom); // Search for right delimiter and Calculate length to parse pTo = rs == "" ? (partial.Length - 1) : partial.IndexOf(rs); // Parse it var parsed = partial.Substring(0, pTo); list.Add(parsed); // Move right of parsed + right partial = partial.Substring(parsed.Length + rs.Length); } } catch { } } } // Non-recursive else { if (useRegexLR) { var pattern = BuildLRPattern(ls, rs); MatchCollection mc = Regex.Matches(partial, pattern); if (mc.Count > 0) { list.Add(mc[0].Value); } } else { try { pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length; partial = partial.Substring(pFrom); pTo = rs == "" ? 
partial.Length : partial.IndexOf(rs); list.Add(partial.Substring(0, pTo)); } catch { } } } break; case ParseType.CSS: HtmlParser parser = new HtmlParser(); AngleSharp.Html.Dom.IHtmlDocument document = null; try { document = parser.ParseDocument(original); } catch { } try { if (recursive) { foreach (var element in document.QuerySelectorAll(ReplaceValues(cssSelector, data))) { switch (ReplaceValues(attributeName, data)) { case "innerHTML": list.Add(element.InnerHtml); break; case "outerHTML": list.Add(element.OuterHtml); break; default: foreach (var attr in element.Attributes) { if (attr.Name == ReplaceValues(attributeName, data)) { list.Add(attr.Value); break; } } break; } } } else { switch (ReplaceValues(attributeName, data)) { case "innerHTML": list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].InnerHtml); break; case "outerHTML": list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].OuterHtml); break; default: foreach (var attr in document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].Attributes) { if (attr.Name == ReplaceValues(attributeName, data)) { list.Add(attr.Value); break; } } break; } } } catch { } break; case ParseType.JSON: if (JTokenParsing) { if (original.Trim().StartsWith("[")) { JArray json = JArray.Parse(original); var jsonlist = json.SelectTokens(jsonField, false); foreach (var j in jsonlist) { list.Add(j.ToString()); } } else { JObject json = JObject.Parse(original); var jsonlist = json.SelectTokens(jsonField, false); foreach (var j in jsonlist) { list.Add(j.ToString()); } } } else { var jsonlist = new List <KeyValuePair <string, string> >(); parseJSON("", original, jsonlist); foreach (var j in jsonlist) { if (j.Key == ReplaceValues(jsonField, data)) { list.Add(j.Value); } } } break; case ParseType.XPATH: // NOT IMPLEMENTED YET break; case ParseType.REGEX: try { var matches = Regex.Matches(partial, ReplaceValues(regexString, data)); foreach (Match match in 
matches) { var output = ReplaceValues(regexOutput, data); for (var i = 0; i < match.Groups.Count; i++) { output = output.Replace("[" + i + "]", match.Groups[i].Value); } list.Add(output); } } catch { } break; } return(list); }
/// <summary>
/// Parses the list-styled (non-table) board layout: for each "li"-classed list
/// item, gathers a title from its h3 (trailing bracketed suffix removed), span
/// texts keyed by class name, and an optional "hotdeal_info" block that gets
/// appended to the title in brackets, then emits a CrawlingData per item via
/// Parallel.ForEach.
/// NOTE(review): DateTime is set to DateTime.Now — presumably the list view
/// carries no timestamp; confirm against the crawled markup.
/// </summary>
private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document) { var tdContent = document.QuerySelectorAll("ul li div") .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("li")) .Select(x => { var tuples = x.QuerySelectorAll("h3") .Select(y => { var textContent = y.TextContent.Trim(); var lastBracket = textContent.LastIndexOf("["); if (lastBracket != -1) { textContent = textContent.Substring(0, lastBracket); } return(new Tuple <string, string>("title", textContent)); }).ToList(); tuples.AddRange(x.QuerySelectorAll("span") .Select(y => new Tuple <string, string>(y.ClassName, y.TextContent.Trim())) .ToList()); tuples.AddRange(x.QuerySelectorAll("div") .Where(y => !string.IsNullOrEmpty(y.ClassName) && y.ClassName == "hotdeal_info") .Select(y => new Tuple <string, string>("info", y.TextContent.Replace("\t", string.Empty))) .ToList()); var hrefs = x.QuerySelectorAll("a") .Select(x => x.GetAttribute("href")) .ToList(); return(new Tuple <List <Tuple <string, string> >, List <string> >(tuples, hrefs)); }).ToArray(); Parallel.ForEach(tdContent, row => { var stringTuples = row.Item1; var hrefs = row.Item2; var category = stringTuples.FindValue("category").Replace(" /", string.Empty); var title = stringTuples.FindValue("title").TrimEnd(); var info = stringTuples.FindValue("info"); if (!string.IsNullOrEmpty(info)) { title += $" [{info}]"; } var author = stringTuples.FindValue("author").Replace("/ ", string.Empty); var date = DateTime.Now; var recommend = stringTuples.FindValue("count").ToInt(); var href = UrlCompositeHref(hrefs[0]); _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title.Substring("\t"), Author = author, Recommend = recommend, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Parses the lot table ("lotInfo") of an auction page into LotModel entries on
/// the message, then extracts the auction start/end (and optional
/// bargaining-start) dates from date-time patterns found in the page body text.
/// </summary>
/// <param name="document">Parsed auction page.</param>
/// <param name="descriptionMessage">Message that receives the lots and dates.</param>
private void ParseLot(AngleSharp.Html.Dom.IHtmlDocument document, MessageModel descriptionMessage)
{
    var elems = document.GetElementsByClassName("lotInfo");
    if (elems.Length == 0)
    {
        return;
    }
    var table = elems[0];
    var trs = table.GetElementsByTagName("tr");
    if (trs.Length > 0)
    {
        // Column indices resolved from the header row; -1 means the column is absent.
        int number = -1;
        int description = -1;
        int startPrice = -1;
        int step = -1;
        int deposit = -1;
        int PriceReductionInformation = -1;
        int PropertyClassification = -1;
        var header = trs[0];
        var thsHeaders = header.GetElementsByTagName("th");
        var length = thsHeaders.Length;
        for (int i = 0; i < length; i++)
        {
            switch (thsHeaders[i].TextContent)
            {
                case "Номер лота": number = i; break;
                case "Описание": description = i; break;
                case "Начальная цена, руб": startPrice = i; break;
                case "Шаг": step = i; break;
                case "Задаток": deposit = i; break;
                case "Информация о снижении цены": PriceReductionInformation = i; break;
                case "Классификация имущества": PropertyClassification = i; break;
            }
        }
        // Walk the data rows (row 0 is the header).
        length = trs.Length;
        for (int i = 1; i < length; i++)
        {
            var lot = new LotModel();
            var tds = trs[i].GetElementsByTagName("td");
            if (number != -1 && int.TryParse(tds[number].TextContent, out int num))
            {
                lot.Number = num;
            }
            if (description != -1)
            {
                lot.Description = tds[description].TextContent;
            }
            if (startPrice != -1 && double.TryParse(tds[startPrice].TextContent, out double price))
            {
                lot.StartPrice = price;
            }
            if (step != -1)
            {
                lot.Step = tds[step].TextContent;
            }
            if (deposit != -1)
            {
                lot.Deposit = tds[deposit].TextContent;
            }
            if (PriceReductionInformation != -1)
            {
                lot.PriceReductionInformation = tds[PriceReductionInformation].TextContent;
            }
            if (PropertyClassification != -1)
            {
                lot.PropertyClassification = tds[PropertyClassification].TextContent;
            }
            descriptionMessage.Lots.Add(lot);
        }
    }
    // BUG FIX: removed an unused local Regex (@"\d+\.\d+\.\d+ \d+\:\d+") that was
    // constructed but never used; only the whitespace-tolerant variant matters.
    var dateRegex = new Regex(@"\d+\.\d+\.\d+\s+\d+\:\d+");
    var body = document.GetElementsByTagName("body")[0];
    var ms = dateRegex.Matches(body.TextContent);
    if (ms.Count >= 2)
    {
        descriptionMessage.DateStart = DateTime.Parse(ms[0].Value);
        descriptionMessage.DateEnd = DateTime.Parse(ms[1].Value);
    }
    if (ms.Count == 3)
    {
        descriptionMessage.DateStartBargaining = DateTime.Parse(ms[2].Value);
    }
}
/// <summary>
/// Entry point: downloads all search-result pages from incompetech (or the URL
/// given as args[0]) into a "work" folder via DownloadAndFindNext, then walks
/// every cached page, finds each audio player element, and downloads its mp3
/// (the full-quality variant, "mp3low" replaced with "mp3") into the songs
/// folder, appending ".CONFLICT" to duplicate names when overwriting.
/// NOTE(review): in this extract the "Download complete!" string literal
/// appears split across a physical line break — confirm the original literal.
/// </summary>
static void Main(string[] args) { const string AudioPlayerSelector = "div.row > div > div.card > div.card-body > div.audioplayer-wrapper > div.audioplayer"; const string DownloadDirectory = @"songs"; //Get download url string baseUrl = ""; if (args.Length > 0) { baseUrl = args[0]; } else { baseUrl = "https://incompetech.filmmusic.io/de/suche/"; } //If we're overwriting, delete work folder if it exists if (OverwritePages) { if (Directory.Exists(Path.GetFullPath("work"))) { Directory.Delete(Path.GetFullPath("work"), true); } } //Create work directory if it doesn't exist if (!Directory.Exists("work")) { Directory.CreateDirectory("work"); } //Use the default configuration for AngleSharp IConfiguration config = Configuration.Default; //Create a new context for evaluating webpages with the given config IBrowsingContext context = BrowsingContext.New(config); IHtmlParser thisParser = context.GetService <IHtmlParser>(); Parser = thisParser; Console.WriteLine("Downloading pages...\n"); int totalPages = DownloadAndFindNext(baseUrl, 1); Console.WriteLine("Download complete! 
Found " + totalPages + " pages\n"); Console.WriteLine("Downloading songs..."); //If we're overwriting, delete song folder if it exists if (OverwriteSongs) { if (Directory.Exists(Path.GetFullPath(DownloadDirectory))) { Directory.Delete(Path.GetFullPath(DownloadDirectory), true); } } //Create song folder if it doesn't exist if (!Directory.Exists(Path.GetFullPath(DownloadDirectory))) { Directory.CreateDirectory(Path.GetFullPath(DownloadDirectory)); } //Ok, now parse the downloaded pages for (int thisPage = 1; thisPage <= totalPages; thisPage++) { Console.WriteLine("\nPage " + thisPage + " of " + totalPages + "\n"); string path = Path.GetFullPath(@"work\page" + thisPage + ".html"); string source = File.ReadAllText(path); AngleSharp.Html.Dom.IHtmlDocument document = Parser.ParseDocument(source); IHtmlCollection <IElement> pages = document.QuerySelectorAll(AudioPlayerSelector); foreach (IElement pageElement in pages) { AngleSharp.Html.Dom.IHtmlDivElement song = (AngleSharp.Html.Dom.IHtmlDivElement)pageElement; //Get path to save to string savePath = Path.GetFullPath(DownloadDirectory + "\\" + song.GetAttribute("data-title") + ".mp3"); //If we're overwriting songs, the song shouldn't already exist. If it does, there's a problem. if (OverwriteSongs) { while (File.Exists(savePath)) { savePath = savePath + ".CONFLICT"; } } //If we're not overwriting songs, it will only download if it doesn't exist if (!File.Exists(savePath) || OverwriteSongs) { using (WebClient client = new WebClient()) { client.DownloadFile(song.GetAttribute("data-mp3").Replace("mp3low", "mp3"), savePath); } } Console.WriteLine(song.GetAttribute("data-title")); } } Console.WriteLine("\n\nEnd of program, bruh"); Console.ReadKey(); }
/// <summary> /// Parses an attribute's value from one or more elements of an HTML page. /// </summary> /// <param name="input">The HTML page</param> /// <param name="selector">The CSS Selector that targets the desired elements</param> /// <param name="attribute">The attribute for which you want to parse the value</param> /// <param name="index">The index of the element to parse among all the ones selected (if not recursive)</param> /// <param name="recursive">Whether to parse from all the elements that match the selector</param> /// <returns>The attribute value(s).</returns> public static IEnumerable <string> CSS(string input, string selector, string attribute, int index = 0, bool recursive = false) { HtmlParser parser = new HtmlParser(); AngleSharp.Html.Dom.IHtmlDocument document = null; document = parser.ParseDocument(input); var list = new List <string>(); if (recursive) { foreach (var element in document.QuerySelectorAll(selector)) { switch (attribute) { case "innerHTML": list.Add(element.InnerHtml); break; case "outerHTML": list.Add(element.OuterHtml); break; default: foreach (var attr in element.Attributes) { if (attr.Name == attribute) { list.Add(attr.Value); break; } } break; } } } else { switch (attribute) { case "innerHTML": list.Add(document.QuerySelectorAll(selector)[index].InnerHtml); break; case "outerHTML": list.Add(document.QuerySelectorAll(selector)[index].OuterHtml); break; default: foreach (var attr in document.QuerySelectorAll(selector)[index].Attributes) { if (attr.Name == attribute) { list.Add(attr.Value); break; } } break; } } return(list); }
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) { var tdContent = document.QuerySelectorAll("div") .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("list_item") && x.ClassName.Contains("symph_row")) .Select(x => { var stringTuples = x.QuerySelectorAll("span") .Select(y => { var text = y.TextContent.Trim(); if (string.IsNullOrEmpty(text)) { text = y.QuerySelector("img")?.GetAttribute("alt"); } return(new Tuple <string, string>(y.ClassName, text)); }).ToList(); var a = x.QuerySelectorAll("a"); stringTuples.AddRange(a.Where(x => !string.IsNullOrEmpty(x.ClassName)) .Select(y => new Tuple <string, string>(y.ClassName, y.TextContent)) .ToList()); var hrefs = a.Select(x => x.GetAttribute("href")) .ToList(); return(new Tuple <List <Tuple <string, string> >, List <string> >(stringTuples, hrefs)); }) .ToArray(); Parallel.ForEach(tdContent, row => { var stringTuples = row.Item1; var hrefs = row.Item2; var category = stringTuples.FindValue("category_fixed"); if (string.IsNullOrEmpty(category)) { category = stringTuples.FindValue("icon_keyword"); } ; var title = stringTuples.FindValue("subject_fixed"); if (string.IsNullOrEmpty(title)) { title = stringTuples.FindValue("list_subject"); } ; title = title.Substring("\n"); var author = stringTuples.FindValue("nickname"); var count = stringTuples.FindValue("hit").ToIntShorthand(); var date = DateTime.Parse(stringTuples.FindValue("timestamp")); var href = UrlCompositeHref(hrefs[0]); _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title, Author = author, Count = count, DateTime = date, Href = href, SourceId = Source.Id }); }); }
/// <summary>
/// Download one Ubuntu package (binary .deb files and the source files listed in
/// the package's .dsc manifest) and extract it to the target directory.
/// </summary>
/// <param name="purl"> Package URL of the package to download. </param>
/// <param name="doExtract"> Whether to extract the downloaded archives. </param>
/// <param name="cached"> If true, reuse an already-extracted directory when present. </param>
/// <returns> the path or file written. </returns>
public override async Task<IEnumerable<string>> DownloadVersionAsync(PackageURL purl, bool doExtract, bool cached = false)
{
    Logger.Trace("DownloadVersion {0}", purl?.ToString());
    List<string> downloadedPaths = new();
    HashSet<string> downloadedUrls = new();
    HttpClient httpClient = CreateHttpClient();

    if (purl == null || purl.Name == null || purl.Version == null)
    {
        return downloadedPaths;
    }
    string packageVersion = purl.Version;

    IEnumerable<string>? availablePools = await GetPoolsForProject(purl);
    foreach (string? pool in availablePools)
    {
        string? archiveBaseUrl = await GetArchiveBaseUrlForProject(purl, pool);
        if (archiveBaseUrl == null)
        {
            Logger.Debug("Unable to find archive base URL for {0}, pool {1}", purl.ToString(), pool);
            continue;
        }
        try
        {
            string? html = await GetHttpStringCache(httpClient, archiveBaseUrl, neverThrow: true);
            if (html == null)
            {
                Logger.Debug("Error reading {0}", archiveBaseUrl);
                continue;
            }
            AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
            foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
            {
                string? anchorHref = anchor.GetAttribute("href");
                // FIX: anchors without an href were dereferenced unconditionally,
                // risking a NullReferenceException.
                if (anchorHref == null)
                {
                    continue;
                }

                if (anchorHref.Contains(packageVersion) && anchorHref.EndsWith(".deb"))
                {
                    string fullDownloadUrl = archiveBaseUrl + "/" + anchorHref;
                    if (!downloadedUrls.Add(fullDownloadUrl))
                    {
                        // Never re-download the same file twice.
                        continue;
                    }
                    Logger.Debug("Downloading binary: {0}", fullDownloadUrl);
                    System.Net.Http.HttpResponseMessage downloadResult = await httpClient.GetAsync(fullDownloadUrl);
                    if (!downloadResult.IsSuccessStatusCode)
                    {
                        Logger.Debug("Error {0} downloading file {1}", downloadResult.StatusCode, fullDownloadUrl);
                        continue;
                    }
                    // TODO: Add distro version id
                    string targetName = $"ubuntu-{purl.Name}@{packageVersion}-{anchorHref}";
                    string extractionPath = Path.Combine(TopLevelExtractionDirectory, targetName);

                    // A cached extraction short-circuits the whole download.
                    if (doExtract && Directory.Exists(extractionPath) && cached)
                    {
                        downloadedPaths.Add(extractionPath);
                        return downloadedPaths;
                    }
                    if (doExtract)
                    {
                        downloadedPaths.Add(await ArchiveHelper.ExtractArchiveAsync(TopLevelExtractionDirectory, targetName, await downloadResult.Content.ReadAsStreamAsync(), cached));
                    }
                    else
                    {
                        extractionPath += Path.GetExtension(anchorHref) ?? "";
                        await File.WriteAllBytesAsync(extractionPath, await downloadResult.Content.ReadAsByteArrayAsync());
                        downloadedPaths.Add(extractionPath);
                    }
                }
                // Source code URLs don't have the full version on the source files.
                // We need to find them in the .dsc manifest instead.
                else if (anchorHref.Contains(packageVersion) && anchorHref.EndsWith(".dsc"))
                {
                    string? dscContent = await GetHttpStringCache(httpClient, archiveBaseUrl + "/" + anchorHref);
                    if (dscContent == null)
                    {
                        continue;
                    }

                    // Collect the file names from the .dsc's checksum lines (" <hash> <size> <name>").
                    HashSet<string> seenFiles = new();
                    foreach (Match match in Regex.Matches(dscContent, "^ [a-z0-9]+ \\d+ (.*)$", RegexOptions.Multiline | RegexOptions.IgnoreCase).Where(x => x != null))
                    {
                        seenFiles.Add(match.Groups[1].Value.Trim());
                    }

                    // Now we need to go through the anchor tags again looking for the source code files.
                    foreach (AngleSharp.Dom.IElement? secondAnchor in document.QuerySelectorAll("a"))
                    {
                        string? secondHref = secondAnchor.GetAttribute("href");
                        if (secondHref == null)
                        {
                            continue;
                        }
                        // FIX: O(1) set lookup instead of Any() re-testing the same
                        // secondHref conditions for every entry in the set.
                        if (seenFiles.Contains(secondHref) &&
                            !secondHref.EndsWith(".deb") &&
                            !secondHref.EndsWith(".dsc") &&
                            !secondHref.EndsWith(".asc"))
                        {
                            string fullDownloadUrl = archiveBaseUrl + "/" + secondHref;
                            if (!downloadedUrls.Add(fullDownloadUrl))
                            {
                                // Never re-download the same file twice.
                                continue;
                            }
                            Logger.Debug("Downloading source code: {0}", fullDownloadUrl);
                            System.Net.Http.HttpResponseMessage downloadResult = await httpClient.GetAsync(fullDownloadUrl);
                            if (!downloadResult.IsSuccessStatusCode)
                            {
                                Logger.Debug("Error {0} downloading file {1}", downloadResult.StatusCode, fullDownloadUrl);
                                continue;
                            }
                            // TODO: Add distro version id
                            string targetName = $"ubuntu-{purl.Name}@{packageVersion}-{secondHref}";
                            string extractionPath = Path.Combine(TopLevelExtractionDirectory, targetName);
                            if (doExtract)
                            {
                                downloadedPaths.Add(await ArchiveHelper.ExtractArchiveAsync(TopLevelExtractionDirectory, targetName, await downloadResult.Content.ReadAsStreamAsync(), cached));
                            }
                            else
                            {
                                // BUG FIX: the extension must come from the source file
                                // being written (secondHref), not the .dsc anchor.
                                extractionPath += Path.GetExtension(secondHref) ?? "";
                                await File.WriteAllBytesAsync(extractionPath, await downloadResult.Content.ReadAsByteArrayAsync());
                                downloadedPaths.Add(extractionPath);
                            }
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug("Error downloading binary for {0}: {1}", purl.ToString(), ex.Message);
        }
    }
    return downloadedPaths;
}