// Crawls a humor-board page laid out as a table: the header row fixes the
// column count, and body cells are consumed positionally, one group per row.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    // Column headers; their count is the stride used to walk tdContent below.
    var thContent = document.QuerySelectorAll("thead tr th")
        .Select(x => x.TextContent.Trim())
        .ToArray();
    // Flat list of cell texts from data rows only (class "view list_tr_humordata").
    // When a cell wraps its text in an anchor, the anchor text wins.
    var tdContent = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "view list_tr_humordata")
        .Select(x => x.QuerySelectorAll("td"))
        .SelectMany(x => x.Select(y =>
        {
            return (y.QuerySelector("a") != null ? y.QuerySelector("a").TextContent.Trim() : y.TextContent.Trim());
        }))
        .ToArray();
    // One link per row, taken from the "subject" cell.
    // NOTE(review): assumes every subject cell contains an <a>; a missing anchor
    // would throw a NullReferenceException here — confirm against the markup.
    var tdHref = document.QuerySelectorAll("tbody tr td")
        .Where(x => x.ClassName == ("subject"))
        .Select(x => x.QuerySelector("a").GetAttribute("href"))
        .ToArray();
    if (!thContent.Any() || !tdContent.Any())
    {
        return;
    }
    // Clone the current culture so two-digit years ("yy") resolve into the
    // window ending 30 years from now instead of the culture's default pivot.
    var cultureInfo = (CultureInfo)Thread.CurrentThread.CurrentCulture.Clone();
    var calendar = cultureInfo.Calendar;
    calendar.TwoDigitYearMax = DateTime.Now.Year + 30;
    cultureInfo.DateTimeFormat.Calendar = calendar;
    // Walk rows in parallel; cursor addresses the first cell of row n.
    Parallel.For(0, tdContent.Length / thContent.Length, n =>
    {
        var cursor = n * thContent.Length;
        var id = tdContent[cursor + 0].ToInt();
        var title = tdContent[cursor + 2];
        var author = tdContent[cursor + 3];
        var date = DateTime.ParseExact(tdContent[cursor + 4], "yy/MM/dd HH:mm", cultureInfo);
        var count = tdContent[cursor + 5].ToInt();
        var recommend = tdContent[cursor + 6].ToInt();
        var href = UrlCompositeHref(tdHref[n]);
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, RowId = id, Title = title, Author = author, Recommend = recommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
// Crawls a board whose header row is rendered inside <tbody>: the first
// thLength cells are the header and are skipped, and the remaining cells are
// consumed positionally, thLength per row.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    // Cell texts, keeping only classed cells and skipping "g6" filler cells.
    // When a cell wraps its text in an anchor, the anchor text wins.
    var tdContent = document.QuerySelectorAll("tbody tr td")
        .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName != "g6")
        .Select(x =>
        {
            return (x.QuerySelector("a") != null ? x.QuerySelector("a").TextContent.Trim() : x.TextContent.Trim());
        })
        .ToArray();
    // One link per row; only classed anchors are row links.
    var tdHref = document.QuerySelectorAll("tbody tr td a")
        .Where(x => !string.IsNullOrEmpty(x.ClassName))
        .Select(x => x.GetAttribute("href"))
        .ToArray();
    if (!tdContent.Any())
    {
        return;
    }
    // Drop the header cells. (The previous version also materialized them via
    // Take() into a local that was never used — dead code, removed.)
    const int thLength = 7;
    tdContent = tdContent.Skip(thLength).ToArray();
    Parallel.For(0, tdContent.Length / thLength, n =>
    {
        var cursor = n * thLength;
        var originTitle = tdContent[cursor + 1];
        // Substring(string) is a project extension; presumably trims the part
        // delimited by "\n" (comment-count markup) — verify against the extension.
        var title = originTitle.Substring("\n");
        var author = tdContent[cursor + 2];
        var date = DateTime.Parse(tdContent[cursor + 3]);
        var count = tdContent[cursor + 4].ToInt();
        var recommend = tdContent[cursor + 5].ToInt();
        var notRecommend = tdContent[cursor + 6].ToInt();
        var href = UrlCompositeHref(tdHref[n]);
        // Fire-and-forget; net score = recommendations minus down-votes.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Title = title, Author = author, Recommend = recommend - notRecommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
/// <summary>
/// Identifies the base URL for package source files.
/// </summary>
/// <param name="purl">Package URL identifying the project (only the name is used).</param>
/// <param name="pool">Ubuntu pool (distribution) name, e.g. "xenial".</param>
/// <returns>The archive base URL, or null when none could be identified.</returns>
private async Task<string?> GetArchiveBaseUrlForProject(PackageURL purl, string pool)
{
    try
    {
        HttpClient httpClient = CreateHttpClient();
        string? html = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/{pool}/{purl.Name}", neverThrow: true);
        if (html == null)
        {
            return null;
        }
        AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
        foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
        {
            string? href = anchor.GetAttribute("href");
            // Ordinal comparison: ".dsc" is a file extension, not linguistic
            // text (culture-sensitive EndsWith is flagged by CA1310).
            if (href != null && href.EndsWith(".dsc", StringComparison.Ordinal))
            {
                // The base URL is everything before the .dsc file name.
                Match match = Regex.Match(href, "(.+)/[^/]+\\.dsc");
                if (match.Success)
                {
                    return match.Groups[1].Value.Trim();
                }
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Debug(ex, "Error fetching Ubuntu archive base URL for {0}: {1}", purl.ToString(), ex.Message);
    }
    return null;
}
/// <summary>
/// Identifies the available pools for a given Ubuntu project. For example, 'xenial'.
/// </summary>
/// <param name="purl">Package URL to look up (only name is used).</param>
/// <returns>List of pool names (empty on failure, never null).</returns>
private async Task<IEnumerable<string>> GetPoolsForProject(PackageURL purl)
{
    HashSet<string> pools = new();
    try
    {
        HttpClient httpClient = CreateHttpClient();
        string? searchResults = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/search?keywords={purl.Name}&searchon=names&exact=1&suite=all&section=all", neverThrow: true);
        // neverThrow means a failed fetch returns null; bail out early instead
        // of handing null to the parser and relying on the catch below.
        if (searchResults == null)
        {
            return pools;
        }
        AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(searchResults);
        foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a.resultlink"))
        {
            string? href = anchor.GetAttribute("href");
            if (href != null)
            {
                // The pool name is the first path segment, e.g. "/xenial/...".
                Match match = Regex.Match(href, "^/([^/]+)/.+");
                if (match.Success)
                {
                    string pool = match.Groups[1].Value.Trim();
                    Logger.Debug("Identified pool: {0}", pool);
                    pools.Add(pool);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Debug(ex, "Error fetching Ubuntu pools for {0}: {1}", purl.ToString(), ex.Message);
    }
    return pools;
}
/// <summary> /// Download one CPAN package and extract it to the target directory. /// </summary> /// <param name="purl">Package URL of the package to download.</param> /// <returns>n/a</returns> public override async Task <IEnumerable <string> > DownloadVersionAsync(PackageURL purl, bool doExtract, bool cached = false) { Logger.Trace("DownloadVersion {0}", purl?.ToString()); string? packageName = purl?.Name; string? packageVersion = purl?.Version; List <string> downloadedPaths = new(); if (string.IsNullOrWhiteSpace(packageName) || string.IsNullOrWhiteSpace(packageVersion)) { Logger.Debug("Unable to download [{0} {1}]. Both must be defined.", packageName, packageVersion); return(downloadedPaths); } // Locate the URL HttpClient httpClient = CreateHttpClient(); string? packageVersionUrl = null; string? html = await GetHttpStringCache(httpClient, $"{ENV_CPAN_ENDPOINT}/release/{packageName}"); HtmlParser parser = new(); AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(html); foreach (AngleSharp.Dom.IElement option in document.QuerySelectorAll("div.release select.extend option")) { if (!option.HasAttribute("value")) { continue; } string?value = option.GetAttribute("value"); string version = value.Split('-').Last(); if (version.StartsWith("v", StringComparison.InvariantCultureIgnoreCase)) { version = version[1..];
// Downloads search-result pages into .\work, following the pagination "next"
// link until it disappears. Returns the number of the last page fetched.
static int DownloadAndFindNext(string url, int page)
{
    const string NextPageSelector = "div.row > div > div.card > div.card-body > div.paginator-top > ul > li.page-item > a[rel=\"next\"]";
    // Iterative walk over the pagination chain (same traversal as before,
    // without the recursion).
    while (true)
    {
        Console.WriteLine("Downloading page " + page);
        // Fetch the page unless a cached copy exists and we keep cached pages.
        string pagePath = Path.GetFullPath(@"work\page" + page + ".html");
        if (!File.Exists(pagePath) || OverwritePages)
        {
            using (WebClient client = new WebClient())
            {
                client.DownloadFile(url, pagePath);
            }
        }
        string markup = File.ReadAllText(pagePath);
        AngleSharp.Html.Dom.IHtmlDocument document = Parser.ParseDocument(markup);
        IHtmlCollection<IElement> nextLinks = document.QuerySelectorAll(NextPageSelector);
        if (nextLinks.Length == 0)
        {
            // No "next" anchor: this was the final page.
            return page;
        }
        AngleSharp.Html.Dom.IHtmlAnchorElement nextAnchor = (AngleSharp.Html.Dom.IHtmlAnchorElement)nextLinks[0];
        url = nextAnchor.Href;
        page = page + 1;
    }
}
// Reads the pagination table and records the highest page number; the last
// span in the nested table holds it. Null when the pager is absent.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    string lastCell = null;
    foreach (var span in document.QuerySelectorAll("tbody tr td table tbody tr td span"))
    {
        lastCell = span.TextContent.Trim();
    }
    LatestPage = string.IsNullOrEmpty(lastCell) ? (int?)null : lastCell.ToInt();
}
// Dispatches to the table or list crawler: a non-empty <thead> means the
// board renders as a table, otherwise it is a list layout.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    string[] headers = document.QuerySelectorAll("thead tr th")
        .Select(th => th.TextContent.Trim())
        .ToArray();
    if (headers.Length == 0)
    {
        OnPageCrawlList(document);
    }
    else
    {
        OnPageCrawlTable(document, headers);
    }
}
// Crawls a table-style board page: the header cells define the column count,
// and body cells are consumed positionally, one group per row.
private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string[] thContent)
{
    // Flat list of every body cell's text; rows are rebuilt by index math below.
    var tdContent = document.QuerySelectorAll("tbody tr td").Select(x => x.TextContent.Trim()).ToArray();
    // One href per row, from the anchor inside the "title" cell.
    // NOTE(review): assumes every title cell contains an <a>; a missing anchor
    // would throw a NullReferenceException here — confirm against the markup.
    var tdHref = document.QuerySelectorAll("tbody tr td").Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("title")).Select(x => x.QuerySelector("a").GetAttribute("href")).ToArray();
    if (!thContent.Any() || !tdContent.Any())
    {
        return;
    }
    // Each logical row occupies thContent.Length consecutive cells.
    Parallel.For(0, tdContent.Length / thContent.Length, n =>
    {
        var cursor = n * thContent.Length;
        var category = tdContent[cursor + 0];
        var title = tdContent[cursor + 1];
        var author = tdContent[cursor + 2];
        var date = DateTime.Parse(tdContent[cursor + 3]);
        var count = tdContent[cursor + 4].ToInt();
        var recommend = tdContent[cursor + 5].ToInt();
        var href = UrlCompositeHref(tdHref[n]);
        // Fire-and-forget: the returned task is intentionally discarded.
        // Substring("\t") is a project extension — presumably strips the part
        // delimited by "\t"; verify against the extension implementation.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title.Substring("\t"), Author = author, Recommend = recommend, Count = count, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
// Crawls a list-style board where each row's fields are exposed as
// class-named <span> elements rather than positional table cells.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var tdContents = document.QuerySelectorAll("tbody tr")
        .Select(x =>
        {
            // (className, text) pairs for every span except the "category" badge.
            var stringTuples = x.QuerySelectorAll("span")
                .Where(x => x.ClassName != "category")
                .Select(y => new Tuple<string, string>(y.ClassName, y.TextContent.Trim())).ToList();
            // All hrefs in the row; the first one is used as the row link below.
            var hrefs = x.QuerySelectorAll("a")
                .Select(x => x.GetAttribute("href"))
                .ToList();
            return (new Tuple<List<Tuple<string, string>>, List<string>>(stringTuples, hrefs));
        })
        .ToArray();
    Parallel.ForEach(tdContents, row =>
    {
        var stringTuples = row.Item1;
        var hrefs = row.Item2;
        // "cmtnum" is the comment-count bubble appended to the title span.
        var cmtnum = stringTuples.FindValue("cmtnum");
        var originTitle = stringTuples.FindValue("title");
        // "info" packs category | author | date separated by '|'.
        var infos = stringTuples.FindValue("info").Split("|");
        // Substring(string) is a project extension — presumably strips the part
        // delimited by its argument; verify against the extension implementation.
        var title = string.IsNullOrEmpty(cmtnum) ? originTitle : originTitle.Substring(cmtnum);
        var category = infos[0].Substring("\n");
        var author = infos[1];
        var date = DateTime.Parse(infos[2]);
        var recommend = string.IsNullOrEmpty(cmtnum) ? 0 : cmtnum.ToIntRegx();
        var href = hrefs[0];
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Title = title, Category = category, Author = author, Recommend = recommend, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
/// <inheritdoc />
public override async Task<IEnumerable<string>> EnumerateVersionsAsync(PackageURL purl, bool useCache = true, bool includePrerelease = true)
{
    Logger.Trace("EnumerateVersions {0}", purl?.ToString());
    if (purl == null || purl.Name is null)
    {
        return Array.Empty<string>();
    }
    try
    {
        string packageName = purl.Name;
        HttpClient httpClient = CreateHttpClient();
        System.Net.Http.HttpResponseMessage? html = await httpClient.GetAsync($"{ENV_HACKAGE_ENDPOINT}/package/{packageName}");
        html.EnsureSuccessStatusCode();
        HtmlParser parser = new();
        AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(await html.Content.ReadAsStringAsync());
        // The Hackage package page lists versions in a table row whose header
        // cell starts with "Versions"; the sibling cell holds the links.
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> ths = document.QuerySelectorAll("th");
        List<string> versionList = new();
        foreach (AngleSharp.Dom.IElement th in ths)
        {
            // Ordinal: matching a fixed English label, not linguistic text (CA1310).
            if (th.TextContent.StartsWith("Versions", StringComparison.Ordinal))
            {
                AngleSharp.Dom.IElement? td = th.NextElementSibling;
                if (td == null)
                {
                    // Header without a value cell — nothing to enumerate.
                    // (Previously this dereferenced null and threw.)
                    break;
                }
                // The versions cell contains both links and plain <strong> entries.
                foreach (AngleSharp.Dom.IElement version in td.QuerySelectorAll("a,strong"))
                {
                    string versionString = version.TextContent.ToLower().Trim();
                    Logger.Debug("Identified {0} version {1}.", packageName, versionString);
                    versionList.Add(versionString);
                }
                break;
            }
        }
        return SortVersions(versionList.Distinct());
    }
    catch (Exception ex)
    {
        Logger.Debug("Unable to enumerate versions: {0}", ex.Message);
        throw;
    }
}
// Crawls a table-style board ("table_body" rows) where columns are resolved
// by header name via the GetValue extension rather than by fixed positions.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    // Header texts; used both as the row stride and for named column lookup.
    var thContent = document.QuerySelectorAll("thead tr th")
        .Select(x => x.TextContent.Trim())
        .ToList();
    // Flat list of cell texts from data rows.
    var tdContent = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "table_body")
        .Select(x => x.QuerySelectorAll("td"))
        .SelectMany(x => x.Select(y => y.TextContent.Trim()))
        .ToArray();
    // One absolute link per row, from the "subject" cell's anchor.
    var tdHref = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "table_body")
        .Select(x => x.QuerySelectorAll("td"))
        .SelectMany(x => x.Where(y => y.ClassName == "subject" && y.QuerySelector("a") != null)
            .Select(y => y.QuerySelector("a").GetAttribute("href")))
        .Where(x => x.StartsWith("http"))
        .ToArray();
    if (!thContent.Any() || !tdContent.Any())
    {
        return;
    }
    // Clone the current culture so two-digit years resolve into the window
    // ending 30 years from now instead of the culture's default pivot.
    var cultureInfo = (CultureInfo)Thread.CurrentThread.CurrentCulture.Clone();
    var calendar = cultureInfo.Calendar;
    calendar.TwoDigitYearMax = DateTime.Now.Year + 30;
    cultureInfo.DateTimeFormat.Calendar = calendar;
    Parallel.For(0, tdContent.Length / thContent.Count, n =>
    {
        var cursor = n * thContent.Count;
        var id = tdContent.GetValue(thContent, "ID", cursor).ToIntNullable();
        // Some boards label the category column "구분" (category), others "게시판" (board).
        var category = tdContent.GetValue(thContent, "구분", cursor);
        if (string.IsNullOrEmpty(category))
        {
            category = tdContent.GetValue(thContent, "게시판", cursor);
        }
        var title = tdContent.GetValue(thContent, "제목", cursor).Substring("\n");
        var author = tdContent.GetValue(thContent, "글쓴이", cursor);
        var recommend = tdContent.GetValue(thContent, "추천", cursor).ToIntNullable();
        var count = tdContent.GetValue(thContent, "조회", cursor).ToInt();
        var dateTimeStr = tdContent.GetValue(thContent, "날짜", cursor);
        DateTime? date;
        // Dotted dates carry either a four-digit year (dot at index >= 4) or a
        // two-digit year; anything else goes through culture parsing.
        if (dateTimeStr.Contains('.'))
        {
            date = dateTimeStr.IndexOf('.') >= 4 ?
                DateTime.ParseExact(dateTimeStr, "yyyy.MM.dd", cultureInfo) :
                DateTime.ParseExact(dateTimeStr, "yy.MM.dd", cultureInfo);
        }
        else
        {
            date = DateTime.Parse(dateTimeStr);
        }
        var href = tdHref[n];
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title, Author = author, Recommend = recommend.GetValueOrDefault(0), Count = count, DateTime = date.GetValueOrDefault(DateTime.Now), RowId = id, Href = href, SourceId = Source.Id });
    });
}
/// <summary>
/// Gets all tasks from a category page.
/// </summary>
/// <param name="link">Page link relative to the domain (no scheme/host).</param>
/// <returns>The list of tasks, or null when the page could not be processed.</returns>
public List<Objects.Task> GetTasksFromPage(string link)
{
    Log.ProcessMessage("Пытаемся получить список заданий со страницы " + link);
    try
    {
        // NOTE(review): sync-over-async (.Result) blocks the calling thread and
        // can deadlock under a synchronization context — consider an async variant.
        string get = http.GetAsync(Domain + link).Result.Content.ReadAsStringAsync().Result;
        HtmlParser Parser = new HtmlParser();
        AngleSharp.Html.Dom.IHtmlDocument html = Parser.ParseDocument(get);
        // Each task card exposes its detail link under ".title a".
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> taskElements = html.QuerySelectorAll("div.row.set_href .title a");
        var tasks = new List<Objects.Task> { };
        foreach (var elem in taskElements)
        {
            // A task that fails to load individually is skipped, not fatal.
            var task = GetTaskFromLink(elem.GetAttribute("href"));
            if (task == null)
            {
                continue;
            }
            tasks.Add(task);
        }
        Log.GoodMessage("Получили список заданий со страницы " + link);
        return (tasks);
    }
    catch
    {
        // Best-effort: any failure is logged and surfaced as null.
        Log.ExMessage("Не удалось получить список заданий со страницы " + link);
        return (null);
    }
}
// Entry point: crawls the incompetech search-result pages into .\work, then
// downloads every song found on those pages into .\songs.
static void Main(string[] args)
{
    const string AudioPlayerSelector = "div.row > div > div.card > div.card-body > div.audioplayer-wrapper > div.audioplayer";
    const string DownloadDirectory = @"songs";
    // Get download url: first CLI argument, or the default search page.
    string baseUrl = "";
    if (args.Length > 0)
    {
        baseUrl = args[0];
    }
    else
    {
        baseUrl = "https://incompetech.filmmusic.io/de/suche/";
    }
    // If we're overwriting, delete the work folder if it exists.
    if (OverwritePages)
    {
        if (Directory.Exists(Path.GetFullPath("work")))
        {
            Directory.Delete(Path.GetFullPath("work"), true);
        }
    }
    // Create the work directory if it doesn't exist.
    if (!Directory.Exists("work"))
    {
        Directory.CreateDirectory("work");
    }
    // Use the default configuration for AngleSharp.
    IConfiguration config = Configuration.Default;
    // Create a new context for evaluating webpages with the given config.
    IBrowsingContext context = BrowsingContext.New(config);
    IHtmlParser thisParser = context.GetService<IHtmlParser>();
    Parser = thisParser;
    Console.WriteLine("Downloading pages...\n");
    int totalPages = DownloadAndFindNext(baseUrl, 1);
    Console.WriteLine("Download complete! Found " + totalPages + " pages\n");
    Console.WriteLine("Downloading songs...");
    // If we're overwriting, delete the song folder if it exists.
    if (OverwriteSongs)
    {
        if (Directory.Exists(Path.GetFullPath(DownloadDirectory)))
        {
            Directory.Delete(Path.GetFullPath(DownloadDirectory), true);
        }
    }
    // Create the song folder if it doesn't exist.
    if (!Directory.Exists(Path.GetFullPath(DownloadDirectory)))
    {
        Directory.CreateDirectory(Path.GetFullPath(DownloadDirectory));
    }
    // Ok, now parse the downloaded pages.
    for (int thisPage = 1; thisPage <= totalPages; thisPage++)
    {
        Console.WriteLine("\nPage " + thisPage + " of " + totalPages + "\n");
        string path = Path.GetFullPath(@"work\page" + thisPage + ".html");
        string source = File.ReadAllText(path);
        AngleSharp.Html.Dom.IHtmlDocument document = Parser.ParseDocument(source);
        IHtmlCollection<IElement> pages = document.QuerySelectorAll(AudioPlayerSelector);
        foreach (IElement pageElement in pages)
        {
            AngleSharp.Html.Dom.IHtmlDivElement song = (AngleSharp.Html.Dom.IHtmlDivElement)pageElement;
            // Get the path to save to (song title + .mp3).
            string savePath = Path.GetFullPath(DownloadDirectory + "\\" + song.GetAttribute("data-title") + ".mp3");
            // If we're overwriting songs, the song shouldn't already exist.
            // If it does, there's a problem — keep the duplicate under .CONFLICT.
            if (OverwriteSongs)
            {
                while (File.Exists(savePath))
                {
                    savePath = savePath + ".CONFLICT";
                }
            }
            // If we're not overwriting songs, only download when absent.
            if (!File.Exists(savePath) || OverwriteSongs)
            {
                using (WebClient client = new WebClient())
                {
                    // "mp3low" is the low-quality URL; swap to the full mp3.
                    client.DownloadFile(song.GetAttribute("data-mp3").Replace("mp3low", "mp3"), savePath);
                }
            }
            Console.WriteLine(song.GetAttribute("data-title"));
        }
    }
    Console.WriteLine("\n\nEnd of program, bruh");
    Console.ReadKey();
}
/// <summary>
/// Parses an attribute's value from one or more elements of an HTML page.
/// </summary>
/// <param name="input">The HTML page</param>
/// <param name="selector">The CSS Selector that targets the desired elements</param>
/// <param name="attribute">The attribute for which you want to parse the value;
/// "innerHTML" and "outerHTML" select the element's markup instead.</param>
/// <param name="index">The index of the element to parse among all the ones selected (if not recursive)</param>
/// <param name="recursive">Whether to parse from all the elements that match the selector</param>
/// <returns>The attribute value(s).</returns>
public static IEnumerable<string> CSS(string input, string selector, string attribute, int index = 0, bool recursive = false)
{
    HtmlParser parser = new HtmlParser();
    AngleSharp.Html.Dom.IHtmlDocument document = parser.ParseDocument(input);
    var list = new List<string>();
    if (recursive)
    {
        foreach (var element in document.QuerySelectorAll(selector))
        {
            Extract(element);
        }
    }
    else
    {
        Extract(document.QuerySelectorAll(selector)[index]);
    }
    return list;

    // Adds the requested value for a single element. Uses GetAttribute instead
    // of iterating Attributes manually; it returns null when the attribute is
    // absent, matching the previous behavior of adding nothing in that case.
    void Extract(AngleSharp.Dom.IElement element)
    {
        switch (attribute)
        {
            case "innerHTML":
                list.Add(element.InnerHtml);
                break;
            case "outerHTML":
                list.Add(element.OuterHtml);
                break;
            default:
                var value = element.GetAttribute(attribute);
                if (value != null)
                {
                    list.Add(value);
                }
                break;
        }
    }
}
// Crawls a Clien-style list where each row is a div carrying both the
// "list_item" and "symph_row" classes; fields are class-named spans/anchors.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var tdContent = document.QuerySelectorAll("div")
        .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("list_item") && x.ClassName.Contains("symph_row"))
        .Select(x =>
        {
            // (className, text) pairs; a text-less span falls back to the alt
            // text of the image it contains.
            var stringTuples = x.QuerySelectorAll("span")
                .Select(y =>
                {
                    var text = y.TextContent.Trim();
                    if (string.IsNullOrEmpty(text))
                    {
                        text = y.QuerySelector("img")?.GetAttribute("alt");
                    }
                    return (new Tuple<string, string>(y.ClassName, text));
                }).ToList();
            // Classed anchors contribute extra fields; all anchors contribute hrefs.
            var a = x.QuerySelectorAll("a");
            stringTuples.AddRange(a.Where(x => !string.IsNullOrEmpty(x.ClassName))
                .Select(y => new Tuple<string, string>(y.ClassName, y.TextContent))
                .ToList());
            var hrefs = a.Select(x => x.GetAttribute("href"))
                .ToList();
            return (new Tuple<List<Tuple<string, string>>, List<string>>(stringTuples, hrefs));
        })
        .ToArray();
    Parallel.ForEach(tdContent, row =>
    {
        var stringTuples = row.Item1;
        var hrefs = row.Item2;
        // Category and title each have two possible class names depending on
        // the board skin; try the primary one first.
        var category = stringTuples.FindValue("category_fixed");
        if (string.IsNullOrEmpty(category))
        {
            category = stringTuples.FindValue("icon_keyword");
        }
        ;
        var title = stringTuples.FindValue("subject_fixed");
        if (string.IsNullOrEmpty(title))
        {
            title = stringTuples.FindValue("list_subject");
        }
        ;
        // Substring(string) is a project extension — presumably strips the part
        // delimited by "\n"; verify against the extension implementation.
        title = title.Substring("\n");
        var author = stringTuples.FindValue("nickname");
        // ToIntShorthand presumably expands abbreviated counts (e.g. "1.2k") —
        // verify against the extension implementation.
        var count = stringTuples.FindValue("hit").ToIntShorthand();
        var date = DateTime.Parse(stringTuples.FindValue("timestamp"));
        var href = UrlCompositeHref(hrefs[0]);
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title, Author = author, Count = count, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
/// <summary>
/// Download one Ubuntu package (its .deb binaries and, via the .dsc file
/// index, its source files) and optionally extract it to the target directory.
/// </summary>
/// <param name="purl"> Package URL of the package to download. </param>
/// <returns> the path or file written. </returns>
public override async Task<IEnumerable<string>> DownloadVersionAsync(PackageURL purl, bool doExtract, bool cached = false)
{
    Logger.Trace("DownloadVersion {0}", purl?.ToString());
    List<string> downloadedPaths = new();
    HashSet<string> downloadedUrls = new();
    HttpClient httpClient = CreateHttpClient();
    if (purl == null || purl.Name == null || purl.Version == null)
    {
        return downloadedPaths;
    }
    string packageVersion = purl.Version;
    // Try every pool (distribution) the package appears in.
    IEnumerable<string>? availablePools = await GetPoolsForProject(purl);
    foreach (string? pool in availablePools)
    {
        string? archiveBaseUrl = await GetArchiveBaseUrlForProject(purl, pool);
        if (archiveBaseUrl == null)
        {
            Logger.Debug("Unable to find archive base URL for {0}, pool {1}", purl.ToString(), pool);
            continue;
        }
        try
        {
            string? html = await GetHttpStringCache(httpClient, archiveBaseUrl, neverThrow: true);
            if (html == null)
            {
                Logger.Debug("Error reading {0}", archiveBaseUrl);
                continue;
            }
            AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
            foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
            {
                // NOTE(review): GetAttribute can return null for anchors without
                // an href; anchorHref.Contains would then throw (caught by the
                // outer catch) — confirm against real directory listings.
                string? anchorHref = anchor.GetAttribute("href");
                if (anchorHref.Contains(packageVersion) && anchorHref.EndsWith(".deb"))
                {
                    string? fullDownloadUrl = archiveBaseUrl + "/" + anchorHref;
                    if (!downloadedUrls.Add(fullDownloadUrl))
                    {
                        // Never re-download the same file twice.
                        continue;
                    }
                    Logger.Debug("Downloading binary: {0}", fullDownloadUrl);
                    System.Net.Http.HttpResponseMessage downloadResult = await httpClient.GetAsync(fullDownloadUrl);
                    if (!downloadResult.IsSuccessStatusCode)
                    {
                        Logger.Debug("Error {0} downloading file {1}", downloadResult.StatusCode, fullDownloadUrl);
                        continue;
                    }
                    // TODO: Add distro version id
                    string targetName = $"ubuntu-{purl.Name}@{packageVersion}-{anchorHref}";
                    string extractionPath = Path.Combine(TopLevelExtractionDirectory, targetName);
                    // A cached, already-extracted copy short-circuits everything.
                    if (doExtract && Directory.Exists(extractionPath) && cached == true)
                    {
                        downloadedPaths.Add(extractionPath);
                        return downloadedPaths;
                    }
                    if (doExtract)
                    {
                        downloadedPaths.Add(await ArchiveHelper.ExtractArchiveAsync(TopLevelExtractionDirectory, targetName, await downloadResult.Content.ReadAsStreamAsync(), cached));
                    }
                    else
                    {
                        extractionPath += Path.GetExtension(anchorHref) ?? "";
                        await File.WriteAllBytesAsync(extractionPath, await downloadResult.Content.ReadAsByteArrayAsync());
                        downloadedPaths.Add(extractionPath);
                    }
                }
                // Source Code URLs don't have the full version on the source
                // files. We need to find them in the .dsc.
                else if (anchorHref.Contains(packageVersion) && anchorHref.EndsWith(".dsc"))
                {
                    string? dscContent = await GetHttpStringCache(httpClient, archiveBaseUrl + "/" + anchorHref);
                    if (dscContent == null)
                    {
                        continue;
                    }
                    // .dsc file-list lines look like " <hash> <size> <filename>".
                    HashSet<string> seenFiles = new();
                    foreach (Match match in Regex.Matches(dscContent, "^ [a-z0-9]+ \\d+ (.*)$", RegexOptions.Multiline | RegexOptions.IgnoreCase).Where(x => x != null))
                    {
                        seenFiles.Add(match.Groups[1].Value.Trim());
                    }
                    // Now we need to go through the anchor tags again looking for the source code files
                    foreach (AngleSharp.Dom.IElement? secondAnchor in document.QuerySelectorAll("a"))
                    {
                        string? secondHref = secondAnchor.GetAttribute("href");
                        if (seenFiles.Any(f => f.Equals(secondHref) && !secondHref.EndsWith(".deb") && !secondHref.EndsWith(".dsc") && !secondHref.EndsWith(".asc")))
                        {
                            string fullDownloadUrl = archiveBaseUrl + "/" + secondHref;
                            if (!downloadedUrls.Add(fullDownloadUrl))
                            {
                                // Never re-download the same file twice.
                                continue;
                            }
                            Logger.Debug("Downloading source code: {0}", fullDownloadUrl);
                            System.Net.Http.HttpResponseMessage downloadResult = await httpClient.GetAsync(fullDownloadUrl);
                            if (!downloadResult.IsSuccessStatusCode)
                            {
                                Logger.Debug("Error {0} downloading file {1}", downloadResult.StatusCode, fullDownloadUrl);
                                continue;
                            }
                            // TODO: Add distro version id
                            string targetName = $"ubuntu-{purl.Name}@{packageVersion}-{secondHref}";
                            string extractionPath = Path.Combine(TopLevelExtractionDirectory, targetName);
                            if (doExtract)
                            {
                                downloadedPaths.Add(await ArchiveHelper.ExtractArchiveAsync(TopLevelExtractionDirectory, targetName, await downloadResult.Content.ReadAsStreamAsync(), cached));
                            }
                            else
                            {
                                // NOTE(review): this uses anchorHref (the .dsc
                                // link), not secondHref, for the extension —
                                // looks like a copy-paste slip; confirm intent.
                                extractionPath += Path.GetExtension(anchorHref) ?? "";
                                await File.WriteAllBytesAsync(extractionPath, await downloadResult.Content.ReadAsByteArrayAsync());
                                downloadedPaths.Add(extractionPath);
                            }
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug("Error downloading binary for {0}: {1}", purl.ToString(), ex.Message);
        }
    }
    return downloadedPaths;
}
// Crawls a board whose rows expose fields as class-named <td> cells; "notice"
// rows are excluded. Count/recommend cells are unnamed and read positionally.
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var tdContents = document.QuerySelectorAll("tbody tr")
        .Where(x => string.IsNullOrEmpty(x.ClassName) || x.ClassName != "notice")
        .Select(x =>
        {
            var stringTuples = x.QuerySelectorAll("td")
                .Select(y =>
                {
                    var text = y.TextContent.Trim();
                    if (string.IsNullOrEmpty(text))
                    {
                        // LastOrDefault because the first <img> in the author
                        // cell is the level icon; the last carries the wanted title.
                        text = y.QuerySelectorAll("img")
                            .Where(x => x.GetAttribute("src") != null)
                            .Select(x => x.GetAttribute("title")).LastOrDefault();
                    }
                    return (new Tuple<string, string>(y.ClassName, text));
                }).ToList();
            var hrefs = x.QuerySelectorAll("a")
                .Select(x => x.GetAttribute("href"))
                .ToList();
            return (new Tuple<List<Tuple<string, string>>, List<string>>(stringTuples, hrefs));
        })
        .ToArray();
    Parallel.ForEach(tdContents, row =>
    {
        var stringTuples = row.Item1;
        var hrefs = row.Item2;
        var title = stringTuples.FindValue("title");
        // Substring(string) is a project extension — presumably strips the part
        // delimited by "\n"; verify against the extension implementation.
        title = title.Substring("\n");
        var category = stringTuples.FindValue("cate");
        var author = stringTuples.FindValue("author");
        var date = DateTime.Parse(stringTuples.FindValue("time"));
        // Positional access: cells 7 and 8 hold the view and recommend counts.
        // NOTE(review): assumes a fixed column layout — confirm against markup.
        var count = stringTuples[7].Item2.ToInt();
        var recommend = stringTuples[8].Item2.ToInt();
        var href = hrefs[0];
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Title = title, Category = category, Author = author, Count = count, Recommend = recommend, DateTime = date, Href = UrlCompositeHref(href), SourceId = Source.Id });
    });
}
/// <summary>
/// Builds package metadata by concatenating every .dsc file found under the
/// package's base URLs, falling back to the packages.ubuntu.com search pages.
/// </summary>
/// <param name="purl">Package URL to look up (only the name is used).</param>
/// <param name="useCache">Whether cached HTTP responses may be used.</param>
/// <returns>The concatenated metadata text, or null when purl is unusable.</returns>
public override async Task<string?> GetMetadataAsync(PackageURL purl, bool useCache = true)
{
    Logger.Trace("GetMetadata {0}", purl?.ToString());
    if (purl == null || purl.Name == null)
    {
        return null;
    }
    StringBuilder metadataContent = new();
    HttpClient httpClient = CreateHttpClient();
    foreach (string distroUrlPrefix in GetBaseURLs(purl))
    {
        try
        {
            string? html = await GetHttpStringCache(httpClient, distroUrlPrefix, useCache: useCache, neverThrow: true);
            if (html != null)
            {
                AngleSharp.Html.Dom.IHtmlDocument? document = await new HtmlParser().ParseDocumentAsync(html);
                foreach (AngleSharp.Dom.IElement? anchor in document.QuerySelectorAll("a"))
                {
                    string? anchorHref = anchor.GetAttribute("href");
                    // Null-check first: anchors without href are legal HTML and
                    // previously caused a NullReferenceException here. ".dsc" is
                    // a file extension, so compare ordinally (CA1310).
                    if (anchorHref != null && anchorHref.EndsWith(".dsc", StringComparison.Ordinal))
                    {
                        Logger.Debug("Found a .dsc file: {0}", anchorHref);
                        string? dscContent = await GetHttpStringCache(httpClient, distroUrlPrefix + anchorHref, neverThrow: true);
                        if (dscContent == null)
                        {
                            continue;
                        }
                        metadataContent.AppendLine(dscContent);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug("Error obtaining .dsc file for {0}: {1}", purl.ToString(), ex.Message);
        }
        // Fallback to packages.ubuntu.com if we haven't seen any .dsc files.
        // NOTE: this runs inside the prefix loop, so the fallback fires after
        // the first prefix that yielded nothing — preserved for compatibility.
        if (metadataContent.Length == 0)
        {
            try
            {
                string? searchResults = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/search?keywords={purl.Name}&searchon=names&exact=1&suite=all&section=all", useCache);
                HtmlParser parser = new();
                AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(searchResults);
                AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> anchorItems = document.QuerySelectorAll("a.resultlink");
                IEnumerable<string> metadataUrlList = anchorItems.Select(s => s.GetAttribute("href") ?? "");
                foreach (string metadataUrl in metadataUrlList)
                {
                    metadataContent.AppendLine(await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/{metadataUrl}"));
                }
            }
            catch (Exception ex)
            {
                Logger.Debug(ex, "Error fetching Ubuntu metadata: {0}", ex.Message);
            }
        }
    }
    return metadataContent.ToString();
}
/// <summary>
/// Gets the full contact list for the authenticated account.
/// </summary>
/// <returns>Objects.Contact[], or null when the list could not be retrieved.</returns>
public Objects.Contact[] GetContacts()
{
    Log.ProcessMessage("Получаем список контактов");
    if (Login == "" || Password == "")
    {
        // Logged but not fatal: the request below is still attempted.
        Log.ExMessage("Не назначен логин/пароль");
    }
    try
    {
        // NOTE(review): sync-over-async (.Result) blocks the caller and can
        // deadlock under a synchronization context — consider an async variant.
        string get = http.GetAsync(Domain + "/account/contacts/").Result.Content.ReadAsStringAsync().Result;
        // Re-authenticate once when the session has expired ("Вы не авторизованы"
        // = "You are not authorized"), then retry the page.
        if (get.Contains("Вы не авторизованы"))
        {
            if (!Auth())
            {
                return (null);
            }
            else
            {
                get = http.GetAsync(Domain + "/account/contacts/").Result.Content.ReadAsStringAsync().Result;
            }
        }
        HtmlParser Parser = new HtmlParser();
        AngleSharp.Html.Dom.IHtmlDocument html = Parser.ParseDocument(get);
        List<Objects.Contact> contacts = new List<Objects.Contact> { };
        // Three parallel collections, one element per contact row; assumed to
        // stay index-aligned — TODO confirm the page markup guarantees this.
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> links = html.QuerySelectorAll(".page_content .row .buttons a.pm_link");
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> names = html.QuerySelectorAll(".page_content .row .name a");
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> nicks = html.QuerySelectorAll(".page_content .row .nickname");
        for (int i = 0; i < links.Length; i++)
        {
            // IsNew is set when the PM button carries the "btn-success" class.
            if (links[i].GetAttribute("class").Contains("btn-success"))
            {
                contacts.Add(new Objects.Contact { Link = links[i].GetAttribute("href"), Name = names[i].TextContent, Nick = nicks[i].TextContent, IsNew = true });
            }
            else
            {
                contacts.Add(new Objects.Contact { Link = links[i].GetAttribute("href"), Name = names[i].TextContent, Nick = nicks[i].TextContent, IsNew = false });
            }
        }
        Log.GoodMessage("Получили список контактов");
        return (contacts.ToArray());
    }
    catch
    {
        // Best-effort: any failure is logged and surfaced as null.
        Log.ExMessage("Не удалось получить список контактов");
        return (null);
    }
}
// Crawls the hot-deal list layout: each item is a div whose class contains
// "li"; the title comes from <h3>, other fields from class-named spans.
private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document)
{
    var tdContent = document.QuerySelectorAll("ul li div")
        .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("li"))
        .Select(x =>
        {
            // Title: <h3> text with the trailing "[...]" suffix removed.
            var tuples = x.QuerySelectorAll("h3")
                .Select(y =>
                {
                    var textContent = y.TextContent.Trim();
                    var lastBracket = textContent.LastIndexOf("[");
                    if (lastBracket != -1)
                    {
                        textContent = textContent.Substring(0, lastBracket);
                    }
                    return (new Tuple<string, string>("title", textContent));
                }).ToList();
            tuples.AddRange(x.QuerySelectorAll("span")
                .Select(y => new Tuple<string, string>(y.ClassName, y.TextContent.Trim()))
                .ToList());
            // "hotdeal_info" divs carry extra deal text (presumably price /
            // shipping details), with tab characters stripped.
            tuples.AddRange(x.QuerySelectorAll("div")
                .Where(y => !string.IsNullOrEmpty(y.ClassName) && y.ClassName == "hotdeal_info")
                .Select(y => new Tuple<string, string>("info", y.TextContent.Replace("\t", string.Empty)))
                .ToList());
            var hrefs = x.QuerySelectorAll("a")
                .Select(x => x.GetAttribute("href"))
                .ToList();
            return (new Tuple<List<Tuple<string, string>>, List<string>>(tuples, hrefs));
        }).ToArray();
    Parallel.ForEach(tdContent, row =>
    {
        var stringTuples = row.Item1;
        var hrefs = row.Item2;
        var category = stringTuples.FindValue("category").Replace(" /", string.Empty);
        var title = stringTuples.FindValue("title").TrimEnd();
        // Append the deal info to the title when present.
        var info = stringTuples.FindValue("info");
        if (!string.IsNullOrEmpty(info))
        {
            title += $" [{info}]";
        }
        var author = stringTuples.FindValue("author").Replace("/ ", string.Empty);
        // The list view exposes no post date; the crawl time is used instead.
        var date = DateTime.Now;
        var recommend = stringTuples.FindValue("count").ToInt();
        var href = UrlCompositeHref(hrefs[0]);
        // Fire-and-forget: the returned task is intentionally discarded.
        // Substring("\t") is a project extension — presumably strips the part
        // delimited by "\t"; verify against the extension implementation.
        _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, BoardName = Source.Name, Category = category, Title = title.Substring("\t"), Author = author, Recommend = recommend, DateTime = date, Href = href, SourceId = Source.Id });
    });
}
/// <inheritdoc />
public override async Task<IEnumerable<string>> EnumerateVersionsAsync(PackageURL purl, bool useCache = true, bool includePrerelease = true)
{
    Logger.Trace("EnumerateVersions {0}", purl?.ToString());
    if (purl == null || purl.Name is null)
    {
        return new List<string>();
    }
    try
    {
        string packageName = purl.Name;
        List<string> versionList = new();
        HttpClient httpClient = CreateHttpClient();

        // Latest version: scrape the package index page; it's the cell after "Version:".
        System.Net.Http.HttpResponseMessage html = await httpClient.GetAsync($"{ENV_CRAN_ENDPOINT}/web/packages/{packageName}/index.html");
        html.EnsureSuccessStatusCode();
        HtmlParser parser = new();
        AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(await html.Content.ReadAsStringAsync());
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> tds = document.QuerySelectorAll("td");
        for (int i = 0; i < tds.Length; i++)
        {
            if (tds[i].TextContent == "Version:")
            {
                string? value = tds[i + 1]?.TextContent?.Trim();
                if (value != null)
                {
                    versionList.Add(value);
                }
                break;
            }
        }

        // Remaining versions: the archive directory lists <packageName>_<version>.tar.gz links.
        html = await httpClient.GetAsync($"{ENV_CRAN_ENDPOINT}/src/contrib/Archive/{packageName}/");
        html.EnsureSuccessStatusCode();
        document = await parser.ParseDocumentAsync(await html.Content.ReadAsStringAsync());
        tds = document.QuerySelectorAll("a");
        foreach (AngleSharp.Dom.IElement td in tds)
        {
            string? href = td.GetAttribute("href");
            // FIX: use ordinal EndsWith instead of Contains so non-tarball entries
            // (e.g. "*.tar.gz.md5") are skipped, and strip only the trailing extension
            // and leading "<name>_" prefix — Replace removed every occurrence anywhere
            // in the string, corrupting names that embed those substrings.
            if (href != null && href.EndsWith(".tar.gz", StringComparison.Ordinal))
            {
                string version = href.Substring(0, href.Length - ".tar.gz".Length);
                string prefix = packageName + "_";
                if (version.StartsWith(prefix, StringComparison.Ordinal))
                {
                    version = version.Substring(prefix.Length);
                }
                version = version.Trim();
                Logger.Debug("Identified {0} version {1}.", packageName, version);
                versionList.Add(version);
            }
        }
        return SortVersions(versionList.Distinct());
    }
    catch (Exception ex)
    {
        // Log at debug and rethrow so callers decide how to surface the failure.
        Logger.Debug("Unable to enumerate versions: {0}", ex.Message);
        throw;
    }
}
/// <summary>
/// Parses a classic table-style board page: the "title_bg" header row defines the
/// column count, "list0"/"list1" data rows are flattened into one array and read
/// positionally in strides of the header length.
/// </summary>
protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
{
    // Header cells — their count is the stride used when walking the flattened data cells.
    // FIX: inner lambda parameters renamed from "x" — they shadowed the enclosing
    // Select's "x" parameter, which the compiler rejects (CS0136).
    var thContent = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "title_bg")
        .Select(x => x.QuerySelectorAll("td").Where(td => td.ClassName == "list_tspace"))
        .SelectMany(x => x.Select(y => y.TextContent.Trim()))
        .ToArray();
    // Data cells: anchor text wins when present; an empty cell falls back to its image's alt text.
    var tdContent = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "list0" || x.ClassName == "list1")
        .Select(x => x.QuerySelectorAll("td").Where(td => !string.IsNullOrEmpty(td.ClassName) && td.ClassName.Contains("list_vspace")))
        .SelectMany(x => x.Select(y =>
        {
            var text = y.TextContent.Trim();
            if (string.IsNullOrEmpty(text))
            {
                text = y.QuerySelector("img")?.GetAttribute("alt");
            }
            else if (y.QuerySelector("a") != null)
            {
                text = y.QuerySelector("a").TextContent;
            }
            return text;
        })).ToArray();
    // One href per data row; "#" anchors are placeholders and are dropped.
    var tdHref = document.QuerySelectorAll("tbody tr")
        .Where(x => x.ClassName == "list0" || x.ClassName == "list1")
        .Select(x => x.QuerySelectorAll("td a"))
        .SelectMany(x => x.Select(y => y.GetAttribute("href")))
        .Where(x => x != "#")
        .ToArray();
    if (!thContent.Any() || !tdContent.Any())
    {
        return;
    }
    Parallel.For(0, tdContent.Length / thContent.Length, n =>
    {
        var cursor = n * thContent.Length;
        var id = tdContent[cursor + 0].ToInt();
        var author = tdContent[cursor + 1];
        var title = tdContent[cursor + 2];
        // NOTE(review): DateTime.Parse uses the current culture — confirm the board's
        // date format matches the host culture.
        var date = DateTime.Parse(tdContent[cursor + 3]);
        // Cell 4 may look like "a - b"; only the first number is taken as the recommend count.
        var str = tdContent[cursor + 4];
        var recommend = string.IsNullOrEmpty(str) ? 0 : str.Split(" - ")[0].ToInt();
        var count = tdContent[cursor + 5].ToInt();
        var href = UrlCompositeHref("/" + tdHref[n]);
        _ = OnCrawlData(new CrawlingData
        {
            Type = Source.Type,
            BoardId = Source.BoardId,
            BoardName = Source.Name,
            RowId = id,
            Title = title,
            Author = author,
            Recommend = recommend,
            Count = count,
            DateTime = date,
            Href = href,
            SourceId = Source.Id
        });
    });
}
/// <summary>
/// Extracts zero or more strings from the parse target according to the block's
/// configured <c>Type</c>: LR (left/right delimiters, optionally regex-built and/or
/// recursive), CSS (AngleSharp selector + attribute), JSON (JToken path or flat
/// key/value walk), or REGEX (pattern + "[n]" group substitution template).
/// XPATH is declared but not implemented. Failures in any mode are deliberately
/// swallowed (best-effort parsing) and simply yield fewer/no results.
/// </summary>
/// <param name="data">Bot context used to substitute &lt;VARIABLE&gt; placeholders via ReplaceValues.</param>
/// <returns>The list of parsed strings; empty when nothing matched or parsing failed.</returns>
private List <string> Parse(BotData data)
{
    var original = ReplaceValues(parseTarget, data);
    var partial = original; // working copy consumed left-to-right in LR mode
    var list = new List <string>();
    // Parse the value
    switch (Type)
    {
        case ParseType.LR:
            var ls = ReplaceValues(leftString, data);
            var rs = ReplaceValues(rightString, data);
            var pFrom = 0;
            var pTo = 0;
            // No L and R = return full input
            if (ls == "" && rs == "") { list.Add(original); break; } // break exits the switch, not a loop
            // L or R not present and not empty
            else if (((!partial.Contains(ls) && ls != "") || (!partial.Contains(rs) && rs != ""))) { break; }
            // Instead of the mess below, we could simply use Extreme.NET's Substring extensions
            // return original.Substrings(ls, rs); // Recursive
            // return original.Substring(ls, rs); // Not recursive
            if (recursive)
            {
                if (useRegexLR)
                {
                    // Delimiters are compiled into a regex; every match is collected.
                    try
                    {
                        var pattern = BuildLRPattern(ls, rs);
                        MatchCollection mc = Regex.Matches(partial, pattern);
                        foreach (Match m in mc) { list.Add(m.Value); }
                    }
                    catch { } // best-effort: a bad pattern just yields no results
                }
                else
                {
                    // Manual scan: repeatedly cut out the text between ls and rs,
                    // advancing past each parsed segment.
                    try
                    {
                        while ((partial.Contains(ls) || ls == "") && (partial.Contains(rs) || rs == ""))
                        {
                            // Search for left delimiter and Calculate offset
                            pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length;
                            // Move right of offset
                            partial = partial.Substring(pFrom);
                            // Search for right delimiter and Calculate length to parse
                            // NOTE(review): empty rs uses Length - 1 here but Length in the
                            // non-recursive branch below — the recursive path drops the last
                            // character; confirm whether that asymmetry is intended.
                            pTo = rs == "" ? (partial.Length - 1) : partial.IndexOf(rs);
                            // Parse it
                            var parsed = partial.Substring(0, pTo);
                            list.Add(parsed);
                            // Move right of parsed + right
                            partial = partial.Substring(parsed.Length + rs.Length);
                        }
                    }
                    catch { } // IndexOf/Substring overruns terminate the scan silently
                }
            }
            // Non-recursive
            else
            {
                if (useRegexLR)
                {
                    var pattern = BuildLRPattern(ls, rs);
                    MatchCollection mc = Regex.Matches(partial, pattern);
                    if (mc.Count > 0) { list.Add(mc[0].Value); } // first match only
                }
                else
                {
                    try
                    {
                        pFrom = ls == "" ? 0 : partial.IndexOf(ls) + ls.Length;
                        partial = partial.Substring(pFrom);
                        pTo = rs == "" ? partial.Length : partial.IndexOf(rs);
                        list.Add(partial.Substring(0, pTo));
                    }
                    catch { }
                }
            }
            break;

        case ParseType.CSS:
            HtmlParser parser = new HtmlParser();
            AngleSharp.Html.Dom.IHtmlDocument document = null;
            // Parse failures leave document null; the null-deref below is then caught
            // by the outer try and treated as "no results".
            try { document = parser.ParseDocument(original); } catch { }
            try
            {
                if (recursive)
                {
                    // All selector matches contribute a value.
                    foreach (var element in document.QuerySelectorAll(ReplaceValues(cssSelector, data)))
                    {
                        // "innerHTML"/"outerHTML" are pseudo-attributes; anything else is
                        // looked up among the element's real attributes.
                        switch (ReplaceValues(attributeName, data))
                        {
                            case "innerHTML": list.Add(element.InnerHtml); break;
                            case "outerHTML": list.Add(element.OuterHtml); break;
                            default:
                                foreach (var attr in element.Attributes)
                                {
                                    if (attr.Name == ReplaceValues(attributeName, data)) { list.Add(attr.Value); break; }
                                }
                                break;
                        }
                    }
                }
                else
                {
                    // Single element selected by cssElementIndex.
                    switch (ReplaceValues(attributeName, data))
                    {
                        case "innerHTML": list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].InnerHtml); break;
                        case "outerHTML": list.Add(document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].OuterHtml); break;
                        default:
                            foreach (var attr in document.QuerySelectorAll(ReplaceValues(cssSelector, data))[cssElementIndex].Attributes)
                            {
                                if (attr.Name == ReplaceValues(attributeName, data)) { list.Add(attr.Value); break; }
                            }
                            break;
                    }
                }
            }
            catch { } // selector errors / index out of range → no results
            break;

        case ParseType.JSON:
            if (JTokenParsing)
            {
                // JSON.NET token-path mode: root array vs. object chosen by first char.
                if (original.Trim().StartsWith("["))
                {
                    JArray json = JArray.Parse(original);
                    var jsonlist = json.SelectTokens(jsonField, false);
                    foreach (var j in jsonlist) { list.Add(j.ToString()); }
                }
                else
                {
                    JObject json = JObject.Parse(original);
                    var jsonlist = json.SelectTokens(jsonField, false);
                    foreach (var j in jsonlist) { list.Add(j.ToString()); }
                }
            }
            else
            {
                // Legacy flat walk: parseJSON flattens the document into key/value
                // pairs, then matching keys are collected.
                var jsonlist = new List <KeyValuePair <string, string> >();
                parseJSON("", original, jsonlist);
                foreach (var j in jsonlist) { if (j.Key == ReplaceValues(jsonField, data)) { list.Add(j.Value); } }
            }
            break;

        case ParseType.XPATH:
            // NOT IMPLEMENTED YET
            break;

        case ParseType.REGEX:
            try
            {
                var matches = Regex.Matches(partial, ReplaceValues(regexString, data));
                foreach (Match match in matches)
                {
                    // Expand the output template: "[n]" placeholders are replaced with
                    // the corresponding capture-group values.
                    var output = ReplaceValues(regexOutput, data);
                    for (var i = 0; i < match.Groups.Count; i++) { output = output.Replace("[" + i + "]", match.Groups[i].Value); }
                    list.Add(output);
                }
            }
            catch { } // invalid pattern → no results
            break;
    }
    return(list);
}
/// <summary>
/// Refreshes the job-category list by scraping the /jobs/ page.
/// </summary>
/// <returns>true when the categories were loaded; false on any failure.</returns>
public bool UpdateWorkCategory()
{
    Log.ProcessMessage("Пытаемся обновить список категорий");
    try
    {
        var pageHtml = http.GetAsync(Domain + "/jobs/").Result.Content.ReadAsStringAsync().Result;
        var parser = new HtmlParser();
        var parsed = parser.ParseDocument(pageHtml);
        // Every category link in the collapsed menu carries a data-category_id attribute.
        foreach (var anchor in parsed.QuerySelectorAll(".collapse li a[data-category_id]"))
        {
            var category = new Objects.Category
            {
                Name = anchor.TextContent,
                Href = anchor.GetAttribute("href")
            };
            Objects.Category.Categories.Add(category);
        }
        Log.GoodMessage("Обновили список категорий");
        return true;
    }
    catch
    {
        // Best-effort: any network/parsing failure is logged and reported as false.
        Log.ExMessage("Не удалось обновить список категорий");
        return false;
    }
}