/// <summary>
/// Refreshes the list of categories.
/// </summary>
/// <remarks>
/// Downloads the "/jobs/" page below <c>Domain</c>, selects the category anchors and appends one
/// <c>Objects.Category</c> (name = anchor text, href = anchor "href" attribute) to
/// <c>Objects.Category.Categories</c> per anchor. Any exception is swallowed and reported as failure.
/// </remarks>
/// <returns><c>true</c> if the update completed; <c>false</c> if any step threw.</returns>
public bool UpdateWorkCategory()
{
    Log.ProcessMessage("Пытаемся обновить список категорий");

    try
    {
        // The HTTP calls are awaited synchronously via .Result, exactly as before.
        var response = http.GetAsync(Domain + "/jobs/").Result;
        string pageSource = response.Content.ReadAsStringAsync().Result;

        var document = new HtmlParser().ParseDocument(pageSource);
        var categoryLinks = document.QuerySelectorAll(".collapse li a[data-category_id]");

        foreach (var link in categoryLinks)
        {
            Objects.Category.Categories.Add(new Objects.Category
            {
                Name = link.TextContent,
                Href = link.GetAttribute("href")
            });
        }

        Log.GoodMessage("Обновили список категорий");
        return true;
    }
    catch
    {
        Log.ExMessage("Не удалось обновить список категорий");
        return false;
    }
}
/// <summary>
/// Appends one <c>Track</c> to <paramref name="program"/> for every element of
/// <paramref name="trackElements"/> whose <c>h3</c> text is not the literal "skip".
/// </summary>
/// <param name="program">List that receives the new tracks; it is mutated and returned.</param>
/// <param name="trackElements">Elements describing the individual programme entries.</param>
/// <param name="trackId">Identifier stored in <c>TrackId</c> of every created track.</param>
/// <returns>The same <paramref name="program"/> instance.</returns>
/// <exception cref="FormatException">The text before the first '-' in the <c>time</c> element is not in "H:mm" format.</exception>
private static List<Track> AddProgramItems(
    List<Track> program,
    AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> trackElements,
    string trackId)
{
    foreach (var trackElement in trackElements)
    {
        var topic = trackElement.QuerySelector("h3").TextContent.Trim();
        if (topic == "skip")
        {
            continue;
        }

        // Entries without a speaker block have no ".full-name" element at all, so the lookup
        // must be null-safe for the "Networking" fallback below to ever be reached.
        var fullName = trackElement.QuerySelector("div.speakername .full-name")?.TextContent.Trim();
        if (string.IsNullOrEmpty(fullName))
        {
            fullName = "Networking";
        }

        // Only the start of a "start-end" time range is used.
        var startText = trackElement.QuerySelector("time").TextContent.Trim().Split('-')[0];

        // A missing description paragraph is treated as an empty description.
        var description = trackElement.QuerySelector("p")?.TextContent.Trim() ?? string.Empty;

        program.Add(new Track
        {
            TrackId = trackId,
            Time = DateTime.ParseExact(startText, "H:mm", CultureInfo.InvariantCulture, DateTimeStyles.None),
            FullName = fullName,
            Topic = CleanText(topic),
            Description = CleanText(description)
        });
    }

    return program;
}
/// <summary>
/// Collects every task linked from a category page.
/// </summary>
/// <remarks>
/// Each anchor matched by the task selector is resolved through <c>GetTaskFromLink</c>;
/// anchors for which that call returns <c>null</c> are skipped.
/// </remarks>
/// <param name="link">Page link without the domain part.</param>
/// <returns>The list of tasks, or <c>null</c> if any step threw.</returns>
public List<Objects.Task> GetTasksFromPage(string link)
{
    Log.ProcessMessage("Пытаемся получить список заданий со страницы " + link);

    try
    {
        // The HTTP calls are awaited synchronously via .Result, exactly as before.
        var response = http.GetAsync(Domain + link).Result;
        string pageSource = response.Content.ReadAsStringAsync().Result;

        var document = new HtmlParser().ParseDocument(pageSource);
        var taskAnchors = document.QuerySelectorAll("div.row.set_href .title a");

        var collected = new List<Objects.Task>();
        foreach (var anchor in taskAnchors)
        {
            var parsedTask = GetTaskFromLink(anchor.GetAttribute("href"));
            if (parsedTask != null)
            {
                collected.Add(parsedTask);
            }
        }

        Log.GoodMessage("Получили список заданий со страницы " + link);
        return collected;
    }
    catch
    {
        Log.ExMessage("Не удалось получить список заданий со страницы " + link);
        return null;
    }
}
/// <summary>
/// Locates the statement table associated with a named landmark element and extracts it.
/// </summary>
/// <remarks>
/// The landmark is the first element whose "name" attribute equals <paramref name="statementTitleHref"/>.
/// If the landmark sits inside a table, that table is used; otherwise the first table at or after
/// the landmark is used. The actual extraction is delegated to the index-based overload.
/// </remarks>
/// <param name="elements">Flat collection of document elements to search.</param>
/// <param name="statementTitleHref">Value of the "name" attribute that marks the statement.</param>
/// <param name="outputPath">Passed through to the index-based overload.</param>
/// <param name="rowHeadOverrides">Passed through to the index-based overload.</param>
/// <param name="config">Passed through to the index-based overload.</param>
public static void ExtractTableFromHTML(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elements, string statementTitleHref, string outputPath, IDictionary<string, string> rowHeadOverrides, IDictionary<string, string> config)
{
    // Find the href'd element
    int iElement = 0;
    for (; iElement < elements.Length; ++iElement)
    {
        if (elements[iElement].GetAttribute("name") == statementTitleHref)
        {
            break;
        }
    }

    if (iElement == elements.Length)
    {
        Console.WriteLine("Unable to find expected element with name {0}", statementTitleHref);
        // Without a landmark there is nothing to anchor the table search on; continuing would
        // index the collection one past its end.
        return;
    }

    // See if that landmark is contained by a table (assumed, then, to be the statement table)
    int statementTableIndex = findContainingElementByType(elements, iElement, "table");
    if (statementTableIndex == -1)
    {
        // Landmark is not contained in a table, so statement table is assumed to be first table following the landmark.
        statementTableIndex = findFollowingElementByType(elements, iElement, "table");
    }

    if (statementTableIndex == -1)
    {
        Console.WriteLine("No landmarked table found");
        return;
    }

    ExtractTableFromHTML(elements, statementTableIndex, outputPath, rowHeadOverrides, config);
}
/// <summary>
/// Writes a diagnostic overview of the elements to the console: every table (row count, cell count and
/// the first 80 characters of its text) and every element whose text starts with one of a few
/// known section titles.
/// </summary>
/// <param name="elements">Elements to inspect.</param>
/// <param name="startFrom">Index of the first element to inspect.</param>
static void quickSummaryPrint(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elements, int startFrom = 0)
{
    int index = startFrom;
    while (index < elements.Length)
    {
        var current = elements[index];

        bool isTable = current.TagName.ToLower() == "table";
        if (isTable)
        {
            int rowCount = current.QuerySelectorAll("tr").Length;
            int cellCount = current.QuerySelectorAll("td").Length;
            Console.WriteLine("{0}: table: rows: {1}, cells: {2}", index, rowCount, cellCount);

            // One-line preview of the table text, capped at 80 characters before clean-up.
            string tableText = current.TextContent;
            string preview = tableText.Substring(0, Math.Min(80, tableText.Length));
            preview = preview.Replace("\r", "").Replace("\n", " ").Trim();
            Console.WriteLine(preview);
        }

        bool isNotesTitle = Regex.IsMatch(
            current.TextContent,
            "^(?:Condensed )?notes to .*consolidated (?:condensed )?financial statements|^Supplemental Financial Data",
            RegexOptions.IgnoreCase);
        if (isNotesTitle)
        {
            Console.WriteLine("{0}: Found notes section title: {1}", index, current.TextContent);
        }

        bool isUnitedStates = Regex.IsMatch(current.TextContent, @"^united states", RegexOptions.IgnoreCase);
        if (isUnitedStates)
        {
            Console.WriteLine("{0}: Found United States", index);
        }

        bool isTocLandmark = Regex.IsMatch(current.TextContent, @"^table of contents|^index", RegexOptions.IgnoreCase);
        if (isTocLandmark)
        {
            Console.WriteLine("{0}: Found TOC landmark: {1}", index, current.TextContent);
        }

        index++;
    }
}
/// <summary>
/// Downloads every product page listed in the local "parsed.txt" file and stores the scraped data in MongoDB.
/// </summary>
/// <remarks>
/// For each URL the heading, price and specification list are extracted and inserted as one <c>Item</c>
/// into the "Phone" collection of the "Kaspi_Store" database on the local MongoDB server.
/// The method is <c>async void</c>, so callers can neither await it nor observe its exceptions.
/// </remarks>
public async void ParserPhones()
{
    string[] pageUrls = File.ReadAllLines(@"C:\Users\aizhi\Desktop\parsed.txt");
    var client = new MongoClient("mongodb://localhost:27017");
    var db = client.GetDatabase("Kaspi_Store");
    var collection = db.GetCollection<Item>("Phone");

    Console.WriteLine("Adding into database started...");

    // One HttpClient serves the whole run and is disposed at the end; creating a new, never
    // disposed client per URL leaks connections.
    using (var httpClient = new HttpClient())
    {
        foreach (string uri in pageUrls)
        {
            IHtmlDocument document;
            using (HttpResponseMessage response = await httpClient.GetAsync(uri))
            using (var body = await response.Content.ReadAsStreamAsync())
            {
                // Parsing is finished before the response stream is disposed.
                document = new HtmlParser().ParseDocument(body);
            }

            // NOTE(review): presumably the first 20 characters are a fixed prefix of the heading markup - confirm.
            string header = document.QuerySelector("h2.item-content__el-heading").InnerHtml;
            header = header.Substring(20);

            // NOTE(review): presumably the last two characters are a currency suffix - confirm.
            string price = document.QuerySelector("div.item__price-once").InnerHtml;
            price = price.Trim();
            price = price.Substring(0, price.Length - 2);

            var props = new Dictionary<string, string>();
            foreach (var node in document.QuerySelectorAll("dl.specifications-list__spec"))
            {
                string propName = node.QuerySelector("span").InnerHtml;
                string propAttr = node.QuerySelector("dd").InnerHtml.Trim();
                // NOTE(review): both arguments look like a plain space here; the first may originally
                // have been a non-breaking space or "&nbsp;" - confirm against the repository copy.
                propAttr = propAttr.Replace(" ", " ");
                props[propName] = propAttr;
            }

            var itemData = new Item
            {
                itemHeader = header,
                itemPrice = price,
                itemProps = props
            };
            await collection.InsertOneAsync(itemData);
            Console.WriteLine(header);
        }
    }

    Console.WriteLine("Adding completed");
}
/// <summary>
/// Prints the number of decks followed by the title of each deck, one per line.
/// </summary>
/// <param name="decks">Deck elements found on the page; titles are obtained through <c>GetTitle</c>.</param>
private static void ShowDecks(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> decks)
{
    Console.WriteLine($"Found {decks.Length} decks on page.");

    foreach (AngleSharp.Dom.IElement deckElement in decks)
    {
        string title = GetTitle(deckElement);
        Console.WriteLine(title);
    }
}
/// <summary>
/// Advances from <paramref name="startingIndex"/> over the run of consecutive elements whose trimmed
/// text equals the trimmed <paramref name="landmark"/>.
/// </summary>
/// <param name="elementsToScan">Elements to walk.</param>
/// <param name="startingIndex">Index at which the walk begins.</param>
/// <param name="landmark">Text to skip; leading and trailing whitespace is ignored.</param>
/// <returns>
/// Index of the first element that does not match, or the collection length if every remaining element matches.
/// </returns>
public static int skipPastLandmark(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string landmark)
{
    int cursor = startingIndex;
    while (cursor < elementsToScan.Length)
    {
        string candidate = elementsToScan[cursor].TextContent.Trim();
        if (candidate != landmark.Trim())
        {
            break;
        }

        cursor++;
    }

    return cursor;
}
/// <summary>
/// Advances from <paramref name="startingIndex"/> over the run of consecutive elements whose trimmed
/// text matches <paramref name="pattern"/>.
/// </summary>
/// <param name="elementsToScan">Elements to walk.</param>
/// <param name="startingIndex">Index at which the walk begins.</param>
/// <param name="pattern">Regular expression applied to each element's trimmed text.</param>
/// <param name="regexOptions">Options used for the match.</param>
/// <returns>
/// Index of the first element that does not match, or the collection length if every remaining element matches.
/// </returns>
public static int skipPastRegex(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string pattern, RegexOptions regexOptions)
{
    int cursor = startingIndex;
    while (cursor < elementsToScan.Length)
    {
        string candidate = elementsToScan[cursor].TextContent.Trim();
        if (!Regex.IsMatch(candidate, pattern, regexOptions))
        {
            break;
        }

        cursor++;
    }

    return cursor;
}
/// <summary>
/// Searches forward from <paramref name="startingIndex"/> for the first element whose text is exactly
/// <paramref name="landmark"/> (no trimming, case-sensitive).
/// </summary>
/// <param name="elementsToScan">Elements to search.</param>
/// <param name="startingIndex">Index at which the search begins.</param>
/// <param name="landmark">Text to look for.</param>
/// <returns>Index of the matching element, or -1 if there is none.</returns>
public static int findLandmark(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string landmark)
{
    int position = startingIndex;
    while (position < elementsToScan.Length)
    {
        if (elementsToScan[position].TextContent == landmark)
        {
            return position;
        }

        position++;
    }

    return -1; // Not found
}
/// <summary>
/// Searches forward from <paramref name="startingIndex"/> (inclusive) for the first element with the given tag name.
/// </summary>
/// <param name="elementsToScan">Elements to search.</param>
/// <param name="startingIndex">Index at which the search begins.</param>
/// <param name="elementType">Tag name to look for; compared case-insensitively.</param>
/// <returns>Index of the matching element, or -1 if there is none.</returns>
public static int findFollowingElementByType(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string elementType)
{
    for (int iElement = startingIndex; iElement < elementsToScan.Length; ++iElement)
    {
        // Ordinal comparison: tag names are identifiers, and culture-sensitive lower-casing
        // (e.g. the Turkish dotless i) can make equal names compare as different.
        if (string.Equals(elementsToScan[iElement].TagName, elementType, StringComparison.OrdinalIgnoreCase))
        {
            return iElement;
        }
    }

    return -1; // Not found
}
/// <summary>
/// Loads the metadata and the chapter list of a single book from its web page.
/// </summary>
/// <remarks>
/// Sets <c>book.Chapters</c> (always replaced by a new list), <c>book.Duration</c> (fifth <c>dd</c>
/// element, when present) and <c>book.OnlineText</c> (link whose text is "online text", when present).
/// Chapter columns are located by keyword in the header row of <c>table.chapter-download</c>;
/// a column or cell that cannot be found yields an empty string instead of an exception.
/// </remarks>
/// <param name="book">Book that contains the url.</param>
/// <param name="htmlParser">HTML parser.</param>
/// <param name="webClient">Web page downloader.</param>
private static void LoadBookMeta(Book book, HtmlParser htmlParser, HtmlWeb webClient)
{
    book.Chapters = new List<Chapter>();

    // The download is awaited synchronously because this method is synchronous.
    string innerHtml = webClient.LoadFromWebAsync(book.Url).GetAwaiter().GetResult().DocumentNode.InnerHtml;
    IHtmlDocument document = htmlParser.Parse(innerHtml);

    // NOTE(review): position 4 of the <dd> list is assumed to be the running time - confirm against the page layout.
    var bookInfoSide = document.QuerySelectorAll("dd").Select(x => x.TextContent).ToArray();
    if (bookInfoSide.Length > 4)
    {
        book.Duration = bookInfoSide[4];
    }

    var bookTextNode = document.QuerySelectorAll("a")
        .FirstOrDefault(x => string.Equals(x.TextContent, "online text", StringComparison.OrdinalIgnoreCase));
    if (bookTextNode != null)
    {
        book.OnlineText = bookTextNode.GetAttribute("href");
    }

    var chapterTable = document.QuerySelector("table.chapter-download");
    var tableBody = chapterTable?.QuerySelector("tbody");
    var tableHead = chapterTable?.QuerySelector("thead");
    if (tableBody == null || tableHead == null)
    {
        // No chapter table on this page: leave the chapter list empty.
        return;
    }

    var columns = tableHead.QuerySelectorAll("th").Select(x => x.TextContent).ToList();
    int chapterIndex = FindColumnIndex(columns, "chapter");
    int sectionIndex = FindColumnIndex(columns, "section");
    int readerIndex = FindColumnIndex(columns, "reader");
    int durationIndex = FindColumnIndex(columns, "time");
    int sourceTextIndex = FindColumnIndex(columns, "source");

    foreach (var chapterNode in tableBody.QuerySelectorAll("tr"))
    {
        var chapterInfo = chapterNode.QuerySelectorAll("td").ToArray();
        if (chapterInfo.Length == 0)
        {
            continue; // Row without data cells.
        }

        var chapterLink = GetCellOrNull(chapterInfo, chapterIndex)?.QuerySelector("a");
        var readerLink = GetCellOrNull(chapterInfo, readerIndex)?.QuerySelector("a");
        var sourceCell = GetCellOrNull(chapterInfo, sourceTextIndex);

        book.Chapters.Add(new Chapter
        {
            AudioLink = chapterLink != null ? chapterLink.GetAttribute("href") : string.Empty,
            Section = GetCellOrNull(chapterInfo, sectionIndex)?.TextContent ?? string.Empty,
            Duration = GetCellOrNull(chapterInfo, durationIndex)?.TextContent ?? string.Empty,
            Name = chapterLink != null ? chapterLink.TextContent : string.Empty,
            Reader = readerLink != null ? readerLink.TextContent : "",
            // NOTE(review): this reads "href" from the cell itself, not from a link inside it - confirm that is intended.
            TextSource = sourceCell != null ? sourceCell.GetAttribute("href") : string.Empty
        });
    }
}

/// <summary>Returns the index of the first header that contains <paramref name="keyword"/> (ordinal, case-insensitive), or -1.</summary>
private static int FindColumnIndex(List<string> columns, string keyword)
{
    return columns.FindIndex(x => x.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0);
}

/// <summary>Returns the cell at <paramref name="index"/>, or <c>null</c> when the index is outside the row.</summary>
private static AngleSharp.Dom.IElement GetCellOrNull(AngleSharp.Dom.IElement[] cells, int index)
{
    return index >= 0 && index < cells.Length ? cells[index] : null;
}
/// <summary>
/// Searches forward from <paramref name="startingIndex"/> for the first element whose text matches
/// <paramref name="pattern"/>.
/// </summary>
/// <param name="elementsToScan">Elements to search.</param>
/// <param name="startingIndex">Index at which the search begins.</param>
/// <param name="pattern">Regular expression applied to each element's (untrimmed) text.</param>
/// <param name="regexOptions">Options used for the match.</param>
/// <param name="elementLimit">
/// Maximum number of elements to examine; a negative value means "up to the end of the collection".
/// </param>
/// <returns>Index of the matching element, or -1 if there is none within the examined range.</returns>
public static int findByRegex(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string pattern, RegexOptions regexOptions, int elementLimit = -1)
{
    // The window is clamped to the collection so that a limit reaching past the end cannot
    // index out of range; the sum is done in 64 bits to avoid integer overflow.
    int lastElement = elementsToScan.Length;
    if (elementLimit >= 0)
    {
        long requestedEnd = (long)startingIndex + elementLimit;
        if (requestedEnd < lastElement)
        {
            lastElement = (int)requestedEnd;
        }
    }

    for (int iElement = startingIndex; iElement < lastElement; ++iElement)
    {
        if (Regex.IsMatch(elementsToScan[iElement].TextContent, pattern, regexOptions))
        {
            return iElement;
        }
    }

    return -1; // Not found
}
/// <summary>
/// Rebuilds <c>article</c> as the markup of the last anchor found in <c>text</c>, a line break,
/// and the result of <c>GetLsi()</c>.
/// </summary>
/// <returns>The new value of <c>article</c>. When <c>text</c> contains no anchor the first part is empty.</returns>
private string getSmall()
{
    article = GetLsi();

    var document = new AngleSharp.Parser.Html.HtmlParser().Parse(text);

    // Every anchor overwrites the previous one, so only the last anchor's markup survives.
    string lastAnchorMarkup = string.Empty;
    foreach (AngleSharp.Dom.IElement link in document.QuerySelectorAll("a"))
    {
        lastAnchorMarkup = link.OuterHtml;
    }

    article = lastAnchorMarkup + Environment.NewLine + article;
    return article;
}
/// <summary>
/// Converts the scraped model elements of one brand into <c>CarModelDto</c> objects.
/// </summary>
/// <remarks>
/// <c>ModelName</c> is the element text with the replaced character sequence removed.
/// <c>ModelValue</c> is the lower-cased "title" attribute with the brand token removed and the
/// remaining separators replaced by "-".
/// </remarks>
/// <param name="data">Elements describing the models.</param>
/// <param name="carBrand">Brand the models belong to.</param>
/// <returns>One DTO per element, in document order.</returns>
private async Task<List<CarModelDto>> GenerateModelList(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> data, string carBrand)
{
    _logger.Log($"Returned {data.Length} model entries for selected brand: {carBrand}");

    var models = new List<CarModelDto>();
    foreach (var entry in data)
    {
        string modelName = entry.TextContent.Replace(" ", string.Empty);

        string loweredTitle = entry.GetAttribute("title").ToLower();
        string brandToken = " " + carBrand.ToLower() + " ";
        string modelValue = loweredTitle.Replace(brandToken, string.Empty).Replace(" ", "-");

        models.Add(new CarModelDto()
        {
            ModelName = modelName,
            ModelValue = modelValue
        });
    }

    return await Task.FromResult(models);
}
/// <summary>
/// Searches forward from <paramref name="startingIndex"/> (inclusive) for the first element with the given tag name.
/// </summary>
/// <param name="elementsToScan">Elements to search.</param>
/// <param name="startingIndex">Index at which the search begins.</param>
/// <param name="type">Tag name to look for; compared case-insensitively.</param>
/// <param name="elementLimit">
/// Maximum number of elements to examine; a negative value means "up to the end of the collection".
/// </param>
/// <returns>Index of the matching element, or -1 if there is none within the examined range.</returns>
public static int findNextElementOfType(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string type, int elementLimit = -1)
{
    // The window is clamped to the collection so that a limit reaching past the end cannot
    // index out of range; the sum is done in 64 bits to avoid integer overflow.
    int lastElement = elementsToScan.Length;
    if (elementLimit >= 0)
    {
        long requestedEnd = (long)startingIndex + elementLimit;
        if (requestedEnd < lastElement)
        {
            lastElement = (int)requestedEnd;
        }
    }

    for (int iElement = startingIndex; iElement < lastElement; ++iElement)
    {
        // Ordinal comparison: tag names are identifiers, not culture-dependent text.
        if (string.Equals(elementsToScan[iElement].TagName, type, StringComparison.OrdinalIgnoreCase))
        {
            return iElement;
        }
    }

    return -1; // Not found
}
/// <summary>
/// Extracts, for every selector in <c>Settings.Tags</c>, the matching elements of the document
/// converted through <c>ParseTag</c>.
/// </summary>
/// <param name="document">Document to query. When <c>null</c>, every selector maps to an empty list.</param>
/// <returns>A dictionary keyed by selector; each value holds one entry per matching element.</returns>
public async Task<Dictionary<string, List<StringDictionary>>> Parse(IHtmlDocument document)
{
    var result = new Dictionary<string, List<StringDictionary>>();

    foreach (string tag in Settings.Tags)
    {
        var elements = new List<StringDictionary>();

        // The null-conditional call yields null for a null document, and enumerating null throws,
        // so the loop has to be guarded for the null-tolerance to have any effect.
        AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> items = document?.QuerySelectorAll(tag);
        if (items != null)
        {
            foreach (AngleSharp.Dom.IElement item in items)
            {
                elements.Add(ParseTag(item));
            }
        }

        result.Add(tag, elements);
    }

    return result;
}
/// <summary>
/// Sanity check to ensure the page has not changed format: the text of the table's <c>th</c>
/// cells must equal <paramref name="expected"/>, in the same order.
/// </summary>
/// <param name="expected">Header texts the table is expected to have.</param>
/// <param name="given">Table whose header cells are checked.</param>
/// <returns>Always <c>true</c>; a mismatch is reported by exception.</returns>
/// <exception cref="InvalidOperationException">The header texts differ from <paramref name="expected"/>.</exception>
private static bool verifyHeaderNames(List<string> expected, IHtmlTableElement given)
{
    var headerNames = new List<string>();
    foreach (var headerCell in given.QuerySelectorAll("th"))
    {
        headerNames.Add(headerCell.TextContent);
    }

    if (!expected.SequenceEqual(headerNames))
    {
        // InvalidOperationException derives from Exception, so existing catch blocks keep working.
        throw new InvalidOperationException(
            "Headers do not match. Expected: |" + String.Join(",", expected) +
            "| but found |" + String.Join(",", headerNames) + "|");
    }

    return true;
}
/// <summary>
/// Click handler: builds <c>article</c> from the file text plus the result of <c>GetLsi()</c>, copies it
/// to the clipboard, shows it in the browser control and fills the name/url boxes from its anchors.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button3_Click(object sender, EventArgs e)
{
    readFile();

    // Two separate assignments on purpose: article already holds the file text when GetLsi() runs.
    article = text;
    article = article + Environment.NewLine + GetLsi();

    Clipboard.SetText(article);
    webBrowser1.DocumentText = article;

    var document = new AngleSharp.Parser.Html.HtmlParser().Parse(article);

    // Every anchor overwrites the boxes, so they end up showing the last anchor.
    foreach (AngleSharp.Dom.IElement link in document.QuerySelectorAll("a"))
    {
        wName.Text = link.InnerHtml;
        wUrl.Text = link.GetAttribute("href");
    }
}
/// <inheritdoc />
/// <remarks>
/// Reads the package page at <c>ENV_HACKAGE_ENDPOINT</c> and collects the text of every <c>a</c> and
/// <c>strong</c> element in the cell that follows the first header cell starting with "Versions".
/// </remarks>
public override async Task<IEnumerable<string>> EnumerateVersionsAsync(PackageURL purl, bool useCache = true, bool includePrerelease = true)
{
    Logger.Trace("EnumerateVersions {0}", purl?.ToString());
    if (purl == null || purl.Name is null)
    {
        return Array.Empty<string>();
    }

    try
    {
        string packageName = purl.Name;
        HttpClient httpClient = CreateHttpClient();

        using System.Net.Http.HttpResponseMessage response = await httpClient.GetAsync($"{ENV_HACKAGE_ENDPOINT}/package/{packageName}");
        response.EnsureSuccessStatusCode();

        HtmlParser parser = new();
        AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(await response.Content.ReadAsStringAsync());

        List<string> versionList = new();
        foreach (AngleSharp.Dom.IElement th in document.QuerySelectorAll("th"))
        {
            if (!th.TextContent.StartsWith("Versions", StringComparison.Ordinal))
            {
                continue;
            }

            // A header cell without a following sibling simply yields no versions.
            AngleSharp.Dom.IElement? td = th.NextElementSibling;
            if (td is not null)
            {
                foreach (AngleSharp.Dom.IElement version in td.QuerySelectorAll("a,strong"))
                {
                    string versionString = version.TextContent.ToLowerInvariant().Trim();
                    Logger.Debug("Identified {0} version {1}.", packageName, versionString);
                    versionList.Add(versionString);
                }
            }

            break; // Only the first "Versions" row is used.
        }

        return SortVersions(versionList.Distinct());
    }
    catch (Exception ex)
    {
        Logger.Debug("Unable to enumerate versions: {0}", ex.Message);
        throw;
    }
}
/// <summary>
/// Scrapes the Billboard current-albums chart and returns title and artist of each entry as JSON.
/// </summary>
/// <returns>
/// A JSON object with <c>success = true</c> and an <c>albums</c> array of exactly 100 slots;
/// slots beyond the number of chart rows found on the page stay <c>null</c>.
/// </returns>
public JsonResult ScrapeAlbumChart()
{
    string topAlbumsUrl = "https://www.billboard.com/charts/current-albums";
    Billboard_Album[] topAlbums = new Billboard_Album[100];

    // The action is synchronous, so the HTTP calls are awaited via .Result as before.
    IHtmlDocument document;
    using (HttpResponseMessage response = client.GetAsync(topAlbumsUrl).Result)
    using (Stream body = response.Content.ReadAsStreamAsync().Result)
    {
        document = new HtmlParser().ParseDocument(body);
    }

    var albums = document.GetElementsByClassName("chart-list-item__first-row chart-list-item__cursor-pointer");

    int slot = 0;
    foreach (var album in albums)
    {
        // The result array is fixed-size: ignore surplus rows instead of writing past its end.
        if (slot >= topAlbums.Length)
        {
            break;
        }

        string title = album.GetElementsByClassName("chart-list-item__title-text")[0].TextContent.Trim();

        // Some albums wrap the artist in a child element (a link), some carry it as plain text.
        var artistElement = album.GetElementsByClassName("chart-list-item__artist")[0];
        string artist = artistElement.ChildElementCount > 0
            ? artistElement.FirstElementChild.TextContent.Trim()
            : artistElement.TextContent.Trim();

        topAlbums[slot] = new Billboard_Album
        {
            Title = title,
            Artist = artist
        };
        slot++;
    }

    return Json(new { success = true, albums = topAlbums });
}
/// <summary>
/// Click handler: starts <c>article</c> from <c>GetLsi()</c>, prepends one "[text](href)" line per
/// anchor found in <c>text</c>, and copies the result to the clipboard.
/// </summary>
/// <remarks>
/// Each anchor is inserted in front of the text built so far, so the last anchor ends up on the first line.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button10_Click(object sender, EventArgs e)
{
    article = GetLsi();

    var document = new AngleSharp.Parser.Html.HtmlParser().Parse(text);
    foreach (AngleSharp.Dom.IElement link in document.QuerySelectorAll("a"))
    {
        string target = link.GetAttribute("href");
        string caption = link.InnerHtml;

        string markdownLink = string.Concat("[", caption, "]", "(", target, ")");
        article = markdownLink + Environment.NewLine + article;
    }

    Clipboard.SetText(article);
}
/// <summary>
/// Extracts a usable URL from the first element of <paramref name="urlData"/>.
/// </summary>
/// <param name="urlData">Candidate link elements; only the first one is inspected.</param>
/// <returns>
/// <c>CommonTags.NotDefined</c> when there is no element, the element has no "href" attribute, or
/// the href equals <c>CommonTags.JavascriptVoid</c>; the href prefixed with "https:" when it is
/// protocol-relative ("//..."); otherwise the href unchanged.
/// </returns>
private string ProcessUrl(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> urlData)
{
    // GetAttribute returns null for an anchor without href; treat that like "no link".
    string url = urlData.FirstOrDefault()?.GetAttribute("href");
    if (url == null || url == CommonTags.JavascriptVoid)
    {
        return CommonTags.NotDefined;
    }

    if (url.StartsWith("//", StringComparison.Ordinal))
    {
        return $"https:{url}";
    }

    return url;
}
/// <summary>
/// Finds a sequence of landmarks one after another and returns the position of the last one.
/// </summary>
/// <remarks>
/// Each landmark is searched from where the previous search ended. After every landmark except the
/// last, the run of elements repeating that landmark is skipped before searching for the next.
/// </remarks>
/// <param name="elementsToScan">Elements to search.</param>
/// <param name="startingIndex">Index at which the first search begins.</param>
/// <param name="landmarks">Landmark texts, in the order they must occur.</param>
/// <returns>
/// Index of the last landmark; -1 if any landmark is missing; <paramref name="startingIndex"/> when
/// <paramref name="landmarks"/> is null or empty.
/// </returns>
public static int findLandmarks(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, IList<string> landmarks)
{
    bool nothingToFind = landmarks == null || landmarks.Count == 0;
    if (nothingToFind)
    {
        return startingIndex;
    }

    int position = startingIndex;
    int finalLandmark = landmarks.Count - 1;
    for (int n = 0; n <= finalLandmark; n++)
    {
        string wanted = landmarks[n];

        position = findLandmark(elementsToScan, position, wanted);
        if (position == -1)
        {
            return -1; // Could not find one of the landmarks
        }

        // Don't skip past the last landmark found
        if (n < finalLandmark)
        {
            position = skipPastLandmark(elementsToScan, position, wanted);
        }
    }

    return position;
}
/// <summary>
/// Finds the nearest element of the given tag name that is, or contains, the element at
/// <paramref name="startingIndex"/>, and returns its position in <paramref name="elementsToScan"/>.
/// </summary>
/// <param name="elementsToScan">Flat collection of elements.</param>
/// <param name="startingIndex">Index of the element whose ancestor chain is walked.</param>
/// <param name="elementType">Tag name to look for; compared case-insensitively.</param>
/// <returns>
/// Index of the element itself or of its closest matching ancestor; -1 when
/// <paramref name="startingIndex"/> is outside the collection, when no such ancestor exists, or
/// when the ancestor is not part of <paramref name="elementsToScan"/>.
/// </returns>
public static int findContainingElementByType(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elementsToScan, int startingIndex, string elementType)
{
    // Callers may pass a "not found" position (-1 or one past the end); report "no container"
    // instead of indexing outside the collection.
    if (startingIndex < 0 || startingIndex >= elementsToScan.Length)
    {
        return -1;
    }

    // Walk up the ancestor chain. Ordinal comparison: tag names are identifiers, not cultural text.
    var element = elementsToScan[startingIndex];
    while (element != null && !string.Equals(element.TagName, elementType, StringComparison.OrdinalIgnoreCase))
    {
        element = element.ParentElement;
    }

    if (element == null)
    {
        return -1; // No containing element of this type found.
    }

    // Converting back to index numbers is fugly.
    int iElement = 0;
    foreach (var candidate in elementsToScan)
    {
        if (candidate == element)
        {
            return iElement;
        }

        ++iElement;
    }

    Debug.Assert(false, "findContainingElementByType could not find element known to be in elementsToScan");
    return -1;
}
/// <summary>
/// Writes one text file per deck into <paramref name="outputFolder"/> and echoes the same lines to the console.
/// </summary>
/// <remarks>
/// Each file starts with two comment lines (deck name and source URL) followed by one
/// "{amount}x {card name}" line per card. The file is named after the deck title, with characters
/// that are not valid in file names replaced by "_".
/// </remarks>
/// <param name="outputFolder">Folder that receives the files.</param>
/// <param name="pageUrl">URL recorded in the "Source" comment line.</param>
/// <param name="decks">Deck elements; titles are obtained through <c>GetTitle</c>.</param>
/// <exception cref="FormatException">A card count is not an integer.</exception>
private static void WriteDeckDefinitionFiles(string outputFolder, string pageUrl, AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> decks)
{
    foreach (var deck in decks)
    {
        string deckName = GetTitle(deck);
        Console.WriteLine($"\n{deckName}");

        // Deck titles come from the page and may contain characters such as ':' or '/' that are
        // not allowed in file names; Path.Combine supplies the platform's own separator.
        string safeName = string.Join("_", (deckName ?? string.Empty).Split(Path.GetInvalidFileNameChars()));
        string fileName = Path.Combine(outputFolder, safeName + ".txt");

        var cards = deck.QuerySelectorAll("div.sorted-by-overview-container span.row");
        using (StreamWriter writer = new StreamWriter(fileName))
        {
            writer.WriteLine("// " + deckName);
            writer.WriteLine("// Source: " + pageUrl);

            foreach (var card in cards)
            {
                int amount = int.Parse(card.QuerySelector("span.card-count").TextContent);
                string cardName = card.QuerySelector("span.card-name a").TextContent;

                string line = $"{amount}x {cardName}";
                Console.WriteLine(line);
                writer.WriteLine(line);
            }
        }
    }
}
/// <summary>
/// Collects the product pictures shown on the detail page of one SKU, including their pixel dimensions.
/// </summary>
/// <remarks>
/// Only images whose alternative text is one of the two known picture labels are considered. Each
/// picture is downloaded to read its width and height. The method is best-effort: on any exception
/// it stops, records the message via trace output and returns what has been collected so far.
/// </remarks>
/// <param name="skuid">Identifier appended to the detail-page URL.</param>
/// <returns>The pictures collected; possibly empty or partial, never <c>null</c>.</returns>
private async Task<List<Models.SkuPhotoInfo>> getSkuPhotoInfoAsync(string skuid)
{
    List<Models.SkuPhotoInfo> skuPhotoInfos = new List<Models.SkuPhotoInfo>();
    try
    {
        var config = Configuration.Default.WithDefaultLoader();
        var address = "https://66123123.com/Goods/GoodsDetail?id=" + skuid;
        var context = BrowsingContext.New(config);
        AngleSharp.Dom.Document document = (AngleSharp.Dom.Document) await context.OpenAsync(address);

        List<AngleSharp.Html.Dom.IHtmlImageElement> imgs = document.Images
            .Where(i => i.AlternativeText == "商品图片" || i.AlternativeText == "商品详情图片")
            .ToList();

        foreach (AngleSharp.Html.Dom.IHtmlImageElement item in imgs)
        {
            Models.SkuPhotoInfo skuPhotoInfo = new Models.SkuPhotoInfo
            {
                skuId = skuid,
                photoUrl = item.Source,
                photoTitle = item.AlternativeText,
                skuUrl = address
            };

            WebRequest request = WebRequest.Create(item.Source);
            request.Credentials = CredentialCache.DefaultCredentials;

            // Response, stream and decoded image all hold native resources; release them per picture.
            using (WebResponse response = await request.GetResponseAsync())
            using (Stream s = response.GetResponseStream())
            using (System.Drawing.Image image = System.Drawing.Image.FromStream(s))
            {
                skuPhotoInfo.photoHeight = image.Height;
                skuPhotoInfo.photoWidth = image.Width;
            }

            skuPhotoInfos.Add(skuPhotoInfo);
        }
    }
    catch (Exception ex)
    {
        // Still best-effort, but the failure is no longer discarded silently.
        string errorMsg = "爬取图片报异常:" + ex.Message;
        System.Diagnostics.Trace.TraceError(errorMsg);
    }

    return skuPhotoInfos;
}
/// <inheritdoc />
/// <remarks>
/// The latest version is read from the cell that follows the "Version:" cell on the package index
/// page; older versions are derived from the ".tar.gz" links of the package's archive listing.
/// Both pages are requested from <c>ENV_CRAN_ENDPOINT</c>.
/// </remarks>
public override async Task<IEnumerable<string>> EnumerateVersionsAsync(PackageURL purl, bool useCache = true, bool includePrerelease = true)
{
    Logger.Trace("EnumerateVersions {0}", purl?.ToString());
    if (purl == null || purl.Name is null)
    {
        return new List<string>();
    }

    try
    {
        string packageName = purl.Name;
        List<string> versionList = new();
        HttpClient httpClient = CreateHttpClient();
        HtmlParser parser = new();

        // Get the latest version
        using (System.Net.Http.HttpResponseMessage indexResponse = await httpClient.GetAsync($"{ENV_CRAN_ENDPOINT}/web/packages/{packageName}/index.html"))
        {
            indexResponse.EnsureSuccessStatusCode();
            AngleSharp.Html.Dom.IHtmlDocument indexDocument = await parser.ParseDocumentAsync(await indexResponse.Content.ReadAsStringAsync());

            AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> tds = indexDocument.QuerySelectorAll("td");
            for (int i = 0; i < tds.Length; i++)
            {
                if (tds[i].TextContent == "Version:")
                {
                    // The value lives in the next cell; check the bound explicitly rather than
                    // relying on how the collection indexer treats an out-of-range index.
                    string? value = (i + 1 < tds.Length) ? tds[i + 1].TextContent?.Trim() : null;
                    if (value != null)
                    {
                        versionList.Add(value);
                    }

                    break;
                }
            }
        }

        // Get the remaining versions
        // NOTE(review): a package with no archive listing presumably answers with a non-success status,
        // which makes the whole enumeration throw - confirm whether that is intended.
        using (System.Net.Http.HttpResponseMessage archiveResponse = await httpClient.GetAsync($"{ENV_CRAN_ENDPOINT}/src/contrib/Archive/{packageName}/"))
        {
            archiveResponse.EnsureSuccessStatusCode();
            AngleSharp.Html.Dom.IHtmlDocument archiveDocument = await parser.ParseDocumentAsync(await archiveResponse.Content.ReadAsStringAsync());

            foreach (AngleSharp.Dom.IElement anchor in archiveDocument.QuerySelectorAll("a"))
            {
                string? href = anchor.GetAttribute("href");
                if (href is not null && href.Contains(".tar.gz"))
                {
                    // "<package>_<version>.tar.gz" -> "<version>"
                    string version = href.Replace(".tar.gz", "");
                    version = version.Replace(packageName + "_", "").Trim();
                    Logger.Debug("Identified {0} version {1}.", packageName, version);
                    versionList.Add(version);
                }
            }
        }

        return SortVersions(versionList.Distinct());
    }
    catch (Exception ex)
    {
        Logger.Debug("Unable to enumerate versions: {0}", ex.Message);
        throw;
    }
}
/// <summary>
/// Gathers metadata for a package: the content of every ".dsc" file linked from the package's base
/// URLs, or - while none has been found - the pages linked from the Ubuntu package search.
/// </summary>
/// <param name="purl">Package to look up.</param>
/// <param name="useCache">Passed to <c>GetHttpStringCache</c> for the listing and search requests.</param>
/// <returns>
/// The concatenated content (possibly empty), or <c>null</c> when <paramref name="purl"/> or its name is <c>null</c>.
/// </returns>
public override async Task<string?> GetMetadataAsync(PackageURL purl, bool useCache = true)
{
    Logger.Trace("GetMetadata {0}", purl?.ToString());
    if (purl == null || purl.Name == null)
    {
        return null;
    }

    StringBuilder metadataContent = new();
    HttpClient httpClient = CreateHttpClient();

    foreach (string distroUrlPrefix in GetBaseURLs(purl))
    {
        try
        {
            string? html = await GetHttpStringCache(httpClient, distroUrlPrefix, useCache: useCache, neverThrow: true);
            if (html != null)
            {
                AngleSharp.Html.Dom.IHtmlDocument document = await new HtmlParser().ParseDocumentAsync(html);
                foreach (AngleSharp.Dom.IElement anchor in document.QuerySelectorAll("a"))
                {
                    // Anchors without an href are skipped; dereferencing the null would throw and
                    // abort the scan of all remaining anchors.
                    string? anchorHref = anchor.GetAttribute("href");
                    if (anchorHref is null || !anchorHref.EndsWith(".dsc", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    Logger.Debug("Found a .dsc file: {0}", anchorHref);
                    string? dscContent = await GetHttpStringCache(httpClient, distroUrlPrefix + anchorHref, neverThrow: true);
                    if (dscContent == null)
                    {
                        continue;
                    }

                    metadataContent.AppendLine(dscContent);
                }
            }
        }
        catch (Exception ex)
        {
            Logger.Debug("Error obtaining .dsc file for {0}: {1}", purl.ToString(), ex.Message);
        }

        // Fallback to packages.ubuntu.com if we haven't seen any .dsc files
        if (metadataContent.Length == 0)
        {
            try
            {
                // The last query parameter is "section"; the "&sect" prefix had been turned into a
                // section sign, which dropped the parameter from the request.
                string? searchResults = await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/search?keywords={purl.Name}&searchon=names&exact=1&suite=all&section=all", useCache);
                if (searchResults != null)
                {
                    HtmlParser parser = new();
                    AngleSharp.Html.Dom.IHtmlDocument document = await parser.ParseDocumentAsync(searchResults);
                    AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> anchorItems = document.QuerySelectorAll("a.resultlink");

                    foreach (AngleSharp.Dom.IElement anchorItem in anchorItems)
                    {
                        // A result link without a target would only re-fetch the endpoint root.
                        string? metadataUrl = anchorItem.GetAttribute("href");
                        if (string.IsNullOrEmpty(metadataUrl))
                        {
                            continue;
                        }

                        metadataContent.AppendLine(await GetHttpStringCache(httpClient, $"{ENV_UBUNTU_ENDPOINT}/{metadataUrl}"));
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Debug(ex, "Error fetching Ubuntu metadata: {0}", ex.Message);
            }
        }
    }

    return metadataContent.ToString();
}
/// <summary>
/// Parses the table at <paramref name="tableIndex"/> and writes its numeric cells to
/// <paramref name="outputPath"/> as flattened (attribute, heading, value) records.
/// </summary>
/// <remarks>
/// Steps: read all TR/TD cells into <c>TableRow</c> objects (a cell spanning several columns is
/// repeated once per column); extract the column headings; compute row-head indentation and parent
/// links; then, for every row with a non-empty row head, emit one record per numeric, non-centered
/// cell. The attribute name is the row head prefixed by its parents' row heads, joined with "|".
/// Nothing is written when no heading row can be found.
/// </remarks>
/// <param name="elements">Flat collection of document elements.</param>
/// <param name="tableIndex">Index of the table element within <paramref name="elements"/>.</param>
/// <param name="outputPath">File that receives the records, one per line.</param>
/// <param name="rowHeadOverrides">Passed to <c>BuildComplexRowHeads</c>.</param>
/// <param name="config">Passed to <c>BuildComplexRowHeads</c>.</param>
public static void ExtractTableFromHTML(AngleSharp.Dom.IHtmlCollection<AngleSharp.Dom.IElement> elements, int tableIndex, string outputPath, IDictionary<string, string> rowHeadOverrides, IDictionary<string, string> config)
{
    var statementTable = elements[tableIndex];

    // Parse statement table just found into a 2d matrix (actually a list of TableRow objects, which contain a list of TableCell objects)
    var tableData = new List<TableRow>();
    foreach (var rowElement in statementTable.QuerySelectorAll("TR"))
    {
        var rowData = new TableRow();
        foreach (var cellElement in rowElement.QuerySelectorAll("TD"))
        {
            // Extract cell value
            TableCell tableCell = new TableCell(cellElement);

            // Duplicate cell value across all cols it spans. A missing, unparsable or
            // non-positive colspan counts as 1: TryParse zeroes its out value on failure, which
            // would otherwise drop the cell from the row altogether.
            int colSpan;
            if (!Int32.TryParse(cellElement.GetAttribute("colspan"), out colSpan) || colSpan < 1)
            {
                colSpan = 1;
            }

            for (int j = 0; j < colSpan; ++j)
            {
                rowData.AddCell(tableCell);
            }
        }

        tableData.Add(rowData);
    }

    // Extract the column Headings from the table into list
    IList<string> columnHeadings = ExtractColumnHeadings(tableData);
    if (columnHeadings == null)
    {
        Console.WriteLine("FATAL: Cannot find any qualifying heading rows in table");
        // Without headings no record can be built; continuing would dereference null below.
        return;
    }

    // Post-process the rowheads in the table, calculating its relative indentation level (compared to the rest of the rowheads)
    CalcRowheadIndentationLevels(tableData);

    // Post-process the rowheads, linking each to any parents it has, based on rules and clues.
    BuildComplexRowHeads(tableData, rowHeadOverrides, config);

    // Flatten matrix to a list of tuples using some rules
    List<FlattenedRow> results = new List<FlattenedRow>();
    for (int iRow = 0; iRow < tableData.Count; ++iRow)
    {
        var row = tableData[iRow];
        int nCols = row.Cells.Count;

        // Create the attribute name for this row from its row head and those of its parents
        string attributeName = row.RowHead.Text;
        TableRow ancestor = row.parentRow;
        while (ancestor != null && ancestor.RowHead.Text.Length > 0)
        {
            attributeName = ancestor.RowHead.Text + "|" + attributeName;
            ancestor = ancestor.parentRow;
        }

        if (attributeName.Length == 0)
        {
            continue; // Assumption: rows without attribute names should be skipped.
        }

        // Standardize attribute name format: only single spaces between words
        attributeName = ConsolidateWhitespace(attributeName);

        // Scan columns (column 0 is not scanned)
        for (int iCol = 1; iCol < nCols; ++iCol)
        {
            var col = row.Cells[iCol];
            string colContent = col.Text;

            // Process numeric columns only. Exclude centered (heading) cols.
            if (col.HorizontalAlignment == TableCell.HORIZONTAL_ALIGNMENT.CENTER || !Regex.IsMatch(colContent, @"\(?\d+\)?"))
            {
                continue;
            }

            // Convert (xxx) to -xxx. Drop comma separators
            colContent = colContent.Replace('(', '-').Replace(")", "").Replace(",", "");

            // Get the heading for this col
            string heading = columnHeadings[iCol];

            // Create the tuple with this data and add it to the results list.
            results.Add(new FlattenedRow(attributeName, heading, colContent));
        }
    }

    // HTML column spans can result in duplicated entries: get rid of them.
    var distinctFlatRows = results.Distinct();

    // Write the flattened matrix out to file
    using (StreamWriter fsw = File.CreateText(outputPath))
    {
        foreach (var record in distinctFlatRows)
        {
            fsw.Write(record + "\r\n");
        }
    }
}