/// <summary>
/// Loads the main, staff and character pages for <paramref name="index"/>, gathers everything
/// the page parsers extract into one <see cref="Novel"/> and saves it through <c>NovelManager</c>.
/// </summary>
/// <param name="index">Value substituted into <c>MainUrlPattern</c>, <c>StaffPattern</c> and <c>CharacterPattern</c>.</param>
public void ProcessIndex(int index)
{
    Novel result = new Novel();
    var web = new HtmlWeb();

    // Main page: the first "mainbox" div is handed to ParseMainContent.
    var mainPage = web.Load(String.Format(MainUrlPattern, index));
    var mainPageBoxes = mainPage.DocumentNode
        .Descendants("div")
        .Where(div => div.HasClass("mainbox"))
        .ToArray();
    ParseMainContent(mainPageBoxes[0], result);

    // The releases and screenshots sections are optional.
    var releasesBox = mainPage.DocumentNode
        .Descendants("div")
        .FirstOrDefault(div => div.HasClass("releases"));
    if (releasesBox != null)
    {
        ParseReleasesContent(releasesBox, result);
    }

    var screenshotsBox = mainPage.DocumentNode
        .Descendants("div")
        .FirstOrDefault(div => div.HasId("screenshots"));
    if (screenshotsBox != null)
    {
        ParseImagesContent(screenshotsBox, result);
    }

    // Staff page (artists): the div with class "staff" that is not also marked "cast".
    var staffPage = web.Load(String.Format(StaffPattern, index));
    var staffBox = staffPage.DocumentNode
        .Descendants("div")
        .FirstOrDefault(div => div.HasClass("staff") && div.NotContainsClass("cast"));
    if (staffBox != null)
    {
        ParseStaffContent(staffBox, result);
    }

    // Characters page: every "mainbox" after the first one is parsed as a character block.
    var charactersPage = web.Load(String.Format(CharacterPattern, index));
    var characterBoxes = charactersPage.DocumentNode
        .Descendants("div")
        .Where(div => div.HasClass("mainbox"))
        .ToArray();
    foreach (var characterBox in characterBoxes.Skip(1))
    {
        ParseCharactersContent(characterBox, result);
    }

    using (var ctx = new VNContext("VNConnectionString"))
    {
        NovelManager.SaveNovel(result, ctx);
        Logs.Debug($@"Novel {index} finished");
    }

    Console.WriteLine(index + @" finished");
}
/// <summary>
/// Collects the artists listed in the staff block and adds them to <paramref name="novel"/>.
/// </summary>
/// <remarks>
/// Reading starts at the table row that contains the cell whose text is exactly "Artist" and
/// carries on through the directly following <c>tr</c> rows whose first child has no text
/// (treated here as continuation rows of the same heading). Every link in those rows yields one
/// artist: the link text becomes <c>EngName</c> and the link's <c>title</c> attribute, when
/// present, becomes <c>JapName</c>. An artist whose <c>EngName</c> is already present in
/// <c>novel.Artists</c> is not added again.
/// </remarks>
/// <param name="node">Staff container to search.</param>
/// <param name="novel">Novel whose <c>Artists</c> collection receives the results.</param>
void ParseStaffContent(HtmlNode node, Novel novel)
{
    var artistCell = node.Descendants("td").FirstOrDefault(w => w.InnerText == "Artist");
    if (artistCell == null)
    {
        return;
    }

    // Iterate row by row. Deciding whether to continue only after a whole row has been read
    // guarantees termination even when a row holds no links, and lets every link of a row be read.
    var row = artistCell.ParentNode;
    while (row != null)
    {
        foreach (var artistNode in row.Descendants("a"))
        {
            var engName = artistNode.InnerText;
            if (novel.Artists.All(w => w.EngName != engName))
            {
                novel.Artists.Add(new Artist()
                {
                    EngName = engName,
                    // A link without a title attribute leaves JapName null instead of throwing.
                    JapName = artistNode.Attributes["title"]?.Value
                });
            }
        }

        // Step over text/comment nodes that may sit between two rows.
        var nextRow = row.NextSibling;
        while (nextRow != null && nextRow.NodeType != HtmlNodeType.Element)
        {
            nextRow = nextRow.NextSibling;
        }

        var isContinuation = nextRow != null
            && nextRow.Name == "tr"
            && String.IsNullOrEmpty(nextRow.FirstChild?.InnerText);
        if (!isContinuation)
        {
            return;
        }

        row = nextRow;
    }
}
/// <summary>
/// Reads the title block of a novel page: English name (first <c>h1</c>), Japanese name
/// (first <c>h2</c>), the cover image and the companies listed under the "Developer" and
/// "Publishers" rows.
/// </summary>
/// <remarks>
/// Every part is optional: a missing heading, cover image or company row is skipped.
/// </remarks>
/// <param name="node">Container node of the title block.</param>
/// <param name="novel">Novel that receives the extracted values.</param>
void ParseMainContent(HtmlNode node, Novel novel)
{
    var engName = node.Descendants("h1").FirstOrDefault();
    if (engName != null)
    {
        novel.EngName = engName.InnerText;
    }

    var japName = node.Descendants("h2").FirstOrDefault();
    if (japName != null)
    {
        novel.JapName = japName.InnerText;
    }

    // The cover is the first <img> inside the element with class "vnimg". Each step may be
    // absent, so the chain yields null instead of throwing.
    var coverContainer = node.Descendants().FirstOrDefault(w => w.HasClass("vnimg"));
    var coverTag = coverContainer?.Descendants("img").FirstOrDefault();
    var coverSource = coverTag?.Attributes["src"]?.Value;
    if (!String.IsNullOrEmpty(coverSource))
    {
        // NOTE(review): the cover is stored in LocalPath while the other images in this file
        // use UrlPath - kept as in the original; confirm this is intended.
        novel.Images.Add(new NovelImage() { LocalPath = coverSource, ImageType = NovelImageType.Title });
    }

    AddCompaniesFromLabelledRow(node, "Developer", novel);
    AddCompaniesFromLabelledRow(node, "Publishers", novel);
}

/// <summary>
/// Adds one company per link found in the sibling that follows the cell whose text equals
/// <paramref name="label"/>; companies whose <c>EngName</c> is already present are skipped.
/// </summary>
private void AddCompaniesFromLabelledRow(HtmlNode node, string label, Novel novel)
{
    var labelCell = node.Descendants("td").FirstOrDefault(w => w.InnerText == label);
    var valueCell = labelCell?.NextSibling;
    if (valueCell == null)
    {
        return;
    }

    foreach (var companyLink in valueCell.Descendants("a"))
    {
        var companyName = companyLink.InnerText;
        if (novel.Companies.All(w => w.EngName != companyName))
        {
            novel.Companies.Add(new Company() { EngName = companyName });
        }
    }
}
/// <summary>
/// Sets <c>novel.ReleaseDate</c> to the earliest date found among the <c>td</c> cells of
/// <paramref name="node"/>.
/// </summary>
/// <remarks>
/// A cell counts as a date when <see cref="DateTime.TryParse(string, out DateTime)"/> accepts its
/// text. If no cell parses, the release date is set to <c>default(DateTime)</c>.
/// </remarks>
/// <param name="node">Releases container to scan.</param>
/// <param name="novel">Novel whose release date is assigned.</param>
void ParseReleasesContent(HtmlNode node, Novel novel)
{
    DateTime earliest = default(DateTime);
    bool anyParsed = false;

    foreach (var cell in node.Descendants("td"))
    {
        DateTime parsed;
        if (!DateTime.TryParse(cell.InnerText, out parsed))
        {
            continue;
        }

        // Keep the first occurrence of the smallest value.
        if (!anyParsed || parsed < earliest)
        {
            earliest = parsed;
            anyParsed = true;
        }
    }

    novel.ReleaseDate = earliest;
}
/// <summary>
/// Adds one image to <c>novel.Images</c> for every link inside <paramref name="node"/> that
/// wraps an <c>img</c> element.
/// </summary>
/// <remarks>
/// The link's <c>href</c> becomes the image URL. Links carrying the class "nsfw" are flagged as
/// adult and typed <c>Event</c>; all others are typed <c>Sample</c>.
/// </remarks>
/// <param name="node">Screenshots container.</param>
/// <param name="novel">Novel that receives the images.</param>
void ParseImagesContent(HtmlNode node, Novel novel)
{
    var thumbnailLinks = node.Descendants("a").Where(link => link.Descendants("img").Any());

    foreach (var link in thumbnailLinks)
    {
        bool nsfw = link.HasClass("nsfw");

        NovelImage screenshot = new NovelImage();
        screenshot.UrlPath = link.Attributes["href"].Value;
        screenshot.IsAdult = nsfw;
        if (nsfw)
        {
            screenshot.ImageType = NovelImageType.Event;
        }
        else
        {
            screenshot.ImageType = NovelImageType.Sample;
        }

        novel.Images.Add(screenshot);
    }
}
/// <summary>
/// Parses every "chardetails" div inside <paramref name="node"/> into a character entry and
/// adds it to <c>novel.CharacterInfos</c>.
/// </summary>
/// <remarks>
/// For each block the first <c>img</c> (if any) becomes the character image, which is also added
/// to <c>novel.Images</c>. Names and sex are read from the first child of the block's first
/// table; measurements are read from the cell following the one whose text is "Measurements"
/// and matched against <c>MeasurePattern</c>. A block that has no table, or whose table has no
/// children, is skipped entirely so that one malformed block does not abort the whole page.
/// </remarks>
/// <param name="node">Container holding the character blocks.</param>
/// <param name="novel">Novel that receives characters and their images.</param>
void ParseCharactersContent(HtmlNode node, Novel novel)
{
    foreach (var characterNode in node.Descendants("div").Where(w => w.HasClass("chardetails")))
    {
        // Resolve the details table before producing any output: without it there is nothing
        // to read a name from, and dereferencing it would throw.
        var detailsTable = characterNode.Descendants("table").FirstOrDefault();
        var headerWithName = detailsTable?.FirstChild;
        if (headerWithName == null)
        {
            continue;
        }

        Character character = new Character();
        NovelCharacterInfo novelCharacterInfo = new NovelCharacterInfo();
        novelCharacterInfo.Character = character;

        var imageNode = characterNode.Descendants("img").FirstOrDefault();
        if (imageNode != null)
        {
            var image = new NovelImage()
            {
                ImageType = NovelImageType.Character,
                UrlPath = imageNode.Attributes["src"].Value
            };
            novelCharacterInfo.Image = image;
            novel.Images.Add(image);
        }

        //Parse Name
        var engNameNode = headerWithName.Descendants("a").FirstOrDefault();
        if (engNameNode != null)
        {
            character.EngName = engNameNode.InnerText;
        }

        var japNameNode = headerWithName.Descendants("b").FirstOrDefault();
        if (japNameNode != null)
        {
            character.JapName = japNameNode.InnerText;
        }

        var sexNode = headerWithName.Descendants("acronym").FirstOrDefault();
        if (sexNode != null)
        {
            character.Sex = GetSex(sexNode.Attributes["title"].Value);
        }

        //Parse Sizes
        var measureRow = detailsTable.Descendants("td").FirstOrDefault(w => w.InnerText == "Measurements");
        var measureNode = measureRow?.NextSibling;
        if (measureNode != null)
        {
            // A Sizes object is attached even when the text does not match the pattern;
            // its members are then left at their defaults.
            var sizes = new Sizes();
            var match = Regex.Match(measureNode.InnerText, MeasurePattern);
            if (match.Success)
            {
                sizes.Bust = match.Groups[1].Value.ToNullableInt();
                sizes.Waist = match.Groups[2].Value.ToNullableInt();
                sizes.Hip = match.Groups[3].Value.ToNullableInt();
            }

            character.Sizes = sizes;
        }

        novel.CharacterInfos.Add(novelCharacterInfo);
    }
}
/// <summary>
/// Loads the page at <paramref name="url"/> and parses it into a <see cref="Novel"/>: title,
/// title image, company, release date, artist, tags, characters and images.
/// </summary>
/// <param name="url">Address of the page to load via <c>HtmlHelper.LoadDocumentWithEncoding</c>.</param>
/// <returns>A new novel populated with everything found on the page.</returns>
/// <exception cref="InvalidOperationException">
/// The page has no table with id "soft_table" (message built from <c>ErrorMessagePattern</c>),
/// or one of the mandatory elements looked up with <c>First()</c> (title, first image,
/// info table, company row, release date row) is missing.
/// </exception>
public Novel ParsePage(string url)
{
    var novel = new Novel();
    var document = HtmlHelper.LoadDocumentWithEncoding(url);

    // FirstOrDefault, not First: First would throw a generic "sequence contains no matching
    // element" error before the descriptive message below could ever be produced.
    var upperContainer = document.DocumentNode.Descendants("table").FirstOrDefault(w => w.HasId("soft_table"));
    if (upperContainer == null)
    {
        throw new InvalidOperationException(String.Format(ErrorMessagePattern, url, "no upper content"));
    }

    var upperHeader = upperContainer.Descendants("tr").First();

    //TITLE
    var titleHeader = upperHeader.Descendants().First(w => w.HasId("soft-title"));
    novel.JapName = titleHeader.InnerText;

    //TITLE IMAGE
    var titleImageNode = upperHeader.Descendants("img").First().ParentNode;
    if (titleImageNode.Name == "a") //if "td" -> no image
    {
        var image = new NovelImage()
        {
            ImageType = NovelImageType.Title,
            UrlPath = titleImageNode.Attributes["href"].Value
        };
        novel.Images.Add(image);
    }

    //INFO TABLE
    var infoTable = upperHeader.NextSibling("tr").Descendants("table").First();

    var companyRow = infoTable.Descendants("td").First(w => CompanyTitles.Any(t => w.InnerText.Contains(t)));
    var companyName = companyRow.NextSibling("td").Descendants("a").First().InnerText;
    novel.Companies.Add(new Company() { JapName = companyName });

    var releaseDateRow = infoTable.Descendants("td").First(w => w.InnerText.Contains(DateTitle));
    novel.ReleaseDate = DateTime.Parse(releaseDateRow.NextSibling.Descendants("a").First().InnerText);

    // Artist and tags rows are optional.
    var artistRow = infoTable.Descendants("td").FirstOrDefault(w => w.InnerText.Contains(ArtistTitle));
    if (artistRow != null)
    {
        var artistName = artistRow.NextSibling("td").Descendants("a").First().InnerText;
        novel.Artists.Add(new Artist() { JapName = artistName });
    }

    var tagsRow = infoTable.Descendants("td").FirstOrDefault(w => w.InnerText.Contains(TagsTitle));
    if (tagsRow != null)
    {
        // Tags come as one string separated by the ideographic comma; GenreLinkText is
        // stripped first and empty fragments are dropped.
        var tagValues = tagsRow.NextSibling("td").InnerText
            .Replace(GenreLinkText, "")
            .Split('、')
            .Where(w => !String.IsNullOrEmpty(w));
        foreach (var tagValue in tagValues)
        {
            novel.Tags.Add(new Tag() { TagType = 1, TagValue = tagValue });
        }
    }

    //CHARACTERS
    // The character table is identified by its width attribute of "96%".
    var characterTable = document.DocumentNode.Descendants("table").FirstOrDefault(w => w.Attr("width") == "96%");
    if (characterTable != null)
    {
        foreach (var characterNode in characterTable.Descendants("tr"))
        {
            ParseCharacterNode(characterNode, novel);
        }
    }

    //IMAGES
    // Each "tabletitle" div whose text contains one of _imageTableHeaders is followed by the
    // div that holds that section's images.
    var storyTableHeaders = document.DocumentNode.Descendants("div")
        .Where(w => w.HasClass("tabletitle") && _imageTableHeaders.Any(header => w.InnerText.Contains(header)));
    foreach (var storyTableHeader in storyTableHeaders)
    {
        ParseImages(storyTableHeader.NextSibling("div"), novel);
    }

    return novel;
}
/// <summary>
/// Adds every <c>img</c> found inside <paramref name="imagesTable"/> to <c>novel.Images</c> as
/// an adult-flagged image of type <c>Event</c>. Does nothing when the container is null.
/// </summary>
/// <param name="imagesTable">Container holding the images; may be null.</param>
/// <param name="novel">Novel that receives the images.</param>
void ParseImages(HtmlNode imagesTable, Novel novel)
{
    if (imagesTable != null)
    {
        var pictures = imagesTable.Descendants("img").Select(tag => new NovelImage()
        {
            ImageType = NovelImageType.Event,
            UrlPath = tag.GetchuImgValue(),
            IsAdult = true
        });

        foreach (var picture in pictures)
        {
            novel.Images.Add(picture);
        }
    }
}
/// <summary>
/// Parses one row of the character table into a character entry and adds it to
/// <c>novel.CharacterInfos</c>.
/// </summary>
/// <remarks>
/// Rows with fewer than two <c>td</c> descendants are treated as borders and ignored. The first
/// cell supplies the portrait, the following <c>td</c> is passed to <c>ParseCharacterInfo</c>,
/// and an optional further <c>td</c> supplies the detailed image. Every image found is also
/// added to <c>novel.Images</c>.
/// </remarks>
/// <param name="characterNode">Table row to parse.</param>
/// <param name="novel">Novel that receives the character and its images.</param>
void ParseCharacterNode(HtmlNode characterNode, Novel novel)
{
    // Border rows carry at most one cell.
    bool hasTwoCells = characterNode.Descendants("td").Skip(1).Any();
    if (!hasTwoCells)
    {
        return;
    }

    var character = new Character();
    var entry = new NovelCharacterInfo();
    entry.Character = character;

    //IMAGE
    var portraitCell = characterNode.Descendants("td").First();
    var portraitTag = portraitCell.Descendants("img").FirstOrDefault();
    if (portraitTag != null)
    {
        var portrait = new NovelImage()
        {
            ImageType = NovelImageType.Character,
            UrlPath = portraitTag.GetchuImgValue()
        };
        entry.Image = portrait;
        novel.Images.Add(portrait);
    }

    //INFO
    var infoCell = portraitCell.NextSibling("td");
    ParseCharacterInfo(infoCell, entry);

    //DETAILED IMAGE
    var detailedCell = infoCell.NextSibling("td");
    var detailedTag = detailedCell == null
        ? null
        : detailedCell.Descendants("img").FirstOrDefault();
    if (detailedTag != null)
    {
        var detailed = new NovelImage()
        {
            ImageType = NovelImageType.DetailedCharacter,
            UrlPath = detailedTag.GetchuImgValue()
        };
        entry.DetailedImage = detailed;
        novel.Images.Add(detailed);
    }

    novel.CharacterInfos.Add(entry);
}