/// <summary>
/// Walks <paramref name="songNodes"/> in order and appends one <see cref="Track"/> to
/// <paramref name="tracklist"/> for every entry the cleaner does not reject.
/// </summary>
/// <param name="clean">Cleaner used to reject non-artist entries and to clean the artist text.</param>
/// <param name="songNodes">Nodes that are each searched for an "artist" and a "song" div.</param>
/// <param name="tracklist">Destination list; it is appended to in place.</param>
/// <exception cref="NodeNotFoundException">
/// A node has no artist div or no song div below it. Both lookups are validated before the
/// cleaner is consulted, so a rejected artist with a missing title still throws.
/// </exception>
private static void ExtractTracks(ICleaner clean, HtmlNodeCollection songNodes, List<Track> tracklist)
{
    // Tracked by hand because the error messages report the position of the failing node.
    int position = 0;

    foreach (HtmlNode songNode in songNodes)
    {
        HtmlNode artist = songNode.SelectSingleNode(".//div[@class='artist']");
        HtmlNode title = songNode.SelectSingleNode(".//div[@class='song']");

        if (artist == null)
        {
            throw new NodeNotFoundException($"Couldn't find song's author node with the given XPath at index: {position}!");
        }

        if (title == null)
        {
            throw new NodeNotFoundException($"Couldn't find song's title node with the given XPath at index: {position}!");
        }

        string rawArtist = artist.InnerText;

        // Entries the cleaner flags as "not an artist" are skipped silently.
        if (!clean.IsNotArtist(rawArtist))
        {
            tracklist.Add(new Track(clean.ArtistName(rawArtist), title.InnerText));
        }

        position++;
    }
}
/// <summary>
/// Extract the <see cref="Music"/> entries from the HTML.
/// </summary>
/// <remarks>
/// Only the text between the first "TRACKLISTA" marker and the closing div sequence that
/// follows it is processed. Anchors are unwrapped, every remaining tag becomes a line break,
/// and each resulting line is cleaned and added as one <see cref="Music"/>.
/// </remarks>
/// <param name="sourceCode">Non null string.</param>
/// <param name="clean">Non null custom cleaning object.</param>
/// <returns>Non null <see cref="Music"/> collection.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="sourceCode"/> or <paramref name="clean"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The start marker or the end marker cannot be found in <paramref name="sourceCode"/>.
/// </exception>
internal static List<Music> Process(string sourceCode, ICleaner clean)
{
    if (sourceCode == null)
    {
        throw new ArgumentNullException(nameof(sourceCode));
    }

    if (clean == null)
    {
        throw new ArgumentNullException(nameof(clean));
    }

    // TODO: HtmlAgilityPack
    const string startMarker = "TRACKLISTA";
    const string endMarker = "</div></div></div> </div>";

    // Ordinal search: the markers are literal markup, not linguistic text, so a
    // culture-sensitive IndexOf could miss or mis-place them.
    int startIndex = sourceCode.IndexOf(startMarker, StringComparison.Ordinal);
    if (startIndex < 0)
    {
        // Same exception type Substring(-1) used to raise, with a usable message.
        throw new ArgumentOutOfRangeException(
            nameof(sourceCode),
            $"The start marker \"{startMarker}\" was not found in the source.");
    }

    sourceCode = sourceCode.Substring(startIndex);

    int endIndex = sourceCode.IndexOf(endMarker, StringComparison.Ordinal);
    if (endIndex < 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(sourceCode),
            $"The end marker \"{endMarker}\" was not found after the start marker.");
    }

    sourceCode = sourceCode.Substring(0, endIndex);

    // [FIX] <a href="google.hu">Click here</a> to Click here
    // NOTE(review): "(.*)" is greedy, so several anchors on one physical line are matched as a
    // single span and the inner tags are only split by the generic tag replacement below.
    // Confirm whether that is intended before switching to a lazy quantifier.
    sourceCode = Regex.Replace(sourceCode, @"<a\s*[^>]*><u>(.*)</u><\/a>", "$1");
    sourceCode = Regex.Replace(sourceCode, @"<a\s*[^>]*>(.*)<\/a>", "$1");

    // Every remaining tag becomes a line break, then runs of line breaks are collapsed.
    sourceCode = Regex.Replace(sourceCode, @"<[^>]*>", "\n");
    sourceCode = Regex.Replace(sourceCode, @"[\r\n]+", "\n");

    string[] lines = sourceCode.Split(new string[] { "\n" }, StringSplitOptions.None);
    List<Music> tracklist = new List<Music>();

    foreach (string line in lines)
    {
        // The raw line is checked before any normalisation.
        if (line.Length < 3 || clean.IsNotArtist(line))
        {
            continue;
        }

        // [FIX] Multiple spaces replace single space
        string musicName = Regex.Replace(line, @"\s+", " ").Trim();
        musicName = musicName.Replace("MK DJ", "")
                             .Replace("() ", "")
                             .Replace("( ", "(")
                             .Replace(" )", ")");
        musicName = clean.MusicPrefixes(musicName);
        musicName = clean.MusicPostfixes(musicName);
        musicName = HtmlEntity.DeEntitize(musicName);

        // Cleaning may have shortened the text below the minimum length.
        if (musicName.Length < 3)
        {
            continue;
        }

        tracklist.Add(new Music(musicName));
    }

    return tracklist;
}
/// <summary>
/// Asserts that <c>_clean.IsNotArtist</c> returns true for the supplied name.
/// </summary>
/// <param name="artistName">Text passed to <c>IsNotArtist</c>.</param>
public void IsNotArtist_Should_Be_True(string artistName)
{
    // Act
    var isRejected = _clean.IsNotArtist(artistName);

    // Assert
    Assert.IsTrue(isRejected);
}