/// <summary>
/// Builds a crawled page for the fake uri with a web request created for that same uri,
/// then creates the unit under test by passing <c>false</c> for both GetInstance arguments.
/// </summary>
public void Setup()
{
    var request = (HttpWebRequest)WebRequest.Create(_uri);
    _crawledPage = new CrawledPage(_uri)
    {
        HttpWebRequest = request
    };

    _unitUnderTest = GetInstance(false, false);
}
/// <summary>
/// Performs a real request against the local fiddler address, then rewrites the resulting
/// page so that it appears to originate from the fake uri before creating the unit under test.
/// </summary>
public void Setup()
{
    var requester = new PageRequester(new CrawlConfiguration());
    var target = new Uri("http://localhost.fiddler:1111/");
    _crawledPage = requester.MakeRequest(target);

    // Make the real request above look like it came from the fake uri
    _crawledPage.ParentUri = _uri;
    _crawledPage.HttpWebRequest = (HttpWebRequest)WebRequest.Create(_uri);

    _unitUnderTest = GetInstance(false, false, null, false, false);
}
/// <summary>
/// An upper-case robots meta tag with CONTENT="NONE" must yield no links when the
/// instance is created with <c>true</c> as its first argument.
/// </summary>
public void GetLinks_MetaNoIndexNoFollowUsingNoneUpperCase_ReturnsEmptyList()
{
    // Arrange
    const string html = "<META NAME=\"ROBOTS\" CONTENT=\"NONE\" /><a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    _unitUnderTest = GetInstance(true, false);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(0, links.Count());
}
/// <summary>
/// A robots meta tag containing only "noindex" must not suppress link extraction:
/// both anchors are expected back.
/// </summary>
public void GetLinks_MetaNoIndex_ReturnsLinks()
{
    // Arrange
    const string html = "<meta name=\"robots\" content=\"noindex\" /><a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    _unitUnderTest = GetInstance(true, false);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(2, links.Count());
}
/// <summary>
/// Anchors carrying rel="nofollow" must be left out when the instance is created
/// with <c>true</c> as its second argument.
/// </summary>
public void GetLinks_RelNoFollow_NotReturned()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" rel=\"nofollow\"></a><a href=\"/bbb/b.html\" rel=\"nofollow\" /></a>";
    _unitUnderTest = GetInstance(false, true, null, false, false);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(0, links.Count());
}
/// <summary>
/// A robots meta tag with content "nofollow" must produce an empty result when the
/// instance is created with <c>true</c> as its first argument.
/// </summary>
public void GetLinks_MetaNoFollow_ReturnsEmptyList()
{
    // Arrange
    const string html = "<meta name=\"robots\" content=\"nofollow\" /><a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    _unitUnderTest = GetInstance(true, false, null, false, false);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(0, links.Count());
}
/// <summary>
/// Creates a crawled page for the fake uri with that uri as parent, a GET request message
/// for it and an empty response message, then creates the unit under test.
/// </summary>
public void Init()
{
    _crawledPage = new CrawledPage(_uri);

    // Request/response objects are attached without any network access.
    _crawledPage.ParentUri = _uri;
    var request = new HttpRequestMessage(HttpMethod.Get, _uri);
    _crawledPage.HttpRequestMessage = request;
    var response = new HttpResponseMessage();
    _crawledPage.HttpResponseMessage = response;

    _unitUnderTest = GetInstance(false, false, null, false, false);
}
/// <summary>
/// Upper-case REL="NOFOLLOW" attributes must be treated like their lower-case form:
/// no links are expected back.
/// </summary>
public void GetLinks_RelNoFollowUpperCase_NotReturned()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" REL=\"NOFOLLOW\"></a><a href=\"/bbb/b.html\" REL=\"NOFOLLOW\" /></a>";
    _unitUnderTest = GetInstance(false, true);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(0, links.Count());
}
/// <summary>
/// An X-Robots-Tag response header of "noindex" must not suppress link extraction:
/// both anchors are expected back.
/// </summary>
public void GetLinks_HttpXRobotsTagHeaderNoIndex_ReturnsLinks()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    _crawledPage.HttpWebResponse.AddResponseHeader("X-Robots-Tag", "noindex");
    _unitUnderTest = GetInstance(false, false, null, false, true);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(2, links.Count());
}
/// <summary>
/// Requests the site simulator base address asynchronously, then rewrites the resulting
/// page so that it appears to originate from the fake uri before creating the unit under test.
/// </summary>
public async Task Setup()
{
    var config = new UnitTestConfig();
    var requester = new PageRequester(new CrawlConfiguration());
    var simulatorUri = new Uri(config.SiteSimulatorBaseAddress);
    _crawledPage = await requester.MakeRequestAsync(simulatorUri);

    // Make the real request above look like it came from the fake uri
    _crawledPage.ParentUri = _uri;
    _crawledPage.HttpRequestMessage = new System.Net.Http.HttpRequestMessage(System.Net.Http.HttpMethod.Get, _uri);

    _unitUnderTest = GetInstance(false, false, null, false, false);
}
/// <summary>
/// When a delegate is supplied as the third GetInstance argument, the returned uris must
/// reflect its rewriting (here: "a" becomes "x" and "b" becomes "y").
/// </summary>
public void GetLinks_CleanUrlDelegateSet_ReturnsCleanLinks()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    string[] expectedUris =
    {
        "http://a.com/xxx/x.html",
        "http://a.com/yyy/y.html"
    };
    _unitUnderTest = GetInstance(false, false, (x) => x.Replace("a", "x").Replace("b", "y"));
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(2, links.Count());
    for (int i = 0; i < expectedUris.Length; i++)
    {
        Assert.AreEqual(expectedUris[i], links.ElementAt(i).AbsoluteUri);
    }
}
/// <summary>
/// With <c>true</c> passed as the fourth GetInstance argument, uris that differ only by
/// their fragment (named anchors / hashbang-style paths) must all be returned, in document order.
/// </summary>
public void GetLinks_NamedAnchorsOrHashbangs_Enabled_ReturnsLinks()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" ></a><a href=\"/aaa/a.html#top\" ></a><a href=\"/aaa/a.html#bottom\" /></a><a href=\"/aaa/a.html/#someaction/someid\" /></a>";
    string[] expectedUris =
    {
        "http://a.com/aaa/a.html",
        "http://a.com/aaa/a.html#top",
        "http://a.com/aaa/a.html#bottom",
        "http://a.com/aaa/a.html/#someaction/someid"
    };
    _unitUnderTest = GetInstance(false, false, null, true, false);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.AreEqual(4, links.Count());
    for (int i = 0; i < expectedUris.Length; i++)
    {
        Assert.AreEqual(expectedUris[i], links.ElementAt(i).AbsoluteUri);
    }
}
/// <summary>
/// An X-Robots-Tag response header of "nofollow" must produce an empty result when the
/// instance is created with <c>true</c> as its fifth argument.
/// </summary>
public void GetLinks_HttpXRobotsNoFollow_ReturnsEmptyList()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    var robotsHeader = new NameValueCollection();
    robotsHeader.Add("X-Robots-Tag", "nofollow");
    _crawledPage.HttpWebResponse.Headers.Add(robotsHeader);
    _unitUnderTest = GetInstance(false, false, null, false, true);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(0, links.Count());
}
/// <summary>
/// An X-Robots-Tag header of "noindex" on the response message must not suppress link
/// extraction: both anchors are expected back.
/// </summary>
public void GetLinks_HttpXRobotsTagHeaderNoIndex_ReturnsLinks()
{
    // Arrange
    const string html = "<a href=\"/aaa/a.html\" ></a><a href=\"/bbb/b.html\" /></a>";
    var headerValues = new List<string>() { "noindex" };
    _crawledPage.HttpResponseMessage.Headers.Add("X-Robots-Tag", headerValues);
    _unitUnderTest = GetInstance(false, false, null, false, true);
    _crawledPage.Content.Text = html;

    // Act
    var links = _unitUnderTest.GetLinks(_crawledPage);

    // Assert
    Assert.IsNotNull(links);
    Assert.AreEqual(2, links.Count());
}
/// <summary>
/// Downloads the page at <c>UserPage</c>, scrapes the profile block and returns it as a
/// <c>NicoNicoUserEntry</c>. <c>Owner.Status</c> shows a progress message while working and
/// is cleared just before returning.
/// </summary>
/// <returns>The entry filled with icon url, name, id, gender, birthday, region, description and page url.</returns>
public NicoNicoUserEntry GetUserInfo()
{
    Owner.Status = "ユーザー情報取得中";

    var entry = new NicoNicoUserEntry();

    // Fetch the html of the user page (blocks on the async call)
    var html = NicoNicoWrapperMain.Session.GetAsync(UserPage).Result;

    // Load the html
    var document = new HtmlDocument();
    document.LoadHtml2(html);

    // User profile nodes
    var detailNode = document.DocumentNode.SelectSingleNode("//div[@class='userDetail']");
    var profileNode = detailNode.SelectSingleNode("child::div[@class='profile']");
    var accountNode = profileNode.SelectSingleNode("child::div[@class='account']");

    var avatarNode = detailNode.SelectSingleNode("child::div[@class='avatar']/img");
    entry.UserIconUrl = avatarNode.Attributes["src"].Value;

    entry.UserName = profileNode.SelectSingleNode("child::h2").InnerText.Trim();
    entry.Id = accountNode.SelectSingleNode("child::p[@class='accountNumber']").InnerText.Trim();
    entry.Gender = accountNode.SelectSingleNode("child::p[2]").InnerText.Trim();
    entry.BirthDay = accountNode.SelectSingleNode("child::p[3]").InnerText.Trim();
    entry.Region = accountNode.SelectSingleNode("child::p[4]").InnerText.Trim();

    // The description node is optional; fall back to an empty string when it is missing
    var descriptionNode = profileNode.SelectSingleNode("child::ul[@class='userDetailComment channel_open_mt0']/li/p/span");
    if (descriptionNode == null)
    {
        entry.Description = "";
    }
    else
    {
        entry.Description = descriptionNode.InnerHtml;
    }

    entry.UserPage = UserPage;

    // Turn URLs inside the description into hyperlinks
    entry.Description = HyperLinkParser.Parse(entry.Description);

    Owner.Status = "";
    return entry;
}
/// <summary>
/// Downloads the page at <c>MylistUrl</c> and extracts the owner nickname, user id, mylist
/// name, description and the preloaded item JSON from the inline script that starts at
/// <c>Jarty.globals(</c>.
/// </summary>
/// <returns>
/// The populated entry, or <c>null</c> when the request times out or the page does not
/// contain the expected script data.
/// </returns>
public NicoNicoPublicMylistEntry GetMylist()
{
    try
    {
        var page = NicoNicoWrapperMain.Session.GetAsync(MylistUrl).Result;
        if (page == null)
        {
            return null;
        }

        // Take everything from the relevant JavaScript call onwards.
        // IndexOf returns -1 when the marker is absent, which would make Substring throw.
        var globalsIndex = page.IndexOf("Jarty.globals(");
        if (globalsIndex < 0)
        {
            return null;
        }
        var globals = page.Substring(globalsIndex);

        // Split into lines
        var lines = globals.Split('\n');

        // Captures the text between double quotes
        var quoted = new Regex("\"(.*)\"");
        // Captures the first run of digits; built once instead of once per line
        var digits = new Regex(@"\d+");

        string nickname = null;
        string userid = null;
        string mylistname = null;
        string description = null;
        string json = null;

        // The fields are expected in a fixed order; each one is only looked for
        // after the previous one has been found.
        foreach (var text in lines)
        {
            // Mylist owner line
            if (text.Contains("mylist_owner:"))
            {
                var nicknameIndex = text.IndexOf("nickname: ");
                if (nicknameIndex < 0)
                {
                    // No nickname on this line; leave it unset rather than throwing
                    continue;
                }

                // Read the owner's nickname from the first capture group
                nickname = quoted.Match(text.Substring(nicknameIndex)).Groups[1].Value;
                continue;
            }

            if (nickname != null && text.Contains("user_id:"))
            {
                userid = digits.Match(text).Value;
                continue;
            }

            if (userid != null && text.Contains("name:"))
            {
                mylistname = quoted.Match(text).Groups[1].Value;
                continue;
            }

            if (mylistname != null && text.Contains("description:"))
            {
                description = quoted.Match(text).Groups[1].Value;
                continue;
            }

            if (description != null && text.Contains("Mylist.preload("))
            {
                // Extract the JSON: everything after the first comma, minus the last two characters
                var commaIndex = text.IndexOf(",");
                var jsonLength = text.Length - commaIndex - 3;
                if (jsonLength >= 0)
                {
                    json = text.Substring(commaIndex + 1, jsonLength);
                }
                break;
            }
        }

        // json is only set after every other field was found, so this single check
        // also guarantees description is non-null below.
        if (json == null)
        {
            return null;
        }

        var ret = new NicoNicoPublicMylistEntry();

        ret.NickName = @"<a href=""http://www.nicovideo.jp/user/" + userid + @""">" + nickname + "</a> さんの公開マイリスト";
        ret.MylistName = mylistname;
        ret.Description = description;

        // Replace escaped \n with a line break and drop escaped \r
        ret.Description = ret.Description.Replace("\\n", "<br>").Replace("\\r", "");
        ret.Description = HyperLinkParser.Parse(ret.Description);

        var list = new List<MylistListEntryViewModel>();
        var data = DynamicJson.Parse(json);
        StoreItem(data, list);
        ret.Data = list;

        return ret;
    }
    catch (RequestTimeout)
    {
        return null;
    }
}
/// <summary>
/// Fetches a video page and parses the JSON embedded in its <c>#watchAPIDataContainer</c>
/// element into a <c>WatchApiData</c>.
/// </summary>
/// <param name="videoPage">Url of the video page to fetch.</param>
/// <returns>
/// The parsed data, or <c>null</c> when the response status is 404 or 503 or the container
/// element is missing. When no video url is present, the result is flagged as a paid video
/// and returned before any cookies are copied.
/// </returns>
public static WatchApiData GetWatchApiData(string videoPage)
{
    // Fetch the html of the video page
    var response = NicoNicoWrapperMain.GetSession().GetResponseAsync(videoPage).Result;

    // Channel / official videos answer with a permanent redirect; follow it once
    if (response.StatusCode == HttpStatusCode.MovedPermanently)
    {
        response = NicoNicoWrapperMain.GetSession().GetResponseAsync(response.Headers.Location.OriginalString).Result;
    }

    // Deleted video
    if (response.StatusCode == HttpStatusCode.NotFound)
    {
        return null;
    }

    // Server busy
    if (response.StatusCode == HttpStatusCode.ServiceUnavailable)
    {
        return null;
    }

    string html = response.Content.ReadAsStringAsync().Result;

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml2(html);

    // Pull just the API data out of the html
    var container = doc.DocumentNode.QuerySelector("#watchAPIDataContainer");
    if (container == null)
    {
        return null;
    }

    var data = container.InnerHtml;

    // Decode html entities
    data = HttpUtility.HtmlDecode(data);

    // Expand the API data as json
    var json = DynamicJson.Parse(data);

    // Result of GetFlv
    string flv = json.flashvars.flvInfo;

    // It is url-encoded twice, so decode twice
    flv = HttpUtility.UrlDecode(flv);
    flv = HttpUtility.UrlDecode(flv);

    WatchApiData ret = new WatchApiData();

    // The pairs are joined with '&'; split them into key/value entries
    var getFlv = flv.Split(new char[] { '&' }).ToDictionary(
        source => source.Substring(0, source.IndexOf('=')),
        source => Uri.UnescapeDataString(source.Substring(source.IndexOf('=') + 1)));

    ret.GetFlv = new NicoNicoGetFlvData(getFlv);

    // Video information
    var videoDetail = json.videoDetail;

    //--- fill in the fields ---
    ret.Cmsid = videoDetail.id;
    ret.MovieType = json.flashvars.movie_type;
    ret.Title = HttpUtility.HtmlDecode(videoDetail.title); // decode html entities
    ret.Thumbnail = videoDetail.thumbnail;
    ret.Description = videoDetail.description;
    ret.PostedAt = videoDetail.postedAt;
    ret.Length = (int)videoDetail.length;
    ret.ViewCounter = (int)videoDetail.viewCount;
    ret.CommentCounter = (int)videoDetail.commentCount;
    ret.MylistCounter = (int)videoDetail.mylistCount;
    ret.YesterdayRank = videoDetail.yesterday_rank == null ? "圏外" : videoDetail.yesterday_rank + "位";
    ret.HighestRank = videoDetail.highest_rank == null ? "圏外" : videoDetail.highest_rank + "位";
    ret.Token = json.flashvars.csrfToken;

    if (json.uploaderInfo())
    {
        // Uploader information (user upload)
        var uploaderInfo = json.uploaderInfo;
        ret.UploaderId = uploaderInfo.id;
        ret.UploaderIconUrl = uploaderInfo.icon_url;
        ret.UploaderName = uploaderInfo.nickname;
        ret.UploaderIsFavorited = uploaderInfo.is_favorited;
    }
    else if (json.channelInfo())
    {
        // Uploader information (channel upload)
        var channelInfo = json.channelInfo;
        ret.UploaderId = channelInfo.id;
        ret.UploaderIconUrl = channelInfo.icon_url;
        ret.UploaderName = channelInfo.name;
        ret.UploaderIsFavorited = channelInfo.is_favorited == 1 ? true : false;
        ret.IsChannelVideo = true;
    }

    ret.Description = HyperLinkParser.Parse(ret.Description);

    ret.TagList = new ObservableCollection<VideoTagViewModel>();

    foreach (var tag in videoDetail.tagList)
    {
        NicoNicoTag entry = new NicoNicoTag()
        {
            Id = tag.id,
            Tag = tag.tag,
            Dic = tag.dic(),
            Lck = tag.lck == "1" ? true : false,
            Cat = tag.cat()
        };

        ret.TagList.Add(new VideoTagViewModel(entry));
    }
    //------

    // Paid video: no playable url available
    if (ret.GetFlv.VideoUrl == null || ret.GetFlv.VideoUrl.Count() == 0)
    {
        ret.IsPaidVideo = true;
        return ret;
    }

    // Copy cookies from the response, just in case.
    // GetValues throws when the header is absent, so only read it when it is present.
    if (response.Headers.Contains("Set-Cookie"))
    {
        foreach (string s in response.Headers.GetValues("Set-Cookie"))
        {
            foreach (string ss in s.Split(';'))
            {
                App.SetCookie(new Uri("http://nicovideo.jp/"), ss);
            }
        }
    }

    return ret;
}
/// <summary>
/// Creates a finder that uses a new <c>AngleSharpHyperlinkParser</c> as its link parser.
/// </summary>
public SiteMapFinder()
{
    var parser = new AngleSharpHyperlinkParser();
    _linkParser = parser;
}
/// <summary>
/// Downloads the "/mylist" page below <c>UserPage</c> and scrapes every mylist block into a
/// <c>NicoNicoUserMylistEntry</c> (url, name, description and up to three thumbnails).
/// <c>Owner.Status</c> shows a progress message while working.
/// </summary>
/// <returns>
/// The scraped entries; <c>null</c> when the page contains no mylist blocks or when the
/// request times out (in which case <c>Owner.Status</c> is set to a failure message).
/// </returns>
public List<NicoNicoUserMylistEntry> GetUserMylist()
{
    var url = UserPage + "/mylist";
    Owner.Status = "ユーザーマイリスト取得中";

    List<NicoNicoUserMylistEntry> ret = new List<NicoNicoUserMylistEntry>();

    try
    {
        var html = NicoNicoWrapperMain.Session.GetAsync(url).Result;

        var doc = new HtmlDocument();
        doc.LoadHtml2(html);

        var content = doc.DocumentNode.SelectSingleNode("//div[@class='content']");

        // SelectSingleNode yields null when nothing matches; treat a missing content
        // container the same way as a page without mylist blocks.
        var outers = content == null
            ? null
            : content.SelectNodes("child::div[@class='articleBody']/div[@class='outer']");

        // Nothing to scrape
        if (outers == null)
        {
            Owner.Status = "";
            return null;
        }

        // Walk each mylist block
        foreach (var node in outers)
        {
            NicoNicoUserMylistEntry entry = new NicoNicoUserMylistEntry();

            // The link inside the h4 heading carries both the url and the name
            var h4 = node.SelectSingleNode("child::div/h4");
            var anchor = h4.SelectSingleNode("child::a");

            entry.Url = "http://www.nicovideo.jp/" + anchor.Attributes["href"].Value;

            // Name
            entry.Name = anchor.InnerText.Trim();

            // Description (optional)
            var desc = node.SelectSingleNode("child::div/p[@data-nico-mylist-desc-full='true']");
            entry.Description = desc == null ? "" : desc.InnerText.Trim();
            entry.Description = HyperLinkParser.Parse(entry.Description);

            // Thumbnails: a later thumbnail is only read when the previous one exists
            var thumb1 = node.SelectSingleNode("child::div/ul/li[1]/img");
            var thumb2 = node.SelectSingleNode("child::div/ul/li[2]/img");
            var thumb3 = node.SelectSingleNode("child::div/ul/li[3]/img");

            if (thumb1 != null)
            {
                entry.ThumbNail1Available = true;
                entry.ThumbNail1Url = thumb1.Attributes["src"].Value;
                entry.ThumbNail1ToolTip = HttpUtility.HtmlDecode(thumb1.Attributes["alt"].Value);

                if (thumb2 != null)
                {
                    entry.ThumbNail2Available = true;
                    entry.ThumbNail2Url = thumb2.Attributes["src"].Value;
                    entry.ThumbNail2ToolTip = HttpUtility.HtmlDecode(thumb2.Attributes["alt"].Value);

                    if (thumb3 != null)
                    {
                        entry.ThumbNail3Available = true;
                        entry.ThumbNail3Url = thumb3.Attributes["src"].Value;
                        entry.ThumbNail3ToolTip = HttpUtility.HtmlDecode(thumb3.Attributes["alt"].Value);
                    }
                }
            }

            ret.Add(entry);
        }

        Owner.Status = "";
        return ret;
    }
    catch (RequestTimeout)
    {
        Owner.Status = "ユーザーマイリストの取得に失敗しました";
        return null;
    }
}