Ejemplo n.º 1
0
        /// <summary>
        /// Checks <c>WebpageHelper.GetSubUrl</c> on URLs that contain slashes: a host-only URL
        /// is expected to yield the host, and a URL with a path is expected to yield its last segment.
        /// </summary>
        public void GetUrlTestWithSlash()
        {
            const string hostOnlyUrl = @"http://www.baidu.com";
            const string nestedPathUrl = @"http://www.baidu.com/newsdetail/10204214.html";

            // Evaluate both inputs up front, then verify the results.
            var hostResult = WebpageHelper.GetSubUrl(hostOnlyUrl);
            var lastSegmentResult = WebpageHelper.GetSubUrl(nestedPathUrl);

            Assert.AreEqual("www.baidu.com", hostResult);
            Assert.AreEqual("10204214.html", lastSegmentResult);
            Assert.Pass();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads a remote HTML page, hands it to <c>WebPageSaver.SaveHtml</c>, and asserts that a
        /// file named after <c>WebpageHelper.GetSubUrl</c> of the URL exists afterwards.
        /// </summary>
        public void TryToSaveHtmlFile()
        {
            const string pageUrl = "http://billie66.github.io/TLCL/book/chap01.html";

            var loader = new HtmlWeb();
            var document = loader.Load(pageUrl);
            WebPageSaver.SaveHtml(pageUrl, document);

            // The existence check uses the name GetSubUrl derives from the URL.
            var expectedFileName = WebpageHelper.GetSubUrl(pageUrl);
            Assert.IsTrue(File.Exists(expectedFileName));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads the people main page, pairs each image tab with the content tab at the same index,
        /// posts every extracted image URL to <paramref name="imageTargetBlock"/>, and passes the
        /// collected entries (as a "table" array) to <c>WebpageHelper.TryToInsertOrUpdateABson</c>.
        /// </summary>
        /// <param name="imageTargetBlock">Block that receives each extracted image URL.</param>
        private static void GetPeopleMainPage(BufferBlock <string> imageTargetBlock)
        {
            var doc  = WebpageHelper.GetHttpRequestDocument(MAIN_PAGE);
            var bson = new BsonDocument();
            // Get the image and table content
            var tableImageNodes   = doc.DocumentNode.SelectNodes("//div[@class='tab-cont']/div");
            var tableContentNodes = doc.DocumentNode.SelectNodes("//div[@class='tab-track justify']/div");

            // Both lists must exist and line up one-to-one, because entries are paired by index.
            // (The original checked tableContentNodes twice and never checked tableImageNodes.)
            if (tableImageNodes == null || tableContentNodes == null || tableImageNodes.Count != tableContentNodes.Count)
            {
                return;
            }
            var bsonArray = new BsonArray();

            for (int i = 0; i < tableImageNodes.Count; i++)
            {
                var contentNode = tableContentNodes[i];
                var linkNode    = contentNode.SelectSingleNode(".//a");
                if (linkNode == null)
                {
                    continue;
                }

                // Skip entries lacking the attributes we read, the same way entries without a link are skipped.
                var styleAttribute = tableImageNodes[i].Attributes["style"];
                var hrefAttribute  = linkNode.Attributes["href"];
                if (styleAttribute == null || hrefAttribute == null)
                {
                    continue;
                }

                // The image URL is the text from the first '/' up to (not including) the first ')'.
                var styleText  = styleAttribute.Value;
                var slashIndex = styleText.IndexOf("/", StringComparison.Ordinal);
                var parenIndex = styleText.IndexOf(")", StringComparison.Ordinal);
                if (slashIndex < 0 || parenIndex <= slashIndex)
                {
                    // Malformed style value: no URL can be extracted from it.
                    continue;
                }
                var imageUrl = styleText.Substring(slashIndex, parenIndex - slashIndex);

                imageTargetBlock.Post(imageUrl);

                var tableBson = new BsonDocument();
                tableBson.Add("img", WebpageHelper.GetSubUrl(imageUrl));
                tableBson.Add("desc", contentNode.InnerText);
                tableBson.Add("link", hrefAttribute.Value);
                bsonArray.Add(tableBson);
            }
            bson.Add("table", bsonArray);
            var success = WebpageHelper.TryToInsertOrUpdateABson(bson, MongodbMain.PeopleMainPage);

            if (success)
            {
                Console.WriteLine("People page Insert or update success");
            }
            else
            {
                Console.WriteLine("Duplicated information in people page");
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Collects banner anchors (class "slick-link p-show") from the main page for which
        /// <c>MongodbChecker.CheckMainNewsList</c> returns false, posts each banner's original image
        /// path to <paramref name="imageTargetBlock"/>, saves the resulting documents through
        /// <c>MongodbSaver.SaveMainpageNewsList</c>, and prints each link/image pair.
        /// </summary>
        /// <param name="imageTargetBlock">Block that receives each banner's original image path.</param>
        private static void GetBanner(BufferBlock <string> imageTargetBlock)
        {
            var doc = WebpageHelper.GetHttpRequestDocument(MainPage);

            // Materialize the query once. The original enumerated the deferred query three times
            // (post, ToArray, save), re-running CheckMainNewsList and rebuilding the documents on
            // every pass, so the posted, saved and printed sets were not guaranteed to match.
            var banners = (from links in doc.DocumentNode.Descendants()
                           where
                           links.Name == "a" &&
                           links.Attributes["href"] != null &&
                           links.Attributes["class"] != null &&
                           links.Attributes["class"].Value.Equals("slick-link p-show") &&
                           links.Attributes["style"] != null &&
                           !MongodbChecker.CheckMainNewsList(links.Attributes["href"].Value)
                           let styleText  = links.Attributes["style"].Value
                           let slashIndex = styleText.IndexOf("/", StringComparison.Ordinal)
                           let parenIndex = styleText.LastIndexOf(")", StringComparison.Ordinal)
                           // Skip style values from which no "/...)" span can be cut.
                           where slashIndex >= 0 && parenIndex > slashIndex
                           // The image path runs from the first '/' up to (not including) the last ')'.
                           let originImage = styleText.Substring(slashIndex, parenIndex - slashIndex)
                           let imageName   = WebpageHelper.GetSubUrl(originImage)
                           select new BsonDocument()
                           .Add("link", links.Attributes["href"].Value)
                           .Add("img", imageName)
                           .Add("compressImg", WebImageSaver.Instance.GetComressImageName(imageName))
                           .Add("originImage", originImage)).ToArray();

            if (banners.Length == 0)
            {
                return;
            }

            foreach (var banner in banners)
            {
                imageTargetBlock.Post(banner["originImage"].AsBsonValue.ToString());
            }

            // Save exactly the documents whose images were posted above.
            MongodbSaver.SaveMainpageNewsList(banners);

            foreach (var banner in banners)
            {
                Console.WriteLine(banner["link"].AsBsonValue + " " + banner["img"].AsBsonValue);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Checks that <c>WebpageHelper.GetSubUrl</c> returns an input containing no slash unchanged.
        /// </summary>
        public void GetUrlTestWithoutSlash()
        {
            const string plainFileName = "asdsads.txt";

            var actual = WebpageHelper.GetSubUrl(plainFileName);

            Assert.AreEqual(plainFileName, actual);
        }