Beispiel #1
0
        /// <summary>
        /// Returns the news item for <paramref name="url"/>. Already parsed items (the parser pool,
        /// then <c>InstantCache</c>) are preferred; otherwise the page is downloaded, parsed and
        /// added to <c>InstantCache</c>.
        /// </summary>
        /// <param name="url">
        /// Address of the article page. Its last path segment (including the leading '/') is the
        /// key used to find cached items; trailing slashes are ignored.
        /// </param>
        /// <param name="html">
        /// <c>true</c> to return the article HTML in <c>Detailed.ContentHTML</c>; <c>false</c> to
        /// overwrite that property with the result of <c>GetText()</c>.
        /// </param>
        /// <returns>
        /// A deep copy of the item with <c>InstantState.FromCache</c> (cache hit) or
        /// <c>InstantState.Success</c> (freshly downloaded). The item is <c>null</c> with
        /// <c>InstantState.TimedOut</c> when no successful response with content was received, and
        /// with <c>InstantState.ErrorParsing</c> when the cache lookup or the parsing threw
        /// (this includes a <c>null</c> <paramref name="url"/>).
        /// </returns>
        public async Task <Tuple <NewsItem, InstantState> > ParsePageInstant(string url, bool html)
        {
            // Builds the cache-hit result: callers get a copy so the cached instance is never mutated.
            // The content is taken from the cached source item, not from the copy.
            Tuple <NewsItem, InstantState> FromCached(NewsItem source)
            {
                var copy = Parser.DeepCopy(source);
                copy.Detailed.ContentHTML = html ? source.Detailed.ContentHTML : source.GetText();
                return new Tuple <NewsItem, InstantState>(copy, InstantState.FromCache);
            }

            try
            {
                if (InstantCache == null)
                {
                    InstantCache = new List <NewsItem>();
                }

                // Drop trailing slashes first: otherwise the key would be just "/" and match every URL.
                var trimmed = url.TrimEnd('/');
                var slashAt = trimmed.LastIndexOf('/');
                var key     = slashAt >= 0 ? trimmed.Substring(slashAt) : trimmed;

                // An empty key would match everything, so skip the caches and go to the network.
                if (key.Length > 0)
                {
                    // The parser pool is consulted regardless of whether InstantCache has entries.
                    foreach (var parser in POOL.Values)
                    {
                        var pooled = parser.Newslist.Where(x => x.Url.Contains(key)).ToArray();
                        // Only an unambiguous (single) match counts as a hit.
                        if (pooled.Length == 1)
                        {
                            return FromCached(pooled[0]);
                        }
                    }

                    var instant = InstantCache.Where(x => x.Url.Contains(key)).ToArray();
                    if (instant.Length == 1)
                    {
                        return FromCached(instant[0]);
                    }
                }
            }
            catch
            {
                return new Tuple <NewsItem, InstantState>(null, InstantState.ErrorParsing);
            }

            try
            {
                HttpResponseMessage rm = await RequestAllocator.Instance.UsingPool(new Request(url));

                if (rm == null || !rm.IsSuccessStatusCode || rm.Content == null)
                {
                    return new Tuple <NewsItem, InstantState>(null, InstantState.TimedOut);
                }

                var doc = new HtmlDocument();
                doc.LoadHtml(await rm.Content.ReadAsStringAsync());

                // Header data comes from the first <article>; a missing h1/time/img element throws
                // and is reported as ErrorParsing by the catch below.
                var article = doc.DocumentNode.Descendants("article").First();
                var imgPath = article.Element("img").GetAttributeValue("src", "");
                var item    = new NewsItem
                {
                    Title    = article.Element("h1").InnerText,
                    Date     = article.Element("time").InnerText,
                    // Skip the first character of src before appending it to the host URL.
                    ImageURL = App.Get.Config.HostUrl + imgPath.Substring(1),
                    Url      = url
                };

                var detailedNode = doc.DocumentNode.Descendants()
                                   .First(x => x.Name == "article" && x.HasClass("item-detailed"));
                NewsItemDetailed.ParseArticle(item, detailedNode);
                InstantCache.Add(item);

                var ret = Parser.DeepCopy(item);
                if (!html)
                {
                    ret.Detailed.ContentHTML = ret.GetText();
                }
                return new Tuple <NewsItem, InstantState>(ret, InstantState.Success);
            }
            catch (Exception)
            {
                return new Tuple <NewsItem, InstantState>(null, InstantState.ErrorParsing);
            }
        }
        /// <summary>
        /// Fills <c>item.Detailed</c> from the article node of a news page: attached documents,
        /// gallery image links, the related-article link and the cleaned article HTML.
        /// </summary>
        /// <param name="item">
        /// Item to populate. <c>Detailed</c> is always replaced; <c>Date</c> and <c>ImageURL</c>
        /// are only filled when <c>Date</c> is empty.
        /// </param>
        /// <param name="artc">Article node to read from.</param>
        /// <remarks>
        /// Best effort: any exception is reported through <c>App.Get.Core.Logger.NotifyError</c>
        /// and swallowed, so the item may be left partially populated.
        /// </remarks>
        public static void ParseArticle(NewsItem item, HtmlNode artc)
        {
            try
            {
                item.Detailed = new NewsItemDetailed();
                if (string.IsNullOrEmpty(item.Date))
                {
                    item.Date = artc.Descendants("time").First().InnerText;
                    var img = artc.Descendants("img").First().GetAttributeValue("src", "");
                    // Skip the first character of src before appending it to the host URL.
                    item.ImageURL = App.Get.Config.HostUrl + img.Substring(1);
                }

                #region Docs
                // Collected up front: the same nodes are detached from the text further down.
                var htmlNodes = artc.Descendants()
                                .Where(x => x.HasAttributes && x.GetAttributeValue("class", "null").Contains("file-desc"))
                                .ToArray();
                if (htmlNodes.Length > 0)
                {
                    item.Detailed.DocsLinks = new List<DocItem>();
                    foreach (var doc in htmlNodes)
                    {
                        // The link data is read from the first child of the description's next sibling.
                        var holder = doc.NextSibling.ChildNodes[0];
                        var href   = holder.GetAttributeValue("data-href", "");
                        var type   = holder.GetClasses().First(x => x != "img" && x != "ib");
                        item.Detailed.DocsLinks.Add(new DocItem(doc.InnerText, WebUtility.UrlEncode(href), type));
                    }
                }
                #endregion

                #region Images
                var galleries = artc.Descendants()
                                .Where(x => x.Name == "div" && x.HasClass("s1") && x.GetAttributeValue("role", "") == "marquee")
                                .ToArray();
                if (galleries.Length > 0)
                {
                    // Built once per call instead of once per image.
                    var sizeSegment = new Regex(@"(?<=photo.).*(?=\/)");

                    item.Detailed.ImagesLinks = new List<string>();
                    foreach (var gallery in galleries)
                    {
                        var box = gallery.ChildNodes.Single(x => x.HasClass("box"));
                        foreach (var slide in box.FirstChild.ChildNodes)
                        {
                            string uri = slide.FirstChild.Attributes["src"].Value;

                            // Replace the segment matched after "photo?" with "0". string.Replace throws
                            // on an empty search value, so a URI without that segment is kept unchanged
                            // instead of aborting the whole article.
                            var segment = sizeSegment.Match(uri).Value;
                            if (segment.Length > 0)
                            {
                                uri = uri.Replace(segment, "0");
                            }

                            item.Detailed.ImagesLinks.Add(App.Get.Config.HostUrl + uri.Substring(1));
                        }
                    }
                }
                #endregion

                #region RelatedLink
                // First node (document order) whose text contains the "read also" marker.
                var related = artc.Descendants().FirstOrDefault(x => x.InnerText.Contains("Читайте також"));
                if (related != null)
                {
                    var anchors = related.Descendants("a").ToArray();
                    if (anchors.Length > 0 && anchors[0].HasAttributes)
                    {
                        // The link is taken from the last anchor; skip it when that anchor has no href.
                        var href = anchors[anchors.Length - 1].Attributes["href"];
                        if (href != null)
                        {
                            item.RelUrl = WebUtility.UrlEncode(App.Get.Config.HostUrl + "/" + href.Value);
                        }
                    }
                }
                #endregion

                #region Text
                var text = artc.Descendants().First(x => x.HasAttributes && x.GetAttributeValue("id", "").Contains("item-desc"));

                // Gallery containers, "back" elements and photo blocks are not part of the readable text.
                var extras = text.Descendants().Where(x => x.InnerHtml.Contains("id=\"gallery") ||
                                                           x.HasClass("back") ||
                                                           x.HasAttributes && x.GetAttributeValue("role", "") == "photo").ToArray();
                foreach (var extra in extras)
                {
                    // Only direct children of the text node are removed.
                    if (text.ChildNodes.Contains(extra))
                    {
                        text.RemoveChild(extra);
                    }
                }

                // Drop the attached-documents markup: the enclosing table when the first description
                // sits in a row, otherwise just the contents of its parent.
                if (htmlNodes.Length > 0)
                {
                    var container = htmlNodes[0].ParentNode;
                    if (container.Name == "tr")
                    {
                        container.ParentNode.Remove();
                    }
                    else
                    {
                        container.RemoveAllChildren();
                    }
                }

                item.Detailed.ContentHTML = text.OuterHtml.Replace("%22", "%5C%22");
                #endregion
            }
            catch (Exception ex)
            {
                App.Get.Core.Logger.NotifyError(LogArea.Other, ex);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Runs a site search and returns the matching news items as a JSON string.
        /// </summary>
        /// <remarks>
        /// Expects the request parameter <c>query</c> in the form <c>"count,text"</c>, where
        /// <c>count</c> (at most 1000) limits the number of returned items. Only the first comma
        /// separates the two parts, so the search text may itself contain commas. Items already
        /// present in the parser pool or in <c>InstantCache</c> are reused; unknown items are
        /// created from the search result, processed in the background and added to
        /// <c>InstantCache</c>.
        /// </remarks>
        /// <returns>
        /// The serialized <c>ResponseTyper</c> result: the items with <c>InstantState.FromCache</c>
        /// when every item was already known or <c>InstantState.Success</c> otherwise; or an error
        /// response (together with any items collected so far when the failure happened mid-search).
        /// </returns>
        public async Task <string> SearchNewsAsync()
        {
            var query = Info.Query;

            // Looks a link up among already parsed items: parser pool first, then InstantCache.
            // A source only counts as a hit when it holds exactly one item with that URL.
            NewsItem FindKnown(string link)
            {
                foreach (var parser in ParserPool.Current.POOL.Values)
                {
                    var pooled = parser.Newslist.Where(x => x.Url == link).ToArray();
                    if (pooled.Length == 1)
                    {
                        return pooled[0];
                    }
                }

                if (pool.InstantCache != null)
                {
                    var instant = pool.InstantCache.Where(x => x.Url == link).ToArray();
                    if (instant.Length == 1)
                    {
                        return instant[0];
                    }
                }

                return null;
            }

            try
            {
                if (!query.HasKey("query"))
                {
                    return JsonConvert.SerializeObject(ResponseTyper(new InvalidOperationException("InvalidKey: query expected")));
                }

                string qpar    = query["query"];
                var    news    = new List <NewsItem>();
                var    pending = new List <Task>();
                try
                {
                    // Split on the first comma only so that commas inside the search text survive.
                    var parts = qpar.Split(new[] { ',' }, 2);
                    if (parts.Length < 2)
                    {
                        throw new FormatException("Expected count parameter. Search results may be very huge");
                    }

                    int count = int.Parse(parts[0]);
                    if (count > 1000)
                    {
                        throw new FormatException("Freak: server might fall");
                    }
                    qpar = parts[1];

                    // NOTE(review): the text is inserted into the URL without escaping; confirm whether
                    // Info.Query values are still percent-encoded before adding encoding here.
                    var rm = await RequestAllocator.Instance.UsingPool(new Request("http://nuwm.edu.ua/search?text=" + qpar.Replace(' ', '+')));
                    if (rm == null || rm.Content == null)
                    {
                        throw new InvalidDataException("Search request returned no content");
                    }

                    var doc = new HtmlDocument();
                    using (var stream = await rm.Content.ReadAsStreamAsync())
                    {
                        doc.Load(stream);
                    }

                    var resultsNode = doc.DocumentNode.Descendants().FirstOrDefault(x =>
                                                                                    x.Name == "div" &&
                                                                                    x.HasClass("news") && x.HasClass("search") &&
                                                                                    x.GetAttributeValue("role", "") == "group");
                    if (resultsNode == null)
                    {
                        throw new InvalidDataException("Not found");
                    }

                    // Stays true while every returned item came from a cache.
                    bool allCached = true;
                    foreach (var article in resultsNode.Elements("article"))
                    {
                        var button = article.Descendants("a").FirstOrDefault(x => x.HasClass("btn") && x.HasClass("s2"));
                        var link   = button?.GetAttributeValue("href", "");

                        // Only results pointing at news pages are of interest.
                        if (link != null && link.Contains("/news"))
                        {
                            var known = FindKnown(link);
                            if (known != null)
                            {
                                // Added once, even when both the pool and InstantCache know the link.
                                news.Add(known);
                            }
                            else
                            {
                                var fresh = new NewsItem
                                {
                                    Excerpt = article.Descendants("p").First().InnerText,
                                    Title   = article.Descendants("a").First(x => x.HasClass("name")).InnerText,
                                    Url     = link
                                };

                                pending.Add(Task.Run(() => NewsItemDetailed.Process(fresh)));
                                news.Add(fresh);
                                if (pool.InstantCache == null)
                                {
                                    pool.InstantCache = new List <NewsItem>();
                                }
                                allCached = false;
                                pool.InstantCache.Add(fresh);
                            }
                        }

                        // ">=" so the limit cannot be stepped over.
                        if (news.Count >= count)
                        {
                            break;
                        }
                    }

                    await Task.WhenAll(pending);

                    if (news.Count == 0)
                    {
                        return JsonConvert.SerializeObject(ResponseTyper(new InvalidDataException("Not Found")));
                    }

                    return JsonConvert.SerializeObject(ResponseTyper(null, news, (allCached ? InstantState.FromCache : InstantState.Success)));
                }
                catch (Exception ex)
                {
                    // Report the failure together with whatever was collected before it.
                    return JsonConvert.SerializeObject(ResponseTyper(ex, news));
                }
            }
            catch (Exception ex)
            {
                return JsonConvert.SerializeObject(ResponseTyper(ex));
            }
        }