/// <summary>
/// Resolves a single news article by URL, preferring already parsed items over a network fetch.
/// </summary>
/// <remarks>
/// When <c>InstantCache</c> is non-empty, every list in <c>POOL</c> and then <c>InstantCache</c>
/// itself is searched for exactly one item whose URL contains the last path segment of
/// <paramref name="url"/>. If nothing is found the page is downloaded, parsed and appended to
/// <c>InstantCache</c>. The caller always receives a deep copy of the stored item.
/// </remarks>
/// <param name="url">Address of the article page. Trailing slashes are ignored for cache matching.</param>
/// <param name="html">
/// <c>true</c> to return the item's <c>ContentHTML</c> as stored; <c>false</c> to overwrite
/// <c>ContentHTML</c> of the returned copy with the result of <c>GetText()</c>.
/// </param>
/// <returns>
/// The item and <c>InstantState.FromCache</c> (served from memory) or <c>InstantState.Success</c>
/// (downloaded); <c>null</c> with <c>InstantState.TimedOut</c> when the response is missing,
/// unsuccessful or has no content; <c>null</c> with <c>InstantState.ErrorParsing</c> when any
/// exception is thrown during lookup or parsing.
/// </returns>
public async Task<Tuple<NewsItem, InstantState>> ParsePageInstant(string url, bool html)
{
    try
    {
        if (InstantCache == null)
        {
            InstantCache = new List<NewsItem>();
        }

        // NOTE(review): the POOL lookup below only runs when InstantCache already holds
        // something. Kept as in the original; confirm whether POOL should be searched
        // unconditionally.
        if (InstantCache.Count > 0)
        {
            // Strip trailing slashes first: otherwise the key degenerates to "/" and
            // matches every stored URL, which can return an unrelated article.
            var trimmedUrl = url.TrimEnd('/');
            var slug = trimmedUrl.Substring(trimmedUrl.LastIndexOf('/'));

            foreach (var parser in POOL.Values)
            {
                var pooled = parser.Newslist.Where(x => x.Url.Contains(slug)).ToArray();
                if (pooled.Length == 1)
                {
                    NewsItem copy = Parser.DeepCopy(pooled[0]);
                    copy.Detailed.ContentHTML = html ? pooled[0].Detailed.ContentHTML : pooled[0].GetText();
                    return new Tuple<NewsItem, InstantState>(copy, InstantState.FromCache);
                }
            }

            var cached = InstantCache.Where(x => x.Url.Contains(slug)).ToArray();
            if (cached.Length == 1)
            {
                NewsItem copy = Parser.DeepCopy(cached[0]);
                copy.Detailed.ContentHTML = html ? cached[0].Detailed.ContentHTML : cached[0].GetText();
                return new Tuple<NewsItem, InstantState>(copy, InstantState.FromCache);
            }
        }
    }
    catch
    {
        // Any failure during the in-memory lookup is reported as a parsing error.
        return new Tuple<NewsItem, InstantState>(null, InstantState.ErrorParsing);
    }

    try
    {
        HttpResponseMessage response = await RequestAllocator.Instance.UsingPool(new Request(url));
        if (response == null || !response.IsSuccessStatusCode || response.Content == null)
        {
            return new Tuple<NewsItem, InstantState>(null, InstantState.TimedOut);
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(await response.Content.ReadAsStringAsync());

        // Header fields are read from direct children of the first <article> element.
        var article = doc.DocumentNode.Descendants("article").First();
        var imagePath = article.Element("img").GetAttributeValue("src", "");
        var item = new NewsItem
        {
            Title = article.Element("h1").InnerText,
            Date = article.Element("time").InnerText,
            // The first character of src is dropped before appending to the host URL.
            ImageURL = App.Get.Config.HostUrl + imagePath.Substring(1),
            Url = url
        };

        NewsItemDetailed.ParseArticle(
            item,
            doc.DocumentNode.Descendants().First(x => x.Name == "article" && x.HasClass("item-detailed")));
        InstantCache.Add(item);

        var result = Parser.DeepCopy(item);
        result.Detailed.ContentHTML = html ? result.Detailed.ContentHTML : result.GetText();
        return new Tuple<NewsItem, InstantState>(result, InstantState.Success);
    }
    catch (Exception)
    {
        return new Tuple<NewsItem, InstantState>(null, InstantState.ErrorParsing);
    }
}
/// <summary>
/// Fills <c>item.Detailed</c> (attached documents, gallery images, content HTML) and
/// <c>item.RelUrl</c> from the article node of a news page.
/// </summary>
/// <remarks>
/// When <c>item.Date</c> is empty, the date and image URL are also taken from the first
/// <c>time</c> and <c>img</c> descendants. Any exception is logged through
/// <c>App.Get.Core.Logger</c> and not rethrown, so a failure can leave <c>item.Detailed</c>
/// only partially populated.
/// </remarks>
/// <param name="item">News item to populate; its <c>Detailed</c> property is replaced.</param>
/// <param name="artc">Article node to read from. Nodes are removed from it while extracting the text.</param>
public static void ParseArticle(NewsItem item, HtmlNode artc)
{
    try
    {
        item.Detailed = new NewsItemDetailed();
        if (string.IsNullOrEmpty(item.Date))
        {
            item.Date = artc.Descendants("time").First().InnerText;
            var img = artc.Descendants("img").First().GetAttributeValue("src", "");
            item.ImageURL = App.Get.Config.HostUrl + img.Substring(1);
        }

        #region Docs
        var fileDescriptions = artc.Descendants()
            .Where(x => x.HasAttributes && x.GetAttributeValue("class", "null").Contains("file-desc"))
            .ToArray();
        if (fileDescriptions.Any())
        {
            item.Detailed.DocsLinks = new List<DocItem>();
            foreach (var description in fileDescriptions)
            {
                // The link data is read from the first child of the node following the description.
                var holder = description.NextSibling.ChildNodes[0];
                var href = holder.GetAttributeValue("data-href", "");
                var type = holder.GetClasses().First(x => x != "img" && x != "ib");
                item.Detailed.DocsLinks.Add(new DocItem(description.InnerText, WebUtility.UrlEncode(href), type));
            }
        }
        #endregion

        #region Images
        var galleries = artc.Descendants()
            .Where(x => x.Name == "div" && x.HasClass("s1") && x.GetAttributeValue("role", "") == "marquee")
            .ToArray();
        if (galleries.Length > 0)
        {
            item.Detailed.ImagesLinks = new List<string>();

            // Matches everything between "photo" + one character and the last '/'.
            // Built once instead of once per image.
            var pathSegment = new Regex(@"(?<=photo.).*(?=\/)");
            foreach (var gallery in galleries)
            {
                var box = gallery.ChildNodes.Single(x => x.HasClass("box"));
                foreach (var slide in box.FirstChild.ChildNodes)
                {
                    string uri = slide.FirstChild.Attributes["src"].Value;

                    // Replace only the matched span with "0". string.Replace with the matched
                    // text would throw on a failed (empty) match and would also rewrite any
                    // other occurrence of the same text elsewhere in the URI.
                    var match = pathSegment.Match(uri);
                    if (match.Success && match.Length > 0)
                    {
                        uri = uri.Remove(match.Index, match.Length).Insert(match.Index, "0");
                    }

                    item.Detailed.ImagesLinks.Add(App.Get.Config.HostUrl + uri.Substring(1));
                }
            }
        }
        #endregion

        #region RelatedLink
        var related = artc.Descendants().Where(x => x.InnerText.Contains("Читайте також")).ToArray();
        if (related.Length > 0)
        {
            var anchors = related.First().Descendants("a").ToArray();
            if (anchors.Any() && anchors.First().HasAttributes)
            {
                // The link is taken from the LAST anchor, so check that one for an href
                // instead of failing the whole parse when it is missing.
                var href = anchors.Last().GetAttributeValue("href", null);
                if (href != null)
                {
                    item.RelUrl = WebUtility.UrlEncode(App.Get.Config.HostUrl + "/" + href);
                }
            }
        }
        #endregion

        #region Text
        var text = artc.Descendants().First(x => x.HasAttributes && x.GetAttributeValue("id", "").Contains("item-desc"));
        var unwanted = text.Descendants()
            .Where(x => x.InnerHtml.Contains("id=\"gallery")
                        || x.HasClass("back")
                        || (x.HasAttributes && x.GetAttributeValue("role", "") == "photo"))
            .ToArray();
        foreach (var node in unwanted)
        {
            // Only direct children of the text node are removed.
            if (text.ChildNodes.Contains(node))
            {
                text.RemoveChild(node);
            }
        }

        if (fileDescriptions.Any())
        {
            // Drop the markup that hosted the document links from the output.
            var container = fileDescriptions.First().ParentNode;
            if (container.Name == "tr")
            {
                container.ParentNode.Remove();
            }
            else
            {
                container.RemoveAllChildren();
            }
        }

        item.Detailed.ContentHTML = text.OuterHtml.Replace("%22", "%5C%22");
        #endregion
    }
    catch (Exception ex)
    {
        App.Get.Core.Logger.NotifyError(LogArea.Other, ex);
    }
}
/// <summary>
/// Runs a site search for the text given in the <c>query</c> request parameter and returns
/// the matching news items as a JSON string.
/// </summary>
/// <remarks>
/// The parameter has the form <c>count,text</c>: <c>count</c> is the maximum number of items
/// (at most 1000) and <c>text</c> is everything after the first comma. Result links containing
/// <c>/news</c> are resolved against <c>ParserPool.Current.POOL</c> and <c>pool.InstantCache</c>;
/// unknown ones are created, processed via <c>NewsItemDetailed.Process</c> and added to
/// <c>pool.InstantCache</c>.
/// </remarks>
/// <returns>
/// The JSON serialization of <c>ResponseTyper(...)</c>: the items with
/// <c>InstantState.FromCache</c> when every item was already known, otherwise
/// <c>InstantState.Success</c>; or an error response when the key is missing, the parameter
/// is malformed, nothing was found, or an exception occurred.
/// </returns>
public async Task<string> SearchNewsAsync()
{
    var query = Info.Query;
    try
    {
        if (query.HasKey("query"))
        {
            string qpar = query["query"];
            int count;
            var news = new List<NewsItem>();
            var pending = new List<Task>();
            try
            {
                if (!qpar.Contains(','))
                {
                    throw new FormatException("Expected count parameter. Search results may be very huge");
                }

                // Split on the first comma only, so commas inside the search text survive.
                var parts = qpar.Split(new[] { ',' }, 2);
                count = int.Parse(parts[0]);
                if (count > 1000)
                {
                    throw new FormatException("Freak: server might fall");
                }
                qpar = parts[1];

                // Stays true while every returned item came from memory.
                bool allFromCache = true;

                // NOTE(review): the search text is appended without URL encoding (only spaces
                // become '+'). Confirm whether Info.Query already decodes values before adding
                // escaping here.
                var response = await RequestAllocator.Instance.UsingPool(
                    new Request("http://nuwm.edu.ua/search?text=" + qpar.Replace(' ', '+')));
                HtmlDocument doc = new HtmlDocument();
                doc.Load(await response.Content.ReadAsStreamAsync());

                var groups = doc.DocumentNode.Descendants()
                    .Where(x => x.Name == "div"
                                && x.HasClass("news")
                                && x.HasClass("search")
                                && x.GetAttributeValue("role", "") == "group")
                    .ToArray();
                if (!groups.Any())
                {
                    throw new InvalidDataException("Not found");
                }

                foreach (var article in groups.First().Elements("article"))
                {
                    var buttons = article.Descendants("a")
                        .Where(x => x.HasClass("btn") && x.HasClass("s2"))
                        .ToArray();
                    if (buttons.Any())
                    {
                        var link = buttons.First().GetAttributeValue("href", "");
                        if (link.Contains("/news"))
                        {
                            bool found = false;
                            foreach (var parser in ParserPool.Current.POOL.Values)
                            {
                                var pooled = parser.Newslist.Where(x => x.Url == link).ToArray();
                                if (pooled.Length == 1)
                                {
                                    found = true;
                                    news.Add(pooled.First());
                                    break;
                                }
                            }

                            // Only fall back to the instant cache when POOL had no match;
                            // otherwise an article present in both would be listed twice.
                            if (!found && pool.InstantCache != null)
                            {
                                var cached = pool.InstantCache.Where(x => x.Url == link).ToArray();
                                if (cached.Length == 1)
                                {
                                    news.Add(cached.First());
                                    found = true;
                                }
                            }

                            if (!found)
                            {
                                var created = new NewsItem
                                {
                                    Excerpt = article.Descendants("p").First().InnerText,
                                    Title = article.Descendants("a").First(x => x.HasClass("name")).InnerText,
                                    Url = link
                                };
                                pending.Add(Task.Run(() => NewsItemDetailed.Process(created)));
                                news.Add(created);
                                if (pool.InstantCache == null)
                                {
                                    pool.InstantCache = new List<NewsItem>();
                                }
                                allFromCache = false;
                                pool.InstantCache.Add(created);
                            }
                        }
                    }

                    if (news.Count == count)
                    {
                        break;
                    }
                }

                // Wait for the detail processing of all newly created items.
                await Task.WhenAll(pending);

                if (news.Count == 0)
                {
                    return JsonConvert.SerializeObject(ResponseTyper(new InvalidDataException("Not Found")));
                }

                return JsonConvert.SerializeObject(
                    ResponseTyper(null, news, allFromCache ? InstantState.FromCache : InstantState.Success));
            }
            catch (Exception ex)
            {
                // Report the error together with whatever was collected so far.
                return JsonConvert.SerializeObject(ResponseTyper(ex, news));
            }
        }
    }
    catch (Exception ex)
    {
        return JsonConvert.SerializeObject(ResponseTyper(ex));
    }

    return JsonConvert.SerializeObject(ResponseTyper(new InvalidOperationException("InvalidKey: query expected")));
}