/// <summary>
/// Processes the source - identifying language, encoding, title and so on.
/// </summary>
/// <param name="source"></param>
private void Process(TheSource source)
{
    foreach (var helper in helpers)
    {
        helper.Identify(ref source);
    }
}
private HtmlDocument openLinkAndGetDoc(string link, TheSource source)
{
    HtmlDocument doc = null;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(link);
        request.Timeout = source.RequestTimeOut != 0 ? source.RequestTimeOut : 5000;

        string responseString = string.Empty;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader stream = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(source.Enc)))
        {
            responseString = stream.ReadToEnd();
        }

        doc = new HtmlDocument();
        doc.LoadHtml(responseString);
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error getting HttpResponse and creating HtmlAgilityPack.HtmlDocument for link " + link + Environment.NewLine + ex);
    }
    return doc;
}
/// <summary>
/// Gets all URLs on an HTML page that belong to the source's domain.
/// </summary>
/// <param name="doc"></param>
/// <param name="source"></param>
/// <returns></returns>
private List<string> GetDocUrlsWithinSourceDomain(HtmlDocument doc, TheSource source)
{
    var res = new List<string>();
    // SelectNodes returns null when there are no matching nodes.
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return res;
    }
    res.AddRange(anchors.Select(n => n.GetAttributeValue("href", string.Empty)));
    res = res.Where(u => Uri.IsWellFormedUriString(u, UriKind.Relative) || u.Contains(source.Url)).ToList();
    return res;
}
public ControlPartsWindow()
{
    InitializeComponent();
    var dataSource = new TheSource();
    dataSource.ComboxSource = new List<string> { "Entry 1", "Entry 2", "Entry 3" };
    this.DataContext = dataSource;
}
/// <summary>
/// Opens the URL with HttpWebRequest and loads the response into an HtmlAgilityPack document.
/// </summary>
/// <param name="doc"></param>
/// <param name="url"></param>
/// <param name="source"></param>
private void OpenUrlAndLoadHtml(HtmlDocument doc, string url, TheSource source)
{
    string responseString = string.Empty;
    try
    {
        responseString = OpenUrlAndGetResponseString(url, source);
        doc.LoadHtml(responseString);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
public void All_Rss_Pages_Should_Be_Extracted()
{
    TheSource s = new TheSource() { Enc = "utf-8", Url = "zonakz.net" };
    RSSPagesIdentifier rpi = new RSSPagesIdentifier();
    rpi.Identify(ref s);
    Assert.IsTrue(s.RssPages.Count > 0);
}
private void GetTitleAndEncoding(TheSource so)
{
    try
    {
        using (WebClient client = new WebClient())
        using (var read = client.OpenRead("http://" + so.Url))
        {
            HtmlDocument doc = new HtmlDocument();
            doc.Load(read, true);
            so.Title = doc.DocumentNode.SelectSingleNode("//title")?.InnerText;
            so.Enc = doc.Encoding.BodyName;
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
public override void Identify(ref TheSource source)
{
    if (source.Enc == null)
    {
        return;
    }
    try
    {
        HtmlDocument doc = openLinkAndGetDoc("http://" + source.Url, source);
        if (doc == null)
        {
            return;
        }

        HashSet<string> rssLinks = getRssLinksFromPage(doc, source);
        HashSet<string> temp = new HashSet<string>();
        foreach (var l in rssLinks)
        {
            HashSet<string> ll = getValidRssPages(l, source);
            foreach (var x in ll)
            {
                temp.Add(x);
            }
        }
        rssLinks = temp;

        // Keep pages that are already known so the Except() below filters out duplicates.
        if (source.RssPages == null)
        {
            source.RssPages = new List<RssPage>();
        }
        foreach (var l in rssLinks.Except(source.RssPages.Select(x => x.Url)))
        {
            source.RssPages.Add(new RssPage() { TheSourceId = source.Id, TheSourse = source, Url = l });
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
public void Container_With_Longest_Identifier_Is_Identified()
{
    TheSource s = new TheSource()
    {
        Enc = "utf-8",
        Url = "zonakz.net",
        RssPages = new List<RssPage>() { new RssPage() { Url = "https://zonakz.net/feed/" } }
    };
    RSSPagesIdentifier iden = new RSSPagesIdentifier();
    iden.Identify(ref s);
}
/// <summary>
/// Identifies the encoding of the web page.
/// </summary>
public override void Identify(ref TheSource theSource)
{
    GetTitleAndEncoding(theSource);
    if (theSource.Title != null && theSource.Title.Contains("�"))
    {
        GetTitleAndEncoding2ndWay(theSource);
    }
    theSource.AutomaticalEncodingUpdateWasSuccess =
        theSource.Title != null && !theSource.Title.Contains("�");
    theSource.LastTimeAutomaticalEncodingUpdateEffort = DateTime.Now;
}
private string OpenUrlAndGetResponseString(string url, TheSource source)
{
    string responseString = string.Empty;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader stream = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(source.Enc)))
        {
            responseString = stream.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
    return responseString;
}
/// <summary>
/// Makes a search. Must be restricted to a single thread at a time.
/// </summary>
/// <param name="source"></param>
/// <param name="queries">search queries</param>
/// <param name="numOfResultsPart">query-string part that controls how many search results are returned per page</param>
/// <returns></returns>
public SearchEngineSearchResults MakeSearch(TheSource source, string[] queries, string numOfResultsPart)
{
    lock (_obj)
    {
        List<string> searchStrings = new List<string>();
        SearchEngineSearchResults results = new SearchEngineSearchResults();
        for (int i = 0; i < queries.Length; i++)
        {
            searchStrings.Add(string.Format(this._baseSearchAddress, queries[i]) + numOfResultsPart);
        }

        // If there are no results for a query we try the others. If there are no results at all,
        // it could be a ban by the search engine or bad queries, so we make a long pause.
        // If enough results have been collected (according to _minResultsNumOfSearchRequests),
        // we stop searching and start opening and analyzing the pages on the sources' websites.
        foreach (var q in searchStrings)
        {
            Thread.Sleep(_pause);
            // open the page with search results
            // parse it for snippets and urls
            // check whether enough results have been collected
            if (results.snippets.Count() >= this._minResultsNumOfSearchRequests)
            {
                break;
            }
        }
    }
    return null;
}
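The loop body above is only sketched in comments. A minimal illustration of the "open and parse" step follows; it is a sketch only, assuming the results page is plain HTML, that SearchEngineSearchResults exposes snippets and urls as writable lists, and using placeholder XPath selectors (the real selectors depend on the search engine).

// Hypothetical helper: opens one search-results page and collects snippet/url pairs.
// The CSS classes in the selectors are placeholders, not the actual markup of any engine.
private void CollectSnippetsFromResultsPage(string searchUrl, SearchEngineSearchResults results)
{
    try
    {
        using (var client = new WebClient())
        {
            var doc = new HtmlDocument();
            doc.LoadHtml(client.DownloadString(searchUrl));
            var hits = doc.DocumentNode.SelectNodes("//div[@class='result']");
            if (hits == null)
            {
                return;
            }
            foreach (var hit in hits)
            {
                var link = hit.SelectSingleNode(".//a[@href]");
                var snippet = hit.SelectSingleNode(".//span[@class='snippet']");
                if (link != null && snippet != null)
                {
                    results.urls.Add(link.GetAttributeValue("href", string.Empty));
                    results.snippets.Add(snippet.InnerText);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}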
private void GetTitleAndEncoding2ndWay(TheSource so)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create("http://" + so.Url);
        string res = string.Empty;
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (StreamReader stream = new StreamReader(resp.GetResponseStream(), Encoding.GetEncoding("windows-1251")))
        {
            res = stream.ReadToEnd();
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(res);
        so.Title = doc.DocumentNode.SelectSingleNode("//title")?.InnerText;
        so.Enc = doc.Encoding.BodyName;
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
}
/// <summary>
/// Checks whether a page is a valid RSS page and, if not,
/// opens the links on that page and checks whether they are valid RSS pages.
/// If the page is valid, the returned set contains only the link passed as the argument.
/// If links can't be obtained from the page (and the page is not a valid RSS page), an empty set is returned.
/// </summary>
/// <returns></returns>
private HashSet<string> getValidRssPages(string link, TheSource source)
{
    HashSet<string> set = new HashSet<string>();
    try
    {
        if (IsPageValidRssPage(link))
        {
            set.Add(link);
        }
        else
        {
            HtmlDocument doc = openLinkAndGetDoc(link, source);
            if (doc == null)
            {
                return set;
            }
            var rsslinks = getRssLinksFromPage(doc, source);
            // Make relative links absolute by prefixing them with the source's domain.
            rsslinks = new HashSet<string>(rsslinks.Select(s => s.Contains("http://") ? s : "http://" + source.Url.Trim('/') + "/" + s.Trim('/')));
            foreach (var l in rsslinks)
            {
                HtmlDocument d = openLinkAndGetDoc(l, source);
                if (d != null && IsPageValidRssPage(l))
                {
                    set.Add(l);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
    return set;
}
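IsPageValidRssPage is called here and in the identifier above but is not shown in this section. A minimal sketch, assuming "valid" simply means the URL returns well-formed XML whose root element is rss or feed:

// Hypothetical sketch of IsPageValidRssPage - downloads the candidate URL and checks the XML root element.
private bool IsPageValidRssPage(string link)
{
    try
    {
        using (WebClient client = new WebClient())
        {
            string content = client.DownloadString(link);
            var xml = System.Xml.Linq.XDocument.Parse(content);
            string root = xml.Root.Name.LocalName.ToLowerInvariant();
            return root == "rss" || root == "feed";
        }
    }
    catch
    {
        // Not reachable or not well-formed XML - treat as not a valid RSS page.
        return false;
    }
}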
/// <summary>
/// Gets all RSS links from an HTML page.
/// </summary>
/// <param name="doc"></param>
/// <param name="source"></param>
/// <returns></returns>
private HashSet<string> getRssLinksFromPage(HtmlDocument doc, TheSource source)
{
    var linkedPages = doc.DocumentNode.Descendants("a")
                      .Select(a => a.GetAttributeValue("href", null))
                      .Where(u => !String.IsNullOrEmpty(u));
    var linkedPages2 = doc.DocumentNode.Descendants("link")
                       .Select(a => a.GetAttributeValue("href", null))
                       .Where(u => !String.IsNullOrEmpty(u));
    var linkedPages3 = doc.DocumentNode.Descendants("link")
                       .Where(a => a.Attributes.Contains("type"))
                       .Where(a => a.Attributes["type"].Value == "application/rss+xml")
                       .Select(a => a.GetAttributeValue("href", null))
                       .Where(u => !String.IsNullOrEmpty(u));

    HashSet<string> rssLinks = new HashSet<string>();
    foreach (var l in linkedPages.Union(linkedPages2).Union(linkedPages3))
    {
        // Keep links that look like feeds and skip comment feeds.
        if ((l.Trim('/').EndsWith("feed") || l.Contains("rss")) && !l.Contains("comment"))
        {
            rssLinks.Add(l);
        }
    }
    return rssLinks;
}
/// <summary>
/// Identifies the parsing rules of the source.
/// </summary>
/// <param name="source"></param>
public override void Identify(ref TheSource source)
{
    if (source.TheSourceType == MMonitorLib.Enums.TheSourceType.MASS_MEDIA)
    {
        List<string> linksToOpen = new List<string>();
        if (source.RssPages == null || source.RssPages.Count == 0)
        {
            // getting links from the main page (maybe not a good idea...)
        }
        else
        {
            // documents loaded from rss
            var docs = new List<HtmlDocument>();
            var xPathes = new List<string>();
            var urls = new List<string>();
            var pages = new List<HtmlDocument>();

            // getting links from rss feeds
            foreach (var rssLink in source.RssPages)
            {
                urls = GetAllLinksFromRSSFeed(rssLink.Url);
                if (urls.Count < 10)
                {
                    return;
                }
                // docs loaded from the links found on the rss page
                foreach (var u in urls)
                {
                    try
                    {
                        HtmlDocument page = new HtmlDocument();
                        pages.Add(page);
                        OpenUrlAndLoadHtml(page, u, source);
                        xPathes.Add(GetArticleContainerIdentifier(page));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex);
                    }
                }
            }

            // identifying the most frequent xpath
            if (xPathes.Count >= 10)
            {
                var mostFrequent = xPathes.GroupBy(x => x)
                                          .Select(group => new { XPa = group.Key, Cou = group.Count() })
                                          .OrderByDescending(x => x.Cou)
                                          .Where(xp => xp.Cou > 1)
                                          .FirstOrDefault();
                if (mostFrequent != null)
                {
                    source.PageParsingRules.Add(new PageParsingRule() { ContentXPath = mostFrequent.XPa });
                }
            }
        }
    }
}
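GetAllLinksFromRSSFeed is used above but not shown in this section. A minimal sketch, assuming it only collects the item links of a feed via System.ServiceModel.Syndication:

// Hypothetical sketch of GetAllLinksFromRSSFeed - reads the feed and returns the item links.
private List<string> GetAllLinksFromRSSFeed(string feedUrl)
{
    var links = new List<string>();
    try
    {
        using (var reader = System.Xml.XmlReader.Create(feedUrl))
        {
            var feed = System.ServiceModel.Syndication.SyndicationFeed.Load(reader);
            links.AddRange(feed.Items
                .SelectMany(i => i.Links)
                .Select(l => l.Uri.ToString()));
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex);
    }
    return links;
}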
public bool UpdateSource(TheSource theSource)
{
    throw new NotImplementedException();
}
static void Main(string[] args)
{
    md5 = MD5.Create();
    string file = System.Configuration.ConfigurationManager.AppSettings["file_with_sources"];
    int row = 0;

    using (var db = new MMonitorContext())
    {
        var lines = File.ReadAllLines(file);
        TheSource s;
        Uri u;
        string url = string.Empty;
        string sourceType = string.Empty;
        string l = string.Empty;

        foreach (var line in lines)
        {
            string[] parts = line.Split('\t').Select(p => p.Trim()).ToArray();

            // Punycode urls: take the host part.
            if (parts[0].Contains("xn--"))
            {
                try
                {
                    u = new Uri(parts[0]);
                    l = u.Host;
                }
                catch (Exception ex)
                {
                    log.Error("Can't create source url from puny url", ex);
                }
            }

            // Strip the scheme and the www prefix.
            if (parts[0].Contains("https://"))
            {
                l = parts[0].Replace("https://", "");
            }
            else if (parts[0].Contains("http://"))
            {
                l = parts[0].Replace("http://", "");
            }
            else
            {
                l = parts[0];
            }
            if (l.Contains("www."))
            {
                l = l.Replace("www.", "");
            }

            if (!Uri.IsWellFormedUriString(l, UriKind.Relative) && !l.Contains(".рф"))
            {
                log.Error(string.Format("Bad format for Url {0} at line {1}", parts[0], row));
                continue;
            }

            l = l.Trim().ToLower();
            if (l.Contains(".рф") && !Regex.IsMatch(l, @"\S+\.рф$"))
            {
                log.Error(string.Format("Bad format for Url {0} at line {1}", parts[0], row));
                continue;
            }

            s = new TheSource()
            {
                Url = l,
                UrlHash = GetMD5Hash(l),
                Lang = string.IsNullOrWhiteSpace(parts[1]) ? Langs.UNDEFINED : (Langs)Enum.Parse(typeof(Langs), parts[1]),
                TheSourceType = (TheSourceType)Enum.Parse(typeof(TheSourceType), parts[2])
            };

            // Skip sources that are already in the database.
            if (db.TheSources.Where(x => x.UrlHash == s.UrlHash).FirstOrDefault() == null)
            {
                db.TheSources.Add(s);
                row++;
                Console.WriteLine("row is " + row);
            }
        }

        try
        {
            db.SaveChanges();
            log.Info($"Loaded {row} sources to database");
        }
        catch (Exception ex)
        {
            log.Error("Can't upload sources", ex);
        }
    }
    Console.ReadLine();
}
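GetMD5Hash is called in Main but not shown in this section. A minimal sketch, assuming it hashes the URL with the static md5 instance created in Main and returns a lowercase hex string:

// Hypothetical sketch of GetMD5Hash - hex-encodes the MD5 hash of the input string.
static string GetMD5Hash(string input)
{
    byte[] hash = md5.ComputeHash(Encoding.UTF8.GetBytes(input));
    var sb = new StringBuilder();
    foreach (byte b in hash)
    {
        sb.Append(b.ToString("x2"));
    }
    return sb.ToString();
}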
public abstract void Identify(ref TheSource source);
/// <summary>
/// Identify the language of the source's pages.
/// </summary>
/// <param name="source"></param>
public override void Identify(ref TheSource source)
{
    //throw new NotImplementedException();
}
public bool AddNewSource(TheSource source)
{
    throw new NotImplementedException();
}