Esempio n. 1
0
 /// <summary>
 /// Passes the source through every registered helper; per the original note,
 /// the helpers identify things such as language, encoding and title.
 /// </summary>
 /// <param name="source">Source handed to each helper by reference.</param>
 private void Process(TheSource source)
 {
     // Each helper receives the source by ref, so a helper may replace the
     // instance that the following helpers will see.
     foreach (var identifier in helpers)
     {
         identifier.Identify(ref source);
     }
 }
Esempio n. 2
0
        /// <summary>
        /// Requests <paramref name="link"/> and parses the response body into an HtmlDocument.
        /// </summary>
        /// <param name="link">URL to request.</param>
        /// <param name="source">
        /// Supplies the request timeout (<c>RequestTimeOut</c>, or 5000 when that is 0)
        /// and the name of the encoding (<c>Enc</c>) used to decode the response body.
        /// </param>
        /// <returns>
        /// The parsed document, or <c>null</c> when requesting or reading the response failed.
        /// Failures are written to the console rather than thrown.
        /// </returns>
        private HtmlDocument openLinkAndGetDoc(string link, TheSource source)
        {
            HtmlDocument doc = null;

            try
            {
                HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(link);
                request.Timeout = source.RequestTimeOut != 0 ? source.RequestTimeOut : 5000;

                string responseString;

                // Dispose the response together with the reader so the underlying
                // connection is released even when decoding or reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader stream = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(source.Enc)))
                {
                    responseString = stream.ReadToEnd();
                }

                doc = new HtmlDocument();
                doc.LoadHtml(responseString);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error with getting HttpResponse and creating HtmlAgilityPach.HtmlDocument for link " + link + Environment.NewLine + ex);
            }

            return(doc);
        }
Esempio n. 3
0
        /// <summary>
        /// Collects the href values of the page's anchors, keeping only those that are
        /// well-formed relative URIs or that contain the source's URL.
        /// </summary>
        /// <param name="doc">Parsed HTML page to scan for <c>a[@href]</c> elements.</param>
        /// <param name="source">Source whose <c>Url</c> is used for the containment check.</param>
        /// <returns>The matching href values; an empty list when the page has no anchors.</returns>
        private List <string> GetDocUrlsWithinSourceDomain(HtmlDocument doc, TheSource source)
        {
            // HtmlAgilityPack returns null (not an empty collection) when the XPath
            // matches nothing, so a page without links must be handled explicitly.
            var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                return(new List <string>());
            }

            return(anchors
                   .Select(n => n.GetAttributeValue("href", string.Empty))
                   .Where(u => Uri.IsWellFormedUriString(u, UriKind.Relative) || u.Contains(source.Url))
                   .ToList());
        }
        /// <summary>
        /// Initializes the window and uses a TheSource instance, whose
        /// <c>ComboxSource</c> holds three sample entries, as the data context.
        /// </summary>
        public ControlPartsWindow()
        {
            InitializeComponent();

            var entries = new List <string> { "Entry 1", "Entry 2", "Entry 3" };

            this.DataContext = new TheSource { ComboxSource = entries };
        }
Esempio n. 5
0
        /// <summary>
        /// Fetches the page at <paramref name="url"/> via OpenUrlAndGetResponseString
        /// and loads the returned text into <paramref name="doc"/>.
        /// </summary>
        /// <param name="doc">Document that receives the downloaded HTML.</param>
        /// <param name="url">Address of the page to open.</param>
        /// <param name="source">Source passed through to OpenUrlAndGetResponseString.</param>
        private void OpenUrlAndLoadHtml(HtmlDocument doc, string url, TheSource source)
        {
            try
            {
                doc.LoadHtml(OpenUrlAndGetResponseString(url, source));
            }
            catch (Exception error)
            {
                // Failures are only reported; the document is left as it was.
                Console.WriteLine(error);
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Runs RSSPagesIdentifier against zonakz.net and expects it to
        /// register at least one RSS page on the source.
        /// </summary>
        public void All_Rss_Pages_Should_Be_Extracted()
        {
            var source     = new TheSource { Enc = "utf-8", Url = "zonakz.net" };
            var identifier = new RSSPagesIdentifier();

            identifier.Identify(ref source);

            Assert.IsTrue(source.RssPages.Count > 0);
        }
Esempio n. 7
0
 /// <summary>
 /// Downloads "http://" + <c>so.Url</c>, parses it and stores the text of the
 /// page's title element and the document encoding's body name in the source.
 /// </summary>
 /// <param name="so">
 /// Source whose <c>Url</c> is read. <c>Title</c> and <c>Enc</c> are written only
 /// on success; on any failure both are left untouched and the problem is
 /// written to the console.
 /// </param>
 private void GetTitleAndEncoding(TheSource so)
 {
     try
     {
         using (WebClient client = new WebClient())
         using (var read = client.OpenRead("http://" + so.Url))
         {
             HtmlDocument doc = new HtmlDocument();
             doc.Load(read, true);

             // SelectSingleNode returns null when the page has no <title>;
             // report that explicitly instead of relying on a swallowed NRE.
             var titleNode = doc.DocumentNode.SelectSingleNode("//title");
             if (titleNode == null)
             {
                 Console.WriteLine("No title element found at http://" + so.Url);
                 return;
             }

             so.Title = titleNode.InnerText;
             so.Enc   = doc.Encoding.BodyName;
         }
     }
     catch (Exception ex)
     {
         // Best effort: the caller inspects Title afterwards, so only report.
         Console.WriteLine(ex);
     }
 }
Esempio n. 8
0
        /// <summary>
        /// Finds the RSS pages of the source: loads "http://" + <c>source.Url</c>,
        /// collects candidate RSS links from it, validates them and adds every
        /// valid link that is not registered yet to <c>source.RssPages</c>.
        /// </summary>
        /// <param name="source">
        /// Source to inspect. Nothing happens when its <c>Enc</c> is null or the
        /// start page cannot be loaded. Exceptions are written to the console.
        /// </param>
        public override void Identify(ref TheSource source)
        {
            if (source.Enc == null)
            {
                return;
            }

            try
            {
                HtmlDocument doc = openLinkAndGetDoc("http://" + source.Url, source);
                if (doc == null)
                {
                    return;
                }

                HashSet <string> validLinks = new HashSet <string>();
                foreach (var candidate in getRssLinksFromPage(doc, source))
                {
                    validLinks.UnionWith(getValidRssPages(candidate, source));
                }

                // Keep the pages that are already registered. Replacing the list
                // unconditionally would drop them and make the Except below a no-op.
                if (source.RssPages == null)
                {
                    source.RssPages = new List <RssPage>();
                }

                // Materialize the new URLs first so the list is not read while it grows.
                List <string> newUrls = validLinks.Except(source.RssPages.Select(x => x.Url)).ToList();
                foreach (var l in newUrls)
                {
                    source.RssPages.Add(new RssPage()
                    {
                        TheSourceId = source.Id,
                        TheSourse   = source,
                        Url         = l
                    });
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Runs RSSPagesIdentifier on a zonakz.net source that already has one
        /// RSS page registered. The test makes no assertions; it only checks
        /// that the call completes.
        /// </summary>
        public void Container_With_Longest_Identifier_Is_Identified()
        {
            var source = new TheSource
            {
                Enc      = "utf-8",
                Url      = "zonakz.net",
                RssPages = new List <RssPage> { new RssPage { Url = "https://zonakz.net/feed/" } }
            };
            var identifier = new RSSPagesIdentifier();

            identifier.Identify(ref source);
        }
Esempio n. 10
0
        /// <summary>
        /// Identifies the title and encoding of the source's web page and
        /// records whether the attempt succeeded and when it was made.
        /// </summary>
        /// <param name="theSource">
        /// Source to update. The attempt counts as successful when a title was
        /// obtained and it does not contain the "�" character.
        /// </param>
        public override void Identify(ref TheSource theSource)
        {
            GetTitleAndEncoding(theSource);

            // A "�" in the title presumably means the page was decoded with the
            // wrong encoding, so the second strategy is tried.
            bool firstAttemptGarbled = theSource.Title != null && theSource.Title.Contains("�");
            if (firstAttemptGarbled)
            {
                GetTitleAndEncoding2ndWay(theSource);
            }

            theSource.AutomaticalEncodingUpdateWasSuccess =
                theSource.Title != null && !theSource.Title.Contains("�");

            theSource.LastTimeAutomaticalEncodingUpdateEffort = DateTime.Now;
        }
Esempio n. 11
0
        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <param name="url">URL to request.</param>
        /// <param name="source">Supplies the name of the encoding (<c>Enc</c>) used to decode the body.</param>
        /// <returns>
        /// The decoded response body, or an empty string when the request or
        /// decoding failed (the exception is written to the console).
        /// </returns>
        private string OpenUrlAndGetResponseString(string url, TheSource source)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

                // Dispose the response as well as the reader so the connection
                // is released even when decoding or reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader stream = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(source.Enc)))
                {
                    return(stream.ReadToEnd());
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }

            return(string.Empty);
        }
Esempio n. 12
0
        /// <summary>
        /// Makes a search. The whole body runs under a lock on <c>_obj</c>, so
        /// only one thread searches at a time.
        /// </summary>
        /// <param name="source">Source the search is made for; not used by the visible code.</param>
        /// <param name="queries">Search queries; each is formatted into <c>_baseSearchAddress</c>.</param>
        /// <param name="numOfResultsPart">Text appended to every formatted search address.</param>
        /// <returns>Currently always <c>null</c>; the collected results object is not returned.</returns>
        public SearchEngineSearchResults MakeSearch(TheSource source, string[] queries, string numOfResultsPart)
        {
            lock (_obj)
            {
                SearchEngineSearchResults results = new SearchEngineSearchResults();

                List <string> searchStrings = queries
                                              .Select(query => string.Format(this._baseSearchAddress, query) + numOfResultsPart)
                                              .ToList();

                // If a query yields no results the next one is tried. No results at
                // all could mean a ban by the search engine or bad queries, hence the
                // pause before every request. Once enough results are collected
                // (see _minResultsNumOfSearchRequests) the loop stops so that the
                // pages at the sources' websites can be opened and analyzed.
                foreach (var searchUrl in searchStrings)
                {
                    Thread.Sleep(_pause);

                    // Opening the page with the search results: not implemented here.

                    // Parsing for snippets and urls: not implemented here.

                    // Stop as soon as enough results have been collected.
                    bool enoughResults = results.snippets.Count() >= this._minResultsNumOfSearchRequests;
                    if (enoughResults)
                    {
                        break;
                    }
                }
            }

            return(null);
        }
Esempio n. 13
0
        /// <summary>
        /// Second strategy for reading the title: requests "http://" + <c>so.Url</c>,
        /// decodes the body as windows-1251, parses it and stores the title text
        /// and the document encoding's body name in the source.
        /// </summary>
        /// <param name="so">
        /// Source whose <c>Url</c> is read. <c>Title</c> and <c>Enc</c> are written only
        /// on success; on any failure both are left untouched and the problem is
        /// written to the console.
        /// </param>
        private void GetTitleAndEncoding2ndWay(TheSource so)
        {
            try
            {
                HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create("http://" + so.Url);
                string         res;

                // Dispose the response together with the reader so the
                // connection is released even when reading throws.
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (StreamReader stream = new StreamReader(resp.GetResponseStream(),
                                                              Encoding.GetEncoding("windows-1251")))
                {
                    res = stream.ReadToEnd();
                }

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(res);

                // SelectSingleNode returns null when the page has no <title>.
                var titleNode = doc.DocumentNode.SelectSingleNode("//title");
                if (titleNode == null)
                {
                    Console.WriteLine("No title element found at http://" + so.Url);
                    return;
                }

                so.Title = titleNode.InnerText;
                so.Enc   = doc.Encoding.BodyName;
            }
            catch (Exception ex)
            {
                // Best effort: the caller inspects Title afterwards, so only report.
                Console.WriteLine(ex);
            }
        }
Esempio n. 14
0
        /// <summary>
        /// Checks whether <paramref name="link"/> is a valid RSS page. If it is not,
        /// the page is opened and the RSS links found on it are checked instead.
        /// </summary>
        /// <param name="link">URL to check.</param>
        /// <param name="source">Source the link belongs to; its <c>Url</c> is used to make relative links absolute.</param>
        /// <returns>
        /// A set holding only <paramref name="link"/> when it is itself a valid RSS page;
        /// otherwise the valid RSS pages linked from it. The set is empty when the page
        /// cannot be opened or yields no valid links. Exceptions are written to the console.
        /// </returns>
        private HashSet <string> getValidRssPages(string link, TheSource source)
        {
            HashSet <string> set = new HashSet <string>();

            try
            {
                if (IsPageValidRssPage(link))
                {
                    set.Add(link);
                    return(set);
                }

                HtmlDocument doc = openLinkAndGetDoc(link, source);
                if (doc == null)
                {
                    return(set);
                }

                // Only links without an http/https scheme are relative. Checking for
                // "http://" alone would prefix https links with the source host and
                // produce broken URLs.
                var rsslinks = new HashSet <string>(
                    getRssLinksFromPage(doc, source).Select(
                        s => s.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                             s.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                             ? s
                             : "http://" + source.Url.Trim('/') + "/" + s.Trim('/')));

                foreach (var l in rsslinks)
                {
                    HtmlDocument d = openLinkAndGetDoc(l, source);
                    if (d != null && IsPageValidRssPage(l))
                    {
                        set.Add(l);
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }

            return(set);
        }
Esempio n. 15
0
        /// <summary>
        /// Gets the links on an HTML page that look like RSS feeds.
        /// </summary>
        /// <param name="doc">Parsed HTML page whose <c>a</c> and <c>link</c> elements are scanned.</param>
        /// <param name="source">Not used by this method.</param>
        /// <returns>
        /// The distinct non-empty href values that end with "feed" (ignoring
        /// surrounding slashes) or contain "rss", and do not contain "comment".
        /// Links are returned exactly as written in the page.
        /// </returns>
        private HashSet <string> getRssLinksFromPage(HtmlDocument doc, TheSource source)
        {
            // Every <link> href is considered, which already includes the
            // <link type="application/rss+xml"> elements.
            var hrefs = doc.DocumentNode.Descendants("a")
                        .Concat(doc.DocumentNode.Descendants("link"))
                        .Select(node => node.GetAttributeValue("href", null))
                        .Where(href => !String.IsNullOrEmpty(href));

            HashSet <string> rssLinks = new HashSet <string>();

            foreach (var href in hrefs)
            {
                bool looksLikeFeed = href.Trim('/').EndsWith("feed") || href.Contains("rss");
                if (looksLikeFeed && !href.Contains("comment"))
                {
                    rssLinks.Add(href);
                }
            }

            return(rssLinks);
        }
Esempio n. 16
0
        /// <summary>
        /// Identifies the parsing rule of a mass-media source: opens the pages linked
        /// from its RSS feeds, computes an article container identifier for each and
        /// registers the most frequent one as a new PageParsingRule.
        /// </summary>
        /// <param name="source">
        /// Source to inspect. Nothing is added when the source is not MASS_MEDIA, has
        /// no RSS pages, any feed yields fewer than 10 links, fewer than 10 identifiers
        /// were collected, or no identifier occurs more than once.
        /// </param>
        public override void Identify(ref TheSource source)
        {
            if (source.TheSourceType != MMonitorLib.Enums.TheSourceType.MASS_MEDIA)
            {
                return;
            }

            if (source.RssPages == null || source.RssPages.Count == 0)
            {
                // Getting links from the main page is not implemented (may not be a good idea).
                return;
            }

            var xPathes = new List <string>();

            foreach (var rssLink in source.RssPages)
            {
                var urls = GetAllLinksFromRSSFeed(rssLink.Url);
                if (urls.Count < 10)
                {
                    // A feed with too few links aborts the whole identification,
                    // including links gathered from earlier feeds.
                    return;
                }

                foreach (var u in urls)
                {
                    try
                    {
                        HtmlDocument page = new HtmlDocument();
                        OpenUrlAndLoadHtml(page, u, source);
                        xPathes.Add(GetArticleContainerIdentifier(page));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex);
                    }
                }
            }

            if (xPathes.Count < 10)
            {
                return;
            }

            // Most frequent identifier among those that occur more than once.
            var mostFrequent = xPathes.GroupBy(x => x)
                               .Select(group => new
                               {
                                   XPa = group.Key,
                                   Cou = group.Count()
                               })
                               .Where(xp => xp.Cou > 1)
                               .OrderByDescending(x => x.Cou)
                               .FirstOrDefault();

            // FirstOrDefault yields null when every identifier is unique;
            // dereferencing it would throw a NullReferenceException.
            if (mostFrequent == null)
            {
                return;
            }

            source.PageParsingRules.Add(new PageParsingRule()
            {
                ContentXPath = mostFrequent.XPa
            });
        }
Esempio n. 17
0
 /// <summary>
 /// Not implemented: always throws. Judging by its name it is meant to update a source.
 /// </summary>
 /// <param name="theSource">Source to update; currently unused.</param>
 /// <returns>Nothing at present; the method throws before returning.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public bool UpdateSource(TheSource theSource)
 {
     throw new NotImplementedException();
 }
Esempio n. 18
0
        /// <summary>
        /// Imports sources into the database from the tab-separated file named by the
        /// "file_with_sources" app setting. Each line holds: url, language, source type.
        /// The url is stripped of its scheme and "www.", lower-cased and hashed; a source
        /// is added only when no stored source has the same hash. Malformed lines are
        /// logged with their line number and skipped.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            md5 = MD5.Create();
            string file       = System.Configuration.ConfigurationManager.AppSettings["file_with_sources"];
            int    row        = 0; // number of sources queued for insertion
            int    lineNumber = 0; // 1-based line of the input file, used in error messages

            using (var db = new MMonitorContext())
            {
                var lines = File.ReadAllLines(file);

                foreach (var line in lines)
                {
                    lineNumber++;

                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    string[] parts = line.Split('\t').Select(p => p.Trim()).ToArray();
                    if (parts.Length < 3)
                    {
                        // parts[1] and parts[2] are read below; fail softly instead of crashing the import.
                        log.Error(string.Format("Expected 3 tab-separated columns at line {0}", lineNumber));
                        continue;
                    }

                    string l;

                    if (parts[0].Contains("xn--"))
                    {
                        try
                        {
                            // NOTE(review): the host extracted here is overwritten by the
                            // scheme handling below, so this only verifies that the
                            // punycode url parses. Confirm the intended behavior.
                            Uri u = new Uri(parts[0]);
                            l = u.Host;
                        }
                        catch (Exception ex)
                        {
                            log.Error("Can't create source url from puny url", ex);
                        }
                    }

                    if (parts[0].Contains("https://"))
                    {
                        l = parts[0].Replace("https://", "");
                    }
                    else if (parts[0].Contains("http://"))
                    {
                        l = parts[0].Replace("http://", "");
                    }
                    else
                    {
                        l = parts[0];
                    }

                    if (l.Contains("www."))
                    {
                        l = l.Replace("www.", "");
                    }

                    if (!Uri.IsWellFormedUriString(l, UriKind.Relative) && !l.Contains(".рф"))
                    {
                        log.Error(string.Format("Bad format for Url {0} at line {1}", parts[0], lineNumber));
                        continue;
                    }

                    l = l.Trim().ToLower();

                    if (l.Contains(".рф") && !Regex.IsMatch(l, @"\S+\.рф$"))
                    {
                        log.Error(string.Format("Bad format for Url {0} at line {1}", parts[0], lineNumber));
                        continue;
                    }

                    // An empty language column means UNDEFINED; anything else must be a valid Langs value.
                    Langs lang = Langs.UNDEFINED;
                    if (!string.IsNullOrWhiteSpace(parts[1]) && !Enum.TryParse(parts[1], out lang))
                    {
                        log.Error(string.Format("Unknown language {0} at line {1}", parts[1], lineNumber));
                        continue;
                    }

                    TheSourceType sourceType;
                    if (!Enum.TryParse(parts[2], out sourceType))
                    {
                        log.Error(string.Format("Unknown source type {0} at line {1}", parts[2], lineNumber));
                        continue;
                    }

                    TheSource s = new TheSource()
                    {
                        Url           = l,
                        UrlHash       = GetMD5Hash(l),
                        Lang          = lang,
                        TheSourceType = sourceType
                    };

                    // NOTE(review): this looks only at sources already stored; whether a url
                    // repeated within the same file is caught depends on the context - confirm.
                    if (db.TheSources.Where(x => x.UrlHash == s.UrlHash).FirstOrDefault() == null)
                    {
                        db.TheSources.Add(s);
                        row++;
                        Console.WriteLine("row is " + row);
                    }
                }

                try
                {
                    db.SaveChanges();
                    log.Info($"Loaded {row} sources to database");
                }
                catch (Exception ex)
                {
                    log.Error("Can't upload sources", ex);
                }
            }

            Console.ReadLine();
        }
Esempio n. 19
0
 /// <summary>
 /// Identification step to be implemented by each concrete identifier.
 /// </summary>
 /// <param name="source">Source to inspect, passed by reference.</param>
 public abstract void Identify(ref TheSource source);
 /// <summary>
 /// Identify the language of the source's pages.
 /// Currently a no-op: the body contains no executable statements.
 /// </summary>
 /// <param name="source">Source whose language should be identified; not touched at present.</param>
 public override void Identify(ref TheSource source)
 {
     //throw new NotImplementedException();
 }
Esempio n. 21
0
 /// <summary>
 /// Not implemented: always throws. Judging by its name it is meant to add a new source.
 /// </summary>
 /// <param name="source">Source to add; currently unused.</param>
 /// <returns>Nothing at present; the method throws before returning.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public bool AddNewSource(TheSource source)
 {
     throw new NotImplementedException();
 }