예제 #1
0
        /// <summary>
        /// Saves an offline copy (HTML plus images) of the advertisement behind every
        /// sub-folder of each target directory listed in <c>textBox2</c> (';'-separated).
        /// </summary>
        /// <remarks>
        /// A sub-folder is processed when the part of its name before the first '_' is a
        /// numeric id within [fromBox.Value, toBox.Value] and no saved page exists for it yet.
        /// The run ends early when <c>_stopFlag</c> is set or when the site answers with its
        /// "suspicious activity" page. The database connection is closed and the form
        /// controls are re-enabled exactly once, after all targets have been handled.
        /// UI updates go through <c>Invoke</c>, so this is presumably run off the UI thread.
        /// </remarks>
        private void UploadAdvs()
        {
            if (!_db.Connect())
            {
                MessageBox.Show(General.ERROR_DB_CONNECT);
                _db.Close();
                RestoreUploadControls();
                return;
            }

            var captchaDetected = false;
            try
            {
                var targets   = textBox2.Text.Split(';');
                var processed = 0;

                foreach (var target in targets)
                {
                    if (_stopFlag)
                    {
                        break;
                    }
                    // A trailing ';' yields an empty entry that Directory.GetDirectories rejects.
                    if (string.IsNullOrWhiteSpace(target))
                    {
                        continue;
                    }
                    if (!UploadTarget(target, ref processed))
                    {
                        captchaDetected = true;
                        break;
                    }
                }
            }
            finally
            {
                // Runs once for the whole job: closing the connection after the first
                // target would leave the remaining targets without a database.
                _db.Close();
                RestoreUploadControls();
            }

            if (captchaDetected)
            {
                MessageBox.Show("Капча! Завершаем работу.");
            }
        }

        /// <summary>
        /// Puts the status bar and the form controls back into their idle state.
        /// </summary>
        private void RestoreUploadControls()
        {
            Invoke(new Action(() =>
            {
                statusLabel.Text      = STATUS_BAR_DEFAULT;
                uploadButton.Enabled  = true;
                exploreButton.Enabled = true;
                parseButton.Enabled   = true;
                exportButton.Enabled  = true;
                textBox2.Enabled      = true;
                stopButton.Enabled    = false;
                fromBox.Enabled       = true;
                toBox.Enabled         = true;
            }));
        }

        /// <summary>
        /// Processes the advertisement folders of one target directory using its own
        /// headless Chrome session.
        /// </summary>
        /// <param name="target">Directory whose sub-folders are named after advertisements.</param>
        /// <param name="processed">Running count of handled folders, shown in the status bar.</param>
        /// <returns>
        /// <c>false</c> when the "suspicious activity" page was detected and the whole run
        /// has to stop; otherwise <c>true</c>.
        /// </returns>
        private bool UploadTarget(string target, ref int processed)
        {
            var paths = Directory.GetDirectories(target);
            Invoke(new Action(() =>
            {
                statusLabel.Text = "Инициализация Selenium...";
            }));

            var option = new ChromeOptions();
            option.AddArgument("--headless");
            option.AddArgument("--user-agent=" + General.DEFAULT_USER_AGENT);

            // Nesting order matters: the browser is disposed before the driver service it
            // talks to, and both are released even when an exception escapes.
            using (var service = ChromeDriverService.CreateDefaultService())
            {
                service.HideCommandPromptWindow = true;
                using (IWebDriver browser = new ChromeDriver(service, option))
                {
                    // TODO: verify that the opened page really belongs to a Farpost session.
                    browser.Navigate().GoToUrl("https://www.farpost.ru/vladivostok/realty/sell_flats/?page=1");
                    var total = (int)(toBox.Value - fromBox.Value) + 1;

                    foreach (var path in paths)
                    {
                        if (_stopFlag)
                        {
                            break;
                        }

                        var advName = Path.GetFileName(path);
                        var nlink   = target + "\\" + advName;
                        if (File.Exists(nlink + ".html") || File.Exists(nlink + "_arch.html"))
                        {
                            processed++;
                            continue;
                        }

                        var idStr = advName.Split('_')[0];
                        if (!int.TryParse(idStr, out int id) || id < fromBox.Value || id > toBox.Value)
                        {
                            processed++;
                            continue;
                        }

                        // A ref parameter cannot be captured by the lambda, so copy it first.
                        var shown = processed;
                        Invoke(new Action(() =>
                        {
                            statusLabel.Text = "Загрузка объявления #" + idStr + " [" + shown + "/" + total + "]";
                        }));

                        // Keep retrying the navigation every 10 s until it succeeds or a stop is requested.
                        var link    = _db.GetLinkById(id);
                        var captcha = false;
                        while (!_stopFlag)
                        {
                            try
                            {
                                browser.Navigate().GoToUrl(link);
                                captcha = browser.PageSource.Contains("Из вашей подсети наблюдается подозрительная активность");
                                break;
                            }
                            catch (Exception ex)
                            {
                                General.WriteLog(ex.Message);
                                Thread.Sleep(10000);
                            }
                        }
                        if (captcha)
                        {
                            return false;
                        }
                        if (_stopFlag)
                        {
                            // The browser may still show a previous page; never save it under this id.
                            break;
                        }

                        if (!browser.PageSource.Contains("Объявление находится в архиве и может быть неактуальным."))
                        {
                            RevealContacts(browser);
                        }
                        else
                        {
                            nlink += "_arch";
                        }
                        ClickExpandButton(browser);

                        var content = browser.PageSource;
                        var doc     = new HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(content);
                        content = DownloadImages(doc, content, advName, path);
                        content = MakeLinksAbsolute(doc, content);

                        if (_stopFlag)
                        {
                            // Do not persist a half-captured page: an existing .html file
                            // makes later runs skip this advertisement.
                            break;
                        }

                        var htmled = nlink + ".html";
                        File.WriteAllText(htmled, content, Encoding.UTF8);
                        if (new FileInfo(htmled).Length <= 25600)
                        {
                            // Pages of 25600 bytes or less are set aside under the "_udalen" suffix.
                            // File.Move fails when the destination exists (left by an earlier run).
                            var removedName = nlink + "_udalen.html";
                            File.Delete(removedName);
                            File.Move(htmled, removedName);
                        }
                        // NOTE(review): the original path is stored even when the file was renamed above — confirm this is intended.
                        _db.ChangeAdvNLink(id, htmled);
                        General.Delay();
                        processed++;
                    }
                }
            }
            return true;
        }

        /// <summary>
        /// Polls the page until an element matching <paramref name="locator"/> appears.
        /// </summary>
        /// <returns><c>true</c> if the element showed up within the given number of attempts.</returns>
        private bool WaitForElement(IWebDriver browser, By locator, int attempts, int delayMs)
        {
            for (var attempt = 0; attempt < attempts && !_stopFlag; attempt++)
            {
                // FindElements returns an empty collection instead of throwing when nothing matches.
                if (browser.FindElements(locator).Count > 0)
                {
                    return true;
                }
                Thread.Sleep(delayMs);
            }
            return false;
        }

        /// <summary>
        /// Clicks the "show contacts" link (up to five attempts) and waits for the
        /// contacts marker element to appear.
        /// </summary>
        private void RevealContacts(IWebDriver browser)
        {
            for (var attempt = 0; attempt < 5 && !_stopFlag; attempt++)
            {
                try
                {
                    browser.FindElement(By.PartialLinkText("Показать контакты")).Click();
                    if (WaitForElement(browser, By.ClassName("dummy-listener_new-contacts"), 20, 500))
                    {
                        return;
                    }
                }
                catch (Exception ex)
                {
                    General.WriteLog(ex.Message);
                    Thread.Sleep(1000);
                }
            }
        }

        /// <summary>
        /// Clicks the optional "expand--button" element and waits for an element with the
        /// "mod__active" class to appear. Best effort: failures never abort the download.
        /// </summary>
        private void ClickExpandButton(IWebDriver browser)
        {
            try
            {
                browser.FindElement(By.ClassName("expand--button")).Click();
                WaitForElement(browser, By.ClassName("mod__active"), 20, 1000);
            }
            catch (NoSuchElementException)
            {
                // The page simply has no expand button.
            }
            catch (Exception ex)
            {
                General.WriteLog(ex.Message);
            }
        }

        /// <summary>
        /// Downloads every image referenced by the page into <paramref name="advDirectory"/>
        /// and points the references in <paramref name="content"/> at the local copies.
        /// </summary>
        /// <returns>The page markup with the successfully downloaded images rewritten.</returns>
        private string DownloadImages(HtmlAgilityPack.HtmlDocument doc, string content, string advName, string advDirectory)
        {
            // Selecting only images that carry a src attribute avoids a null dereference.
            var imageNodes = doc.DocumentNode.SelectNodes("//img[@src]");
            if (imageNodes == null)
            {
                return content;
            }

            var seen       = new HashSet<string>();
            var imageIndex = 0;
            using (var wc = new WebClient())
            {
                foreach (var node in imageNodes)
                {
                    if (_stopFlag)
                    {
                        break;
                    }

                    var imageSrc = node.Attributes["src"].Value;
                    // An image used several times is downloaded only once.
                    if (string.IsNullOrEmpty(imageSrc) || !seen.Add(imageSrc))
                    {
                        continue;
                    }
                    var slash = imageSrc.LastIndexOf('/');
                    if (slash == -1)
                    {
                        continue;
                    }

                    var fileName = imageSrc.Substring(slash + 1);
                    var dot      = fileName.LastIndexOf('.');
                    var ext      = dot != -1 ? fileName.Substring(dot) : ".jpg";
                    try
                    {
                        var pictureName = imageIndex.ToString() + ext;
                        // Rewrite the reference only after the download succeeded, so a failed
                        // image keeps its remote address and its number is not reused.
                        wc.DownloadFile(imageSrc, Path.Combine(advDirectory, pictureName));
                        content = content.Replace(imageSrc, Path.Combine(advName, pictureName));
                        imageIndex++;
                    }
                    catch (Exception ex)
                    {
                        General.WriteLog(ex.Message);
                    }
                }
            }
            return content;
        }

        /// <summary>
        /// Rewrites relative <c>href</c> values in <paramref name="content"/> into absolute
        /// https://www.farpost.ru/ addresses.
        /// </summary>
        /// <returns>The page markup with relative links made absolute.</returns>
        private static string MakeLinksAbsolute(HtmlAgilityPack.HtmlDocument doc, string content)
        {
            var linkNodes = doc.DocumentNode.SelectNodes("//*[@href]");
            if (linkNodes == null)
            {
                return content;
            }

            foreach (var node in linkNodes)
            {
                var value = node.Attributes["href"].Value;
                // Skip empty values, in-page anchors and anything that already carries a
                // scheme (http:, https:, mailto:, javascript:, ...).
                if (string.IsNullOrEmpty(value) || value.StartsWith("#", StringComparison.Ordinal) || value.Contains(":"))
                {
                    continue;
                }

                var absolute = value.StartsWith("//", StringComparison.Ordinal)
                    ? "https:" + value
                    : "https://www.farpost.ru/" + value.TrimStart('/');
                // Replace the quoted attribute value only; a bare replace of e.g. "/" would
                // rewrite every slash in the document.
                content = content.Replace('"' + value + '"', '"' + absolute + '"');
            }
            return content;
        }
예제 #2
0
        /// <summary>
        /// Writes the details of an unhandled exception to the application log.
        /// </summary>
        /// <param name="sender">Source of the event (not used).</param>
        /// <param name="e">Event data carrying the object that was thrown.</param>
        private static void UnhandledExceptionOccured(object sender, UnhandledExceptionEventArgs e)
        {
            // ExceptionObject is typed as object and is not guaranteed to derive from
            // Exception, so a hard cast could itself throw inside this last-chance handler.
            // For real exceptions the virtual ToString() yields the same text as before.
            var details = e.ExceptionObject?.ToString() ?? "Unhandled exception without details";

            General.WriteLog(details);
        }
예제 #3
0
        /// <summary>
        /// Loads a feed page and returns the advertisements listed on it.
        /// </summary>
        /// <param name="url">Address of the feed page to load.</param>
        /// <param name="saveTo">
        /// File that receives a copy of the page with its relative links prefixed by
        /// https://www.farpost.ru, or <c>null</c> to skip saving.
        /// </param>
        /// <returns>
        /// The advertisements parsed from both supported layouts (table rows first, then
        /// table cells). Items that fail to parse are logged and left out.
        /// </returns>
        public static List<Advertisement> GetDataFromFeed(string url, string saveTo)
        {
            var web = new HtmlWeb
            {
                AutoDetectEncoding = false,
                OverrideEncoding   = Encoding.Default,
                UserAgent          = General.DEFAULT_USER_AGENT
            };
            var doc = web.Load(url);

            var cityDom        = doc.DocumentNode.SelectNodes("//a[contains(@class, 'cityPop')]");
            var feedCity       = cityDom != null ? cityDom[0].InnerText.Trim() : string.Empty;
            // Feeds whose address contains "primorskii-krai" take the city from each item instead.
            var useAnnotations = url.Contains("primorskii-krai");

            if (saveTo != null)
            {
                var content = doc.DocumentNode.InnerHtml;
                var linx    = doc.DocumentNode.SelectNodes("//a[@href]");
                if (linx != null)
                {
                    foreach (var link in linx)
                    {
                        var val = link.Attributes["href"].Value.Trim();
                        if (!val.StartsWith("http://") &&
                            !val.StartsWith("https://"))
                        {
                            content = content.Replace('"' + val + '"', "\"https://www.farpost.ru" + val + '"');
                        }
                    }
                }
                File.WriteAllText(saveTo, content, Encoding.UTF8);
            }

            var result = new List<Advertisement>();
            CollectFeedItems(result,
                             doc.DocumentNode.SelectNodes("//tr[contains(@class,'bull-item') and not(@data-accuracy)]"),
                             true, useAnnotations, feedCity, saveTo);
            CollectFeedItems(result,
                             doc.DocumentNode.SelectNodes("//td[contains(@class,'bull-item') and not(@data-accuracy)]"),
                             false, useAnnotations, feedCity, saveTo);
            return result;
        }

        /// <summary>
        /// Parses every node of one layout and appends the successfully parsed items to
        /// <paramref name="result"/>; a failing item is logged and skipped.
        /// </summary>
        private static void CollectFeedItems(List<Advertisement> result, IEnumerable<HtmlAgilityPack.HtmlNode> nodes,
                                             bool isTableRow, bool useAnnotations, string feedCity, string saveTo)
        {
            if (nodes == null)
            {
                return;
            }

            foreach (var node in nodes)
            {
                try
                {
                    result.Add(ParseFeedItem(node, isTableRow, useAnnotations, feedCity, saveTo));
                }
                catch (Exception ex)
                {
                    General.WriteLog(ex.ToString());
                }
            }
        }

        /// <summary>
        /// Builds an <c>Advertisement</c> from a single feed item node.
        /// </summary>
        /// <param name="adv">The item node (a table row or a table cell).</param>
        /// <param name="isTableRow">
        /// <c>true</c> for the table-row layout. Compared with the cell layout it uses a
        /// different title path, strips the rouble sign from the price, logs the resolved
        /// city and records <paramref name="saveTo"/> in <c>CurLink</c>.
        /// NOTE(review): these differences come from the two original loops and may be
        /// accidental — confirm before unifying them.
        /// </param>
        /// <param name="useAnnotations">Whether the city is taken from the item itself.</param>
        /// <param name="feedCity">City read from the page header, used when the item has none.</param>
        /// <param name="saveTo">Path of the saved page copy, or <c>null</c>.</param>
        private static Advertisement ParseFeedItem(HtmlAgilityPack.HtmlNode adv, bool isTableRow, bool useAnnotations,
                                                   string feedCity, string saveTo)
        {
            var titleXPath = isTableRow
                ? ".//td[@class='descriptionCell']/a[@name and @data-stat]"
                : ".//div[@class='title']/a";
            var title = WebUtility.HtmlDecode(adv.SelectSingleNode(titleXPath).InnerText).Trim();

            var geo    = string.Empty;
            var geoDom = adv.SelectSingleNode(".//a[@data-geo]");
            if (geoDom != null)
            {
                geo = geoDom.Attributes["data-geo"].Value.Trim();
            }

            var views    = 0;
            var viewsDom = adv.SelectSingleNode(".//*[contains(@class,'views')]");
            if (viewsDom != null)
            {
                int.TryParse(viewsDom.InnerHtml, out views);
            }

            var number = adv.SelectSingleNode(".//a[@name!='']").Attributes["name"].Value.Trim();
            var link   = adv.SelectSingleNode(".//a[@href!='#']").Attributes["href"].Value.Trim();

            // The price is sometimes missing.
            var price    = "n/a";
            var priceDom = adv.SelectSingleNode(".//span[@data-role='price']");
            if (priceDom != null)
            {
                price = WebUtility.HtmlDecode(priceDom.InnerText).Trim();
                if (isTableRow)
                {
                    price = price.Replace("₽", "");
                }
            }

            // The area is sometimes missing as well.
            string[] annotations = null;
            var      square      = "n/a";
            var      squareDom   = adv.SelectSingleNode(".//div[contains(@class, 'annotation')]");
            if (squareDom != null)
            {
                annotations = squareDom.InnerText.Split(new string[] { ", " }, StringSplitOptions.RemoveEmptyEntries);
                foreach (var sep in annotations)
                {
                    if (sep.Contains("кв."))
                    {
                        square = WebUtility.HtmlDecode(sep).Trim();
                        break;
                    }
                }
            }

            var city = feedCity;
            if (useAnnotations)
            {
                var cityDivDom = adv.SelectSingleNode(".//div[contains(@class, 'city')]");
                if (cityDivDom != null)
                {
                    city = cityDivDom.InnerText.Trim();
                }
                else if (annotations != null && annotations.Length > 0)
                {
                    // Fall back to the last annotation, dropping everything up to and including "м..".
                    var altCity = annotations[annotations.Length - 1];
                    var idx     = altCity.IndexOf("м..", StringComparison.Ordinal);
                    if (idx != -1)
                    {
                        altCity = altCity.Remove(0, idx + 3);
                    }
                    city = altCity.Trim();
                }
                // Otherwise the item has neither a city block nor annotations: keep the feed
                // city rather than losing the advertisement to a null dereference.

                if (isTableRow)
                {
                    General.WriteLog(city);
                }
            }

            var item = new Advertisement()
            {
                Number      = number,
                Link        = "https://www.farpost.ru" + link,
                Title       = title,
                Price       = price,
                Square      = square,
                Geo         = geo,
                Type        = GetTypeByLink(link),
                Views       = views,
                Annotations = annotations,
                City        = city
            };
            if (isTableRow)
            {
                // Assigned separately so the cell layout keeps whatever default CurLink has.
                item.CurLink = saveTo;
            }
            return item;
        }