/// <summary>
/// Imports the articles linked from every listing page in <c>urls</c>. Each article is
/// stored as a "text" block through <c>db.SetBlock</c>, and an HTML index of the imported
/// articles is written to <c>d:\html.txt</c>.
/// </summary>
public void Download()
{
    // Longest title prefix copied into the block name. The previous code tested against
    // 180 but cut at 199, so titles of 181..198 characters threw
    // ArgumentOutOfRangeException from Substring.
    const int maxTitleLength = 180;

    StringBuilder sbHtml = new StringBuilder();
    foreach (var url in urls)
    {
        var page = wc.DownloadString(url);
        // Only the markup between the Panel1a_column1 and Panel1a_column2 divs is searched.
        page = Regex.Match(page, "<div class=\"Panel1a_column1\">([\\w\\W]+?)<div class=\"Panel1a_column2\">").Groups[1].Value;
        var mcLinks = Regex.Matches(page, "<li>\\s*<a href=\"([^\"]*?)\">([^<]*?)</a>\\s*<span class=\"listDate\">([^<]*?)</span");
        foreach (Match m in mcLinks)
        {
            var innerPage = wc.DownloadString("https://mjs.bg" + m.Groups[1].Value);
            innerPage = Regex.Match(innerPage, "<div class=\"Panel1a_column1\">([\\w\\W]+?)<div class=\"Panel1a_column2\">").Groups[1].Value;

            var mAttr = Regex.Match(innerPage, "<div class=\"lTitle\">([^<]+?)</div>\\s+<div class=\"lDate\">([^<]+?)</div>");
            var title = mAttr.Groups[1].Value;
            var date = mAttr.Groups[2].Value;
            var text = Regex.Match(innerPage, "<div class=\"lText\">([\\w\\W]+?)</div>\\s+<div class=\"clear\">").Groups[1].Value;

            // The same id is used as the block Url and in the generated index link.
            var id = Guid.NewGuid().ToString();
            BlockData bd = new BlockData()
            {
                Block = new JSBlock()
                {
                    BlockId = 0,
                    BlockTypeId = "text",
                    Name = "Интервю" + (title.Length > maxTitleLength ? title.Substring(0, maxTitleLength) : title),
                    RubricId = 1,
                    PortalPartId = "min",
                    Url = id,
                    Jsonvalues = JObject.FromObject(new
                    {
                        title = JObject.FromObject(new { bg = title }),
                        body = JObject.FromObject(new { bg = date + "<br/>" + text })
                    }).ToString()
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "6" }
                }
            };
            db.SetBlock(bd);
            sbHtml.AppendLine($"<p><a href=\"/home/index/{id}\">{title}</a><br/>{date}</p>");
        }

        // Rewritten after every listing page with everything accumulated so far.
        File.WriteAllText(@"d:\html.txt", sbHtml.ToString());
    }
}
/// <summary>
/// Reads the listing page at <c>/117</c>, follows every <c>/117/&lt;number&gt;/</c> link and
/// stores each article whose title/date/text markup is recognised as a "new" block
/// through <c>db.SetBlock</c>. Articles that do not match the expected markup are skipped.
/// </summary>
public void Download()
{
    const string siteRoot = "http://www.justice.government.bg";
    const string articlePattern = "<div class=\"lBorder\"></div>\\s*<div class=\"lTitle\">([^<]+)</div>\\s*<div class=\"lDate\">([^<]+)</div>\\s*<div class=\"lBorder\"></div>\\s*<div class=\"lText\">([\\w\\W]+?)</div>";

    string listing = wc.DownloadString(siteRoot + "/117");
    foreach (Match link in Regex.Matches(listing, "<a href=\"(/117/[0-9]+/)\""))
    {
        string articleHtml = wc.DownloadString(siteRoot + link.Groups[1].Value);
        Match article = Regex.Match(articleHtml, articlePattern);
        if (!article.Success)
        {
            continue;
        }

        string title = article.Groups[1].Value;
        string published = article.Groups[2].Value;
        string body = article.Groups[3].Value;

        var block = new JSBlock()
        {
            BlockId = 0,
            BlockTypeId = "new",
            // Block names are capped at 199 characters.
            Name = title.Length > 199 ? title.Substring(0, 199) : title,
            PortalPartId = "min",
            Url = Guid.NewGuid().ToString(),
            RubricId = 5,
            Jsonvalues = JObject.FromObject(new
            {
                title = JObject.FromObject(new { bg = title }),
                body = JObject.FromObject(new { bg = body })
            }).ToString()
        };

        var properties = new PropertyValue[]
        {
            new PropertyValue() { PropertyId = "header", Value = "6" },
            new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(published).ToString("yyyy-MM-dd") }
        };

        db.SetBlock(new BlockData() { Block = block, Values = properties });
    }
}
/// <summary>
/// Walks the paged listings in <c>lUrls</c> and stores every linked
/// <c>/bg/profil-na-kupuvacha/</c> page as a "pkmessage" block through <c>db.SetBlock</c>.
/// Images and <c>/media</c> links in the body are uploaded with <c>UploadBlob</c> and
/// rewritten; the attachments panel is removed from the body and stored as a file list.
/// Paging of a listing stops on an empty page or on the first link already present in
/// <c>hsDownloaded</c>.
/// </summary>
public void Download()
{
    foreach (var url in lUrls)
    {
        int nextPage = 1;
        bool keepPaging = true;
        do
        {
            string listing = DownloadString(url.Item1 + "?&page=" + nextPage.ToString());
            nextPage++;
            listing = Regex.Match(listing, "</header>([\\w\\W]+?)<footer").Groups[1].Value;

            MatchCollection links = Regex.Matches(listing, "<h2>\\s*<a href=\"(/bg/profil-na-kupuvacha/[^\"]+)\"");
            keepPaging = links.Count > 0;
            foreach (Match link in links)
            {
                string relativeUrl = link.Groups[1].Value;
                if (hsDownloaded.Contains(relativeUrl))
                {
                    // Already imported: stop paging this listing.
                    keepPaging = false;
                    break;
                }

                hsDownloaded.Add(relativeUrl);

                string html = this.DownloadString(relativeUrl);
                string title = Regex.Match(html, "<h1>([\\w\\W]+?)</h1>").Groups[1].Value;
                string body = Regex.Match(html, "</header>([\\w\\W]+?)<footer").Groups[1].Value;
                body = ClearPage(body);

                // The attachments panel is cut out of the body and parsed separately below.
                string attachmentsHtml = Regex.Match(body, "<div class=\"panel panel-default\"[\\w\\W]+?<div class=\"clearfix\">\\s*</div>").Value;
                if (!string.IsNullOrEmpty(attachmentsHtml))
                {
                    body = body.Replace(attachmentsHtml, "");
                }

                // Pairs of (original URL, replacement URL), applied once all uploads are done.
                var rewrites = new List<Tuple<string, string>>();
                foreach (Match img in Regex.Matches(body, "<img[\\w\\W]*?src=\"([^\"]+?)\""))
                {
                    string source = img.Groups[1].Value;
                    string target = "https://localhost:5001/api/part/GetBlob?hash=" + UploadBlob(source);
                    rewrites.Add(Tuple.Create(source, target));
                }

                foreach (Match anchor in Regex.Matches(body, "<a[\\w\\W]*?href=\"(/media[^\"]+?)\"[\\w\\W]*?>([\\w\\W]+?)</a>"))
                {
                    string source = anchor.Groups[1].Value;
                    // NOTE(review): relative prefix without "/api/", unlike the image prefix above - confirm this is intended.
                    string target = "part/GetBlob?hash=" + UploadBlob(source);
                    rewrites.Add(Tuple.Create(source, target));
                }

                foreach (var rewrite in rewrites)
                {
                    body = body.Replace(rewrite.Item1, rewrite.Item2);
                }

                // Flatten the div structure into line breaks and collapse repeated breaks.
                body = Regex.Replace(body, "<div[\\w\\W]*?>", "<br />");
                body = Regex.Replace(body, "</*div[\\w\\W]*?>", "");
                body = ReplaceWhileExists(body, "<br />\\s*<br />", "<br />");

                // The <time> element supplies the block date and is removed from the body.
                Match published = Regex.Match(body, "<time datetime=\"([\\w\\W]+?)\">[\\w\\W]+?</time>");
                body = body.Replace(published.Value, "");

                var files = new JArray();
                foreach (Match doc in Regex.Matches(attachmentsHtml, "<a href=\"(/media/[^\"]+?)\" target=\"_blank\">([^<]+?)</a>[\\w\\W]+?качено на ([0-9\\.]+?)</small"))
                {
                    string fileHash = UploadBlob(doc.Groups[1].Value);
                    files.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = doc.Groups[2].Value }),
                        fileType = "",
                        date = DateTime.Parse(doc.Groups[3].Value).ToString("yyyy-MM-dd"),
                        file = fileHash
                    }));
                }

                var block = new JSBlock()
                {
                    BlockId = 0,
                    BlockTypeId = "pkmessage",
                    Name = title.Length > 199 ? title.Substring(0, 199) : title,
                    PortalPartId = "av",
                    Url = Guid.NewGuid().ToString(),
                    RubricId = 1,
                    Jsonvalues = JObject.FromObject(new
                    {
                        title = JObject.FromObject(new { bg = title }),
                        type = url.Item2,
                        body = JObject.FromObject(new { bg = body }),
                        files = files
                    }).ToString()
                };

                var properties = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "8" },
                    new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(published.Groups[1].Value).ToString("yyyy-MM-dd") }
                };

                db.SetBlock(new BlockData() { Block = block, Values = properties });
            }
        } while (keepPaging);
    }
}
/// <summary>
/// Walks the paged news listing of www.registryagency.bg and stores every linked article
/// as a "new" block through <c>db.SetBlock</c>. Images and <c>/media</c> links in the body
/// are uploaded with <c>UploadBlob</c> and rewritten to blob URLs.
/// Paging stops on a listing page without article links or on the first link already
/// present in <c>hsDownloaded</c>.
/// </summary>
public void Download()
{
    var pageNo = 1;
    var found = true;
    do
    {
        var list = DownloadString("https://www.registryagency.bg/bg/prestsentar/novini/?page=" + pageNo.ToString());
        pageNo++;
        list = Regex.Match(list, "</header>([\\w\\W]+?)<footer").Groups[1].Value;
        var mcLinks = Regex.Matches(list, "<h2>\\s*<a href=\"(/bg/prestsentar/novini/[^\"]+)\"");

        // Fix: without this, a listing page with no links left `found` true and the loop
        // requested ever-higher page numbers forever.
        found = mcLinks.Count > 0;

        foreach (Match mLink in mcLinks)
        {
            if (hsDownloaded.Contains(mLink.Groups[1].Value))
            {
                // Already imported: stop paging.
                found = false;
                break;
            }

            hsDownloaded.Add(mLink.Groups[1].Value);

            var page = this.DownloadString(mLink.Groups[1].Value);
            var title = Regex.Match(page, "<h1>([\\w\\W]+?)</h1>").Groups[1].Value;
            page = Regex.Match(page, "</header>([\\w\\W]+?)<footer").Groups[1].Value;
            page = ClearPage(page);

            // Pairs of (original URL, replacement URL), applied once all uploads are done.
            List<Tuple<string, string>> lBlobs = new List<Tuple<string, string>>();
            var mb = Regex.Matches(page, "<img[\\w\\W]*?src=\"([^\"]+?)\"");
            foreach (Match m in mb)
            {
                string newUrl = "https://localhost:5001/api/part/GetBlob?hash=" + UploadBlob(m.Groups[1].Value);
                lBlobs.Add(new Tuple<string, string>(m.Groups[1].Value, newUrl));
            }

            mb = Regex.Matches(page, "<a[\\w\\W]*?href=\"(/media[^\"]+?)\"[\\w\\W]*?>([\\w\\W]+?)</a>");
            foreach (Match m in mb)
            {
                string newUrl = "/api/part/GetBlob?hash=" + UploadBlob(m.Groups[1].Value);
                lBlobs.Add(new Tuple<string, string>(m.Groups[1].Value, newUrl));
            }

            foreach (var ru in lBlobs)
            {
                page = page.Replace(ru.Item1, ru.Item2);
            }

            // Flatten the div structure into line breaks and collapse repeated breaks.
            page = Regex.Replace(page, "<div[\\w\\W]*?>", "<br />");
            page = Regex.Replace(page, "</*div[\\w\\W]*?>", "");
            page = ReplaceWhileExists(page, "<br />\\s*<br />", "<br />");

            // The <time> element supplies the block date and is removed from the body.
            var mDate = Regex.Match(page, "<time datetime=\"([\\w\\W]+?)\">[\\w\\W]+?</time>");
            page = page.Replace(mDate.Value, "");

            BlockData bd = new BlockData()
            {
                Block = new JSBlock()
                {
                    BlockId = 0,
                    BlockTypeId = "new",
                    Name = title.Length > 199 ? title.Substring(0, 199) : title,
                    RubricId = 1,
                    PortalPartId = "av",
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = JObject.FromObject(new
                    {
                        title = JObject.FromObject(new { bg = title }),
                        body = JObject.FromObject(new { bg = page })
                    }).ToString()
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "8" },
                    new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                }
            };
            db.SetBlock(bd);
        }
    } while (found);
}
/// <summary>
/// For every base URL in <c>urls</c>, walks the numbered listing pages of profile.mjs.bg,
/// opens each linked procedure page, stores its attached files through <c>db.AddBlob</c>
/// and saves the procedure as a "pkop" block through <c>db.SetBlock</c>.
/// Paging of a listing stops at the first page without procedure links.
/// </summary>
public void Download()
{
    foreach (var url in this.urls)
    {
        var found = false;
        var downloaded = 0;
        int pn = 0;
        do
        {
            var listPage = wc.DownloadString(url + pn);
            pn++;
            var mcLinks = Regex.Matches(listPage, "<a href=\"(/[a-z0-9]{32})\">");
            found = mcLinks.Count > 0;
            foreach (Match ml in mcLinks)
            {
                var page = wc.DownloadString("http://profile.mjs.bg" + ml.Groups[1].Value);
                var mTitle = Regex.Match(page, "<h3><i[\\w\\W]+?/i>([\\w\\W]+?)</h3>");
                var mDate = Regex.Match(page, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                var mEndDate = Regex.Match(page, "Краен срок за подаване на оферти или заявления за участие: ([\\w\\W]+?)</div>");
                var mProcType = Regex.Match(page, "Процедура: ([\\w\\W]+?)</div>");
                var mAOP = Regex.Match(page, "<a href=\"(http://[^\\.]+?\\.aop.bg[\\w\\W]+?)\"");
                var mStatus = Regex.Match(page, "Статус: ([\\w\\W]+?)</div>");
                // Fix: the text between the two <hr /> separators is now captured; the
                // pattern previously had no group, so Groups[1] (Subject) was always empty.
                var mText = Regex.Match(page, "<div class=\"clearfix\"><hr /></div>\\s*<div class=\"clearfix\">([\\w\\W]+?)</div>\\s*<div class=\"clearfix\"><hr /></div>");

                var op = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = mTitle.Groups[1].Value }),
                    // Fix: "HH" is the 24-hour clock. "hh" is 12-hour and, with no AM/PM
                    // designator, rendered afternoon deadlines as morning times.
                    enddate = DateTime.Parse(mEndDate.Groups[1].Value).ToString("yyyy-MM-dd HH:mm"),
                    proctype = JObject.FromObject(new { bg = mProcType.Groups[1].Value }),
                    AOPNum = mAOP.Groups[1].Value,
                    procstatus = JObject.FromObject(new { bg = mStatus.Groups[1].Value }),
                    Subject = JObject.FromObject(new { bg = mText.Groups[1].Value })
                });

                var mcFiles = Regex.Matches(page, "<li class=\"media well clearfix\"[\\w\\W]+?>\\s*<a class=\"pull-left\" href=\"(/file[\\w\\W]+?)\"[\\w\\W]+?</h4>\\s*([\\w\\W]+?)</div>");
                Console.WriteLine($"download: {++downloaded} files:{mcFiles.Count}");

                var jaFiles = new JArray();
                foreach (Match f in mcFiles)
                {
                    byte[] file;
                    try
                    {
                        file = wc.DownloadData("http://profile.mjs.bg" + f.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        // Best effort: a file that cannot be downloaded is logged and skipped.
                        Console.WriteLine(e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    // The upper-case hex MD5 of the content identifies the blob.
                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                    string filename = "filename=";
                    string fileName = "";
                    int index = header.LastIndexOf(filename, StringComparison.OrdinalIgnoreCase);
                    if (index > -1)
                    {
                        // Fix: the name is often sent as a quoted string (filename="x.pdf");
                        // strip surrounding whitespace and quotes so Extension and Filename
                        // do not carry a stray quote.
                        fileName = header.Substring(index + filename.Length).Trim().Trim('"');
                    }
                    else
                    {
                        // No name supplied by the server: generate one and assume PDF.
                        fileName = Guid.NewGuid().ToString() + ".pdf";
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    jaFiles.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                        fileType = "",
                        file = hash
                    }));
                }

                op["files"] = jaFiles;

                db.SetBlock(new BlockData()
                {
                    Block = new JSBlock()
                    {
                        BlockId = 0,
                        BlockTypeId = "pkop",
                        Name = mTitle.Groups[1].Value.Length > 199 ? mTitle.Groups[1].Value.Substring(0, 199) : mTitle.Groups[1].Value,
                        PortalPartId = "min",
                        Url = Guid.NewGuid().ToString(),
                        Jsonvalues = op.ToString()
                    },
                    Values = new PropertyValue[]
                    {
                        new PropertyValue() { PropertyId = "header", Value = "6" },
                        new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                    }
                });
            }
        } while (found);
    }
}
/// <summary>
/// Walks the news listing of www.nbpp.government.bg in steps of 20 and stores every linked
/// article as a "new" block through <c>db.SetBlock</c>. Only the article's
/// <c>&lt;p&gt;</c> elements are kept as the body; files linked from the body are stored
/// with <c>db.AddBlob</c> and their anchors rewritten to blob links.
/// Paging stops at the first listing page without article links.
/// </summary>
public void Download()
{
    const string siteRoot = "http://www.nbpp.government.bg";

    int start = 0;
    bool anyFound = false;
    do
    {
        string listing = wc.DownloadString($"http://www.nbpp.government.bg/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8?start={start}");
        start += 20;

        MatchCollection newsLinks = Regex.Matches(listing, "<h2>\\s+<a href=\"(/новини/[\\w\\W]+?)\"");
        anyFound = newsLinks.Count > 0;
        foreach (Match newsLink in newsLinks)
        {
            // Encode the path, then restore the encoded slashes.
            string encodedPath = HttpUtility.UrlEncode(newsLink.Groups[1].Value).Replace("%2f", "/");
            string articleHtml = wc.DownloadString(siteRoot + encodedPath);

            // Group 1: article content; group 2: footer text handed to GetDate.
            Match article = Regex.Match(articleHtml, "</h2>\\s*([\\w\\W]+?)<div class=\"articleInfoFooter\">([\\w\\W]+?)</div>");
            MatchCollection paragraphs = Regex.Matches(article.Groups[1].Value, "<p>[\\w\\W]*?</p>");
            string title = Regex.Match(articleHtml, "<h2>\\s*<a [\\w\\W]+?>\\s*([\\w\\W]+?)</a>").Groups[1].Value;

            var bodyBuilder = new StringBuilder();
            foreach (Match paragraph in paragraphs)
            {
                bodyBuilder.AppendLine(paragraph.Value);
            }

            string body = bodyBuilder.ToString();

            // Pairs of (original opening <a ...> tag, blob hash), applied after all downloads.
            var linkRewrites = new List<Tuple<string, string>>();
            foreach (Match anchor in Regex.Matches(body, "<a [\\w\\W]*?href=\"([\\w\\W]+?\\.[a-z]{3,4})\"[\\w\\W]*?>"))
            {
                string fileUrl = anchor.Groups[1].Value.Replace("../", "");
                bool absolute = fileUrl.Contains("http");
                if (!fileUrl.StartsWith("/") && !absolute)
                {
                    fileUrl = "/" + fileUrl;
                }

                byte[] content;
                try
                {
                    content = wc.DownloadData(absolute ? fileUrl : siteRoot + fileUrl);
                }
                catch (Exception e)
                {
                    // A file that cannot be fetched is logged and its link left untouched.
                    Console.WriteLine(e.Message);
                    continue;
                }

                // Upper-case hex MD5 of the content identifies the blob.
                string hash;
                using (var md5 = MD5.Create())
                {
                    hash = BitConverter.ToString(md5.ComputeHash(content)).Replace("-", "");
                }

                db.AddBlob(new Blob()
                {
                    Content = content,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(fileUrl),
                    Filename = Path.GetFileName(fileUrl),
                    Hash = hash
                });
                linkRewrites.Add(Tuple.Create(anchor.Value, hash));
            }

            foreach (var rewrite in linkRewrites)
            {
                body = body.Replace(rewrite.Item1, $"<a href='/api/part/getblob?hash={rewrite.Item2}'>");
            }

            var block = new JSBlock()
            {
                BlockId = 0,
                BlockTypeId = "new",
                Name = title.Length > 199 ? title.Substring(0, 199) : title,
                PortalPartId = "nbpp",
                RubricId = 6,
                Url = Guid.NewGuid().ToString(),
                Jsonvalues = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = title }),
                    body = JObject.FromObject(new { bg = body })
                }).ToString()
            };

            var properties = new PropertyValue[]
            {
                new PropertyValue() { PropertyId = "header", Value = "10" },
                new PropertyValue() { PropertyId = "date", Value = GetDate(article.Groups[2].Value) }
            };

            db.SetBlock(new BlockData() { Block = block, Values = properties });
        }
    } while (anyFound);
}
/// <summary>
/// Downloads each archive page in <c>lUrls</c>, splits it into entries
/// (title, date, optional text, linked <c>/Files</c> documents) and stores every entry as
/// a "pkmessage" block of type "Архив" through <c>db.SetBlock</c>. Linked files are
/// stored through <c>db.AddBlob</c>.
/// </summary>
public void Download()
{
    foreach (var url in lUrls)
    {
        string page = this.Download10Times(url.Item1);
        var mcOps = Regex.Matches(page, "<div class=\"lTitle\">[^<]*?</div>\\s*<div class=\"lDate\">[0-9\\.]+</div>[\\w\\W]*?<div class=\"clear\"></div>");
        foreach (Match mOp in mcOps)
        {
            string pageOP = mOp.Value;
            var mTitle = Regex.Match(pageOP, "<div class=\"lTitle\">([^<]*?)<");
            var mDate = Regex.Match(pageOP, "<div class=\"lDate\">([^<]*?)<");
            var mText = Regex.Match(pageOP, "<div class=\"lText\">([\\w\\W]*?)</div>");

            // The second tuple item is prepended to the entry text.
            string text = mText.Success ? mText.Groups[1].Value : "";
            text = url.Item2 + "<br/>" + text;

            var op = JObject.FromObject(new
            {
                title = JObject.FromObject(new { bg = mTitle.Groups[1].Value }),
                type = "Архив",
                body = JObject.FromObject(new { bg = text }),
            });

            var mcFiles = Regex.Matches(pageOP, "<a href=\"(/Files[^\"]+?)\"[^>]*?>([\\w\\W]+?)</a>");
            var jaFiles = new JArray();
            foreach (Match f in mcFiles)
            {
                byte[] file;
                try
                {
                    file = wc.DownloadData("http://mjs.bg" + f.Groups[1].Value);
                }
                catch (Exception e)
                {
                    // Best effort: a file that cannot be downloaded is logged and skipped.
                    Console.WriteLine(e.Message);
                    continue;
                }

                // The upper-case hex MD5 of the content identifies the blob.
                string hash;
                using (var md5 = MD5.Create())
                {
                    hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                }

                string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                string filename = "filename=";
                string fileName = "";
                int index = header.LastIndexOf(filename, StringComparison.OrdinalIgnoreCase);
                if (index > -1)
                {
                    // Fix: the name is often sent as a quoted string (filename="x.pdf");
                    // strip surrounding whitespace and quotes so Extension and Filename
                    // do not carry a stray quote.
                    fileName = header.Substring(index + filename.Length).Trim().Trim('"');
                }
                else
                {
                    // No name supplied by the server: fall back to the name in the link.
                    fileName = Path.GetFileName(f.Groups[1].Value);
                }

                Blob b = new Blob()
                {
                    Content = file,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(fileName),
                    Filename = Path.GetFileName(fileName),
                    Hash = hash
                };
                db.AddBlob(b);

                jaFiles.Add(JObject.FromObject(new
                {
                    id = Guid.NewGuid().ToString(),
                    title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                    fileType = "",
                    // The file pattern has only two groups, so the previous f.Groups[3]
                    // lookup always produced an empty string; that value is kept, stated
                    // explicitly.
                    // NOTE(review): decide whether files should inherit the entry date.
                    date = string.Empty,
                    file = hash
                }));
            }

            op["files"] = jaFiles;

            db.SetBlock(new BlockData()
            {
                Block = new JSBlock()
                {
                    BlockId = 0,
                    RubricId = 5,
                    BlockTypeId = "pkmessage",
                    Name = mTitle.Groups[1].Value.Length > 199 ? mTitle.Groups[1].Value.Substring(0, 199) : mTitle.Groups[1].Value,
                    PortalPartId = "min",
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = op.ToString()
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "6" },
                    new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                }
            });
        }
    }
}
/// <summary>
/// Crawls pages starting from the URLs listed in <c>data\urls.txt</c>. Each page is
/// cleaned, its images, <c>/media</c> links and videos are uploaded with
/// <c>UploadBlob</c>, its internal <c>/bg/</c> links are rewritten to
/// <c>/home/index/&lt;id&gt;</c>, and the result is stored as a "text" block through
/// <c>db.SetBlock</c>. Newly discovered <c>/bg/</c> links are queued in <c>dUrls</c>
/// unless <c>IsNo</c> rejects them; the loop runs until every queued URL is downloaded.
/// </summary>
public void Download()
{
    foreach (var u in File.ReadAllLines(@"data\urls.txt"))
    {
        dUrls.Add(u, new UrlInfo() { Id = Guid.NewGuid().ToString(), Downloaded = false });
    }

    lNoUrls = File.ReadAllLines(@"data\nourls.txt").ToList();

    while (dUrls.Any(x => !x.Value.Downloaded))
    {
        var url = dUrls.First(x => !x.Value.Downloaded);
        var page = this.DownloadString(url.Key);
        var title = Regex.Match(page, "<h1>([\\w\\W]+?)</h1>").Groups[1].Value;
        page = Regex.Match(page, "</header>([\\w\\W]+?)<footer").Groups[1].Value;

        // Strip navigation, breadcrumbs, side content, figures, file-type icons and the share widget.
        page = ReplaceWhileExists(page, "<nav[\\w\\W]+?</nav>", "");
        page = ReplaceWhileExists(page, "<ol class=\"breadcrumb\">[\\w\\W]+?</ol>", "");
        page = ReplaceWhileExists(page, "<ul class=\"list-inline aside-links\">[\\w\\W]+?</ul><!--end-->", "");
        page = ReplaceWhileExists(page, "<aside[\\w\\W]+?</aside>", "");
        page = ReplaceWhileExists(page, "<figure[\\w\\W]+?</figure>", "");
        page = page.Replace("<img src=\"/static/images/icons/pdf.svg\" alt=\"pdf document\">", "");
        page = page.Replace("<img src=\"/static/images/icons/doc.svg\" alt=\"doc document\">", "");
        page = ReplaceWhileExists(page, "<span>\\s*<strong>Сподели</strong>[\\w\\W]+?</ul>", "");

        // Pairs of (original URL, replacement URL), applied once all uploads are done.
        List<Tuple<string, string>> lBlobs = new List<Tuple<string, string>>();
        var mb = Regex.Matches(page, "<img[\\w\\W]*?src=\"([^\"]+?)\"");
        foreach (Match m in mb)
        {
            string newUrl = "https://localhost:5001/api/part/GetBlob?hash=" + UploadBlob(m.Groups[1].Value);
            lBlobs.Add(new Tuple<string, string>(m.Groups[1].Value, newUrl));
        }

        mb = Regex.Matches(page, "<a[\\w\\W]*?href=\"(/media[^\"]+?)\"[\\w\\W]*?>([\\w\\W]+?)</a>");
        foreach (Match m in mb)
        {
            string newUrl = "/api/part/GetBlob?hash=" + UploadBlob(m.Groups[1].Value);
            lBlobs.Add(new Tuple<string, string>(m.Groups[1].Value, newUrl));
        }

        foreach (var ru in lBlobs)
        {
            page = page.Replace(ru.Item1, ru.Item2);
        }

        var mLinks = Regex.Matches(page, "<a[\\w\\W]*?href=\"(/bg/[^\"]+?)\"[\\w\\W]*?>([\\w\\W]+?)</a>");
        foreach (Match ml in mLinks)
        {
            string pUrl = ml.Groups[1].Value;
            if (IsNo(pUrl))
            {
                continue;
            }

            if (!dUrls.ContainsKey(pUrl))
            {
                // First sighting: queue the page for download under a fresh id.
                dUrls.Add(pUrl, new UrlInfo() { Id = Guid.NewGuid().ToString(), Downloaded = false });
            }

            // Fix: always replace the complete href attribute. The known-URL branch used
            // to replace the bare URL text, which also rewrote the beginning of longer
            // URLs sharing that prefix (e.g. "/bg/a" inside "/bg/a/b") and broke them.
            page = page.Replace("href=\"" + pUrl + "\"", "href=\"" + "/home/index/" + dUrls[pUrl].Id + "\"");
        }

        // Flatten the div structure into line breaks and collapse repeated breaks.
        page = Regex.Replace(page, "<div[\\w\\W]*?>", "<br />");
        page = Regex.Replace(page, "</*div[\\w\\W]*?>", "");
        page = ReplaceWhileExists(page, "<br />\\s*<br />", "<br />");

        // Videos are moved out of the body into a separate "others" field.
        string others = "";
        var mcVideos = Regex.Matches(page, "<video[\\w\\W]+?</video>");
        foreach (Match mv in mcVideos)
        {
            var vsrc = Regex.Match(mv.Value, "src=\"([^\"]+?)\"").Groups[1].Value;
            string newUrl = "/api/part/GetBlob?hash=" + UploadBlob(vsrc);
            others += mv.Value.Replace(vsrc, newUrl) + "<br/>";
            page = page.Replace(mv.Value, "");
        }

        BlockData bd = new BlockData()
        {
            Block = new JSBlock()
            {
                BlockId = 0,
                BlockTypeId = "text",
                Name = title.Length > 199 ? title.Substring(0, 199) : title,
                PortalPartId = "av",
                // The id reserved for this URL, so links rewritten on other pages resolve to this block.
                Url = url.Value.Id,
                Jsonvalues = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = title }),
                    body = JObject.FromObject(new { bg = page }),
                    others = others
                }).ToString()
            },
            Values = new PropertyValue[]
            {
                new PropertyValue() { PropertyId = "header", Value = "8" }
            }
        };
        db.SetBlock(bd);
        dUrls[url.Key].Downloaded = true;
    }
}
/// <summary>
/// Walks the news listing of www.gdin.bg in steps of 6 and stores every linked article
/// through <c>db.SetBlock</c> as a "new" block when its category is "Новина", otherwise
/// as an "ad" block. The lead image and the images inside the body are stored with
/// <c>db.AddBlob</c>. An article whose lead image cannot be downloaded is skipped.
/// Paging stops at the first listing page without article links.
/// </summary>
public void Download()
{
    int offset = 0;
    bool anyFound = false;
    do
    {
        string listing = wc.DownloadString($"https://www.gdin.bg/news/{offset}");
        offset += 6;

        MatchCollection newsLinks = Regex.Matches(listing, "<a href=\"(https://www.gdin.bg/news[^\"]+?)\">...виж още</a>");
        anyFound = newsLinks.Count > 0;
        foreach (Match newsLink in newsLinks)
        {
            string article = wc.DownloadString(newsLink.Groups[1].Value);

            string title = Regex.Match(article, "<div class=\"page-title\">\\s+<h1>([\\w\\W]+?)</h1>").Groups[1].Value;
            // NOTE(review): the replaced literal is a single whitespace character; confirm
            // in the original file whether it is a plain space or a non-breaking space.
            title = Regex.Replace(title, "<[^>]+>", "").Replace(" ", "");

            article = Regex.Match(article, "<div class=\"details\">([\\w\\W]+?)<div class=\"right-sidebar\">").Groups[1].Value;
            string published = Regex.Match(article, "datetime=\"([\\w\\W]+?) ").Groups[1].Value;
            string category = Regex.Match(article, "<span class=\"category\">([\\w\\W]+?)</span>").Groups[1].Value;

            Match leadImage = Regex.Match(article, "<div class=\"col-md-7 col-sm-7 col-xs-12\">[\\w\\W]+?src=\"([\\w\\W]+?)\"[\\w\\W]+?</div>");
            string leadImageHash = null;
            if (leadImage.Success)
            {
                string leadUrl = leadImage.Groups[1].Value;
                byte[] leadBytes;
                try
                {
                    leadBytes = wc.DownloadData(leadUrl);
                }
                catch (Exception e)
                {
                    // The whole article is skipped when its lead image fails to download.
                    Console.WriteLine(e.Message);
                    continue;
                }

                string leadHash;
                using (var md5 = MD5.Create())
                {
                    leadHash = BitConverter.ToString(md5.ComputeHash(leadBytes)).Replace("-", "");
                }

                db.AddBlob(new Blob()
                {
                    Content = leadBytes,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(leadUrl),
                    Filename = Path.GetFileName(leadUrl),
                    Hash = leadHash
                });
                leadImageHash = leadHash;
            }

            // The body is rebuilt from the <p> elements only, then stripped of div and anchor tags.
            var bodyBuilder = new StringBuilder();
            foreach (Match paragraph in Regex.Matches(article, "<p[\\w\\W]*?>([\\w\\W]+?)</p>"))
            {
                bodyBuilder.AppendLine(paragraph.Value);
            }

            string body = bodyBuilder.ToString();
            body = Regex.Replace(body, "<[/]*div[\\w\\W]*?>", "");
            body = Regex.Replace(body, "<[/]*a[\\w\\W]*?>", "");

            // Pairs of (original <img ...> tag, blob hash), applied after all downloads.
            var imageRewrites = new List<Tuple<string, string>>();
            foreach (Match img in Regex.Matches(body, "<img [\\w\\W]*?src=\"([\\w\\W]+?)\"[\\w\\W]*?>"))
            {
                string imageUrl = img.Groups[1].Value;
                byte[] imageBytes;
                try
                {
                    imageBytes = wc.DownloadData("https://www.gdin.bg" + imageUrl);
                }
                catch (Exception e)
                {
                    // A body image that cannot be fetched is logged and left as it is.
                    Console.WriteLine(e.Message);
                    continue;
                }

                string imageHash;
                using (var md5 = MD5.Create())
                {
                    imageHash = BitConverter.ToString(md5.ComputeHash(imageBytes)).Replace("-", "");
                }

                db.AddBlob(new Blob()
                {
                    Content = imageBytes,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(imageUrl),
                    Filename = Path.GetFileName(imageUrl),
                    Hash = imageHash
                });
                imageRewrites.Add(Tuple.Create(img.Value, imageHash));
            }

            foreach (var rewrite in imageRewrites)
            {
                body = body.Replace(rewrite.Item1, $"<img alt='' src='https://localhost:5001/api/part/getblob?hash={rewrite.Item2}'>");
            }

            var block = new JSBlock()
            {
                BlockId = 0,
                BlockTypeId = category == "Новина" ? "new" : "ad",
                Name = title.Length > 199 ? title.Substring(0, 199) : title,
                PortalPartId = "gdin",
                RubricId = 3,
                Url = Guid.NewGuid().ToString(),
                Jsonvalues = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = title }),
                    body = JObject.FromObject(new { bg = body }),
                    imageId = leadImageHash
                }).ToString()
            };

            var properties = new PropertyValue[]
            {
                new PropertyValue() { PropertyId = "header", Value = "6" },
                new PropertyValue() { PropertyId = "date", Value = published }
            };

            db.SetBlock(new BlockData() { Block = block, Values = properties });
        }
    } while (anyFound);
}
/// <summary>
/// For every year from 2014 to the current year and every listing in <c>lUrls</c>, walks
/// the paged listing of profile.gdin.bg, opens each linked record, stores its attached
/// files through <c>db.AddBlob</c> and saves the record as a "pkmessage" block through
/// <c>db.SetBlock</c>. Paging of a listing stops at the first page without record links.
/// </summary>
public void Download()
{
    // The Set-Cookie value of the first response is sent back on later requests.
    wc.DownloadString("http://profile.gdin.bg/");
    string cookies = wc.ResponseHeaders["Set-Cookie"];
    wc.Headers.Add(HttpRequestHeader.Cookie, cookies);

    for (int year = 2014; year <= DateTime.Now.Year; year++)
    {
        // Requested before the listings of each year; the response body is not used.
        // NOTE(review): presumably selects the year server-side for the session - confirm.
        wc.DownloadString("http://profile.gdin.bg/?year=" + year);

        foreach (var url in lUrls)
        {
            var page = 0;
            var found = false;
            do
            {
                string pageLinks = this.Download10Times(url.Item1 + "&page=" + page.ToString(), cookies);
                page++;
                var mcLinks = Regex.Matches(pageLinks, "<h6>\\s*<a href=\"(/[^\"]{32})\"");
                found = mcLinks.Count > 0;
                foreach (Match lnk in mcLinks)
                {
                    string pageOP = this.Download10Times("http://profile.gdin.bg" + lnk.Groups[1].Value, cookies);
                    var mTitle = Regex.Match(pageOP, "<h4><i[\\w\\W]+?/i>([\\w\\W]+?)</h4>");
                    var mDate = Regex.Match(pageOP, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                    var mText = Regex.Match(pageOP, "<div class=\"page-header\">[\\w\\W]+?</div>[\\w\\W]*?>([\\w\\W]+?)<hr />");

                    // Closing divs become line breaks; remaining div and i tags are dropped.
                    string text = mText.Groups[1].Value;
                    text = text.Replace("</div>", "<br />");
                    text = Regex.Replace(text, "<[/]*div[\\w\\W]*?>", "");
                    text = Regex.Replace(text, "<[/]*i[\\w\\W]*?>", "");

                    var op = JObject.FromObject(new
                    {
                        title = JObject.FromObject(new { bg = mTitle.Groups[1].Value }),
                        type = url.Item2,
                        body = JObject.FromObject(new { bg = text }),
                    });

                    var mcFiles = Regex.Matches(pageOP, "<a href=\"(/file[^\"]+?)\"[^>]+?>([\\w\\W]+?)</a>\\s*<small>([^<]+?)</small>");
                    var jaFiles = new JArray();
                    foreach (Match f in mcFiles)
                    {
                        byte[] file;
                        try
                        {
                            file = wc.DownloadData("http://profile.gdin.bg" + f.Groups[1].Value);
                        }
                        catch (Exception e)
                        {
                            // Best effort: a file that cannot be downloaded is logged and skipped.
                            Console.WriteLine(e.Message);
                            continue;
                        }

                        // Pause between file downloads.
                        System.Threading.Thread.Sleep(1000);

                        // The upper-case hex MD5 of the content identifies the blob.
                        string hash;
                        using (var md5 = MD5.Create())
                        {
                            hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                        }

                        string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                        string filename = "filename=";
                        string fileName = "";
                        int index = header.LastIndexOf(filename, StringComparison.OrdinalIgnoreCase);
                        if (index > -1)
                        {
                            // Fix: the name is often sent as a quoted string
                            // (filename="x.pdf"); strip surrounding whitespace and quotes so
                            // Extension and Filename do not carry a stray quote.
                            fileName = header.Substring(index + filename.Length).Trim().Trim('"');
                        }
                        else
                        {
                            // No name supplied by the server: generate one and assume PDF.
                            fileName = Guid.NewGuid().ToString() + ".pdf";
                        }

                        Blob b = new Blob()
                        {
                            Content = file,
                            ContentType = "application/octet-stream",
                            Extension = Path.GetExtension(fileName),
                            Filename = Path.GetFileName(fileName),
                            Hash = hash
                        };
                        db.AddBlob(b);

                        jaFiles.Add(JObject.FromObject(new
                        {
                            id = Guid.NewGuid().ToString(),
                            title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                            fileType = "",
                            // Raw text of the <small> element following the link.
                            date = f.Groups[3].Value,
                            file = hash
                        }));
                    }

                    op["files"] = jaFiles;

                    db.SetBlock(new BlockData()
                    {
                        Block = new JSBlock()
                        {
                            BlockId = 0,
                            RubricId = 3,
                            BlockTypeId = "pkmessage",
                            Name = mTitle.Groups[1].Value.Length > 199 ? mTitle.Groups[1].Value.Substring(0, 199) : mTitle.Groups[1].Value,
                            PortalPartId = "gdin",
                            Url = Guid.NewGuid().ToString(),
                            Jsonvalues = op.ToString()
                        },
                        Values = new PropertyValue[]
                        {
                            new PropertyValue() { PropertyId = "header", Value = "11" },
                            new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                        }
                    });
                }
            } while (found);
        }
    }
}
/// <summary>
/// For every listing in <c>urls</c>, walks the paged listing of www.gdo.bg, opens each
/// linked item, stores its <c>/Uploads</c> documents through <c>db.AddBlob</c> and saves
/// the item as a "pkmessage" block through <c>db.SetBlock</c>.
/// Paging of a listing stops at the first page without item links.
/// </summary>
public void Download()
{
    foreach (var url in urls)
    {
        var page = 1;
        var found = false;
        do
        {
            string pageLinks = this.Download10Times(url.Item1 + "?page=" + page.ToString());
            page++;
            // Group 1: item link; group 2: title; group 3: published date text.
            var mcLinks = Regex.Matches(pageLinks, "<div class=\"item-title\">\\s*<a href=\"([^\"]+?)\">([^<]+?)</a>\\s+</div>\\s+<div class=\"item-published\">([^<]+?)<");
            found = mcLinks.Count > 0;
            foreach (Match lnk in mcLinks)
            {
                var title = lnk.Groups[2].Value;
                var date = lnk.Groups[3].Value.Trim();
                string pageOP = this.Download10Times("http://www.gdo.bg" + lnk.Groups[1].Value);

                // Label divs become line breaks; remaining div and i tags are dropped.
                string text = Regex.Match(pageOP, "</h2>([\\w\\W]+?)<div class=\"item-docs\">").Groups[1].Value;
                text = text.Replace("<div class=\"display-label\">", "<br />");
                text = Regex.Replace(text, "<[/]*div[\\w\\W]*?>", "");
                text = Regex.Replace(text, "<[/]*i[\\w\\W]*?>", "");

                var op = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = lnk.Groups[2].Value }),
                    type = url.Item2,
                    body = JObject.FromObject(new { bg = text }),
                });

                var mcFiles = Regex.Matches(pageOP, "<a href=\"(/Uploads[^\"]+?)\">([\\w\\W]+?)</a>[\\w\\W]+?<td class=\"publ\">([0-9\\.]+?)</td>");
                var jaFiles = new JArray();
                foreach (Match f in mcFiles)
                {
                    byte[] file;
                    try
                    {
                        file = wc.DownloadData("http://www.gdo.bg" + f.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        // Best effort: a file that cannot be downloaded is logged and skipped.
                        Console.WriteLine(e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    // The upper-case hex MD5 of the content identifies the blob.
                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                    string filename = "filename=";
                    string fileName = "";
                    int index = header.LastIndexOf(filename, StringComparison.OrdinalIgnoreCase);
                    if (index > -1)
                    {
                        // Fix: the name is often sent as a quoted string (filename="x.pdf");
                        // strip surrounding whitespace and quotes so Extension and Filename
                        // do not carry a stray quote.
                        fileName = header.Substring(index + filename.Length).Trim().Trim('"');
                    }
                    else
                    {
                        // No name supplied by the server: generate one, keeping the link's extension.
                        fileName = Guid.NewGuid().ToString() + Path.GetExtension(f.Groups[1].Value);
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    jaFiles.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                        fileType = "",
                        date = DateTime.Parse(f.Groups[3].Value).ToString("yyyy-MM-dd"),
                        file = hash
                    }));
                }

                op["files"] = jaFiles;

                db.SetBlock(new BlockData()
                {
                    Block = new JSBlock()
                    {
                        BlockId = 0,
                        RubricId = 4,
                        BlockTypeId = "pkmessage",
                        Name = title.Length > 199 ? title.Substring(0, 199) : title,
                        PortalPartId = "gdo",
                        Url = Guid.NewGuid().ToString(),
                        Jsonvalues = op.ToString()
                    },
                    Values = new PropertyValue[]
                    {
                        new PropertyValue() { PropertyId = "header", Value = "12" },
                        new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(date).ToString("yyyy-MM-dd") }
                    }
                });
            }
        } while (found);
    }
}
/// <summary>
/// For every listing in <c>lUrls</c>, walks the paged listing of profile.mjs.bg, opens
/// each linked record, stores its attached files through <c>db.AddBlob</c> and saves the
/// record as a "pkmessage" block through <c>db.SetBlock</c>. The record type defaults to
/// the listing's second tuple item and is overridden for two listing labels.
/// Paging of a listing stops at the first page without record links.
/// </summary>
public void Download()
{
    const string siteRoot = "https://profile.mjs.bg";
    const string combinedType = "Събиране на оферти с обява или покана до определени лица";

    foreach (var url in lUrls)
    {
        int pageIndex = 0;
        bool anyFound = false;
        do
        {
            string listing = this.Download10Times(url.Item1 + "&page=" + pageIndex.ToString());
            pageIndex++;

            // Group 1: record link; group 2: text of the <p> following the heading.
            MatchCollection records = Regex.Matches(listing, "<h6>\\s*<a href=\"(/[^\"]{32})\"[\\w\\W]+?</h6>\\s*<p>([^<]+?)</p>");
            anyFound = records.Count > 0;
            foreach (Match record in records)
            {
                string recordUrl = siteRoot + record.Groups[1].Value;
                string recordHtml = this.Download10Times(recordUrl);

                Match titleMatch = Regex.Match(recordHtml, "<h3><i[\\w\\W]+?/i>([\\w\\W]+?)</h3>");
                Match createdMatch = Regex.Match(recordHtml, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                Match textMatch = Regex.Match(recordHtml, "<div class=\"page-header\">[\\w\\W]+?</div>[\\w\\W]*?>([\\w\\W]+?)<hr />");

                // Closing divs become line breaks; remaining div and i tags are dropped.
                string body = textMatch.Groups[1].Value.Replace("</div>", "<br />");
                body = Regex.Replace(body, "<[/]*div[\\w\\W]*?>", "");
                body = Regex.Replace(body, "<[/]*i[\\w\\W]*?>", "");

                // Both listing labels map to the same combined type.
                string label = record.Groups[2].Value;
                var pkType = url.Item2;
                if (label == "Покана до определени лица" || label == "Събиране на оферти с обява")
                {
                    pkType = combinedType;
                }

                var op = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = titleMatch.Groups[1].Value }),
                    type = pkType,
                    body = JObject.FromObject(new { bg = body }),
                });

                var files = new JArray();
                foreach (Match attachment in Regex.Matches(recordHtml, "<a href=\"(/file[^\"]+?)\"[^>]+?>([\\w\\W]+?)</a>\\s*<small>([^<]+?)</small>"))
                {
                    byte[] content;
                    try
                    {
                        content = this.DownloadFile(siteRoot + attachment.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        // The record URL is logged with the error; the file is skipped.
                        Console.WriteLine(recordUrl + " " + e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    // Upper-case hex MD5 of the content identifies the blob.
                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = BitConverter.ToString(md5.ComputeHash(content)).Replace("-", "");
                    }

                    // NOTE(review): DownloadFilename is presumably set by DownloadFile for the last download - confirm.
                    var fileName = DownloadFilename;
                    db.AddBlob(new Blob()
                    {
                        Content = content,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    });

                    files.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = attachment.Groups[2].Value }),
                        fileType = "",
                        date = attachment.Groups[3].Value,
                        file = hash
                    }));
                }

                op["files"] = files;

                string name = titleMatch.Groups[1].Value;
                var block = new JSBlock()
                {
                    BlockId = 0,
                    RubricId = 5,
                    BlockTypeId = "pkmessage",
                    Name = name.Length > 199 ? name.Substring(0, 199) : name,
                    PortalPartId = "min",
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = op.ToString()
                };

                var properties = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "6" },
                    new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(createdMatch.Groups[1].Value).ToString("yyyy-MM-dd") }
                };

                db.SetBlock(new BlockData() { Block = block, Values = properties });
            }
        } while (anyFound);
    }
}