/// <summary>
/// Walks a gdo.bg page tree starting at <paramref name="url"/>. A page that links to
/// <c>/Uploads</c> files has every file downloaded, stored through <c>db.AddBlob</c> and
/// described by a record appended to <paramref name="result"/>. A page without such links is
/// treated as an index page and every anchor on it is followed recursively.
/// </summary>
/// <param name="path">Slash-separated captions of the links followed so far ("" at the root).</param>
/// <param name="url">Absolute URL of the page to process.</param>
/// <param name="result">Array that receives one JSON record per stored file.</param>
private void DownloadRecurs(string path, string url, JArray result)
{
    var page = wc.DownloadString(url);
    // Only the markup between the page heading and the footer marker is scanned.
    page = Regex.Match(page, "<h2>([\\w\\W]+?)</h2>([\\w\\W]+?)<!-- FOOTER -->").Groups[2].Value;

    var mcFiles = Regex.Matches(page, "<a\\s*href=\"(/Uploads[^\"]+?)\"[^>]*?>([^<]+)<");
    if (mcFiles.Count == 0)
    {
        // NOTE(review): nothing here guards against link cycles or absolute hrefs;
        // every anchor is prefixed with the site root and followed.
        var mcLinks = Regex.Matches(page, "<a\\s*href=\"([^\"]+?)\"[^>]*?>([^<]+)<");
        foreach (Match l in mcLinks)
        {
            var caption = l.Groups[2].Value;
            var newPath = path != "" ? path + "/" + caption : caption;
            DownloadRecurs(newPath, "http://www.gdo.bg" + l.Groups[1].Value, result);
        }
        return;
    }

    foreach (Match f in mcFiles)
    {
        byte[] file;
        try
        {
            file = wc.DownloadData("http://www.gdo.bg" + f.Groups[1].Value);
        }
        catch (Exception e)
        {
            // Still best effort, but report the failure instead of hiding it.
            Console.WriteLine(e.Message);
            continue;
        }

        string hash;
        using (var md5 = MD5.Create())
        {
            hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
        }

        Blob b = new Blob()
        {
            Content = file,
            ContentType = "application/octet-stream",
            Extension = Path.GetExtension(f.Groups[1].Value),
            Filename = Path.GetFileName(f.Groups[1].Value),
            Hash = hash
        };
        db.AddBlob(b);

        // Record: generated id, section path, link caption, blob hash.
        JObject rec = new JObject();
        rec["id"] = Guid.NewGuid().ToString();
        rec["781254eb-344b-408f-9646-c14fb8c954c9"] = path;
        rec["fd94572c-5036-48fc-87db-7d563ee1a424"] = f.Groups[2].Value;
        rec["a72ab72b-3459-4d71-8944-51983e350ca8"] = hash;
        result.Add(rec);
    }
}
/// <summary>
/// Downloads <paramref name="url"/> and stores the bytes as a blob through <c>db.AddBlob</c>.
/// </summary>
/// <param name="url">Address of the file; the blob's extension and file name are derived from it.</param>
/// <returns>Upper-case hex MD5 of the content, or <c>null</c> when the download throws.</returns>
private string UploadBlob(string url)
{
    byte[] payload;
    try
    {
        payload = this.DownloadData(url);
    }
    catch
    {
        // Any download failure is reported to the caller as null.
        return null;
    }

    string digest;
    using (var hasher = MD5.Create())
    {
        var raw = hasher.ComputeHash(payload);
        digest = string.Join("", raw.Select(x => x.ToString("X2")));
    }

    var blob = new Blob
    {
        Content = payload,
        ContentType = "application/octet-stream",
        Extension = Path.GetExtension(url),
        Filename = Path.GetFileName(url),
        Hash = digest
    };
    db.AddBlob(blob);

    return digest;
}
/// <summary>
/// Scrapes the document list at http://www.mjs.bg/38/, stores every linked file as a blob
/// and writes one JSON record per document (title, date, blob hash) to d:\11.json.
/// </summary>
public void Download()
{
    JArray result = new JArray();
    var page = wc.DownloadString("http://www.mjs.bg/38/");

    // Groups: 1 = title, 2 = date text, 3 = href of the first link after the date.
    var mcParts = Regex.Matches(page, "<div class=\"DocumentContainer\">\\s*<div class=\"Title\">([\\w\\W]+?)</div>\\s*<div class=\"Date\">([\\w\\W]+?)</div>[\\w\\W]+?<a href=\"([\\w\\W]+?)\"");
    foreach (Match m in mcParts)
    {
        var href = m.Groups[3].Value;
        byte[] file = wc.DownloadData("http://www.mjs.bg/" + href);

        string hash;
        using (var md5 = MD5.Create())
        {
            hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
        }

        Blob b = new Blob()
        {
            Content = file,
            ContentType = "application/octet-stream",
            // Name the blob after the downloaded file. The title (group 1) is free text and
            // would yield a bogus extension such as ".2016" for a title like "24.04.2016".
            Extension = Path.GetExtension(href),
            Filename = Path.GetFileName(href),
            Hash = hash
        };
        db.AddBlob(b);

        JObject rec = new JObject();
        rec["id"] = Guid.NewGuid().ToString();
        rec["49133545-2eae-44bb-9dc3-7bc6c8df088d"] = JObject.FromObject(new { bg = m.Groups[1].Value });
        rec["d9fe17ec-3f45-4cd0-9fd3-d0f28c713939"] = DateTime.Parse(m.Groups[2].Value).ToString("yyyy-MM-dd");
        rec["d8d7f1e5-df65-4873-a5a4-2632e381381d"] = hash;
        result.Add(rec);
    }

    File.WriteAllText(@"d:\11.json", result.ToString());
}
/// <summary>
/// For each listing page in a fixed URL list, downloads every linked <c>/Files</c> document
/// (links whose caption contains "ЗПКОНПИ" are skipped), stores it as a blob and writes the
/// resulting records to d:\2-{index}.json, one file per listing page.
/// </summary>
public void Download()
{
    List<string> lUrls = new List<string>() { "http://mjs.bg/2170/", "http://mjs.bg/2171/" };

    // Iterate over ALL pages; the previous bound (Count - 1) silently skipped the last URL.
    for (int pn = 0; pn < lUrls.Count; pn++)
    {
        JArray result = new JArray();
        var page = wc.DownloadString(lUrls[pn]);
        var mcFiles = Regex.Matches(page, "<a\\s*href=\"([^\"]+?/Files[^\"]+)\">([^<]+)<");
        foreach (Match m in mcFiles)
        {
            if (m.Groups[2].Value.Contains("ЗПКОНПИ"))
            {
                continue;
            }

            try
            {
                // Normalise relative hrefs to an absolute mjs.bg address.
                var url = m.Groups[1].Value;
                url = url.Replace("../", "");
                if (!url.Contains("http://mjs.bg/"))
                {
                    url = "http://mjs.bg/" + url;
                }

                byte[] file = wc.DownloadData(url);
                string hash;
                using (var md5 = MD5.Create())
                {
                    hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                }

                Blob b = new Blob()
                {
                    Content = file,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(m.Groups[1].Value),
                    Filename = Path.GetFileName(m.Groups[1].Value),
                    Hash = hash
                };
                db.AddBlob(b);

                JObject rec = new JObject();
                rec["id"] = Guid.NewGuid().ToString();
                rec["49a60721-065c-4686-8284-e315ff99443c"] = JObject.FromObject(new { bg = m.Groups[2].Value });
                rec["28ba0b81-430c-4f68-bfed-b0d4c4ce9b85"] = hash;
                result.Add(rec);
            }
            catch (Exception e)
            {
                // One failed document must not abort the rest of the page.
                Console.WriteLine(e.Message);
            }
        }

        File.WriteAllText(@"d:\2-" + pn.ToString() + ".json", result.ToString());
    }
}
/// <summary>
/// Asks for a URL on the console, extracts the page content between its heading and the
/// footer marker, stores every linked file as a blob, rewrites those anchors to point at
/// <c>/api/part/getblob</c>, strips div/span tags and writes heading plus content to d:\html.txt.
/// </summary>
public void Download()
{
    Console.Write("URL:");
    string url = Console.ReadLine();

    var source = wc.DownloadString(url);
    var mPage = Regex.Match(source, "<h2>([\\w\\W]+?)</h2>([\\w\\W]+?)<!-- FOOTER -->");
    var content = mPage.Groups[2].Value;

    // Each entry pairs the original anchor tag with the hash of the blob that replaces it.
    var replacements = new List<Tuple<string, string>>();
    foreach (Match anchor in Regex.Matches(content, "<a [\\w\\W]*?href=\"([\\w\\W]+?\\.[a-z]{3,4})\"[\\w\\W]*?>"))
    {
        string fUrl = anchor.Groups[1].Value.Replace("../", "");
        bool isAbsolute = fUrl.Contains("http");
        if (!fUrl.StartsWith("/") && !isAbsolute)
        {
            fUrl = "/" + fUrl;
        }

        byte[] bytes;
        try
        {
            string target = isAbsolute ? fUrl : "http://www.gdo.bg" + fUrl;
            bytes = wc.DownloadData(target);
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
            continue;
        }

        string hash;
        using (var md5 = MD5.Create())
        {
            var digest = md5.ComputeHash(bytes);
            hash = string.Join("", digest.Select(x => x.ToString("X2")));
        }

        db.AddBlob(new Blob
        {
            Content = bytes,
            ContentType = "application/octet-stream",
            Extension = Path.GetExtension(fUrl),
            Filename = Path.GetFileName(fUrl),
            Hash = hash
        });
        replacements.Add(new Tuple<string, string>(anchor.Value, hash));
    }

    foreach (var pair in replacements)
    {
        content = content.Replace(pair.Item1, $"<a href='/api/part/getblob?hash={pair.Item2}'>");
    }

    content = Regex.Replace(content, "<[/]*div[\\w\\W]*?>", "");
    content = Regex.Replace(content, "<[/]*span[\\w\\W]*?>", "");
    content = mPage.Groups[1].Value + "\r\n" + content;

    File.WriteAllText(@"d:\html.txt", content);
}
// Asks for a URL on the console, takes the first "col-lg-9" div after the
// "BEGIN PAGE CONTENT INNER" marker, downloads every <img> source (relative ones are
// resolved against https://www.gdin.bg), stores each image via db.AddBlob and then
// replaces the original <img> tags in the extracted markup.
// NOTE(review): the end of this method is damaged - the text after "src='https://" looks
// like it was overwritten by a credential scrubber ("*****:*****@"), which swallowed the
// rest of the replacement string and everything up to the final File.WriteAllText call.
// The method does not compile as shown; restore the tail from version control.
public void Download() { Console.Write("URL:"); string url = Console.ReadLine(); var page = wc.DownloadString(url); page = Regex.Match(page, "<!-- BEGIN PAGE CONTENT INNER -->[\\w\\W]*?<div class=\"col-lg-9\">([\\w\\W]*?)</div>").Groups[1].Value; var mcFiles = Regex.Matches(page, "<img [\\w\\W]*?src=\"([^\"]+)\"[^>]*?>"); List <Tuple <string, string> > lLinks = new List <Tuple <string, string> >(); foreach (Match f in mcFiles) { string fUrl = f.Groups[1].Value; fUrl = fUrl.Replace("../", ""); if (!fUrl.StartsWith("/") && !fUrl.Contains("http")) { fUrl = "/" + fUrl; } byte[] file; try { file = fUrl.Contains("http") ? wc.DownloadData(fUrl) : wc.DownloadData("https://www.gdin.bg" + fUrl); } catch (Exception e) { Console.WriteLine(e.Message); continue; } string hash; using (var md5 = MD5.Create()) { hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2"))); } Blob b = new Blob() { Content = file, ContentType = "application/octet-stream", Extension = Path.GetExtension(fUrl), Filename = Path.GetFileName(fUrl), Hash = hash }; db.AddBlob(b); lLinks.Add(new Tuple <string, string>(f.Value, hash)); } foreach (var l in lLinks) { page = page.Replace(l.Item1, $"<img alt=\"\" src='https://*****:*****@"d:\html.txt", page); }
/// <summary>
/// Reads the index at http://172.16.0.44/47/, opens every linked detail page, stores the
/// document behind its "Изтегли" link as a blob and writes one JSON record per page
/// (title, decision date, complaint number, blob hash, "Отвори" link, first member) to d:\12.json.
/// </summary>
public void Download()
{
    var records = new JArray();
    var index = wc.DownloadString("http://172.16.0.44/47/");
    var counter = 0;

    foreach (Match entry in Regex.Matches(index, "<a href=\"(/47/[0-9]+/)\""))
    {
        var detail = wc.DownloadString("http://172.16.0.44/" + entry.Groups[1].Value);

        var mTitle = Regex.Match(detail, "<div class=\"ProfileActHead\">([\\w\\W]+?)</div>");
        // Groups: 1 = decision date, 2 = complaint number, 3 = first list item under members.
        var mOthers = Regex.Match(detail, "<td>Дата на решение:[\\w\\W]*?<td>\\s*([\\w\\W]*?)</td>[\\w\\W]*?<td>№ на жалба:[\\w\\W]*?<td>([\\w\\W]*?)</td>[\\w\\W]*?<td>Членове:[\\w\\W]*?<li>\\s*([\\w\\W]*?)\\s*</li>");
        var mDoc = Regex.Match(detail, "<a href=\"([^\"]+?)\">Изтегли");
        var mLink = Regex.Match(detail, "<a href=\"([^\"]+?)\">Отвори");

        var docHref = mDoc.Groups[1].Value;
        byte[] bytes;
        try
        {
            bytes = wc.DownloadData("http://172.16.0.44/" + docHref);
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
            continue;
        }

        string hash;
        using (var md5 = MD5.Create())
        {
            var digest = md5.ComputeHash(bytes);
            hash = string.Join("", digest.Select(x => x.ToString("X2")));
        }

        db.AddBlob(new Blob
        {
            Content = bytes,
            ContentType = "application/octet-stream",
            Extension = Path.GetExtension(docHref),
            Filename = Path.GetFileName(docHref),
            Hash = hash
        });

        // Progress indicator: number of documents stored so far.
        counter++;
        Console.WriteLine(counter);

        var rec = new JObject();
        rec["id"] = Guid.NewGuid().ToString();
        rec["8832f117-e52f-4ad8-8e16-0d50742488fa"] = JObject.FromObject(new { bg = mTitle.Groups[1].Value });
        rec["988be796-1227-4ad4-95d8-ff0bf356c79c"] = DateTime.Parse(mOthers.Groups[1].Value).ToString("yyyy-MM-dd");
        rec["b9b900f5-2a6d-4a7a-8dd2-dbc4c39c37c2"] = mOthers.Groups[2].Value;
        rec["0ec35482-0a53-455f-b236-a17967a41bd8"] = hash;
        rec["8460beb6-a976-4056-8415-9c44673d8322"] = mLink.Groups[1].Value;
        rec["7c3ff3bd-073d-43b2-b472-56efb5fcb0ab"] = mOthers.Groups[3].Value;
        records.Add(rec);
    }

    File.WriteAllText(@"d:\12.json", records.ToString());
}
/// <summary>
/// For every entry of <c>lUrls</c>, pages through a gdin.bg list of "date - name" links,
/// stores each linked file as a blob and writes the collected records to d:\18-{Item3}.json.
/// Item1 is the list URL, Item2 the name of the paging query parameter (paging stops after
/// the first page when it is null) and Item3 the suffix of the output file.
/// </summary>
public void Download()
{
    foreach (var url in lUrls)
    {
        JArray result = new JArray();
        var i = 1;
        var found = false;
        do
        {
            // The first page is the bare URL; later pages append "?{param}={pageNumber}".
            var pageUrl = url.Item1;
            if (i > 1)
            {
                pageUrl += "?" + url.Item2 + "=" + i.ToString();
            }
            i++;

            var page = wc.DownloadString(pageUrl);
            var mcFiles = Regex.Matches(page, "<a href=\"([^\"]+)\"[^>]*>([0-9\\.]{10}) - ([^<]+?)<");
            found = mcFiles.Count > 0 && url.Item2 != null;

            foreach (Match m in mcFiles)
            {
                try
                {
                    byte[] file = wc.DownloadData("http://www.gdin.bg" + m.Groups[1].Value);
                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(m.Groups[1].Value),
                        Filename = Path.GetFileName(m.Groups[1].Value),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    JObject rec = new JObject();
                    rec["id"] = Guid.NewGuid().ToString();
                    rec["ead9f5ac-a318-4b3a-a78a-a8de754052ba"] = DateTime.Parse(m.Groups[2].Value).ToString("yyyy-MM-dd");
                    rec["b771ae8f-2587-46ee-8d43-2e819f43058a"] = m.Groups[3].Value;
                    rec["2960039f-e2fb-4e2a-83fd-cc3eef410339"] = hash;
                    result.Add(rec);
                }
                catch (Exception e)
                {
                    // Skip the entry but leave a trace; previously failures vanished silently.
                    Console.WriteLine(e.Message);
                }
            }
        } while (found);

        File.WriteAllText($"d:\\18-{url.Item3}.json", result.ToString());
    }
}
/// <summary>
/// Converts the left column of http://mjs.bg/25/ into self-contained HTML: every linked
/// sub-page is downloaded, reduced to plain text, stored as a UTF-8 text blob, and the
/// original anchor is rewritten to point at that blob. The result is written to d:\html.txt.
/// </summary>
public void Download()
{
    string url = "http://mjs.bg/25/";
    var column = wc.DownloadString(url);
    column = Regex.Match(column, "<div class=\"Panel1a_column1\">([\\w\\W]+?)<div class=\"Panel1a_column2\">").Groups[1].Value;

    var anchors = Regex.Matches(column, "<a href=\"(/[0-9]+/[0-9]+/)\">([\\w\\W]*?)<");
    // (original anchor text, blob hash, trimmed caption)
    var rewrites = new List<Tuple<string, string, string>>();
    int done = 0;

    foreach (Match anchor in anchors)
    {
        done++;
        Console.WriteLine($"{done} of {anchors.Count}");

        string fUrl = "http://mjs.bg" + anchor.Groups[1].Value;
        string proto = wc.DownloadString(fUrl);
        proto = Regex.Match(proto, "<div class=\"Title\">([\\w\\W]+?)<div class=\"clear\"></div>").Groups[1].Value;

        // Flatten the markup: divs and <br /> become line breaks, spans and images vanish.
        proto = Regex.Replace(proto, "<[/]*div[\\w\\W]*?>", "\r\n");
        proto = Regex.Replace(proto, "<[/]*span[\\w\\W]*?>", "");
        proto = Regex.Replace(proto, "<[/]*img[\\w\\W]*?>", "");
        proto = proto.Replace("<br />", "\r\n");

        byte[] encoded = System.Text.Encoding.UTF8.GetBytes(proto);
        string hash;
        using (var md5 = MD5.Create())
        {
            hash = string.Join("", md5.ComputeHash(encoded).Select(x => x.ToString("X2")));
        }

        db.AddBlob(new Blob
        {
            Content = encoded,
            ContentType = "text/plain; charset=UTF-8",
            Extension = ".txt",
            Filename = Guid.NewGuid().ToString() + ".txt",
            Hash = hash
        });
        rewrites.Add(new Tuple<string, string, string>(anchor.Value, hash, anchor.Groups[2].Value.Trim()));
    }

    foreach (var r in rewrites)
    {
        column = column.Replace(r.Item1, $"<a href='/api/part/getblob?hash={r.Item2}'>{r.Item3}<");
    }

    foreach (var tag in new[] { "<[/]*div[\\w\\W]*?>", "<[/]*span[\\w\\W]*?>", "<[/]*img[\\w\\W]*?>" })
    {
        column = Regex.Replace(column, tag, "");
    }

    File.WriteAllText(@"d:\html.txt", column);
}
/// <summary>
/// Scrapes the gdin.bg declarations list ("date - text - text" links), stores every linked
/// file as a blob and writes one JSON record per link to d:\17.json.
/// </summary>
public void Download()
{
    JArray result = new JArray();
    var page = wc.DownloadString("https://www.gdin.bg/deklaratsii-po-zpkonpi/deklaratsii-po-chl-35");

    // Groups: 1 = href, 2 = ten-character date, 3 and 4 = the two " - "-separated caption parts.
    var mcFiles = Regex.Matches(page, "<a href=\"([^\"]+)\"[^>]*>([0-9\\.]{10}) - ([\\w\\W]+?) - ([^<]+?)<");
    foreach (Match m in mcFiles)
    {
        try
        {
            byte[] file = wc.DownloadData("http://www.gdin.bg" + m.Groups[1].Value);
            string hash;
            using (var md5 = MD5.Create())
            {
                hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
            }

            Blob b = new Blob()
            {
                Content = file,
                ContentType = "application/octet-stream",
                Extension = Path.GetExtension(m.Groups[1].Value),
                Filename = Path.GetFileName(m.Groups[1].Value),
                Hash = hash
            };
            db.AddBlob(b);

            JObject rec = new JObject();
            rec["id"] = Guid.NewGuid().ToString();
            rec["8a6c22e6-d495-4404-9fcd-8e3df3c3fc2d"] = DateTime.Parse(m.Groups[2].Value).ToString("yyyy-MM-dd");
            rec["34a42d4c-5700-415c-a271-a5a211c52eff"] = m.Groups[3].Value;
            rec["72c76bc3-fff2-4632-8812-7142aff81aec"] = m.Groups[4].Value;
            rec["ca4e44ea-1c0c-4fb8-a1b7-bfe4e6a13437"] = hash;
            result.Add(rec);
        }
        catch (Exception e)
        {
            // Skip the entry but leave a trace; previously failures vanished silently.
            Console.WriteLine(e.Message);
        }
    }

    File.WriteAllText(@"d:\17.json", result.ToString());
}
/// <summary>
/// Builds an HTML table from the entries of http://www.justice.government.bg/15/: one row per
/// entry with its title, date and a download link for every file referenced by the entry.
/// Files are stored as blobs; the table is written to d:\project.txt.
/// </summary>
public void Download()
{
    var page = wc.DownloadString("http://www.justice.government.bg/15/");
    StringBuilder sb = new StringBuilder();
    sb.AppendLine("<figure class=\"table\"><table><tbody>");

    // Groups: 1 = title, 2 = date, 3 = remaining markup of the entry (holds the file links).
    foreach (Match m in Regex.Matches(page, "<div class=\"Title\">([^<]*?)</div><div class=\"Date\">([^<]*?)</div>([\\w\\W]*?)</div></div>"))
    {
        sb.AppendLine($"<tr><td><h2>{m.Groups[1].Value}</h2>");
        sb.AppendLine($"<p>{m.Groups[2].Value}</p>");

        foreach (Match f in Regex.Matches(m.Groups[3].Value, "href=\"([^\"]+?)\""))
        {
            try
            {
                byte[] file = wc.DownloadData("http://www.justice.government.bg" + f.Groups[1].Value);
                string hash;
                using (var md5 = MD5.Create())
                {
                    hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                }

                Blob b = new Blob()
                {
                    Content = file,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(f.Groups[1].Value),
                    Filename = Path.GetFileName(f.Groups[1].Value),
                    Hash = hash
                };
                db.AddBlob(b);
                sb.AppendLine($"<p><a href=\"/api/part/GetBlob?hash={hash}\">Изтегли</a></p>");
            }
            catch (Exception e)
            {
                // The link is omitted from the row, but the failure is now visible.
                Console.WriteLine(e.Message);
            }
        }

        sb.AppendLine("</td></tr>");
    }

    sb.AppendLine("</tbody></table></figure>");
    File.WriteAllText(@"d:\project.txt", sb.ToString());
}
/// <summary>
/// Turns the document containers of http://mjs.bg/132/ into an HTML fragment: a paragraph with
/// the container title followed by a list of download links. Every file behind an "Изтегли"
/// link is stored as a blob. The fragment is written to d:\nfm.txt.
/// </summary>
public void Download()
{
    var listing = wc.DownloadString("http://mjs.bg/132/");
    var containers = Regex.Matches(listing, "<div class=\"DocumentContainer\">([\\w\\W]+?)</a>\\s+</div>\\s+</div>");
    var html = new StringBuilder();

    for (var n = 0; n < containers.Count; n++)
    {
        var inner = containers[n].Groups[1].Value;
        var mTitle = Regex.Match(inner, "<div class=\"Title\\\">([\\w\\W]+?)</div>");
        var links = Regex.Matches(inner, "<a href=\\\"([^\\\"]+)\\\">Изтегли");

        html.AppendLine($"<p>{mTitle.Groups[1].Value}</p>");
        html.AppendLine("<ul>");

        foreach (Match link in links)
        {
            var href = link.Groups[1].Value;
            byte[] bytes = wc.DownloadData("http://www.justice.government.bg" + href);

            string hash;
            using (var md5 = MD5.Create())
            {
                var digest = md5.ComputeHash(bytes);
                hash = string.Join("", digest.Select(x => x.ToString("X2")));
            }

            db.AddBlob(new Blob
            {
                Content = bytes,
                ContentType = "application/octet-stream",
                Extension = Path.GetExtension(href),
                Filename = Path.GetFileName(href),
                Hash = hash
            });
            html.AppendLine($"<li><a href=\"/api/part/GetBlob?hash={hash}\"><t>download</t></a></li>");
        }

        html.AppendLine("</ul>");
        // Progress: containers processed / total.
        Console.WriteLine($"{n + 1}/{containers.Count}");
    }

    File.WriteAllText(@"d:\nfm.txt", html.ToString());
}
/// <summary>
/// Imports procurement messages from profile.mjs.bg. For every entry of <c>lUrls</c>
/// (Item1 = list URL, Item2 = default message type) the list is paged through until a page
/// yields no links; each linked case page is parsed (title, creation date, body, files),
/// its files are stored as blobs and the case is saved as a "pkmessage" block via <c>db.SetBlock</c>.
/// </summary>
public void Download()
{
    const string site = "https://profile.mjs.bg";
    const string mergedType = "Събиране на оферти с обява или покана до определени лица";

    foreach (var url in lUrls)
    {
        var pageNo = 0;
        bool found;
        do
        {
            string listing = this.Download10Times(url.Item1 + "&page=" + pageNo.ToString());
            pageNo++;

            var cases = Regex.Matches(listing, "<h6>\\s*<a href=\"(/[^\"]{32})\"[\\w\\W]+?</h6>\\s*<p>([^<]+?)</p>");
            found = cases.Count > 0;

            foreach (Match lnk in cases)
            {
                string caseUrl = site + lnk.Groups[1].Value;
                string pageOP = this.Download10Times(caseUrl);

                var mTitle = Regex.Match(pageOP, "<h3><i[\\w\\W]+?/i>([\\w\\W]+?)</h3>");
                var mDate = Regex.Match(pageOP, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                var mText = Regex.Match(pageOP, "<div class=\"page-header\">[\\w\\W]+?</div>[\\w\\W]*?>([\\w\\W]+?)<hr />");

                // Closing divs become line breaks; remaining div and i tags are stripped.
                string text = mText.Groups[1].Value.Replace("</div>", "<br />");
                text = Regex.Replace(text, "<[/]*div[\\w\\W]*?>", "");
                text = Regex.Replace(text, "<[/]*i[\\w\\W]*?>", "");

                // Two list captions map onto one combined type; anything else keeps the default.
                string caption = lnk.Groups[2].Value;
                var pkType = url.Item2;
                if (caption == "Покана до определени лица" || caption == "Събиране на оферти с обява")
                {
                    pkType = mergedType;
                }

                string caseTitle = mTitle.Groups[1].Value;
                var op = JObject.FromObject(new
                {
                    title = JObject.FromObject(new { bg = caseTitle }),
                    type = pkType,
                    body = JObject.FromObject(new { bg = text }),
                });

                var jaFiles = new JArray();
                foreach (Match f in Regex.Matches(pageOP, "<a href=\"(/file[^\"]+?)\"[^>]+?>([\\w\\W]+?)</a>\\s*<small>([^<]+?)</small>"))
                {
                    byte[] bytes;
                    try
                    {
                        bytes = this.DownloadFile(site + f.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(caseUrl + " " + e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        var digest = md5.ComputeHash(bytes);
                        hash = string.Join("", digest.Select(x => x.ToString("X2")));
                    }

                    // presumably DownloadFilename is set by the DownloadFile call above - TODO confirm
                    var fileName = DownloadFilename;
                    db.AddBlob(new Blob
                    {
                        Content = bytes,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    });

                    jaFiles.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                        fileType = "",
                        date = f.Groups[3].Value,
                        file = hash
                    }));
                }
                op["files"] = jaFiles;

                db.SetBlock(new BlockData
                {
                    Block = new JSBlock
                    {
                        BlockId = 0,
                        RubricId = 5,
                        BlockTypeId = "pkmessage",
                        // Block names are capped at 199 characters.
                        Name = caseTitle.Length > 199 ? caseTitle.Substring(0, 199) : caseTitle,
                        PortalPartId = "min",
                        Url = Guid.NewGuid().ToString(),
                        Jsonvalues = op.ToString()
                    },
                    Values = new PropertyValue[]
                    {
                        new PropertyValue { PropertyId = "header", Value = "6" },
                        new PropertyValue { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                    }
                });
            }
        } while (found);
    }
}
/// <summary>
/// Builds the "Финанси" (budget) content. First, for every month from January 2013 up to
/// December of the current year, the documents listed at http://mjs.bg/149/{year}/{month} are
/// stored as blobs and collected into a "docs" array. Then the document containers of
/// http://mjs.bg/154/ are rendered as HTML lists of download links. Both parts are written
/// as one JSON object to d:\budget.json.
/// </summary>
public void Download()
{
    // Each entry: { id, title: { bg }, docId: blob hash, date: yyyy-MM-dd }.
    var docs = new JArray();

    for (int year = 2013; year <= DateTime.Now.Year; year++)
    {
        for (int month = 1; month <= 12; month++)
        {
            var monthly = wc.DownloadString($"http://mjs.bg/149/{year}/{month}");
            var titles = Regex.Matches(monthly, "<div class=\\\"Title\\\">([\\w\\W]+?)</div>");
            var dates = Regex.Matches(monthly, "<div class=\\\"Date\\\">([0-9\\.]+)</div>");
            var links = Regex.Matches(monthly, "<a href=\\\"([^\\\"]+)\\\">Изтегли</a>");

            // Titles, dates and links are matched independently and paired by position;
            // a mismatch in counts surfaces as an exception that is logged below.
            for (int k = 0; k < dates.Count; k++)
            {
                try
                {
                    var href = links[k].Groups[1].Value;
                    byte[] bytes = wc.DownloadData("http://www.justice.government.bg" + href);

                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        var digest = md5.ComputeHash(bytes);
                        hash = string.Join("", digest.Select(x => x.ToString("X2")));
                    }

                    db.AddBlob(new Blob
                    {
                        Content = bytes,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(href),
                        Filename = Path.GetFileName(href),
                        Hash = hash
                    });

                    docs.Add(JObject.FromObject(new
                    {
                        id = Guid.NewGuid().ToString(),
                        title = JObject.FromObject(new { bg = titles[k].Groups[1].Value }),
                        docId = hash,
                        date = DateTime.Parse(dates[k].Groups[1].Value).ToString("yyyy-MM-dd")
                    }));
                    Console.WriteLine($"date:{dates[k].Groups[1].Value} file:{bytes.Length}");
                }
                catch (Exception e)
                {
                    Console.WriteLine($"error:{e.Message}");
                }
            }
        }
    }

    var overview = wc.DownloadString("http://mjs.bg/154/");
    var containers = Regex.Matches(overview, "<div class=\"DocumentContainer\">([\\w\\W]+?)</a>\\s+</div>\\s+</div>");
    var html = new StringBuilder();

    for (var n = 0; n < containers.Count; n++)
    {
        var inner = containers[n].Groups[1].Value;
        var mTitle = Regex.Match(inner, "<div class=\"Title\\\">([\\w\\W]+?)</div>");
        var files = Regex.Matches(inner, "<a href=\\\"([^\\\"]+)\\\">Изтегли");

        html.AppendLine($"<p>{mTitle.Groups[1].Value}</p>");
        html.AppendLine("<ul>");

        foreach (Match f in files)
        {
            var href = f.Groups[1].Value;
            byte[] bytes = wc.DownloadData("http://www.justice.government.bg" + href);

            string hash;
            using (var md5 = MD5.Create())
            {
                var digest = md5.ComputeHash(bytes);
                hash = string.Join("", digest.Select(x => x.ToString("X2")));
            }

            db.AddBlob(new Blob
            {
                Content = bytes,
                ContentType = "application/octet-stream",
                Extension = Path.GetExtension(href),
                Filename = Path.GetFileName(href),
                Hash = hash
            });
            html.AppendLine($"<li><a href=\"/api/part/GetBlob?hash={hash}\"><t>download</t></a></li>");
        }

        html.AppendLine("</ul>");
        Console.WriteLine($"{n + 1}/{containers.Count}");
    }

    JObject result = JObject.FromObject(new
    {
        title = JObject.FromObject(new { bg = "Финанси" }),
        text = JObject.FromObject(new { bg = html.ToString() }),
        docs = docs
    });
    File.WriteAllText(@"d:\budget.json", result.ToString());
}
/// <summary>
/// Imports procurement messages from www.gdo.bg. For every entry of <c>urls</c>
/// (Item1 = list URL, Item2 = message type) the list is paged through from page 1 until a page
/// yields no items; each item's detail page is parsed (body text, uploaded files), the files
/// are stored as blobs and the item is saved as a "pkmessage" block via <c>db.SetBlock</c>.
/// </summary>
public void Download()
{
    foreach (var url in urls)
    {
        var page = 1;
        var found = false;
        do
        {
            string pageLinks = this.Download10Times(url.Item1 + "?page=" + page.ToString());
            page++;

            // Groups: 1 = detail href, 2 = title, 3 = publication date text.
            var mcLinks = Regex.Matches(pageLinks, "<div class=\"item-title\">\\s*<a href=\"([^\"]+?)\">([^<]+?)</a>\\s+</div>\\s+<div class=\"item-published\">([^<]+?)<");
            found = mcLinks.Count > 0;

            foreach (Match lnk in mcLinks)
            {
                var title = lnk.Groups[2].Value;
                var date = lnk.Groups[3].Value.Trim();
                string pageOP = this.Download10Times("http://www.gdo.bg" + lnk.Groups[1].Value);

                // Body: label divs become line breaks, remaining div and i tags are stripped.
                string text = Regex.Match(pageOP, "</h2>([\\w\\W]+?)<div class=\"item-docs\">").Groups[1].Value;
                text = text.Replace("<div class=\"display-label\">", "<br />");
                text = Regex.Replace(text, "<[/]*div[\\w\\W]*?>", "");
                text = Regex.Replace(text, "<[/]*i[\\w\\W]*?>", "");

                var op = JObject.FromObject(
                    new
                    {
                        title = JObject.FromObject(new { bg = lnk.Groups[2].Value }),
                        type = url.Item2,
                        body = JObject.FromObject(new { bg = text }),
                    }
                );

                var mcFiles = Regex.Matches(pageOP, "<a href=\"(/Uploads[^\"]+?)\">([\\w\\W]+?)</a>[\\w\\W]+?<td class=\"publ\">([0-9\\.]+?)</td>");
                var jaFiles = new JArray();
                foreach (Match f in mcFiles)
                {
                    byte[] file;
                    try
                    {
                        file = wc.DownloadData("http://www.gdo.bg" + f.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    // Prefer the server-supplied file name; fall back to a generated one.
                    string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                    const string marker = "filename=";
                    string fileName;
                    int index = header.LastIndexOf(marker, StringComparison.OrdinalIgnoreCase);
                    if (index > -1)
                    {
                        // The value may be quoted and may be followed by further parameters;
                        // keep only the bare name so quotes or ";..." do not end up in the
                        // stored file name and extension.
                        string value = header.Substring(index + marker.Length).Trim();
                        if (value.StartsWith("\"", StringComparison.Ordinal))
                        {
                            int close = value.IndexOf('"', 1);
                            value = close > 0 ? value.Substring(1, close - 1) : value.Substring(1);
                        }
                        else
                        {
                            int semi = value.IndexOf(';');
                            if (semi > -1)
                            {
                                value = value.Substring(0, semi).Trim();
                            }
                        }
                        fileName = value;
                    }
                    else
                    {
                        fileName = Guid.NewGuid().ToString() + Path.GetExtension(f.Groups[1].Value);
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    jaFiles.Add(
                        JObject.FromObject(
                            new
                            {
                                id = Guid.NewGuid().ToString(),
                                title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                                fileType = "",
                                date = DateTime.Parse(f.Groups[3].Value).ToString("yyyy-MM-dd"),
                                file = hash
                            }
                        )
                    );
                }
                op["files"] = jaFiles;

                db.SetBlock(new BlockData()
                {
                    Block = new JSBlock()
                    {
                        BlockId = 0,
                        RubricId = 4,
                        BlockTypeId = "pkmessage",
                        // Block names are capped at 199 characters.
                        Name = title.Length > 199 ? title.Substring(0, 199) : title,
                        PortalPartId = "gdo",
                        Url = Guid.NewGuid().ToString(),
                        Jsonvalues = op.ToString()
                    },
                    Values = new PropertyValue[]
                    {
                        new PropertyValue() { PropertyId = "header", Value = "12" },
                        new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(date).ToString("yyyy-MM-dd") }
                    }
                });
            }
        } while (found);
    }
}
/// <summary>
/// Imports public-procurement procedures from profile.mjs.bg. Every entry of <c>this.urls</c>
/// is a list URL to which the page number (starting at 0) is appended; paging stops at the
/// first page without links. Each procedure page is parsed (title, dates, type, AOP link,
/// status, subject, files), its files are stored as blobs and the procedure is saved as a
/// "pkop" block via <c>db.SetBlock</c>.
/// </summary>
public void Download()
{
    foreach (var url in this.urls)
    {
        var found = false;
        var downloaded = 0;
        int pn = 0;
        do
        {
            var page = wc.DownloadString(url + pn);
            pn++;
            var mcLinks = Regex.Matches(page, "<a href=\"(/[a-z0-9]{32})\">");
            found = mcLinks.Count > 0;

            foreach (Match ml in mcLinks)
            {
                page = wc.DownloadString("http://profile.mjs.bg" + ml.Groups[1].Value);
                var mTitle = Regex.Match(page, "<h3><i[\\w\\W]+?/i>([\\w\\W]+?)</h3>");
                var mDate = Regex.Match(page, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                var mEndDate = Regex.Match(page, "Краен срок за подаване на оферти или заявления за участие: ([\\w\\W]+?)</div>");
                var mProcType = Regex.Match(page, "Процедура: ([\\w\\W]+?)</div>");
                var mAOP = Regex.Match(page, "<a href=\"(http://[^\\.]+?\\.aop.bg[\\w\\W]+?)\"");
                var mStatus = Regex.Match(page, "Статус: ([\\w\\W]+?)</div>");
                // The subject is the content of the clearfix div framed by two <hr /> rows.
                // It needs its own capture group: without one, Groups[1] is always empty.
                var mText = Regex.Match(page, "<div class=\"clearfix\"><hr /></div>\\s*<div class=\"clearfix\">([\\w\\W]+?)</div>\\s*<div class=\"clearfix\"><hr /></div>");

                var op = JObject.FromObject(
                    new
                    {
                        title = JObject.FromObject(new { bg = mTitle.Groups[1].Value }),
                        // "HH" = 24-hour clock; "hh" would fold 17:00 into 05:00 with no AM/PM marker.
                        enddate = DateTime.Parse(mEndDate.Groups[1].Value).ToString("yyyy-MM-dd HH:mm"),
                        proctype = JObject.FromObject(new { bg = mProcType.Groups[1].Value }),
                        AOPNum = mAOP.Groups[1].Value,
                        procstatus = JObject.FromObject(new { bg = mStatus.Groups[1].Value }),
                        Subject = JObject.FromObject(new { bg = mText.Groups[1].Value })
                    }
                );

                var mcFiles = Regex.Matches(page, "<li class=\"media well clearfix\"[\\w\\W]+?>\\s*<a class=\"pull-left\" href=\"(/file[\\w\\W]+?)\"[\\w\\W]+?</h4>\\s*([\\w\\W]+?)</div>");
                Console.WriteLine($"download: {++downloaded} files:{mcFiles.Count}");

                var jaFiles = new JArray();
                foreach (Match f in mcFiles)
                {
                    byte[] file;
                    try
                    {
                        file = wc.DownloadData("http://profile.mjs.bg" + f.Groups[1].Value);
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine(e.Message);
                        continue;
                    }

                    // Pause between file downloads.
                    System.Threading.Thread.Sleep(1000);

                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    // Prefer the server-supplied file name; fall back to a generated .pdf name.
                    string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                    const string marker = "filename=";
                    string fileName;
                    int index = header.LastIndexOf(marker, StringComparison.OrdinalIgnoreCase);
                    if (index > -1)
                    {
                        // The value may be quoted and may be followed by further parameters;
                        // keep only the bare name so quotes or ";..." do not end up in the
                        // stored file name and extension.
                        string value = header.Substring(index + marker.Length).Trim();
                        if (value.StartsWith("\"", StringComparison.Ordinal))
                        {
                            int close = value.IndexOf('"', 1);
                            value = close > 0 ? value.Substring(1, close - 1) : value.Substring(1);
                        }
                        else
                        {
                            int semi = value.IndexOf(';');
                            if (semi > -1)
                            {
                                value = value.Substring(0, semi).Trim();
                            }
                        }
                        fileName = value;
                    }
                    else
                    {
                        fileName = Guid.NewGuid().ToString() + ".pdf";
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(fileName),
                        Filename = Path.GetFileName(fileName),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    jaFiles.Add(
                        JObject.FromObject(
                            new
                            {
                                id = Guid.NewGuid().ToString(),
                                title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                                fileType = "",
                                file = hash
                            }
                        )
                    );
                }
                op["files"] = jaFiles;

                db.SetBlock(new BlockData()
                {
                    Block = new JSBlock()
                    {
                        BlockId = 0,
                        BlockTypeId = "pkop",
                        // Block names are capped at 199 characters.
                        Name = mTitle.Groups[1].Value.Length > 199 ? mTitle.Groups[1].Value.Substring(0, 199) : mTitle.Groups[1].Value,
                        PortalPartId = "min",
                        Url = Guid.NewGuid().ToString(),
                        Jsonvalues = op.ToString()
                    },
                    Values = new PropertyValue[]
                    {
                        new PropertyValue() { PropertyId = "header", Value = "6" },
                        new PropertyValue() { PropertyId = "date", Value = DateTime.Parse(mDate.Groups[1].Value).ToString("yyyy-MM-dd") }
                    }
                });
            }
        } while (found);
    }
}
/// <summary>
/// Imports the news archive of www.nbpp.government.bg. The list is read 20 items at a time
/// (<c>?start=0, 20, 40, ...</c>) until a page yields no news links. For each article the
/// paragraphs of the body are kept, linked files are stored as blobs and their anchors are
/// rewritten to <c>/api/part/getblob</c>, and the article is saved as a "new" block via
/// <c>db.SetBlock</c>.
/// </summary>
public void Download()
{
    var start = 0;
    bool found;
    do
    {
        var listing = wc.DownloadString($"http://www.nbpp.government.bg/%D0%BD%D0%BE%D0%B2%D0%B8%D0%BD%D0%B8?start={start}");
        start += 20;

        var mcNews = Regex.Matches(listing, "<h2>\\s+<a href=\"(/новини/[\\w\\W]+?)\"");
        found = mcNews.Count > 0;

        foreach (Match n in mcNews)
        {
            // The href contains Cyrillic; encode it but keep the path separators.
            var url = HttpUtility.UrlEncode(n.Groups[1].Value).Replace("%2f", "/");
            var np = wc.DownloadString("http://www.nbpp.government.bg" + url);

            // Groups: 1 = article markup after the heading, 2 = footer text passed to GetDate.
            Match mp = Regex.Match(np, "</h2>\\s*([\\w\\W]+?)<div class=\"articleInfoFooter\">([\\w\\W]+?)</div>");
            var mcPs = Regex.Matches(mp.Groups[1].Value, "<p>[\\w\\W]*?</p>");
            var title = Regex.Match(np, "<h2>\\s*<a [\\w\\W]+?>\\s*([\\w\\W]+?)</a>").Groups[1].Value;

            // Keep only the <p> elements of the article, one per line.
            var paragraphs = new StringBuilder();
            foreach (Match p in mcPs)
            {
                paragraphs.AppendLine(p.Value);
            }
            string body = paragraphs.ToString();

            var replacements = new List<Tuple<string, string>>();
            foreach (Match anchor in Regex.Matches(body, "<a [\\w\\W]*?href=\"([\\w\\W]+?\\.[a-z]{3,4})\"[\\w\\W]*?>"))
            {
                string fUrl = anchor.Groups[1].Value.Replace("../", "");
                bool isAbsolute = fUrl.Contains("http");
                if (!fUrl.StartsWith("/") && !isAbsolute)
                {
                    fUrl = "/" + fUrl;
                }

                byte[] bytes;
                try
                {
                    string target = isAbsolute ? fUrl : "http://www.nbpp.government.bg" + fUrl;
                    bytes = wc.DownloadData(target);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                    continue;
                }

                string hash;
                using (var md5 = MD5.Create())
                {
                    var digest = md5.ComputeHash(bytes);
                    hash = string.Join("", digest.Select(x => x.ToString("X2")));
                }

                db.AddBlob(new Blob
                {
                    Content = bytes,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(fUrl),
                    Filename = Path.GetFileName(fUrl),
                    Hash = hash
                });
                replacements.Add(new Tuple<string, string>(anchor.Value, hash));
            }

            foreach (var pair in replacements)
            {
                body = body.Replace(pair.Item1, $"<a href='/api/part/getblob?hash={pair.Item2}'>");
            }

            var bd = new BlockData
            {
                Block = new JSBlock
                {
                    BlockId = 0,
                    BlockTypeId = "new",
                    // Block names are capped at 199 characters.
                    Name = title.Length > 199 ? title.Substring(0, 199) : title,
                    PortalPartId = "nbpp",
                    RubricId = 6,
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = JObject.FromObject(new
                    {
                        title = JObject.FromObject(new { bg = title }),
                        body = JObject.FromObject(new { bg = body })
                    }).ToString()
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue { PropertyId = "header", Value = "10" },
                    new PropertyValue { PropertyId = "date", Value = GetDate(mp.Groups[2].Value) }
                }
            };
            db.SetBlock(bd);
        }
    } while (found);
}
/// <summary>
/// Prompts for a page URL on the console, extracts the article markup, re-hosts every linked
/// file as a blob and writes the cleaned HTML to d:\html.txt.
/// </summary>
/// <remarks>
/// Each linked file is fetched through <c>wc</c> and handed to <c>db.AddBlob</c>; the matched
/// anchor opening tag is then replaced by one that points at /api/part/getblob with the file's
/// MD5 hash. A file that fails to download is reported on the console and its link is left as is.
/// </remarks>
public void Download()
{
    // The original carries two disabled statements here that forced the console to code page 1251.
    Console.Write("URL:");
    string address = Console.ReadLine();
    string html = wc.DownloadString(address);

    // Primary article container; fall back to the light-box module layout when it is absent.
    Match article = Regex.Match(html, "<div class=\"item-page\">([\\w\\W]+?)<div class=\"articleInfoFooter\">");
    if (!article.Success)
    {
        article = Regex.Match(html, "<div class=\"moduletable_ct_lightBox\">([\\w\\W]+?)<!-- end items-leading -->");
    }
    html = article.Groups[1].Value;

    // Anchors whose href ends in a 3-4 letter lowercase extension are treated as file links.
    MatchCollection anchors = Regex.Matches(html, "<a [\\w\\W]*?href=\"([\\w\\W]+?\\.[a-z]{3,4})\"[\\w\\W]*?>");
    var rewrites = new List<Tuple<string, string>>();
    foreach (Match anchor in anchors)
    {
        string link = anchor.Groups[1].Value.Replace("../", "");
        bool looksAbsolute = link.Contains("http");
        if (!looksAbsolute && !link.StartsWith("/"))
        {
            link = "/" + link;
        }
        string target = looksAbsolute ? link : "http://www.nbpp.government.bg" + link;

        byte[] content;
        try
        {
            content = wc.DownloadData(target);
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
            continue;
        }

        // Upper-case hex MD5 of the payload; used as the blob key in the rewritten link.
        string digest;
        using (var md5 = MD5.Create())
        {
            digest = BitConverter.ToString(md5.ComputeHash(content)).Replace("-", "");
        }

        var blob = new Blob()
        {
            Content = content,
            ContentType = "application/octet-stream",
            Extension = Path.GetExtension(link),
            Filename = Path.GetFileName(link),
            Hash = digest
        };
        db.AddBlob(blob);

        rewrites.Add(Tuple.Create(anchor.Value, digest));
    }

    // Swap every successfully stored anchor for its blob-backed replacement.
    foreach (var rewrite in rewrites)
    {
        html = html.Replace(rewrite.Item1, "<a href='/api/part/getblob?hash=" + rewrite.Item2 + "'>");
    }

    // Strip opening and closing div, span and img tags from the remaining markup.
    foreach (string tagPattern in new[] { "<[/]*div[\\w\\W]*?>", "<[/]*span[\\w\\W]*?>", "<[/]*img[\\w\\W]*?>" })
    {
        html = Regex.Replace(html, tagPattern, "");
    }

    File.WriteAllText(@"d:\html.txt", html);
}
/// <summary>
/// Walks every listing page in <c>lUrls</c>, extracts each entry (title, date, text and
/// attached files) and stores it as a "pkmessage" block through <c>db.SetBlock</c>.
/// </summary>
/// <remarks>
/// <para>Attachments are downloaded from http://mjs.bg, hashed with MD5 and passed to
/// <c>db.AddBlob</c>. An attachment that fails to download is reported on the console and skipped.</para>
/// <para>The entry date is read as day.month.year independently of the machine culture; any
/// other shape falls back to the culture-sensitive <see cref="DateTime.Parse(string)"/>.</para>
/// </remarks>
/// <exception cref="FormatException">An entry's date cannot be parsed at all.</exception>
public void Download()
{
    foreach (var url in lUrls)
    {
        string page = this.Download10Times(url.Item1);
        var mcOps = Regex.Matches(page, "<div class=\"lTitle\">[^<]*?</div>\\s*<div class=\"lDate\">[0-9\\.]+</div>[\\w\\W]*?<div class=\"clear\"></div>");
        foreach (Match mOp in mcOps)
        {
            string pageOP = mOp.Value;
            var mTitle = Regex.Match(pageOP, "<div class=\"lTitle\">([^<]*?)<");
            var mDate = Regex.Match(pageOP, "<div class=\"lDate\">([^<]*?)<");
            var mText = Regex.Match(pageOP, "<div class=\"lText\">([\\w\\W]*?)</div>");
            string title = mTitle.Groups[1].Value;

            // Parse the date before any attachment is fetched so a malformed entry fails
            // before blobs are stored. The explicit format removes the dependency on the
            // machine's current culture for day-first dates.
            string rawDate = mDate.Groups[1].Value;
            DateTime opDate;
            if (!DateTime.TryParseExact(rawDate, "d.M.yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out opDate))
            {
                opDate = DateTime.Parse(rawDate);
            }

            // The second tuple item of the listing is prepended to the entry text.
            string text = mText.Success ? mText.Groups[1].Value : "";
            text = url.Item2 + "<br/>" + text;

            var op = JObject.FromObject(
                new
                {
                    title = JObject.FromObject(new { bg = title }),
                    type = "Архив",
                    body = JObject.FromObject(new { bg = text }),
                }
            );

            var mcFiles = Regex.Matches(pageOP, "<a href=\"(/Files[^\"]+?)\"[^>]*?>([\\w\\W]+?)</a>");
            var jaFiles = new JArray();
            foreach (Match f in mcFiles)
            {
                byte[] file;
                try
                {
                    file = wc.DownloadData("http://mjs.bg" + f.Groups[1].Value);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                    continue;
                }

                string hash;
                using (var md5 = MD5.Create())
                {
                    hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                }

                // Prefer the server-supplied file name. The raw parameter value may be quoted
                // and may be followed by further ";"-separated parameters; both are removed so
                // they do not end up in the stored name or extension.
                string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                const string marker = "filename=";
                string fileName = "";
                int index = header.LastIndexOf(marker, StringComparison.OrdinalIgnoreCase);
                if (index > -1)
                {
                    fileName = header.Substring(index + marker.Length).Trim();
                    if (fileName.StartsWith("\"", StringComparison.Ordinal))
                    {
                        int closingQuote = fileName.IndexOf('"', 1);
                        fileName = closingQuote > 0 ? fileName.Substring(1, closingQuote - 1) : fileName.Substring(1);
                    }
                    else
                    {
                        int nextParameter = fileName.IndexOf(';');
                        if (nextParameter > -1)
                        {
                            fileName = fileName.Substring(0, nextParameter).Trim();
                        }
                    }
                }
                if (string.IsNullOrEmpty(fileName))
                {
                    // No usable header value: fall back to the last segment of the link.
                    fileName = Path.GetFileName(f.Groups[1].Value);
                }

                Blob b = new Blob()
                {
                    Content = file,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(fileName),
                    Filename = Path.GetFileName(fileName),
                    Hash = hash
                };
                db.AddBlob(b);

                jaFiles.Add(
                    JObject.FromObject(
                        new
                        {
                            id = Guid.NewGuid().ToString(),
                            title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                            fileType = "",
                            // The attachment pattern above has only two capture groups, so the
                            // original f.Groups[3].Value always produced an empty string.
                            date = string.Empty,
                            file = hash
                        }
                    )
                );
            }
            op["files"] = jaFiles;

            db.SetBlock(new BlockData()
            {
                Block = new JSBlock()
                {
                    BlockId = 0,
                    RubricId = 5,
                    BlockTypeId = "pkmessage",
                    // Name is capped at 199 characters.
                    Name = title.Length > 199 ? title.Substring(0, 199) : title,
                    PortalPartId = "min",
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = op.ToString()
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "6" },
                    new PropertyValue()
                    {
                        PropertyId = "date",
                        Value = opDate.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture)
                    }
                }
            });
        }
    }
}
/// <summary>
/// Scrapes the listing pages in <c>urls</c>, turns every entry into a JSON record (title,
/// body, date, "canceled" flag and attached documents) and writes the collected records to
/// d:\3.json.
/// </summary>
/// <remarks>
/// <para>Attachments are downloaded from http://www.mjs.bg with a one-second pause before each
/// request, hashed with MD5 and passed to <c>db.AddBlob</c>. A failing attachment is reported
/// on the console and left out of the record.</para>
/// <para>An entry without a recognisable date is stored with an empty date, mirroring how a
/// missing title or text is handled.</para>
/// </remarks>
public void Download()
{
    JObject result = JObject.FromObject(new
    {
        title = JObject.FromObject(new { bg = "" }),
        body = JObject.FromObject(new { bg = "" })
    });
    JArray data = new JArray();

    foreach (var url in urls)
    {
        var page = wc.DownloadString(url.Item1);
        var mcCareers = Regex.Matches(page, "</div>\\s+(<div class=\"lTitle\">[\\w\\W]+?)<div class=\"clear\">");
        foreach (Match m in mcCareers)
        {
            string entry = m.Groups[1].Value;
            var mTitle = Regex.Match(entry, "<div class=\"lTitle\">([^<]+?)</div>");
            var mDate = Regex.Match(entry, "<div class=\"lDate\">([0-9\\.]{10})</div>");
            var mText = Regex.Match(entry, "<div class=\"lText\">([^<]+?)</div>");
            var mcFiles = Regex.Matches(entry, "<a href=\"([^\"]+?)\">([^<]*?)</a>");

            // The date is optional like title and text. It is read as day.month.year without
            // depending on the machine culture; other shapes fall back to DateTime.Parse.
            string dateText = "";
            if (mDate.Success)
            {
                DateTime parsed;
                if (!DateTime.TryParseExact(mDate.Groups[1].Value, "d.M.yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out parsed))
                {
                    parsed = DateTime.Parse(mDate.Groups[1].Value);
                }
                dateText = parsed.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
            }

            JObject rec = JObject.FromObject(new
            {
                id = Guid.NewGuid().ToString(),
                type = JObject.FromObject(new { bg = url.Item2 }),
                title = JObject.FromObject(new { bg = mTitle.Success ? mTitle.Groups[1].Value : "" }),
                body = JObject.FromObject(new { bg = mText.Success ? mText.Groups[1].Value : "" }),
                date = dateText,
                canceled = entry.Contains("Приключил!")
            });

            // Sample of the remaining record shape, kept from the original source:
            // "body":{"bg":"666","en":""},"docs":[{"id":"4874634b-09d9-40fb-a42a-349dc2e92805","title":{"bg":"8888","en":""},"link":"BA4DDD4C49C052A75D8E63302D9E8DFE"}],"date":"2019-09-07","canceled":false
            JArray docs = new JArray();
            foreach (Match f in mcFiles)
            {
                try
                {
                    Console.WriteLine($"download{f.Groups[1].Value}");
                    System.Threading.Thread.Sleep(1000);
                    byte[] file = wc.DownloadData("http://www.mjs.bg" + f.Groups[1].Value);

                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(f.Groups[1].Value),
                        Filename = Path.GetFileName(f.Groups[1].Value),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    docs.Add(JObject.FromObject(new
                    {
                        // Links without visible text get a "..." placeholder title.
                        title = JObject.FromObject(new { bg = string.IsNullOrEmpty(f.Groups[2].Value) ? "..." : f.Groups[2].Value }),
                        link = hash
                    }));
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }

            rec["docs"] = docs;
            data.Add(rec);
        }

        // The output file is rewritten after every listing page, so an aborted run still
        // leaves the records gathered so far on disk.
        result["data"] = data;
        File.WriteAllText("d:\\3.json", result.ToString());
    }
}
/// <summary>
/// Crawls profile.gdin.bg for every year from 2014 to the current one and every listing in
/// <c>lUrls</c>, and stores each case (title, text, attachments, creation date) as a
/// "pkmessage" block through <c>db.SetBlock</c>.
/// </summary>
/// <remarks>
/// <para>Listing pages are requested with an increasing "page" parameter until a page yields
/// no case links.</para>
/// <para>Attachments are hashed with MD5 and passed to <c>db.AddBlob</c>; a failing attachment
/// is reported on the console and skipped. A case page without a creation date is reported on
/// the console and skipped before any of its attachments are fetched, so one malformed page
/// does not abort the whole crawl.</para>
/// </remarks>
/// <exception cref="FormatException">A creation date was found but cannot be parsed.</exception>
public void Download()
{
    wc.DownloadString("http://profile.gdin.bg/");
    // A disabled statement in the original extracted only the PHPSESSID value; the whole
    // Set-Cookie header value is replayed instead.
    string cookies = wc.ResponseHeaders["Set-Cookie"];
    wc.Headers.Add(HttpRequestHeader.Cookie, cookies);

    for (int year = 2014; year <= DateTime.Now.Year; year++)
    {
        // Result is discarded. NOTE(review): presumably this selects the year for the
        // session on the server side; confirm.
        wc.DownloadString("http://profile.gdin.bg/?year=" + year);

        foreach (var url in lUrls)
        {
            var page = 0;
            var found = false;
            do
            {
                string pageLinks = this.Download10Times(url.Item1 + "&page=" + page.ToString(), cookies);
                page++;
                var mcLinks = Regex.Matches(pageLinks, "<h6>\\s*<a href=\"(/[^\"]{32})\"");
                found = mcLinks.Count > 0;

                foreach (Match lnk in mcLinks)
                {
                    string caseUrl = "http://profile.gdin.bg" + lnk.Groups[1].Value;
                    string pageOP = this.Download10Times(caseUrl, cookies);
                    var mTitle = Regex.Match(pageOP, "<h4><i[\\w\\W]+?/i>([\\w\\W]+?)</h4>");
                    var mDate = Regex.Match(pageOP, "Дата на създаване на преписката: ([0-9\\.]{10})</div>");
                    var mText = Regex.Match(pageOP, "<div class=\"page-header\">[\\w\\W]+?</div>[\\w\\W]*?>([\\w\\W]+?)<hr />");
                    string title = mTitle.Groups[1].Value;

                    // Validate the date first: the original parsed it only after all
                    // attachments were stored and crashed on a page without one.
                    if (!mDate.Success)
                    {
                        Console.WriteLine($"no creation date, skipped: {caseUrl}");
                        continue;
                    }
                    string rawDate = mDate.Groups[1].Value;
                    DateTime opDate;
                    if (!DateTime.TryParseExact(rawDate, "d.M.yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out opDate))
                    {
                        // Unexpected shape: keep the original culture-sensitive behaviour.
                        opDate = DateTime.Parse(rawDate);
                    }

                    string text = mText.Groups[1].Value;
                    text = text.Replace("</div>", "<br />");
                    text = Regex.Replace(text, "<[/]*div[\\w\\W]*?>", "");
                    // Matches every tag whose name starts with "i", not only <i>.
                    text = Regex.Replace(text, "<[/]*i[\\w\\W]*?>", "");

                    var op = JObject.FromObject(
                        new
                        {
                            title = JObject.FromObject(new { bg = title }),
                            type = url.Item2,
                            body = JObject.FromObject(new { bg = text }),
                        }
                    );

                    var mcFiles = Regex.Matches(pageOP, "<a href=\"(/file[^\"]+?)\"[^>]+?>([\\w\\W]+?)</a>\\s*<small>([^<]+?)</small>");
                    var jaFiles = new JArray();
                    foreach (Match f in mcFiles)
                    {
                        byte[] file;
                        try
                        {
                            file = wc.DownloadData("http://profile.gdin.bg" + f.Groups[1].Value);
                        }
                        catch (Exception e)
                        {
                            Console.WriteLine(e.Message);
                            continue;
                        }
                        System.Threading.Thread.Sleep(1000);

                        string hash;
                        using (var md5 = MD5.Create())
                        {
                            hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                        }

                        // Prefer the server-supplied file name. The raw parameter value may be
                        // quoted and may be followed by further ";"-separated parameters; both
                        // are removed so they do not end up in the stored name or extension.
                        string header = wc.ResponseHeaders["Content-Disposition"] ?? string.Empty;
                        const string marker = "filename=";
                        string fileName = "";
                        int index = header.LastIndexOf(marker, StringComparison.OrdinalIgnoreCase);
                        if (index > -1)
                        {
                            fileName = header.Substring(index + marker.Length).Trim();
                            if (fileName.StartsWith("\"", StringComparison.Ordinal))
                            {
                                int closingQuote = fileName.IndexOf('"', 1);
                                fileName = closingQuote > 0 ? fileName.Substring(1, closingQuote - 1) : fileName.Substring(1);
                            }
                            else
                            {
                                int nextParameter = fileName.IndexOf(';');
                                if (nextParameter > -1)
                                {
                                    fileName = fileName.Substring(0, nextParameter).Trim();
                                }
                            }
                        }
                        if (string.IsNullOrEmpty(fileName))
                        {
                            // No usable header value: invent a unique name with a .pdf extension.
                            fileName = Guid.NewGuid().ToString() + ".pdf";
                        }

                        Blob b = new Blob()
                        {
                            Content = file,
                            ContentType = "application/octet-stream",
                            Extension = Path.GetExtension(fileName),
                            Filename = Path.GetFileName(fileName),
                            Hash = hash
                        };
                        db.AddBlob(b);

                        jaFiles.Add(
                            JObject.FromObject(
                                new
                                {
                                    id = Guid.NewGuid().ToString(),
                                    title = JObject.FromObject(new { bg = f.Groups[2].Value }),
                                    fileType = "",
                                    // Third capture group: the text of the <small> element
                                    // that follows the link.
                                    date = f.Groups[3].Value,
                                    file = hash
                                }
                            )
                        );
                    }
                    op["files"] = jaFiles;

                    db.SetBlock(new BlockData()
                    {
                        Block = new JSBlock()
                        {
                            BlockId = 0,
                            RubricId = 3,
                            BlockTypeId = "pkmessage",
                            // Name is capped at 199 characters.
                            Name = title.Length > 199 ? title.Substring(0, 199) : title,
                            PortalPartId = "gdin",
                            Url = Guid.NewGuid().ToString(),
                            Jsonvalues = op.ToString()
                        },
                        Values = new PropertyValue[]
                        {
                            new PropertyValue() { PropertyId = "header", Value = "11" },
                            new PropertyValue()
                            {
                                PropertyId = "date",
                                Value = opDate.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture)
                            }
                        }
                    });
                }
            } while (found);
        }
    }
}
/// <summary>
/// Pages through the gdin.bg news listing in steps of six, downloads every linked article and
/// stores it (title, paragraph markup, optional main image, date) through <c>db.SetBlock</c>.
/// </summary>
/// <remarks>
/// Images are hashed with MD5 and passed to <c>db.AddBlob</c>. When the main image cannot be
/// downloaded the whole article is skipped; when an inline image cannot be downloaded only
/// that image keeps its original tag. Paging stops at the first listing page without article links.
/// </remarks>
public void Download()
{
    int offset = 0;
    bool anyNews = false;
    do
    {
        string listing = wc.DownloadString($"https://www.gdin.bg/news/{offset}");
        offset += 6;

        MatchCollection articleLinks = Regex.Matches(listing, "<a href=\"(https://www.gdin.bg/news[^\"]+?)\">...виж още</a>");
        anyNews = articleLinks.Count > 0;

        foreach (Match articleLink in articleLinks)
        {
            string article = wc.DownloadString(articleLink.Groups[1].Value);

            string title = Regex.Match(article, "<div class=\"page-title\">\\s+<h1>([\\w\\W]+?)</h1>").Groups[1].Value;
            title = Regex.Replace(title, "<[^>]+>", "");
            // NOTE(review): as written this removes every space from the title; confirm the
            // literal is what was intended.
            title = title.Replace(" ", "");

            // Everything below works on the "details" section only.
            article = Regex.Match(article, "<div class=\"details\">([\\w\\W]+?)<div class=\"right-sidebar\">").Groups[1].Value;
            string time = Regex.Match(article, "datetime=\"([\\w\\W]+?) ").Groups[1].Value;
            string type = Regex.Match(article, "<span class=\"category\">([\\w\\W]+?)</span>").Groups[1].Value;

            Match mainImage = Regex.Match(article, "<div class=\"col-md-7 col-sm-7 col-xs-12\">[\\w\\W]+?src=\"([\\w\\W]+?)\"[\\w\\W]+?</div>");
            string mainImageHash = null;
            if (mainImage.Success)
            {
                string mainImageUrl = mainImage.Groups[1].Value;
                byte[] mainImageData;
                try
                {
                    mainImageData = wc.DownloadData(mainImageUrl);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                    continue; // skips the whole article
                }

                using (var md5 = MD5.Create())
                {
                    mainImageHash = BitConverter.ToString(md5.ComputeHash(mainImageData)).Replace("-", "");
                }

                var mainBlob = new Blob()
                {
                    Content = mainImageData,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(mainImageUrl),
                    Filename = Path.GetFileName(mainImageUrl),
                    Hash = mainImageHash
                };
                db.AddBlob(mainBlob);
            }

            // The body is rebuilt from the paragraph elements only, one per line.
            var body = new StringBuilder();
            foreach (Match paragraph in Regex.Matches(article, "<p[\\w\\W]*?>([\\w\\W]+?)</p>"))
            {
                body.AppendLine(paragraph.Value);
            }
            article = body.ToString();
            article = Regex.Replace(article, "<[/]*div[\\w\\W]*?>", "");
            article = Regex.Replace(article, "<[/]*a[\\w\\W]*?>", "");

            MatchCollection inlineImages = Regex.Matches(article, "<img [\\w\\W]*?src=\"([\\w\\W]+?)\"[\\w\\W]*?>");
            var rewrites = new List<Tuple<string, string>>();
            foreach (Match image in inlineImages)
            {
                string imagePath = image.Groups[1].Value;
                byte[] imageData;
                try
                {
                    imageData = wc.DownloadData("https://www.gdin.bg" + imagePath);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                    continue; // skips this image only
                }

                string imageHash;
                using (var md5 = MD5.Create())
                {
                    imageHash = BitConverter.ToString(md5.ComputeHash(imageData)).Replace("-", "");
                }

                var imageBlob = new Blob()
                {
                    Content = imageData,
                    ContentType = "application/octet-stream",
                    Extension = Path.GetExtension(imagePath),
                    Filename = Path.GetFileName(imagePath),
                    Hash = imageHash
                };
                db.AddBlob(imageBlob);

                rewrites.Add(Tuple.Create(image.Value, imageHash));
            }

            // NOTE(review): the replacement tag points at https://localhost:5001; confirm that
            // an absolute development host is intended in stored content.
            foreach (var rewrite in rewrites)
            {
                article = article.Replace(rewrite.Item1, "<img alt='' src='https://localhost:5001/api/part/getblob?hash=" + rewrite.Item2 + "'>");
            }

            string blockType = type == "Новина" ? "new" : "ad";
            // Name is capped at 199 characters.
            string blockName = title.Length > 199 ? title.Substring(0, 199) : title;
            string json = JObject.FromObject(new
            {
                title = JObject.FromObject(new { bg = title }),
                body = JObject.FromObject(new { bg = article }),
                imageId = mainImageHash
            }).ToString();

            var block = new BlockData()
            {
                Block = new JSBlock()
                {
                    BlockId = 0,
                    BlockTypeId = blockType,
                    Name = blockName,
                    PortalPartId = "gdin",
                    RubricId = 3,
                    Url = Guid.NewGuid().ToString(),
                    Jsonvalues = json
                },
                Values = new PropertyValue[]
                {
                    new PropertyValue() { PropertyId = "header", Value = "6" },
                    new PropertyValue() { PropertyId = "date", Value = time }
                }
            };
            db.SetBlock(block);
        }
    } while (anyNews);
}
/// <summary>
/// Downloads the monthly bulletin listings from http://www.mjs.bg/9/{year}/{month} for every
/// month from 2012 to the current year, stores each linked file as a blob and writes an index
/// of the stored documents to d:\1.json.
/// </summary>
/// <remarks>
/// <para>Dates and download links are matched separately and paired by position; a month whose
/// counts differ is reported on the console and only the common prefix is processed.
/// NOTE(review): if a date in the middle of a page has no link, the pairing after it is
/// shifted; verify against the page markup.</para>
/// <para>Any failure while handling a single entry is reported on the console and that entry
/// is skipped.</para>
/// </remarks>
public void Download()
{
    JArray docs = new JArray();
    // Shape of one stored entry, from a sample kept in the original source:
    // {"id":"116454da-620c-4ad0-b0ef-4399c561a067","title":{"bg":"24.04.2016","en":""},"docId":"4C3216697058FE257B4A264CBE69A861","date":"2016-04-24"}
    for (int year = 2012; year <= DateTime.Now.Year; year++)
    {
        for (int month = 1; month < 13; month++)
        {
            var page = wc.DownloadString($"http://www.mjs.bg/9/{year}/{month}");
            var mcDates = Regex.Matches(page, "<div class=\\\"Date\\\">([0-9\\.]+)</div>");
            var mcFiles = Regex.Matches(page, "<a href=\\\"([^\\\"]+)\\\">Изтегли</a>");

            // Report a count mismatch once instead of relying on an out-of-range exception
            // for every date that has no link.
            if (mcDates.Count != mcFiles.Count)
            {
                Console.WriteLine($"error:{year}/{month} dates:{mcDates.Count} files:{mcFiles.Count}");
            }
            int pairs = Math.Min(mcDates.Count, mcFiles.Count);

            for (int i = 0; i < pairs; i++)
            {
                try
                {
                    string rawDate = mcDates[i].Groups[1].Value;
                    string link = mcFiles[i].Groups[1].Value;

                    // Parse before downloading so a bad date does not leave an unreferenced
                    // blob behind. The explicit format removes the dependency on the machine
                    // culture; other shapes fall back to DateTime.Parse.
                    DateTime bulletinDate;
                    if (!DateTime.TryParseExact(rawDate, "d.M.yyyy", System.Globalization.CultureInfo.InvariantCulture, System.Globalization.DateTimeStyles.None, out bulletinDate))
                    {
                        bulletinDate = DateTime.Parse(rawDate);
                    }

                    byte[] file = wc.DownloadData("http://www.mjs.bg" + link);
                    string hash;
                    using (var md5 = MD5.Create())
                    {
                        hash = string.Join("", md5.ComputeHash(file).Select(x => x.ToString("X2")));
                    }

                    Blob b = new Blob()
                    {
                        Content = file,
                        ContentType = "application/octet-stream",
                        Extension = Path.GetExtension(link),
                        Filename = Path.GetFileName(link),
                        Hash = hash
                    };
                    db.AddBlob(b);

                    docs.Add(
                        JObject.FromObject(
                            new
                            {
                                id = Guid.NewGuid().ToString(),
                                // The displayed date doubles as the title in both languages.
                                title = JObject.FromObject(new { bg = rawDate, en = rawDate }),
                                docId = hash,
                                date = bulletinDate.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture)
                            }
                        )
                    );
                    Console.WriteLine($"date:{rawDate} file:{file.Length}");
                }
                catch (Exception e)
                {
                    Console.WriteLine($"error:{e.Message}");
                }
            }
        }
    }

    JObject result = JObject.FromObject(
        new
        {
            title = JObject.FromObject(new { bg = "Себра бюлетин" }),
            docs = docs
        }
    );
    File.WriteAllText(@"d:\1.json", result.ToString());
}
/// <summary>
/// Downloads every file linked from the "Panel1a" section of http://mjs.bg/2155/, stores each
/// as a blob and writes a list of paragraph-wrapped links to the stored blobs to d:\opdu.txt.
/// </summary>
/// <remarks>
/// Progress is printed to the console as "processed/total". Consecutive links that resolve to
/// the same MD5 hash are collapsed into one entry whose text is the individual link texts
/// joined by a space. A failed download is not caught here.
/// </remarks>
public void Download()
{
    string html = wc.DownloadString("http://mjs.bg/2155/");
    Match panel = Regex.Match(html, "<div class=\"Panel1a\">([\\w\\W]+?)<div class=\"Panel1a_column2\">");
    MatchCollection anchors = Regex.Matches(panel.Groups[1].Value, "<a href=\"([\\w\\W]+?)\"[\\w\\W]*?>([\\w\\W]+?)</a>");

    // (hash, link text) for every anchor that was downloaded, in page order.
    var stored = new List<Tuple<string, string>>();
    int processed = 0;
    foreach (Match anchor in anchors)
    {
        string inner = anchor.Groups[2].Value;
        // A match whose text contains another anchor start is ignored.
        if (inner.Contains("<a"))
        {
            continue;
        }

        string link = anchor.Groups[1].Value.Replace("../", "");
        bool looksAbsolute = link.Contains("http");
        if (!looksAbsolute && !link.StartsWith("/"))
        {
            link = "/" + link;
        }

        // Link text with all markup removed.
        string label = Regex.Replace(inner, "<[^>]+>", "", RegexOptions.Multiline);

        string target = looksAbsolute ? link : "http://www.justice.government.bg" + link;
        byte[] content = wc.DownloadData(target);

        string digest;
        using (var md5 = MD5.Create())
        {
            digest = BitConverter.ToString(md5.ComputeHash(content)).Replace("-", "");
        }

        var blob = new Blob()
        {
            Content = content,
            ContentType = "application/octet-stream",
            Extension = Path.GetExtension(link),
            Filename = Path.GetFileName(link),
            Hash = digest
        };
        db.AddBlob(blob);

        stored.Add(Tuple.Create(digest, label));
        processed++;
        Console.WriteLine($"{processed}/{anchors.Count}");
    }

    // Collapse runs of neighbouring entries that share a hash into a single entry.
    var merged = new List<Tuple<string, string>>();
    foreach (var entry in stored)
    {
        int last = merged.Count - 1;
        if (last >= 0 && merged[last].Item1 == entry.Item1)
        {
            merged[last] = Tuple.Create(merged[last].Item1, merged[last].Item2 + " " + entry.Item2);
        }
        else
        {
            merged.Add(entry);
        }
    }

    var output = new StringBuilder();
    foreach (var entry in merged)
    {
        output.AppendLine($"<p><a href=\"/api/part/GetBlob?hash={entry.Item1}\">{entry.Item2}</a></p>");
    }
    File.WriteAllText(@"d:\opdu.txt", output.ToString());
}