/// <summary>
/// Converts an HTML page to flattened plain text. Everything up to and including the
/// closing head tag is discarded, the remainder is rendered through MSHTML, and the
/// resulting text has line breaks replaced by spaces and parentheses/periods removed.
/// </summary>
/// <param name="html">The HTML markup to convert; may be null or empty.</param>
/// <returns>The flattened text, or an empty string when there is nothing to render.</returns>
static string HtmlToString(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // Only strip the head when one is actually present. Previously a missing
    // "</head>" made IndexOf return -1 and silently dropped the first 7 characters.
    string resultHtml = html;
    int indexOfHead = html.IndexOf("</head>");
    if (indexOfHead >= 0)
    {
        // The original offset of 8 (tag length 7 plus the following character) is
        // kept, but clamped so a document that ends right after the tag cannot throw.
        int index = indexOfHead + 8;
        if (index > html.Length)
        {
            index = html.Length;
        }
        resultHtml = html.Substring(index);
    }

    HTMLDocument htmldoc = new HTMLDocument();
    IHTMLDocument2 htmldoc2 = (IHTMLDocument2)htmldoc;
    htmldoc2.write(new object[] { resultHtml });

    // Guard against a document without a body or a body without text.
    var body = htmldoc2.body;
    string text = body == null ? null : body.outerText;
    if (text == null)
    {
        return "";
    }

    return text
        .Replace("\n", " ")
        .Replace("\r", " ")
        .Replace("(", "")
        .Replace(")", "")
        .Replace(".", "");
}
// Obsolete method. Extracts coin names by parsing HTML. Use GetAllWtmCoinNamesFromJson instead.
/// <summary>
/// Downloads the whattomine.com calculators page and collects the coin names found in
/// its coin links.
/// </summary>
/// <returns>
/// The distinct upper-cased coin names, or <c>null</c> when the download returned <c>null</c>.
/// </returns>
public static async Task<List<string>> GetAllWtmCoinNamesFromWeb()
{
    string allCoinsHTML = await WebDownloadAsync(@"http://whattomine.com/calculators", false).ConfigureAwait(false);
    if (allCoinsHTML == null)
    {
        return null;
    }

    IHTMLDocument2 doc = (IHTMLDocument2)(new HTMLDocument());
    doc.write(allCoinsHTML);

    // Relative links are resolved against "about:" because the document has no base URL.
    Regex reg = new Regex(@"^(about:/coins/)\d{1,3}-(?<Coin>\w+)-");
    var links = new List<string>();

    // The links collection can also contain <area> elements, which do not implement
    // IHTMLAnchorElement; a typed foreach would throw InvalidCastException on them.
    foreach (object item in doc.links)
    {
        var anchor = item as IHTMLAnchorElement;
        if (anchor == null)
        {
            continue;
        }

        string href = anchor.href;
        if (href == null)
        {
            continue; // Regex.Match(null) would throw
        }

        string coin = reg.Match(href).Groups["Coin"].Value;
        if (coin != string.Empty)
        {
            // Invariant casing keeps the names stable regardless of the current culture.
            links.Add(coin.ToUpperInvariant());
        }
    }

    return links.Distinct().ToList();
}
/// <summary>
/// Converts an HTML string into a DOM tree.
/// </summary>
/// <param name="html">The HTML string.</param>
/// <returns>The HTMLDocument the markup was written into.</returns>
public HTMLDocument GetDocument(string html)
{
    HTMLDocumentClass parsed = new HTMLDocumentClass();

    // write() is exposed through the IHTMLDocument2 view of the document.
    IHTMLDocument2 writer = (IHTMLDocument2)parsed;
    writer.write(html);

    return (HTMLDocument)writer;
}
/// <summary>
/// Fetches the page at <paramref name="vsURL"/> through <c>cWeb</c>, parses it with MSHTML
/// and passes the tag name of every element to a <c>cFile</c> created for "ele.txt".
/// </summary>
/// <param name="vsURL">Address of the page to fetch.</param>
/// <returns>Always <c>false</c>.</returns>
public bool test3(string vsURL)
{
    string pageSource = new cWeb().GrabPageToString(vsURL);

    // Build the document and feed it the downloaded markup.
    IHTMLDocument2 parsedPage = (IHTMLDocument2)new mshtml.HTMLDocument();
    parsedPage.write(pageSource);

    // Never read; kept only so the rewrite matches the original step for step.
    List<IHTMLElement> collected = new List<IHTMLElement>();

    cFile tagLog = new cFile("ele.txt");
    foreach (IHTMLElement node in parsedPage.all)
    {
        tagLog.Write(node.tagName);
    }

    return false;
}
/// <summary>
/// Writes <paramref name="html"/> into the <c>doc</c> member; null or empty input is ignored.
/// </summary>
/// <param name="html">Markup to write.</param>
public void SetHTML(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return;
    }

    doc.write(html);
}
/// <summary>
/// Renders the HTML version of an e-mail message through MSHTML and returns its plain text.
/// </summary>
/// <param name="msg">The message to convert; may be null.</param>
/// <returns>
/// The text of the HTML body, or <c>null</c> when the message is null, has no HTML part,
/// or produces no text. Every failure is logged.
/// </returns>
public string ConvertEmailToTxt(Message msg)
{
    if (msg == null)
    {
        Logger.LogError("Email message cannot be coverted to text");
        return null;
    }

    // NOTE(review): presumably FindFirstHtmlVersion returns null for a message without an
    // HTML part; previously that case ended in a NullReferenceException.
    MessagePart html = msg.FindFirstHtmlVersion();
    if (html == null)
    {
        Logger.LogError("Email converter to Text Failed");
        return null;
    }

    // A null body is written as an empty document, as the StringBuilder round-trip did.
    string bodyHtml = html.GetBodyAsText() ?? string.Empty;

    HTMLDocument htmldoc = new HTMLDocument();
    IHTMLDocument2 htmldoc2 = (IHTMLDocument2)htmldoc;
    htmldoc2.write(new object[] { bodyHtml });

    var body = htmldoc2.body;
    string emailasTxt = body == null ? null : body.outerText;
    if (string.IsNullOrEmpty(emailasTxt))
    {
        Logger.LogError("Email converter to Text Failed");
        return null;
    }

    Logger.LogDebug("Email converted to Text");
    return emailasTxt;
}
/// <summary>
/// Creates a new MSHTML document and writes <paramref name="html"/> into it.
/// </summary>
/// <param name="html">Markup to load.</param>
/// <returns>The populated document.</returns>
private HTMLDocument GetDoc(string html)
{
    HTMLDocument parsed = new HTMLDocumentClass();
    object[] content = { html };

    // write() lives on the IHTMLDocument2 interface of the same object.
    ((IHTMLDocument2)parsed).write(content);

    return parsed;
}
/// <summary>
/// Creates an MSHTML document, writes <paramref name="source"/> into it and closes the
/// write stream.
/// </summary>
/// <param name="source">Markup to load.</param>
/// <returns>The document, viewed through IHTMLDocument2.</returns>
private static IHTMLDocument2 GetMsHtmlDocument(String source)
{
    HTMLDocument created = new HTMLDocument();
    IHTMLDocument2 parsed = (IHTMLDocument2)created;

    object[] content = { source };
    parsed.write(content);
    parsed.close();

    return parsed;
}
/// <summary>
/// Rebuilds <c>Phones</c> from an XML envelope: the content of its first "html" element
/// (optionally wrapped in CDATA) is parsed as HTML and every div carrying the
/// forwarding-number class contributes one phone.
/// </summary>
/// <param name="xml">The XML envelope text.</param>
public void UpdatePhone(string xml)
{
    const string CDataOpen = "<![CDATA[";
    const string ForwardingNumberClass = "gc-forwarding-number-ani goog-inline-block";

    try
    {
        Phones.Clear();

        XmlDocument envelope = new XmlDocument();
        envelope.LoadXml(xml);

        XmlNodeList htmlNodes = envelope.GetElementsByTagName("html");
        if (htmlNodes.Count <= 0)
        {
            Trace.WriteLine("Couldn't find HTML node in Phone XML");
            return;
        }

        xml = htmlNodes[0].InnerXml.Trim();
        if (!xml.StartsWith(CDataOpen))
        {
            Trace.WriteLine("No CDATA!");
        }
        else
        {
            // Drop the opening marker and the last three characters (the "]]>" terminator).
            xml = xml.Substring(CDataOpen.Length);
            xml = xml.Substring(0, xml.Length - 3);
            xml = xml.Trim();
        }

        HTMLDocument page = new HTMLDocumentClass();
        ((IHTMLDocument2)page).write(new object[] { xml });

        foreach (IHTMLElement div in page.getElementsByTagName("div"))
        {
            if (div.className != ForwardingNumberClass)
            {
                continue;
            }

            Phone found = new Phone();
            found.Number = div.innerText;
            Phones.Add(found);
            Trace.WriteLine("Found Phone: " + found.Number);
        }
    }
    catch (Exception ex)
    {
        // Any failure is only traced.
        Trace.Write("XError loading Phone: " + ex);
    }
}
/// <summary>
/// Parses a response body into an MSHTML document and closes the write stream.
/// </summary>
/// <param name="responseText">Markup to load.</param>
/// <returns>The populated document.</returns>
private static HTMLDocument DOMParser(string responseText)
{
    HTMLDocument parsed = new HTMLDocument();

    // Writing and closing are done through the IHTMLDocument2 view of the same object.
    IHTMLDocument2 writer = (IHTMLDocument2)parsed;
    object[] content = { responseText };
    writer.write(content);
    writer.close();

    return parsed;
}
/// <summary>
/// Extracts the reference number from the element whose id is "ReferenceNo".
/// </summary>
/// <param name="_html">HTML response</param>
/// <returns>
/// The element's text from position <c>ReferenceStartIndex</c> to its end.
/// </returns>
private string ExtractReferenceNo(string _html)
{
    // Load the string as an HTML document.
    HTMLDocument parsed = new HTMLDocument();
    ((IHTMLDocument2)parsed).write(_html);

    string labelText = parsed.getElementById("ReferenceNo").innerText;

    // Everything from the start index onward is the reference number.
    return labelText.Substring(ReferenceStartIndex);
}
/// <summary>
/// Parses <paramref name="htmlContent"/> with MSHTML, prints the body's plain text to the
/// console and returns it.
/// </summary>
/// <param name="htmlContent">Markup to load.</param>
/// <returns>The inner text of the document body.</returns>
public static string RetrieveInnerTextContent(string htmlContent)
{
    HTMLDocument container = new HTMLDocumentClass();
    IHTMLDocument2 page = (IHTMLDocument2)container;

    page.write(new object[] { htmlContent });
    page.close();

    // All plain text in the body; echoed to the console before being returned.
    string bodyText = page.body.innerText;
    Console.WriteLine(bodyText);
    return bodyText;
}
// Obsolete hashrate extractor from html. !!!!!!!!!!!!!!!!!!!!!!!!!
/// <summary>
/// Parses <paramref name="html"/> and reads the "defaultValue" attribute of the element
/// with id "hr" as an invariant-culture double.
/// </summary>
/// <param name="html">Markup to load.</param>
/// <returns>The parsed default hashrate.</returns>
private static double GetDefaultHashrateFromHtml(string html)
{
    IHTMLDocument2 page = (IHTMLDocument2)new HTMLDocument();
    page.clear();
    page.write(html);

    // getElementById is exposed on the IHTMLDocument3 view of the document.
    var hashrateInput = ((IHTMLDocument3)page).getElementById("hr");
    object rawDefault = hashrateInput.getAttribute("defaultValue");

    return Convert.ToDouble(rawDefault, CultureInfo.InvariantCulture);
}
/// <summary>
/// Loads the current content into an MSHTML document, lets <paramref name="interop"/>
/// manipulate it, stores the resulting outer HTML back as the content and saves it to a
/// time-stamped "*-transformed.html" file in the working directory.
/// </summary>
/// <param name="interop">Callback that receives the loaded document.</param>
/// <returns>This instance, with <c>content</c> and <c>absolutePath</c> updated.</returns>
public DocumentModel Transform(Action<HTMLDocumentClass> interop)
{
    HTMLDocumentClass myDocument = new HTMLDocumentClass();
    IHTMLDocument2 doc2 = myDocument;
    doc2.write(new object[] { this.content });

    interop(myDocument);
    this.content = myDocument.documentElement.outerHTML;

    // The previous "hh:mm:ss" stamp put ':' into the file name, which Windows rejects,
    // and the 12-hour "hh" could collide between AM and PM. Use a 24-hour,
    // separator-safe, culture-independent stamp instead.
    string stamp = DateTime.Now.ToString("HH-mm-ss-fffff", System.Globalization.CultureInfo.InvariantCulture);
    string fileName = string.Format("{0}-transformed.html", stamp);

    this.absolutePath = Path.Combine(this.workingDirectory, fileName);
    File.WriteAllText(this.absolutePath, this.content, Encoding.UTF8);
    return this;
}
/// <summary>
/// Converts HTML to plain text: script and style blocks are removed first, then the
/// remaining markup is rendered through MSHTML and the body's text is returned.
/// </summary>
/// <param name="html">The HTML markup; may be null or empty.</param>
/// <returns>The body text, or an empty string when there is nothing to render.</returns>
static internal string PlainText(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // HTML tag names are case-insensitive, so <SCRIPT> and <Style> must be stripped too.
    html = Regex.Replace(html, @"<script[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
    html = Regex.Replace(html, @"<style[^>]*>[\s\S]*?</style>", "", RegexOptions.IgnoreCase);

    HTMLDocument htmldoc = new HTMLDocument();
    IHTMLDocument2 htmldoc2 = (IHTMLDocument2)htmldoc;
    htmldoc2.write(html);

    // Guard against a document that ended up without a body.
    var body = htmldoc2.body;
    return body == null ? string.Empty : body.outerText;
}
/// <summary>
/// Parses <paramref name="Rstring"/> as HTML and concatenates the href attribute of every
/// entry in the document's links collection, each followed by the separator "/n".
/// </summary>
/// <param name="Rstring">Markup to load.</param>
/// <returns>The concatenated hrefs; empty when the document has no links.</returns>
private String GetContent(String Rstring)
{
    HTMLDocument d = new HTMLDocument();
    IHTMLDocument2 doc = (IHTMLDocument2)d;
    doc.write(Rstring);

    // A builder avoids re-copying the whole accumulated string for every link.
    System.Text.StringBuilder hrefs = new System.Text.StringBuilder();
    foreach (IHTMLElement link in doc.links)
    {
        // Flag 0 is passed through unchanged; a null attribute appends nothing.
        hrefs.Append(link.getAttribute("href", 0));

        // NOTE(review): "/n" looks like a typo for "\n"; left unchanged because callers
        // may split on the literal text - confirm before changing.
        hrefs.Append("/n");
    }

    return hrefs.ToString();
}
/// <summary>
/// Reads the whole file with the <c>UTF8Encoding</c> member and loads its text into a new
/// MSHTML document.
/// </summary>
/// <param name="fileName">Path of the HTML file to read.</param>
/// <returns>The populated document.</returns>
private HTMLDocument ReadHtmlDocument(string fileName)
{
    string markup;
    using (StreamReader source = new StreamReader(fileName, UTF8Encoding))
    {
        markup = source.ReadToEnd();
    }

    HTMLDocument parsed = new HTMLDocument();
    IHTMLDocument2 sink = parsed as IHTMLDocument2;
    sink.write(new object[] { markup });

    return parsed;
}
/// <summary>
/// Handles completion of a search: transforms the result through <c>transform</c> and
/// replaces the content of the document hosted by <c>Explorer</c> with the output.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Carries the search result to transform.</param>
void Instance_SearchComplete(object sender, SearchEventArgs e)
{
    string html;

    // Dispose both writers deterministically and flush before reading the text so no
    // buffered output is lost.
    using (StringWriter writer = new StringWriter())
    using (XmlTextWriter xmlWriter = new XmlTextWriter(writer))
    {
        transform.Transform(e.Result, xmlWriter);
        xmlWriter.Flush();
        html = writer.ToString();
    }

    IHTMLDocument2 doc = (IHTMLDocument2)Explorer.Document;
    doc.clear();
    try
    {
        doc.body.innerHTML = "";
        doc.write(html);
    }
    catch (System.Exception ex)
    {
        // Rendering stays best-effort, but the failure is no longer swallowed silently.
        System.Diagnostics.Trace.WriteLine("Instance_SearchComplete: failed to render search result: " + ex);
    }
}
/// <summary>
/// Downloads the resource at <paramref name="url"/>, parses it with MSHTML while collecting
/// all tag names and image sources into local lists, and returns the raw downloaded text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The downloaded HTML exactly as received.</returns>
public string getResultsViaWebClient(string url)
{
    string htmlContent;

    // using blocks release the client, stream and reader even when the download throws.
    using (WebClient client = new WebClient())
    using (Stream data = client.OpenRead(new Uri(url)))
    using (StreamReader reader = new StreamReader(data))
    {
        htmlContent = reader.ReadToEnd();
    }

    // Obtain the document interface and construct the document.
    IHTMLDocument2 htmlDocument = (IHTMLDocument2)new mshtml.HTMLDocument();
    htmlDocument.write(htmlContent);

    // NOTE(review): both lists are filled but never read or returned.
    List<string> images = new List<string>();
    List<string> allElementsList = new List<string>();

    // Collect the tag name of every element.
    foreach (IHTMLElement element in htmlDocument.all)
    {
        allElementsList.Add(element.tagName);
    }

    // Collect the source of every image element.
    foreach (IHTMLImgElement img in htmlDocument.images)
    {
        images.Add(img.src);
    }

    return htmlContent;
}
/// <summary>
/// Strips internal text from a repro step. The internal text is wrapped in a div, so the
/// first div that has a child node whose value contains <paramref name="keyText"/>
/// (generally a GUID) gets all of its child nodes removed.
/// </summary>
/// <param name="inputHTML">HTML of the repro step.</param>
/// <param name="keyText">Text identifying the internal div.</param>
/// <returns>The outer HTML of the document body after the removal.</returns>
internal static string RemoveInternalHTML(string inputHTML, string keyText)
{
    HTMLDocument doc = new HTMLDocument();
    IHTMLDocument2 writer = doc as IHTMLDocument2;
    writer.write(new object[] { inputHTML });

    // Locate the first div that has a child whose node value mentions the key text.
    IHTMLDOMNode internalDiv = null;
    var divs = doc.getElementsByTagName("div");
    if (divs != null)
    {
        internalDiv = divs
            .Cast<IHTMLDOMNode>()
            .FirstOrDefault(div => GetChildren(div).Any(child =>
            {
                string value = child.nodeValue?.ToString();
                return value != null && value.Contains(keyText);
            }));
    }

    if (internalDiv != null)
    {
        // Snapshot the children first; removing while enumerating the live list is unsafe.
        foreach (IHTMLDOMNode child in GetChildren(internalDiv).ToList())
        {
            internalDiv.removeChild(child);
        }
    }

    return doc.body.outerHTML;
}
/// <summary>
/// Downloads a fixed Google News page, parses it with MSHTML and prints the tag name of
/// every element to the console.
/// </summary>
/// <returns>Always an empty string.</returns>
public string ParseRssFile()
{
    string html;

    // The WebClient is now disposed together with the stream and the reader.
    using (WebClient webClient = new WebClient())
    using (Stream stream = webClient.OpenRead(new Uri("https://news.google.ca/news/section?cf=all&pz=1&q=GTARealEstate")))
    using (StreamReader reader = new StreamReader(stream))
    {
        html = reader.ReadToEnd();
    }

    IHTMLDocument2 doc = (IHTMLDocument2)new HTMLDocument();
    doc.write(html);

    foreach (IHTMLElement el in doc.all)
    {
        Console.WriteLine(el.tagName);
    }

    return "";
}
/// <summary>
/// Converts HTML to plain text. MSHTML is tried first; if that throws, a regex fallback
/// turns paragraph and line-break tags into CRLF and strips every remaining tag.
/// </summary>
/// <param name="strHTML">Markup to convert.</param>
/// <returns>The plain-text rendering.</returns>
public static string GetPlainTextFromHTML(string strHTML)
{
    try
    {
        IHTMLDocument2 parsed = (IHTMLDocument2)new HTMLDocument();
        parsed.write(new object[] { strHTML });
        return parsed.body.outerText;
    }
    catch (Exception)
    {
        // Fallback path: approximate the text with regular expressions.
        string withBreaks = Regex.Replace(strHTML, @"<p>|</p>|<br>|<br />", "\r\n");
        return Regex.Replace(withBreaks, @"\<[^\>]*\>", string.Empty);
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and loads it into an MSHTML document,
/// waiting for the document body to become available.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The loaded document, or <c>null</c> when the download or parsing fails or the body
/// does not appear within the polling budget.
/// </returns>
public static HTMLDocument LoadDocument(string url)
{
    const int PollIntervalMs = 5000;

    // The wait used to be unbounded and could hang the caller forever; give up after
    // about a minute and report failure the same way as any other error (null).
    const int MaxPolls = 12;

    try
    {
        string htmlContent = DownloadString(url);

        // Load HTML with injected scripts
        object[] oPageText = { htmlContent };
        HTMLDocument doc = new HTMLDocumentClass();
        IHTMLDocument2 doc2 = (IHTMLDocument2)doc;
        doc2.write(oPageText);

        for (int poll = 0; doc2.body == null; poll++)
        {
            if (poll >= MaxPolls)
            {
                return null;
            }
            Thread.Sleep(PollIntervalMs);
        }

        return doc;
    }
    catch (Exception)
    {
        // Failures are reported to the caller as null (no logger is wired up here).
        return null;
    }
}
/// <summary>
/// Gets the DOM tree of the current page.
/// </summary>
/// <returns>
/// A fresh empty document when the browser or page is not loaded yet; otherwise a document
/// that receives the page source once the asynchronous source request completes.
/// </returns>
private HTMLDocument GetDocument()
{
    if (!(IsChromeLoaded && IsDocumentLoaded))
    {
        return new HTMLDocument();
    }

    IHTMLDocument2 target = (IHTMLDocument2)(new HTMLDocumentClass());

    // The source request is started through the dispatcher.
    Task<string> pendingSource = null;
    this.Dispatcher.Invoke(() =>
    {
        pendingSource = WebBrowser.GetBrowser().MainFrame.GetSourceAsync();
    });

    // The document is filled in only when the request completes, which may happen after
    // this method has already returned it.
    pendingSource.GetAwaiter().OnCompleted(() =>
    {
        this._isDocumentGeted = true;
        target.write(pendingSource.Result.ToString());
    });

    return (HTMLDocument)target;
}
// Obsolete method. Extracts coin links by parsing HTML. Use GetWtmLinksFromJson instead.
/// <summary>
/// Downloads the whattomine.com calculators page and builds, for every coin link found,
/// the absolute coin page link and the matching ".json" link.
/// </summary>
/// <param name="cancelToken">Token forwarded to the download.</param>
/// <returns>
/// A dictionary keyed by upper-cased coin name, or <c>null</c> when the download returned
/// <c>null</c>.
/// </returns>
public static async Task<Dictionary<string, WtmLinks>> GetWtmLinks(CancellationToken cancelToken = default(CancellationToken))
{
    string allCoinsHTML = await WebDownloadAsync(@"http://whattomine.com/calculators", false, cancelToken).ConfigureAwait(false);
    if (allCoinsHTML == null)
    {
        return null;
    }

    var result = new Dictionary<string, WtmLinks>();
    IHTMLDocument2 doc = (IHTMLDocument2)(new HTMLDocument());
    doc.write(allCoinsHTML);

    // Collect the distinct hrefs. The links collection can also contain <area> elements,
    // which do not implement IHTMLAnchorElement, so cast defensively; null hrefs are
    // skipped because Regex.Match(null) would throw.
    var hs = new HashSet<string>();
    foreach (object item in doc.links)
    {
        var anchor = item as IHTMLAnchorElement;
        if (anchor != null && anchor.href != null)
        {
            hs.Add(anchor.href);
        }
    }

    // Relative links are resolved against "about:" because the document has no base URL.
    Regex reg = new Regex(@"^(about:/coins/)\d{1,3}");
    Regex regCoin = new Regex(@"^(about:/coins/)\d{1,3}-(?<Coin>\w+)-");

    foreach (var link in hs)
    {
        var match = reg.Match(link);
        if (!match.Success)
        {
            continue;
        }

        // Links without a recognisable coin name used to be stored under the empty key,
        // each one overwriting the previous; skip them instead.
        var matchCoin = regCoin.Match(link);
        if (!matchCoin.Success)
        {
            continue;
        }

        // Invariant casing keeps the keys stable regardless of the current culture.
        string coin = matchCoin.Groups["Coin"].Value.ToUpperInvariant();
        result[coin] = new WtmLinks
        {
            CoinLink = link.Replace("about:", "http://whattomine.com"),
            JsonLink = match.Value.Replace("about:", "http://whattomine.com") + ".json"
        };
    }

    return result;
}
// Load the document in a worker thread
/// <summary>
/// Parses <c>Content</c> into <c>_parsedHtml</c> and waits until the document reports
/// "complete", the state-change event is signalled, or the worker is asked to stop.
/// Any exception is stored in <c>_parsingException</c>; <c>_loadDocumentResetEvent</c>
/// is always set on exit.
/// </summary>
/// <param name="state">Unused.</param>
private void LoadDocumentInMtaThread(Object state)
{
    try
    {
        // Create the document and subscribe to its ready-state notifications.
        _parsedHtml = (IHTMLDocument2)new HTMLDocument();
        HTMLDocumentEvents2_Event documentEvents = (HTMLDocumentEvents2_Event)_parsedHtml;
        documentEvents.onreadystatechange += _onreadystatechangeEventHandler;

        // Write the content and close the document.
        _parsedHtml.write(Content);
        _parsedHtml.close();

        // Wait for the onReadyStateChange event to be fired. On IE9 this never happens,
        // so the readyState is polled directly every 100 ms as well.
        while (!_stopWorkerThread)
        {
            bool loaded = String.Equals("complete", _parsedHtml.readyState, StringComparison.OrdinalIgnoreCase);
            if (loaded || _stateChangeResetEvent.Wait(100))
            {
                break;
            }
        }

        // Detach the event handler.
        documentEvents.onreadystatechange -= _onreadystatechangeEventHandler;
    }
    catch (Exception e)
    {
        _parsingException = e;
    }
    finally
    {
        _loadDocumentResetEvent.Set();
    }
}
/// <summary>
/// Produces the de-noised data: reads the raw file at <c>path</c>, renders it through
/// MSHTML and stores the whitespace-normalised text in <c>denoisingdata</c>, also saving
/// it to "denoisinged.txt". On a file-not-found, path-too-long or argument error
/// <c>denoisingdata</c> is set to null.
/// </summary>
public void Work() // obtain the de-noised data
{
    try
    {
        // Read the raw data first.
        rawdata = File.ReadAllText(path, Encoding.UTF8);
        rawdata = rawdata.Replace("script", "soript");

        HTMLDocument doc = new HTMLDocument();
        IHTMLDocument2 writer = (IHTMLDocument2)doc;
        writer.write(rawdata);

        // De-noise the page: keep only the text of the document element.
        var root = doc.documentElement;
        denoisingdata = root == null ? "" : root.innerText;

        // If the encoding does not match, the text is expected to be null.
        if (denoisingdata == null)
        {
            return;
        }

        // De-noised data was obtained: normalise separators and whitespace, then save.
        denoisingdata = denoisingdata.Replace("|", " ");
        denoisingdata = Regex.Replace(denoisingdata, "\\s+", " ");
        File.WriteAllText("denoisinged.txt", denoisingdata);
    }
    catch (FileNotFoundException)
    {
        denoisingdata = null;
    }
    catch (PathTooLongException)
    {
        denoisingdata = null;
    }
    catch (ArgumentException)
    {
        denoisingdata = null;
    }
}
/// <summary>
/// Downloads the LCS bug page for <paramref name="lcsId"/> and returns the text of the
/// first non-blank element that follows the element whose text is "Bug Title".
/// </summary>
/// <param name="lcsId">Identifier appended to <c>LCS_BUG_URI</c>.</param>
/// <returns>
/// The text found, or an empty string when the id is blank or no such element exists.
/// </returns>
private string GetLcsDescription(string lcsId)
{
    if (string.IsNullOrWhiteSpace(lcsId))
    {
        return string.Empty;
    }

    string htmlContent;

    // Dispose the client once the page has been downloaded.
    using (WebClient client = new WebClient())
    {
        client.Headers.Add("user-agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
        client.UseDefaultCredentials = true;
        htmlContent = client.DownloadString(this.LCS_BUG_URI + lcsId);
    }

    // Obtain the document interface and construct the document.
    IHTMLDocument2 htmlDocument = (IHTMLDocument2)new mshtml.HTMLDocument();
    htmlDocument.write(htmlContent);

    bool bugTitleFound = false;
    foreach (IHTMLElement element in htmlDocument.all)
    {
        // Read the text once per element instead of up to four times.
        string text = element.outerText;

        if (text == "Bug Title")
        {
            bugTitleFound = true;
            continue;
        }

        if (bugTitleFound && !string.IsNullOrWhiteSpace(text))
        {
            return text;
        }
    }

    return string.Empty;
}
// Load the document in a worker thread
/// <summary>
/// Parses <c>Content</c> into <c>_parsedHtml</c> and waits until the document reports
/// "complete", the state-change event is signalled, or the worker is asked to stop.
/// Any exception is stored in <c>_parsingException</c>; <c>_loadDocumentResetEvent</c>
/// is always set on exit.
/// </summary>
/// <param name="state">Unused.</param>
private void LoadDocumentInMtaThread(Object state)
{
    try
    {
        // Create the document and subscribe to its ready-state notifications.
        _parsedHtml = (IHTMLDocument2)new HTMLDocument();
        HTMLDocumentEvents2_Event documentEvents = (HTMLDocumentEvents2_Event)_parsedHtml;
        documentEvents.onreadystatechange += _onreadystatechangeEventHandler;

        // Write the content and close the document.
        _parsedHtml.write(Content);
        _parsedHtml.close();

        // Wait for the onReadyStateChange event to be fired. On IE9 this never happens,
        // so the readyState is polled directly every 100 ms as well.
        while (!_stopWorkerThread)
        {
            bool loaded = String.Equals("complete", _parsedHtml.readyState, StringComparison.OrdinalIgnoreCase);
            if (loaded || _stateChangeResetEvent.Wait(100))
            {
                break;
            }
        }

        // Detach the event handler.
        documentEvents.onreadystatechange -= _onreadystatechangeEventHandler;
    }
    catch (Exception e)
    {
        _parsingException = e;
    }
    finally
    {
        _loadDocumentResetEvent.Set();
    }
}
/// <summary>
/// Loads the stock (holdings) table for a book and flattens it into an array holding four
/// cell texts per data row, taken from columns 3, 2, 4 and 1 in that order.
/// </summary>
/// <param name="cid">Catalogue id appended to the stock query URL.</param>
/// <returns>
/// The flattened cell texts; an empty array when the table has no data rows; <c>null</c>
/// when the request fails.
/// </returns>
public string[] loadBookStat(string cid)
{
    // new : http://114.70.3.72/DLiWeb25Fr/comp/search/SearchHandler.aspx?action=stock&cid=357465
    // old : http://library.unist.ac.kr/DLiWeb25Eng/comp/search/SearchHandler.aspx?action=stock&cid=357465
    // | No. | Registration no. | Location | Book status | Call number | Print |
    string url = "http://114.70.3.72/DLiWeb25Fr/comp/search/SearchHandler.aspx?action=stock&cid=" + cid;
    if (!getResponse(url))
    {
        return null;
    }

    doc = (IHTMLDocument2)new HTMLDocument();
    doc.clear();
    doc.write(resResult);
    doc.close();

    // Materialise once: the sequence used to be enumerated twice (Count + iteration).
    List<IHTMLElement> tableRows = getTableRow(doc).ToList();

    // The first row is skipped as the header. With no rows at all the old code computed a
    // negative array size and threw; return an empty result instead.
    if (tableRows.Count <= 1)
    {
        return new string[0];
    }

    string[] rows = new string[(tableRows.Count - 1) * 4];
    int count = 0;
    foreach (IHTMLElement row in tableRows.Skip(1))
    {
        // Query the row's cells once instead of once per column.
        IHTMLElementCollection cells = ((IHTMLElement2)row).getElementsByTagName("td");
        rows[count++] = ((IHTMLElement)cells.item(3, 0)).innerText;
        rows[count++] = ((IHTMLElement)cells.item(2, 0)).innerText;
        rows[count++] = ((IHTMLElement)cells.item(4, 0)).innerText;
        rows[count++] = ((IHTMLElement)cells.item(1, 0)).innerText;
    }

    return rows;
}
//date : 201302
/// <summary>
/// Loads the reservation table of a study room for one month into <c>roomStat</c>: one row
/// per day with 25 columns (column 0 is the row's first cell; columns 1-24 hold the cell
/// text, or "E"/"R" derived from the cell's icon when the cell has no text).
/// </summary>
/// <param name="roomNum">Room id used in the request URL.</param>
/// <param name="date">Year and month in <c>yyyyMM</c> form, e.g. 201302.</param>
public void loadStudyroomStatus(int roomNum, string date)
{
    if (!System.Net.NetworkInformation.NetworkInterface.GetIsNetworkAvailable())
    {
        MainForm.isError = true;
        Application.Exit();
        MessageBox.Show("인터넷 연결에 문제가 있습니다.\r\n 프로그램을 종료합니다. :^(", "Robot의 경고");

        System.Diagnostics.Process[] mProcess = System.Diagnostics.Process.GetProcessesByName(Application.ProductName);
        foreach (System.Diagnostics.Process p in mProcess)
        {
            p.Kill();
        }
    }

    // new : http://114.70.3.72/dliweb25fr/studyroom/detail.aspx?m_var=112&roomid=1
    // old : http://library.unist.ac.kr/dliweb25eng/studyroom/detail.aspx?m_var=112&roomid=1
    string url = "http://114.70.3.72/dliweb25fr/studyroom/detail.aspx?m_var=112&roomid=" + roomNum.ToString() + "&yearmonth=" + date;
    if (!getResponse(url))
    {
        return;
    }

    doc = (IHTMLDocument2)new HTMLDocument();
    doc.clear();
    doc.write(resResult);
    doc.close();

    // Materialise once: the sequence used to be enumerated twice (Count + iteration).
    List<IHTMLElement> dayRows = getTd(doc).ToList();
    dayCount = dayRows.Count;

    roomStat = new string[dayCount][];
    for (int i = 0; i < roomStat.Length; i++)
    {
        roomStat[i] = new string[25];
    }

    for (int day = 0; day < dayRows.Count; day++)
    {
        // Query the row's cells once instead of once per column.
        IHTMLElementCollection cells = ((IHTMLElement2)dayRows[day]).getElementsByTagName("td");
        string[] slots = roomStat[day];

        slots[0] = ((IHTMLElement)cells.item(0, 0)).innerText;
        for (int i = 1; i < 25; i++)
        {
            IHTMLElement elem = (IHTMLElement)cells.item(i, 0);
            if (elem == null)
            {
                continue; // row has fewer cells than expected; leave the slot empty
            }

            slots[i] = elem.innerText;
            if (slots[i] != null)
            {
                continue;
            }

            // No text: derive the state from the cell's icon, when there is one.
            IHTMLElement img = (IHTMLElement)(((IHTMLElement2)elem).getElementsByTagName("img").item(0));
            object src = img == null ? null : img.getAttribute("src");
            if (src == null)
            {
                continue; // previously a NullReferenceException
            }

            string srcText = src.ToString();
            if (srcText.IndexOf("icoA.gif") != -1)
            {
                slots[i] = "E";
            }
            else if (srcText.IndexOf("icoN.gif") != -1)
            {
                slots[i] = "R";
            }
        }
    }
}
/// <summary>
/// Replaces the <c>doc</c> member with a new MSHTML document loaded from
/// <paramref name="html"/> and returns it.
/// </summary>
/// <param name="html">Markup to load.</param>
/// <returns>The newly loaded document (the same object now stored in <c>doc</c>).</returns>
private IHTMLDocument2 HTMLToDom(string html)
{
    object[] content = { html };

    doc = new HTMLDocumentClass();
    doc.write(content);
    doc.close();

    return doc;
}
/// <summary>
/// Loads the "my boards" listing pages <paramref name="sPage"/>..<paramref name="ePage"/>
/// from the portal and fills <c>board4</c> with one entry per post (10 slots per page).
/// Loading stops at the first failed request or at the first page that is not full.
/// </summary>
/// <param name="sPage">First page number to load.</param>
/// <param name="ePage">Last page number to load (inclusive).</param>
private void setLastestBoard(int sPage, int ePage)
{
    const int CellsPerPost = 11;   // number of "ltb_center" cells that make up one post row
    const int TitleWrapLength = 30;

    for (int pageNum = sPage; pageNum <= ePage; pageNum++)
    {
        MainForm.gridView.Columns[4].HeaderText = "게시판";
        string url = "http://portal.unist.ac.kr/EP/web/collaboration/bbs/jsp/BB_MyBoardLst.jsp?nfirst=" + pageNum;
        if (!getResponse(url))
        {
            return;
        }

        doc = (IHTMLDocument2)new HTMLDocument();
        doc.clear();
        doc.write(resResult);
        doc.close();

        // Materialise once: ElementAt on the lazy sequences re-enumerated them on every access.
        List<IHTMLElement> titles = ElementsByClass(doc, "ltb_left").ToList();
        List<IHTMLElement> elements = ElementsByClass(doc, "ltb_center").ToList();
        int docNum = elements.Count;

        for (int i = 0; i < docNum / CellsPerPost; i++)
        {
            IHTMLElement title = titles[i];
            string titleText = title.innerText;
            string titleHtml = title.innerHTML;
            int cellBase = i * CellsPerPost;

            string[] rows = new string[5];

            // Long titles are wrapped onto a second line after TitleWrapLength characters.
            if (titleText.Length > TitleWrapLength)
            {
                rows[1] = titleText.Substring(0, TitleWrapLength) + "\r\n" + titleText.Substring(TitleWrapLength);
            }
            else
            {
                rows[1] = titleText;
            }
            rows[2] = elements[cellBase + 5].innerText;
            rows[3] = elements[cellBase + 7].innerText;
            rows[4] = elements[cellBase + 3].innerText;

            int index = (pageNum - 1) * 10 + i;
            if (titleHtml.IndexOf("red") != -1)
            {
                board4[index].color = Color.Red;
            }

            // An empty first cell marks an announcement row.
            if (elements[cellBase].innerText.Trim() == "")
            {
                rows[0] = "공지";
                board4[index].anouncement = true;
            }

            board4[index].rows = rows;
            board4[index].boardName = rows[4];
            board4[index].page = pageNum;

            // Both ids are the query-string values that follow their key, up to the next '&'.
            string boardId = titleHtml.Substring(titleHtml.IndexOf("boardid=")).Substring(8);
            board4[index].boardId = boardId.Substring(0, boardId.IndexOf("&"));

            string bullId = titleHtml.Substring(titleHtml.IndexOf("bullid=")).Substring(7);
            board4[index].bullId = bullId.Substring(0, bullId.IndexOf("&"));
        }

        // A page that is not full is the last one.
        if (docNum / CellsPerPost != 10)
        {
            return;
        }
    }
}
/// <summary>
/// Resets the first 30 entries of <c>boards</c> and, for each dormitory notice page in
/// <paramref name="sPage"/>..<paramref name="ePage"/>, stores a link fragment taken from
/// every table row after the first. Stops at the first failed request.
/// </summary>
/// <param name="sPage">First page number to load.</param>
/// <param name="ePage">Last page number to load (inclusive).</param>
private void setBoard(int sPage = 1, int ePage = 3)
{
    for (int i = 0; i < 3 * 10; i++)
    {
        boards[i] = new DormBoard();
    }

    for (int pageNum = sPage; pageNum <= ePage; pageNum++)
    {
        // http://dorm.unist.ac.kr/admin/board/view.asp?intNowPage=1&board_nm=dorm_notice&idx=2885
        string url = "http://dorm.unist.ac.kr/admin/board/list.asp?board_nm=dorm_notice&intNowPage=" + pageNum;
        if (!getResponse(url))
        {
            return;
        }

        doc = (IHTMLDocument2)new HTMLDocument();
        doc.clear();
        doc.write(resResult);
        doc.close();

        // Materialise once: Count()/ElementAt re-enumerated the lazy sequence on every
        // loop iteration.
        List<IHTMLElement> tags = ElementsByTagName(doc, "tr").ToList();

        // NOTE(review): the slot index ignores pageNum, so every page overwrites the slots
        // written by the previous one, and slot 0 is never filled - confirm intent.
        for (int i = 1; i < tags.Count; i++)
        {
            string html = tags[i].innerHTML;

            // Everything before the first single quote of the row markup.
            boards[i].link = html.Split('\'')[0];
        }
    }
}
/// <summary>
/// Re-reads the "my boards" listing into <c>new_board4</c> and counts how many posts
/// appear before the newest post already known in <c>board4</c>.
/// </summary>
/// <returns>
/// The number of posts that precede the known newest post (0 when nothing is new or a
/// request fails); the total number of posts read when the known post is never reached.
/// </returns>
private int setNewLastestBoard()
{
    const int CellsPerPost = 11;   // number of "ltb_center" cells that make up one post row
    const int TitleWrapLength = 30;

    int diffCount = 0;
    for (int i = 0; i < PAGENUM * 10; i++)
    {
        new_board4[i] = new PortalBoard();
    }

    // Pages are 1-based: starting at 0 produced a negative slot index ((0 - 1) * 10 + i)
    // and read one page more than new_board4 has room for.
    for (int pageNum = 1; pageNum <= PAGENUM; pageNum++)
    {
        string url = "http://portal.unist.ac.kr/EP/web/collaboration/bbs/jsp/BB_MyBoardLst.jsp?nfirst=" + pageNum;
        if (!getResponse(url))
        {
            return 0;
        }

        doc = (IHTMLDocument2)new HTMLDocument();
        doc.clear();
        doc.write(resResult);
        doc.close();

        // Materialise once: ElementAt on the lazy sequences re-enumerated them on every access.
        List<IHTMLElement> titles = ElementsByClass(doc, "ltb_left").ToList();
        List<IHTMLElement> elements = ElementsByClass(doc, "ltb_center").ToList();
        int docNum = elements.Count;

        for (int i = 0; i < docNum / CellsPerPost; i++)
        {
            IHTMLElement title = titles[i];
            string titleText = title.innerText;
            string titleHtml = title.innerHTML;
            int cellBase = i * CellsPerPost;

            string[] rows = new string[5];
            if (titleText.Length > TitleWrapLength)
            {
                rows[1] = titleText.Substring(0, TitleWrapLength) + "\r\n" + titleText.Substring(TitleWrapLength);
            }
            else
            {
                rows[1] = titleText;
            }

            // Stop at the newest post we already know. The comparison used to index
            // rows[1 + diffCount], i.e. it drifted from the title column into the writer,
            // date and board columns and finally out of range.
            if (rows[1] == board4[0].rows[1])
            {
                return diffCount;
            }
            diffCount++;

            rows[2] = elements[cellBase + 5].innerText;
            rows[3] = elements[cellBase + 7].innerText;
            rows[4] = elements[cellBase + 3].innerText;

            int index = (pageNum - 1) * 10 + i;
            new_board4[index].rows = rows;
            new_board4[index].boardName = rows[4];
            new_board4[index].page = pageNum;
            new_board4[index].boardId = titleHtml.Substring(titleHtml.IndexOf("boardid=")).Substring(8);
            new_board4[index].bullId = titleHtml.Substring(titleHtml.IndexOf("bullid=")).Substring(7);
        }
    }

    return diffCount;
}
/**********************************************************
 *
 *  Portal search, EUC-KR encoding
 *
 **********************************************************/
/// <summary>
/// Searches a portal board by title and fills <paramref name="board"/> with the posts
/// found on pages <paramref name="sPage"/>..<paramref name="ePage"/> (10 slots per page).
/// Stops at the first failed request or the first page without results.
/// </summary>
/// <param name="board">Target array; its first 100 entries are reset before searching.</param>
/// <param name="boardId">Id of the board to search.</param>
/// <param name="sPage">First result page to load.</param>
/// <param name="ePage">Last result page to load (inclusive).</param>
/// <param name="query">Title text to search for.</param>
private void searchBoard(PortalBoard[] board, string boardId, int sPage, int ePage, string query)
{
    // http://portal.unist.ac.kr/EP/web/collaboration/bbs/jsp/BB_BoardLst.jsp?searchcondition=BULLTITLE&searchname=%B0%F8%C1%F6&boardid=B200902281833482321051&nfirst=1
    MainForm.gridView.Columns[4].HeaderText = "조회수";
    for (int i = 0; i < 10 * 10; i++)
    {
        board[i] = new PortalBoard();
    }

    // The query is sent as percent-escaped code page 51949 (EUC-KR) bytes, two lowercase
    // hex digits per byte. It does not depend on the page, so it is built once.
    byte[] queryBytes = System.Text.Encoding.GetEncoding(51949).GetBytes(query);
    string encodedQuery = string.Concat(queryBytes.Select(b => "%" + b.ToString("x2")));

    for (int pageNum = sPage; pageNum <= ePage; pageNum++)
    {
        string url = "http://portal.unist.ac.kr/EP/web/collaboration/bbs/jsp/BB_BoardLst.jsp?boardid=" + boardId + "&nfirst=" + pageNum + "&searchcondition=BULLTITLE&searchname=" + encodedQuery;
        if (!getResponse(url))
        {
            return;
        }

        doc = (IHTMLDocument2)new HTMLDocument();
        doc.clear();
        doc.write(resResult);
        doc.close();

        // Materialise once: ElementAt on the lazy sequences re-enumerated them on every access.
        List<IHTMLElement> titles = ElementsByClass(doc, "ltb_left").ToList();
        List<IHTMLElement> elements = ElementsByClass(doc, "ltb_center").ToList();
        int docNum = elements.Count;
        if (docNum == 0)
        {
            return;
        }

        for (int i = 0; i < docNum / BOARDTAGNUM; i++)
        {
            IHTMLElement title = titles[i];
            int cellBase = i * BOARDTAGNUM;

            string[] rows = new string[5];
            rows[0] = "";
            rows[1] = title.innerText;
            rows[2] = elements[cellBase + 5].innerText;
            rows[3] = elements[cellBase + 7].innerText;
            rows[4] = elements[cellBase + 9].innerText;

            int index = (pageNum - 1) * 10 + i;

            // "new" check: the title cell carries an image.
            if (((IHTMLElement2)title).getElementsByTagName("img").length > 0)
            {
                board[index].newPost = true;
                rows[0] = "new";
            }

            // Announcement check: the second cell of the row carries an image.
            if (((IHTMLElement2)elements[cellBase + 1]).getElementsByTagName("img").length > 0)
            {
                board[index].anouncement = true;
                rows[0] = "공지";
            }

            board[index].rows = rows;
            board[index].page = pageNum;
            board[index].boardId = boardId;

            // javascript:clickBulletin("BB201302011329070365135","BB201302011329070365135","BB201302011329070365135","0","N");
            // The bulletin id is the first quoted argument of that call.
            string titleHtml = title.innerHTML;
            string javaUrl = titleHtml.Substring(titleHtml.IndexOf("javascript:"));
            board[index].bullId = javaUrl.Split('\"')[1];

            // A title without a <font> element used to cause a NullReferenceException.
            IHTMLElement font = (IHTMLElement)((IHTMLElement2)title).getElementsByTagName("font").item(0, 0);
            object fontColor = font == null ? null : font.getAttribute("color");
            if (fontColor != null)
            {
                board[index].color = ConvertColor_PhotoShopStyle_toRGB((string)fontColor);
            }

            if (title.outerHTML.IndexOf("FONT-WEIGHT: bold") != -1)
            {
                board[index].bold = true;
            }
        }
    }
}
/// <summary>
/// Crawls one page: downloads it, extracts and stores its text, and enqueues every link on
/// it that has not been seen yet (while the combined size of the visited list and the queue
/// is at most 3000). Decrements <c>NumOfThread</c> when done.
/// </summary>
/// <param name="startLink">The URL to crawl, passed as a string.</param>
public static void func(object startLink)
{
    HTMLDocument y = new HTMLDocument();
    IHTMLDocument2 doc = (IHTMLDocument2)y;

    try
    {
        string URL = (String)startLink;

        // Create a new 'WebRequest' object to the mentioned URL.
        WebRequest myWebRequest = WebRequest.Create(URL);

        // Read the whole response; the using blocks release the response, stream and
        // reader even when reading fails (they used to leak on any exception).
        string rString;
        using (WebResponse myWebResponse = myWebRequest.GetResponse())
        using (Stream streamResponse = myWebResponse.GetResponseStream())
        using (StreamReader sReader = new StreamReader(streamResponse))
        {
            rString = sReader.ReadToEnd();
        }

        string EnglishText = getOnlyText(rString);
        EnglishText = eng(EnglishText);
        list.Add(URL);

        doc.write(rString);
        foreach (IHTMLElement el in doc.links)
        {
            // A link without a string href used to throw here, which aborted the whole
            // page before it was stored; skip such links instead.
            string link = el.getAttribute("href", 0) as string;
            if (link == null || link.Contains("about:"))
            {
                continue;
            }

            bool hasRoom = list.Count + q.Count <= 3000;
            if (hasRoom && !list.Contains(link) && !q.Contains(link))
            {
                q.Enqueue(link);
            }
        }

        PutPageInfoInDB(URL, EnglishText);
        counter++;
        Console.Write(counter);
        Console.WriteLine(" : " + URL);
    }
    catch (Exception ex)
    {
        Console.WriteLine("Ex message : " + ex.Message);
        if (ex is OverflowException)
        {
            // Start over from the seed page.
            list.Clear();
            q.Clear();
            q.Enqueue("https://zookeys.pensoft.net/");
        }
    }

    NumOfThread--;
}
/// <summary>
/// Runs a library catalogue search with the given query terms, options and operator and
/// fills <c>books</c> with one entry per result item.
/// </summary>
/// <param name="bq1">First query term.</param>
/// <param name="bq2">Second query term.</param>
/// <param name="bo1">Option for the first term.</param>
/// <param name="bo2">Option for the second term.</param>
/// <param name="bo">Operator combining the terms.</param>
public void bookSearch(string bq1, string bq2, string bo1, string bo2, string bo)
{
    this.bookQuery1 = bq1;
    this.bookQuery2 = bq2;
    this.bookOption1 = bo1;
    this.bookOption2 = bo2;
    this.bookOperator = bo;
    bookQuery = queryMake();

    // new : http://114.70.3.72/DLiWeb25Fr/comp/search/Results.aspx?
    // old : http://library.unist.ac.kr/DLiWeb25Eng/comp/search/Results.aspx?
    string url = "http://114.70.3.72/DLiWeb25Fr/comp/search/Results.aspx?" + bookQuery;
    if (!getResponse(url))
    {
        return;
    }

    doc = (IHTMLDocument2)new HTMLDocument();
    doc.clear();
    doc.write(resResult);
    doc.close();

    // Materialise once: Count()/ElementAt on the lazy sequences re-enumerated them on
    // every access.
    List<IHTMLElement> elements = ElementsByClass(doc, "item").ToList();
    List<IHTMLElement> authors = ElementsByClass(doc, "author").ToList();
    List<IHTMLElement> publishers = ElementsByClass(doc, "publisher").ToList();
    List<IHTMLElement> publishyears = ElementsByClass(doc, "publishyear").ToList();
    List<IHTMLElement> cclasses = ElementsByClass(doc, "cclass").ToList();

    books = new Book[elements.Count];
    for (int i = 0; i < elements.Count; i++)
    {
        string[] rows = new string[5];
        books[i] = new Book();

        IHTMLElement item = elements[i];
        string html = item.innerHTML;

        if (html.IndexOf("no thumbnail") != -1)
        {
            books[i].thumbnail = "";
        }
        else
        {
            books[i].thumbnail = html.Substring(html.IndexOf("thumb.axd?url=")).Split('\"')[0];
        }

        IHTMLElement element = (IHTMLElement)((IHTMLElement2)item).getElementsByTagName("label").item(0, 0);
        rows[0] = books[i].title = element.getAttribute("title").ToString().Split('/')[0].Replace("선택하기", "");

        rows[1] = books[i].author = StripFieldSeparator(authors[i]);
        rows[2] = books[i].publisher = StripFieldSeparator(publishers[i]);
        rows[3] = books[i].publishYear = StripFieldSeparator(publishyears[i]);

        // Fixed copy-paste bug: a missing category used to clear rows[3] / publishYear
        // instead of rows[4] / kind.
        rows[4] = books[i].kind = StripFieldSeparator(cclasses[i]);

        books[i].isbn = html.Substring(html.IndexOf("isbn\">")).Split('>')[1].Split('<')[0];

        // cid used later to query the book status
        IHTMLElement cid = (IHTMLElement)(((IHTMLElement2)item).getElementsByTagName("input").item(0, 0));
        books[i].cid = cid.getAttribute("value").ToString();

        if (html.IndexOf("Domestic Books") != -1)
        {
            books[i].kind = "국내 서적";
        }

        books[i].rows = rows;
    }

    // http://114.70.3.72/DLiWeb25Fr/comp/search/Results.aspx?m_var=421&querytype=2&srv=31&method=2&field=TITL&keyword=%EC%95%84&operator=0&branch=01&classid=24,27,1,60,32,65,21,23,25,39,75,2,22,41,38,74,88,52,33,6,19,80,29,59,85,89,5,28,16,77,30,73,53,34,79,64,26,90,35,3,4,15,20,42,76,86,91&max=300&cntperpage=20&viewoption=1&sort=DEFAULT
    // new : http://library.unist.ac.kr/DLiWeb25Fr/comp/search/Results.aspx?method=2&field=TITL&keyword=%EC%95%84&operator=0&branch=01&classid=24,27,1,60,32,65,21,23,25,39,75,2,22,41,38,74,88,52,33,6,19,80,29,59,85,89,5,28,16,77,30,73,53,34,79,64,26,90,35,3,4,15,20,42,76,86,91&max=300&cntperpage=20&viewoption=1&sort=DEFAULT
    // old : http://library.unist.ac.kr/DLiWeb25Eng/comp/search/Results.aspx?method=2&field=TITL,AUTH,PUBN&keyword=%ED%95%B4%ED%82%B9,%ED%95%B4%ED%82%B9,%ED%95%B4%ED%82%B9&operator=0,1,3&branch=01&classid=24,27,1,60,32,65,21,23,25,39,75,2,22,41,38,74,88,52,33,6,19,80,29,59,85,89,63,5,28,16,77,30,73,53,34,64,79,26,90,35,3,4,15,20,42,76,86,91&max=300&classifyname=KDC&classify=&cntperpage=20&viewoption=1&sort=DEFAULT
}

// Returns the field's text with every "/ " separator removed, or "" when it has no text.
private static string StripFieldSeparator(IHTMLElement field)
{
    string text = field.innerText;
    return text == null ? "" : text.Replace("/ ", "");
}
/// <summary>
/// Parses a search-results page and, for every div whose class contains
/// "registerBox registerBoxBank margBtm20", appends a <c>Links</c> entry to
/// <c>all_lnks</c> and a <c>Tenders</c> entry to <c>custdata</c>.
/// </summary>
/// <param name="html">Raw HTML of the results page.</param>
/// <remarks>
/// Start/Clear are disabled while parsing and re-enabled on every exit path,
/// including the failure path. Exceptions are not rethrown; they are reported
/// through two message boxes. Parsing stops early when <c>token</c> is cancelled.
/// </remarks>
private void Parse(string html)
{
    const string TenderBoxClass = "registerBox registerBoxBank margBtm20";

    start_btn.IsEnabled = false;
    clear_btn.IsEnabled = false;
    try
    {
        // Nothing to parse: restore the UI and report "no data".
        if (!html.Contains(TenderBoxClass))
        {
            stop_btn.IsEnabled = false;
            start_btn.IsEnabled = true;
            clear_btn.IsEnabled = true;
            start_btn.Content = "Ещё";
            label_info.Content = "Нету данных по данному запросу";
            progress_bar.Value = 0;
            return;
        }

        HTMLDocument doc = new HTMLDocument();
        IHTMLDocument2 doc2 = (IHTMLDocument2)doc;
        // Every "http" is rewritten before the markup is handed to MSHTML.
        // NOTE(review): presumably to stop the parser from fetching remote resources - confirm.
        doc2.write(html.Replace("http", "@@@ @@@"));
        IHTMLDocument3 doc3 = (IHTMLDocument3)doc2;

        foreach (var div in doc3.getElementsByTagName("div"))
        {
            if (token.IsCancellationRequested)
            {
                Console.WriteLine("Операция прервана токеном");
                break;
            }

            var box = (IHTMLElement)div;
            if (box.className == null || !box.className.Contains(TenderBoxClass))
            {
                continue;
            }

            var hrefs = Find(box.innerHTML);

            // Re-parse the box on its own so the tag lookups below only see this tender.
            HTMLDocument fragment = new HTMLDocument();
            IHTMLDocument2 fragment2 = (IHTMLDocument2)fragment;
            fragment2.write(box.innerHTML);
            IHTMLDocument3 fragment3 = (IHTMLDocument3)fragment2;

            // NOTE(review): the whitespace-only literals in the Replace chains below are
            // reproduced exactly as they appear in this file; confirm which whitespace
            // characters (regular space, non-breaking space, ...) they are meant to hold.
            List<string> strong = new List<string>();
            foreach (IHTMLElement temp in fragment3.getElementsByTagName("strong"))
            {
                string text = temp.innerText;
                if (!string.IsNullOrEmpty(text))
                {
                    strong.Add(text.Replace("\n", "").Replace(" ", "").Replace(" ", " ").Replace(",", "."));
                }
            }

            List<string> span = new List<string>();
            foreach (IHTMLElement temp in fragment3.getElementsByTagName("span"))
            {
                string text = temp.innerText;
                if (!string.IsNullOrEmpty(text))
                {
                    span.Add(text.Replace(" ", "").Replace("\n", "").Replace("/", ""));
                }
            }

            List<string> dd = new List<string>();
            foreach (IHTMLElement temp in fragment3.getElementsByTagName("dd"))
            {
                string text = temp.innerText;
                if (!string.IsNullOrEmpty(text))
                {
                    dd.Add(text.Replace(" ", "").Replace("\n", "").Replace("/", "").Replace("Заказчик:", "").Replace(" ", " ").Replace("закупки:", "закупки:\n").Replace("Идентификационный", "\nИдентификационный"));
                }
            }

            // <li> entries are addressed by position (li[0], li[1]) below, so every element
            // keeps its slot: a missing innerText becomes "" instead of throwing.
            List<string> li = new List<string>();
            foreach (IHTMLElement temp in fragment3.getElementsByTagName("li"))
            {
                li.Add((temp?.innerText ?? "").Replace(" ", "").Replace("\n", "").Replace("Размещено:", "").Replace("/", "").Replace(" ", " "));
            }

            // Which <dd> carries the description depends on how many non-empty <dd>s exist.
            string descript = "";
            if (dd.Count >= 5)
            {
                // NOTE(review): when dd[4] contains "function" the description stays empty.
                // The code used to compute a substring of dd[4] after the last "}}" here and
                // throw the result away; that dead statement is removed because its only
                // observable effect was an exception when "}}" was absent. Confirm whether
                // dd[3] (or a cleaned dd[4]) was meant to be used in this case.
                if (!dd[4].Contains("function"))
                {
                    descript = dd[3];
                }
            }
            else if (dd.Count == 4 || dd.Count == 3)
            {
                descript = dd[2];
            }
            else if (dd.Count == 2)
            {
                descript = dd[1];
            }

            // Hyperlink objects carry the absolute URI plus the caption shown in the grid.
            Hyperlink link = new Hyperlink();
            link.NavigateUri = new Uri(WebUtility.HtmlDecode("http://www.zakupki.gov.ru" + (from x in hrefs where x.Text == "Сведения" select x.Href).First().Replace("http://zakupki.gov.ru", "")));
            link.TargetName = "Сведения";

            Hyperlink link2 = new Hyperlink();
            link2.NavigateUri = new Uri(WebUtility.HtmlDecode("http://www.zakupki.gov.ru" + (from x in hrefs where x.Href.Contains("organization") select x.Href.Replace("http://zakupki.gov.ru", "")).First()));
            link2.TargetName = "Организация";

            Hyperlink link3 = new Hyperlink();
            link3.NavigateUri = new Uri(WebUtility.HtmlDecode("http://www.zakupki.gov.ru" + (from x in hrefs where x.Text == "Документы" select x.Href).First().Replace("http://zakupki.gov.ru", "")));
            link3.TargetName = "Документация";

            all_lnks.Add(new Links { url1 = link.NavigateUri, url2 = link2.NavigateUri, url3 = link3.NavigateUri });

            // Sum: the second <strong> if it parses as a number, otherwise all <strong>
            // texts joined with a trailing space after each.
            string _sum = "";
            double number;
            if (double.TryParse(strong[1], out number))
            {
                _sum = number.ToString();
            }
            else
            {
                foreach (var part in strong)
                {
                    _sum += part + " ";
                }
            }

            custdata.Add(new Tenders
            {
                number = i,
                type = CheckLenght(strong[0] + " " + span[1]),
                description = CheckLenght(descript, 2),
                sum = CheckLenght(_sum, 2),
                date_start = li[1],
                organizer = CheckLenght(li[0], 2),
                url = link.TargetName,
                contact = link2.TargetName,
                documentation = link3.TargetName
            });
            progress_bar.Value++;
            i++;
        }

        // Reached both on normal completion and after cancellation.
        stop_btn.IsEnabled = false;
        start_btn.IsEnabled = true;
        clear_btn.IsEnabled = true;
        start_btn.Content = "Ещё";
        progress_bar.Value = 0;
        label_info.Content = "Загрузка окончена.";
    }
    catch (Exception ex)
    {
        MessageBox.Show("Ошибка в методе Parse");
        MessageBox.Show(ex.Message + "\n" + ex.StackTrace);

        // Without this the Start/Clear buttons disabled at the top stay disabled after a failure.
        stop_btn.IsEnabled = false;
        start_btn.IsEnabled = true;
        clear_btn.IsEnabled = true;
    }
}
/// <summary>
/// Downloads board listing pages <paramref name="sPage"/>..<paramref name="ePage"/> and
/// fills <paramref name="board"/> with one <c>PortalBoard</c> per listed post.
/// </summary>
/// <param name="board">Destination array; its first 100 slots are re-created on every call.</param>
/// <param name="boardId">Board identifier, sent as the <c>boardid</c> query parameter and stored on each post.</param>
/// <param name="sPage">First page number to fetch (sent as <c>nfirst</c>).</param>
/// <param name="ePage">Last page number to fetch, inclusive.</param>
/// <remarks>
/// A post on page p at position i is stored at index (p - 1) * 10 + i. The method returns
/// early, keeping whatever was parsed so far, when <c>getResponse</c> fails for a page.
/// Also sets the header text of grid column 4 to "조회수".
/// </remarks>
private void setBoard(PortalBoard[] board, string boardId, int sPage, int ePage)
{
    const int PostsPerPage = 10;
    // 100 slots in total; presumably 10 pages of 10 posts - confirm against callers.
    const int SlotCount = 10 * PostsPerPage;

    MainForm.gridView.Columns[4].HeaderText = "조회수";

    for (int i = 0; i < SlotCount; i++)
    {
        board[i] = new PortalBoard();
    }

    for (int pageNum = sPage; pageNum <= ePage; pageNum++)
    {
        string url = "http://portal.unist.ac.kr/EP/web/collaboration/bbs/jsp/BB_BoardLst.jsp?boardid=" + boardId + "&nfirst=" + pageNum;
        if (!getResponse(url))
        {
            return;
        }

        doc = (IHTMLDocument2)new HTMLDocument();
        doc.clear();
        doc.write(resResult);
        doc.close();

        // Materialize once: the cells are indexed many times per post, and re-enumerating
        // an IEnumerable on every access would rescan the source each time.
        var titles = ElementsByClass(doc, "ltb_left").ToList();
        var cells = ElementsByClass(doc, "ltb_center").ToList();

        // Every post contributes BOARDTAGNUM "ltb_center" cells.
        int postCount = cells.Count / BOARDTAGNUM;
        for (int i = 0; i < postCount; i++)
        {
            IHTMLElement title = titles[i];
            int cellBase = i * BOARDTAGNUM;
            int index = (pageNum - 1) * PostsPerPage + i;

            // rows: [0] marker ("", "new" or "공지"), [1] title text, [2]..[4] the texts of
            // cells 5, 7 and 9 (writer, date and view count, going by the field names in the
            // assignments that used to be commented out here).
            string[] rows = new string[5];
            rows[0] = "";
            rows[1] = title.innerText;
            rows[2] = cells[cellBase + 5].innerText;
            rows[3] = cells[cellBase + 7].innerText;
            rows[4] = cells[cellBase + 9].innerText;

            // "New" check: an <img> inside the title cell.
            if (((IHTMLElement2)title).getElementsByTagName("img").length > 0)
            {
                board[index].newPost = true;
                rows[0] = "new";
            }

            // Announcement check: an <img> inside the post's cell 1. Takes precedence over "new".
            if (((IHTMLElement2)cells[cellBase + 1]).getElementsByTagName("img").length > 0)
            {
                board[index].anouncement = true;
                rows[0] = "공지";
            }

            board[index].rows = rows;
            board[index].page = pageNum;
            board[index].boardId = boardId;

            // The title markup embeds a call such as
            // javascript:clickBulletin("BB201302011329070365135","BB201302011329070365135","BB201302011329070365135","0","N");
            // the first quoted argument is the bulletin id.
            string titleHtml = title.innerHTML;
            string javaUrl = titleHtml.Substring(titleHtml.IndexOf("javascript:"));
            board[index].bullId = javaUrl.Split('\"')[1];

            // item(0, 0) yields null when the title has no <font> element, so guard before
            // reading its colour; previously that case threw a NullReferenceException.
            IHTMLElement font = (IHTMLElement)((IHTMLElement2)title).getElementsByTagName("font").item(0, 0);
            if (font != null)
            {
                object color = font.getAttribute("color");
                if (color != null)
                {
                    board[index].color = ConvertColor_PhotoShopStyle_toRGB((string)color);
                }
            }

            if (title.outerHTML.IndexOf("FONT-WEIGHT: bold") != -1)
            {
                board[index].bold = true;
            }
        }
    }
}