/// <summary>
/// Records the URL of the first matching link in the currently loaded page.
/// </summary>
/// <remarks>
/// Every <c>div</c> whose class is exactly "rc" is inspected. Among its direct children, the
/// first header element whose first child is an anchor supplies the URL. That URL is written
/// twice through <c>write2urlList</c>: once prefixed with the current query term and once on
/// its own. Only the first match in the document is recorded; the method does nothing when no
/// document is loaded.
/// </remarks>
private void collecttheurl()
{
    // WebBrowser.Document is null until a page has been loaded.
    if (webBrowser1.Document == null)
    {
        return;
    }

    mshtml.IHTMLDocument2 htmlDoc = webBrowser1.Document.DomDocument as mshtml.IHTMLDocument2;
    if (htmlDoc == null)
    {
        return;
    }

    // Materialise the list first so the walk is not affected by the live "all" collection.
    List<mshtml.IHTMLDivElement> allDiv = htmlDoc.all.OfType<mshtml.IHTMLDivElement>().ToList();
    foreach (mshtml.IHTMLDivElement divElement in allDiv)
    {
        IHTMLElement div = divElement as IHTMLElement;
        if (div == null || div.className != "rc")
        {
            continue;
        }

        write2log("found a rc div");

        IHTMLAnchorElement anchor = findHeaderAnchor(div as IHTMLDOMNode);
        if (anchor == null)
        {
            continue;
        }

        write2urlList(GoogleQueryConf.queryterms[GoogleQueryConf.queryIndex] + "\t => \t" + anchor.href);
        write2urlList(anchor.href);
        return;
    }
}

/// <summary>
/// Returns the anchor that is the first child of a header element directly under
/// <paramref name="divNode"/>, or null when there is none.
/// </summary>
private static IHTMLAnchorElement findHeaderAnchor(IHTMLDOMNode divNode)
{
    if (divNode == null || !divNode.hasChildNodes())
    {
        return null;
    }

    IHTMLDOMChildrenCollection children = divNode.childNodes as IHTMLDOMChildrenCollection;
    if (children == null)
    {
        return null;
    }

    foreach (object item in children)
    {
        IHTMLDOMNode child = item as IHTMLDOMNode;
        if (child == null || child.GetType().Name != "HTMLHeaderElementClass" || !child.hasChildNodes())
        {
            continue;
        }

        // "as" instead of a hard cast: the header's first child may be a text node or another
        // element, which previously raised InvalidCastException and aborted the whole scan.
        IHTMLAnchorElement anchor = child.firstChild as IHTMLAnchorElement;
        if (anchor != null && anchor.GetType().Name == "HTMLAnchorElementClass")
        {
            return anchor;
        }
    }

    return null;
}
/// <summary>
/// Searches the DOM subtree rooted at <paramref name="root"/> for <paramref name="node"/>,
/// descending into the documents hosted by frame/iframe elements, and collects every frame
/// element that lies on the path to the node.
/// </summary>
/// <param name="root">Node at which the search starts.</param>
/// <param name="node">Node to locate (compared by reference).</param>
/// <param name="frames">
/// Receives the frame/iframe elements that contain <paramref name="node"/>. Frames are appended
/// while the recursion unwinds, so the innermost frame is added first.
/// </param>
/// <returns>True when <paramref name="node"/> was found under <paramref name="root"/>.</returns>
private static bool GetEleParentFrames(mshtml.IHTMLDOMNode root, mshtml.IHTMLDOMNode node, List <mshtml.IHTMLDOMNode> frames)
{
    if (root == node)
    {
        return true;
    }

    // A child that does not expose IHTMLDOMNode arrives here as null.
    if (root == null)
    {
        return false;
    }

    // Ordinal comparison: a culture-sensitive ToLower() maps "IFRAME" to a dotless-i string
    // under Turkish culture, which would never equal "iframe".
    string tag = root.nodeName;
    bool isFrame = string.Equals(tag, "frame", System.StringComparison.OrdinalIgnoreCase)
                   || string.Equals(tag, "iframe", System.StringComparison.OrdinalIgnoreCase);

    IHTMLDOMChildrenCollection childNodes = null;
    if (isFrame)
    {
        // For a frame, continue the search in the hosted document: use the children of the
        // element that is the parent of that document's body.
        SHDocVw.IWebBrowser2 browser = root as SHDocVw.IWebBrowser2;
        mshtml.IHTMLDocument2 document = browser != null ? browser.Document as mshtml.IHTMLDocument2 : null;
        if (document != null && document.body != null)
        {
            mshtml.IHTMLDOMNode parentElement = document.body.parentElement as mshtml.IHTMLDOMNode;
            if (parentElement != null)
            {
                childNodes = parentElement.childNodes as IHTMLDOMChildrenCollection;
            }
        }
    }

    // Not a frame, or the hosted document was not reachable: fall back to the node's own children.
    if (childNodes == null)
    {
        childNodes = root.childNodes as IHTMLDOMChildrenCollection;
    }

    if (childNodes == null)
    {
        return false;
    }

    for (int i = 0; i < childNodes.length; i++)
    {
        mshtml.IHTMLDOMNode child = childNodes.item(i) as mshtml.IHTMLDOMNode;
        if (GetEleParentFrames(child, node, frames))
        {
            if (isFrame)
            {
                frames.Add(root);
            }

            // A node has a single parent, so no other sibling subtree can contain it.
            return true;
        }
    }

    return false;
}
/// <summary>
/// Builds an <c>Element</c> describing a DOM children collection.
/// </summary>
/// <param name="Collection">Collection to summarise; may be null.</param>
/// <returns>
/// A new <c>Element</c>. When the collection is non-null and non-empty, its <c>element</c> is the
/// first item viewed as <c>IHTMLElement</c> (null if the item is not one) and its <c>count</c> is
/// the collection length. Otherwise the freshly constructed <c>Element</c> is returned untouched.
/// </returns>
private static Element getFromCollection(IHTMLDOMChildrenCollection Collection)
{
    Element summary = new Element();

    if (Collection == null)
    {
        return summary;
    }

    if (Collection.length <= 0)
    {
        return summary;
    }

    summary.element = Collection.item(0) as IHTMLElement;
    summary.count = Collection.length;
    return summary;
}
/// <summary>
/// Converts an HTML document into a new <c>BBeB</c> book and returns it.
/// </summary>
/// <remarks>
/// The steps run in a fixed order: create the book and set its metadata and thumbnail, create the
/// default attribute objects and the cover page, open the first content page, feed every child
/// of the document body through <c>ParseDomNode</c>, flush the remaining text, finalize the page,
/// then build and serialize the table of contents and finalize the book.
/// NOTE(review): <c>m_StartReadingPage</c> and <c>m_StartReadingBlock</c> are reset to null here
/// and dereferenced when the "Start Reading" entry is added; presumably they are assigned while
/// the body is parsed - confirm.
/// </remarks>
/// <param name="doc">Loaded HTML document whose body is converted.</param>
/// <param name="bindingParams">Supplies the icon file path and the book metadata.</param>
/// <param name="tocEntries">Table-of-contents entries passed on to <c>createTocPage</c>.</param>
/// <returns>The book stored in <c>m_Book</c>.</returns>
public BBeB ParseHTML(HtmlDocument doc, BindingParams bindingParams, TocEntry tocEntries)
{
    m_Book = new BBeB();

    // The icon file doubles as the book thumbnail; its size goes into the header values.
    byte[] iconBytes = File.ReadAllBytes(bindingParams.IconFile);
    setHeaderValues(iconBytes.Length);
    m_Book.MetaData = bindingParams.MetaData;
    m_Book.ThumbnailData = iconBytes;

    // Default attribute objects, then the cover page.
    createDefaultAttributeObjects(BBeB.ReaderPageWidth, BBeB.ReaderPageHeight);
    createCoverPage();

    // First content page; remembered for the table of contents.
    m_CurrentPage = createPage();
    PageObject openingPage = m_CurrentPage;
    m_StartReadingBlock = null;
    m_StartReadingPage = null;
    addBookPage(m_CurrentPage);

    // Top-level nodes of the document body.
    IHTMLDOMNode bodyNode = (IHTMLDOMNode)((IHTMLDocument2)doc.DomDocument).body;
    IHTMLDOMChildrenCollection topLevelNodes = (IHTMLDOMChildrenCollection)bodyNode.childNodes;

    TextBlockBuilder builder = new TextBlockBuilder(GetNextObjId(), m_CharMapper);
    foreach (IHTMLDOMNode topLevelNode in topLevelNodes)
    {
        builder = ParseDomNode(topLevelNode, builder);
    }

    PrintHTMLElementChildren(topLevelNodes);

    // Whatever text is still buffered goes onto the current page before it is closed.
    FlushTextToBlock(m_CurrentPage, builder, m_MainBodyTextAttr);
    finalizePage(m_CurrentPage);

    // Table of contents, including the "Start Reading" entry, then serialize it.
    createTocPage(openingPage, tocEntries);
    m_TocObject.AddEntry(m_StartReadingPage.ID, m_StartReadingBlock.ID, "Start Reading");
    m_TocObject.Serialize();

    finalizeBook();
    return m_Book;
}
/// <summary>
/// Recursively visits every node in <paramref name="nodes"/> and all of their descendants.
/// </summary>
/// <remarks>
/// The traversal currently produces no output: the statements that wrote each node's name and
/// value to the debug listener are disabled.
/// </remarks>
/// <param name="nodes">Collection of DOM nodes to walk.</param>
private void PrintHTMLElementChildren(IHTMLDOMChildrenCollection nodes)
{
    foreach (IHTMLDOMNode current in nodes)
    {
        if (!current.hasChildNodes())
        {
            continue;
        }

        IHTMLDOMChildrenCollection descendants = (IHTMLDOMChildrenCollection)current.childNodes;
        PrintHTMLElementChildren(descendants);
    }
}
/// <summary>
/// Collects all frame/iframe elements that contain <paramref name="node"/>, searching the DOM
/// subtree rooted at <paramref name="root"/> and descending into frame documents.
/// </summary>
/// <param name="root">Node at which the search starts.</param>
/// <param name="node">Node to locate (compared by reference).</param>
/// <param name="frames">
/// Receives the containing frame elements. They are appended while the recursion unwinds, so the
/// innermost frame is added first.
/// </param>
/// <returns>True when <paramref name="node"/> was found under <paramref name="root"/>.</returns>
private static bool _getEleParentFrames(IHTMLDOMNode root, IHTMLDOMNode node, List <IHTMLDOMNode> frames)
{
    if (root == node)
    {
        return true;
    }

    // A child that does not expose IHTMLDOMNode arrives here as null.
    if (root == null)
    {
        return false;
    }

    // Ordinal comparison: a culture-sensitive ToLower() maps "IFRAME" to a dotless-i string
    // under Turkish culture, which would never equal "iframe".
    string tag = root.nodeName;
    bool isFrame = string.Equals(tag, "frame", System.StringComparison.OrdinalIgnoreCase)
                   || string.Equals(tag, "iframe", System.StringComparison.OrdinalIgnoreCase);

    IHTMLDOMChildrenCollection cs = null;
    if (isFrame)
    {
        // For a frame, continue in the hosted document: take the children of the element that
        // is the parent of that document's body.
        IWebBrowser2 pwb = root as IWebBrowser2;
        IHTMLDocument2 pdoc2 = pwb != null ? pwb.Document as IHTMLDocument2 : null;
        if (pdoc2 != null && pdoc2.body != null)
        {
            IHTMLDOMNode htmlElem = pdoc2.body.parentElement as IHTMLDOMNode;
            if (htmlElem != null)
            {
                cs = htmlElem.childNodes as IHTMLDOMChildrenCollection;
            }
        }
    }

    // Not a frame, or the hosted document was not reachable: use the node's own children.
    if (cs == null)
    {
        cs = root.childNodes as IHTMLDOMChildrenCollection;
    }

    if (cs == null)
    {
        return false;
    }

    for (int idx = 0; idx < cs.length; idx++)
    {
        IHTMLDOMNode c = cs.item(idx) as IHTMLDOMNode;
        if (_getEleParentFrames(c, node, frames))
        {
            if (isFrame)
            {
                frames.Add(root);
            }

            return true;
        }
    }

    return false;
}
/// <summary>
/// Highlights every occurrence of <paramref name="keyword"/> in the text below
/// <paramref name="node"/> by replacing each matching text node with a sequence of plain text
/// nodes and coloured SPAN elements.
/// </summary>
/// <param name="document">Document used to create the replacement nodes.</param>
/// <param name="node">Node to process; element nodes are searched recursively.</param>
/// <param name="keyword">Text to highlight. Null or empty keywords are ignored.</param>
/// <param name="cnt">
/// Index into the <c>option</c> and <c>color</c> members: <c>option[cnt]</c> is placed before and
/// after every generated text piece and <c>color[cnt]</c> becomes the span background colour.
/// </param>
private void HighLightingText(HTMLDocument document, IHTMLDOMNode node, string keyword, int cnt)
{
    // An empty keyword is "contained" in every string and Split() ignores an empty separator,
    // so every text node would be rewritten with nothing highlighted.
    if (node == null || string.IsNullOrEmpty(keyword))
    {
        return;
    }

    // nodeType 3 is a text node; anything else is searched recursively.
    if (node.nodeType != 3)
    {
        IHTMLDOMChildrenCollection childNodes = node.childNodes as IHTMLDOMChildrenCollection;
        if (childNodes == null)
        {
            return;
        }

        // Processing a text child inserts siblings and removes the child itself. Take a snapshot
        // first so the walk neither skips nodes nor revisits the spans it has just inserted.
        int childCount = childNodes.length;
        IHTMLDOMNode[] snapshot = new IHTMLDOMNode[childCount];
        for (int i = 0; i < childCount; i++)
        {
            snapshot[i] = childNodes.item(i) as IHTMLDOMNode;
        }

        foreach (IHTMLDOMNode child in snapshot)
        {
            HighLightingText(document, child, keyword, cnt);
        }

        return;
    }

    object rawValue = node.nodeValue;
    if (rawValue == null)
    {
        return;
    }

    string nodeText = rawValue.ToString();
    if (!nodeText.Contains(keyword))
    {
        return;
    }

    IHTMLDOMNode parentNode = node.parentNode;
    if (parentNode == null)
    {
        return;
    }

    // Split the text using the keyword as separator and insert the pieces, one by one, in front
    // of the original text node; a highlighted span goes between consecutive pieces.
    string[] result = nodeText.Split(new string[] { keyword }, StringSplitOptions.None);
    for (int i = 0; i < result.Length - 1; i++)
    {
        if (result[i] != "")
        {
            IHTMLDOMNode txtNode = document.createTextNode(option[cnt] + result[i] + option[cnt]);
            parentNode.insertBefore(txtNode, node);
        }

        IHTMLDOMNode orgNode = document.createTextNode(option[cnt] + keyword + option[cnt]);
        IHTMLDOMNode hilightedNode = (IHTMLDOMNode)document.createElement("SPAN");
        IHTMLStyle style = ((IHTMLElement)hilightedNode).style;
        style.color = "black";
        style.backgroundColor = color[cnt];
        hilightedNode.appendChild(orgNode);
        parentNode.insertBefore(hilightedNode, node);
    }

    // Text that follows the last occurrence of the keyword.
    if (result[result.Length - 1] != "")
    {
        IHTMLDOMNode postNode = document.createTextNode(option[cnt] + result[result.Length - 1] + option[cnt]);
        parentNode.insertBefore(postNode, node);
    }

    // The original text node has been fully replaced by the nodes inserted before it.
    parentNode.removeChild(node);
}
/// <summary>
/// Mirrors a DOM subtree into the tree view, following frames into their hosted documents.
/// </summary>
/// <remarks>
/// A tree node named after <paramref name="nd"/> is always added first. Comment and text children
/// become leaf tree nodes whose label includes the node value; other children are processed
/// recursively. An <c>InvalidCastException</c> raised while walking is written to the console and
/// the tree node created so far is still returned; any other exception propagates.
/// </remarks>
/// <param name="nd">DOM node to walk.</param>
/// <param name="node">Tree node that receives the new entry, or null to add it at the root of <c>treeDOM</c>.</param>
/// <returns>The tree node that was created for <paramref name="nd"/>.</returns>
private TreeNode parseNodes(IHTMLDOMNode nd, TreeNode node)
{
    // "label" is deliberately reused below: the catch block reports its most recent value.
    string label = nd.nodeName;

    TreeNode created = (node != null)
        ? node.Nodes.Add(label)
        : treeDOM.Nodes.Add(label);

    try
    {
        if (label == FRAMENODE)
        {
            // A frame hosts its own document: reach it through IWebBrowser2.
            IWebBrowser2 frameBrowser = (IWebBrowser2)nd;
            IHTMLDocument3 frameDocument = (IHTMLDocument3)frameBrowser.Document;
            IHTMLDOMNode frameRoot = (IHTMLDOMNode)frameDocument.documentElement;

            // Comments that sit directly under the frame document are listed first.
            IHTMLDOMChildrenCollection documentChildren = (IHTMLDOMChildrenCollection)frameDocument.childNodes;
            foreach (IHTMLDOMNode documentChild in documentChildren)
            {
                label = documentChild.nodeName;
                if (COMMENTNODE != label)
                {
                    continue;
                }

                if (documentChild.nodeValue != null)
                {
                    label += VALUESEPERATOR + documentChild.nodeValue.ToString() + VALUESEPERATOR1;
                }

                if (created != null)
                {
                    created.Nodes.Add(label);
                }
            }

            // Then the frame document itself.
            parseNodes(frameRoot, created);
            return created;
        }

        IHTMLDOMChildrenCollection childCollection = (IHTMLDOMChildrenCollection)nd.childNodes;
        foreach (IHTMLDOMNode domChild in childCollection)
        {
            string childLabel = domChild.nodeName;
            bool isCommentOrText = (COMMENTNODE == childLabel) || (TEXTNODE == childLabel);

            if (isCommentOrText)
            {
                // Comments and text become leaves carrying their value.
                if (domChild.nodeValue != null)
                {
                    childLabel += VALUESEPERATOR + domChild.nodeValue.ToString() + VALUESEPERATOR1;
                }

                if (created != null)
                {
                    created.Nodes.Add(childLabel);
                }

                continue;
            }

            // A BODY reported as a child of a BASE element is skipped on purpose.
            if ((BODYNODE == childLabel) && (label == BASENODE))
            {
                continue;
            }

            parseNodes(domChild, created);
        }
    }
    catch (System.InvalidCastException icee)
    {
        Console.Write("\r\n InvalidCastException =" + icee.ToString() + "\r\nName =" + label + " \r\n");
    }

    return created;
}
/// <summary>
/// Recursively converts an HTML DOM node and its descendants into book content, writing text and
/// formatting tags into a <c>TextBlockBuilder</c> and images onto the current page.
/// </summary>
/// <remarks>
/// Each node is handled in three phases: emit what its opening tag requires, recurse into its
/// children, then emit what its closing tag requires. Text may still be buffered in the returned
/// builder when this method finishes; the caller has to flush it.
/// </remarks>
/// <param name="node">The HTML DOM node to walk.</param>
/// <param name="tbBuilder">The builder that currently collects text.</param>
/// <returns>
/// The builder to continue with. It is a new instance whenever the node forced the previous one
/// to be flushed (an image after pending text, or a heading).
/// </returns>
private TextBlockBuilder ParseDomNode(IHTMLDOMNode node, TextBlockBuilder tbBuilder)
{
    TagType tagType = GetTagType(node.nodeName);
    bool isHeading = tagType == TagType.H1 || tagType == TagType.H2 || tagType == TagType.H3
                     || tagType == TagType.H4 || tagType == TagType.H5 || tagType == TagType.H6;

    // ---- opening tag ----
    if (isHeading)
    {
        // A heading always starts in a text block of its own.
        FlushTextToBlock(m_CurrentPage, tbBuilder, m_MainBodyTextAttr);
        tbBuilder = new TextBlockBuilder(GetNextObjId(), m_CharMapper);

        // Headings at or above the configured level start a new page, unless the current
        // page is still empty.
        bool startsNewPage = GetHeadingLevel(tagType) <= GetHeadingLevel(m_eNewPageHeadingFilter)
                             && m_CurrentPage.Children.Count > 0;
        if (startsNewPage)
        {
            finalizePage(m_CurrentPage);
            m_CurrentPage = createPage();
            addBookPage(m_CurrentPage);
        }

        m_HeadingNodePageId[node] = m_CurrentPage.ID;
        m_TextObjectIdHeadingNode[tbBuilder.TextObjectId] = node;
        tbBuilder.Append(TagId.FontSize, GetHeadingFontSize(tagType));
    }
    else
    {
        switch (tagType)
        {
            case TagType.IMG:
            {
                // Pending text has to reach the page before the image does.
                if (tbBuilder.HasText)
                {
                    tbBuilder.Append(TagId.EOL);
                    FlushTextToBlock(m_CurrentPage, tbBuilder, m_MainBodyTextAttr);
                    tbBuilder = new TextBlockBuilder(GetNextObjId(), m_CharMapper);
                }

                IHTMLAttributeCollection imageAttributes = (IHTMLAttributeCollection)node.attributes;

                object attributeKey = "src";
                string imageSource = ((IHTMLDOMAttribute)imageAttributes.item(ref attributeKey)).nodeValue.ToString();

                attributeKey = "height";
                string imageHeight = ((IHTMLDOMAttribute)imageAttributes.item(ref attributeKey)).nodeValue.ToString();

                attributeKey = "width";
                string imageWidth = ((IHTMLDOMAttribute)imageAttributes.item(ref attributeKey)).nodeValue.ToString();

                addPageImage(m_CurrentPage, imageSource, ushort.Parse(imageWidth), ushort.Parse(imageHeight));
                break;
            }

            case TagType.text:
                AppendTextToBlock((string)node.nodeValue, tbBuilder);
                break;

            case TagType.I:
                tbBuilder.Append(TagId.ItalicBegin);
                break;

            case TagType.B:
                tbBuilder.Append(TagId.FontWeight, LegacyBBeB.k_BoldFontWeight);
                break;

            case TagType.SUP:
                tbBuilder.Append(TagId.BeginSup);
                break;

            case TagType.SUB:
                tbBuilder.Append(TagId.BeginSub);
                break;
        }
    }

    // ---- children ----
    if (node.hasChildNodes())
    {
        IHTMLDOMChildrenCollection nested = (IHTMLDOMChildrenCollection)node.childNodes;
        foreach (IHTMLDOMNode nestedNode in nested)
        {
            tbBuilder = ParseDomNode(nestedNode, tbBuilder);
        }
    }

    // ---- closing tag ----
    if (isHeading)
    {
        // Restore the default size and close the heading's own text block.
        tbBuilder.Append(TagId.FontSize, LegacyBBeB.DefaultFontSize);
        FlushTextToBlock(m_CurrentPage, tbBuilder, m_MainBodyTextAttr);
        tbBuilder = new TextBlockBuilder(GetNextObjId(), m_CharMapper);
        return tbBuilder;
    }

    switch (tagType)
    {
        case TagType.I:
            tbBuilder.Append(TagId.ItalicEnd);
            break;

        case TagType.B:
            tbBuilder.Append(TagId.FontWeight, LegacyBBeB.k_NormalFontWeight);
            break;

        case TagType.SUP:
            tbBuilder.Append(TagId.EndSup);
            break;

        case TagType.SUB:
            tbBuilder.Append(TagId.EndSub);
            break;

        case TagType.P:
            // A paragraph ends with two line breaks.
            tbBuilder.Append(TagId.EOL);
            tbBuilder.Append(TagId.EOL);
            break;

        case TagType.BR:
            tbBuilder.Append(TagId.EOL);
            break;
    }

    return tbBuilder;
}
/// <summary>
/// Looks up library availability for the book(s) shown on the current page.
/// </summary>
/// <remarks>
/// Two page shapes are handled, chosen from the document URL:
/// a single product page (URL contains /dp/, /ASIN/ or /product/ followed by a 10-character
/// code), where the element with id "btAsinTitle" supplies the title; and listing pages
/// (wish list, search, bestsellers, ...), where every element with class "productTitle",
/// "fixed-line", or "title" inside a "data" parent is inspected and its first anchor child
/// supplies title and code. For each book found, a loading icon is added and
/// <c>checkLibrary</c> is called with the check URL. Any exception is logged through
/// <c>AddErrorLog</c> and shown in a message box.
/// </remarks>
public void libraryLinky()
{
    try
    {
        string href = document.location.href;
        Match m = new Regex(@"/(dp|ASIN|product)/([\dX]{10})").Match(href);
        string title;

        if (m.Success && m.Groups.Count == 3)
        {
            // Single product page.
            string isbn = m.Groups[2].Value;
            IHTMLElement titleElement = document.getElementById("btAsinTitle");
            HTMLDivElement div = (HTMLDivElement)titleElement.parentElement.parentElement;
            title = truncate(Regex.Replace(titleElement.innerHTML, "</?[^>]+>", ""));
            addLoadingIcon((IHTMLDOMNode)div);
            string url = "http://api.calil.jp/check?appkey=" + appkey + "&isbn=" + isbn + "&systemid=" + selectedSystemId + "&format=xml";
            checkLibrary(url, (IHTMLDOMNode)div, isbn, title);
            return;
        }

        // string.Contains is an ordinal search; IndexOf(string) was culture-sensitive.
        bool isWishlist = href.Contains("wishlist");
        bool isListingPage = isWishlist
                             || href.Contains("/s?")
                             || href.Contains("/s/")
                             || href.Contains("/exec/")
                             || href.Contains("/gp/search")
                             || href.Contains("/gp/bestsellers/");
        if (!isListingPage)
        {
            return;
        }

        // Wish lists keep titles in spans, the other listing pages in divs.
        IHTMLElementCollection objects = (IHTMLElementCollection)document.getElementsByTagName(isWishlist ? "span" : "div");
        if (objects == null)
        {
            return;
        }

        // Built once instead of once per inspected element.
        Regex spanTitlePattern = new Regex("<span title='(.+)'>");
        Regex asinPattern = new Regex(@"/dp/([\dX]{10})/ref");

        foreach (object candidate in objects)
        {
            IHTMLElement obj = candidate as IHTMLElement;
            if (obj == null || obj.className == null)
            {
                continue;
            }

            bool holdsTitle = obj.className == "productTitle"
                              || (obj.className == "title" && obj.parentElement != null && obj.parentElement.className == "data")
                              || obj.className == "fixed-line";
            if (!holdsTitle)
            {
                continue;
            }

            IHTMLDOMNode objNode = obj as IHTMLDOMNode;
            IHTMLDOMChildrenCollection childs = objNode != null ? objNode.childNodes as IHTMLDOMChildrenCollection : null;
            if (childs == null)
            {
                continue;
            }

            foreach (object childObject in childs)
            {
                // childNodes also contains text nodes, which are not IHTMLElement. The former
                // hard cast threw on them and aborted the scan of the whole page.
                IHTMLElement childElement = childObject as IHTMLElement;
                if (childElement == null
                    || !string.Equals(childElement.tagName, "a", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                HTMLAnchorElement link = childObject as HTMLAnchorElement;
                if (link != null)
                {
                    // innerHTML/href can be null; Regex.Match(null) throws.
                    string linkHtml = link.innerHTML ?? string.Empty;
                    string linkHref = link.href ?? string.Empty;

                    m = spanTitlePattern.Match(linkHtml);
                    if (m.Success && m.Groups.Count == 2)
                    {
                        title = truncate(stripTags(m.Groups[1].Value.Trim()));
                    }
                    else
                    {
                        title = truncate(stripTags(Regex.Replace(linkHtml, @"<\w[^>]*?>", "")));
                    }

                    m = asinPattern.Match(linkHref);
                    if (m.Success && m.Groups.Count == 2)
                    {
                        string isbn = m.Groups[1].Value;
                        addLoadingIcon((IHTMLDOMNode)obj);
                        string url = "http://api.calil.jp/check?appkey=" + appkey + "&isbn=" + isbn + "&systemid=" + selectedSystemId + "&format=xml";
                        checkLibrary(url, (IHTMLDOMNode)obj, isbn, title);
                    }
                }

                // Only the first anchor child is considered.
                break;
            }
        }
    }
    catch (Exception ex)
    {
        AddErrorLog(ex);
        MessageBox.Show(ex.Message);
    }
}
/// <summary>
/// Walks the DOM below the browser document's body without recursion, collecting text nodes and
/// dispatching every element to a handler chosen by its tag name.
/// </summary>
/// <remarks>
/// Two child indexes are kept in step: <c>domIndex</c> counts DOM child nodes (text nodes
/// included) and <c>index</c> counts <c>HtmlElement</c> children (text nodes excluded).
/// <c>enumList</c> holds the positions to resume from after a subtree or frame has been walked.
/// The walk ends when every node has been visited or as soon as a handler returns false.
/// </remarks>
private void findElement()
{
    HtmlElement body = _browser.Document.Body;
    // Index into the HtmlElement children (does not include text nodes)
    int index = 0;
    // Index into the DOM child nodes (includes text nodes)
    int domIndex = 0;
    IHTMLDOMNode node = null;
    HtmlElement element = null;
    // Frames entered so far: the document that contains the frame element and the frame index used
    Stack <KeyValuePair <HtmlDocument, int> > frameIndex = new Stack <KeyValuePair <HtmlDocument, int> >();
    // Holds the text nodes found in the page
    Stack <KeyValuePair <HtmlElement, string> > textNode = new Stack <KeyValuePair <HtmlElement, string> >(body.All.Count);
    // Holds the forms found in the page
    //Stack<HtmlElement> form = new Stack<HtmlElement>(3);
    // Holds the list of elements currently being traversed
    Stack <KeyValuePair <HtmlElement, int[]> > enumList = new Stack <KeyValuePair <HtmlElement, int[]> >(body.All.Count);
    HtmlElement currentElement = body;
    IHTMLDOMChildrenCollection domCollection = ((IHTMLDOMNode)body.DomElement).childNodes;
    int currentFrameIndex = 0;
    // Continue while the current level still has nodes or a saved level is waiting on the stack
    for (; domIndex < domCollection.length || enumList.Count > 0; domIndex++, index++)
    {
        // Current level exhausted: restore the saved parent level and its two indexes
        // (the loop increment then advances past the restored position)
        if (domCollection.length <= domIndex)
        {
            KeyValuePair <HtmlElement, int[]> prev = enumList.Pop();
            domCollection = ((IHTMLDOMNode)prev.Key.DomElement).childNodes;
            currentElement = prev.Key;
            index = prev.Value[0];
            domIndex = prev.Value[1];
            continue;
        }
        node = domCollection.item(domIndex);
        if (node.nodeType == 3) // If the current node is a text node, put it into textNode
        {
            string text = HandleFindedText(node.nodeValue);
            if (string.IsNullOrEmpty(text) == false)
            {
                textNode.Push(new KeyValuePair <HtmlElement, string>(currentElement, text));
            }
            // Text nodes have no entry in Children, so cancel the upcoming index++
            index--;
            continue;
        }
        // No matching HtmlElement child for this DOM node: skip it
        if (index >= currentElement.Children.Count)
        {
            continue;
        }
        element = currentElement.Children[index];
        switch (element.TagName)
        {
            case Matcher.TYPE_FRAME:
            case Matcher.TYPE_IFRAME:
                // NOTE(review): pushToStack presumably saves currentElement with index/domIndex so the
                // walk can resume here later (see the Pop above) - confirm against pushToStack
                pushToStack(enumList, currentElement, ref index, ref domIndex);
                KeyValuePair <HtmlDocument, int>?lastFrame = frameIndex.Count == 0 ? (KeyValuePair <HtmlDocument, int>?)null : frameIndex.Peek();
                HtmlWindow wnd = null;
                if (lastFrame != null)
                {
                    // Number of stack entries examined before the current one
                    int findedIndex = 0;
                    // Each Pop below is followed by a break before the enumerator advances again,
                    // so the stack is never enumerated after being modified
                    foreach (var item in frameIndex)
                    {
                        if (item.Key.Window.Frames[item.Value].Document == element.Document)
                        {
                            // The element lives inside the document of a recorded frame: drop the
                            // entries above that record and start with that document's first frame
                            while (findedIndex > 0)
                            {
                                frameIndex.Pop();
                                findedIndex--;
                            }
                            currentFrameIndex = 0;
                            break;
                        }
                        else if (item.Key == element.Document)
                        {
                            // The element is in the same document as a recorded frame: take the next
                            // frame index and drop that record together with everything above it
                            currentFrameIndex = item.Value + 1;
                            do
                            {
                                frameIndex.Pop();
                                findedIndex--;
                            } while (findedIndex >= 0);
                            break;
                        }
                        else
                        {
                            currentFrameIndex = 0;
                        }
                        findedIndex++;
                    }
                }
                else
                {
                    // First frame encountered
                    currentFrameIndex = 0;
                }
                frameIndex.Push(new KeyValuePair <HtmlDocument, int>(element.Document, currentFrameIndex));
                // Continue the walk inside the body of the frame's document
                wnd = element.Document.Window.Frames[currentFrameIndex];
                currentElement = wnd.Document.Body;
                domCollection = ((IHTMLDOMNode)currentElement.DomElement).childNodes;
                break;
            // Every handler below stops the whole walk by returning false
            case Matcher.TYPE_FORM:
                if (OnFormElementDetected(element) == false)
                {
                    return;
                }
                break;
            case Matcher.TYPE_A:
                if (OnLinkElementFinded(element, textNode) == false)
                {
                    return;
                }
                break;
            case Matcher.TYPE_SELECT:
                if (OnSelectElementFinded(element, textNode) == false)
                {
                    return;
                }
                break;
            case Matcher.TYPE_INPUT:
                if (InputElementFinded(element, textNode) == false)
                {
                    return;
                }
                break;
            case Matcher.TYPE_TEXTAREA:
                if (OnTextAreaElementDetected(element, textNode) == false)
                {
                    return;
                }
                break;
            default:
                if (OnOtherElementFinded(element) == false)
                {
                    return;
                }
                break;
        }
        // The DOM node has children: save the current position and descend into the element
        if (node.childNodes.length > 0)
        {
            pushToStack(enumList, currentElement, ref index, ref domIndex);
            currentElement = element;
            domCollection = ((IHTMLDOMNode)currentElement.DomElement).childNodes;
        }
    }
}