/// <summary>
/// Collects the URLs found in the HREF and SRC attributes of an HTML page.
/// </summary>
/// <remarks>
/// Every attribute value is trimmed and lower-cased before it is filtered, so the
/// returned URLs are lower-case. HREF values that are empty, start with "mailto" or
/// start with "#" are skipped; SRC values are skipped only when empty. Duplicates are
/// not removed.
/// </remarks>
/// <param name="Page">HTML source to scan.</param>
/// <param name="HostAbsolutePath">Passed to GetPath together with each accepted URL.</param>
/// <param name="MaxLen">Longest accepted result of GetPath; a value below 1 disables the limit.</param>
/// <param name="FindUrlLevel">
/// 2 accepts every remaining URL; any other value accepts only URLs that start with
/// "http://" or "https://".
/// </param>
/// <returns>The accepted URLs in document order (HREF before SRC within a tag), or null when there are none.</returns>
public static string[] GetHTMLUrls(string Page, string HostAbsolutePath, int MaxLen, int FindUrlLevel)
{
    ArrayList urls = new ArrayList();

    ParseHTML parser = new ParseHTML();
    parser.Source = Page;

    while (!parser.Eof())
    {
        // Attributes are inspected only when Parse() returns '\0'.
        if (parser.Parse() != '\0')
        {
            continue;
        }

        // Pass 0 looks at HREF, pass 1 at SRC; the two differ only in the extra HREF filters.
        for (int pass = 0; pass < 2; pass++)
        {
            bool isHref = (pass == 0);
            Shove.HTML.HtmlParse.Attribute attribute = parser.GetTag()[isHref ? "HREF" : "SRC"];
            if (attribute == null)
            {
                continue;
            }

            // Invariant lower-casing and ordinal prefix tests: URL schemes are ASCII
            // identifiers, and culture-sensitive casing (e.g. Turkish dotless i) would
            // let "MAILTO:" links slip past the filter.
            string url = attribute.Value.Trim().ToLowerInvariant();
            if (url.Length == 0)
            {
                continue;
            }

            if (isHref
                && (url.StartsWith("mailto", System.StringComparison.Ordinal)
                    || url.StartsWith("#", System.StringComparison.Ordinal)))
            {
                continue;
            }

            if (FindUrlLevel != 2
                && !url.StartsWith("http://", System.StringComparison.Ordinal)
                && !url.StartsWith("https://", System.StringComparison.Ordinal))
            {
                continue;
            }

            url = GetPath(url, HostAbsolutePath);
            if ((MaxLen < 1) || (url.Length <= MaxLen))
            {
                urls.Add(url);
            }
        }
    }

    // Callers receive null, not an empty array, when nothing qualified.
    if (urls.Count == 0)
    {
        return null;
    }

    return (string[])urls.ToArray(typeof(string));
}
/// <summary>
/// Builds an album from the album front page by reading the event list behind every
/// link found on that page.
/// </summary>
/// <remarks>
/// The event list names (usually years) are not taken from the front page HTML; they
/// are expected to be embedded in the event lists themselves.
/// </remarks>
/// <param name="masterFilename">Path of the album front page.</param>
/// <param name="aDiagnostic">
/// Null when the front page cannot be blamed; set to a message when the front page
/// cannot be read, or to whatever ReadEvents reported for a linked page.
/// </param>
/// <returns>The album, or null when the front page or any linked events page fails to load.</returns>
public Album ReadAlbum(string masterFilename, out string aDiagnostic)
{
    aDiagnostic = null;

    // The album XML file lives in the same directory as the front page.
    string masterDirectory = GetDirectory(masterFilename);
    Album album = new Album(masterDirectory + "Album.xml");

    string html = ReadFile(masterFilename);
    if (html == null)
    {
        aDiagnostic = "ReadAlbum: bad HTML Album file " + masterFilename;
        return null;
    }

    ParseHTML parse = new ParseHTML();
    parse.Source = html;

    while (!parse.Eof())
    {
        // Only tags are of interest here; Parse() returns 0 for them.
        if (parse.Parse() != 0)
        {
            continue;
        }

        AttributeList tag = parse.GetTag();
        if (tag["href"] == null)
        {
            continue;
        }

        // Links use URL separators; turn them into Windows path separators.
        string relativePath = tag["href"].Value.Replace('/', '\\');

        // Each link points at a child events page; one failure aborts the whole album.
        EventList childEvents = ReadEvents(masterDirectory + relativePath, out aDiagnostic);
        if (childEvents == null)
        {
            return null;
        }

        album.Add(childEvents);
    }

    return album;
}
/// <summary>
/// Reads one HTML slide page and converts it into a slide show.
/// </summary>
/// <remarks>
/// The text between the title tags becomes the slide show title. Text seen after a
/// td tag is collected into a caption (a p tag inside it starts a new caption line).
/// The most recent href accepted by IsPhoto is remembered, and when the closing td
/// tag arrives that link is added to the slide show together with the caption.
/// </remarks>
/// <param name="aSlideFile">Path of the HTML slide page.</param>
/// <param name="aDiagnostic">Null on success; a message when the page cannot be read.</param>
/// <returns>The slide show, or null when the page cannot be read.</returns>
public SlideShow ReadSlideShow(string aSlideFile, out string aDiagnostic)
{
    aDiagnostic = null;

    // The future XML slide show file differs from the HTML page only in its extension.
    // ChangeExtension touches just the trailing extension, so ".html" becomes ".xml"
    // (not ".xmll") and a ".htm" elsewhere in the path is left alone.
    string xmlFilePath = System.IO.Path.ChangeExtension(aSlideFile, ".xml");
    SlideShow slideShow = new SlideShow(xmlFilePath);

    string html = ReadFile(aSlideFile);
    if (html == null)
    {
        aDiagnostic = "ReadSlideShow: bad HTML slideshow file " + aSlideFile;
        return null;
    }

    ParseHTML parse = new ParseHTML();
    parse.Source = html;
    HtmlPreprocess htmlPreprocess = new HtmlPreprocess();

    // NOTE(review): slideShow.Title is assigned only when a closing title tag is seen,
    // so this default is not applied to pages without a title element - confirm that
    // is intended.
    string title = "A most peculiar day";
    bool collectingTitle = false;
    bool collectingCaption = false;
    Caption caption = new Caption();
    string link = "";

    while (!parse.Eof())
    {
        char ch = parse.Parse();

        if (ch != 0)
        {
            // Ordinary character: route whatever the preprocessor lets through.
            ch = htmlPreprocess.Add(ch);
            if (ch != HtmlPreprocess.NullChar)
            {
                if (collectingTitle)
                {
                    title += ch;
                }
                else if (collectingCaption)
                {
                    caption.AddChar(ch);
                }
            }

            continue;
        }

        // Tag. HTML tag names are ASCII identifiers, so compare them ordinally; a
        // culture-sensitive comparison fails to match "TITLE" under Turkish casing rules.
        AttributeList tag = parse.GetTag();
        string tagName = tag.Name;

        if (tagName.Equals("title", StringComparison.OrdinalIgnoreCase))
        {
            collectingTitle = true;
            title = string.Empty;
        }
        else if (tagName.Equals("/title", StringComparison.OrdinalIgnoreCase))
        {
            collectingTitle = false;
            slideShow.Title = title.Trim();
        }

        // Deliberately a separate chain: the title checks above do not exclude these.
        if (tagName.Equals("td", StringComparison.OrdinalIgnoreCase))
        {
            collectingCaption = true;
            caption = new Caption();
        }
        else if (tagName.Equals("/td", StringComparison.OrdinalIgnoreCase))
        {
            collectingCaption = false;
            if (link.Length > 0)
            {
                // A photo link was seen: pair it with the caption just completed.
                slideShow.Add(link, caption);
                link = "";
            }
        }
        else if (collectingCaption && tagName.Equals("p", StringComparison.OrdinalIgnoreCase))
        {
            caption.NewLine();
        }
        else if (tag["href"] != null)
        {
            string href = tag["href"].Value.Replace('/', '\\');
            if (IsPhoto(href))
            {
                link = href;
            }
        }

        // Character preprocessing starts from a clean state after every tag.
        htmlPreprocess.Reset();
    }

    return slideShow;
}
/// <summary>
/// Reads one HTML events page and converts it into an event list.
/// </summary>
/// <remarks>
/// The text between the h2 tags becomes the list title. Every other run of text that
/// is terminated by a p, br or hr tag (or by the end of the page) becomes one event.
/// Leading spaces and leading '+' characters are counted as the event's provisional
/// level, and the slide show read from the most recent href on that line is attached
/// to the event. Levels are renumbered by Relevel() once the page has been read.
/// </remarks>
/// <param name="aEventsFile">Path of the HTML events page.</param>
/// <param name="aDiagnostic">
/// Null on success; a message when the page cannot be read, or whatever ReadSlideShow
/// reported for a linked slide page.
/// </param>
/// <returns>The event list, or null when the page or any linked slide page fails to load.</returns>
public EventList ReadEvents(string aEventsFile, out string aDiagnostic)
{
    aDiagnostic = null;

    // The XML events file differs from the HTML page only in its extension.
    // ChangeExtension touches just the trailing extension, so ".html" becomes ".xml"
    // (not ".xmll") and a ".htm" elsewhere in the path is left alone.
    string xmlFilePath = System.IO.Path.ChangeExtension(aEventsFile, ".xml");
    EventList events = new EventList(xmlFilePath);

    string html = ReadFile(aEventsFile);
    if (html == null)
    {
        aDiagnostic = "ReadEvents: bad HTML Events file " + aEventsFile;
        return null;
    }

    string eventsDirectory = GetDirectory(aEventsFile);

    ParseHTML parse = new ParseHTML();
    parse.Source = html;
    HtmlPreprocess htmlPreprocess = new HtmlPreprocess();

    string name = "";             // text collected for the current line
    int indent = 0;               // leading spaces / pluses seen before the text
    SlideShow slideShow = null;   // slide show read from the current line's link

    while (!parse.Eof())
    {
        char ch = parse.Parse();

        if (ch != 0)
        {
            // Ordinary character.
            ch = htmlPreprocess.Add(ch);
            if (ch == HtmlPreprocess.NullChar)
            {
                continue;
            }

            if ((ch == ' ' || ch == '+') && name.Length == 0)
            {
                // Leading space, or leading plus marking a sub-event: counts as indent.
                indent++;
            }
            else
            {
                // Regular character, or a space/plus inside the name.
                name += ch;
            }

            continue;
        }

        // Tag. HTML tag names are ASCII identifiers, so compare them ordinally.
        AttributeList tag = parse.GetTag();
        string tagName = tag.Name;

        if (tagName.Equals("h2", StringComparison.OrdinalIgnoreCase))
        {
            // Start collecting the title.
            name = "";
            indent = 0;
        }
        else if (tagName.Equals("/h2", StringComparison.OrdinalIgnoreCase))
        {
            events.Title = name.Trim();
            name = "";
            indent = 0;
        }
        else if (tagName.Equals("p", StringComparison.OrdinalIgnoreCase)
                 || tagName.Equals("br", StringComparison.OrdinalIgnoreCase)
                 || tagName.Equals("hr", StringComparison.OrdinalIgnoreCase))
        {
            // End of line: emit the event collected so far, using indent as its level for now.
            if (name.Length > 0)
            {
                events.Add(indent, name.Trim(), slideShow);
            }

            name = "";
            indent = 0;
            slideShow = null;
        }
        else if (tag["href"] != null)
        {
            string href = tag["href"].Value.Replace('/', '\\');

            // Anchors cannot be handled; drop everything from '#' on.
            int anchorStart = href.IndexOf('#');
            if (anchorStart >= 0)
            {
                href = href.Remove(anchorStart);
            }

            // The link points at a slide page; one failure aborts the whole list.
            slideShow = ReadSlideShow(eventsDirectory + href, out aDiagnostic);
            if (slideShow == null)
            {
                return null;
            }
        }

        // Character preprocessing starts from a clean state after every tag.
        htmlPreprocess.Reset();
    }

    // End of page: emit any event still being collected.
    if (name.Length > 0)
    {
        events.Add(indent, name.Trim(), slideShow);
    }

    // Levels so far are raw indentation counts; reassign sequential levels.
    events.Relevel();
    return events;
}
/// <summary>
/// Processes one URL: reads it, collects the links in its href/src attributes,
/// downloads them and then recurses into the collected links.
/// </summary>
/// <remarks>
/// All UI access goes through Invoke. The cancel flag is polled between steps and the
/// method returns as soon as it is set.
/// </remarks>
/// <param name="currentRecursive">Depth of this call; shown in the level label and compared with the limit in recursiveLevelTextBox.</param>
/// <param name="strURL">URL of the page to process.</param>
/// <param name="fromURL">Referring URL forwarded to ReadHTML; may be null.</param>
/// <param name="strDownloadPath">Forwarded unchanged to HTTPDownload and to recursive calls.</param>
/// <param name="startLinkURL">When non-empty, links are ignored until one whose absolute form equals this value is met (that link is kept).</param>
/// <param name="endLinkURL">When non-empty, link collection stops after a link whose absolute form equals this value (that link is kept).</param>
/// <param name="onlyLink">When true, the download step is skipped.</param>
/// <returns>The MIME type returned by ReadHTML, or an empty string when cancelled before the page was read.</returns>
private string ProcessURL(int currentRecursive, string strURL, string fromURL, string strDownloadPath, string startLinkURL, string endLinkURL, Boolean onlyLink)
{
    Invoke(new MethodInvoker(delegate()
    {
        // Refresh the depth indicator and the status bar.
        recursiveLevelLabel.Text = "" + currentRecursive;
        recursiveLevelLabel.Update();
        toolStripStatusLabel.Text = strURL;
        statusStrip.Update();

        // The tray text carries at most the first 50 characters of the URL.
        notifyIcon1.Text = "CSMDown:" + (strURL.Length > 50 ? strURL.Substring(0, 50) : strURL);
    }));

    // -----< Read the HTML >----- //
    LogToStatusBox("ProcessURL : " + strURL + "\r\n" + "RefererURL : " + fromURL + "\r\n", true);

    string strHTML = "";
    string mimeType = "";

    if (m_cansel == true) { return mimeType; }   // cancelled

    mimeType = ReadHTML(strURL, ref strHTML, fromURL, currentRecursive);
    if (strHTML == null)
    {
        LogToStatusBox("HTMLを読み込めませんでした。:" + strURL + "\r\n", false);
        return mimeType;
    }
    if (isHTML(mimeType) == false)
    {
        LogToStatusBox("HTMLではないので無視します。:" + strURL + "\r\n", false);
        return mimeType;
    }

    insertAlreadyAccessedURL(strURL, currentRecursive);

    // -----< Navigate the browser window, if one is open >----- //
    Invoke(new MethodInvoker(delegate()
    {
        if (m_BrowserForm != null && m_BrowserForm.IsDisposed == false)
        {
            m_BrowserForm.Navigate(strURL);
        }
    }));

    if (m_cansel == true) { return mimeType; }   // cancelled

    // -----< Parse the HTML >----- //
    LogToStatusBox("HTMLをパースします。\r\n", true);

    ArrayList URLArray = new ArrayList();
    Boolean startLinkProcessed = false;

    ParseHTML parser = new ParseHTML();
    parser.Source = strHTML;

    while (!parser.Eof())
    {
        if (m_cansel == true) { return mimeType; }   // cancelled

        // Only tags matter; Parse() returns 0 for them.
        if (parser.Parse() != 0)
        {
            continue;
        }

        // Take the link target from href, or from src when both are present.
        string linkURL = "";
        AttributeList tag = parser.GetTag();
        if (tag["href"] != null)
        {
            linkURL = (string)(tag["href"].Value);
        }
        if (tag["src"] != null)
        {
            linkURL = (string)(tag["src"].Value);
        }

        // Drop the fragment: everything from '#' on.
        int fragmentStart = linkURL.IndexOf('#');
        if (fragmentStart >= 0)
        {
            linkURL = linkURL.Substring(0, fragmentStart);
        }

        if (linkURL.Length == 0)
        {
            continue;
        }

        // A null result means the link could not be made absolute or was rejected.
        string linkAbsolute = CreateLinkURL(linkURL, strURL);
        if (linkAbsolute == null)
        {
            continue;
        }

        // Ignore everything that precedes the start link.
        if ((startLinkURL.Length > 0) && (startLinkProcessed == false))
        {
            if (startLinkURL.Equals(linkAbsolute))
            {
                startLinkProcessed = true;
            }
            else
            {
                LogToStatusBox("StartLink以前なので無視します:" + linkURL + "\r\n", true);
                continue;
            }
        }

        // Line breaks are stripped after the start-link test but before the end-link test.
        linkAbsolute = linkAbsolute.Replace("\r\n", "");
        linkAbsolute = linkAbsolute.Replace("\n", "");
        URLArray.Add(linkAbsolute);

        // Stop collecting once the end link has been added.
        if ((endLinkURL.Length > 0) && endLinkURL.Equals(linkAbsolute))
        {
            LogToStatusBox("EndLinkを検出。パースを終了します。:" + linkURL + "\r\n", true);
            break;
        }
    }

    LogToStatusBox("リンク数:" + URLArray.Count + "\r\n", true);

    if (m_cansel == true) { return mimeType; }   // cancelled

    // -----< Download >----- //
    LogToStatusBox("ダウンロードを開始します。\r\n", true);
    if (onlyLink == false)
    {
        Boolean downloaded = HTTPDownload(URLArray, strDownloadPath, strURL, currentRecursive);
        if (!downloaded)
        {
            LogToStatusBox("ダウンロードに失敗しました。", false);
        }
    }

    // -----< Recurse into the collected links >----- //
    // int.Parse throws when the text box does not hold a valid integer.
    int recursiveMax = int.Parse(recursiveLevelTextBox.Text);
    if (currentRecursive >= recursiveMax)
    {
        return mimeType;
    }

    // Pass 0 handles links under the priority path, pass 1 handles the rest.
    for (int pass = 0; pass < 2; pass++)
    {
        bool hasPriorityPath = highPriorityPathTextBox.Text.Length > 0;

        // Without a priority path there is nothing to do in the priority pass.
        if ((pass == 0) && !hasPriorityPath)
        {
            continue;
        }

        // "Priority only" suppresses the second pass when a priority path is set.
        if ((pass == 1) && hasPriorityPath && (priorityOnlyCheckBox.Checked == true))
        {
            continue;
        }

        for (int i = 0; i < URLArray.Count; i++)
        {
            if (m_cansel == true) { break; }   // cancelled

            string candidate = (string)URLArray[i];

            try
            {
                // URLs are compared ordinally; culture rules have no meaning for them.
                if (pass == 0)
                {
                    string priorityURL = highPriorityPathTextBox.Text.Trim();
                    if (!candidate.StartsWith(priorityURL, StringComparison.Ordinal))
                    {
                        continue;
                    }
                }
                else
                {
                    // NOTE(review): this tests strURL (the page being processed), not the
                    // candidate link, and does not Trim() the text - confirm that is intended.
                    if ((highPriorityPathTextBox.Text.Length > 0)
                        && strURL.StartsWith(highPriorityPathTextBox.Text, StringComparison.Ordinal))
                    {
                        continue;
                    }
                }
            }
            catch (Exception e)
            {
                LogToStatusBox("URI Compare failed\r\n" + e.Message + "\r\n", true);
                continue;
            }

            if (isAlreadyAccessed(candidate, currentRecursive + 1) == true)
            {
                LogToStatusBox("アクセス済みURL:" + candidate + "\r\n", false);
                continue;
            }

            string res = ProcessURL(currentRecursive + 1, candidate, strURL, strDownloadPath, "", "", false);
            if (isHTML(res) != true)
            {
                LogToStatusBox("HTMLではありませんでした。:" + candidate + "\r\n", false);
            }

            // The nested call changed the indicators; show this level's values again.
            Invoke(new MethodInvoker(delegate()
            {
                recursiveLevelLabel.Text = "" + currentRecursive;
                recursiveLevelLabel.Update();
                toolStripStatusLabel.Text = strURL;
                statusStrip.Update();
            }));

            if (m_cansel == true) { return mimeType; }   // cancelled
        }
    }

    return mimeType;
}

/// <summary>
/// Appends text to the status box in a single marshalled call, optionally forcing a repaint.
/// </summary>
/// <param name="text">Text to append; callers include their own line breaks.</param>
/// <param name="repaint">True to call Update() on the status box after appending.</param>
private void LogToStatusBox(string text, bool repaint)
{
    Invoke(new MethodInvoker(delegate()
    {
        statusTextBox.AppendText(text);
        if (repaint)
        {
            statusTextBox.Update();
        }
    }));
}