Пример #1
0
        // Very simple: just parse the EventLists out of the album front page HTML.
        // No need to get the event list names (usually years) from the HTML, as these
        // names should be embedded in the EventLists.
        /// <summary>
        /// Builds an <see cref="Album"/> from the album front page by following every
        /// href found in its HTML and reading each target as an event list.
        /// </summary>
        /// <param name="masterFilename">Path of the album front page HTML file.</param>
        /// <param name="aDiagnostic">
        /// Null on success. Set to a message when the front page cannot be read; when a
        /// child events file fails, carries whatever ReadEvents reported.
        /// </param>
        /// <returns>The populated album, or null when the page or any child event list could not be read.</returns>
        public Album ReadAlbum(string masterFilename, out string aDiagnostic)
        {
            aDiagnostic = null;

            // The album XML file lives next to the front page
            Album result = new Album(GetDirectory(masterFilename) + "Album.xml");

            string pageSource = ReadFile(masterFilename);
            if (pageSource == null)
            {
                aDiagnostic = "ReadAlbum: bad HTML Album file " + masterFilename;
                return null;
            }

            // Child links are resolved against the directory of the front page
            string baseDirectory = GetDirectory(masterFilename);

            ParseHTML parser = new ParseHTML();
            parser.Source = pageSource;

            while (!parser.Eof())
            {
                // Parse() yields 0 when it has consumed a tag rather than a character
                if (parser.Parse() != 0)
                {
                    continue;
                }

                AttributeList currentTag = parser.GetTag();
                if (currentTag["href"] == null)
                {
                    continue;
                }

                // Convert URL separators into Windows path separators
                string relativePath = currentTag["href"].Value.Replace('/', '\\');

                // One failed child aborts the whole album; its diagnostic is passed through
                EventList childEvents = ReadEvents(baseDirectory + relativePath, out aDiagnostic);
                if (childEvents == null)
                {
                    return null;
                }

                result.Add(childEvents);
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Collects the URLs referenced by the HREF and SRC attributes of every tag in an HTML page.
        /// </summary>
        /// <param name="Page">HTML source to scan. A null or empty page yields no URLs.</param>
        /// <param name="HostAbsolutePath">Passed to GetPath together with each accepted URL.</param>
        /// <param name="MaxLen">Longest accepted result of GetPath; values below 1 disable the limit.</param>
        /// <param name="FindUrlLevel">
        /// When 2, every non-empty URL is accepted; otherwise only URLs starting with
        /// "http://" or "https://" are.
        /// </param>
        /// <returns>
        /// The accepted URLs in document order (HREF before SRC within one tag), or null
        /// when none were found.
        /// </returns>
        public static string[] GetHTMLUrls(string Page, string HostAbsolutePath, int MaxLen, int FindUrlLevel)
        {
            // Nothing to parse: report "no URLs" the same way an empty result is reported
            if (string.IsNullOrEmpty(Page))
            {
                return(null);
            }

            ArrayList urls   = new ArrayList();
            ParseHTML parser = new ParseHTML();

            parser.Source = Page;
            while (!parser.Eof())
            {
                // Parse() yields '\0' when it has consumed a tag rather than a character
                if (parser.Parse() != '\0')
                {
                    continue;
                }

                AddUrlIfAccepted(urls, parser.GetTag()["HREF"], true, HostAbsolutePath, MaxLen, FindUrlLevel);
                AddUrlIfAccepted(urls, parser.GetTag()["SRC"], false, HostAbsolutePath, MaxLen, FindUrlLevel);
            }

            if (urls.Count == 0)
            {
                return(null);
            }
            return((string[])urls.ToArray(typeof(string)));
        }

        // Applies the URL filter to one HREF/SRC attribute and appends the resolved URL to urls when it passes.
        private static void AddUrlIfAccepted(ArrayList urls, Shove.HTML.HtmlParse.Attribute attribute, bool isHref, string HostAbsolutePath, int MaxLen, int FindUrlLevel)
        {
            if ((attribute == null) || (attribute.Value == null))
            {
                return;
            }

            // Invariant lower-casing and ordinal prefix tests: scheme names are ASCII, and the
            // culture-sensitive ToLower() maps 'I' to a dotless i under e.g. Turkish culture,
            // which would let "MAILTO:" links through the filter.
            // NOTE(review): the whole URL (path included) is lower-cased, as before - confirm
            // callers rely on that, since URL paths can be case-sensitive.
            string url = attribute.Value.Trim().ToLowerInvariant();
            if (url.Length == 0)
            {
                return;
            }

            // Mail links and same-page anchors are only filtered for HREF, never for SRC
            if (isHref &&
                (url.StartsWith("mailto", System.StringComparison.Ordinal) ||
                 url.StartsWith("#", System.StringComparison.Ordinal)))
            {
                return;
            }

            bool isAbsoluteHttp = url.StartsWith("http://", System.StringComparison.Ordinal) ||
                                  url.StartsWith("https://", System.StringComparison.Ordinal);
            if ((FindUrlLevel != 2) && !isAbsoluteHttp)
            {
                return;
            }

            url = GetPath(url, HostAbsolutePath);
            if ((MaxLen < 1) || (url.Length <= MaxLen))
            {
                urls.Add(url);
            }
        }
Пример #3
0
        // Read one HTML page and return a slide show
        /// <summary>
        /// Reads one HTML slide show page and converts it into a <see cref="SlideShow"/>.
        /// </summary>
        /// <remarks>
        /// Text between the opening and closing title tags becomes the slide show title; a page
        /// that never closes a title element gets a fixed default title. Text following an
        /// opening td tag is collected as a caption (a p tag inside it starts a new caption
        /// line). The most recent href accepted by IsPhoto is remembered and, at the next
        /// closing td tag, added to the slide show together with the current caption.
        /// </remarks>
        /// <param name="aSlideFile">Path of the HTML slide show file.</param>
        /// <param name="aDiagnostic">Null on success; otherwise a message naming the file that could not be read.</param>
        /// <returns>The populated slide show, or null when ReadFile returns null for the file.</returns>
        public SlideShow ReadSlideShow(string aSlideFile, out string aDiagnostic)
        {
            // Fallback title for pages that never complete a title element
            const string defaultTitle = "A most peculiar day";

            aDiagnostic = null;

            // Determine the name of the future XML slide show file.
            // NOTE(review): Replace swaps every ".htm" occurrence, so "x.html" becomes "x.xmll"
            // and a directory name containing ".htm" is rewritten too. Confirm whether other
            // code derives the same name before switching to an extension-only change.
            string    xmlFilePath = aSlideFile.Replace(".htm", ".xml");
            SlideShow slideShow   = new SlideShow(xmlFilePath);

            string html = ReadFile(aSlideFile);
            if (html == null)
            {
                aDiagnostic = "ReadSlideShow: bad HTML slideshow file " + aSlideFile;
                return(null);
            }

            ParseHTML parse = new ParseHTML();
            parse.Source = html;

            HtmlPreprocess htmlPreprocess = new HtmlPreprocess();

            string title           = defaultTitle;
            bool   collectingTitle = false;
            bool   titleAssigned   = false;   // True once a closing title tag has set slideShow.Title

            bool    collectingCaption = false;
            Caption caption           = new Caption();
            string  link              = "";

            while (!parse.Eof())
            {
                char ch = parse.Parse();
                if (ch == 0)
                {
                    // Parse() returned 0: a tag was consumed instead of a character.
                    // Tag names are ASCII identifiers, so compare them ordinally; a
                    // culture-sensitive comparison fails to match "TITLE" under Turkish culture.
                    AttributeList tag     = parse.GetTag();
                    string        tagName = tag.Name;

                    if (string.Equals(tagName, "title", StringComparison.OrdinalIgnoreCase))
                    {
                        collectingTitle = true;       // Start collecting title
                        title           = string.Empty;
                    }
                    else if (string.Equals(tagName, "/title", StringComparison.OrdinalIgnoreCase))
                    {
                        collectingTitle = false;      // Title now complete
                        slideShow.Title = title.Trim();
                        titleAssigned   = true;
                    }

                    if (string.Equals(tagName, "td", StringComparison.OrdinalIgnoreCase))
                    {
                        collectingCaption = true;     // Start collecting new caption
                        caption           = new Caption();
                    }
                    else if (string.Equals(tagName, "/td", StringComparison.OrdinalIgnoreCase))
                    {
                        collectingCaption = false;    // Any caption is now complete
                        if (link.Length > 0)
                        {
                            // Got a link to go with the caption
                            slideShow.Add(link, caption);
                            link = "";
                        }
                    }
                    else if (collectingCaption &&
                             string.Equals(tagName, "p", StringComparison.OrdinalIgnoreCase))
                    {
                        // HTML paragraph tag within caption
                        caption.NewLine();
                    }
                    else if (tag["href"] != null)
                    {
                        // Convert URL separators into Windows path separators
                        string href = tag["href"].Value.Replace('/', '\\');
                        if (IsPhoto(href))
                        {
                            link = href;
                        }
                    }

                    // Preprocessing of regular character stream starts with clean sheet after tag
                    htmlPreprocess.Reset();
                }
                else
                {
                    // Got a character; the preprocessor may swallow it by returning NullChar
                    ch = htmlPreprocess.Add(ch);
                    if (ch != HtmlPreprocess.NullChar)
                    {
                        // Title text takes precedence over caption text
                        if (collectingTitle)
                        {
                            title += ch;
                        }
                        else if (collectingCaption)
                        {
                            caption.AddChar(ch);
                        }
                    }
                }
            }

            // Previously the default title was only ever applied by a stray closing title tag;
            // apply it whenever the page did not supply a completed title.
            if (!titleAssigned)
            {
                slideShow.Title = defaultTitle;
            }

            return(slideShow);
        }
Пример #4
0
        // Read one HTML page and return an Event list
        /// <summary>
        /// Reads one HTML events page and converts it into an <see cref="EventList"/>.
        /// </summary>
        /// <remarks>
        /// Text inside h2 becomes the list title. Every other run of text, terminated by a
        /// p, br or hr tag (or by the end of the page), becomes one event. Leading spaces and
        /// leading '+' characters are counted as the event's provisional level, which
        /// Relevel() later replaces. An href seen while an event is being collected is read
        /// through ReadSlideShow and attached to that event.
        /// </remarks>
        /// <param name="aEventsFile">Path of the HTML events file.</param>
        /// <param name="aDiagnostic">
        /// Null on success. Set to a message when the events file cannot be read; when a
        /// linked slide show fails, carries whatever ReadSlideShow reported.
        /// </param>
        /// <returns>The populated event list, or null when the page or any linked slide show could not be read.</returns>
        public EventList ReadEvents(string aEventsFile, out string aDiagnostic)
        {
            aDiagnostic = null;

            // NOTE(review): Replace swaps every ".htm" occurrence, so "x.html" becomes "x.xmll"
            // and a directory name containing ".htm" is rewritten too. Confirm whether other
            // code derives the same name before switching to an extension-only change.
            string    xmlFilePath = aEventsFile.Replace(".htm", ".xml");
            EventList events      = new EventList(xmlFilePath);

            string html = ReadFile(aEventsFile);
            if (html == null)
            {
                aDiagnostic = "ReadEvents: bad HTML Events file " + aEventsFile;
                return(null);
            }

            // Slide show links are resolved against the directory of the events file
            string eventsDirectory = GetDirectory(aEventsFile);

            ParseHTML parse = new ParseHTML();
            parse.Source = html;

            HtmlPreprocess htmlPreprocess = new HtmlPreprocess();
            string         name           = "";   // Collect stream of characters in HTML source
            int            indent         = 0;    // Leading spaces / pluses seen before the name
            SlideShow      slideShow      = null; // Collect slide show from href

            while (!parse.Eof())
            {
                char ch = parse.Parse();
                if (ch == 0)
                {
                    // Parse() returned 0: a tag was consumed instead of a character.
                    // Tag names are ASCII identifiers, so compare them ordinally rather
                    // than with the current culture's rules.
                    AttributeList tag     = parse.GetTag();
                    string        tagName = tag.Name;

                    if (string.Equals(tagName, "h2", StringComparison.OrdinalIgnoreCase))
                    {
                        // Start collecting title
                        name   = "";
                        indent = 0;
                    }
                    else if (string.Equals(tagName, "/h2", StringComparison.OrdinalIgnoreCase))
                    {
                        // Title is now complete
                        events.Title = name.Trim();
                        name         = "";
                        indent       = 0;
                    }
                    else if (string.Equals(tagName, "p", StringComparison.OrdinalIgnoreCase) ||
                             string.Equals(tagName, "br", StringComparison.OrdinalIgnoreCase) ||
                             string.Equals(tagName, "hr", StringComparison.OrdinalIgnoreCase))
                    {
                        // End of line, check whether we have an event
                        if (name.Length > 0)
                        {
                            // Use indent as the level for now
                            events.Add(indent, name.Trim(), slideShow);
                        }

                        // Reset for next event
                        name      = "";
                        indent    = 0;
                        slideShow = null;
                    }
                    else if (tag["href"] != null)
                    {
                        // Convert URL separators into Windows path separators
                        string href = tag["href"].Value.Replace('/', '\\');

                        // Strip any anchor: we cannot handle it
                        int anchorAt = href.IndexOf('#');
                        if (anchorAt >= 0)
                        {
                            href = href.Remove(anchorAt);
                        }

                        // A link that was nothing but an anchor names no file. Skip it instead
                        // of handing the bare directory to ReadSlideShow and, if that read
                        // fails, abandoning the whole event list.
                        if (href.Length > 0)
                        {
                            // Process child slide show file
                            slideShow = ReadSlideShow(eventsDirectory + href, out aDiagnostic);
                            if (slideShow == null)
                            {
                                return(null);
                            }
                        }
                    }

                    // Preprocessing of regular character stream starts with clean sheet after tag
                    htmlPreprocess.Reset();
                }
                else
                {
                    // The preprocessor may swallow the character by returning NullChar
                    ch = htmlPreprocess.Add(ch);
                    if (ch == HtmlPreprocess.NullChar)
                    {
                        continue;
                    }

                    // Spaces and pluses ahead of the name only count towards the indent
                    // (an initial plus marks a subevent); anywhere else they are ordinary
                    // characters of the event name.
                    bool isIndentMarker = (ch == ' ') || (ch == '+');
                    if (isIndentMarker && (name.Length == 0))
                    {
                        indent++;
                    }
                    else
                    {
                        name += ch;
                    }
                }
            }

            // End of event list, check for any outstanding event
            if (name.Length > 0)
            {
                events.Add(indent, name.Trim(), slideShow);
            }

            // The event levels were arbitrarily set as a measure of the indentation
            // of each event name in the HTML. Reassign sequential levels.
            events.Relevel();

            return(events);
        }
Пример #5
0
        /// <summary>
        /// Processes one page of the crawl: reads the HTML at <paramref name="strURL"/>, collects
        /// the href/src links it contains, hands them to HTTPDownload, and then calls itself for
        /// each collected link while the depth is below the limit entered in recursiveLevelTextBox.
        /// Progress is reported to the form's controls through Invoke.
        /// </summary>
        /// <param name="currentRecursive">Depth of this call; shown in the form and compared with the configured limit.</param>
        /// <param name="strURL">URL of the page to process.</param>
        /// <param name="fromURL">Referring URL passed on to ReadHTML; may be null.</param>
        /// <param name="strDownloadPath">Passed on to HTTPDownload.</param>
        /// <param name="startLinkURL">When non-empty, links are ignored until one whose absolute URL equals this value is met.</param>
        /// <param name="endLinkURL">When non-empty, link collection stops after a link whose absolute URL equals this value.</param>
        /// <param name="onlyLink">When true, the HTTPDownload step is skipped.</param>
        /// <returns>
        /// The value returned by ReadHTML for <paramref name="strURL"/>, or an empty string when
        /// the operation was cancelled before the page was read.
        /// </returns>
        private string ProcessURL(int currentRecursive, string strURL, string fromURL, string strDownloadPath, string startLinkURL, string endLinkURL, Boolean onlyLink)
        {
            Invoke(new MethodInvoker(delegate()
            {
                // Refresh the depth display, the status strip and the tray icon text
                recursiveLevelLabel.Text = "" + currentRecursive;
                recursiveLevelLabel.Update();
                toolStripStatusLabel.Text = strURL;
                statusStrip.Update();
                // Only the first 50 characters of the URL go into the tray icon text
                notifyIcon1.Text = "CSMDown:" + ((strURL.Length > 50) ? strURL.Substring(0, 50) : strURL);
            }));

            // -----< Read the HTML >-----
            //
            // A null fromURL concatenates as an empty string, matching the old "skip if null" logic
            AppendStatusText("ProcessURL : " + strURL + "\r\n" + "RefererURL : " + fromURL + "\r\n");

            string strHTML = "";
            string mimeType = "";
            if (m_cansel == true) { return mimeType; }  // Cancelled
            mimeType = ReadHTML(strURL, ref strHTML, fromURL, currentRecursive);
            if (strHTML == null)
            {
                // Log: the HTML could not be read
                AppendStatusText("HTMLを読み込めませんでした。:" + strURL + "\r\n");
                return mimeType;
            }
            if (isHTML(mimeType) == false)
            {
                // Log: not HTML, ignored
                AppendStatusText("HTMLではないので無視します。:" + strURL + "\r\n");
                return mimeType;
            }
            insertAlreadyAccessedURL(strURL, currentRecursive);

            // -----< Navigate the browser window >-----
            //
            Invoke(new MethodInvoker(delegate()
            {
                if ((m_BrowserForm != null) && (m_BrowserForm.IsDisposed == false))
                {
                    m_BrowserForm.Navigate(strURL);
                }
            }));

            if (m_cansel == true) { return mimeType; }  // Cancelled

            // -----< Parse the HTML >-----
            //
            AppendStatusText("HTMLをパースします。\r\n");   // Log: parsing the HTML

            // Absolute URLs of the links found on this page, in document order
            ArrayList URLArray = new ArrayList();

            Boolean startLinkProcessed = false;
            ParseHTML parser = new ParseHTML();
            parser.Source = strHTML;
            while (!parser.Eof())
            {
                if (m_cansel == true) { return mimeType; }  // Cancelled

                // Parse() yields 0 when it has consumed a tag rather than a character
                char ch = parser.Parse();
                if (ch != 0) { continue; }

                string linkURL = "";

                // Take the link target from href, or from src when the tag has one
                // (src wins when both are present)
                AttributeList tag = parser.GetTag();
                if (tag["href"] != null)
                {
                    linkURL = (string)( tag["href"].Value );
                }
                if (tag["src"] != null)
                {
                    linkURL = (string)( tag["src"].Value );
                }

                // An attribute without a value gives nothing to follow
                if (linkURL == null) { continue; }

                // Drop "#" and everything after it
                int fragmentIndex = linkURL.IndexOf('#');
                if (fragmentIndex >= 0)
                {
                    linkURL = linkURL.Substring(0, fragmentIndex);
                }
                if (linkURL.Length == 0) { continue; }

                // A null result means no usable absolute URL could be produced for the link
                string linkAbsolute = CreateLinkURL(linkURL, strURL);
                if (linkAbsolute == null) { continue; }

                // Ignore everything that precedes the start link, when one is given
                if ((startLinkURL.Length > 0) && (startLinkProcessed == false))
                {
                    if (startLinkURL.Equals(linkAbsolute))
                    {
                        startLinkProcessed = true;
                    }
                    else
                    {
                        // Log: before StartLink, ignored
                        AppendStatusText("StartLink以前なので無視します:" + linkURL + "\r\n");
                        continue;
                    }
                }

                // Line breaks inside the attribute value are removed before the URL is stored
                linkAbsolute = linkAbsolute.Replace("\r\n", "").Replace("\n", "");
                URLArray.Add(linkAbsolute);

                // Stop collecting once the end link has been stored
                if ((endLinkURL.Length > 0) && endLinkURL.Equals(linkAbsolute))
                {
                    // Log: EndLink detected, parsing ends
                    AppendStatusText("EndLinkを検出。パースを終了します。:" + linkURL + "\r\n");
                    break;
                }
            }
            AppendStatusText("リンク数:" + URLArray.Count + "\r\n");   // Log: number of links
            if (m_cansel == true) { return mimeType; }  // Cancelled

            // -----< Download >-----
            //
            AppendStatusText("ダウンロードを開始します。\r\n");   // Log: starting download
            if (onlyLink == false)
            {
                Boolean result = HTTPDownload(URLArray, strDownloadPath, strURL, currentRecursive);
                if (result != true)
                {
                    AppendStatusText("ダウンロードに失敗しました。");   // Log: download failed
                }
            }

            // -----< Recurse into the linked pages >-----
            //
            // NOTE(review): the text boxes and check box below are read directly, without
            // Invoke, unlike the control writes above - confirm which thread runs this method.
            int recursiveMax;
            if (!int.TryParse(recursiveLevelTextBox.Text, out recursiveMax))
            {
                // The depth limit is free text; a non-numeric value used to throw
                // FormatException in the middle of the crawl. Log it and stop descending.
                // (Message: "The recursion level setting is invalid.")
                AppendStatusText("再帰レベルの指定が不正です。:" + recursiveLevelTextBox.Text + "\r\n");
                return mimeType;
            }

            if (currentRecursive < recursiveMax)
            {
                // Pass 0 handles links under the high-priority path, pass 1 the remaining links
                for (int pass = 0; pass < 2; pass++)
                {
                    // Without a high-priority path there is nothing to do in pass 0
                    if ((pass == 0) && (highPriorityPathTextBox.Text.Length <= 0))
                    {
                        continue;
                    }

                    // "Priority only" suppresses pass 1 when a high-priority path is set
                    if ((pass == 1) && (highPriorityPathTextBox.Text.Length > 0) && (priorityOnlyCheckBox.Checked == true))
                    {
                        continue;
                    }

                    for (int i = 0; i < URLArray.Count; i++)
                    {
                        if (m_cansel == true) { break; }  // Cancelled

                        string childURL = (string)URLArray[i];

                        try
                        {
                            // URL prefixes are compared ordinally, not by culture rules
                            if (pass == 0)
                            {
                                // Skip links that are not under the high-priority path
                                string priorityURL = highPriorityPathTextBox.Text.Trim();
                                if (!childURL.StartsWith(priorityURL, StringComparison.Ordinal))
                                {
                                    continue;
                                }
                            }
                            else
                            {
                                // NOTE(review): this tests the current page (strURL), not the
                                // candidate link, and does not Trim() the text as pass 0 does.
                                // It looks like childURL was intended; left unchanged because
                                // switching would alter which pages get crawled - confirm.
                                if ((highPriorityPathTextBox.Text.Length > 0) && strURL.StartsWith(highPriorityPathTextBox.Text, StringComparison.Ordinal))
                                {
                                    continue;
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            AppendStatusText("URI Compare failed\r\n" + e.Message + "\r\n");
                            continue;
                        }

                        if (isAlreadyAccessed(childURL, currentRecursive + 1) == true)
                        {
                            // Log: URL already accessed
                            AppendStatusText("アクセス済みURL:" + childURL + "\r\n");
                            continue;
                        }

                        // -----< Recursive call >-----
                        //
                        string res = ProcessURL(currentRecursive + 1, childURL, strURL, strDownloadPath, "", "", false);
                        if (isHTML(res) != true)
                        {
                            // Log: it was not HTML
                            AppendStatusText("HTMLではありませんでした。:" + childURL + "\r\n");
                        }

                        // The nested call overwrote the depth and URL display; restore this level's values
                        Invoke(new MethodInvoker(delegate()
                        {
                            recursiveLevelLabel.Text = "" + currentRecursive;
                            recursiveLevelLabel.Update();
                            toolStripStatusLabel.Text = strURL;
                            statusStrip.Update();
                        }));

                        if (m_cansel == true) { return mimeType; }  // Cancelled
                    }
                }
            }

            return mimeType;
        }

        // Appends text to statusTextBox through Invoke and repaints the box.
        private void AppendStatusText(string text)
        {
            Invoke(new MethodInvoker(delegate()
            {
                statusTextBox.AppendText(text);
                statusTextBox.Update();
            }));
        }