Exemple #1
0
            /// <summary>
            /// Transfers the response for <c>_url</c> into a temp file obtained from
            /// <c>TempFileManager</c> and, once the transfer succeeds, records that file's
            /// path in <c>_filePath</c>. Progress is reported before and after the transfer.
            /// </summary>
            public void Download()
            {
                _tickableProgress.Message("Indexing " + _url);
                string filePath = TempFileManager.Instance.CreateTempFile();
                WebRequestWithCache request = new WebRequestWithCache(_url);

                // Each stream is acquired inside its own using header so that the response
                // stream is released even when opening the temp file throws.
                // FileMode.Open requires the file to exist already, so CreateTempFile is
                // presumably expected to have created it -- TODO confirm.
                // NOTE(review): a null response is passed straight to StreamHelper.Transfer,
                // exactly as before; confirm whether GetResponseStream can return null here.
                using (Stream response = request.GetResponseStream(WebRequestWithCache.CacheSettings.CHECKCACHE, _timeout))
                using (FileStream fileStream = new FileStream(filePath, FileMode.Open))
                {
                    StreamHelper.Transfer(response, fileStream);
                }

                // The path is published only after the transfer completed without throwing.
                _filePath = filePath;

                _tickableProgress.Tick();
            }
        /// <summary>
        /// Stores the page and the resources it references in the site storage, rewriting
        /// the resource URLs inside the page's HTML to point at the stored copies.
        /// The work is performed synchronously; cancellation is checked between steps.
        /// </summary>
        protected override void DoWork()
        {
            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // Fetch the base document only when one has not been populated already
            if (m_htmlDocument == null)
            {
                m_htmlDocument = HTMLDocumentHelper.GetHTMLDocFromURL(m_url);
            }

            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // Referenced URLs from the document: key is the URL, value is its type token
            Hashtable resourceUrls = HTMLDocumentHelper.GetResourceUrlsFromDocument(m_htmlDocument);

            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // The document's HTML is the starting point; resource paths inside it are
            // replaced one by one as each resource is processed.
            string pageHtml = HTMLDocumentHelper.HTMLDocToString(m_htmlDocument);

            foreach (DictionaryEntry entry in resourceUrls)
            {
                string resourceUrl = (string)entry.Key;
                string resourceKind = (string)entry.Value;

                string absoluteUrl = HTMLDocumentHelper.EscapeRelativeURL(m_url, resourceUrl);
                string urlPath = new Uri(absoluteUrl).AbsolutePath;
                string fileName = FileHelper.GetValidFileName(Path.GetFileName(urlPath));

                // URLs that yield no file name are left untouched in the HTML
                if (fileName != string.Empty)
                {
                    string relativePath;
                    bool isFrame = resourceKind == HTMLTokens.Frame || resourceKind == HTMLTokens.IFrame;

                    if (isFrame)
                    {
                        // Frame and iframe targets are fetched through a nested
                        // AsyncPageDownload and stored with an .htm extension.
                        fileName = Path.GetFileNameWithoutExtension(fileName) + ".htm";
                        relativePath = "referencedFiles/" + fileName;

                        AsyncPageDownload frameDownload = new AsyncPageDownload(
                            absoluteUrl,
                            m_siteStorage,
                            fileName,
                            m_rootPath + "referencedFiles/",
                            this.Target);
                        frameDownload.Start();
                        frameDownload.WaitUntilDone();
                    }
                    else
                    {
                        relativePath = "referencedFiles/" + fileName;
                        WebRequestWithCache request = new WebRequestWithCache(absoluteUrl);

                        // Copy the referenced resource into the site storage
                        using (Stream source = request.GetResponseStream())
                        {
                            if (source != null)
                            {
                                using (Stream destination = m_siteStorage.Open(m_rootPath + relativePath, AccessMode.Write))
                                {
                                    StreamHelper.Transfer(source, destination, 8192, true);
                                }
                            }
                        }
                    }

                    // Plain string replacement is used here. Regular expressions would be
                    // more flexible, but characters like ? / & have meaning in them and
                    // would need to be escaped.
                    pageHtml = pageHtml.Replace(UrlHelper.CleanUpUrl(resourceUrl), relativePath);
                }

                if (CancelRequested)
                {
                    AcknowledgeCancel();
                    return;
                }
            }

            // Escape any high ascii characters
            pageHtml = HTMLDocumentHelper.EscapeHighAscii(pageHtml.ToCharArray());

            // Write the rewritten HTML document to the site storage as UTF-8
            using (StreamWriter writer = new StreamWriter(m_siteStorage.Open(m_rootPath + m_rootFile, AccessMode.Write), Encoding.UTF8))
            {
                writer.Write(pageHtml);
            }

            m_siteStorage.RootFile = m_rootFile;
        }
Exemple #3
0
        /// <summary>
        /// Stores the page and the resources it references in the site storage, rewriting
        /// the resource URLs inside the page's HTML to point at the stored copies.
        /// The work is performed synchronously; cancellation is checked between steps.
        /// </summary>
        protected override void DoWork()
        {
            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // Fetch the base document only when one has not been populated already
            if (m_htmlDocument == null)
                m_htmlDocument = HTMLDocumentHelper.GetHTMLDocFromURL(m_url);

            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // Referenced URLs from the document: key is the URL, value is its type token
            Hashtable resourceUrls = HTMLDocumentHelper.GetResourceUrlsFromDocument(m_htmlDocument);

            if (CancelRequested)
            {
                AcknowledgeCancel();
                return;
            }

            // The document's HTML is the starting point; resource paths inside it are
            // replaced one by one as each resource is processed.
            string pageHtml = HTMLDocumentHelper.HTMLDocToString(m_htmlDocument);

            foreach (DictionaryEntry entry in resourceUrls)
            {
                string resourceUrl = (string)entry.Key;
                string resourceKind = (string)entry.Value;

                string absoluteUrl = HTMLDocumentHelper.EscapeRelativeURL(m_url, resourceUrl);
                string urlPath = new Uri(absoluteUrl).AbsolutePath;
                string fileName = FileHelper.GetValidFileName(Path.GetFileName(urlPath));

                // URLs that yield no file name are left untouched in the HTML
                if (fileName != string.Empty)
                {
                    string relativePath;
                    bool isFrame = resourceKind == HTMLTokens.Frame || resourceKind == HTMLTokens.IFrame;

                    if (isFrame)
                    {
                        // Frame and iframe targets are fetched through a nested
                        // AsyncPageDownload and stored with an .htm extension.
                        fileName = Path.GetFileNameWithoutExtension(fileName) + ".htm";
                        relativePath = "referencedFiles/" + fileName;

                        AsyncPageDownload frameDownload = new AsyncPageDownload(
                            absoluteUrl,
                            m_siteStorage,
                            fileName,
                            m_rootPath + "referencedFiles/",
                            this.Target);
                        frameDownload.Start();
                        frameDownload.WaitUntilDone();
                    }
                    else
                    {
                        relativePath = "referencedFiles/" + fileName;
                        WebRequestWithCache request = new WebRequestWithCache(absoluteUrl);

                        // Copy the referenced resource into the site storage
                        using (Stream source = request.GetResponseStream())
                        {
                            if (source != null)
                            {
                                using (Stream destination = m_siteStorage.Open(m_rootPath + relativePath, AccessMode.Write))
                                {
                                    StreamHelper.Transfer(source, destination, 8192, true);
                                }
                            }
                        }
                    }

                    // Plain string replacement is used here. Regular expressions would be
                    // more flexible, but characters like ? / & have meaning in them and
                    // would need to be escaped.
                    pageHtml = pageHtml.Replace(UrlHelper.CleanUpUrl(resourceUrl), relativePath);
                }

                if (CancelRequested)
                {
                    AcknowledgeCancel();
                    return;
                }
            }

            // Escape any high ascii characters
            pageHtml = HTMLDocumentHelper.EscapeHighAscii(pageHtml.ToCharArray());

            // Write the rewritten HTML document to the site storage as UTF-8
            using (StreamWriter writer = new StreamWriter(m_siteStorage.Open(m_rootPath + m_rootFile, AccessMode.Write), Encoding.UTF8))
            {
                writer.Write(pageHtml);
            }

            m_siteStorage.RootFile = m_rootFile;
        }
            /// <summary>
            /// Transfers the response for <c>_url</c> into a temp file obtained from
            /// <c>TempFileManager</c> and, once the transfer succeeds, records that file's
            /// path in <c>_filePath</c>. Progress is reported before and after the transfer.
            /// </summary>
            public void Download()
            {
                _tickableProgress.Message("Indexing " + _url);
                string filePath = TempFileManager.Instance.CreateTempFile();
                WebRequestWithCache request = new WebRequestWithCache(_url);

                // Each stream is acquired inside its own using header so that the response
                // stream is released even when opening the temp file throws.
                // FileMode.Open requires the file to exist already, so CreateTempFile is
                // presumably expected to have created it -- TODO confirm.
                // NOTE(review): a null response is passed straight to StreamHelper.Transfer,
                // exactly as before; confirm whether GetResponseStream can return null here.
                using (Stream response = request.GetResponseStream(WebRequestWithCache.CacheSettings.CHECKCACHE, _timeout))
                using (FileStream fileStream = new FileStream(filePath, FileMode.Open))
                {
                    StreamHelper.Transfer(response, fileStream);
                }

                // The path is published only after the transfer completed without throwing.
                _filePath = filePath;

                _tickableProgress.Tick();
            }