Пример #1
0
 /// <summary>
 /// Rewrites a matched js/css/image/flash tag so that it references the locally saved copy of the resource.
 /// Forwards to the ten-argument overload, passing <c>false</c> for its asynchronous-download flag and
 /// <c>null</c> for its dictionary argument.
 /// </summary>
 /// <param name="match">Regex match of the js, css, image, flash (etc.) tag that was found.</param>
 /// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's url structure.</param>
 /// <param name="resourceUrl">Url of the current html page or css file; relative addresses found in its content are resolved against it before downloading.</param>
 /// <param name="referenceDir">Directory of the file that references the resource (may be the html file's directory or the css file's directory).</param>
 /// <param name="saveDir">Directory in which downloaded files are stored.</param>
 /// <param name="regGroupName">Name of the regex group that captures the url.</param>
 /// <param name="urlHandle">Callback applied to the captured url before it is used.</param>
 /// <param name="downloadHandle">Custom download callback; returns the local path of the downloaded file or a placeholder.</param>
 /// <returns>The tag text after replacement.</returns>
 internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle)
 {
     // Value handed to the full overload's asynchronous-download flag.
     const bool downloadAsynchronously = false;

     return MatchUrl(
         match,
         dirConfig,
         resourceUrl,
         referenceDir,
         saveDir,
         regGroupName,
         urlHandle,
         downloadHandle,
         downloadAsynchronously,
         null);
 }
Пример #2
0
        /// <summary>
        /// Rewrites a matched js/css/image/flash tag so that it references the locally saved copy of the resource.
        /// The captured url is resolved to an absolute url, the resource is downloaded (synchronously or
        /// asynchronously), and the url inside the matched text is replaced by the local path or a placeholder.
        /// </summary>
        /// <param name="match">Regex match of the js, css, image, flash (etc.) tag that was found.</param>
        /// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's url structure.</param>
        /// <param name="resourceUrl">Url of the current html page or css file; relative addresses are resolved against it. When null, the page url of this instance is used.</param>
        /// <param name="referenceDir">Directory of the file that references the resource (may be the html file's directory or the css file's directory).</param>
        /// <param name="saveDir">Directory in which downloaded files are stored; overridden when <c>dirConfig.UseWebSite</c> is set.</param>
        /// <param name="regGroupName">Name of the regex group that captures the url.</param>
        /// <param name="urlHandle">Optional callback applied to the captured url (the href or src), e.g. to strip quotes.</param>
        /// <param name="downloadHandle">Optional custom download callback; returns the local path of the downloaded file or a placeholder.</param>
        /// <param name="async">Whether the resource is downloaded asynchronously.</param>
        /// <param name="dic">Dictionary recording asynchronous downloads (placeholder to final path); must not be null when <paramref name="async"/> is true.</param>
        /// <returns>
        /// The matched text with the url replaced, or the matched text unchanged when no url could be extracted.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="async"/> is true and <paramref name="dic"/> is null.</exception>
        internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle, bool async, Dictionary <string, string> dic)
        {
            // The address as written in the document (may be relative or absolute).
            string href = match.Groups[regGroupName].Value;
            if (null != urlHandle)
            {
                href = urlHandle(href);
            }

            // string.Replace rejects a null or empty search string, so when the group did not capture
            // anything there is nothing to download or rewrite: hand the tag back untouched.
            if (string.IsNullOrEmpty(href))
            {
                return match.Value;
            }

            // Absolute url of the resource.
            string url = PathUtility.ConvertToAbsoluteHref(resourceUrl ?? this.pageUrl, href);

            // When resources are stored following the site's url structure, recompute the save directory.
            if (dirConfig.UseWebSite)
            {
                saveDir = PathUtility.GetSaveDir(this.PageUrl, url, dirConfig.HtmlDirPath);
            }

            // Path (or placeholder) that will replace the url inside the tag.
            string localPath;

            if (async)
            {
                // Fail before starting a download whose completion callback could not record its result.
                if (null == dic)
                {
                    throw new ArgumentNullException("dic");
                }

                if (null == downloadHandle)
                {
                    // No custom downloader: emit a placeholder now and resolve it once the download completes.
                    string placeHolder = this.GetPlaceHolder();
                    localPath = placeHolder;

                    Spider.SaveResourceAsync(url, saveDir, new Action <string>(delegate(string filePath)
                    {
                        lock (dic)
                        {
                            // A null file path means the download failed; fall back to the absolute url.
                            dic[placeHolder] = null == filePath ? url : PathUtility.GetRelativePath(referenceDir, filePath);
                        }
                    }));
                }
                else
                {
                    localPath = downloadHandle(url);
                }

                // Register the placeholder unless the completion callback has already stored a result.
                // The check and the insert share one lock so the callback cannot slip in between them.
                lock (dic)
                {
                    if (!dic.ContainsKey(localPath))
                    {
                        dic.Add(localPath, null);
                    }
                }
            }
            else
            {
                if (null == downloadHandle)
                {
                    // Save the file; on success this yields the local path of the saved file.
                    localPath = Spider.SaveResource(url, saveDir);
                }
                else
                {
                    localPath = downloadHandle(url);
                }

                // If saving failed, keep referencing the absolute url; otherwise reference the local copy.
                if (string.IsNullOrEmpty(localPath))
                {
                    localPath = url;
                }
                else
                {
                    localPath = PathUtility.GetRelativePath(referenceDir, localPath);
                }
            }

            // Swap the original reference inside the tag for the local path / placeholder.
            return match.Value.Replace(href, localPath);
        }
Пример #3
0
        /// <summary>Enumerates all of the matches with the specified regex, invoking the callback for each.</summary>
        /// <remarks>
        /// This repeatedly hands out the same Match instance, updated with new information.
        /// </remarks>
        /// <typeparam name="TState">Type of the caller-supplied state threaded through to the callback.</typeparam>
        /// <param name="regex">The regex being run; stored in <c>runregex</c> and consulted for <c>RightToLeft</c> to pick the scan direction.</param>
        /// <param name="text">The input text; the scan range is set to index 0 through <c>text.Length</c>.</param>
        /// <param name="textstart">Position at which scanning starts; stored in both <c>runtextstart</c> and <c>runtextpos</c>.</param>
        /// <param name="state">State passed by reference to every <paramref name="callback"/> invocation.</param>
        /// <param name="callback">Invoked with each successful match; returning <c>false</c> ends the scan.</param>
        /// <param name="timeout">Match timeout; when equal to <c>Regex.InfiniteMatchTimeout</c> no timeout checks are performed.</param>
        internal void Scan <TState>(Regex regex, string text, int textstart, ref TState state, MatchCallback <TState> callback, TimeSpan timeout)
        {
            // Store arguments into fields for derived runner to examine
            runregex   = regex;
            runtext    = text;
            runtextbeg = 0;
            runtextend = text.Length;
            runtextpos = runtextstart = textstart;

            // Handle timeout argument
            _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
            bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout;

            if (!ignoreTimeout)
            {
                // We are using Environment.TickCount and not Stopwatch for performance reasons.
                // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt
                // overflow; it will still stay ahead of Environment.TickCount for comparisons made
                // in DoCheckTimeout().
                _timeout             = (int)(timeout.TotalMilliseconds + 0.5); // Round;
                _timeoutOccursAt     = Environment.TickCount + _timeout;
                _timeoutChecksToSkip = TimeoutCheckFrequency;
            }

            // Configure the additional value to "bump" the position along each time we loop around
            // to call FindFirstChar again, as well as the stopping position for the loop.  We generally
            // bump by 1 and stop at runtextend, but if we're examining right-to-left, we instead bump
            // by -1 and stop at runtextbeg.
            int bump = 1, stoppos = runtextend;

            if (runregex.RightToLeft)
            {
                bump    = -1;
                stoppos = runtextbeg;
            }

            // Main loop: FindFirstChar/Go + bump until the ending position.
            // 'initialized' tracks whether InitializeForGo has been run since the last match was handed out.
            bool initialized = false;

            while (true)
            {
#if DEBUG
                if (runregex.IsDebug)
                {
                    Debug.WriteLine("");
                    Debug.WriteLine($"Search range: from {runtextbeg} to {runtextend}");
                    Debug.WriteLine($"Firstchar search starting at {runtextpos} stopping at {stoppos}");
                }
#endif

                // Find the next potential location for a match in the input.
                if (FindFirstChar())
                {
                    if (!ignoreTimeout)
                    {
                        DoCheckTimeout();
                    }

                    // Ensure that the runner is initialized.  This includes initializing all of the state in the runner
                    // that Go might use, such as the backtracking stack, as well as a Match object for it to populate.
                    if (!initialized)
                    {
                        InitializeForGo();
                        initialized = true;
                    }

#if DEBUG
                    if (runregex.IsDebug)
                    {
                        Debug.WriteLine($"Executing engine starting at {runtextpos}");
                        Debug.WriteLine("");
                    }
#endif

                    // See if there's a match at this position.
                    Go();

                    // See if we have a match.
                    Match match = runmatch !;
                    if (match._matchcount[0] > 0)
                    {
                        // Hand it out to the callback in canonical form.
                        match.Tidy(runtextpos);
                        // Clearing the flag makes the next candidate position run InitializeForGo again.
                        initialized = false;
                        if (!callback(ref state, match))
                        {
                            // If the callback returns false, we're done.
                            return;
                        }

                        // Reset state for another iteration.
                        runtrackpos = runtrack !.Length;
                        runstackpos = runstack !.Length;
                        runcrawlpos = runcrawl !.Length;
                        // An empty match: stop if we are already at the stopping position, otherwise step
                        // one position in the scan direction before searching again (presumably so the
                        // same empty match is not reported again).
                        if (match.Length == 0)
                        {
                            if (runtextpos == stoppos)
                            {
                                return;
                            }

                            runtextpos += bump;
                        }

                        // Loop around to perform next match from where we left off.
                        continue;
                    }

                    // Ran Go but it didn't find a match. Reset state for another iteration.
                    runtrackpos = runtrack !.Length;
                    runstackpos = runstack !.Length;
                    runcrawlpos = runcrawl !.Length;
                }

                // We failed to match at this position.  If we're at the stopping point, we're done.
                if (runtextpos == stoppos)
                {
                    return;
                }

                // Bump by one (in whichever direction is appropriate) and loop to go again.
                runtextpos += bump;
            }
        }
Пример #4
0
        /// <summary>Enumerates all of the matches with the specified regex, invoking the callback for each.</summary>
        /// <remarks>
        /// This optionally repeatedly hands out the same Match instance, updated with new information.
        /// <paramref name="reuseMatchObject"/> should be set to false if the Match object is handed out to user code.
        /// </remarks>
        /// <typeparam name="TState">Type of the caller-supplied state threaded through to the callback.</typeparam>
        /// <param name="regex">The regex being run; stored in <c>runregex</c> and consulted for <c>RightToLeft</c> to pick the scan direction.</param>
        /// <param name="text">The input text; the scan range is set to index 0 through <c>text.Length</c>.</param>
        /// <param name="textstart">Position at which scanning starts; stored in both <c>runtextstart</c> and <c>runtextpos</c>.</param>
        /// <param name="state">State passed by reference to every <paramref name="callback"/> invocation.</param>
        /// <param name="callback">Invoked with each successful match; returning <c>false</c> ends the scan.</param>
        /// <param name="reuseMatchObject">
        /// When false, the runner drops its reference to the Match before handing it to the callback, so a
        /// new instance is created for the next match; when true, the same instance is kept and reused.
        /// </param>
        /// <param name="timeout">Match timeout; when equal to <c>Regex.InfiniteMatchTimeout</c> no timeout checks are performed.</param>
        internal void Scan <TState>(Regex regex, string text, int textstart, ref TState state, MatchCallback <TState> callback, bool reuseMatchObject, TimeSpan timeout)
        {
            // Handle timeout argument
            _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
            bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout;

            if (!ignoreTimeout)
            {
                // We are using Environment.TickCount and not Stopwatch for performance reasons.
                // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt
                // overflow; it will still stay ahead of Environment.TickCount for comparisons made
                // in DoCheckTimeout().
                _timeout             = (int)(timeout.TotalMilliseconds + 0.5); // Round;
                _timeoutOccursAt     = Environment.TickCount + _timeout;
                _timeoutChecksToSkip = TimeoutCheckFrequency;
            }

            // Configure the additional value to "bump" the position along each time we loop around
            // to call FindFirstChar again, as well as the stopping position for the loop.  We generally
            // bump by 1 and stop at text.Length, but if we're examining right-to-left, we instead bump
            // by -1 and stop at 0.
            int bump = 1, stoppos = text.Length;

            if (regex.RightToLeft)
            {
                bump    = -1;
                stoppos = 0;
            }

            // Store remaining arguments into fields now that we're going to start the scan.
            // These are referenced by the derived runner.
            runregex     = regex;
            runtextstart = runtextpos = textstart;
            runtext      = text;
            runtextend   = text.Length;
            runtextbeg   = 0;

            // Main loop: FindFirstChar/Go + bump until the ending position.
            // 'initialized' tracks whether InitializeForGo has been run since the last match was handed out.
            bool initialized = false;

            while (true)
            {
#if DEBUG
                if (regex.IsDebug)
                {
                    Debug.WriteLine("");
                    Debug.WriteLine($"Search range: from {runtextbeg} to {runtextend}");
                    Debug.WriteLine($"Firstchar search starting at {runtextpos} stopping at {stoppos}");
                }
#endif

                // Find the next potential location for a match in the input.
                if (FindFirstChar())
                {
                    if (!ignoreTimeout)
                    {
                        DoCheckTimeout();
                    }

                    // Ensure that the runner is initialized.  This includes initializing all of the state in the runner
                    // that Go might use, such as the backtracking stack, as well as a Match object for it to populate.
                    if (!initialized)
                    {
                        InitializeForGo();
                        initialized = true;
                    }

#if DEBUG
                    if (regex.IsDebug)
                    {
                        Debug.WriteLine($"Executing engine starting at {runtextpos}");
                        Debug.WriteLine("");
                    }
#endif

                    // See if there's a match at this position.
                    Go();

                    // See if we have a match.
                    Match match = runmatch !;
                    if (match._matchcount[0] > 0)
                    {
                        // Hand it out to the callback in canonical form.
                        if (!reuseMatchObject)
                        {
                            // We're not reusing match objects, so null out our field reference to the instance.
                            // It'll be recreated the next time one is needed.
                            runmatch = null;
                        }
                        match.Tidy(runtextpos);
                        // Clearing the flag makes the next candidate position run InitializeForGo again.
                        initialized = false;
                        if (!callback(ref state, match))
                        {
                            // If the callback returns false, we're done.
                            // Drop reference to text to avoid keeping it alive in a cache.
                            runtext = null !;
                            if (reuseMatchObject)
                            {
                                // We're reusing the single match instance, so clear out its text as well.
                                // We don't do this if we're not reusing instances, as in that case we're
                                // dropping the whole reference to the match, and we no longer own the instance
                                // having handed it out to the callback.
                                match.Text = null !;
                            }
                            return;
                        }

                        // Now that we've matched successfully, update the starting position to reflect
                        // the current position, just as Match.NextMatch() would pass in _textpos as textstart.
                        runtextstart = runtextpos;

                        // Reset state for another iteration.
                        runtrackpos = runtrack !.Length;
                        runstackpos = runstack !.Length;
                        runcrawlpos = runcrawl !.Length;
                        // An empty match: stop if we are already at the stopping position, otherwise step
                        // one position in the scan direction before searching again (presumably so the
                        // same empty match is not reported again).
                        if (match.Length == 0)
                        {
                            if (runtextpos == stoppos)
                            {
                                // Drop reference to text to avoid keeping it alive in a cache.
                                runtext = null !;
                                if (reuseMatchObject)
                                {
                                    // See above comment.
                                    match.Text = null !;
                                }
                                return;
                            }

                            runtextpos += bump;
                        }

                        // Loop around to perform next match from where we left off.
                        continue;
                    }

                    // Ran Go but it didn't find a match. Reset state for another iteration.
                    runtrackpos = runtrack !.Length;
                    runstackpos = runstack !.Length;
                    runcrawlpos = runcrawl !.Length;
                }

                // We failed to match at this position.  If we're at the stopping point, we're done.
                if (runtextpos == stoppos)
                {
                    runtext = null; // drop reference to text to avoid keeping it alive in a cache
                    // runmatch may be null here: it is nulled above when match objects are not reused.
                    if (runmatch != null)
                    {
                        runmatch.Text = null !;
                    }
                    return;
                }

                // Bump by one (in whichever direction is appropriate) and loop to go again.
                runtextpos += bump;
            }
        }
Пример #5
0
        /// <summary>
        /// Rewrites a matched js/css/image/flash tag so that it references the locally saved copy of the resource.
        /// The captured url is resolved to an absolute url, the resource is downloaded synchronously, and the
        /// url inside the matched text is replaced by the path of the local copy.
        /// </summary>
        /// <param name="match">Regex match of the js, css, image, flash (etc.) tag that was found.</param>
        /// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's url structure.</param>
        /// <param name="resourceUrl">Url of the current html page or css file; relative addresses are resolved against it. When null, the url of this instance is used.</param>
        /// <param name="referenceDir">Directory of the file that references the resource (may be the html file's directory or the css file's directory).</param>
        /// <param name="saveDir">Directory in which downloaded files are stored; overridden when <c>dirConfig.UseWebSite</c> is set.</param>
        /// <param name="regGroupName">Name of the regex group that captures the url.</param>
        /// <param name="urlHandle">Optional callback applied to the captured url (the href or src), e.g. to strip quotes.</param>
        /// <param name="downloadHandle">Optional custom download callback; returns the local path of the downloaded file or a placeholder.</param>
        /// <param name="async">Whether asynchronous download was requested. Asynchronous download is not implemented here (see the note in the body).</param>
        /// <param name="dic">Dictionary intended to record asynchronous downloads; not used by this implementation.</param>
        /// <returns>
        /// The matched text with the url replaced, or the matched text unchanged when no url could be extracted.
        /// </returns>
        internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle, bool async, Dictionary<string, string> dic)
        {
            // The address as written in the document (may be relative or absolute).
            string href = match.Groups[regGroupName].Value;
            if (null != urlHandle)
            {
                href = urlHandle(href);
            }

            // string.Replace rejects a null or empty search string, so when the group did not capture
            // anything there is nothing to download or rewrite: hand the tag back untouched.
            if (string.IsNullOrEmpty(href))
            {
                return match.Value;
            }

            // Absolute url of the resource.
            string url = PathUtility.ConvertToAbsoluteHref(resourceUrl ?? this.url, href);

            // When resources are stored following the site's url structure, recompute the save directory.
            if (dirConfig.UseWebSite)
            {
                saveDir = PathUtility.GetSaveDir(this.Url, url, dirConfig.HtmlDirPath);
            }

            // Path that will replace the url inside the tag.
            // NOTE(review): the asynchronous download path is disabled (its former implementation had been
            // commented out), so when 'async' is true nothing is downloaded and the url is replaced by an
            // empty string. Confirm whether callers rely on this.
            string localPath = "";

            if (!async)
            {
                if (null == downloadHandle)
                {
                    // Save the file; on success this yields the local path of the saved file.
                    localPath = SaveResource(url, saveDir);
                }
                else
                {
                    localPath = downloadHandle(url);
                }

                // If saving failed, keep referencing the absolute url; otherwise reference the local copy.
                if (string.IsNullOrEmpty(localPath))
                {
                    localPath = url;
                }
                else
                {
                    localPath = PathUtility.GetRelativePath(referenceDir, localPath);
                }
            }

            // Swap the original reference inside the tag for the local path.
            return match.Value.Replace(href, localPath);
        }
Пример #6
0
 /// <summary>
 /// Rewrites a matched js/css/image/flash tag so that it references the locally saved copy of the resource.
 /// Delegates to the ten-argument overload with <c>false</c> and <c>null</c> as the two trailing arguments.
 /// </summary>
 /// <param name="match">Regex match of the js, css, image, flash (etc.) tag that was found.</param>
 /// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's url structure.</param>
 /// <param name="resourceUrl">Url of the current html page or css file; relative addresses found in its content are resolved against it before downloading.</param>
 /// <param name="referenceDir">Directory of the file that references the resource (may be the html file's directory or the css file's directory).</param>
 /// <param name="saveDir">Directory in which downloaded files are stored.</param>
 /// <param name="regGroupName">Name of the regex group that captures the url.</param>
 /// <param name="urlHandle">Callback applied to the captured url before it is used.</param>
 /// <param name="downloadHandle">Custom download callback; returns the local path of the downloaded file or a placeholder.</param>
 /// <returns>The tag text after replacement.</returns>
 internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle)
 {
     // All caller-supplied arguments are passed through untouched; only the two trailing ones are fixed here.
     string replacedTag = MatchUrl(
         match, dirConfig, resourceUrl, referenceDir, saveDir,
         regGroupName, urlHandle, downloadHandle,
         false, null);

     return replacedTag;
 }
Пример #7
0
 /// <summary>
 /// Adapts a one-argument callback to the <c>CreateMatchAsyn</c> overload that takes a two-argument callback.
 /// </summary>
 /// <param name="stateObj">Match data forwarded unchanged to the other overload.</param>
 /// <param name="callback">Callback that receives only the first of the two arguments supplied by the other overload.</param>
 /// <returns>The value returned by the overload this call is forwarded to.</returns>
 public MessageCode CreateMatchAsyn(BaseMatchData stateObj, MatchCallback callback)
 {
     // Wrap the one-argument callback in a two-argument lambda; the second argument is dropped.
     return CreateMatchAsyn(
         stateObj,
         (first, ignored) => callback(first));
 }