/// <summary>
/// Synchronous convenience overload: rewrites a matched resource tag (script, stylesheet, image, Flash, ...)
/// so that it references the locally saved copy of the resource.
/// </summary>
/// <param name="match">Regex match for the js/css/image/flash tag.</param>
/// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's URL structure.</param>
/// <param name="resourceUrl">URL of the current HTML page or CSS file; used to turn relative addresses found in its content into absolute ones for downloading.</param>
/// <param name="referenceDir">Directory of the file that references the resource (the HTML file's directory or the CSS file's directory).</param>
/// <param name="saveDir">Directory in which downloaded files are stored.</param>
/// <param name="regGroupName">Name of the regex group that captures the URL.</param>
/// <param name="urlHandle">Optional callback applied to the captured URL before it is used.</param>
/// <param name="downloadHandle">Optional custom download callback; it returns the local path of the downloaded file or a placeholder.</param>
/// <returns>The tag text after replacement.</returns>
internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle)
{
    // This overload never downloads asynchronously, so no tracking dictionary is supplied.
    const bool downloadAsynchronously = false;

    string rewrittenTag = MatchUrl(
        match,
        dirConfig,
        resourceUrl,
        referenceDir,
        saveDir,
        regGroupName,
        urlHandle,
        downloadHandle,
        downloadAsynchronously,
        null);

    return rewrittenTag;
}
/// <summary>
/// Downloads the resource referenced by a matched tag (script, stylesheet, image, Flash, ...) and returns
/// the tag text with the reference rewritten to point at the local copy.
/// </summary>
/// <param name="match">Regex match for the js/css/image/flash tag.</param>
/// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's URL structure.</param>
/// <param name="resourceUrl">URL of the current HTML page or CSS file; used to turn relative addresses found in its content into absolute ones for downloading. When null, the page URL of this instance is used.</param>
/// <param name="referenceDir">Directory of the file that references the resource (the HTML file's directory or the CSS file's directory).</param>
/// <param name="saveDir">Directory in which downloaded files are stored. Ignored when <c>dirConfig.UseWebSite</c> is set.</param>
/// <param name="regGroupName">Name of the regex group that captures the URL.</param>
/// <param name="urlHandle">Optional callback applied to the captured URL (the href or src value), e.g. to strip quotes.</param>
/// <param name="downloadHandle">Optional custom download callback; it returns the local path of the downloaded file or a placeholder.</param>
/// <param name="async">Whether to download asynchronously.</param>
/// <param name="dic">Dictionary recording asynchronous downloads, keyed by placeholder. Required when <paramref name="async"/> is true.</param>
/// <returns>
/// The tag text after replacement. In asynchronous mode the reference is replaced by a placeholder whose final
/// value is stored in <paramref name="dic"/> once the download completes. If the captured URL is empty the
/// tag is returned unchanged.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="async"/> is true and <paramref name="dic"/> is null.</exception>
internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle, bool async, Dictionary<string, string> dic)
{
    // Fail before any download is started rather than after it has been kicked off.
    if (async && null == dic)
    {
        throw new ArgumentNullException("dic");
    }

    // The reference exactly as it should be searched for in the tag (may be relative or absolute).
    string href = match.Groups[regGroupName].Value;
    if (null != urlHandle)
    {
        href = urlHandle(href);
    }

    // string.Replace rejects a null/empty search string, and there is nothing to download anyway.
    if (string.IsNullOrEmpty(href))
    {
        return match.Value;
    }

    // Absolute URL of the resource.
    string url = PathUtility.ConvertToAbsoluteHref(resourceUrl ?? this.pageUrl, href);

    // When resources are stored following the site's URL structure, recompute the save directory.
    if (dirConfig.UseWebSite)
    {
        saveDir = PathUtility.GetSaveDir(this.PageUrl, url, dirConfig.HtmlDirPath);
    }

    // Text that will replace the reference inside the tag.
    string localPath;

    if (async)
    {
        if (null == downloadHandle)
        {
            // No custom downloader: emit a placeholder and resolve it when the download finishes.
            string placeHolder = this.GetPlaceHolder();
            localPath = placeHolder;

            Spider.SaveResourceAsync(url, saveDir, new Action<string>(delegate(string filePath)
            {
                lock (dic)
                {
                    // A null file path means the download failed; fall back to the absolute URL.
                    dic[placeHolder] = null == filePath ? url : PathUtility.GetRelativePath(referenceDir, filePath);
                }
            }));
        }
        else
        {
            localPath = downloadHandle(url);
        }

        // Register the placeholder as pending. The check and the insert must happen under the same lock:
        // the completion callback above may write the entry concurrently, and Dictionary is not safe for
        // concurrent access. If the callback already stored a result, it is left untouched.
        lock (dic)
        {
            if (!dic.ContainsKey(localPath))
            {
                dic.Add(localPath, null);
            }
        }
    }
    else
    {
        // Synchronous download: the result is the saved file's local path when it succeeds.
        if (null == downloadHandle)
        {
            localPath = Spider.SaveResource(url, saveDir);
        }
        else
        {
            localPath = downloadHandle(url);
        }

        // If saving failed, reference the absolute URL; otherwise reference the file relative to the referencing file.
        if (string.IsNullOrEmpty(localPath))
        {
            localPath = url;
        }
        else
        {
            localPath = PathUtility.GetRelativePath(referenceDir, localPath);
        }
    }

    // Replace the reference inside the original tag text.
    return match.Value.Replace(href, localPath);
}
/// <summary>Enumerates all of the matches with the specified regex, invoking the callback for each.</summary>
/// <remarks>
/// This repeatedly hands out the same Match instance, updated with new information.
/// Enumeration stops when the callback returns false or when the scan reaches the end of the input
/// (the beginning, for right-to-left regexes).
/// </remarks>
internal void Scan<TState>(Regex regex, string text, int textstart, ref TState state, MatchCallback<TState> callback, TimeSpan timeout)
{
    // Store arguments into fields for derived runner to examine
    runregex = regex;
    runtext = text;
    runtextbeg = 0;
    runtextend = text.Length;
    runtextpos = runtextstart = textstart;

    // Handle timeout argument
    _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
    bool ignoreTimeout = _ignoreTimeout = Regex.InfiniteMatchTimeout == timeout;
    if (!ignoreTimeout)
    {
        // We are using Environment.TickCount and not Stopwatch for performance reasons.
        // Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt
        // overflow it will still stay ahead of Environment.TickCount for comparisons made
        // in DoCheckTimeout().
        _timeout = (int)(timeout.TotalMilliseconds + 0.5); // Round;
        _timeoutOccursAt = Environment.TickCount + _timeout;
        _timeoutChecksToSkip = TimeoutCheckFrequency;
    }

    // Configure the additional value to "bump" the position along each time we loop around
    // to call FindFirstChar again, as well as the stopping position for the loop. We generally
    // bump by 1 and stop at runtextend, but if we're examining right-to-left, we instead bump
    // by -1 and stop at runtextbeg.
    int bump = 1, stoppos = runtextend;
    if (runregex.RightToLeft)
    {
        bump = -1;
        stoppos = runtextbeg;
    }

    // Main loop: FindFirstChar/Go + bump until the ending position.
    bool initialized = false;
    while (true)
    {
#if DEBUG
        if (runregex.IsDebug)
        {
            Debug.WriteLine("");
            Debug.WriteLine($"Search range: from {runtextbeg} to {runtextend}");
            Debug.WriteLine($"Firstchar search starting at {runtextpos} stopping at {stoppos}");
        }
#endif

        // Find the next potential location for a match in the input.
        if (FindFirstChar())
        {
            if (!ignoreTimeout)
            {
                DoCheckTimeout();
            }

            // Ensure that the runner is initialized. This includes initializing all of the state in the runner
            // that Go might use, such as the backtracking stack, as well as a Match object for it to populate.
            if (!initialized)
            {
                InitializeForGo();
                initialized = true;
            }

#if DEBUG
            if (runregex.IsDebug)
            {
                Debug.WriteLine($"Executing engine starting at {runtextpos}");
                Debug.WriteLine("");
            }
#endif

            // See if there's a match at this position.
            Go();

            // See if we have a match.
            Match match = runmatch!;
            if (match._matchcount[0] > 0)
            {
                // Hand it out to the callback in canonical form.
                match.Tidy(runtextpos);
                initialized = false;
                if (!callback(ref state, match))
                {
                    // If the callback returns false, we're done.
                    return;
                }

                // Now that we've matched successfully, update the starting position to reflect
                // the current position, just as Match.NextMatch() would pass in _textpos as textstart.
                // Without this, every subsequent match in the scan is evaluated against the
                // original start position rather than the end of the previous match.
                runtextstart = runtextpos;

                // Reset state for another iteration.
                runtrackpos = runtrack!.Length;
                runstackpos = runstack!.Length;
                runcrawlpos = runcrawl!.Length;

                if (match.Length == 0)
                {
                    // An empty match does not advance the position by itself; bump manually
                    // (or stop if we are already at the end) so the loop makes progress.
                    if (runtextpos == stoppos)
                    {
                        return;
                    }

                    runtextpos += bump;
                }

                // Loop around to perform next match from where we left off.
                continue;
            }

            // Ran Go but it didn't find a match. Reset state for another iteration.
            runtrackpos = runtrack!.Length;
            runstackpos = runstack!.Length;
            runcrawlpos = runcrawl!.Length;
        }

        // We failed to match at this position. If we're at the stopping point, we're done.
        if (runtextpos == stoppos)
        {
            return;
        }

        // Bump by one (in whichever direction is appropriate) and loop to go again.
        runtextpos += bump;
    }
}
/// <summary>Runs the regex across the input and invokes the callback once for every match found.</summary>
/// <remarks>
/// When <paramref name="reuseMatchObject"/> is true, the same Match instance is handed to each callback
/// invocation, updated with new information. It should be set to false if the Match object is handed
/// out to user code. Scanning ends when the callback returns false or the scan reaches the end of the
/// input (the start, for right-to-left regexes).
/// </remarks>
internal void Scan<TState>(Regex regex, string text, int textstart, ref TState state, MatchCallback<TState> callback, bool reuseMatchObject, TimeSpan timeout)
{
    // Timeout setup.
    bool noTimeout = Regex.InfiniteMatchTimeout == timeout;
    _ignoreTimeout = noTimeout;
    if (noTimeout)
    {
        _timeout = -1; // (int)Regex.InfiniteMatchTimeout.TotalMilliseconds
    }
    else
    {
        // Environment.TickCount is used instead of Stopwatch for performance reasons. It is an int
        // that cycles; the deadline is intentionally allowed to overflow, as it still stays ahead of
        // Environment.TickCount for the comparisons made in DoCheckTimeout().
        _timeout = (int)(timeout.TotalMilliseconds + 0.5); // rounded
        _timeoutOccursAt = Environment.TickCount + _timeout;
        _timeoutChecksToSkip = TimeoutCheckFrequency;
    }

    // Direction-dependent values: how far to move the position each time FindFirstChar has to be
    // retried, and the position at which the scan ends. Left-to-right moves by +1 and ends at
    // text.Length; right-to-left moves by -1 and ends at 0.
    int inputLength = text.Length;
    bool scanBackwards = regex.RightToLeft;
    int step = scanBackwards ? -1 : 1;
    int lastPosition = scanBackwards ? 0 : inputLength;

    // Publish the remaining arguments through the fields the derived runner reads.
    runregex = regex;
    runtextpos = textstart;
    runtextstart = textstart;
    runtext = text;
    runtextend = inputLength;
    runtextbeg = 0;

    // Rewinds the three runner stacks so the next attempt starts from a clean slate.
    void ResetRunnerStacks()
    {
        runtrackpos = runtrack!.Length;
        runstackpos = runstack!.Length;
        runcrawlpos = runcrawl!.Length;
    }

    // Main loop: FindFirstChar, Go, then step, until the final position is reached.
    bool readyForGo = false;
    for (;;)
    {
#if DEBUG
        if (regex.IsDebug)
        {
            Debug.WriteLine("");
            Debug.WriteLine($"Search range: from {runtextbeg} to {runtextend}");
            Debug.WriteLine($"Firstchar search starting at {runtextpos} stopping at {lastPosition}");
        }
#endif

        // Locate the next position at which a match could possibly begin.
        if (FindFirstChar())
        {
            if (!noTimeout)
            {
                DoCheckTimeout();
            }

            // Go needs initialized runner state (backtracking stacks, a Match object to populate).
            if (!readyForGo)
            {
                InitializeForGo();
                readyForGo = true;
            }

#if DEBUG
            if (regex.IsDebug)
            {
                Debug.WriteLine($"Executing engine starting at {runtextpos}");
                Debug.WriteLine("");
            }
#endif

            // Attempt a match at this position.
            Go();

            Match current = runmatch!;
            if (current._matchcount[0] > 0)
            {
                if (!reuseMatchObject)
                {
                    // The instance is being given away; drop our field reference so a new one
                    // is created the next time it is needed.
                    runmatch = null;
                }

                // Hand the match to the callback in canonical form.
                current.Tidy(runtextpos);
                readyForGo = false;

                bool keepScanning = callback(ref state, current);
                if (keepScanning)
                {
                    // After a successful match the start position becomes the current position,
                    // just as Match.NextMatch() would pass in _textpos as textstart.
                    runtextstart = runtextpos;

                    ResetRunnerStacks();

                    if (current.Length != 0)
                    {
                        // Non-empty match: resume from where it ended.
                        continue;
                    }

                    if (runtextpos != lastPosition)
                    {
                        // Empty match: step manually so the scan makes progress.
                        runtextpos += step;
                        continue;
                    }
                }

                // Either the callback asked to stop, or an empty match was found at the final position.
                // Drop the reference to text to avoid keeping it alive in a cache.
                runtext = null!;
                if (reuseMatchObject)
                {
                    // The single reused instance is still ours, so clear its text too. When instances
                    // are not reused, the match was handed to the callback and is no longer owned here.
                    current.Text = null!;
                }

                return;
            }

            // Go ran but found nothing here; reset for the next attempt.
            ResetRunnerStacks();
        }

        // No match at this position. At the final position, the scan is finished.
        if (runtextpos == lastPosition)
        {
            runtext = null; // drop reference to text to avoid keeping it alive in a cache
            if (runmatch is Match retained)
            {
                retained.Text = null!;
            }

            return;
        }

        // Move one position in the scan direction and try again.
        runtextpos += step;
    }
}
/// <summary>
/// Downloads the resource referenced by a matched tag (script, stylesheet, image, Flash, ...) and returns
/// the tag text with the reference rewritten to point at the local copy.
/// </summary>
/// <param name="match">Regex match for the js/css/image/flash tag.</param>
/// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's URL structure.</param>
/// <param name="resourceUrl">URL of the current HTML page or CSS file; used to turn relative addresses found in its content into absolute ones for downloading. When null, the URL of this instance is used.</param>
/// <param name="referenceDir">Directory of the file that references the resource (the HTML file's directory or the CSS file's directory).</param>
/// <param name="saveDir">Directory in which downloaded files are stored. Ignored when <c>dirConfig.UseWebSite</c> is set.</param>
/// <param name="regGroupName">Name of the regex group that captures the URL.</param>
/// <param name="urlHandle">Optional callback applied to the captured URL (the href or src value), e.g. to strip quotes.</param>
/// <param name="downloadHandle">Optional custom download callback; it returns the local path of the downloaded file or a placeholder.</param>
/// <param name="async">Whether asynchronous downloading was requested. Asynchronous downloading is not performed by this method; see the return value.</param>
/// <param name="dic">Dictionary intended for recording asynchronous downloads. Not used by this method.</param>
/// <returns>
/// The tag text after replacement. The reference is replaced by the local path relative to
/// <paramref name="referenceDir"/> when the download succeeds, or by the absolute URL when it fails or when
/// <paramref name="async"/> is true. If the captured URL is empty the tag is returned unchanged.
/// </returns>
internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle, bool async, Dictionary<string, string> dic)
{
    // The reference exactly as it should be searched for in the tag (may be relative or absolute).
    string href = match.Groups[regGroupName].Value;
    if (null != urlHandle)
    {
        href = urlHandle(href);
    }

    // string.Replace rejects a null/empty search string, and there is nothing to download anyway.
    if (string.IsNullOrEmpty(href))
    {
        return match.Value;
    }

    // Absolute URL of the resource.
    string url = PathUtility.ConvertToAbsoluteHref(resourceUrl ?? this.url, href);

    // When resources are stored following the site's URL structure, recompute the save directory.
    if (dirConfig.UseWebSite)
    {
        saveDir = PathUtility.GetSaveDir(this.Url, url, dirConfig.HtmlDirPath);
    }

    // Text that will replace the reference inside the tag.
    string localPath;

    if (async)
    {
        // Asynchronous downloading is not implemented here. Reference the absolute URL (the same
        // fallback used when a synchronous download fails) instead of replacing the reference with
        // an empty string, which would leave the tag pointing at nothing.
        // NOTE(review): confirm this is the desired behavior until async support is restored.
        localPath = url;
    }
    else
    {
        // Synchronous download: the result is the saved file's local path when it succeeds.
        if (null == downloadHandle)
        {
            localPath = SaveResource(url, saveDir);
        }
        else
        {
            localPath = downloadHandle(url);
        }

        // If saving failed, reference the absolute URL; otherwise reference the file relative to the referencing file.
        if (string.IsNullOrEmpty(localPath))
        {
            localPath = url;
        }
        else
        {
            localPath = PathUtility.GetRelativePath(referenceDir, localPath);
        }
    }

    // Replace the reference inside the original tag text.
    return match.Value.Replace(href, localPath);
}
/// <summary>
/// Synchronous convenience overload: rewrites a matched resource tag (script, stylesheet, image, Flash, ...)
/// so that it references the locally saved copy of the resource.
/// </summary>
/// <param name="match">Regex match for the js/css/image/flash tag.</param>
/// <param name="dirConfig">Directory configuration; used to decide whether resources are stored following the site's URL structure.</param>
/// <param name="resourceUrl">URL of the current HTML page or CSS file; used to turn relative addresses found in its content into absolute ones for downloading.</param>
/// <param name="referenceDir">Directory of the file that references the resource (the HTML file's directory or the CSS file's directory).</param>
/// <param name="saveDir">Directory in which downloaded files are stored.</param>
/// <param name="regGroupName">Name of the regex group that captures the URL.</param>
/// <param name="urlHandle">Optional callback applied to the captured URL before it is used.</param>
/// <param name="downloadHandle">Optional custom download callback; it returns the local path of the downloaded file or a placeholder.</param>
/// <returns>The tag text after replacement.</returns>
internal string MatchUrl(Match match, DirConfig dirConfig, string resourceUrl, string referenceDir, string saveDir, string regGroupName, MatchCallback urlHandle, MatchCallback downloadHandle)
{
    // Forward to the full overload with asynchronous downloading switched off and no tracking dictionary.
    const bool downloadAsynchronously = false;

    string rewrittenTag = MatchUrl(
        match,
        dirConfig,
        resourceUrl,
        referenceDir,
        saveDir,
        regGroupName,
        urlHandle,
        downloadHandle,
        downloadAsynchronously,
        null);

    return rewrittenTag;
}
/// <summary>
/// Overload that accepts a single-argument callback and forwards to the two-argument-callback overload
/// of <c>CreateMatchAsyn</c>, ignoring the second callback argument.
/// </summary>
/// <param name="stateObj">Match data passed through unchanged to the other overload.</param>
/// <param name="callback">Callback invoked with the first argument supplied by the other overload.</param>
/// <returns>The <c>MessageCode</c> returned by the other overload.</returns>
public MessageCode CreateMatchAsyn(BaseMatchData stateObj, MatchCallback callback)
{
    // Adapt the one-argument callback to the two-argument shape; the second argument is dropped.
    return CreateMatchAsyn(
        stateObj,
        (matchData, ignored) => callback(matchData));
}