/// <summary>
/// Reads the current <c>file</c> line by line, replaces every Chinese text segment with
/// its language-pack entry from <c>dic</c>, and appends the result to the mirrored path
/// under <c>outpath</c>. Segments with no exact key are fuzzy-matched against keys whose
/// length is within +/-2 characters; unresolved segments are logged to <c>errInfo</c>.
/// Any exception is caught and recorded in <c>errInfo</c> rather than propagated.
/// </summary>
public void RepFile()
{
    try
    {
        List<string> list = FileAction.ReadToArr(file.FullName);
        // Output file path: mirror the source location under the output root.
        string p = file.FullName.Replace(path, outpath);
        // Regex matching comment text (runs of '/' or '*' to end of line).
        string regNotes = "([/]{2,}|[*]+).*";
        // Regex matching Chinese text runs, allowing embedded punctuation and alphanumerics.
        string regChinese = @"([\u4e00-\u9fa5]{1,}[\s,,‘“;(()):、:.&\\-a-zA-Z0-9\u4e00-\u9fa5]{0,}[。”’!0-9\u4e00-\u9fa5]{1,})|([\u4e00-\u9fa5]{1})";
        int index = 0; // 1-based line number, used in miss reports
        foreach (var item in list)
        {
            index++;
            // Strip comments before scanning, so commented-out Chinese is ignored.
            string str = DataCheck.RepStr(item.Trim(), regNotes, "");
            // No Chinese on this line: copy it through unchanged.
            if (!DataCheck.CheckReg(str, regChinese))
            {
                FileAction.AppendStr(p, item + "\n");
                continue;
            }
            // Extract every Chinese segment on the line and translate each in turn.
            string[] strArr = DataCheck.GetRegStrArr(str, regChinese);
            string temp = str;
            foreach (var chinese in strArr)
            {
                // Skip captures that contain no Chinese characters at all.
                if (!DataCheck.CheckReg(chinese, "[\u4e00-\u9fa5]+"))
                {
                    continue;
                }
                // Exact hit in the language pack: replace directly.
                // TryGetValue avoids the original ContainsKey + indexer double lookup.
                if (dic.TryGetValue(chinese, out string exact))
                {
                    temp = temp.Replace(chinese, DataCheck.RepLanguage(exact, false));
                    errInfo.Add($"{chinese}\t{exact}\t{file.FullName}");
                }
                else
                {
                    // Fuzzy match: accept any key within +/-2 characters of the
                    // segment's length that contains the segment as a substring.
                    int min = chinese.Length - 2;
                    int max = chinese.Length + 2;
                    bool replaced = false;
                    foreach (var key in dic.Keys)
                    {
                        // Key length outside the window: try the next key.
                        if (max < key.Length || key.Length < min)
                        {
                            continue;
                        }
                        if (key.Contains(chinese))
                        {
                            temp = temp.Replace(chinese, DataCheck.RepLanguage(dic[key], false));
                            errInfo.Insert(0, $"^{chinese}:{index}行\t{dic[key]}\t{file.FullName}");
                            replaced = true;
                        }
                    }
                    if (!replaced)
                    {
                        // No candidate at all: record the miss with its line number.
                        errInfo.Insert(0, $"^^{chinese}:{index}行\t{file.FullName}");
                    }
                }
            }
            // Write the translated line, keeping any surrounding text from the original.
            FileAction.AppendStr(p, item.Replace(str, temp) + "\n");
        }
    }
    catch (Exception e)
    {
        errInfo.Add("错误:" + file.FullName + "\t" + e.Message);
    }
}
/// <summary>
/// Crawl-completion callback. If the page contains the news body node
/// (div#endText), the article is persisted as a <see cref="GSQ_News"/> record
/// (unless its URL already exists). Otherwise, while <paramref name="layer"/> is
/// below 4, every same-site anchor on the page is queued for a deeper crawl.
/// </summary>
/// <param name="e">Completion data: URI, page source, elapsed ms, thread id.</param>
/// <param name="layer">Current crawl depth; link expansion stops at 4.</param>
/// <param name="Title">Inner text of the anchor that led to this page.</param>
private void Crawler_OnCompleted(OnCompletedEventArgs e, int layer, string Title = "")
{
    string pageUrl = e.Uri.ToString();
    SetLog($"读取网站:{pageUrl}\n\t\t\t\t深度:{layer}\t用时:{e.Milliseconds} 毫秒\t线程ID:{e.TreadID}", Color.Gray);
    UpAll(1);
    urlList.Add(pageUrl);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(e.PageSource);
    // XPath of the news body container on the target site.
    ConReg = "//div[@id='endText']";
    HtmlNode conNode = doc.DocumentNode.SelectSingleNode(ConReg);
    if (conNode != null)
    {
        // Content page: store it (once) and stop — no further link expansion.
        if (_GSQ_NewsService.Exists(c => c.url == pageUrl))
        {
            return;
        }
        if (!string.IsNullOrEmpty(conNode.InnerHtml))
        {
            GSQ_News _News = new GSQ_News();
            _News.title = Title;
            _News.url = pageUrl;
            _News.sourcewebsite = conNode.InnerHtml;
            _News.num = 0;
            _News.CreateDate = DateTime.Now;
            _GSQ_NewsService.AddEntity(_News);
            UpCon(1);
            UpNum(1);
            SetLog($"抓取新闻《{Title}》,用时:{e.Milliseconds} 毫秒", Color.Gray);
        }
        return;
    }
    // Depth limit: do not expand links beyond depth 4.
    if (layer >= 4)
    {
        return;
    }
    // Base URL (trailing '/' stripped) used to keep the crawl on the same site.
    // Hoisted out of the loop: it depends only on the current page's URI.
    string baseUrl = Utils.DelLastChar(pageUrl, "/", 0);
    foreach (var item in doc.DocumentNode.Descendants("a"))
    {
        string href = item.Attributes["href"]?.Value;
        // Validate the link BEFORE allocating a crawler: it must be non-empty,
        // not yet visited, a well-formed URL, and on the same site. (The original
        // built a Crawler + Operation and subscribed handlers for every anchor,
        // even ones immediately rejected here.)
        if (string.IsNullOrEmpty(href) || urlList.Contains(href)
            || !DataCheck.CheckReg(href, DataCheck.Reg_Url) || !href.Contains(baseUrl))
        {
            continue;
        }
        Crawler crawler = new Crawler();
        Operation operation = new Operation()
        {
            Action = (x) => { },
            Condition = (x) => { return true; },
            timeout = 5000
        };
        crawler.OnError += Crawler_OnError;
        // Recurse one level deeper when this child page completes.
        crawler.OnCompleted += (s, ex) =>
        {
            Crawler_OnCompleted(ex, layer + 1, item.InnerText);
        };
        // NOTE(review): .Wait() blocks the calling thread on async work —
        // deadlock-prone on a UI synchronization context; consider making
        // this handler async and awaiting Start instead.
        crawler.Start(href, operation, null).Wait();
    }
}