/// <summary>
/// Batch mode: scrapes every URL listed in a text file with one processor definition
/// and writes the collected values to "taskFromListfile.txt".
/// </summary>
/// <remarks>
/// Recognised options: <c>-l</c> followed by the URL list file (each line is used as a URL)
/// and <c>-p</c> followed by the processor definition file. Both are required: when either
/// is missing, or the processor file yields no object, an error is written to standard
/// error and the method returns without scraping (previously this ended in a
/// NullReferenceException). Each output line is the source URL, a tab, and one scraped value.
/// </remarks>
/// <param name="args">Raw command-line arguments.</param>
public static void workWithListFile(string[] args)
{
    var argument = CommandLineArgumentParser.Parse(args);
    string[] urls = null;
    Entities.Processor psr = null;

    if (argument.Has("-l"))
    {
        var listfile = argument.Get("-l").Next;
        urls = System.IO.File.ReadAllLines(listfile);
    }

    if (argument.Has("-p"))
    {
        var psrfile = argument.Get("-p").Next;
        psr = Tools.Serializer.DeSerializePSR(psrfile);
    }

    // Without a URL list there is nothing to iterate, and without a processor
    // the scraper has no selector to apply; bail out instead of crashing later.
    if (urls == null || psr == null)
    {
        Console.Error.WriteLine("Both -l <url list file> and -p <processor file> are required.");
        return;
    }

    StringBuilder sb = new StringBuilder();
    foreach (var url in urls)
    {
        var htmlt = Tools.DownLoader.GetDocument(url);
        var f1 = Tools.Scraper.Scrape(htmlt, psr);

        // Progress line: timestamp plus the number of values found for this URL.
        Console.WriteLine(DateTime.Now.ToString() + " GET:" + f1.Count);

        foreach (var item in f1)
        {
            sb.Append(url).Append("\t").AppendLine(item);
        }
    }

    System.IO.File.WriteAllText("taskFromListfile.txt", sb.ToString());
    System.Windows.Forms.MessageBox.Show("OK!");
}
/// <summary>
/// Deserializes a processor definition from JSON text using the shared serializer.
/// </summary>
/// <param name="json">JSON text to read.</param>
/// <returns>The object the serializer produced for that text.</returns>
public static Entities.Processor DeSerializeFromJsonStringPSR(string json)
{
    using (StringReader text = new StringReader(json))
    using (JsonTextReader reader = new JsonTextReader(text))
    {
        return serializer.Deserialize<Entities.Processor>(reader);
    }
}
/// <summary>
/// Deserializes a processor definition from a JSON file using the shared serializer.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <returns>The object the serializer produced from the file's content.</returns>
public static Entities.Processor DeSerializePSR(string fileName)
{
    using (StreamReader file = new StreamReader(fileName))
    using (JsonTextReader reader = new JsonTextReader(file))
    {
        Entities.Processor result = serializer.Deserialize<Entities.Processor>(reader);
        return result;
    }
}
/// <summary>
/// Writes a processor definition to a file as JSON indented with four spaces.
/// </summary>
/// <param name="filename">Path of the file to write.</param>
/// <param name="psr">Processor definition to serialize.</param>
public static void Serialize(string filename, Entities.Processor psr)
{
    using (StreamWriter fileWriter = new StreamWriter(filename))
    {
        using (JsonTextWriter jsonWriter = new JsonTextWriter(fileWriter))
        {
            jsonWriter.Formatting = Formatting.Indented;
            jsonWriter.Indentation = 4;
            jsonWriter.IndentChar = ' ';

            // Sample (compact) processor JSON noted by the original author; it may
            // predate the current Processor shape:
            // {"Name":null,"StartURL":null,"XPath":null,"CssSelector":null,"NodeOffset":3,"NodeAttribute":null,"Remover":["FFFF","FFFFJJ"],"Replacer":{"s":"B","J":"k"}}
            serializer.Serialize(jsonWriter, psr);
        }
    }
}
/// <summary>
/// Renders a processor definition as JSON text indented with four spaces.
/// </summary>
/// <param name="obj">Processor to serialize; may be null.</param>
/// <returns>
/// The indented JSON text, or a fixed error message when <paramref name="obj"/> is null.
/// </returns>
public static string ConvertJsonStringFromPSR(Entities.Processor obj)
{
    if (obj == null)
    {
        return "Error!转换Json错误,对象为空";
    }

    // Both writers are IDisposable; scope them so they are released even if
    // serialization throws.
    using (StringWriter textWriter = new StringWriter())
    using (JsonTextWriter jsonWriter = new JsonTextWriter(textWriter) { Formatting = Formatting.Indented, Indentation = 4, IndentChar = ' ' })
    {
        serializer.Serialize(jsonWriter, obj);
        return textWriter.ToString();
    }
}
/// <summary>
/// Main extraction routine: selects nodes from an HTML document and turns each one
/// into a cleaned-up string as described by the processor definition.
/// </summary>
/// <remarks>
/// For every selected node, in order:
/// the steps in <c>psr.NodeOffset</c> are applied to move to a neighbouring node;
/// the value named by <c>psr.NodeAttribute</c> is read (null, empty or "OuterHtml" gives
/// the trimmed outer HTML, "InnerText" the trimmed inner text, anything else the trimmed
/// value of the attribute with that name);
/// every non-empty string in <c>psr.Remover</c> is deleted;
/// every pair in <c>psr.Replacer</c> is applied as a replacement;
/// everything up to and including the first occurrence of <c>psr.RemoveBefore</c> is dropped;
/// the last occurrence of <c>psr.RemoveAfter</c> and everything after it is dropped;
/// the result is trimmed.
/// When the offsets lead off the tree, or the attribute does not exist, a placeholder text
/// takes the place of the value and still goes through the clean-up steps.
/// </remarks>
/// <param name="doc">Parsed document; when null an empty list is returned.</param>
/// <param name="psr">Processor definition supplying the selector and clean-up rules.</param>
/// <param name="usingCss">
/// True to select with <c>psr.CssSelector</c> instead of <c>psr.XPath</c>. Forced to true
/// when <c>psr.XPath</c> is null or empty.
/// </param>
/// <returns>
/// One entry per selected node, or a single "未识别" entry when the selector matched nothing.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="psr"/> is null while <paramref name="doc"/> is not.
/// </exception>
public static List<string> Scrape(HtmlAgilityPack.HtmlDocument doc, Entities.Processor psr, bool usingCss = false)
{
    List<string> list = new List<string>();
    if (doc == null)
    {
        return list;
    }

    if (psr == null)
    {
        throw new ArgumentNullException("psr");
    }

    if (string.IsNullOrEmpty(psr.XPath))
    {
        usingCss = true;
    }

    IEnumerable<HtmlAgilityPack.HtmlNode> selected;
    if (usingCss)
    {
        selected = doc.DocumentNode.CssSelect(psr.CssSelector);
    }
    else
    {
        selected = doc.DocumentNode.SelectNodes(psr.XPath);
    }

    // Materialise once: the sequence is only known as IEnumerable here and may be
    // evaluated lazily, so counting it and then iterating it would run the query twice.
    List<HtmlAgilityPack.HtmlNode> nodes = selected == null ? null : selected.ToList();
    if (nodes == null || nodes.Count == 0)
    {
        list.Add("未识别");
        return list;
    }

    foreach (HtmlAgilityPack.HtmlNode node in nodes)
    {
        HtmlAgilityPack.HtmlNode target = ApplyNodeOffsets(node, psr);
        string data = ExtractNodeData(target, psr.NodeAttribute);
        data = CleanScrapedData(data, psr);
        list.Add(data.Trim());
    }

    return list;
}

/// <summary>
/// Walks from a matched node through the steps in psr.NodeOffset; returns null once a step leaves the tree.
/// </summary>
private static HtmlAgilityPack.HtmlNode ApplyNodeOffsets(HtmlAgilityPack.HtmlNode node, Entities.Processor psr)
{
    if (psr.NodeOffset == null)
    {
        return node;
    }

    foreach (Entities.EnumNodeOffset step in psr.NodeOffset)
    {
        // Once the walk has fallen off the tree no further step can recover it.
        if (node == null)
        {
            break;
        }

        switch (step)
        {
            case Entities.EnumNodeOffset.SinblingLeft:
                node = node.PreviousSibling;
                break;
            case Entities.EnumNodeOffset.SinblingRight:
                node = node.NextSibling;
                break;
            case Entities.EnumNodeOffset.Parent:
                node = node.ParentNode;
                break;
            case Entities.EnumNodeOffset.Child:
                node = node.FirstChild;
                break;
            case Entities.EnumNodeOffset.NoOffset:
            default:
                // Stay on the current node.
                break;
        }
    }

    return node;
}

/// <summary>
/// Reads the requested value (outer HTML, inner text or a named attribute) from a node, or a placeholder text when it is unavailable.
/// </summary>
private static string ExtractNodeData(HtmlAgilityPack.HtmlNode node, string attributeName)
{
    if (node == null)
    {
        return "Node偏移到空位";
    }

    // A missing attribute name means the default, exactly like an empty one.
    if (string.IsNullOrEmpty(attributeName) || attributeName == "OuterHtml")
    {
        return node.OuterHtml.Trim();
    }

    if (attributeName == "InnerText")
    {
        return node.InnerText.Trim();
    }

    // The attribute indexer yields null for a name the node does not carry.
    HtmlAgilityPack.HtmlAttribute attribute = node.Attributes[attributeName];
    string value = attribute == null ? null : attribute.Value;
    if (value == null)
    {
        return "【NoAttrib没有这个属性,用OuterHtml占位】" + node.OuterHtml;
    }

    return value.Trim();
}

/// <summary>
/// Applies the processor's Remover, Replacer, RemoveBefore and RemoveAfter rules to an extracted value.
/// </summary>
private static string CleanScrapedData(string data, Entities.Processor psr)
{
    // Deletions.
    if (psr.Remover != null)
    {
        foreach (var rm in psr.Remover)
        {
            // string.Replace rejects an empty search string; skip such entries.
            if (!string.IsNullOrEmpty(rm))
            {
                data = data.Replace(rm, "");
            }
        }
    }

    // Replacements.
    if (psr.Replacer != null)
    {
        foreach (var rp in psr.Replacer)
        {
            data = data.Replace(rp.Key, rp.Value);
        }
    }

    if (!string.IsNullOrEmpty(psr.RemoveBefore))
    {
        int start = data.IndexOf(psr.RemoveBefore, StringComparison.Ordinal);
        if (start != -1)
        {
            // Skip the whole marker, not just its first character.
            data = data.Substring(start + psr.RemoveBefore.Length);
        }
    }

    if (!string.IsNullOrEmpty(psr.RemoveAfter))
    {
        int end = data.LastIndexOf(psr.RemoveAfter, StringComparison.Ordinal);
        if (end != -1)
        {
            data = data.Substring(0, end);
        }
    }

    return data;
}
/// <summary>
/// Returns the text produced by <c>ConvertJsonStringFromPSR</c> for the given processor.
/// </summary>
/// <param name="psr">Processor definition to render.</param>
/// <returns>Whatever <c>ConvertJsonStringFromPSR</c> returns for <paramref name="psr"/>.</returns>
public static string ShowJson(Entities.Processor psr)
{
    string json = ConvertJsonStringFromPSR(psr);
    return json;
}