Esempio n. 1
0
        /// <summary>
        /// Batch entry point: scrapes every URL listed in the file given by <c>-l</c>
        /// with the processor loaded from the file given by <c>-p</c>, and writes the
        /// results as "url TAB value" lines to <c>taskFromListfile.txt</c>.
        /// </summary>
        /// <param name="args">Raw command-line arguments; must contain both <c>-l</c> and <c>-p</c>.</param>
        public static void workWithListFile(string[] args)
        {
            var argument = CommandLineArgumentParser.Parse(args);

            string[]           urls = null;
            Entities.Processor psr  = null;
            if (argument.Has("-l"))
            {
                var listfile = argument.Get("-l").Next;
                urls = System.IO.File.ReadAllLines(listfile);
            }

            if (argument.Has("-p"))
            {
                var psrfile = argument.Get("-p").Next;
                psr = Tools.Serializer.DeSerializePSR(psrfile);
            }

            // Both inputs are required. Without this check a missing -l crashed the
            // loop below with a NullReferenceException, and a missing -p (or a file
            // that deserializes to null) crashed inside Scrape.
            if (urls == null || psr == null)
            {
                Console.Error.WriteLine("Usage: -l <url list file> -p <processor file>");
                return;
            }

            StringBuilder sb = new StringBuilder();

            foreach (var url in urls)
            {
                var htmlt = Tools.DownLoader.GetDocument(url);
                var f1    = Tools.Scraper.Scrape(htmlt, psr);
                Console.WriteLine(DateTime.Now.ToString() + " GET:" + f1.Count);
                foreach (var item in f1)
                {
                    sb.Append(url).Append('\t').AppendLine(item);
                }
            }

            System.IO.File.WriteAllText("taskFromListfile.txt", sb.ToString());
            System.Windows.Forms.MessageBox.Show("OK!");
        }
Esempio n. 2
0
        /// <summary>
        /// Deserializes a JSON string into an <c>Entities.Processor</c> using the
        /// <c>serializer</c> member declared elsewhere in this class.
        /// </summary>
        /// <param name="json">JSON text to read.</param>
        /// <returns>The object produced by the serializer.</returns>
        public static Entities.Processor DeSerializeFromJsonStringPSR(string json)
        {
            Newtonsoft.Json.JsonReader reader = new JsonTextReader(new StringReader(json));
            using (reader)
            {
                return serializer.Deserialize <Entities.Processor>(reader);
            }
        }
Esempio n. 3
0
 /// <summary>
 /// Opens <paramref name="fileName"/> as text and deserializes its JSON content into
 /// an <c>Entities.Processor</c> using the <c>serializer</c> member declared elsewhere
 /// in this class.
 /// </summary>
 /// <param name="fileName">Path of the JSON file to read.</param>
 /// <returns>The object produced by the serializer.</returns>
 public static Entities.Processor DeSerializePSR(string fileName)
 {
     using (StreamReader fileReader = new StreamReader(fileName))
     using (Newtonsoft.Json.JsonReader jsonReader = new JsonTextReader(fileReader))
     {
         return serializer.Deserialize <Entities.Processor>(jsonReader);
     }
 }
Esempio n. 4
0
 /// <summary>
 /// Writes <paramref name="psr"/> to <paramref name="filename"/> as indented JSON
 /// (four spaces per level) using the <c>serializer</c> member declared elsewhere
 /// in this class.
 /// </summary>
 /// <param name="filename">Path of the file to create or overwrite.</param>
 /// <param name="psr">Processor to serialize.</param>
 public static void Serialize(string filename, Entities.Processor psr)
 {
     using (StreamWriter fileWriter = new StreamWriter(filename))
     {
         JsonTextWriter jsonWriter = new JsonTextWriter(fileWriter);
         using (jsonWriter)
         {
             jsonWriter.Formatting  = Formatting.Indented;
             jsonWriter.Indentation = 4;
             jsonWriter.IndentChar  = ' ';

             serializer.Serialize(jsonWriter, psr);
             // Sample payload left by the original author:
             //{"Name":null,"StartURL":null,"XPath":null,"CssSelector":null,"NodeOffset":3,"NodeAttribute":null,"Remover":["FFFF","FFFFJJ"],"Replacer":{"s":"B","J":"k"}}
         }
     }
 }
Esempio n. 5
0
 /// <summary>
 /// Renders a processor as indented JSON text (four spaces per level) using the
 /// <c>serializer</c> member declared elsewhere in this class.
 /// </summary>
 /// <param name="obj">Processor to render; may be null.</param>
 /// <returns>
 /// The JSON text, or the fixed error message when <paramref name="obj"/> is null.
 /// </returns>
 public static string ConvertJsonStringFromPSR(Entities.Processor obj)
 {
     if (obj == null)
     {
         return("Error!转换Json错误,对象为空");
     }

     // Both writers are IDisposable; the original never disposed them.
     using (StringWriter textWriter = new StringWriter())
     using (JsonTextWriter jsonWriter = new JsonTextWriter(textWriter))
     {
         jsonWriter.Formatting  = Formatting.Indented;
         jsonWriter.Indentation = 4;
         jsonWriter.IndentChar  = ' ';

         serializer.Serialize(jsonWriter, obj);
         // Push anything still pending into the StringWriter before reading it.
         jsonWriter.Flush();
         return(textWriter.ToString());
     }
 }
Esempio n. 6
0
        /// <summary>
        /// Main extraction routine: selects nodes from an HTML document and converts
        /// each match into a cleaned-up string according to the processor's rules.
        /// </summary>
        /// <param name="doc">Document to scrape; when null an empty list is returned.</param>
        /// <param name="psr">
        /// Extraction rules: XPath / CssSelector pick the nodes, NodeOffset walks from
        /// each match to the target node, NodeAttribute chooses what to read, and
        /// Remover / Replacer / RemoveBefore / RemoveAfter post-process the text.
        /// </param>
        /// <param name="usingCss">
        /// True to select with psr.CssSelector. Forced to true when psr.XPath is null or empty.
        /// </param>
        /// <returns>
        /// One trimmed entry per matched node, or a single "未识别" entry when nothing matched.
        /// </returns>
        public static List <string> Scrape(HtmlAgilityPack.HtmlDocument doc, Entities.Processor psr, bool usingCss = false)
        {
            List <string> list = new List <string>();

            if (doc == null)
            {
                return(list);
            }
            if (string.IsNullOrEmpty(psr.XPath))
            {
                usingCss = true;
            }

            IEnumerable <HtmlAgilityPack.HtmlNode> selected;
            if (usingCss)
            {
                selected = doc.DocumentNode.CssSelect(psr.CssSelector);
            }
            else
            {
                selected = doc.DocumentNode.SelectNodes(psr.XPath);
            }

            // The selection is only typed as IEnumerable, so copy it once instead of
            // enumerating it twice (once to count, once to iterate). A null selection
            // is treated as "no match".
            List <HtmlAgilityPack.HtmlNode> nodes = selected == null
                ? new List <HtmlAgilityPack.HtmlNode>()
                : new List <HtmlAgilityPack.HtmlNode>(selected);

            if (nodes.Count == 0)
            {
                list.Add("未识别");
                return(list);
            }

            foreach (HtmlAgilityPack.HtmlNode match in nodes)
            {
                HtmlAgilityPack.HtmlNode target = ApplyNodeOffsets(match, psr);
                string data = ReadNodeValue(target, psr.NodeAttribute);
                list.Add(CleanScrapedText(data, psr).Trim());
            }

            return(list);
        }

        /// <summary>
        /// Walks from a matched node through psr.NodeOffset; returns null when a step leaves the tree.
        /// </summary>
        private static HtmlAgilityPack.HtmlNode ApplyNodeOffsets(HtmlAgilityPack.HtmlNode start, Entities.Processor psr)
        {
            HtmlAgilityPack.HtmlNode current = start;
            if (psr.NodeOffset == null)
            {
                // No offsets configured: stay on the matched node.
                return(current);
            }

            foreach (Entities.EnumNodeOffset step in psr.NodeOffset)
            {
                if (current == null)
                {
                    // Walked off the tree; further steps cannot recover. The original
                    // relied on catching NullReferenceException for this case.
                    break;
                }

                switch (step)
                {
                case XMT281Scraper.Entities.EnumNodeOffset.SinblingLeft:
                    current = current.PreviousSibling;
                    break;

                case XMT281Scraper.Entities.EnumNodeOffset.SinblingRight:
                    current = current.NextSibling;
                    break;

                case XMT281Scraper.Entities.EnumNodeOffset.Parent:
                    current = current.ParentNode;
                    break;

                case XMT281Scraper.Entities.EnumNodeOffset.Child:
                    current = current.FirstChild;
                    break;

                case XMT281Scraper.Entities.EnumNodeOffset.NoOffset:
                default:
                    // Stay on the current node.
                    break;
                }
            }

            return(current);
        }

        /// <summary>
        /// Reads OuterHtml, InnerText or a named attribute from the target node, with placeholders for missing data.
        /// </summary>
        private static string ReadNodeValue(HtmlAgilityPack.HtmlNode node, string attributeName)
        {
            if (node == null)
            {
                return("Node偏移到空位");
            }

            // A null attribute name is treated like the empty one (OuterHtml). It used to
            // fall into the attribute lookup and come back as the "no such attribute" text.
            if (string.IsNullOrEmpty(attributeName) || attributeName == "OuterHtml")
            {
                return(node.OuterHtml.Trim());
            }
            if (attributeName == "InnerText")
            {
                return(node.InnerText.Trim());
            }

            // Explicit null checks replace the original catch-all exception handler.
            HtmlAgilityPack.HtmlAttributeCollection attributes = node.Attributes;
            HtmlAgilityPack.HtmlAttribute attribute = attributes == null ? null : attributes[attributeName];
            if (attribute == null || attribute.Value == null)
            {
                return("【NoAttrib没有这个属性,用OuterHtml占位】" + node.OuterHtml);
            }

            return(attribute.Value.Trim());
        }

        /// <summary>
        /// Applies the Remover, Replacer, RemoveBefore and RemoveAfter rules to the extracted text, in that order.
        /// </summary>
        private static string CleanScrapedText(string data, Entities.Processor psr)
        {
            // Deletions. Null or empty entries are skipped because string.Replace
            // rejects an empty search string.
            if (psr.Remover != null)
            {
                foreach (var rm in psr.Remover)
                {
                    if (!string.IsNullOrEmpty(rm))
                    {
                        data = data.Replace(rm, "");
                    }
                }
            }

            // Substitutions.
            if (psr.Replacer != null)
            {
                foreach (var rp in psr.Replacer)
                {
                    if (!string.IsNullOrEmpty(rp.Key))
                    {
                        data = data.Replace(rp.Key, rp.Value);
                    }
                }
            }

            // Drop everything up to and including the first RemoveBefore marker.
            // The original skipped only one character, so a multi-character marker
            // left its tail in the output. Ordinal search matches string.Replace above.
            if (!string.IsNullOrEmpty(psr.RemoveBefore))
            {
                int k = data.IndexOf(psr.RemoveBefore, StringComparison.Ordinal);
                if (k != -1)
                {
                    data = data.Substring(k + psr.RemoveBefore.Length);
                }
            }

            // Drop the last RemoveAfter marker and everything after it.
            if (!string.IsNullOrEmpty(psr.RemoveAfter))
            {
                int j = data.LastIndexOf(psr.RemoveAfter, StringComparison.Ordinal);
                if (j != -1)
                {
                    data = data.Substring(0, j);
                }
            }

            return(data);
        }
Esempio n. 7
0
 /// <summary>
 /// Returns the text produced by <c>ConvertJsonStringFromPSR</c> for the given processor.
 /// </summary>
 /// <param name="psr">Processor to render.</param>
 /// <returns>Whatever <c>ConvertJsonStringFromPSR</c> returns for <paramref name="psr"/>.</returns>
 public static string ShowJson(Entities.Processor psr)
 {
     string json = ConvertJsonStringFromPSR(psr);
     return(json);
 }