Exemplo n.º 1
0
        /// <summary>
        /// Scrapes the listing page at <paramref name="url"/> and, for every listed item whose
        /// URL is accepted by <c>checkURL</c>, downloads the item page and records its name,
        /// image, description and source parts.
        /// </summary>
        /// <param name="url">Address of the listing page; its host is stored as each item's <c>domain</c>.</param>
        /// <param name="ht">Table of items indexed by key. Every item extracted by this call is added to it.</param>
        /// <returns>A table containing only the items added during this call (empty when the listing matches nothing).</returns>
        /// <remarks>
        /// Processing is best-effort: an exception raised while handling one item is written to the
        /// console and that item is skipped. When <c>conf.nameAsUnique</c> is set, an item whose key
        /// is already in <paramref name="ht"/> is skipped before its page is downloaded; otherwise a
        /// numeric suffix is appended to make the key unique.
        /// </remarks>
        private Hashtable extract(string url, ref Hashtable ht)
        {
            Hashtable newHt = new Hashtable();

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(http.get(url));

            // SelectNodes yields null (not an empty collection) when the XPath matches nothing.
            var itemNodes = doc.DocumentNode.SelectNodes(Build(conf.itemsCollectionIndices));
            if (itemNodes == null)
            {
                return newHt;
            }

            // Snapshot the fragments as strings: 'doc' is reloaded many times below.
            List<string> collection = itemNodes.Select(p => p.InnerHtml).ToList();

            JavaScriptSerializer ser = new JavaScriptSerializer();

            foreach (string item in collection)
            {
                try
                {
                    doc.LoadHtml(item);

                    string itemName = HttpUtility.HtmlDecode(
                        doc.DocumentNode.SelectSingleNode(Build(conf.itemsNameIndices)).InnerText);

                    string itemUrl = doc.DocumentNode
                                     .SelectSingleNode(Build(conf.itemsUrlIndices))
                                     .Attributes["href"].Value;

                    string itemImage = string.Empty;
                    if (conf.itemsImageIndices != string.Empty)
                    {
                        itemImage = doc.DocumentNode
                                    .SelectSingleNode(Build(conf.itemsImageIndices))
                                    .Attributes["src"].Value;
                    }

                    if (!checkURL(itemUrl))
                    {
                        continue;
                    }

                    string key = itemName.ToLower();
                    if (ht.ContainsKey(key))
                    {
                        if (conf.nameAsUnique)
                        {
                            // The name identifies the item and it is already stored: adding it
                            // again would make ht.Add throw, so skip before downloading anything.
                            continue;
                        }

                        key = nextFreeKey(ht, key);
                    }

                    doc.LoadHtml(http.get(itemUrl));

                    // The item page itself provides the first source.
                    List<object> partsList = new List<object>();
                    partsList.Add(new { name = "Source 1", url = HttpUtility.HtmlDecode(readSourceUrl(doc)) });

                    string itemDescription = doc.DocumentNode
                                             .SelectSingleNode(Build(conf.itemDescriptionIndices))
                                             .InnerText;

                    if (conf.itemPartCollectionIndices != string.Empty)
                    {
                        // NOTE(review): the configured value only switches this step on; the part
                        // links are located with the hard-coded selector below - confirm whether the
                        // XPath built from conf.itemPartCollectionIndices was meant to be used.
                        try
                        {
                            var partLinks = doc.DocumentNode.SelectNodes("//div[@class='keremiya_part']/a");
                            if (partLinks != null)
                            {
                                // Capture name/href pairs up front: 'doc' is reloaded for every part.
                                List<KeyValuePair<string, string>> parts = partLinks
                                    .Select(p => new KeyValuePair<string, string>(p.InnerText, p.Attributes["href"].Value))
                                    .ToList();

                                foreach (KeyValuePair<string, string> part in parts)
                                {
                                    doc.LoadHtml(http.get(part.Value));

                                    partsList.Add(new
                                    {
                                        name = HttpUtility.HtmlDecode(part.Key),
                                        url  = HttpUtility.HtmlDecode(readSourceUrl(doc))
                                    });
                                }
                            }

                            if (partsList.Count > 1)
                            {
                                // Rename the first entry after the naming scheme of the other parts
                                // (first word of the last part's name + " 1"), keeping its own URL.
                                dynamic last  = partsList[partsList.Count - 1];
                                dynamic first = partsList[0];

                                string lastName   = last.name;
                                string firstUrl   = first.url;
                                string nameScheme = lastName.TrimStart().Split(' ')[0] + " 1";

                                partsList[0] = new { name = nameScheme, url = firstUrl };
                            }
                        }
                        catch (Exception e)
                        {
                            // A failure while collecting parts must not discard the item itself.
                            Console.WriteLine(e.ToString());
                        }
                    }

                    dynamic obj = new
                    {
                        key         = key,
                        name        = itemName,
                        url         = HttpUtility.HtmlDecode(itemUrl),
                        image       = HttpUtility.HtmlDecode(itemImage),
                        description = clearStartDescription(HttpUtility.HtmlDecode(itemDescription)),
                        parts       = partsList,
                        countParts  = partsList.Count.ToString(),
                        domain      = new Uri(url).Host,
                        date        = DateTime.Now.ToString()
                    };

                    // Round-trip through JSON so the stored value is the serializer's plain
                    // object graph rather than an anonymous type.
                    obj = ser.DeserializeObject(ser.Serialize(obj));

                    ht.Add(key, obj);
                    // Newly discovered items
                    newHt.Add(key, obj);
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.ToString());
                }
            }

            return newHt;
        }

        /// <summary>
        /// Reads the <c>src</c> attribute of the node addressed by the XPath built from
        /// <c>conf.itemUrlSourceIndices</c> in the currently loaded document.
        /// </summary>
        /// <param name="doc">Document to query.</param>
        /// <returns>
        /// The attribute value, or an empty string when the built XPath has no segments.
        /// Throws when the node or attribute is missing; callers handle that in their catch blocks.
        /// </returns>
        private string readSourceUrl(HtmlAgilityPack.HtmlDocument doc)
        {
            string[] segments = Build(conf.itemUrlSourceIndices)
                                .Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries);
            if (segments.Length == 0)
            {
                return string.Empty;
            }

            // The expression is the first two ';'-separated segments joined together; fall back
            // to the single segment instead of indexing past the end of the array.
            string expression = segments.Length > 1 ? segments[0] + segments[1] : segments[0];

            return doc.DocumentNode.SelectSingleNode(expression).Attributes["src"].Value;
        }

        /// <summary>
        /// Returns the first of <c>key_1</c>, <c>key_2</c>, ... that is not yet present in <paramref name="ht"/>.
        /// </summary>
        private static string nextFreeKey(Hashtable ht, string key)
        {
            int i = 1;
            string candidate = key + "_" + i;
            while (ht.ContainsKey(candidate))
            {
                i++;
                candidate = key + "_" + i;
            }
            return candidate;
        }
 /// <summary>
 /// Resolves the "get_command" route through <c>Router.url</c> and returns
 /// whatever <c>hr.get</c> yields for that address.
 /// </summary>
 /// <returns>The string returned by <c>hr.get</c>.</returns>
 public string Get()
 {
     var commandUrl = Router.url("get_command");
     var response   = hr.get(commandUrl);
     return response;
 }